Created
March 3, 2020 01:22
-
-
Save upidea/3d675eb2a231af0923f4bd98226b23ea to your computer and use it in GitHub Desktop.
tfidf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # tf-idf (term frequency - inverse document frequency) | |
| # 常用于挖掘文章的关键词; | |
| # 在同一篇文章内值大的表示该词在这篇文章中有较高区分度: | |
| # 在该篇文章中反复出现, 而在全部文档中出现较少(逆文档频率) | |
| # 整个语料中值大的, 并无特别的意义, 不适于跨文章比较 | |
| # 词频向量化 | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| # token_pattern 参数设置来指定字符切分字符串: r"(?u)\b[^@]+\b '\\b\\w+\\b' | |
| vectorizer = CountVectorizer(min_df=1) | |
| corpus = [ 'This is the first document.', | |
| 'This is the second second document.', | |
| 'And the third one.', | |
| 'Is this the first document?', | |
| ] | |
| X = vectorizer.fit_transform(corpus) # X.toarray() | |
| feature_name = vectorizer.get_feature_names() | |
| # TF-IDF预处理 | |
| from sklearn.feature_extraction.text import TfidfTransformer | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| corpus = [ 'This is the first document.', | |
| 'This is the second second document.', | |
| 'And the third one.', | |
| 'Is this the first document?', | |
| ] | |
| vectorizer=CountVectorizer() | |
| transformer = TfidfTransformer() | |
| tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus)) | |
| print (tfidf.toarray) | |
| # 直接计算tfidfvectorizer | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| tfidf2 = TfidfVectorizer() | |
| re = tfidf2.fit_transform(corpus) | |
| print (re) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment