upidea · March 3, 2020 01:22
diff --git a/tfidf b/tfidf
 # tf-idf （term frequency - inverse document frequency） 
 # 常用于挖掘文章的关键词；  
 #       在同一篇文章内值大的表示该词在这篇文章中有较高区分度：
 #           在该篇文章中反复出现， 而在全部文档中出现较少(逆文档频率)
 #       整个语料中值大的， 并无特别的意义， 不适于跨文章比较

 # Term-frequency vectorization: turn each document into a vector of raw token counts.
 from sklearn.feature_extraction.text import CountVectorizer

 # The `token_pattern` parameter sets the regex used to split text into tokens,
 # e.g. r"(?u)\b[^@]+\b" or '\\b\\w+\\b'.
 vectorizer = CountVectorizer(min_df=1)

 corpus = [
     'This is the first document.',
     'This is the second second document.',
     'And the third one.',
     'Is this the first document?',
 ]
 # Learn the vocabulary and build the document-term count matrix
 # (call X.toarray() to inspect it as a dense array).
 X = vectorizer.fit_transform(corpus)

 # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
 # Prefer get_feature_names_out() and fall back only on older releases;
 # list(...) keeps `feature_name` a plain list in both cases.
 if hasattr(vectorizer, 'get_feature_names_out'):
     feature_name = list(vectorizer.get_feature_names_out())
 else:
     feature_name = vectorizer.get_feature_names()


 # TF-IDF preprocessing: count terms with CountVectorizer, then re-weight
 # the counts with TfidfTransformer.
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_extraction.text import TfidfTransformer

 corpus = [
     'This is the first document.',
     'This is the second second document.',
     'And the third one.',
     'Is this the first document?',
 ]

 vectorizer = CountVectorizer()
 transformer = TfidfTransformer()

 # The inner fit_transform produces the count matrix; the outer one turns
 # those counts into TF-IDF weights.
 tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

 # toarray must be *called*: printing the bare attribute only shows the
 # bound-method repr instead of the TF-IDF values.
 print(tfidf.toarray())


 # Compute TF-IDF directly with TfidfVectorizer, fitting on the raw
 # documents in `corpus` in a single step.
 from sklearn.feature_extraction.text import TfidfVectorizer

 tfidf2 = TfidfVectorizer()
 # NOTE(review): the name `re` shadows the stdlib regex module if it is ever
 # imported in this script; kept unchanged here to preserve existing names.
 re = tfidf2.fit_transform(corpus)
 print(re)
	# tf-idf （term frequency - inverse document frequency）
	# 常用于挖掘文章的关键词；
	# 在同一篇文章内值大的表示该词在这篇文章中有较高区分度：
	# 在该篇文章中反复出现，而在全部文档中出现较少(逆文档频率)
	# 整个语料中值大的，并无特别的意义，不适于跨文章比较

	# Term-frequency vectorization: turn each document into a vector of raw token counts.
	from sklearn.feature_extraction.text import CountVectorizer

	# The `token_pattern` parameter sets the regex used to split text into tokens,
	# e.g. r"(?u)\b[^@]+\b" or '\\b\\w+\\b'.
	vectorizer = CountVectorizer(min_df=1)

	corpus = [
		'This is the first document.',
		'This is the second second document.',
		'And the third one.',
		'Is this the first document?',
	]
	# Learn the vocabulary and build the document-term count matrix
	# (call X.toarray() to inspect it as a dense array).
	X = vectorizer.fit_transform(corpus)

	# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
	# Prefer get_feature_names_out() and fall back only on older releases;
	# list(...) keeps `feature_name` a plain list in both cases.
	if hasattr(vectorizer, 'get_feature_names_out'):
		feature_name = list(vectorizer.get_feature_names_out())
	else:
		feature_name = vectorizer.get_feature_names()


	# TF-IDF preprocessing: count terms with CountVectorizer, then re-weight
	# the counts with TfidfTransformer.
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.feature_extraction.text import TfidfTransformer

	corpus = [
		'This is the first document.',
		'This is the second second document.',
		'And the third one.',
		'Is this the first document?',
	]

	vectorizer = CountVectorizer()
	transformer = TfidfTransformer()

	# The inner fit_transform produces the count matrix; the outer one turns
	# those counts into TF-IDF weights.
	tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

	# toarray must be *called*: printing the bare attribute only shows the
	# bound-method repr instead of the TF-IDF values.
	print(tfidf.toarray())


	# Compute TF-IDF directly with TfidfVectorizer, fitting on the raw
	# documents in `corpus` in a single step.
	from sklearn.feature_extraction.text import TfidfVectorizer

	tfidf2 = TfidfVectorizer()
	# NOTE(review): the name `re` shadows the stdlib regex module if it is ever
	# imported in this script; kept unchanged here to preserve existing names.
	re = tfidf2.fit_transform(corpus)
	print(re)
No results found