from sklearn.feature_extraction.text import TfidfVectorizer
# 语料库 可以换为其它同样形式的单词
corpus = [
list(range(-5, 5)),
list(range(-6,4)),
list(range(12)),
list(range(13))]
# corpus = [
# ['Two', 'wrongs', 'don\'t', 'make', 'a', 'right', '.'],
# ['The', 'pen', 'is', 'mightier', 'than', 'the', 'sword'],
# ['Don\'t', 'put', 'all', 'your', 'eggs', 'in', 'one', 'basket', '.']]
def dummy_fun(doc):
return doc
tfidf_vec = TfidfVectorizer(
analyzer='word',
tokenizer=dummy_fun,
preprocessor=dummy_fun,
token_pattern=None)
# 使用 fit_transform() 得到 TF-IDF 矩阵。此为 scipy 稀疏矩阵
tfidf_matrix = tfidf_vec.fit_transform(corpus)
# print(tfidf_matrix)
# 使用 get_feature_names() 得到不重复的单词
print(tfidf_vec.get_feature_names_out())
# 得到每个单词对应的 ID
print(tfidf_vec.vocabulary_)
# 得到 corpus 中每个词得分
for i in range(len(corpus)):
column_indexes = [tfidf_vec.vocabulary_[key] for key in corpus[i]]
tf_idf = tfidf_matrix[i, column_indexes].toarray()[0]
print(tf_idf)
参考:
Applying scikit-learn TfidfVectorizer on tokenized text
sklearn.feature_extraction.text.TfidfVectorizer