TF-IDF简介
TF-IDF使用流程
见上图,步骤如下:
1. 根据全部语料,计算每个词对应的TF-IDF值;2. 将每个句子表示为基于全部语料词表的one-hot编码;3. 将TF-IDF编码作为特征。
Python代码
流程很简单,这里就不从零实现了,直接使用scikit-learn。下面给出两种使用方式:第一种逐步梳理整个流程,第二种是工程上能够简化代码的写法:
def get_text():
    """Return the toy training corpus shared by both demos.

    Returns:
        list[str]: Three English sentences treated as the whole corpus.
    """
    sentence_list = [  # pretend this is the complete training corpus
        "nlp drives computer programs that translate text from one language to another",
        "nlp combines computational linguistics rule based modeling of human language with statistical",
        "nlp model respond to text or voice data and respond with text",
    ]
    return sentence_list


def _feature_names(vectorizer):
    """Return the fitted vocabulary of *vectorizer* as a plain list.

    ``get_feature_names()`` was deprecated in scikit-learn 1.0 and removed
    in 1.2 in favour of ``get_feature_names_out()``. Prefer the new method
    and fall back to the old one so the demo runs on either side of that
    change.
    """
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is not None:
        # The new API returns an ndarray; convert it so the printed output
        # keeps the list format of the old API.
        return list(getter())
    return vectorizer.get_feature_names()


def main():
    """Walk through the TF-IDF pipeline one step at a time.

    Fits a ``CountVectorizer`` and a ``TfidfTransformer`` on the corpus from
    ``get_text()``, prints the learned vocabulary, and prints the dense
    TF-IDF encoding of one sample sentence.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    print("逐个流程梳理:")
    sentence_list = get_text()

    # 1. Count term occurrences (TF) for every sentence in the corpus.
    count_vectorizer = CountVectorizer()
    word_count_vector = count_vectorizer.fit_transform(sentence_list)

    # 2. Learn the IDF weights from the count matrix.
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(word_count_vector)

    # TF-IDF values for the whole corpus. Not used below; kept to show the
    # complete flow.
    tf_idf_vector = tfidf_transformer.transform(word_count_vector)

    # Usage: inspect the vocabulary and encode an arbitrary sentence.
    print("全部语料:", _feature_names(count_vectorizer))
    my_sentence = "nlp combines computational linguistics"
    print(
        "转换任意的一个句子:",
        tfidf_transformer.transform(
            count_vectorizer.transform([my_sentence])
        ).todense(),
    )


def main2():
    """Run the same demo with the one-step ``TfidfVectorizer`` shortcut.

    Fits a ``TfidfVectorizer`` on the corpus from ``get_text()``, prints the
    learned vocabulary, and prints the dense TF-IDF encoding of one sample
    sentence.
    """
    print("简写操作:")
    from sklearn.feature_extraction.text import TfidfVectorizer

    sentence_list = get_text()
    tfidf_vectorizer = TfidfVectorizer(use_idf=True)
    tfidf_vectorizer.fit(sentence_list)

    # Usage: inspect the vocabulary and encode an arbitrary sentence.
    print("全部语料:", _feature_names(tfidf_vectorizer))
    my_sentence = "nlp combines computational linguistics"
    print("转换任意的一个句子:", tfidf_vectorizer.transform([my_sentence]).todense())


if __name__ == '__main__':
    main()
    main2()