附录2:TF-IDF值计算及K-Means聚类

# coding=utf-8
"""Appendix 2: TF-IDF computation and K-Means clustering.

Reads a tokenised corpus (one document per line, words separated by
spaces) from 'movie_top_jieba.txt', computes the document/term TF-IDF
matrix, writes it to 'movie_TF-IDF.txt', then clusters the documents
with K-Means and records the centres, labels and inertia in
'movie_kmeans.txt'.

Ported to Python 3 / current scikit-learn: the original relied on
Python 2 `print` statements and `reload(sys)`/`setdefaultencoding`,
imported the removed `sklearn.externals.joblib`, leaked the input file
handle, appended to its output files on every rerun, and seeded the
stdlib `random` module, which has no effect on scikit-learn's KMeans.
"""
import codecs

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


def _load_corpus(path):
    """Return the non-empty, stripped lines of *path* as a list of documents.

    NOTE(review): the original read the file in binary mode under a forced
    'gbk' default encoding; utf-8 is assumed here — confirm against the
    actual encoding of the corpus file.
    """
    with codecs.open(path, 'r', 'utf-8') as handle:
        return [line.strip() for line in handle if line.strip()]


def _compute_tfidf(corpus):
    """Return (words, weight) for *corpus*.

    words  -- the vocabulary, one entry per matrix column
    weight -- dense matrix; weight[i][j] is the TF-IDF value of word j
              in document i
    """
    # Term-frequency matrix: a[i][j] = count of word j in document i.
    vectorizer = CountVectorizer()
    # Converts the raw counts into TF-IDF weights.
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # get_feature_names() was removed in scikit-learn 1.2; keep a fallback
    # so the script still runs on older installations.
    try:
        words = list(vectorizer.get_feature_names_out())
    except AttributeError:
        words = vectorizer.get_feature_names()
    return words, tfidf.toarray()


def _write_tfidf(path, words, weight):
    """Write the vocabulary row, then the full TF-IDF matrix, to *path*.

    Opens the file in 'w' mode so re-running the script does not append a
    duplicate copy of the matrix (the original used 'a').
    """
    with codecs.open(path, 'w', 'utf-8') as result:
        for word in words:
            result.write(word + ' ')
        result.write('\r\n\r\n')
        # One row per document, one TF-IDF value per vocabulary word.
        for row in weight:
            for value in row:
                result.write(str(value) + ' ')
            result.write('\r\n\r\n')


def _cluster(weight, path, n_clusters=8, random_state=999):
    """Run K-Means on *weight*, write a summary to *path*, and print it.

    random_state makes the clustering reproducible: the original called
    random.seed(999), which does not influence scikit-learn's own RNG.
    """
    clf = KMeans(n_clusters=n_clusters, random_state=random_state)
    s = clf.fit(weight)
    with codecs.open(path, 'w', 'utf8') as result:
        result.writelines('调用K-Means聚类:' + '\n' + str(s) + '\n')
        for each in clf.cluster_centers_:
            result.writelines(str(each) + '\n')
        result.writelines(str(clf.labels_) + '\n')
        result.writelines(str(clf.inertia_) + '\n')
    print('调用K-Means聚类:')
    print(s)
    print('聚类中心点:')
    print(clf.cluster_centers_)  # n_clusters centre vectors
    print('样本所属的簇:')
    print(clf.labels_)  # cluster label of every sample
    print('点到对应簇的距离之和:')
    # inertia_ helps judge whether n_clusters is appropriate: smaller means
    # tighter clusters; pick the elbow when varying the cluster count.
    print(clf.inertia_)


if __name__ == "__main__":
    corpus = _load_corpus('movie_top_jieba.txt')
    vocabulary, tfidf_weight = _compute_tfidf(corpus)
    _write_tfidf("movie_TF-IDF.txt", vocabulary, tfidf_weight)
    _cluster(tfidf_weight, 'movie_kmeans.txt')