# Python词云图程序代码 (Python word-cloud program)
## 2020 新保险合同 (2020: new insurance contracts)
%matplotlib inline
%config InlineBackend.close_figures = False

import collections
import urllib
import urllib.request

import bs4
import jieba
import matplotlib.pyplot as plt
import numpy as np
from bs4 import BeautifulSoup
from matplotlib import rcParams
from PIL import Image
from wordcloud import WordCloud, STOPWORDS
# Download the source page and reduce it to plain text.
my_url = "https://www.casc.org.cn/2018/0815/213104.shtml"
# The response is used as a context manager so the connection is released
# even if parsing raises.
with urllib.request.urlopen(my_url) as wp:
    soup = BeautifulSoup(wp, 'lxml')
text = soup.get_text().strip()
print(text)
# Substrings deleted from the raw text before word segmentation.
# They are removed one after another IN THIS ORDER: deleting one entry can
# bring two characters together that a later multi-character entry then
# matches, so do not reorder the multi-character entries.
# NOTE(review): the first comma literal was garbled in the source and ":" was
# listed twice; both are reconstructed here as the full-width form next to the
# half-width one, and the other ASCII punctuation marks get their full-width
# twins as well because the input is Chinese text - confirm against the
# original notebook.
# NOTE(review): single characters such as "不", "本", "日" or "时" are removed
# from inside longer words too, which changes how the text is segmented.
_noise_tokens = (
    # punctuation and whitespace
    "，", ";", "；", "!", "！", "?", "？", '“', '”',
    "(", "（", ")", "）", "。", "：", ":", " ",
    # common function characters
    "的", "了", "在", "是", "也",
    # layout characters and remaining symbols
    "\n", "、", "\xa0", "―", ">", "\u3000", "\r", "《", "》", ",",
    # domain-specific filler
    "应当", "我们", "保险",
    "和", "或", "与", "日", "本", "时", "将", "组", "再", "为", "不",
    "会计准则",
)
for _token in _noise_tokens:
    text = text.replace(_token, "")

# Segment the cleaned text into a list of words.
Textsplit = jieba.lcut(text)
print(Textsplit)
|
|
# Count how often each segmented word occurs, printing a running trace.
i = 0  # remains 0 when Textsplit is empty, so the summary print still works
wordcount = {}
for i, word in enumerate(Textsplit, start=1):
    wordcount[word] = wordcount.get(word, 0) + 1
    # Trace line: position, word length, the word, and its count so far.
    # NOTE(review): the separator literals of this call were garbled in the
    # source; reconstructed as plain comma separators - confirm.
    print(i, len(word), ",", word, ",", wordcount[word])
# Summary: number of tokens processed and the complete tally.
print(i, wordcount)
|
# Rank the words by frequency, keep those of 2-4 characters, and draw the cloud.
items = sorted(wordcount.items(), key=lambda x: x[1], reverse=True)

# Short labels that are expanded to their full term before plotting.
_renames = {"期初": "期初余额", "资产": "资产负债表"}

_kept_words = []
_frequencies = {}
for word, count in items:
    if not 1 < len(word) < 5:
        continue
    word = _renames.get(word, word)
    print(word, count)
    _kept_words.append(word)
    # Sum rather than overwrite: a renamed word may coincide with a word that
    # is already present under its full name.
    _frequencies[word] = _frequencies.get(word, 0) + count

# Space-separated list of the kept words, most frequent first.
text1 = "".join(word + " " for word in _kept_words)

# Feed the counts to the cloud directly. text1 holds each kept word only once
# per entry, so generating from it would discard the frequencies computed
# above and every word would carry the same weight.
# Raw string: the Windows path contains backslashes that are not valid escapes.
WC = WordCloud(
    background_color="white",
    font_path=r'C:\Windows\Fonts\SimYou.ttf',
).generate_from_frequencies(_frequencies)

rcParams["figure.figsize"] = (12, 4)
plt.imshow(WC)
plt.axis('off')
plt.show()
|
# Print the 25 most frequent words and collect them into one string.
d = collections.Counter(wordcount)
_top_words = d.most_common(25)
for word, count in _top_words:
    print(word, ":", count)
# Leading space, then every word followed by a single space.
text = " " + "".join(_w + " " for _w, _ in _top_words)

# String form of the full segmented word list.
Text = str(Textsplit)
|
# Export the most frequent words as "word,count" rows.
d = collections.Counter(wordcount)
Record = ""
filename = "D:/lzj/Word1.csv"
# The with-block closes the file even if a write fails. The encoding is set
# explicitly because the rows contain Chinese text and the platform default
# may be unable to encode it; the BOM of utf-8-sig lets spreadsheet programs
# detect UTF-8.
with open(filename, "w", encoding="utf-8-sig") as f:
    # Only 2-4 character words among the top 50 are written, so the file may
    # hold fewer than 50 rows.
    for word, count in d.most_common(50):
        if 1 < len(word) < 5:
            # NOTE(review): the separator and the replace() arguments were
            # garbled in the source; reconstructed as a comma-separated row
            # with spaces stripped - confirm.
            Record = (word + "," + str(count)).replace(" ", "")
            #print(Record)
            f.write(Record + "\n")
|
## 2006 2009 旧保险合同准则 (2006/2009: old insurance contract standards)
%matplotlib inline %config InlineBackend.close_figures = False import numpy as np import matplotlib.pyplot as plt
from matplotlib import rcParams from PIL import Image
import urllib import bs4 from bs4 import BeautifulSoup
from wordcloud import WordCloud,STOPWORDS import collections import jieba |
# Load the local text file through a file: URL and reduce it to plain text.
# (The original also bound `text` to the URL string in a chained assignment;
# that value was overwritten below before any use, so it is dropped.)
my_url = "file:D:/lzj2/jiu.txt"
# The response is used as a context manager so the handle is released even if
# parsing raises.
with urllib.request.urlopen(my_url) as wp:
    soup = BeautifulSoup(wp, 'lxml')
text = soup.get_text().strip()
print(text)
# Substrings deleted from the raw text before word segmentation.
# They are removed one after another IN THIS ORDER: deleting one entry can
# bring two characters together that a later multi-character entry then
# matches, so do not reorder the multi-character entries.
# NOTE(review): the first comma literal was garbled in the source and ":" was
# listed twice; both are reconstructed here as the full-width form next to the
# half-width one, and the other ASCII punctuation marks get their full-width
# twins as well because the input is Chinese text - confirm against the
# original notebook.
# NOTE(review): single characters such as "人", "有", "原" or "时" are removed
# from inside longer words too, which changes how the text is segmented.
_noise_tokens = (
    # punctuation and whitespace
    "，", ";", "；", "!", "！", "?", "？", '“', '”',
    "(", "（", ")", "）", "。", "：", ":", " ",
    # common function characters
    "的", "了", "在", "是", "也",
    # layout characters and remaining symbols
    "\n", "、", "\xa0", "―", ">", "\u3000", "\r", "《", "》", ",",
    # domain-specific filler
    "有", "我们", "和", "保险", "应当", "确定", "相关",
    "原", "为", "与", "人", "—", "未", "再",
    # NOTE(review): ". " can never match because every space was already
    # removed by the " " entry above; kept unchanged - confirm whether "."
    # was intended.
    ". ",
    "时", "已",
)
for _token in _noise_tokens:
    text = text.replace(_token, "")

# Segment the cleaned text into a list of words.
Textsplit = jieba.lcut(text)
print(Textsplit)
|
|
# Count how often each segmented word occurs, printing a running trace.
i = 0  # remains 0 when Textsplit is empty, so the summary print still works
wordcount = {}
for i, word in enumerate(Textsplit, start=1):
    wordcount[word] = wordcount.get(word, 0) + 1
    # Trace line: position, word length, the word, and its count so far.
    # NOTE(review): the separator literals of this call were garbled in the
    # source; reconstructed as plain comma separators - confirm.
    print(i, len(word), ",", word, ",", wordcount[word])
# Summary: number of tokens processed and the complete tally.
print(i, wordcount)
|
# Rank the words by frequency, keep those of 2-4 characters, and draw the cloud.
items = sorted(wordcount.items(), key=lambda x: x[1], reverse=True)

# Short labels that are expanded to their full term before plotting.
_renames = {"期初": "期初余额", "资产": "资产负债表"}

_kept_words = []
_frequencies = {}
for word, count in items:
    if not 1 < len(word) < 5:
        continue
    word = _renames.get(word, word)
    print(word, count)
    _kept_words.append(word)
    # Sum rather than overwrite: a renamed word may coincide with a word that
    # is already present under its full name.
    _frequencies[word] = _frequencies.get(word, 0) + count

# Space-separated list of the kept words, most frequent first.
text1 = "".join(word + " " for word in _kept_words)

# Feed the counts to the cloud directly. text1 holds each kept word only once
# per entry, so generating from it would discard the frequencies computed
# above and every word would carry the same weight.
# Raw string: the Windows path contains backslashes that are not valid escapes.
WC = WordCloud(
    background_color="white",
    font_path=r'C:\Windows\Fonts\SimYou.ttf',
).generate_from_frequencies(_frequencies)

rcParams["figure.figsize"] = (12, 4)
plt.imshow(WC)
plt.axis('off')
plt.show()
|
# Print the 25 most frequent words and collect them into one string.
d = collections.Counter(wordcount)
_top_words = d.most_common(25)
for word, count in _top_words:
    print(word, ":", count)
# Leading space, then every word followed by a single space.
text = " " + "".join(_w + " " for _w, _ in _top_words)

# String form of the full segmented word list.
Text = str(Textsplit)
|
|
# Export the most frequent words as "word,count" rows.
d = collections.Counter(wordcount)
Record = ""
filename = "D:/lzj2/Word2.csv"
# The with-block closes the file even if a write fails. The encoding is set
# explicitly because the rows contain Chinese text and the platform default
# may be unable to encode it; the BOM of utf-8-sig lets spreadsheet programs
# detect UTF-8.
with open(filename, "w", encoding="utf-8-sig") as f:
    # Only 2-4 character words among the top 50 are written, so the file may
    # hold fewer than 50 rows.
    for word, count in d.most_common(50):
        if 1 < len(word) < 5:
            # NOTE(review): the separator and the replace() arguments were
            # garbled in the source; reconstructed as a comma-separated row
            # with spaces stripped - confirm.
            Record = (word + "," + str(count)).replace(" ", "")
            #print(Record)
            f.write(Record + "\n")
|