# Python word cloud program code (Python词云图程序代码)

## 2020 new insurance contracts standard (2020新保险合同)

%matplotlib inline

%config InlineBackend.close_figures = False

import numpy as np

import matplotlib.pyplot as plt

from matplotlib import rcParams

from PIL import Image

import urllib

import bs4

from bs4 import BeautifulSoup

from wordcloud import WordCloud,STOPWORDS

import collections

import jieba

# Fetch the page at `my_url`, parse it with BeautifulSoup/lxml and keep only
# its visible text in `text`.
# `import urllib` on its own does not load the `request` submodule, so it is
# imported explicitly here to make `urllib.request.urlopen` reliably available.
import urllib.request

my_url = "https://www.casc.org.cn/2018/0815/213104.shtml"

# The response is consumed while the soup is built; the `with` block closes
# the connection as soon as parsing is done.
with urllib.request.urlopen(my_url) as wp:
    soup = BeautifulSoup(wp, 'lxml')

text = soup.get_text().strip()
print(text)

# Delete punctuation, whitespace and filler words from `text` before it is
# segmented.
#
# The substrings are removed one after another in exactly this order. The
# order matters: deleting a character can make two neighbours adjacent, and
# multi-character entries only match what is adjacent at the moment they run.
#
# NOTE(review): the first entry was garbled in the source (`replace(", "")`);
# it is reconstructed as the ASCII comma — confirm against the original.
_STRIP_TOKENS = (
    ",", "；", "！", "？", '“', '”', "(", ")", "。", "：", ":", " ",
    "的", "了", "在", "是", "也",
    "\n", "、", "\xa0", "―", ">", "\u3000", "\r", "《", "》", "，",
    "应当", "我们", "保险",
    "和", "或", "与", "日", "本", "时", "将", "组", "再", "为", "不",
    "会计准则",
)
for _token in _STRIP_TOKENS:
    text = text.replace(_token, "")

# Segment the cleaned text with jieba and count how often each token occurs.
Textsplit = jieba.lcut(text)
print(Textsplit)

i = 0  # stays 0 when the segmentation is empty; otherwise ends as the token total
wordcount = {}
for i, word in enumerate(Textsplit, start=1):
    wordcount[word] = wordcount.get(word, 0) + 1
    # Trace output: position, token length, token, running count of the token.
    # NOTE(review): this print was garbled in the source; the "," separators
    # are reconstructed — confirm against the original.
    print(i, len(word), ",", word, ",", wordcount[word])

print(i, wordcount)

# Rank the tokens by descending frequency and build `text1`, the
# space-separated string handed to the word cloud.
items = sorted(wordcount.items(), key=lambda x: x[1], reverse=True)

# Two short tokens are replaced by a longer term before being displayed.
_EXPANSIONS = {"期初": "期初余额", "资产": "资产负债表"}

_kept = []
for word, count in items:
    # Only tokens of 2 to 4 characters are kept. The length test runs on the
    # raw token, so an expanded term may be longer than 4 characters.
    if 1 < len(word) < 5:
        word = _EXPANSIONS.get(word, word)
        print(word, count)
        _kept.append(word + " ")

# Every kept word is followed by a single space, including the last one.
text1 = "".join(_kept)

# Render the word cloud from `text1` and show it without axes.
# The font path is a raw string: in a normal literal `\W`, `\F` and `\S` are
# invalid escape sequences and trigger a warning on current Python versions.
# The resulting path value is unchanged.
WC = WordCloud(
    background_color="white",
    font_path=r'C:\Windows\Fonts\SimYou.ttf',
).generate(text1)

# Set the default figure size before the figure is created by imshow().
rcParams["figure.figsize"] = (12, 4)
plt.imshow(WC)
plt.axis('off')
plt.show()

# Print the 25 most frequent tokens and collect them into a space-separated
# string. This rebinds `text`, replacing the cleaned document text.
d = collections.Counter(wordcount)

text = " "
for word, count in d.most_common(25):
    print(word, ":", count)
    text = text + word + " "

# Write the most frequent tokens to a CSV file, one "word,count" row per line.
Text = str(Textsplit)

d = collections.Counter(wordcount)
Record = ""
filename = "D:/lzj/Word1.csv"

# NOTE(review): the file is opened with the platform default encoding, as in
# the original; pass an explicit `encoding=` if the CSV must be portable.
with open(filename, "w") as f:
    # Only tokens of 2 to 4 characters among the top 50 are exported.
    for word, count in d.most_common(50):
        if 1 < len(word) < 5:
            # NOTE(review): both literals below were garbled in the source and
            # are reconstructed as "," and (" ", "") — confirm.
            Record = word + "," + str(count)
            Record = Record.replace(" ", "")
            f.write(Record + "\n")

## 2006 / 2009 old insurance contracts standard (2006 2009 旧保险合同准则)

%matplotlib inline

%config InlineBackend.close_figures = False

import numpy as np

import matplotlib.pyplot as plt

from matplotlib import rcParams

from PIL import Image

import urllib

import bs4

from bs4 import BeautifulSoup

from wordcloud import WordCloud,STOPWORDS

import collections

import jieba

# Read the local file referenced by `my_url`, parse it with BeautifulSoup/lxml
# and keep only its visible text in `text`.
# `import urllib` on its own does not load the `request` submodule, so it is
# imported explicitly here to make `urllib.request.urlopen` reliably available.
import urllib.request

# The original also bound `text` to this URL in a chained assignment; that
# value was overwritten below before any use, so the extra binding is dropped.
my_url = "file:D:/lzj2/jiu.txt"

# The handle is consumed while the soup is built; the `with` block closes it
# as soon as parsing is done.
with urllib.request.urlopen(my_url) as wp:
    soup = BeautifulSoup(wp, 'lxml')

text = soup.get_text().strip()
print(text)

# Delete punctuation, whitespace and filler words from `text` before it is
# segmented.
#
# The substrings are removed one after another in exactly this order. The
# order matters: deleting a character can make two neighbours adjacent, and
# multi-character entries only match what is adjacent at the moment they run.
#
# NOTE(review): the first entry was garbled in the source (`replace(", "")`);
# it is reconstructed as the ASCII comma — confirm against the original.
# The ". " entry can never match, because all spaces are removed earlier in
# the sequence; it is kept to mirror the original list.
_STRIP_TOKENS = (
    ",", "；", "！", "？", '“', '”', "(", ")", "。", "：", ":", " ",
    "的", "了", "在", "是", "也",
    "\n", "、", "\xa0", "―", ">", "\u3000", "\r", "《", "》", "，",
    "有", "我们", "和", "保险", "应当", "确定", "相关",
    "原", "为", "与", "人", "—", "未", "再", ". ", "时", "已",
)
for _token in _STRIP_TOKENS:
    text = text.replace(_token, "")

# Segment the cleaned text with jieba and count how often each token occurs.
Textsplit = jieba.lcut(text)
print(Textsplit)

i = 0  # stays 0 when the segmentation is empty; otherwise ends as the token total
wordcount = {}
for i, word in enumerate(Textsplit, start=1):
    wordcount[word] = wordcount.get(word, 0) + 1
    # Trace output: position, token length, token, running count of the token.
    # NOTE(review): this print was garbled in the source; the "," separators
    # are reconstructed — confirm against the original.
    print(i, len(word), ",", word, ",", wordcount[word])

print(i, wordcount)

# Rank the tokens by descending frequency and build `text1`, the
# space-separated string handed to the word cloud.
items = sorted(wordcount.items(), key=lambda x: x[1], reverse=True)

# Two short tokens are replaced by a longer term before being displayed.
_EXPANSIONS = {"期初": "期初余额", "资产": "资产负债表"}

_kept = []
for word, count in items:
    # Only tokens of 2 to 4 characters are kept. The length test runs on the
    # raw token, so an expanded term may be longer than 4 characters.
    if 1 < len(word) < 5:
        word = _EXPANSIONS.get(word, word)
        print(word, count)
        _kept.append(word + " ")

# Every kept word is followed by a single space, including the last one.
text1 = "".join(_kept)

# Render the word cloud from `text1` and show it without axes.
# The font path is a raw string: in a normal literal `\W`, `\F` and `\S` are
# invalid escape sequences and trigger a warning on current Python versions.
# The resulting path value is unchanged.
WC = WordCloud(
    background_color="white",
    font_path=r'C:\Windows\Fonts\SimYou.ttf',
).generate(text1)

# Set the default figure size before the figure is created by imshow().
rcParams["figure.figsize"] = (12, 4)
plt.imshow(WC)
plt.axis('off')
plt.show()

# Print the 25 most frequent tokens and collect them into a space-separated
# string. This rebinds `text`, replacing the cleaned document text.
d = collections.Counter(wordcount)

text = " "
for word, count in d.most_common(25):
    print(word, ":", count)
    text = text + word + " "

# Write the most frequent tokens to a CSV file, one "word,count" row per line.
Text = str(Textsplit)

d = collections.Counter(wordcount)
Record = ""
filename = "D:/lzj2/Word2.csv"

# NOTE(review): the file is opened with the platform default encoding, as in
# the original; pass an explicit `encoding=` if the CSV must be portable.
with open(filename, "w") as f:
    # Only tokens of 2 to 4 characters among the top 50 are exported.
    for word, count in d.most_common(50):
        if 1 < len(word) < 5:
            # NOTE(review): both literals below were garbled in the source and
            # are reconstructed as "," and (" ", "") — confirm.
            Record = word + "," + str(count)
            Record = Record.replace(" ", "")
            f.write(Record + "\n")