datamining-final / gradioWordCloud.py
nelson40514's picture
Upload folder using huggingface_hub
d737845 verified
import matplotlib.pyplot as plt
import numpy as np
import requests
import jieba
import csv
from PIL import Image
from collections import Counter
from wordcloud import WordCloud
def wordCloud(csv_url: str) -> "Image.Image":
    """Build a word-cloud image from a CSV of news articles.

    The CSV at ``csv_url`` is downloaded to ``wordCloudData/news.csv`` and
    must contain ``Title`` and ``Content`` columns. Their text is cleaned,
    segmented with jieba, filtered against the stop-word list and rendered
    into ``wordCloudData/news_Wordcloud.png`` using ``car.jpg`` as the mask.

    Args:
        csv_url: URL of the CSV file to download.

    Returns:
        The rendered word cloud, opened from the saved PNG file.

    Raises:
        requests.HTTPError: If the download responds with an error status.
        KeyError: If the CSV lacks a ``Title`` or ``Content`` column.
    """
    # Download the CSV. Fail fast on an HTTP error instead of saving and
    # parsing an error page, and never hang forever on a dead server.
    r = requests.get(csv_url, timeout=30)
    r.raise_for_status()
    with open('wordCloudData/news.csv', 'wb') as f:
        f.write(r.content)

    # Read the articles back from news.csv.
    with open('wordCloudData/news.csv', 'r', encoding='utf-8-sig') as f:
        news = [
            {'title': row['Title'], 'content': row['Content']}
            for row in csv.DictReader(f)
        ]

    # Substrings replaced by a space before segmentation: line breaks,
    # HTML breaks, a few very common function words, and '113'.
    replace_list = [
        '\n',
        '\r',
        '<br />',
        '<br>',
        '的',
        '及',
        '於',
        '並',
        '113',
    ]

    with open('wordCloudData/news.txt', 'w', encoding='utf-8') as f:
        for n in news:
            title = n['title']
            content = n['content']
            if title is None or content is None:
                # csv.DictReader fills the missing fields of a short row
                # with None; skip such records rather than crash.
                continue
            for replace_word in replace_list:
                title = title.replace(replace_word, ' ')
                content = content.replace(replace_word, ' ')
            f.write(title + '\n')
            f.write(content + '\n')
            f.write('\n')

    # Read the cleaned text back.
    with open('wordCloudData/news.txt', 'r', encoding='utf-8') as f:
        text = f.read()

    jieba.set_dictionary('wordCloudData/dict.txt')

    # Load the stop words; a set gives O(1) membership tests per token.
    with open('wordCloudData/stopWord_test.txt', 'r', encoding='utf-8-sig') as f:
        stops = set(f.read().split('\n'))

    # Segment the text into terms and count every term that is neither a
    # stop word nor a bare newline.
    diction = Counter(
        t for t in jieba.cut(text, cut_all=False)
        if t not in stops and t != '\n'
    )

    font = 'wordCloudData/msyh.ttc'  # font used to draw the words
    mask = np.array(Image.open("wordCloudData/car.jpg"))  # shape of the cloud

    # The background defaults to black; use white instead.
    wordcloud = WordCloud(background_color="white", mask=mask, font_path=font)
    wordcloud.generate_from_frequencies(frequencies=diction)

    # The image is written straight to disk. No pyplot figure is created:
    # it was never shown or saved, and one leaked on every call.
    wordcloud.to_file("wordCloudData/news_Wordcloud.png")
    return Image.open("wordCloudData/news_Wordcloud.png")