Shengcai Calendar (生财日历) Data Analysis Code and Dataset [Complete]
Published: 2021-12-02
Paid article: ¥9.9
After paying, you can view the full code, the stopword list, and the dataset.
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import os, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # Step 1: use the SimHei font so Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False    # Step 2: fix the minus-sign display on axes
# # Read the data
# In[2]:
df = pd.read_csv('data/生财日历.csv', index_col=0)
# In[3]:
df.head()
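# Optional sanity check: df.info() summarizes row count, column dtypes, and
# non-null counts, which is useful before the text cleaning below.
df.info()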
# # Remove non-text data
# ## Extract Chinese characters
# In[4]:
# Helper that keeps only Chinese characters (drops digits, punctuation, emoji, etc.)
def translate(text):
    p = re.compile(r'[\u4e00-\u9fa5]')  # Unicode range of common Chinese characters
    res = re.findall(p, text)
    result = ''.join(res)
    return result
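# Quick check of the helper on a made-up string: only the Chinese characters survive.
print(translate('生财日历2020,Hello!'))  # prints: 生财日历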
# In[5]:
w_pure = []
for i in df['SC2020'].values.tolist():
    w_pure.append(translate(str(i)))
# In[6]:
df['w_pure'] = w_pure
# In[7]:
df.head()
# ## Word segmentation with jieba
# In[8]:
import jieba  # word segmentation library; install with: pip install jieba
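# Optional: jieba sometimes splits domain-specific terms; jieba.add_word() can
# register them before cutting. The two terms below are illustrative examples only.
jieba.add_word('生财有术')
jieba.add_word('私域流量')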
# In[9]:
wd_cut = []
for w in w_pure:
    s = list(jieba.cut(w))
    wd_cut.append(' '.join(s))  # join tokens with spaces for later use
# In[10]:
df['w_cut'] = wd_cut
# In[11]:
df.head()
# ## Remove stopwords
# In[12]:
import pickle
with open('stop_words.pkl', 'rb') as file:  # HIT (Harbin Institute of Technology) stopword list
    stop_words = set(pickle.load(file))  # a set makes the membership checks below faster
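# If the pickled list is not at hand, a plain-text stopword file works just as
# well. The sketch below assumes a UTF-8 file with one word per line; the file
# name 'stop_words.txt' is only an example.
# with open('stop_words.txt', encoding='utf-8') as f:
#     stop_words = set(line.strip() for line in f if line.strip())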
# In[13]:
# Remove stopwords
w_stop = []
for i in wd_cut:
    sa = i.strip().split(' ')
    w_list = []
    for w in sa:
        if w not in stop_words:  # skip stopwords
            w_list.append(w)
    sis = ' '.join(w_list)
    w_stop.append(sis)
# In[14]:
df['w_stop'] = w_stop
# In[15]:
df.head()
# ## Word frequency and word cloud
# In[16]:
from PIL import Image
from collections import Counter
from wordcloud import WordCloud, ImageColorGenerator
# In[17]:
all_coms = (' '.join(w_stop)).split(' ') # 先合并评论,后对所有评论进行拆分
all_coms = [i for i in all_coms if 1<len(i)<=5]
all_coms = [i for i in all_coms if '一' not in i]
words = Counter(all_coms).most_common(20)
# In[18]:
words  # this is a plain list, so it is not saved anywhere
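# Optional: a horizontal bar chart of the 20 most frequent words complements the
# word cloud below (plain matplotlib, reusing the SimHei setup from above).
top_words, top_counts = zip(*words)
plt.figure(figsize=(8, 6))
plt.barh(range(len(top_words)), top_counts)
plt.yticks(range(len(top_words)), top_words)
plt.gca().invert_yaxis()  # put the most frequent word at the top
plt.xlabel('frequency')
plt.tight_layout()
plt.show()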
# In[19]:
root_path = os.getcwd()
back_img = np.array(Image.open(os.path.join(root_path, "data", "background.jpg")))  # an absolute path is safest here
image_colors = ImageColorGenerator(back_img)
wc = WordCloud(font_path="simhei.ttf", background_color="white", mask=back_img, scale=4,
               max_font_size=300, width=800, height=600, max_words=100)
# In[20]:
word_counts = dict(Counter(all_coms))  # word frequency counts for the word cloud
wc.generate_from_frequencies(word_counts)
im = wc.recolor(color_func=image_colors)
# Display the image
plt.figure(figsize=(10, 5))
plt.imshow(im)
plt.axis("off")  # hide the axes
# plt.title("生财日历2020")
plt.show()
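# Optional: persist the rendered word cloud as an image file (the output name is arbitrary).
wc.to_file('wordcloud_sc2020.png')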
# # Feature extraction
# In[21]:
import gensim
from gensim import corpora, models, similarities
# In[22]:
# Bag-of-words representation via doc2bow
contents_clean = [i.split(' ') for i in w_stop]
# Build the dictionary
dictionary = corpora.Dictionary(contents_clean)
corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]
num_topics = 25  # number of topics to extract
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)  # like K in KMeans, the topic count is chosen by hand
# Word distribution of each topic
topn = 30           # how many top words to retrieve per topic
num_show_term = 10  # how many of them to print
for topic_id in range(num_topics):
    print('Topic #%d:\t' % topic_id)
    term_distribute_all = lda.get_topic_terms(topicid=topic_id, topn=topn)  # topic_id: topic index; topn: number of words
    term_distribute = np.array(term_distribute_all[:num_show_term])
    term_id = term_distribute[:, 0].astype(int)  # np.int is removed in recent NumPy; use the builtin int
    print('Words: ', end="")
    for t in term_id:
        print(dictionary[t], end=' ')  # dictionary[t] looks up the token (and populates id2token lazily)
    print('Probabilities: ', end="")
    print(term_distribute[:, 1])
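# Two optional follow-ups (sketches; the variable names here are chosen for illustration).
# 1) Topic coherence gives a rough way to compare candidate num_topics values
#    instead of fixing 25 by hand.
from gensim.models import CoherenceModel
cm = CoherenceModel(model=lda, texts=contents_clean, dictionary=dictionary, coherence='c_v')
print('coherence (c_v):', cm.get_coherence())
# 2) The dominant topic of each calendar entry can be written back to the
#    DataFrame so entries can be browsed by topic.
dominant = []
for bow in corpus:
    doc_topics = lda.get_document_topics(bow)
    dominant.append(max(doc_topics, key=lambda x: x[1])[0] if doc_topics else -1)
df['topic'] = dominant
df.head()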