生财日历数据分析代码及数据集【完整】

发布时间:2021-12-02 付费文章:9.9元

打赏后可以查看完整代码、停用词、数据集。

 

#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Core libraries: filesystem/regex helpers, data wrangling, numerics, plotting.
import os,re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] # Step 1: use SimHei so Chinese characters render in plots
plt.rcParams['axes.unicode_minus'] = False   # Step 2: keep the minus sign rendering correctly on axes


# # 读取数据

# In[2]:


# Load the calendar dataset; the CSV's first column is used as the row index.
df = pd.read_csv('data/生财日历.csv',index_col=0)


# In[3]:


# Preview the first five rows to sanity-check the load.
df.head()


# # 去除非文本数据

# ## 读取数据

# In[4]:


# Helper to strip everything except Chinese text from an entry.
# CJK Unified Ideographs for Chinese occupy the range \u4e00-\u9fa5.
_CHINESE_RE = re.compile(r'[\u4e00-\u9fa5]')  # compiled once, not per call

def translate(str):  # parameter name kept for backward compatibility (it shadows the builtin)
    """Return only the Chinese characters of *str*, concatenated in order.

    Digits, Latin letters, punctuation and whitespace are discarded;
    returns '' when the input contains no Chinese character.
    """
    return ''.join(_CHINESE_RE.findall(str))


# In[5]:


# Keep only the Chinese text of every SC2020 entry (NaN and numbers become
# plain strings first, then everything non-Chinese is stripped).
w_pure = [translate(str(raw)) for raw in df['SC2020'].values.tolist()]


# In[6]:


# Store the cleaned text as a new column.
df['w_pure'] = w_pure


# In[7]:


# Quick look at the result.
df.head()


# ## 结巴分词

# In[8]:


import jieba # Chinese word segmentation library; pip install jieba


# In[9]:


# Segment each cleaned entry and re-join the tokens with single spaces,
# the format the downstream stop-word filter expects.
wd_cut = [' '.join(jieba.cut(entry)) for entry in w_pure]


# In[10]:


# Keep the segmented text alongside the raw data.
df['w_cut'] = wd_cut


# In[11]:


df.head()


# ## 去除停用词

# In[12]:


import pickle

# Load the HIT (Harbin Institute of Technology) stop-word list.
# `with` guarantees the file handle is closed (the original leaked it).
# NOTE(review): pickle.load on an untrusted file can execute arbitrary code;
# fine here only because the pkl ships with the project.
with open('stop_words.pkl','rb') as file: # 哈工大停用词 -> HIT stop words
    stop_words = pickle.load(file).copy()


# In[13]:


# Remove stop words from every segmented entry.
_stop_set = set(stop_words)  # set gives O(1) membership tests inside the loop
w_stop = []
for sent in wd_cut:
    kept = [w for w in sent.strip().split(' ') if w not in _stop_set]
    w_stop.append(' '.join(kept))


# In[14]:


# Persist the filtered text as another column.
df['w_stop'] = w_stop


# In[15]:


df.head()


# ## 词频及词云

# In[16]:


from PIL import Image
from collections import Counter
from wordcloud import WordCloud , ImageColorGenerator


# In[17]:


# Merge every filtered entry into one token stream, then keep tokens that
# are 2-5 characters long and do not contain the character '一'.
all_coms = ' '.join(w_stop).split(' ')
all_coms = [tok for tok in all_coms if 1 < len(tok) <= 5 and '一' not in tok]
words = Counter(all_coms).most_common(20)  # top-20 most frequent tokens


# In[18]:


words  # plain list of (token, count) pairs; not saved anywhere


# In[19]:


# Prepare the word-cloud renderer, using the background image as its mask.
root_path = os.getcwd()
# os.path.join builds a portable path (the original hard-coded Windows '\\'
# separators, which breaks on Linux/macOS).
back_img  = np.array(Image.open(os.path.join(root_path, "data", "background.jpg")))
image_colors = ImageColorGenerator(back_img)  # sample colors from the mask image
wc = WordCloud(font_path="simhei.ttf",background_color="white", mask=back_img,scale=4,
           max_font_size=300, width=800, height=600,max_words=100)


# In[20]:


# Full frequency table over every surviving token.
word_counts = dict(Counter(all_coms))
wc.generate_from_frequencies(word_counts)
# Tint the words with colors sampled from the background image.
cloud = wc.recolor(color_func=image_colors)

plt.figure(figsize=(10, 5))
plt.imshow(cloud)
plt.axis("off")  # hide the axes around the image
plt.show()


# # 特征提取

# In[21]:


import gensim
from gensim import corpora, models, similarities


# In[22]:


# Bag-of-words features via doc2bow.
contents_clean = [doc.split(' ') for doc in w_stop]

# Build the vocabulary, then map each document to (token_id, count) pairs.
dictionary = corpora.Dictionary(contents_clean)
corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]
num_topics = 25  # number of latent topics to extract (chosen like K in K-means)
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)

# Print the word distribution of each topic.
topn = 30           # how many top words to fetch per topic
num_show_term = 10  # how many of those to actually display
for topic_id in range(num_topics):
    print('主题#%d:\t' % topic_id)
    term_distribute_all = lda.get_topic_terms(topicid=topic_id, topn=topn)
    term_distribute = np.array(term_distribute_all[:num_show_term])
    # np.int was removed in NumPy 1.20+; the builtin int is the correct dtype here.
    term_id = term_distribute[:, 0].astype(int)
    print('词:', end="")
    for t in term_id:
        print(dictionary.id2token[t], end=' ')
    print('概率:', end="")
    print(term_distribute[:, 1])