#!/usr/bin/env python
# coding: utf-8

# In[1]:

import os, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # step 1: switch to a Chinese-capable sans-serif font
plt.rcParams['axes.unicode_minus'] = False    # step 2: fix minus-sign rendering on the axes

# # Read the data

# In[2]:

df = pd.read_csv('data/生财日历.csv', index_col=0)

# In[3]:

df.head()

# # Remove non-text data

# ## Keep Chinese characters only

# In[4]:

# Keep only the Chinese characters in a string; everything else is stripped
def translate(text):
    p = re.compile('[\u4e00-\u9fa5]')  # common Chinese characters fall in the range \u4e00-\u9fa5
    res = re.findall(p, text)
    return ''.join(res)

# In[5]:

w_pure = []
for i in df['SC2020'].values.tolist():
    w_pure.append(translate(str(i)))

# In[6]:

df['w_pure'] = w_pure

# In[7]:

df.head()

# ## Word segmentation with jieba

# In[8]:

import jieba  # segmentation library; install with `pip install jieba`

# In[9]:

wd_cut = []
for w in w_pure:
    s = list(jieba.cut(w))
    wd_cut.append(' '.join(s))  # join tokens with spaces for later use

# In[10]:

df['w_cut'] = wd_cut

# In[11]:

df.head()

# ## Remove stop words

# In[12]:

import pickle

with open('stop_words.pkl', 'rb') as file:  # HIT (Harbin Institute of Technology) stop-word list
    stop_words = pickle.load(file).copy()

# In[13]:

# Drop stop words from each segmented comment
w_stop = []
for i in wd_cut:
    sa = i.strip().split(' ')
    w_list = []
    for w in sa:
        if w not in stop_words:
            w_list.append(w)
    w_stop.append(' '.join(w_list))

# In[14]:

df['w_stop'] = w_stop

# In[15]:

df.head()

# ## Word frequencies and word cloud

# In[16]:

from PIL import Image
from collections import Counter
from wordcloud import WordCloud, ImageColorGenerator

# In[17]:

all_coms = (' '.join(w_stop)).split(' ')  # merge all comments first, then split into individual tokens
all_coms = [i for i in all_coms if 1 < len(i) <= 5]   # keep tokens of 2 to 5 characters
all_coms = [i for i in all_coms if '一' not in i]      # drop tokens containing '一'
words = Counter(all_coms).most_common(20)

# In[18]:

words  # just a list, no need to save it

# In[19]:

root_path = os.getcwd()
back_img = np.array(Image.open(os.path.join(root_path, 'data', 'background.jpg')))  # an absolute path is safest
image_colors = ImageColorGenerator(back_img)
wc = WordCloud(font_path="simhei.ttf", background_color="white",
               mask=back_img, scale=4, max_font_size=300,
               width=800, height=600, max_words=100)

# In[20]:

word_counts = dict(Counter(all_coms))  # word-frequency counts over the tokens
wc.generate_from_frequencies(word_counts)
im = wc.recolor(color_func=image_colors)

# Show the image
plt.figure(figsize=(10, 5))
plt.imshow(im)
plt.axis("off")  # hide the axes
# plt.title("生财体力2020")
plt.show()

# # Feature extraction

# In[21]:

import gensim
from gensim import corpora, models, similarities

# In[22]:

# Bag of words via doc2bow
contents_clean = [i.split(' ') for i in w_stop]

# Build the dictionary and the bag-of-words corpus
dictionary = corpora.Dictionary(contents_clean)
corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]

num_topics = 25  # number of topics to extract
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                      num_topics=num_topics)  # like K in K-means, the topic count is chosen by hand

# Word distribution for each topic
topn = 30           # how many topic words to fetch per topic
num_show_term = 10  # how many of them to display
for topic_id in range(num_topics):
    print('Topic #%d:\t' % topic_id)
    term_distribute_all = lda.get_topic_terms(topicid=topic_id, topn=topn)  # topic_id: topic index; topn: word count
    term_distribute = np.array(term_distribute_all[:num_show_term])
    term_id = term_distribute[:, 0].astype(int)  # np.int was removed from recent NumPy; use the builtin int
    print('Words: ', end="")
    for t in term_id:
        print(dictionary.id2token[t], end=' ')
    print('Probabilities: ', end="")
    print(term_distribute[:, 1])
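The cell above fixes num_topics at 25 by hand, just as K is chosen in K-means. A minimal sketch of how the value could be picked less arbitrarily with gensim's CoherenceModel; the sweep range and random_state here are illustrative assumptions, not part of the original notebook:

from gensim.models import CoherenceModel

for k in range(5, 31, 5):  # candidate topic counts -- an assumed range, adjust to taste
    model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                            num_topics=k, random_state=42)
    cm = CoherenceModel(model=model, texts=contents_clean,
                        dictionary=dictionary, coherence='c_v')
    print(k, cm.get_coherence())  # pick the k where coherence peaks

Higher c_v coherence generally means topics whose top words co-occur more consistently in the corpus, which is a reasonable proxy for interpretability.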
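Once trained, the model can also be read back against the calendar entries themselves. A small follow-up sketch, not part of the original notebook (the helper dominant_topic and the column name 'topic' are my own), that tags each row with its most probable topic via get_document_topics:

def dominant_topic(bow):
    topics = lda.get_document_topics(bow)        # list of (topic_id, probability) pairs
    return max(topics, key=lambda tp: tp[1])[0]  # keep the most probable topic

df['topic'] = [dominant_topic(bow) for bow in corpus]
df[['w_stop', 'topic']].head()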