实例介绍
【实例简介】
LDA一致性与困惑度分析
【核心代码】
import pandas as pd
import jieba
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
from concurrent.futures import ProcessPoolExecutor
# Load stopwords
def stopwordslist(filepath):
    """Load a stop-word list from a text file, one word per line.

    The file is decoded as ``utf-8-sig`` so that a leading byte-order mark
    (often written by Windows editors) is discarded instead of being glued
    onto the first stop word, which would silently keep that word from ever
    matching a token. Files without a BOM decode exactly as plain UTF-8.

    Args:
        filepath: Path of the stop-word file.

    Returns:
        list[str]: Every line of the file with surrounding whitespace
        stripped, in file order. Blank lines yield empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        # Iterate the handle directly; readlines() would only build a
        # throwaway intermediate list.
        stopwords = [line.strip() for line in file]
    return stopwords
# Module-level stop-word list that data_cleaning() consults for every token.
# The path is hard-coded and the file is read as soon as this module is
# executed or imported.
stopwords = stopwordslist('C:/Users/lenovo/Desktop/lda_results/停用词.txt')
# Define data cleaning function
def data_cleaning(content_list):
    """Blank out noise characters, segment with jieba, and drop stop words.

    For each input string, every character listed in ``symbols``
    (punctuation, ASCII digits and lowercase ASCII letters) is replaced by a
    space, the result is segmented with ``jieba.cut`` in accurate mode, and
    tokens that are stop words or whitespace-only are discarded.

    Reads the module-level ``stopwords`` collection.

    Args:
        content_list: Iterable of strings, one document per item.

    Returns:
        list[str]: One string per input document, holding the surviving
        tokens joined by single spaces (empty if nothing survives).
    """
    # NOTE: '\\n' contributes a backslash and the letter "n", not a newline.
    # Newlines therefore survive the replacement step; whitespace-only tokens
    # are removed later by the strip() filter.
    symbols = '-\\n~%≥℃|/​``​↓#~_「♂!?\',、:;。《》()()·—.…,0123456789abcdefghijklnmopqrstuvwxyz'
    # A single translation table lets str.translate do the replacement in one
    # pass instead of a Python-level loop over every character.
    to_space = str.maketrans(dict.fromkeys(symbols, ' '))
    # Snapshot the stop words as a set once per call so each token lookup is
    # O(1) rather than a linear scan of the list.
    stopword_set = set(stopwords)

    content_seg = []
    for content in content_list:
        cleaned = content.translate(to_space)
        tokens = jieba.cut(cleaned, cut_all=False)
        kept = [tok for tok in tokens if tok not in stopword_set and tok.strip()]
        content_seg.append(' '.join(kept))
    return content_seg
# Read text data line by line
with open('C:/Users/lenovo/Desktop/lda_results/待分析数据.txt', 'r', encoding='utf-8') as file:
    # Every line of the input file is treated as one document.
    lines = list(file)

# Clean and segment each document, then keep raw and processed text side by
# side in a DataFrame.
participle = data_cleaning(lines)
df = pd.DataFrame({'文章内容': lines, '文章内容去停用词分词结果': participle})

# Token lists -> gensim dictionary -> bag-of-words corpus.
train_set = df['文章内容去停用词分词结果'].apply(str.split)
dictionary = corpora.Dictionary(train_set)
corpus = list(map(dictionary.doc2bow, train_set))
# Define function to compute perplexity and coherence
def compute_metrics(num_topics):
    """Train one LDA model and score it by perplexity and c_v coherence.

    Reads the module-level ``corpus``, ``dictionary`` and ``train_set``; the
    model is evaluated on the same corpus it was trained on.

    Args:
        num_topics: Number of topics to fit.

    Returns:
        tuple: ``(num_topics, perplexity_score, coherence_score)``.
        ``perplexity_score`` is the perplexity ``2 ** (-bound)`` derived from
        gensim's per-word likelihood bound (lower is better);
        ``coherence_score`` is the ``c_v`` coherence of the fitted topics.
    """
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=5,
        random_state=1,  # fixed seed so runs are comparable across topic counts
    )

    # log_perplexity() returns the per-word likelihood bound, NOT perplexity;
    # gensim defines perplexity as 2 ** (-bound). Returning the raw bound
    # would mislabel the "Perplexity" axis and invert its interpretation.
    per_word_bound = lda_model.log_perplexity(corpus)
    perplexity_score = 2 ** (-per_word_bound)

    coherence_model = CoherenceModel(
        model=lda_model,
        texts=train_set,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence_score = coherence_model.get_coherence()
    return num_topics, perplexity_score, coherence_score
if __name__ == "__main__":
    # Compute perplexity and coherence in parallel: one task per candidate
    # topic count. executor.map yields results in the order of its inputs.
    num_topics_range = range(2, 11)
    with ProcessPoolExecutor() as executor:
        results = list(executor.map(compute_metrics, num_topics_range))

    # Transpose [(k, perplexity, coherence), ...] into three parallel tuples.
    num_topics_list, perplexity_scores, coherence_scores = zip(*results)

    # Plot both curves on one graph with two y-axes sharing the x-axis.
    fig, ax1 = plt.subplots(figsize=(12, 6))

    color = 'tab:blue'
    ax1.set_xlabel('Number of Topics')
    ax1.set_ylabel('Perplexity', color=color)
    ax1.plot(num_topics_list, perplexity_scores, marker='o', color=color, linestyle='-')
    ax1.tick_params(axis='y', labelcolor=color)

    ax2 = ax1.twinx()
    color = 'tab:orange'
    ax2.set_ylabel('Coherence', color=color)
    ax2.plot(num_topics_list, coherence_scores, marker='o', color=color, linestyle='--')
    ax2.tick_params(axis='y', labelcolor=color)

    # Add the title BEFORE tight_layout(): the layout only accounts for
    # artists that exist when it runs, so a title added afterwards can be
    # clipped at the top of the saved figure.
    ax1.set_title('Perplexity and Coherence vs. Number of Topics')
    fig.tight_layout()

    # Save the plot as a PDF, then display it.
    fig.savefig('C:/Users/lenovo/Desktop/lda_results/perplexity_coherence3.pdf', format='pdf')
    plt.show()
小贴士
感谢您为本站写下的评论,您的评论对其它用户来说具有重要的参考价值,所以请认真填写。
- 类似“顶”、“沙发”之类没有营养的文字,对勤劳贡献的楼主来说是令人沮丧的反馈信息。
- 相信您也不想看到一排文字/表情墙,所以请不要反馈意义不大的重复字符,也请尽量不要纯表情的回复。
- 提问之前请再仔细看一遍楼主的说明,或许是您遗漏了。
- 请勿到处挖坑绊人、招贴广告。既占空间让人厌烦,又没人会搭理,于人于己都无利。
关于好例子网
本站旨在为广大IT学习爱好者提供一个非营利性互相学习交流分享平台。本站所有资源都可以被免费获取学习研究。本站资源来自网友分享,对搜索内容的合法性不具有预见性、识别性、控制性,仅供学习研究,请务必在下载后24小时内给予删除,不得用于其他任何用途,否则后果自负。基于互联网的特殊性,平台无法对用户传输的作品、信息、内容的权属或合法性、安全性、合规性、真实性、科学性、完整性、有效性等进行实质审查;无论平台是否已进行审查,用户均应自行承担因其传输的作品、信息、内容而可能或已经产生的侵权或权属纠纷等法律责任。本站所有资源不代表本站的观点或立场,基于网友分享,根据中国法律《信息网络传播权保护条例》第二十二条与第二十三条之规定,若资源存在侵权或相关问题请联系本站客服人员,点此联系我们。关于更多版权及免责声明参见 版权及免责声明
网友评论
我要评论