Sentiment Analysis of JD.com Reviews with jieba Segmentation, SnowNLP, and KMeans


【1】The review data to be processed is already stored in MongoDB (about 30,000 reviews in total)

import jieba
import jieba.analyse
from pymongo import MongoClient
from snownlp import SnowNLP

# STEP 1: read the review text from the database
comments = ''
client = MongoClient()
results = client.jd.shouhuan.find({})
for result in results:
    for content in result['商品总评论']:
        comments += content[:-26]  # first pass at removing "noise" appended to each review

# Load a custom user dictionary so that jieba can recognize new words
jieba.load_userdict("/Users/macbookair/Desktop/NLP1221/dict.txt")

# Build the stopword table: these tokens are text noise and will be removed
stopwords = {}.fromkeys(['一晃','准功','平理','一大','充好',';','?','*','**','??????','1','2','3','4','5','6','7','8','9','10','0','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','*^★*☆','丶','helliphellip',';','*?acute╰╯`?','hellip','哦','与','下次','~','!',"(',')",'�','\n','、','~','再','来','给','有','&','的','包括','等','是','了','和','开始','用','怎么','说','呢','还是',',',' ','。',':','而且','似乎','都','!','?','hellip',';','还有','就','直接','会','第二天','按','之后','一款'])

# Segment the text and drop stopwords to reduce noise.
# cut_all=False is the precise mode (also the default); full mode or search mode
# would produce far more noise.
segs = jieba.cut(comments, cut_all=False)
final = ''
for seg in segs:
    if seg not in stopwords:  # drop stopwords from the text
        final += seg          # rebuild the cleaned text as one string
len(final)

# Extract feature words (commented out after the first run)
# tags = jieba.analyse.extract_tags(final, topK=500, withWeight=True, allowPOS=('n'))
# for tag in tags:
#     with open("/Users/macbookair/Desktop/NLP1221/特征词_1221.txt", 'a') as f:
#         f.write(str(tag[0]) + ' ' + str(tag[1]) + '\n')
#     特征词 = tag[0]  # feature word
#     权重 = tag[1]    # weight
#     特_权 = {'特征词': 特征词, '权重': 权重}
#     client = MongoClient()
#     client.jd.特征词_权重_1221.insert_one(特_权)
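The commented-out block above is the feature-word extraction step. Below is a minimal sketch of how jieba.analyse.extract_tags behaves on a short string; the sample text and topK value are illustrative only, the real pipeline passes the cleaned final string with topK=500.

import jieba.analyse

# Illustrative review fragment (not from the original data set)
sample = "手环电池续航很好,屏幕显示清晰,但是表带质量一般"

# Extract the top noun keywords together with their TF-IDF weights;
# allowPOS=('n',) keeps nouns only, matching the original call
tags = jieba.analyse.extract_tags(sample, topK=5, withWeight=True, allowPOS=('n',))
for word, weight in tags:
    print(word, weight)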

【2】Get all the segmented words

segs = jieba.lcut(comments, cut_all=False)
with open("/Users/macbookair/Desktop/NLP1221/全文分词_1221.txt", 'a') as f:
    for seg in segs:
        if seg not in stopwords and len(seg) > 1:  # drop stopwords and single characters to reduce noise
            f.write(str(seg) + '\n')

【3】Train a word2vec model

from gensim.models import word2vec

# Train the model to obtain word vectors
sentence = []
with open('/Users/macbookair/Desktop/NLP1221/全文分词_1221.txt') as f:  # read words from the segmentation file
    result = f.read()
sentence.append(result.split('\n'))  # the whole file is treated as one long "sentence"

# Use the CBOW algorithm (sg=0)
model = word2vec.Word2Vec(sentence, sg=0, size=100, window=5, min_count=0,
                          negative=3, sample=0.001, hs=1, workers=4)
model.save('/Users/macbookair/Desktop/NLP1221/wordvec-1221.model')
model.wv.save_word2vec_format('/Users/macbookair/Desktop/NLP1221/wordvec-1221.model.txt',
                              '/Users/macbookair/Desktop/NLP1221/wordvec-1221.vocab.txt',
                              binary=False)
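A quick way to sanity-check the trained vectors is to query the nearest neighbours of a word. This is a minimal sketch assuming the gensim 3.x API used above (size, wv.vocab); the query word "屏幕" is illustrative and only works if it actually appears in the training vocabulary.

from gensim.models import Word2Vec

# Load the model trained above and inspect the most similar words
model = Word2Vec.load('/Users/macbookair/Desktop/NLP1221/wordvec-1221.model')

query = '屏幕'  # hypothetical query word
if query in model.wv.vocab:
    for word, similarity in model.wv.most_similar(query, topn=10):
        print(word, similarity)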

【4】Cluster the word vectors with KMeans

from gensim.models import Word2Vec
from sklearn.cluster import KMeans

# Load the word-vector model
model = Word2Vec.load('/Users/macbookair/Desktop/NLP1221/wordvec-1221.model')
keys = list(model.wv.vocab.keys())

# Collect the vector for each word in the vocabulary
wordvector = []
for key in keys:
    wordvector.append(model.wv[key])

# Cluster the word vectors into 20 clusters
clf = KMeans(n_clusters=20)
s = clf.fit_predict(wordvector)
print(len(s))

# Group the words by cluster label and save the result to a file.
# Note: the word for label s[j] is keys[j]; the raw segmentation file contains
# duplicates and is not in vocabulary order, so it cannot be used as the index.
for i in range(0, 20):
    label_i = []
    for j in range(0, len(s)):
        if s[j] == i:
            label_i.append(keys[j])
    with open('/Users/macbookair/Desktop/NLP1221/全词聚类-1221-2.model.txt', 'a') as f:
        f.write('label_' + str(i) + ':' + str(label_i) + '\n')
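The cluster count is fixed at 20 above. If you want to check whether another value fits the data better, a rough elbow comparison over a few candidate counts is one option; this is a sketch that reuses the wordvector list built above, and the candidate values are illustrative.

from sklearn.cluster import KMeans

# Fit KMeans for several candidate cluster counts and compare the inertia
# (within-cluster sum of squares); a sharp "elbow" suggests a reasonable k
for k in (5, 10, 15, 20, 25):
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(wordvector)
    print(k, km.inertia_)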

【5】Split the long reviews into short clauses and store them, in preparation for clause-level sentiment analysis (sentiment scoring on short clauses is much more accurate than on long sentences)

from pymongo import MongoClient

client = MongoClient()
results = client.jd.shouhuan_qinggan_zhaiyao.find({})
for comment in results:
    # Replace sentence-ending punctuation and line breaks with commas so the
    # review can later be split into short clauses
    c = (comment['评论内容'].replace('!', ',').replace('~', ',').replace('~', ',')
         .replace('。', ',').replace('?', ',').replace(',', ',')
         .replace('\n', ',').replace('\n\n\n', ','))
    cs = {'商品ID': comment['商品ID'],
          '评论内容': comment['评论内容'],
          '评论时间': comment['评论时间'],
          '长句情感积极度': comment['情感积极度'],
          '短句集合': c}
    client.jd.短句集合.insert_one(cs)
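To see why clause-level scoring helps, compare SnowNLP's score for a whole mixed-sentiment review with the scores of its clauses. The example sentence below is illustrative, not taken from the data set; scoring the full sentence gives one blended value, while scoring each clause separately exposes the positive and negative parts.

from snownlp import SnowNLP

long_sentence = '屏幕显示很清晰,但是表带质量太差了'  # illustrative mixed-sentiment review
print('long sentence:', SnowNLP(long_sentence).sentiments)

# Score each short clause on its own
for clause in long_sentence.split(','):
    print(clause, SnowNLP(clause).sentiments)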

【6】Match each short clause against subject (noun) + sentiment word (adjective) pairs and score the clause

from pymongo import MongoClient
from snownlp import SnowNLP

# Read the manually curated feature words: each line is "word weight"
with open('/Users/macbookair/Desktop/NLP1221/人工处理过的特征词_1221.txt') as f:
    特征词 = f.read().split('\n')

# Read the manually curated sentiment words: each line is "word weight polarity"
with open('/Users/macbookair/Desktop/NLP1221/人工处理过的情感词_1221.txt') as f:
    情感词 = f.read().split('\n')

# Read the short-clause collection (a small slice here for testing)
client = MongoClient()
短句集合 = client.jd.短句集合.find({}).skip(500).limit(10)
# 短句集合 = client.jd.短句集合.find({})

for 短句 in 短句集合:
    # Split the stored clause string into a list of short clauses
    all_短句 = (短句['短句集合'].replace(' ', ',').replace(':', ',')
                .replace('……', ',').replace('、', ',').split(','))
    for 特征 in 特征词:
        词 = 特征.split(' ')[0]  # feature word
        权 = 特征.split(' ')[1]  # feature weight
        for 情感 in 情感词:
            情感词词 = 情感.split(' ')[0]  # sentiment word
            情感词权 = 情感.split(' ')[1]  # sentiment weight
            情感词倾 = 情感.split(' ')[2]  # sentiment polarity
            for _短句 in all_短句:
                # If a clause contains both the feature word and the sentiment word,
                # score it with SnowNLP and store the result
                if 词 in _短句 and 情感词词 in _短句:
                    短句情感倾向 = SnowNLP(_短句).sentiments
                    result = {'商品ID': 短句['商品ID'],
                              '长句': 短句['评论内容'],
                              '长句情感倾向': 短句['长句情感积极度'],
                              '评论时间': 短句['评论时间'],
                              '短句': _短句,
                              '短句情感倾向': 短句情感倾向,
                              '特征词': 词, '特征权重': 权,
                              '情感词': 情感词词, '情感词权': 情感词权,
                              '情感词倾向': 情感词倾}
                    client.jd.特征情感短句.insert_one(result)
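Once the 特征情感短句 collection has been filled by the loop above, the clause scores can be summarized per feature word. This is a sketch using a MongoDB aggregation over the fields written above; the grouping and sorting choices are illustrative.

from pymongo import MongoClient

# Average the clause-level sentiment per feature word and count the matches
client = MongoClient()
pipeline = [
    {'$group': {'_id': '$特征词',
                '平均情感': {'$avg': '$短句情感倾向'},
                '条数': {'$sum': 1}}},
    {'$sort': {'条数': -1}},
]
for row in client.jd.特征情感短句.aggregate(pipeline):
    print(row['_id'], round(row['平均情感'], 3), row['条数'])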

 


