时间:2023-03-15 22:40:01 | 来源:电子商务
时间:2023-03-15 22:40:01 来源:电子商务
# Review data left by users on an e-commerce platform is preprocessed with word
# segmentation, part-of-speech tagging and stop-word removal. Sentiment analysis
# is then run on the preprocessed data, and an LDA topic model extracts the key
# information from the reviews in order to understand users' needs, opinions,
# purchase reasons and the product's strengths and weaknesses, and finally to
# suggest product improvements.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import jieba.posseg as psg
import warnings

# Silence every warning from here on.
warnings.filterwarnings("ignore")
%matplotlib inline

# Directory that holds reviews.csv and the other input files read later on.
path = '/home/mw/input/data/emotion_analysi7147'
reviews = pd.read_csv(path+'/reviews.csv')
print(reviews.shape)
reviews.head()
# Keep only the review text and its label, then drop the records whose values
# are identical in every remaining column.
label_columns = ['content', 'content_type']
reviews = reviews[label_columns].drop_duplicates()
content = reviews['content']
reviews.shape
reviews
# Data cleaning
# Remove ASCII letters and digits and the words 京东, 美的, 电热水器 and 热水器.
# 电热水器 is listed before 热水器 so the longer word is removed as a whole.
# The pattern must not end with '|': an empty alternative would match at every
# position of every review.
strinfo = re.compile(r'[0-9a-zA-Z]|京东|美的|电热水器|热水器')
content = content.apply(lambda x: strinfo.sub('', x))
# Word segmentation, part-of-speech tagging and stop-word removal
def worker(s):
    """Segment *s* with jieba.posseg and return a list of (word, flag) tuples."""
    return [(token.word, token.flag) for token in psg.cut(s)]


# Segment every review.
seg_word = content.apply(worker)
seg_word.head()
# Turn the segmentation result into a long-format table: one row per word,
# with the id of the review it came from, its part-of-speech flag and the
# label of that review.
n_word = seg_word.apply(len)  # number of words in each review
words_per_review = np.asarray(n_word)

# Review id of every word: the review's index label + 1, repeated once per word.
index_content = np.repeat(np.asarray(seg_word.index) + 1, words_per_review)
# Review label of every word, repeated the same way.
content_type = np.repeat(np.asarray(reviews['content_type']), words_per_review)

# Flatten the per-review lists in one pass. sum(list_of_lists, []) copies the
# accumulated list at every step and is quadratic in the total number of words.
seg_word = [pair for pairs in seg_word for pair in pairs]
word = [pair[0] for pair in seg_word]    # the words
nature = [pair[1] for pair in seg_word]  # their part-of-speech flags

result = pd.DataFrame({"index_content": index_content,
                       "word": word,
                       "nature": nature,
                       "content_type": content_type})
result.head()
# Drop punctuation, which carries the part-of-speech flag 'x'.
result = result[result['nature'] != 'x']

# Remove stop words. Every line of the stop list is one stop word; its trailing
# newline has to be stripped, otherwise no entry can ever equal a segmented
# word and nothing is removed.
with open(path + "/stoplist.txt", 'r', encoding='UTF-8') as stop_file:
    stop = [line.rstrip('\n') for line in stop_file]

# Words that survive are those not present in the stop list.
word = list(set(word) - set(stop))
result = result[result['word'].isin(word)]
result.head()
# Position of every remaining word inside its review, counted from 0 over the
# rows that survived the filtering above. cumcount() numbers the rows of each
# review directly, so the result does not depend on the rows being sorted by
# review id and no quadratic list flattening is needed.
index_word = result.groupby('index_content').cumcount()

# assign() returns a new frame instead of writing into a filtered slice.
result = result.assign(index_word=index_word)
result.head()
# Keep the reviews that contain nouns
# A review is kept when at least one of its words has a part-of-speech flag
# containing the letter "n".
has_noun_flag = result['nature'].str.contains('n', regex=False)
ind = result.loc[has_noun_flag, 'index_content'].unique()
result = result[result['index_content'].isin(ind)]
result.head()
# Draw the word cloud
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Number of occurrences of every word, most frequent first.
frequencies = result.groupby('word')['word'].count().sort_values(ascending = False)

# Image used as the mask, i.e. the shape of the cloud.
backgroud_Image = plt.imread(path+'/pl.jpg')
# Chinese font uploaded to kesci by the author; without a font setting the
# words are rendered as garbled characters.
font_path = '/home/kesci/work/data/fonts/MSYHL.TTC'

cloud_options = dict(font_path=font_path,
                     max_words=100,
                     background_color='white',
                     mask=backgroud_Image)
wordcloud = WordCloud(**cloud_options)
my_wordcloud = wordcloud.fit_words(frequencies)

plt.imshow(my_wordcloud)
plt.axis('off')
plt.show()

# The figure shows that, after preprocessing, the segmentation is broadly as
# expected. Words such as “安装” (installation), “师傅” (technician), “售后”
# (after-sales), “物流” (logistics) and “服务” (service) occur frequently, so a
# first guess is that users care most about these aspects of the product.

# Save the result.
result.to_csv("./word.csv", index = False, encoding = 'utf-8')
# 2. Lexicon matching
word = pd.read_csv("./word.csv")


def _read_lexicon(file_name):
    """Read one sentiment lexicon stored under ``path`` as a DataFrame.

    NOTE(review): sep="/n" is a two-character pattern that presumably never
    occurs inside the lexicon files, so every line ends up as a single field
    in column 0 - confirm against the files.
    """
    return pd.read_csv(path + "/" + file_name, header=None, sep="/n",
                       encoding='utf-8', engine='python')


# Positive / negative evaluation words and emotion words.
pos_comment = _read_lexicon("正面评价词语(中文).txt")
neg_comment = _read_lexicon("负面评价词语(中文).txt")
pos_emotion = _read_lexicon("正面情感词语(中文).txt")
neg_emotion = _read_lexicon("负面情感词语(中文).txt")

# Merge the emotion words with the evaluation words.
positive = set(pos_comment.iloc[:, 0]) | set(pos_emotion.iloc[:, 0])
negative = set(neg_comment.iloc[:, 0]) | set(neg_emotion.iloc[:, 0])

# Words listed as both positive and negative are ambiguous; drop them from both.
intersection = positive & negative
positive = list(positive - intersection)
negative = list(negative - intersection)
positive = pd.DataFrame({"word": positive, "weight": [1]*len(positive)})
negative = pd.DataFrame({"word": negative, "weight": [-1]*len(negative)})

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way.
posneg = pd.concat([positive, negative])

# Attach the weights to the segmented words. how='right' keeps every word of
# the reviews; words that are not in a lexicon get a NaN weight.
data_posneg = posneg.merge(word, on='word', how='right')
data_posneg = data_posneg.sort_values(by = ['index_content', 'index_word'])
data_posneg.head()
# Correct the sentiment polarity
# Load the negation word list.
notdict = pd.read_csv(path+"/not.csv")
# Membership must be tested against the values of the 'term' column;
# `x in series` tests the index labels instead.
negation_terms = set(notdict['term'])

# New column: the sentiment weight after the negation correction.
data_posneg['amend_weight'] = data_posneg['weight']
data_posneg['id'] = np.arange(0, len(data_posneg))

# For every word, count the negation words among the (up to) two words right
# before it in the same review. This relies on the rows being ordered by
# index_word within each review; data_posneg is sorted by
# (index_content, index_word) when it is built.
is_negation = data_posneg['word'].isin(negation_terms).astype(int)
by_review = is_negation.groupby(data_posneg['index_content'])
negations_before = by_review.shift(1).fillna(0) + by_review.shift(2).fillna(0)

# An odd number of preceding negations flips the polarity of a sentiment word.
flip = data_posneg['weight'].notna() & (negations_before % 2 == 1)
data_posneg.loc[flip, 'amend_weight'] = -data_posneg.loc[flip, 'weight']

# Keep only the words that carry a sentiment weight. This is taken after the
# correction so that the amended weights are the ones that get summed.
only_inclination = data_posneg.dropna().reset_index(drop=True)
index = only_inclination['id']

# Sentiment score of every review: the sum of its amended weights.
emotional_value = only_inclination.groupby(['index_content'], as_index=False)['amend_weight'].sum()
# Drop the reviews whose score is 0.
emotional_value = emotional_value[emotional_value['amend_weight'] != 0]
# Inspect the sentiment-analysis result
# Reviews with a score above 0 get a_type 'pos', those below 0 get 'neg'.
# The labels are written with .loc on an explicit copy: chained indexing such
# as df['a_type'][mask] = ... may assign to a temporary object and leave the
# frame unchanged.
emotional_value = emotional_value.copy()
emotional_value['a_type'] = ''
emotional_value.loc[emotional_value['amend_weight'] > 0, 'a_type'] = 'pos'
emotional_value.loc[emotional_value['amend_weight'] < 0, 'a_type'] = 'neg'
emotional_value.head()

# Attach the word rows (and with them the original content_type) to the
# predicted label of every review.
result = emotional_value.merge(word, on='index_content', how='left')
result.head()

# One row per distinct (review id, original label, predicted label).
result = result[['index_content', 'content_type', 'a_type']].drop_duplicates()
result.head()
# Assuming that no user picked a positive label while writing negative content,
# compare the original review type with the type derived by the sentiment
# analysis, build the confusion matrix of the sentiment analysis and check the
# accuracy of the lexicon-based approach.

# Cross-tabulation: a special pivot table that counts group frequencies.
original_type = result['content_type']
derived_type = result['a_type']
confusion_matrix = pd.crosstab(original_type, derived_type, margins=True)
confusion_matrix.head()
# Share of reviews on the diagonal of the confusion matrix; cell [2, 2] holds
# the grand total added by margins=True.
agreeing = confusion_matrix.iat[0, 0] + confusion_matrix.iat[1, 1]
agreeing / confusion_matrix.iat[2, 2]

# Collect the word rows of the reviews classified as positive / negative.
pos_reviews = emotional_value[emotional_value['a_type'] == 'pos']
neg_reviews = emotional_value[emotional_value['a_type'] == 'neg']
ind_pos = list(pos_reviews['index_content'])
ind_neg = list(neg_reviews['index_content'])
posdata = word[word['index_content'].isin(ind_pos)]
negdata = word[word['index_content'].isin(ind_neg)]

# Draw the word clouds.
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word cloud of the positive reviews.
freq_pos = posdata.groupby('word')['word'].count().sort_values(ascending = False)
backgroud_Image = plt.imread(path+'/pl.jpg')
wordcloud = WordCloud(font_path=font_path,
                      max_words=100,
                      background_color='white',
                      mask=backgroud_Image)
pos_wordcloud = wordcloud.fit_words(freq_pos)
plt.imshow(pos_wordcloud)
plt.axis('off')
plt.show()

# Word cloud of the negative reviews, drawn with the same WordCloud object.
freq_neg = negdata.groupby(by = ['word'])['word'].count().sort_values(ascending = False)
neg_wordcloud = wordcloud.fit_words(freq_neg)
plt.imshow(neg_wordcloud)
plt.axis('off')
plt.show()

# Write the results out; every row of the two files is one word of a review.
posdata.to_csv("./posdata.csv", index = False, encoding = 'utf-8')
negdata.to_csv("./negdata.csv", index = False, encoding = 'utf-8')
# The word cloud of the positive reviews shows that positive words such as
# “不错” (not bad), “满意” (satisfied) and “好评” (good review) occur frequently
# and are not mixed with negative words, so the sentiment analysis extracts the
# positive reviews reasonably well.
reviews.head()

# Encode the label numerically: 1.0 for 'pos', 0.0 for every other value.
is_positive = reviews['content_type'] == 'pos'
reviews['content_type'] = is_positive.astype(float)
reviews.head()
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF  # raw text -> tf-idf feature matrix
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the labelled data into a training set and a validation set.
train_X, valid_X, train_y, valid_y = train_test_split(
    reviews['content'], reviews['content_type'], test_size=0.2, random_state=42)
train_X.shape, train_y.shape, valid_X.shape, valid_y.shape

# Build the vectorizer. use_idf and smooth_idf are boolean options and are
# passed as real booleans; recent scikit-learn releases validate parameter
# types and reject integers here.
# NOTE(review): the reviews are passed in unsegmented and the default token
# pattern is used, so a "term" is presumably a whole run of Chinese characters
# between punctuation marks - confirm that this is intended.
model_tfidf = TFIDF(min_df=5, max_features=5000, ngram_range=(1, 3),
                    use_idf=True, smooth_idf=True)
# Learn the vocabulary and the idf vector.
model_tfidf.fit(train_X)
# Turn the documents into the tf-idf matrix: one row per document, one column
# per feature term.
train_vec = model_tfidf.transform(train_X)

# Train the model. CalibratedClassifierCV wraps LinearSVC so that
# predict_proba is available.
model_SVC = LinearSVC()
clf = CalibratedClassifierCV(model_SVC)
clf.fit(train_vec, train_y)

# Vectorize the validation documents with the fitted vectorizer.
valid_vec = model_tfidf.transform(valid_X)

# Validate.
pre_valid = clf.predict_proba(valid_vec)
pre_valid[:5]
pre_valid = clf.predict(valid_vec)
print('正例:', sum(pre_valid == 1))
print('负例:', sum(pre_valid == 0))

# accuracy_score takes (y_true, y_pred).
score = accuracy_score(valid_y, pre_valid)
print("准确率:", score)
# 4. LDA model
import re
import itertools
from gensim import corpora, models

# Load the data produced by the sentiment analysis.
posdata = pd.read_csv("./posdata.csv", encoding = 'utf-8')
negdata = pd.read_csv("./negdata.csv", encoding = 'utf-8')

# Every word is wrapped in its own one-element list, i.e. each word row is
# handed to gensim as a separate document.
pos_docs = [[w] for w in posdata['word']]
neg_docs = [[w] for w in negdata['word']]

# Build the dictionaries.
pos_dict = corpora.Dictionary(pos_docs)  # positive
neg_dict = corpora.Dictionary(neg_docs)  # negative

# Build the bag-of-words corpora.
pos_corpus = [pos_dict.doc2bow(doc) for doc in pos_docs]  # positive
neg_corpus = [neg_dict.doc2bow(doc) for doc in neg_docs]  # negative
# 5. Choosing the number of topics
def cos(vector1, vector2):
    """Return the cosine similarity of two numeric sequences.

    The sequences are paired element-wise with zip(). Returns None when either
    vector has zero norm, because the similarity is undefined in that case.
    """
    dot_product = 0.0
    normA = 0.0
    normB = 0.0
    for a, b in zip(vector1, vector2):
        dot_product += a*b
        normA += a**2
        normB += b**2
    if normA == 0.0 or normB == 0.0:
        return None
    return dot_product / ((normA*normB)**0.5)


def lda_k(x_corpus, x_dict, max_topics=10, num_words=50, random_state=None):
    """Return the mean pairwise topic similarity for 1..max_topics topics.

    For every topic count k from 2 to ``max_topics`` an LDA model is trained,
    each topic is represented by its ``num_words`` top words, and the cosine
    similarity of the word-occurrence vectors is averaged over all topic pairs.

    Args:
        x_corpus: bag-of-words corpus passed to ``models.LdaModel``.
        x_dict: gensim dictionary passed as ``id2word``.
        max_topics: largest number of topics to try.
        num_words: number of top words taken from every topic.
        random_state: forwarded to ``models.LdaModel``; set it to make the
            curve reproducible.

    Returns:
        A list whose entry ``j`` belongs to ``j + 1`` topics. The first entry
        is fixed to 1 (a single topic).
    """
    mean_similarity = [1]
    for k in range(2, max_topics + 1):
        lda = models.LdaModel(x_corpus, num_topics=k, id2word=x_dict,
                              random_state=random_state)
        # Top words of every topic, read as (word, probability) pairs instead
        # of being parsed back out of the formatted topic strings.
        top_word = [[w for w, _ in lda.show_topic(t, topn=num_words)]
                    for t in range(k)]
        # All distinct words over all topics define the vector positions.
        unique_word = set(itertools.chain.from_iterable(top_word))
        # One occurrence vector per topic.
        mat = [tuple(words.count(w) for w in unique_word) for words in top_word]
        # Cosine similarity is symmetric, so unordered pairs give the same
        # mean as ordered pairs with half the work.
        similarities = [cos(mat[a], mat[b])
                        for a, b in itertools.combinations(range(k), 2)]
        # An undefined similarity (None) counts as 0 in the mean.
        defined = [s for s in similarities if s is not None]
        mean_similarity.append(sum(defined) / len(similarities))
    return mean_similarity


# Mean cosine similarity between topics for both review sets.
pos_k = lda_k(pos_corpus, pos_dict)
neg_k = lda_k(neg_corpus, neg_dict)

# Plot the curves.
from matplotlib.font_manager import FontProperties
font = FontProperties(size=14)
fig = plt.figure(figsize=(10, 8))
# Entry j of a curve belongs to j + 1 topics, so the x axis starts at 1 to show
# the actual number of topics.
ax1 = fig.add_subplot(211)
ax1.plot(range(1, len(pos_k) + 1), pos_k)
ax1.set_xlabel('正面评论LDA主题数寻优', fontproperties=font)
ax2 = fig.add_subplot(212)
ax2.plot(range(1, len(neg_k) + 1), neg_k)
ax2.set_xlabel('负面评论LDA主题数寻优', fontproperties=font)
# According to the figure, for the positive reviews the mean cosine similarity
# between topics reaches its minimum at 2 or 3 topics, so 3 topics can be used
# for the LDA of the positive reviews. For the negative reviews the mean cosine
# similarity also reaches its minimum at 3 topics, so 3 topics can be used for
# the LDA of the negative reviews as well.

# LDA topic analysis
n_topics = 3
pos_lda = models.LdaModel(pos_corpus, num_topics=n_topics, id2word=pos_dict)
neg_lda = models.LdaModel(neg_corpus, num_topics=n_topics, id2word=neg_dict)
pos_lda.print_topics(num_words = 10)
[(0,
'0.031*"服务" + 0.025*"好评" + 0.021*"信赖" + 0.020*"售后" + 0.019*"人员" + 0.016*"太" + 0.016*"送" + 0.015*"品牌" + 0.014*"电话" + 0.013*"质量"'),
(1,
'0.029*"很快" + 0.028*"不错" + 0.026*"值得" + 0.023*"客服" + 0.017*"物流" + 0.017*"差" + 0.014*"速度" + 0.012*"态度" + 0.012*"赞" + 0.011*"收到"'),
(2,
'0.115*"安装" + 0.050*"满意" + 0.038*"师傅" + 0.028*"送货" + 0.017*"东西" + 0.013*"购物" + 0.012*"家里" + 0.011*"装" + 0.010*"真的" + 0.010*"预约"')]
# Show the topics of the negative-review model with 10 words per topic.
neg_lda.print_topics(num_words = 10)
[(0,七、可视化模型训练结果
'0.022*"东西" + 0.019*"装" + 0.016*"加热" + 0.016*"烧水" + 0.015*"漏水" + 0.013*"真的" + 0.011*"产品" + 0.010*"钱" + 0.009*"电话" + 0.009*"价格"'),
(1,
'0.140*"安装" + 0.033*"师傅" + 0.032*"太" + 0.019*"收费" + 0.019*"打电话" + 0.018*"贵" + 0.017*"慢" + 0.016*"太慢" + 0.012*"材料" + 0.011*"高"'),
(2,
'0.031*"垃圾" + 0.029*"售后" + 0.027*"差" + 0.023*"安装费" + 0.019*"客服" + 0.018*"小时" + 0.017*"不好" + 0.017*"收" + 0.012*"人员" + 0.012*"坑人"')]
import pyLDAvis
# `import pyLDAvis` alone does not load the gensim adapter, so the submodule is
# imported explicitly. It is named gensim_models in newer pyLDAvis releases and
# gensim in older ones.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# NOTE(review): the original note says the three arguments can be read back
# from disk because they were stored earlier; no such save is visible in this
# file, so the in-memory objects are used.
vis = gensimvis.prepare(pos_lda, pos_corpus, pos_dict)
# To open the visualisation in a browser window instead, use pyLDAvis.show(vis).
# Display it in the output cell of the notebook.
pyLDAvis.display(vis)
综合以上对主题及其中的高频特征词的分析得出,美的电热水器有价格实惠、性价比高、外观好看、服务好等优势。相对而言,用户对美的电热水器的抱怨点主要体现在安装的费用高及售后服务差等方面。因此,用户的购买原因可以总结为以下几个方面:美的是大品牌值得信赖、美的电热水器价格实惠、性价比高。

关键词:数据,分析,评论,情感,产品