Source code for harvesttext.sentiment
from .resources import get_qh_sent_dict
from .algorithms.sent_dict import SentDict
class SentimentMixin:
    """Sentiment analysis mixin.

    Provides sentiment-lexicon mining and sentence-level sentiment scoring
    based on SO-PMI.

    The host class must supply ``self.seg(text)``; this mixin calls it to
    split text into words.
    """

    def build_sent_dict(self, sents, method="PMI", min_times=5, scale="None",
                        pos_seeds=None, neg_seeds=None, stopwords=None):
        """Build a sentiment lexicon from seed words.

        :param sents: list of str. Sentences are recommended; each item is the
            basic unit over which co-occurrence PMI is computed.
        :param method: "PMI", the algorithm to use (currently only PMI is
            supported).
        :param min_times: int, default 5. Words occurring fewer than this many
            times across all sentences are filtered out.
        :param scale: {"None", "0-1", "+-1"}, default "None"; any other value
            rescales the sentiment scores.
            "0-1": linear scaling so that max is 1 and min is 0 (0.5 is not
            necessarily neutral).
            "+-1": scale the positive and negative ranges separately, keeping
            0 as the neutral point.
        :param pos_seeds: list of str, positive seed words. Defaults to the
            positive words of the Tsinghua sentiment lexicon when omitted.
        :param neg_seeds: list of str, negative seed words. Defaults to the
            negative words of the Tsinghua sentiment lexicon when omitted.
        :param stopwords: list of str, words to remove from every sentence
            before mining. No stopword filtering when omitted.
        :return: sent_dict: dict mapping a word to its sentiment value.
        """
        # Default each seed list independently: a caller may supply only one
        # polarity, and the other must still fall back to the bundled lexicon
        # instead of being forwarded as None.
        if pos_seeds is None or neg_seeds is None:
            default_seeds = get_qh_sent_dict()
            if pos_seeds is None:
                pos_seeds = default_seeds["pos"]
            if neg_seeds is None:
                neg_seeds = default_seeds["neg"]

        # Each sentence becomes the set of distinct words it contains.
        docs = [set(self.seg(sent)) for sent in sents]
        if stopwords is not None:
            stopword_set = set(stopwords)
            docs = [doc - stopword_set for doc in docs]
        # Drop sentences that ended up with no words at all.
        docs = [doc for doc in docs if doc]

        self.sent_dict = SentDict(docs, method, min_times, scale, pos_seeds, neg_seeds)
        return self.sent_dict.sent_dict

    def analyse_sent(self, sent, avg=True):
        """Return the sentiment of a sentence.

        The sentence is segmented with ``self.seg`` and scored by the lexicon
        stored in ``self.sent_dict``, so ``build_sent_dict`` must have been
        called beforehand.

        :param sent: str, the sentence to score.
        :param avg: bool, default True. Whether to average the sentiment
            values of the words found in the lexicon.
        :return: float sentiment value if ``avg`` is True, otherwise a list of
            per-word sentiment values.
        """
        return self.sent_dict.analyse_sent(self.seg(sent), avg)