Source code for harvesttext.sentiment

from .resources import get_qh_sent_dict
from .algorithms.sent_dict import SentDict

[docs]class SentimentMixin:
    """
    情感分析模块：
    - 基于SO-PMI的情感词典挖掘和情感分析算法
    """
[docs]    def build_sent_dict(self, sents, method="PMI", min_times=5, scale="None",
                        pos_seeds=None, neg_seeds=None, stopwords=None):
        '''利用种子词，构建情感词典

        :param sents: list of string, 一般建议为句子，是计算共现PMI的基本单元
        :param method: "PMI", 使用的算法，目前仅支持PMI
        :param min_times: int, 默认为5， 在所有句子中出现次数少于这个次数的词语将被过滤
        :param scale: {"None","0-1","+-1"}, 默认为"None"，否则将对情感值进行变换
            若为"0-1"，按照最大为1，最小为0进行线性伸缩，0.5未必是中性
            若为"+-1", 在正负区间内分别伸缩，保留0作为中性的语义
        :param pos_seeds: list of string, 积极种子词，如不填写将默认采用清华情感词典
        :param neg_seeds: list of string, 消极种子词，如不填写将默认采用清华情感词典
        :param stopwords: list of string, stopwords词，如不填写将不使用
        :return: sent_dict: dict,可以查询单个词语的情感值
        '''
        if pos_seeds is None and neg_seeds is None:
            sdict = get_qh_sent_dict()
            pos_seeds, neg_seeds = sdict["pos"], sdict["neg"]
        docs = [set(self.seg(sent)) for sent in sents]
        if not stopwords is None:
            stopwords = set(stopwords)
            for i in range(len(docs)):
                docs[i] = docs[i] - stopwords
            docs = list(filter(lambda x: len(x) > 0, docs))
        self.sent_dict = SentDict(docs, method, min_times, scale, pos_seeds, neg_seeds)
        return self.sent_dict.sent_dict

[docs]    def analyse_sent(self, sent, avg=True):
        """输入句子，输出其情感值，默认使用句子中，在情感词典中的词语的情感值的平均来计算

        :param sent: string, 句子
        :param avg: (default True) 是否使用平均值计算句子情感值
        :return: float情感值(if avg == True), 否则为词语情感值列表
        """
        return self.sent_dict.analyse_sent(self.seg(sent), avg)