Source code for harvesttext.resources

#!/usr/bin/env python
# coding=utf-8

# Resources

# Positive/negative sentiment lexicon: Jun Li, Tsinghua University
#
# This resource was used in the following papers:
# Jun Li and Maosong Sun, Experimental Study on Sentiment Classification of Chinese Review using Machine Learning Techniques, in Proceedings of IEEE NLPKE 2007
# Jun Li, An Experimental Study on Sentiment Classification of Chinese Reviews, Master's thesis, Tsinghua University, 2008
import os
import json
from collections import defaultdict

def get_qh_sent_dict():
    """Get the reference sentiment lexicon: positive/negative lexicon by Jun Li, Tsinghua University.

    This resource was used in the following papers:
    Jun Li and Maosong Sun, Experimental Study on Sentiment Classification of Chinese Review using Machine Learning Techniques, in Proceedings of IEEE NLPKE 2007
    Jun Li, An Experimental Study on Sentiment Classification of Chinese Reviews, Master's thesis, Tsinghua University, 2008

    :return: qh_sent_dict = {"pos": [words], "neg": [words]}
    """
    pwd = os.path.abspath(os.path.dirname(__file__))
    with open(pwd + "/resources/qh_sent_dict.json", "r", encoding="utf-8") as f:
        qh_sent_dict = json.load(f)
    return qh_sent_dict
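
# Usage sketch (a minimal example, assuming the bundled resources/qh_sent_dict.json is present;
# `words` stands for a hypothetical pre-tokenized word list):
#     sent_dict = get_qh_sent_dict()
#     pos_words, neg_words = set(sent_dict["pos"]), set(sent_dict["neg"])
#     # a naive lexicon score: +1 per positive word, -1 per negative word
#     score = sum((w in pos_words) - (w in neg_words) for w in words)
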
def get_baidu_stopwords():
    """Get the Baidu stopword list.

    Source, a version circulated online: https://wenku.baidu.com/view/98c46383e53a580216fcfed9.html
    Contains common Chinese and English words and some punctuation marks.

    :return: stopwords: set of string
    """
    pwd = os.path.abspath(os.path.dirname(__file__))
    with open(pwd + "/resources/bd_stopwords.json", "r", encoding="utf-8") as f:
        stopwords = json.load(f)
    return set(stopwords)
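
# Usage sketch (jieba and the variable `text` are assumptions for illustration,
# not requirements of this function):
#     stopwords = get_baidu_stopwords()
#     tokens = [w for w in jieba.cut(text) if w not in stopwords]
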
def get_nltk_en_stopwords():
    """English stopwords from NLTK.

    :return: stopwords: set of string
    """
    import nltk
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        # corpus not available locally, download it on first use
        nltk.download('stopwords')
    from nltk.corpus import stopwords
    return set(stopwords.words('english'))
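
# Usage sketch (requires nltk to be installed; the stopwords corpus is downloaded on first call):
#     en_stopwords = get_nltk_en_stopwords()
#     content = [w for w in "this is a small example".split() if w not in en_stopwords]
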
def get_qh_typed_words(used_types=['IT', '动物', '医药', '历史人名', '地名', '成语', '法律', '财经', '食物']):
    """THUOCL: Tsinghua Open Chinese Lexicon http://thuocl.thunlp.org/

    Covers categories such as: IT 财经 成语 地名 历史名人 诗词 医学 饮食 法律 汽车 动物
    (the category names actually used as keys in THUOCL.json appear in the default value of used_types)

    :param used_types: list of category names to load
    :return: typed_words: dict whose keys are the categories and whose values are sets of words of that category
    """
    pwd = os.path.abspath(os.path.dirname(__file__))
    with open(pwd + "/resources/THUOCL.json", "r", encoding="utf-8") as f:
        typed_words0 = json.load(f)
    typed_words = dict()
    for type0 in typed_words0:
        if type0 in used_types:
            typed_words[type0] = set(typed_words0[type0])
    return typed_words
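
# Usage sketch (category names are the Chinese keys of THUOCL.json, as in the default used_types):
#     typed_words = get_qh_typed_words(used_types=['IT', '地名'])
#     for type0, words in typed_words.items():
#         print(type0, len(words))
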
def get_sanguo():
    """Get the original text of Romance of the Three Kingdoms (Sanguo Yanyi).

    :return: ["chapter 1 text", "chapter 2 text", ...]
    """
    pwd = os.path.abspath(os.path.dirname(__file__))
    with open(pwd + "/resources/sanguo_docs.json", "r", encoding="utf-8") as f:
        docs = json.load(f)
    return docs
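
# Usage sketch:
#     docs = get_sanguo()
#     print(len(docs))        # number of chapters
#     print(docs[0][:20])     # start of the first chapter
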
def get_sanguo_entity_dict():
    """Get a knowledge base of person, place and faction names in Romance of the Three Kingdoms.

    A simple hand-built version; it certainly has omissions and errors, for reference only.

    :return: entity_mention_dict, entity_type_dict
    """
    pwd = os.path.abspath(os.path.dirname(__file__))
    with open(pwd + "/resources/sanguo_entity_dict.json", "r", encoding="utf-8") as f:
        entity_dict = json.load(f)
    return entity_dict["mention"], entity_dict["type"]
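
# Usage sketch (assumes the two dicts share entity keys, as the paired return values suggest):
#     entity_mention_dict, entity_type_dict = get_sanguo_entity_dict()
#     entity = next(iter(entity_mention_dict))
#     print(entity, entity_mention_dict[entity], entity_type_dict.get(entity))
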
def get_english_senti_lexicon(type="LH"):
    """Get an English sentiment lexicon.

    Currently defaults to the lexicon from:
    https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#lexicon

    If you use this list, please cite the following paper:
    Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews."
    Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery
    and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, Washington, USA.

    :return: sent_dict = {"pos": [words], "neg": [words]}
    """
    pwd = os.path.abspath(os.path.dirname(__file__))
    with open(pwd + "/resources/LH_senti_lexicon.json", "r", encoding="utf-8") as f:
        senti_lexicon = json.load(f)
    return senti_lexicon
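
# Usage sketch (same {"pos": [...], "neg": [...]} layout as get_qh_sent_dict):
#     senti_lexicon = get_english_senti_lexicon()
#     pos, neg = set(senti_lexicon["pos"]), set(senti_lexicon["neg"])
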
def get_jieba_dict(min_freq=0, max_freq=float('inf'), with_pos=False, use_proxy=False, proxies=None):
    """Get the Chinese word-frequency dictionary bundled with jieba.

    :param min_freq: minimum frequency required for a word to be included
    :param max_freq: maximum frequency allowed for a word to be included
    :param with_pos: whether to include part-of-speech information in the result
    :param use_proxy: whether to download through a proxy
    :param proxies: proxy settings passed to the downloader
    :return: if not with_pos, dict of {wd: freq}; else, dict of {(wd, pos): freq}
    """
    from .download_utils import RemoteFileMetadata, check_download_resource
    remote = RemoteFileMetadata(
        filename='jieba_dict.txt',
        url='https://github.com/blmoistawinde/HarvestText/releases/download/V0.8/jieba_dict.txt',
        checksum='7197c3211ddd98962b036cdf40324d1ea2bfaa12bd028e68faa70111a88e12a8')
    file_path = check_download_resource(remote, use_proxy, proxies)
    ret = defaultdict(int)
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # each valid line has the form: word frequency pos
            if len(line.strip().split()) == 3:
                wd, freq, pos = line.strip().split()
                freq = int(freq)
                if min_freq < freq < max_freq:
                    if not with_pos:
                        ret[wd] = freq
                    else:
                        ret[(wd, pos)] = freq
    return ret
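
# Usage sketch (downloads jieba_dict.txt from the GitHub release if it is not already available locally):
#     freq_dict = get_jieba_dict(min_freq=100)                 # {word: freq}
#     pos_dict = get_jieba_dict(min_freq=100, with_pos=True)   # {(word, pos): freq}
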