Source code for harvesttext.ent_network

import networkx as nx
from itertools import combinations

class EntNetworkMixin:
    """
    Entity network module:
    - based on entity co-occurrence within documents,
    - build a global social network, or
    - build a social network centered on a chosen entity.
    """
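    # A minimal usage sketch (an assumption for illustration, not part of the library
    # source): this mixin expects its host class to provide self.entity_linking(),
    # self.seg(), self.search_entity() and self.entity_type_dict, which in HarvestText
    # are supplied by the other mixins composing the main class, e.g.:
    #
    #     class HarvestText(EntNetworkMixin, ...):   # hypothetical composition
    #         ...
    #     ht = HarvestText()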
    def build_entity_graph(self, docs, min_freq=0, inv_index={}, used_types=[]):
        """Build the global entity co-occurrence graph from the documents.

        :param docs: list of texts
        :param min_freq: minimum co-occurrence count for an edge to be kept
        :param inv_index: optional inverted index {entity: set of doc ids} for faster retrieval
        :param used_types: if non-empty, only entities of these types are included
        :return: G (a networkx Graph), also stored as self.entity_graph
        """
        G = nx.Graph()
        links = {}
        if len(inv_index) == 0:
            # no inverted index: link the entities found together in each document
            for i, sent in enumerate(docs):
                entities_info = self.entity_linking(sent)
                if len(used_types) == 0:
                    entities = set(entity for span, (entity, type0) in entities_info)
                else:
                    entities = set(entity for span, (entity, type0) in entities_info
                                   if type0[1:-1] in used_types)
                for u, v in combinations(entities, 2):
                    pair0 = tuple(sorted((u, v)))
                    if pair0 not in links:
                        links[pair0] = 1
                    else:
                        links[pair0] += 1
        else:  # an inverted index is available, so co-occurrences can be counted faster
            if len(used_types) == 0:
                entities = self.entity_type_dict.keys()
            else:
                entities = iter(entity for (entity, type0) in self.entity_type_dict.items()
                                if type0 in used_types)
            for u, v in combinations(entities, 2):
                pair0 = tuple(sorted((u, v)))
                ids = inv_index[u] & inv_index[v]
                if len(ids) > 0:
                    links[pair0] = len(ids)
        for (u, v) in links:
            if links[(u, v)] >= min_freq:
                G.add_edge(u, v, weight=links[(u, v)])
        self.entity_graph = G
        return G
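    # Usage sketch for build_entity_graph() (hedged example; `ht`, add_entities() and
    # the sample data are assumptions, not taken from this file):
    #
    #     ht = HarvestText()
    #     ht.add_entities(entity_mention_dict, entity_type_dict)   # register entities first
    #     G = ht.build_entity_graph(docs, min_freq=1)
    #     top_pairs = sorted(G.edges(data=True), key=lambda e: e[2]["weight"], reverse=True)
    #
    # Each edge weight counts how many sentences (or, with inv_index, how many indexed
    # documents) contain both entities.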
    def build_word_ego_graph(self, docs, word, standard_name=True, min_freq=0, other_min_freq=-1, stopwords=None):
        '''Given the documents and a specified central word, build a graph of the word relations centered on it.
        The central word can stand for a particular aspect (e.g. food, clothing, housing or transport
        in such documents), so the word ego graph gives a brief overview of that aspect.

        :param docs: list of texts
        :param word: the central word
        :param standard_name: map all entity mentions to their standard entity names
        :param stopwords: stopwords to be filtered out
        :param min_freq: minimum co-occurrence count with the central word for an edge to be added,
                         used to prune a possibly excessive number of edges
        :param other_min_freq: minimum co-occurrence count for relations among non-central words
        :return: G (a networkx Graph)
        '''
        G = nx.Graph()
        links = {}
        if other_min_freq == -1:
            other_min_freq = min_freq
        for doc in docs:
            if stopwords:
                words = set(x for x in self.seg(doc, standard_name=standard_name) if x not in stopwords)
            else:
                words = self.seg(doc, standard_name=standard_name)
            if word in words:
                for u, v in combinations(words, 2):
                    pair0 = tuple(sorted((u, v)))
                    if pair0 not in links:
                        links[pair0] = 1
                    else:
                        links[pair0] += 1

        used_nodes = set([word])  # words kept as nodes must co-occur with the central word (>= min_freq)
        for (u, v) in links:
            w = links[(u, v)]
            if word in (u, v) and w >= min_freq:
                used_nodes.add(v if word == u else u)
                G.add_edge(u, v, weight=w)
            elif w >= other_min_freq:
                G.add_edge(u, v, weight=w)
        G = G.subgraph(used_nodes).copy()
        return G
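    # Usage sketch for build_word_ego_graph() (hedged; the aspect word and stopword set
    # are illustrative assumptions):
    #
    #     stopwords = {"的", "了", "是"}
    #     G = ht.build_word_ego_graph(docs, "价格", min_freq=2, stopwords=stopwords)
    #     neighbors = sorted(G["价格"].items(), key=lambda kv: kv[1]["weight"], reverse=True)
    #
    # Only words that co-occur with the central word at least min_freq times are kept as
    # nodes; edges among those neighbors additionally require other_min_freq co-occurrences.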
    def build_entity_ego_graph(self, docs, word, min_freq=0, other_min_freq=-1, inv_index={}, used_types=[]):
        '''Entity-only version of build_word_ego_graph(): nodes are linked entities instead of segmented words.
        '''
        G = nx.Graph()
        links = {}
        if other_min_freq == -1:
            other_min_freq = min_freq
        if len(inv_index) != 0:
            # with an inverted index, retrieve the documents mentioning the central entity directly
            related_docs = self.search_entity(word, docs, inv_index)
        else:
            # otherwise, keep only the documents whose linked entities contain the central entity
            related_docs = []
            for doc in docs:
                entities_info = self.entity_linking(doc)
                entities = [entity0 for [[l, r], (entity0, type0)] in entities_info]
                if word in entities:
                    related_docs.append(doc)

        for i, sent in enumerate(related_docs):
            entities_info = self.entity_linking(sent)
            if len(used_types) == 0:
                entities = set(entity for span, (entity, type0) in entities_info)
            else:
                entities = set(entity for span, (entity, type0) in entities_info
                               if type0[1:-1] in used_types)
            for u, v in combinations(entities, 2):
                pair0 = tuple(sorted((u, v)))
                if pair0 not in links:
                    links[pair0] = 1
                else:
                    links[pair0] += 1

        used_nodes = set([word])  # entities kept as nodes must co-occur with the central entity (>= min_freq)
        for (u, v) in links:
            w = links[(u, v)]
            if word in (u, v) and w >= min_freq:
                used_nodes.add(v if word == u else u)
                G.add_edge(u, v, weight=w)
            elif w >= other_min_freq:
                G.add_edge(u, v, weight=w)
        G = G.subgraph(used_nodes).copy()
        return G
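# Usage sketch for build_entity_ego_graph() (hedged; the drawing step and the central
# entity name are assumptions, only networkx itself is imported by this module):
#
#     import matplotlib.pyplot as plt
#     G = ht.build_entity_ego_graph(docs, center_entity, min_freq=1)
#     nx.draw_networkx(G)
#     plt.show()
#
# When inv_index is supplied, self.search_entity() pre-filters the documents containing
# the central entity; otherwise every document is entity-linked to find them.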