Source code for harvesttext.ent_network

import networkx as nx
from itertools import combinations

class EntNetworkMixin:
    """
    Entity network module:
    - based on entity co-occurrence within documents,
    - build a global social network, or
    - build a social network centered on a chosen entity.
    """
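    # A minimal usage sketch (an assumption for illustration, not part of the library
    # source): this mixin expects its host class to provide self.entity_linking(),
    # self.seg(), self.search_entity() and self.entity_type_dict, which in HarvestText
    # are supplied by the other mixins composing the main class, e.g.:
    #
    #     class HarvestText(EntNetworkMixin, ...):   # hypothetical composition
    #         ...
    #     ht = HarvestText()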
    def build_entity_graph(self, docs, min_freq=0, inv_index={}, used_types=[]):
        """Build the global entity co-occurrence graph from the documents.

        :param docs: list of texts
        :param min_freq: minimum co-occurrence count for an edge to be kept
        :param inv_index: optional inverted index {entity: set of doc ids} for faster retrieval
        :param used_types: if non-empty, only entities of these types are included
        :return: G (a networkx Graph), also stored as self.entity_graph
        """
        G = nx.Graph()
        links = {}
        if len(inv_index) == 0:
            # no inverted index: link the entities found together in each document
            for i, sent in enumerate(docs):
                entities_info = self.entity_linking(sent)
                if len(used_types) == 0:
                    entities = set(entity for span, (entity, type0) in entities_info)
                else:
                    entities = set(entity for span, (entity, type0) in entities_info
                                   if type0[1:-1] in used_types)
                for u, v in combinations(entities, 2):
                    pair0 = tuple(sorted((u, v)))
                    if pair0 not in links:
                        links[pair0] = 1
                    else:
                        links[pair0] += 1
        else:  # an inverted index is available, so co-occurrences can be counted faster
            if len(used_types) == 0:
                entities = self.entity_type_dict.keys()
            else:
                entities = iter(entity for (entity, type0) in self.entity_type_dict.items()
                                if type0 in used_types)
            for u, v in combinations(entities, 2):
                pair0 = tuple(sorted((u, v)))
                ids = inv_index[u] & inv_index[v]
                if len(ids) > 0:
                    links[pair0] = len(ids)
        for (u, v) in links:
            if links[(u, v)] >= min_freq:
                G.add_edge(u, v, weight=links[(u, v)])
        self.entity_graph = G
        return G
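    # Usage sketch for build_entity_graph() (hedged example; `ht`, add_entities() and
    # the sample data are assumptions, not taken from this file):
    #
    #     ht = HarvestText()
    #     ht.add_entities(entity_mention_dict, entity_type_dict)   # register entities first
    #     G = ht.build_entity_graph(docs, min_freq=1)
    #     top_pairs = sorted(G.edges(data=True), key=lambda e: e[2]["weight"], reverse=True)
    #
    # Each edge weight counts how many sentences (or, with inv_index, how many indexed
    # documents) contain both entities.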
    def build_word_ego_graph(self, docs, word, standard_name=True, min_freq=0, other_min_freq=-1, stopwords=None):
        '''Given the documents and a specified central word, build a graph of the word relations centered on it.
        The central word can stand for a particular aspect (e.g. food, clothing, housing or transport
        in such documents), so the word ego graph gives a brief overview of that aspect.

        :param docs: list of texts
        :param word: the central word
        :param standard_name: map all entity mentions to their standard entity names
        :param stopwords: stopwords to be filtered out
        :param min_freq: minimum co-occurrence count with the central word for an edge to be added,
                         used to prune a possibly excessive number of edges
        :param other_min_freq: minimum co-occurrence count for relations among non-central words
        :return: G (a networkx Graph)
        '''
        G = nx.Graph()
        links = {}
        if other_min_freq == -1:
            other_min_freq = min_freq
        for doc in docs:
            if stopwords:
                words = set(x for x in self.seg(doc, standard_name=standard_name) if x not in stopwords)
            else:
                words = self.seg(doc, standard_name=standard_name)
            if word in words:
                for u, v in combinations(words, 2):
                    pair0 = tuple(sorted((u, v)))
                    if pair0 not in links:
                        links[pair0] = 1
                    else:
                        links[pair0] += 1

        used_nodes = set([word])  # words kept as nodes must co-occur with the central word (>= min_freq)
        for (u, v) in links:
            w = links[(u, v)]
            if word in (u, v) and w >= min_freq:
                used_nodes.add(v if word == u else u)
                G.add_edge(u, v, weight=w)
            elif w >= other_min_freq:
                G.add_edge(u, v, weight=w)
        G = G.subgraph(used_nodes).copy()
        return G
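    # Usage sketch for build_word_ego_graph() (hedged; the aspect word and stopword set
    # are illustrative assumptions):
    #
    #     stopwords = {"的", "了", "是"}
    #     G = ht.build_word_ego_graph(docs, "价格", min_freq=2, stopwords=stopwords)
    #     neighbors = sorted(G["价格"].items(), key=lambda kv: kv[1]["weight"], reverse=True)
    #
    # Only words that co-occur with the central word at least min_freq times are kept as
    # nodes; edges among those neighbors additionally require other_min_freq co-occurrences.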
    def build_entity_ego_graph(self, docs, word, min_freq=0, other_min_freq=-1, inv_index={}, used_types=[]):
        '''Entity-only version of build_word_ego_graph(): nodes are linked entities instead of segmented words.
        '''
        G = nx.Graph()
        links = {}
        if other_min_freq == -1:
            other_min_freq = min_freq
        if len(inv_index) != 0:
            # with an inverted index, retrieve the documents mentioning the central entity directly
            related_docs = self.search_entity(word, docs, inv_index)
        else:
            # otherwise, keep only the documents whose linked entities contain the central entity
            related_docs = []
            for doc in docs:
                entities_info = self.entity_linking(doc)
                entities = [entity0 for [[l, r], (entity0, type0)] in entities_info]
                if word in entities:
                    related_docs.append(doc)

        for i, sent in enumerate(related_docs):
            entities_info = self.entity_linking(sent)
            if len(used_types) == 0:
                entities = set(entity for span, (entity, type0) in entities_info)
            else:
                entities = set(entity for span, (entity, type0) in entities_info
                               if type0[1:-1] in used_types)
            for u, v in combinations(entities, 2):
                pair0 = tuple(sorted((u, v)))
                if pair0 not in links:
                    links[pair0] = 1
                else:
                    links[pair0] += 1

        used_nodes = set([word])  # entities kept as nodes must co-occur with the central entity (>= min_freq)
        for (u, v) in links:
            w = links[(u, v)]
            if word in (u, v) and w >= min_freq:
                used_nodes.add(v if word == u else u)
                G.add_edge(u, v, weight=w)
            elif w >= other_min_freq:
                G.add_edge(u, v, weight=w)
        G = G.subgraph(used_nodes).copy()
        return G
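# Usage sketch for build_entity_ego_graph() (hedged; the drawing step and the central
# entity name are assumptions, only networkx itself is imported by this module):
#
#     import matplotlib.pyplot as plt
#     G = ht.build_entity_ego_graph(docs, center_entity, min_freq=1)
#     nx.draw_networkx(G)
#     plt.show()
#
# When inv_index is supplied, self.search_entity() pre-filters the documents containing
# the central entity; otherwise every document is entity-linked to find them.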