Source code for harvesttext.algorithms.keyword
import numpy as np
import networkx as nx
def combine(word_list, window = 2):
    """Yield the word pairs that co-occur within ``window``.

    Each yielded pair is used by the caller as an edge between two words.
    Pairs are produced offset by offset: first every pair of adjacent
    words, then every pair two positions apart, and so on up to
    ``window - 1`` positions apart.

    :param word_list: list of str, the words in their original order.
    :param window: int, window size; values below 2 are treated as 2.
    """
    width = 2 if window < 2 else window
    for offset in range(1, width):
        # Once the offset reaches the list length no further pair exists.
        if offset >= len(word_list):
            return
        # zip stops at the shorter (shifted) list, pairing word i with i+offset.
        yield from zip(word_list, word_list[offset:])
def textrank(block_words, topK, with_score=False, window=2, weighted=False):
    """Rank words by PageRank over a word co-occurrence graph.

    An undirected graph is built whose edges are the word pairs produced by
    ``combine`` for every word list in ``block_words``; ``nx.pagerank`` is
    then run on it and the highest-scoring words are returned.

    :param block_words: iterable of word lists, one list per text block.
    :param topK: number of top-ranked words to keep (used as a slice bound).
    :param with_score: if True, return ``(word, score)`` tuples instead of
        bare words.
    :param window: co-occurrence window size forwarded to ``combine``.
    :param weighted: if True, each edge carries a ``weight`` attribute that
        counts how many times the pair was seen; otherwise edges are added
        without a weight attribute.
    :return: list of words, or list of ``(word, score)`` tuples when
        ``with_score`` is True, sorted by descending score.
    """
    graph = nx.Graph()
    for words in block_words:
        for left, right in combine(words, window):
            if not weighted:
                graph.add_edge(left, right)
            elif graph.has_edge(left, right):
                # Pair seen before: bump its co-occurrence count.
                graph[left][right]['weight'] += 1
            else:
                graph.add_edge(left, right, weight=1)

    scores = nx.pagerank(graph)
    # Stable sort: words with equal scores keep the graph's node order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    top = ranked[:topK]
    if with_score:
        return top
    return [word for (word, _score) in top]