Source code for harvesttext.summary
import numpy as np
import scipy.special
from itertools import combinations
from .algorithms.utils import sent_sim_textrank, sent_sim_cos
class SummaryMixin:
    """
    Text summarization module:
    - unsupervised extractive summarization based on TextRank + MMR
    """

    def get_summary(self, sents, topK=5, stopwords=None, with_importance=False, standard_name=True,
                    maxlen=None, avoid_repeat=False, sim_func='default'):
        """Extract the key sentences of a text with the TextRank algorithm.

        Sentences whose token list is empty after segmentation are ignored:
        they take no part in the ranking and are never returned.

        NOTE(review): relies on ``self.seg`` and ``self.language``, which are
        expected to be provided by the class this mixin is combined with.

        :param sents: list of sentence strings
        :param topK: number of sentences to select; if ``maxlen`` is set, the
            length limit takes priority. If fewer usable sentences exist, all
            of them are eligible and fewer than ``topK`` are returned.
        :param stopwords: stopwords forwarded to the segmenter
        :param with_importance: whether to return the importance score of each
            selected sentence along with the sentence
        :param standard_name: forwarded to the segmenter; when an
            entity_mention_list is available, entity names are normalized,
            which usually improves the result
        :param maxlen: maximum total length (``len`` of the sentence strings)
            of the summary; selection stops as soon as the next sentence
            would exceed it, even if ``topK`` has not been reached
        :param avoid_repeat: use the MMR principle to penalize sentences that
            are redundant with the summary extracted so far
        :param sim_func: similarity function used to weight the TextRank
            graph; ``'default'`` is the word-overlap function of the original
            paper, otherwise any callable taking two token lists
        :return: list of sentences, or list of ``(sentence, score)`` tuples
            when ``with_importance=True``
        """
        assert topK > 0
        import networkx as nx
        maxlen = float('inf') if maxlen is None else maxlen
        sim_func = sent_sim_textrank if sim_func == 'default' else sim_func
        # With standard_name, similarity can build on entity-linking results
        # and therefore be more accurate.
        tokenized = [self.seg(sent.strip(), standard_name=standard_name, stopwords=stopwords) for sent in sents]
        if self.language == "en":
            try:
                from pattern.en import lemma
                tokenized = [[lemma(wd) for wd in tokens] for tokens in tokenized]
            except Exception:
                # Best effort: fall back to the un-lemmatized tokens.
                print(" `pattern` is not installed, so the english words will not be lemmatized, this might slightly hurt the summary quality")

        # Drop sentences without tokens, but keep every surviving sentence
        # paired with its tokens so that graph node ids index both lists
        # consistently (indexing the unfiltered `sents` would return the
        # wrong sentences whenever an earlier sentence had been dropped).
        kept = [(sent, tokens) for sent, tokens in zip(sents, tokenized) if len(tokens) > 0]
        if not kept:
            return []
        kept_sents = [sent for sent, _ in kept]
        sent_tokens = [tokens for _, tokens in kept]

        G = nx.Graph()
        # Register the nodes explicitly: with a single sentence there are no
        # pairs, and a graph built from edges alone would be empty.
        G.add_nodes_from(range(len(sent_tokens)))
        for u, v in combinations(range(len(sent_tokens)), 2):
            G.add_edge(u, v, weight=sim_func(sent_tokens[u], sent_tokens[v]))
        pr = nx.pagerank(G)  # sometimes fail to converge
        pr_sorted = sorted(pr.items(), key=lambda x: x[1], reverse=True)

        ret = []
        curr_len = 0
        if not avoid_repeat:
            # Plain TextRank: take the best-ranked sentences in order.
            for i, imp in pr_sorted[:topK]:
                curr_len += len(kept_sents[i])
                if curr_len > maxlen:
                    break
                ret.append((kept_sents[i], imp) if with_importance else kept_sents[i])
            return ret

        # MMR selection. Never ask for more sentences than are available,
        # mirroring the behavior of the branch above.
        topK = min(topK, len(sent_tokens))
        curr_sumy_words = []
        candidate_ids = list(range(len(sent_tokens)))
        # The first pick is simply the best-ranked sentence.
        i, imp = pr_sorted[0]
        curr_len += len(kept_sents[i])
        if curr_len > maxlen:
            return ret
        ret.append((kept_sents[i], imp) if with_importance else kept_sents[i])
        curr_sumy_words.extend(sent_tokens[i])
        candidate_ids.remove(i)
        for _ in range(topK - 1):
            importance = [pr[cand] for cand in candidate_ids]
            norm_importance = scipy.special.softmax(importance)
            # Redundancy of each candidate against all words selected so far.
            redundancy = np.array([sent_sim_cos(curr_sumy_words, sent_tokens[cand]) for cand in candidate_ids])
            # Reward importance, penalize overlap with the current summary.
            scores = 0.6 * norm_importance - 0.4 * redundancy
            id_in_cands = np.argmax(scores)
            i, imp = candidate_ids[id_in_cands], importance[id_in_cands]
            curr_len += len(kept_sents[i])
            if curr_len > maxlen:
                return ret
            ret.append((kept_sents[i], imp) if with_importance else kept_sents[i])
            curr_sumy_words.extend(sent_tokens[i])
            del candidate_ids[id_in_cands]
        return ret