Source code for harvesttext.algorithms.texttile
import numpy as np
from .utils import sent_sim_cos
class TextTile:
    """Paragraph segmentation of a sentence sequence in the style of TextTiling.

    Similarity is measured across every gap between adjacent sentences, each
    gap gets a "depth" relative to the surrounding similarity peaks, and the
    deepest gaps are chosen as paragraph boundaries.
    """

    def __init__(self):
        pass

    def depth_scores(self, sim_scores):
        """Return the depth of every gap except the last one.

        For each position the depth is
        ``left_peak + right_peak - 2 * sim_scores[pos]``, where the peaks are
        the local maxima found by climbing left and right from that position.

        A peak is only applied to a run when the scan sees the similarity
        change direction at it. A leading descending run therefore gets no
        left peak, and a run that is still rising at the second-to-last
        position gets no right peak; those positions keep their own value as
        the peak, contributing 0 on that side.

        :param sim_scores: non-empty sequence of gap similarities. The caller
            in this class passes a trailing placeholder entry.
        :return: numpy array of length ``len(sim_scores) - 1``.
        :raises AssertionError: if ``sim_scores`` is empty.
        """
        assert len(sim_scores) > 0
        sim_scores = np.array(sim_scores)
        left_peaks = sim_scores.copy()
        right_peaks = sim_scores.copy()

        # Left-to-right scan: when the similarity drops after position i,
        # position i is the right-hand peak of the non-decreasing run ending there.
        run_start = 0
        for i, score in enumerate(sim_scores[:-2]):
            if sim_scores[i + 1] < score:
                right_peaks[run_start:i + 1] = score
                run_start = i + 1

        # Right-to-left scan: when the similarity rises into position i,
        # position i is the left-hand peak of the non-increasing run starting there.
        run_end = len(sim_scores)
        for i in range(len(sim_scores) - 1, 0, -1):
            score = sim_scores[i]
            if sim_scores[i - 1] < score:
                left_peaks[i:run_end] = score
                run_end = i

        depths = left_peaks + right_peaks - 2 * sim_scores
        # The last position is dropped: it is not a candidate gap.
        return depths[:-1]

    def _align_boundary(self, predicted_boundary_ids, original_boundary_ids):
        """Snap each predicted boundary onto one of the original boundaries.

        Predicted boundaries are processed in order. Each one is moved to a
        nearby original boundary that has not been used or skipped yet, while
        always leaving enough original boundaries for the predictions that
        follow. Two predictions therefore never map to the same original
        boundary.

        :param predicted_boundary_ids: list of predicted boundary ids; it is
            updated in place and also returned.
        :param original_boundary_ids: candidate boundary ids (presumably
            sorted ascending - confirm with callers). This list is NOT
            modified; a private copy is consumed instead.
        :return: ``predicted_boundary_ids`` with every entry replaced by an
            original boundary id.
        """
        # Work on a copy so the caller's list survives the deletions below.
        remaining = list(original_boundary_ids)
        num_predicted = len(predicted_boundary_ids)
        for i, pid in enumerate(predicted_boundary_ids):
            # Only the first `preserve_to` candidates may be used, so that one
            # candidate is left over for every later prediction.
            preserve_to = len(remaining) - num_predicted + i + 1
            aligned_at = preserve_to - 1
            # The search threshold is seeded with the last admissible id itself.
            dist = remaining[aligned_at]
            for j, oid in enumerate(remaining[:preserve_to]):
                candidate_dist = abs(pid - oid)
                if candidate_dist > dist:
                    break
                dist, aligned_at = candidate_dist, j
            predicted_boundary_ids[i] = remaining[aligned_at]
            # Drop the chosen candidate and everything before it, so later
            # predictions cannot duplicate it or move backwards.
            del remaining[:aligned_at + 1]
        return predicted_boundary_ids

    def cut_paragraphs(self, sent_words, num_paras=None, block_sents=3, std_weight=0.5,
                       align_boundary=True, original_boundary_ids=None):
        """Choose paragraph boundaries for a sequence of tokenised sentences.

        :param sent_words: sequence of sentences, each an iterable of words.
        :param num_paras: desired number of paragraphs. When ``None`` it is
            the number of gaps whose depth exceeds
            ``mean(depths) - std_weight * std(depths)``. Values below 1 are
            treated as 1.
        :param block_sents: number of sentences on each side of a gap that
            are pooled before comparing the two sides.
        :param std_weight: weight of the standard deviation in the automatic
            threshold.
        :param align_boundary: when true, every predicted boundary is moved
            onto one of ``original_boundary_ids``.
        :param original_boundary_ids: existing boundary ids; required when
            ``align_boundary`` is true. It is left unmodified.
        :return: sorted list of boundary ids. An id ``b`` is a cut just before
            sentence index ``b``; the last id is always ``len(sent_words)``.
            When ``num_paras`` is not smaller than the number of original
            boundaries, ``original_boundary_ids`` itself is returned.
        :raises AssertionError: if ``sent_words`` is empty, or if
            ``align_boundary`` is true and ``original_boundary_ids`` is None.
        """
        num_sents = len(sent_words)
        # sims[k] compares the block before and after the gap that follows
        # sentence k; the final slot has no gap and stays 0.
        sims = [0] * num_sents
        for i in range(1, num_sents):
            left_words = [w for words in sent_words[max(0, i - block_sents):i] for w in words]
            right_words = [w for words in sent_words[i:i + block_sents] for w in words]
            # sent_sim_cos comes from .utils (presumably cosine similarity of
            # the two word lists - confirm against that helper).
            sims[i - 1] = sent_sim_cos(left_words, right_words)

        depths = self.depth_scores(sims)  # the last position is excluded: it must be a boundary
        if num_paras is None:  # determine automatically from the depth statistics
            num_paras = np.sum(depths > np.mean(depths) - std_weight * np.std(depths))
        if num_paras < 1:
            # With 0 the slice below would become [:-1] and select almost
            # every gap; there is always at least one paragraph.
            num_paras = 1

        if align_boundary:
            assert original_boundary_ids is not None
            if num_paras >= len(original_boundary_ids):
                return original_boundary_ids

        # The num_paras-1 deepest gaps, plus the end of the text which is
        # always a boundary.
        deepest_gaps = np.argsort(depths)[::-1][:num_paras - 1]
        predicted_boundary_ids = sorted((1 + deepest_gaps).tolist() + [num_sents])
        if align_boundary:
            # Move each predicted boundary to the nearest original one; if two
            # predictions fall in the same original paragraph the later one is
            # pushed to the next original boundary.
            predicted_boundary_ids = self._align_boundary(predicted_boundary_ids, original_boundary_ids)
        return predicted_boundary_ids