# Source code for harvesttext.algorithms.texttile

import numpy as np
from .utils import sent_sim_cos

class TextTile:
    """Split a sequence of tokenised sentences into paragraphs.

    Adjacent blocks of sentences are compared with ``sent_sim_cos``; gaps whose
    similarity sits in a deep "valley" relative to neighbouring peaks are
    chosen as paragraph boundaries. A boundary id ``k`` means "the paragraph
    ends after the first ``k`` sentences".
    """

    def __init__(self):
        pass

    def depth_scores(self, sim_scores):
        """Return the depth score of every gap similarity except the last one.

        The score at a position is ``left_peak + right_peak - 2 * sim``, where
        the peaks are the values found by the two sweeps below. A position
        that no sweep assigns (for example inside a leading falling run or a
        trailing rising run) keeps its own similarity as its peak.

        :param sim_scores: non-empty sequence of gap similarities; the final
            element is treated as a placeholder (``cut_paragraphs`` passes a
            trailing ``0``) and gets no score.
        :return: numpy array with ``len(sim_scores) - 1`` depth scores.
        :raises AssertionError: if ``sim_scores`` is empty.
        """
        assert len(sim_scores) > 0
        sim_scores = np.array(sim_scores)
        left_peaks = sim_scores.copy()
        right_peaks = sim_scores.copy()

        # Forward sweep: whenever the similarity drops after position i, every
        # position since the previous drop gets sim[i] as its right-hand peak.
        # The slice stops two short so the trailing placeholder is never
        # compared against.
        run_start = 0
        for i, score in enumerate(sim_scores[:-2]):
            if sim_scores[i + 1] < score:
                right_peaks[run_start:i + 1] = score
                run_start = i + 1

        # Backward sweep: whenever the similarity rises into position i, every
        # position from i up to the previously handled rise gets sim[i] as its
        # left-hand peak.
        run_end = len(sim_scores)
        for i in range(len(sim_scores) - 1, 0, -1):
            score = sim_scores[i]
            if sim_scores[i - 1] < score:
                left_peaks[i:run_end] = score
                run_end = i

        depths = left_peaks + right_peaks - 2 * sim_scores
        return depths[:-1]

    def _align_boundary(self, predicted_boundary_ids, original_boundary_ids):
        """Snap each predicted boundary onto one of the original boundaries.

        Boundaries are processed in order. Each original boundary is used at
        most once, and enough trailing original boundaries are held back so
        that every later prediction can still be matched.

        :param predicted_boundary_ids: list of predicted boundary ids; it is
            overwritten in place and also returned.
        :param original_boundary_ids: list of original boundary ids. It is
            NOT modified; the method works on a private copy.
            NOTE(review): the scan below stops at the first increase in
            distance, which presumably relies on both lists being sorted
            ascending -- confirm against callers.
        :return: ``predicted_boundary_ids`` with every entry replaced by an
            original boundary id.
        """
        # Work on a copy: consumed candidates are deleted below and the
        # caller's list must not be altered.
        candidates = list(original_boundary_ids)
        num_predicted = len(predicted_boundary_ids)
        for i, pid in enumerate(predicted_boundary_ids):
            # Hold back one candidate for each prediction that still follows.
            preserve_to = len(candidates) - num_predicted + i + 1
            aligned_at = preserve_to - 1
            # The starting "distance" is the last allowed id itself, used as a
            # loose upper bound; if even the first candidate is farther away
            # than that, the last allowed candidate is chosen.
            dist = candidates[aligned_at]
            for j, oid in enumerate(candidates[:preserve_to]):
                dist0 = abs(pid - oid)
                if dist0 > dist:
                    break
                dist, aligned_at = dist0, j
            predicted_boundary_ids[i] = candidates[aligned_at]
            # Drop the chosen candidate and everything before it so later
            # predictions cannot reuse it or move backwards.
            del candidates[:aligned_at + 1]
        return predicted_boundary_ids

    def _pick_boundaries(self, depths, num_paras, num_sents):
        """Return the sorted boundary ids for ``num_paras`` paragraphs.

        The ``num_paras - 1`` deepest gaps become boundaries and the end of
        the text (``num_sents``) is always the final boundary.
        """
        # Clamp so that 0 or a negative count cannot turn the slice below
        # into "[:-1]", which would select almost every gap.
        num_cuts = max(num_paras, 1) - 1
        deepest_first = np.argsort(depths)[::-1]
        # depths[k] belongs to the gap after sentence k, i.e. boundary id k+1.
        cuts = (1 + deepest_first[:num_cuts]).tolist()
        return sorted(cuts + [num_sents])

    def cut_paragraphs(self, sent_words, num_paras=None, block_sents=3, std_weight=0.5,
                       align_boundary=True, original_boundary_ids=None):
        """Predict paragraph boundaries for a list of tokenised sentences.

        :param sent_words: list of sentences, each a list of words.
        :param num_paras: wanted number of paragraphs. ``None`` derives it
            from the data: the number of gaps whose depth exceeds
            ``mean(depths) - std_weight * std(depths)``. Values below 1 are
            treated as 1.
        :param block_sents: how many sentences on each side of a gap are
            merged into the two blocks that get compared.
        :param std_weight: weight of the standard deviation in the automatic
            threshold above.
        :param align_boundary: if true, every predicted boundary is moved onto
            one of ``original_boundary_ids``.
        :param original_boundary_ids: list of existing boundary ids; required
            when ``align_boundary`` is true. It is never modified.
        :return: sorted list of boundary ids, the last one being
            ``len(sent_words)`` unless alignment replaces it. When aligning
            and at least as many paragraphs are requested as original
            boundaries exist, a copy of ``original_boundary_ids`` is returned.
        :raises AssertionError: if ``sent_words`` is empty, or if
            ``align_boundary`` is true and ``original_boundary_ids`` is None.
        """
        if align_boundary:
            assert original_boundary_ids is not None, \
                "original_boundary_ids is required when align_boundary is True"

        num_sents = len(sent_words)
        # sims[i-1] compares the block ending before sentence i with the block
        # starting at sentence i; the last slot stays 0 as a placeholder.
        sims = [0] * num_sents
        for i in range(1, num_sents):
            left_words = [w for words in sent_words[max(0, i - block_sents):i] for w in words]
            right_words = [w for words in sent_words[i:i + block_sents] for w in words]
            # presumably a cosine similarity of the two word lists -- see .utils
            sims[i - 1] = sent_sim_cos(left_words, right_words)

        depths = self.depth_scores(sims)    # one score per gap, placeholder dropped

        if num_paras is None:   # automatically determine according to stats
            if len(depths) == 0:
                # Single sentence: no gaps, avoid mean/std of an empty array.
                num_paras = 1
            else:
                threshold = np.mean(depths) - std_weight * np.std(depths)
                num_paras = np.sum(depths > threshold)
        # All-equal depths give a count of 0; there is always >= 1 paragraph.
        num_paras = max(num_paras, 1)

        # Alignment cannot yield more paragraphs than there are original
        # boundaries, whether the count was given or derived.
        if align_boundary and num_paras >= len(original_boundary_ids):
            return list(original_boundary_ids)

        predicted_boundary_ids = self._pick_boundaries(depths, num_paras, num_sents)
        if align_boundary:
            # move the predicted boundary to the nearest original one to align
            # if 2 predicted boundaries fall in the same original paragraph,
            # the later one is moved to the next original boundary
            predicted_boundary_ids = self._align_boundary(predicted_boundary_ids, original_boundary_ids)
        return predicted_boundary_ids