# topic_segmentation /
# naveed92's picture
# history blame
# No virus
# 2.65 kB
from itertools import islice
import numpy as np
# Sliding window function
def window(seq, n=3):
    """Yield a sliding window of width n over data from the iterable seq.

    Each window is a tuple of n consecutive items, and successive windows
    advance by one item. Nothing is yielded when seq holds fewer than n
    items.
    """
    it = iter(seq)
    # Prime the first window with up to n items.
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    # Slide: drop the oldest item, append the next one.
    for elem in it:
        result = result[1:] + (elem,)
        yield result
# Compute depth scores
def get_depths(scores):
    """Given a sequence of coherence scores of length n, compute a sequence
    of depth scores of the same length.

    For every position the nearest peak on each side is found by walking
    outwards while the values keep strictly increasing. The depth is the
    average height of those two peaks above the score at that position:
    ``0.5 * (left_peak + right_peak - 2 * score)``.

    Returns a NumPy array with one depth per input score (empty for empty
    input).
    """

    def climb(seq, i, mode='left'):
        """Given a sequence seq of values and index i, advance the index
        either to the right or left while the value keeps increasing, then
        return the value at the new index."""
        if mode == 'left':
            step, edge = -1, 0
        elif mode == 'right':
            step, edge = 1, len(seq) - 1
        else:
            raise ValueError("mode must be 'left' or 'right'")
        # Stop at the boundary or as soon as the neighbour is not strictly
        # greater than the current value.
        while i != edge and seq[i + step] > seq[i]:
            i += step
        return seq[i]

    depths = [
        0.5 * (climb(scores, i, mode='left')
               + climb(scores, i, mode='right')
               - (2 * score))
        for i, score in enumerate(scores)
    ]
    return np.array(depths)
from scipy.signal import argrelmax
# Filter out local maxima
def get_local_maxima(depth_scores, order=1):
    """Given a sequence of depth scores, return a filtered sequence in which
    only the local maxima keep their value and every other entry is zero.

    depth_scores may be any one-dimensional array-like (list, tuple,
    ndarray). order is passed to scipy.signal.argrelmax and sets how many
    points on each side a value is compared against.

    Returns a float NumPy array of the same length as depth_scores.
    """
    # argrelmax and the fancy indexing below both need a real ndarray.
    depth_scores = np.asarray(depth_scores)
    maxima_ids = argrelmax(depth_scores, order=order)[0]
    filtered_scores = np.zeros(len(depth_scores))
    filtered_scores[maxima_ids] = depth_scores[maxima_ids]
    return filtered_scores
# Automatic threshold computation
def compute_threshold(scores):
    """From TextTiling:
    Automatically compute an appropriate threshold given a sequence of
    depth scores.

    Only the nonzero scores are considered; the threshold is their mean
    minus half of their standard deviation. scores may be any array-like.
    If no score is nonzero, the statistics are taken over an empty
    selection and NumPy yields NaN.
    """
    scores = np.asarray(scores)
    # Zero entries are excluded so that they do not drag the statistics
    # down.
    s = scores[np.nonzero(scores)]
    # A lower alternative cutoff would be np.mean(s) - np.std(s).
    threshold = np.mean(s) - (np.std(s) / 2)
    return threshold
def get_threshold_segments(scores, threshold=0.1):
    """Given a sequence of depth scores, return the indexes where the value
    is greater than or equal to the threshold.

    scores may be any one-dimensional array-like. The result is a NumPy
    array of integer positions in ascending order.
    """
    # Coerce first: comparing a plain list with a number raises TypeError.
    scores = np.asarray(scores)
    segment_ids = np.where(scores >= threshold)[0]
    return segment_ids