import re

import numpy as np
from thefuzz import fuzz


def match_mask_and_transcript(
    split_punct, transcript, classification, *, max_gap=15, min_duration=10
):
    """Locate the sponsored sentences inside a timestamped transcript.

    The sentences flagged in ``classification`` are joined into one reference
    string. Every transcript entry is fuzzy-matched against that string, and
    entries whose score is more than two standard deviations above the mean
    score are kept. Kept entries that lie close together in time are merged
    into segments, and segments that are too short are discarded.

    Args:
        split_punct: The punctuated text split into sentences; indexed in
            parallel with ``classification``.
        transcript: Sequence of mappings, each providing ``"text"``,
            ``"start"`` and ``"duration"`` keys (start/duration are presumably
            in seconds -- confirm against the transcript source).
        classification: Sequence of 0/1 flags; ``1`` marks the sentence at the
            same index in ``split_punct`` as sponsored.
        max_gap: Largest distance between the end of the running segment and
            the start of the next matched entry for the two to be merged.
            Expressed in the same unit as ``"start"``.
        min_duration: Merged segments must be strictly longer than this to be
            returned.

    Returns:
        A ``(segments, timestamps)`` tuple. ``segments`` is a list of
        ``(start, end)`` tuples for the merged segments that passed the length
        filter; ``timestamps`` is the list of ``(start, duration)`` tuples of
        every transcript entry that passed the score threshold.
    """
    # Join every sentence flagged as sponsored into one reference string.
    segment = " ".join(
        split_punct[i] for i, val in enumerate(classification) if val == 1
    )

    # Without transcript entries there is nothing to score; returning early
    # also avoids taking the mean/std of an empty array (NaN plus warnings).
    if not transcript:
        return [], []

    # Similarity of each transcript entry to the sponsored text.
    scores = np.array(
        [fuzz.partial_ratio(segment, entry["text"]) for entry in transcript]
    )

    # Keep only the entries whose score is an outlier: above mean + 2 * stdev.
    threshold = np.mean(scores) + np.std(scores) * 2
    timestamps = [
        (entry["start"], entry["duration"])
        for entry, score in zip(transcript, scores)
        if score > threshold
    ]

    # Merge matched entries into contiguous segments.
    times = []
    last_end = None
    for start, duration in timestamps:
        end = start + duration
        # The first match always opens a segment. Previously a first match
        # starting within ``max_gap`` of time 0 tried to extend a segment that
        # did not exist yet and raised IndexError.
        if last_end is None or start > last_end + max_gap:
            times.append((start, end))
        else:
            # Close enough to the running segment: stretch its end.
            times[-1] = (times[-1][0], end)
        last_end = end

    # Drop segments that are too short to be a plausible sponsored section.
    return_times = [
        (start, end) for start, end in times if (end - start) > min_duration
    ]
    return return_times, timestamps