# NOTE(review): the following lines were stray platform/notebook artifacts
# ("Spaces:" / "Runtime error") that broke parsing; kept as comments.
# Spaces:
# Runtime error
# Runtime error
from functools import lru_cache | |
import attr | |
import pandas as pd | |
import numpy as np | |
import spacy | |
from nltk.tokenize.texttiling import TextTilingTokenizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
from sentence_transformers import SentenceTransformer | |
@lru_cache(maxsize=None)
def load_sentence_transformer(model_name='all-MiniLM-L6-v2'):
    """
    Load (and memoise) a SentenceTransformer encoder.

    all_MiniLM_L6_v2 - offline
    all-MiniLM-L6-v2 - Online

    Parameters
    ----------
    model_name: str
        Name (or local path) of the sentence-transformers model.

    Returns
    -------
    SentenceTransformer
        The loaded encoder. Repeated calls with the same name return the
        cached instance instead of reloading the weights (`lru_cache` was
        imported at the top of the file but previously unused).
    """
    model = SentenceTransformer(model_name)
    return model
@lru_cache(maxsize=None)
def load_spacy():
    """
    Load (and memoise) the small English spaCy pipeline.

    Returns
    -------
    spacy.language.Language
        The `en_core_web_sm` pipeline; cached so repeated calls do not
        re-parse the model files.
    """
    return spacy.load('en_core_web_sm')
# Module-level singletons loaded once at import time and shared by every
# SemanticTextSegmentation instance. NOTE: importing this module therefore
# triggers (potentially slow) model loading as a side effect.
model = load_sentence_transformer()  # sentence-transformers encoder
nlp = load_spacy()  # spaCy pipeline used for sentence splitting
@attr.s
class SemanticTextSegmentation:
    """
    Segment a call transcript based on topics discussed in the call using
    TextTiling with sentence similarity via a sentence transformer.

    Parameters
    ----------
    data: pd.DataFrame
        Pass the transcript in the dataframe format
    utterance: str
        pass the column name which represents utterance in transcript dataframe
    """

    # Bug fix: `attr.ib()` fields require the `@attr.s` class decorator
    # (added above) — without it no __init__ is generated and
    # `self.data` / `self.utterance` are never set.
    data = attr.ib()
    utterance = attr.ib(default='utterance')

    def __attrs_post_init__(self):
        # Previously this computed `columns` and discarded it; validate the
        # configured utterance column up front instead of failing later
        # inside _text_tilling.
        columns = self.data.columns.tolist()
        if self.utterance not in columns:
            raise ValueError(
                f"utterance column '{self.utterance}' not found in dataframe")

    def get_segments(self, threshold=0.7):
        """
        Return the transcript segments computed with TextTiling and the
        sentence-transformer.

        Parameters
        ----------
        threshold: float
            sentence similarity threshold (used to merge adjacent TextTiling
            segments into coherent topical segments)

        Returns
        -------
        new_segments: list
            list of segment strings
        """
        segments = self._text_tilling()
        merge_index = self._merge_segments(segments, threshold)
        new_segments = []
        for group in merge_index:
            new_segments.append(' '.join(segments[idx] for idx in group))
        return new_segments

    def _merge_segments(self, segments, threshold):
        # Build a flag per segment: 0 = similar enough to the previous one
        # (merge), 1 = start of a new merged segment.
        segment_map = [0]
        for text1, text2 in zip(segments[:-1], segments[1:]):
            sim = self._get_similarity(text1, text2)
            segment_map.append(0 if sim >= threshold else 1)
        return self._index_mapping(segment_map)

    def _index_mapping(self, segment_map):
        # Convert the 0/1 flag list into groups of consecutive indices;
        # a 1 closes the current group and opens a new one.
        index_list = []
        temp = []
        for index, flag in enumerate(segment_map):
            if flag == 1:
                index_list.append(temp)
                temp = [index]
            else:
                temp.append(index)
        index_list.append(temp)
        return index_list

    def _get_similarity(self, text1, text2):
        """Cosine similarity between the mean sentence embeddings of two texts."""
        # Bug fix: the two filters previously used different word-count
        # cutoffs (>1 vs >2); apply the same cutoff symmetrically.
        sentence_1 = [i.text.strip()
                      for i in nlp(text1).sents if len(i.text.split(' ')) > 1]
        sentence_2 = [i.text.strip()
                      for i in nlp(text2).sents if len(i.text.split(' ')) > 1]
        # Guard: np.mean over an empty encoding would yield NaN and poison
        # the threshold comparison; treat "no usable sentences" as dissimilar.
        if not sentence_1 or not sentence_2:
            return 0.0
        embeding_1 = model.encode(sentence_1)
        embeding_2 = model.encode(sentence_2)
        embeding_1 = np.mean(embeding_1, axis=0).reshape(1, -1)
        embeding_2 = np.mean(embeding_2, axis=0).reshape(1, -1)
        sim = cosine_similarity(embeding_1, embeding_2)
        return sim

    def _text_tilling(self):
        # TextTiling needs paragraph-like breaks; join utterances with the
        # "\n\n\t" separator it recognises, then strip it back out of the
        # returned segments.
        tt = TextTilingTokenizer(w=15, k=10)
        text = '\n\n\t'.join(self.data[self.utterance].tolist())
        segment = tt.tokenize(text)
        segment = [i.replace("\n\n\t", ' ') for i in segment]
        return segment