"""Basic search helper for conversation transcripts.

Usage:
    1. download transcript data,
    2. run searcher.py on it,
    3. open the output html file in a browser;
       it contains quick links to listen to the results.

Notes:
    - linked segment times are very approximate.
    - audio loading can be delayed; click segments again until they play.
    - transcripts aren't tagged or parsed: DIY morphosyntax by regex.
"""
import glob
import json
import re
import unicodedata


def get_segs(tx_path):
    """Load one transcript JSON file and return its segments.

    Args:
        tx_path: path to a transcript file whose top-level object has a
            ``'segments'`` list.

    Returns:
        A list of ``(start_ms, end_ms, text)`` tuples, one per segment,
        in file order (see ``get_sent``).
    """
    # Explicit UTF-8: the transcripts contain non-ASCII text and must not
    # depend on the platform's default encoding.
    with open(tx_path, 'r', encoding='utf-8') as handle:
        tx = json.load(handle)
    return [get_sent(sent) for sent in tx['segments']]


def get_sent(sent):
    """Convert one transcript segment into ``(start_ms, end_ms, text)``.

    Args:
        sent: dict with keys 'startTime', 'endTime', 'words', 'speakerId'.
            Times are strings such as ``'1.5s'``; each entry of 'words'
            has a 'word' key and, when used as a fallback, its own
            'startTime' / 'endTime'.

    Returns:
        Tuple of start time (int, ms), end time (int, ms), and the words
        concatenated without any added separator.
    """
    def ms(t):
        # time in ms, from a string like '12.34s'
        return int(float(t.replace('s', '')) * 1000)

    st = sent['startTime']
    et = sent['endTime']
    ws = ''.join(wd['word'] for wd in sent['words'])
    # Fall back to the first/last word's timing when the segment-level
    # time is missing.
    if st is None:
        st = sent['words'][0]['startTime']
    if et is None:
        et = sent['words'][-1]['endTime']
    return (ms(st), ms(et), ws)


def html_line(match_line, url):
    """Format one search match as a line of the result file.

    Args:
        match_line: ``(text, speaker, turn_index)`` tuple.
        url: page URL of the conversation the match belongs to.

    Returns:
        The formatted string for this match.
    """
    w, sk, ix = match_line
    # NOTE(review): the HTML markup of this template appears to have been
    # lost from the source, and `url` is currently unused; presumably the
    # original wrapped this text in a link built from `url` -- confirm
    # against the upstream file before relying on the output format.
    h = f'\n\n({sk}) [{ix}] {w}\n\n'
    return h


def snorm(s):
    """Normalize text for matching.

    Lower-cases the string, drops every character whose Unicode category
    starts with "P" (punctuation), and collapses runs of spaces into a
    single space. Leading/trailing spaces are kept (as one space).
    """
    s = ''.join(
        c.lower() for c in s
        if not unicodedata.category(c).startswith('P')
    )
    # Single pass; the earlier replace-loop never terminated for any
    # input containing a space.
    return re.sub(r' {2,}', ' ', s)


# the search function must operate on the conversation
# and return the results in expected format
def search_convos(corpus_dir, base_url, output_path, search_func,
                  search_string=None):
    """Run ``search_func`` over every conversation and write the results.

    Args:
        corpus_dir: directory (with trailing '/') containing one
            sub-directory per conversation; each holds
            ``speaker_a_convo_<id>_transcript.json`` and
            ``speaker_b_convo_<id>_transcript.json``.
        base_url: prefix of the conversation page URL; the page for a
            conversation is ``<base_url><id>.html``.
        output_path: file the result is written to (overwritten).
        search_func: callable ``(segments, search_string) -> matches``
            where ``segments`` is a list of ``(text, speaker, turn)``
            tuples and ``matches`` is a list of tuples of the same shape.
        search_string: query passed through to ``search_func``.
    """
    convos = glob.glob(corpus_dir + '*/')
    convos = sorted(c.split(corpus_dir)[1].split('/')[0] for c in convos)

    parts = []
    for convo in convos:
        convo_url = f'{base_url}{convo}.html'
        txa = f'{corpus_dir}{convo}/speaker_a_convo_{convo}_transcript.json'
        txb = f'{corpus_dir}{convo}/speaker_b_convo_{convo}_transcript.json'
        sega = [(s, e, w, 'a') for s, e, w in get_segs(txa)]
        segb = [(s, e, w, 'b') for s, e, w in get_segs(txb)]

        # interleave both speakers by start time
        timed = sorted(sega + segb, key=lambda seg: seg[0])
        # discard timestamps but add turn number
        segs = [(w, sk, turn) for turn, (_s, _e, w, sk) in enumerate(timed)]

        matches = search_func(segs, search_string)
        if matches:
            # NOTE(review): as in html_line, the markup of the header and
            # footer strings appears to have been lost from the source.
            parts.append(f'\n\n{convo}\n\n')
            parts.append('\n'.join(html_line(m, convo_url) for m in matches))
            parts.append('\n')

    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.write(''.join(parts))


def simple_search1(convo, search_string):
    """Substring search on normalized text.

    Both the query and each segment are normalized with ``snorm``; the
    returned tuples carry the *normalized* segment text.
    """
    search_string = snorm(search_string)
    norm = [(snorm(w), sk, ln) for w, sk, ln in convo]
    return [(w, sk, ln) for w, sk, ln in norm if search_string in w]


def regex_search1(convo, search_rx):
    """Regex search on normalized text.

    The pattern is matched against ``snorm(text)`` of each segment; the
    returned tuples carry the *original* segment text.
    """
    pattern = re.compile(search_rx)  # compile once, not per segment
    return [(w, sk, ln) for w, sk, ln in convo if pattern.search(snorm(w))]


if __name__ == "__main__":

    corpus_dir = './full_conversations/'
    base_url = 'https://clr-spjall.static.hf.space/pages/'
    output_path = './tmp.html'

    #search_func = simple_search1
    search_func = regex_search1

    # example queries; enable exactly one
    #search_string = 'kannski'
    #search_string = 'eða'
    #search_string = r'\Wá \w+ eða \w+'
    #search_string = r'\Wí \w+ eða \w+'
    #search_string = r'nei\S? \w+ \w+ (ekki|aldrei|ekkert)'#|enga|engu|eng\w\w)'
    #search_string = r'hvor\S* .* eða'
    #search_string = r'\Wef .* þá'
    search_string = r'^\w+ sem'

    search_convos(corpus_dir, base_url, output_path, search_func, search_string)