clr committed on
Commit
b663aa2
1 Parent(s): 43243d3

offline text search

Browse files
Files changed (1) hide show
  1. searcher.py +115 -0
searcher.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob,json,unicodedata,re
2
+
3
+ # basic search helper:
4
+ # download transcript data, run searcher.py on it,
5
+ # open output html file in a browser,
6
+ # it contains quick links to listen to the results.
7
+
8
+ # notes:
9
+ # linked segment times are very approximate.
10
+ # audio loading can be delayed, click segments again until they play.
11
+ # transcripts aren't tagged or parsed, DIY morphosyntax by regex.
12
+
13
+
14
+
15
def get_segs(tx_path):
    """Load one transcript JSON file and return its segments as tuples.

    The file must contain an object with a 'segments' list.  Every entry
    of that list is converted by get_sent() into a
    (start_ms, end_ms, text) tuple, and the tuples are returned in file
    order.
    """
    # Decode explicitly as UTF-8 so non-ASCII transcript text is read the
    # same way regardless of the platform's locale encoding.
    with open(tx_path, 'r', encoding='utf-8') as handle:
        transcript = json.load(handle)

    return [get_sent(sent) for sent in transcript['segments']]
21
+
22
+
23
def get_sent(sent):
    """Convert one transcript segment into a (start_ms, end_ms, text) tuple.

    sent is a dict with the keys 'startTime', 'endTime', 'words' and
    'speakerId'.  Time values are strings carrying an 's' marker that is
    stripped before conversion.  When a segment-level time is None, the
    first word's 'startTime' (or the last word's 'endTime') is used
    instead.  The text is the plain concatenation of every word's 'word'
    field.
    """
    def to_millis(stamp):
        # remove the 's' marker, then turn seconds into whole milliseconds
        seconds = float(stamp.replace('s', ''))
        return int(seconds * 1000)

    begin = sent['startTime']
    finish = sent['endTime']
    words = sent['words']
    text = ''.join(entry['word'] for entry in words)

    # fall back to word-level timing when the segment has none
    begin = words[0]['startTime'] if begin is None else begin
    finish = words[-1]['endTime'] if finish is None else finish

    return to_millis(begin), to_millis(finish), text
35
+
36
+
37
def html_line(match_line, url):
    """Render one search hit as an HTML paragraph linking to its turn.

    match_line is a (text, speaker, turn_index) tuple and url is the
    address of the conversation page; the turn index is appended to it as
    the '#' fragment.  All interpolated values are HTML-escaped so that
    characters such as '<', '&' or '"' in the transcript text or the URL
    cannot break the generated markup.
    """
    import html  # local import keeps this block self-contained

    text, speaker, turn = match_line
    safe_text = html.escape(str(text))
    safe_speaker = html.escape(str(speaker))
    safe_turn = html.escape(str(turn))
    safe_url = html.escape(str(url))
    return (f'<p>({safe_speaker}) [{safe_turn}] '
            f'<a href="{safe_url}#{safe_turn}">{safe_text}</a></p>')
41
+
42
+
43
def snorm(s):
    """Normalise text for matching.

    Characters whose Unicode category starts with 'P' (punctuation) are
    dropped, the remaining characters are lowercased one by one, and every
    run of two or more spaces is collapsed into a single space.
    """
    kept = ''.join(
        c.lower() for c in s
        if not unicodedata.category(c).startswith('P')
    )
    # Removing punctuation can leave runs of spaces behind ("já - nei").
    # The run is matched with an explicit ' {2,}' quantifier: looping on a
    # single-space replace, as the code is shown here, never terminates
    # once the text contains a space.
    return re.sub(r' {2,}', ' ', kept)
48
+
49
+
50
+ # the search function must operate on the conversation
51
+ # and return the results in expected format
52
def search_convos(corpus_dir, base_url, output_path, search_func, search_string=None):
    """Search every conversation under corpus_dir and write the hits as HTML.

    Each sub-directory of corpus_dir is treated as one conversation named
    after the directory.  Its speaker 'a' and speaker 'b' transcripts are
    loaded with get_segs(), merged, ordered by start time and numbered by
    turn.  search_func(segs, search_string) is then called with segs being
    a list of (text, speaker, turn_index) tuples and must return the
    matching entries in that same tuple layout.  Every hit is rendered with
    html_line() and links to f'{base_url}{convo}.html#{turn_index}'.

    corpus_dir may be given with or without a trailing path separator.
    The result is written to output_path as UTF-8; conversations without
    hits contribute nothing to the output.
    """
    import os  # local import keeps this block self-contained

    # The trailing '' makes the pattern end in a separator, so only
    # directories match.
    convo_dirs = glob.glob(os.path.join(corpus_dir, '*', ''))
    convos = sorted(os.path.basename(os.path.dirname(d)) for d in convo_dirs)

    chunks = []
    for convo in convos:
        convo_url = f'{base_url}{convo}.html'
        convo_dir = os.path.join(corpus_dir, convo)
        txa = os.path.join(convo_dir, f'speaker_a_convo_{convo}_transcript.json')
        txb = os.path.join(convo_dir, f'speaker_b_convo_{convo}_transcript.json')

        sega = [(s, e, w, 'a') for s, e, w in get_segs(txa)]
        segb = [(s, e, w, 'b') for s, e, w in get_segs(txb)]
        timed = sorted(sega + segb, key=lambda seg: seg[0])

        # discard timestamps but add turn number
        segs = [(w, sk, turn) for turn, (_s, _e, w, sk) in enumerate(timed)]

        matches = search_func(segs, search_string)

        if matches:
            chunks.append(f'<h4>{convo}</h4>')
            chunks.append('\n'.join(html_line(m, convo_url) for m in matches))
            chunks.append('<hr />')

    # Explicit UTF-8 so non-ASCII hits are written the same on every platform.
    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.write(''.join(chunks))
82
+
83
+
84
def simple_search1(convo, search_string):
    """Return the turns whose normalised text contains search_string.

    convo is a list of (text, speaker, turn_index) tuples.  The query and
    each turn's text are passed through snorm() before the substring
    test.  Hits are returned as (normalised_text, speaker, turn_index),
    i.e. with the normalised text rather than the original.
    """
    needle = snorm(search_string)
    hits = []
    for text, speaker, turn in convo:
        cleaned = snorm(text)
        if needle in cleaned:
            hits.append((cleaned, speaker, turn))
    return hits
89
+
90
+
91
def regex_search1(convo, search_rx):
    """Return the turns whose normalised text matches the regex search_rx.

    convo is a list of (text, speaker, turn_index) tuples.  The pattern is
    tried anywhere in snorm(text); matching turns are returned unchanged,
    so the hit keeps its original (un-normalised) text.
    """
    hits = []
    for text, speaker, turn in convo:
        if re.search(search_rx, snorm(text)) is not None:
            hits.append((text, speaker, turn))
    return hits
94
+
95
+
96
if __name__ == "__main__":
    # Example run: search the downloaded corpus and write the hits to a
    # local HTML file whose links point at the hosted conversation pages.
    output_path = './tmp.html'
    base_url = 'https://clr-spjall.static.hf.space/pages/'
    corpus_dir = './full_conversations/'

    # Alternative search function (plain substring match):
    #   search_func = simple_search1
    search_func = regex_search1

    # Other queries that can be swapped in:
    #   search_string = 'kannski'
    #   search_string = 'eða'
    #   search_string = r'\Wá \w+ eða \w+'
    #   search_string = r'\Wí \w+ eða \w+'
    #   search_string = r'nei\S? \w+ \w+ (ekki|aldrei|ekkert)'#|enga|engu|eng\w\w)'
    #   search_string = r'hvor\S* .* eða'
    #   search_string = r'\Wef .* þá'
    search_string = r'^\w+ sem'

    search_convos(
        corpus_dir=corpus_dir,
        base_url=base_url,
        output_path=output_path,
        search_func=search_func,
        search_string=search_string,
    )
115
+