Neal Caren committed on
Commit
da8c6a9
1 Parent(s): c089e11
Files changed (1) hide show
  1. app.py +63 -0
app.py CHANGED
@@ -5,10 +5,73 @@ import subprocess
5
  from simple_diarizer.diarizer import Diarizer
6
  import streamlit as st
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  def monotize(uploaded):
9
  cmd = f"ffmpeg -y -i {uploaded} -acodec pcm_s16le -ar 16000 -ac 1 mono.wav"
10
  subprocess.Popen(cmd, shell=True).wait()
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  form = st.form(key='my_form')
14
  uploaded = form.file_uploader("Choose a file")
 
5
  from simple_diarizer.diarizer import Diarizer
6
  import streamlit as st
7
 
8
def speech_to_text(uploaded):
    """Transcribe audio with the Whisper ``tiny`` model.

    Args:
        uploaded: Audio input passed straight to ``model.transcribe``.

    Returns:
        The transcribed text prefixed with ``"You said: "``.
    """
    transcription = whisper.load_model('tiny').transcribe(uploaded, verbose=True)
    spoken = transcription["text"]
    return f'You said: {spoken}'
12
+
13
def segment(nu_speakers):
    """Diarize ``mono.wav`` and number the speakers by first appearance.

    Args:
        nu_speakers: Number of speakers passed to the diarizer as
            ``num_speakers``.

    Returns:
        A DataFrame built from the diarizer's segments, with an added
        ``speaker`` column derived from the ``label`` column.
    """
    diarizer = Diarizer(embed_model='xvec', cluster_method='sc')
    sdf = pd.DataFrame(diarizer.diarize('mono.wav', num_speakers=nu_speakers))

    # Renumber the labels in order of first appearance so that whoever
    # talks first is always speaker 1.
    first_seen = sdf['label'].drop_duplicates()
    renumbering = {
        label: position
        for position, label in enumerate(first_seen, start=1)
    }

    sdf['speaker'] = sdf['label'].replace(renumbering)
    return sdf
26
+
27
def audio_to_df(uploaded):
    """Convert the audio to ``mono.wav`` and transcribe it into a DataFrame.

    Args:
        uploaded: Input audio handed to ``monotize`` for conversion.

    Returns:
        A DataFrame built from the ``'segments'`` entry of the Whisper
        ``tiny`` model's transcription result.
    """
    monotize(uploaded)
    transcription = whisper.load_model('tiny').transcribe(
        'mono.wav',
        without_timestamps=False,
        verbose=True,
    )
    return pd.DataFrame(transcription['segments'])
34
+
35
  def monotize(uploaded):
36
  cmd = f"ffmpeg -y -i {uploaded} -acodec pcm_s16le -ar 16000 -ac 1 mono.wav"
37
  subprocess.Popen(cmd, shell=True).wait()
38
 
39
def add_preface(row):
    """Format one transcript row as ``Speaker <n>: <text>``.

    Args:
        row: Mapping-like record (for example a DataFrame row) holding a
            ``'text'`` string and a ``'speaker'`` label.

    Returns:
        The row's text with newline characters removed, prefixed by the
        speaker label.
    """
    flattened = ''.join(row['text'].split('\n'))
    return f"Speaker {row['speaker']}: {flattened}"
43
+
44
def transcribe(uploaded, nu_speakers):
    """Transcribe an audio file and label each passage with its speaker.

    The transcript lines from ``audio_to_df`` are matched against the speaker
    turns from ``segment``; consecutive lines by the same speaker are merged
    into one passage, written to the Streamlit page and returned.

    Args:
        uploaded: Input audio handed to ``audio_to_df``.
        nu_speakers: Number of speakers handed to ``segment``.

    Returns:
        The passages joined by newlines, each formatted by ``add_preface``.
        An empty string if the transcript has no segments.
    """
    # audio_to_df() converts the upload to mono.wav itself, so calling
    # monotize() here as well would only run ffmpeg a second time.
    tdf = audio_to_df(uploaded)
    sdf = segment(nu_speakers)

    if tdf.empty:
        # Nothing was transcribed, so there is nothing to label or display.
        return ''

    # Create the column explicitly instead of relying on .loc to add it.
    tdf['speaker'] = float('nan')

    # Tag the transcript line whose start time is closest to the start of
    # each speaker turn.
    for turn in sdf[['start', 'speaker']].to_dict(orient='records'):
        nearest = (tdf['start'] - turn['start']).abs().idxmin()
        tdf.loc[nearest, 'speaker'] = turn['speaker']

    # Untagged lines inherit the previous speaker; lines before the first
    # tag take the first speaker. Assigning the result back avoids
    # fillna(method=..., inplace=True) on a column selection, which is
    # deprecated and has no effect under pandas copy-on-write.
    tdf['speaker'] = tdf['speaker'].ffill().bfill()

    # Number each uninterrupted run of one speaker so its lines can be merged.
    tdf['turn'] = (tdf['speaker'] != tdf['speaker'].shift(1)).cumsum()
    binned_df = (
        tdf.groupby(['turn', 'speaker'])['text']
        .apply('\n'.join)
        .reset_index()
    )

    # The speaker column became float when NaN placeholders were present.
    binned_df['speaker'] = binned_df['speaker'].astype(int)

    lines = list(binned_df.apply(add_preface, axis=1))
    for line in lines:
        st.write(line)

    return '\n'.join(lines)
74
+
75
 
76
  form = st.form(key='my_form')
77
  uploaded = form.file_uploader("Choose a file")