Spaces:
Runtime error
Runtime error
Neal Caren
committed on
Commit
•
014d79d
1
Parent(s):
fd2a8f0
shrunk
Browse files
app.py
CHANGED
@@ -6,89 +6,10 @@ from simple_diarizer.diarizer import Diarizer
|
|
6 |
import streamlit as st
|
7 |
|
8 |
|
9 |
-
''
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
return f'You said: {result["text"]}'
|
14 |
-
|
15 |
-
def segment(nu_speakers):
|
16 |
-
|
17 |
-
diar = Diarizer(embed_model='xvec',cluster_method='sc')
|
18 |
-
segments = diar.diarize('mono.wav', num_speakers=nu_speakers)
|
19 |
-
|
20 |
-
sdf = pd.DataFrame(segments)
|
21 |
-
|
22 |
-
# reorganize so the first speaker is always speaker 1
|
23 |
-
speaker_s = sdf['label'].drop_duplicates().reset_index()['label']
|
24 |
-
speaker_d = dict((v,k+1) for k,v in speaker_s.items())
|
25 |
-
|
26 |
-
sdf['speaker'] = sdf['label'].replace(speaker_d)
|
27 |
-
return sdf
|
28 |
-
|
29 |
-
def audio_to_df(uploaded):
|
30 |
-
monotize(uploaded)
|
31 |
-
model = whisper.load_model('tiny')
|
32 |
-
result = model.transcribe('mono.wav',verbose=True,
|
33 |
-
without_timestamps=False)
|
34 |
-
tdf = pd.DataFrame(result['segments'])
|
35 |
-
return tdf
|
36 |
-
|
37 |
-
def monotize(uploaded):
|
38 |
-
cmd = f"ffmpeg -y -i {uploaded} -acodec pcm_s16le -ar 16000 -ac 1 mono.wav"
|
39 |
-
subprocess.Popen(cmd, shell=True).wait()
|
40 |
-
|
41 |
-
def add_preface(row):
|
42 |
-
text = row['text'].replace('\n','')
|
43 |
-
speaker = row['speaker']
|
44 |
-
return f'Speaker {speaker}: {text}'
|
45 |
-
|
46 |
-
def transcribe(uploaded, nu_speakers):
|
47 |
-
|
48 |
-
monotize(uploaded)
|
49 |
-
tdf = audio_to_df(uploaded)
|
50 |
-
sdf = segment(nu_speakers)
|
51 |
-
|
52 |
-
ns_list = sdf[['start','speaker']].to_dict(orient='records')
|
53 |
-
|
54 |
-
# Find the nearest transcript line to the start of each speaker
|
55 |
-
for row in ns_list:
|
56 |
-
input = row['start']
|
57 |
-
id = tdf.iloc[(tdf['start']-input).abs().argsort()[:1]]['id'].values[0]
|
58 |
-
tdf.loc[tdf['id'] ==id, 'speaker'] = row['speaker']
|
59 |
-
|
60 |
-
tdf['speaker'].fillna(method = 'ffill', inplace = True)
|
61 |
-
tdf['speaker'].fillna(method = 'bfill', inplace = True)
|
62 |
-
|
63 |
-
tdf['n1'] = tdf['speaker'] != tdf['speaker'].shift(1)
|
64 |
-
tdf['speach'] = tdf['n1'].cumsum()
|
65 |
-
binned_df = tdf.groupby(['speach', 'speaker'])['text'].apply('\n'.join).reset_index()
|
66 |
-
|
67 |
-
binned_df['speaker'] = binned_df['speaker'].astype(int)
|
68 |
-
binned_df['output'] = binned_df.apply(add_preface, axis=1)
|
69 |
-
|
70 |
-
lines = []
|
71 |
-
for row in binned_df['output'].values:
|
72 |
-
st.write(row)
|
73 |
-
lines.append(row)
|
74 |
-
|
75 |
-
return '\n'.join(lines)
|
76 |
-
|
77 |
-
|
78 |
-
descript = ("This web app creates transcripts using OpenAI's [Whisper](https://github.com/openai/whisper) to transcribe "
|
79 |
-
"audio files combined with [Chau](https://github.com/cvqluu)'s [Simple Diarizer](https://github.com/cvqluu/simple_diarizer) "
|
80 |
-
"to partition the text by speaker.\n"
|
81 |
-
"* Creating the transcript takes some time. "
|
82 |
-
"Using the default base transcription model, the process takes approximately 20% of the length of the audio file.\n "
|
83 |
-
"* There seems to be cap on the uploaded file size of about 20MBs. My [colab](https://colab.research.google.com/drive/18AD-mb3bT4s8k3UNhZu-ghPq2DT5il3V?usp=sharing) version "
|
84 |
-
"can handle any file size, but requies some Python knowledge.\n"
|
85 |
-
"* After uploading the file, **be sure to select the number of speakers**." )
|
86 |
-
|
87 |
-
'''
|
88 |
-
with st.form(key='my_form'):
|
89 |
-
uploaded = st.file_uploader("Choose a file")
|
90 |
-
nu_speakers = st.slider('Number of speakers in audio file:', min_value=1, max_value=6, value=2, step=1)
|
91 |
-
submit = st.form_submit_button("Transcribe!")
|
92 |
|
93 |
|
94 |
if submit:
|
@@ -97,7 +18,4 @@ if submit:
|
|
97 |
outfile.write(bytes_data)
|
98 |
#st.write('Converting audio file.')
|
99 |
#monotize('temp_audio')
|
100 |
-
text = transcribe('temp_audio', nu_speakers)
|
101 |
-
|
102 |
-
|
103 |
-
# To read file as bytes:
|
|
|
6 |
import streamlit as st
|
7 |
|
8 |
|
9 |
+
form = st.form(key='my_form')
|
10 |
+
uploaded = form.file_uploader("Choose a file")
|
11 |
+
nu_speakers = form.slider('Number of speakers in audio file:', min_value=1, max_value=6, value=2, step=1)
|
12 |
+
submit = form.form_submit_button("Transcribe!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
|
15 |
if submit:
|
|
|
18 |
outfile.write(bytes_data)
|
19 |
#st.write('Converting audio file.')
|
20 |
#monotize('temp_audio')
|
21 |
+
#text = transcribe('temp_audio', nu_speakers)
|
|
|
|
|
|