Neal Caren committed on
Commit
9ec01b8
1 Parent(s): 555bd19
Files changed (1) hide show
  1. app.py +92 -6
app.py CHANGED
@@ -1,13 +1,99 @@
 
 
 
 
 
1
  import streamlit as st
2
 
3
 
4
- uploaded_file = st.file_uploader("Choose a file")
5
 
6
- number_of_speakers = st.number_input('Insert a number')
 
 
 
7
 
8
- st.write('The current number is ', number_of_speakers)
9
 
10
- if uploaded_file is not None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  # To read file as bytes:
12
- bytes_data = uploaded_file.getvalue()
13
- st.write(bytes_data)
 
1
import subprocess

import pandas as pd
import streamlit as st
import whisper
from simple_diarizer.diarizer import Diarizer
7
 
8
 
 
9
 
10
def speech_to_text(uploaded):
    """Transcribe an audio file with Whisper's 'tiny' model.

    Parameters
    ----------
    uploaded : str
        Path to the audio file to transcribe.

    Returns
    -------
    str
        The recognized text, prefixed with "You said: ".
    """
    tiny_model = whisper.load_model('tiny')
    transcription = tiny_model.transcribe(uploaded)
    return 'You said: {}'.format(transcription["text"])
14
 
15
def segment(nu_speakers):
    """Diarize 'mono.wav' and return the speaker turns as a DataFrame.

    Parameters
    ----------
    nu_speakers : int
        Number of distinct speakers expected in the audio.

    Returns
    -------
    pandas.DataFrame
        One row per speaker turn, with a 1-based 'speaker' column numbered
        in order of first appearance.
    """
    diarizer = Diarizer(embed_model='xvec', cluster_method='sc')
    seg_df = pd.DataFrame(diarizer.diarize('mono.wav', num_speakers=nu_speakers))

    # Renumber labels so the first voice heard is speaker 1, the next
    # new voice is speaker 2, and so on.
    first_seen = seg_df['label'].drop_duplicates().reset_index()['label']
    relabel = {label: position + 1 for position, label in first_seen.items()}
    seg_df['speaker'] = seg_df['label'].replace(relabel)
    return seg_df
28
+
29
def audio_to_df(uploaded):
    """Convert the upload to 'mono.wav' and transcribe it with Whisper.

    Parameters
    ----------
    uploaded : file-like
        Uploaded audio object; its ``.name`` is the on-disk path used by
        :func:`monotize`.

    Returns
    -------
    pandas.DataFrame
        Whisper's timestamped transcript segments, one row per segment.
    """
    monotize(uploaded)
    base_model = whisper.load_model('base')
    transcription = base_model.transcribe('mono.wav',
                                          verbose=True,
                                          without_timestamps=False)
    return pd.DataFrame(transcription['segments'])
36
+
37
def monotize(uploaded):
    """Re-encode the uploaded audio to 16 kHz mono 16-bit PCM as 'mono.wav'.

    Parameters
    ----------
    uploaded : file-like
        Uploaded audio object; ``uploaded.name`` is the input path handed
        to ffmpeg.

    Raises
    ------
    subprocess.CalledProcessError
        If ffmpeg exits with a non-zero status.

    Notes
    -----
    The original built an f-string shell command and ran it with
    ``shell=True`` — that broke on filenames containing spaces and was
    shell-injectable. An argument list with ``shell=False`` (the default
    for ``subprocess.run``) passes the filename through safely, and
    ``check=True`` surfaces ffmpeg failures instead of silently continuing.
    """
    cmd = ['ffmpeg', '-y', '-i', uploaded.name,
           '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', 'mono.wav']
    subprocess.run(cmd, check=True)
41
+
42
def add_preface(row):
    """Format one transcript row as 'Speaker N: text'.

    Parameters
    ----------
    row : Mapping
        Must provide 'text' (str, newlines are stripped) and 'speaker'.

    Returns
    -------
    str
        The row's text prefixed with its speaker label.
    """
    cleaned = row['text'].replace('\n', '')
    return f"Speaker {row['speaker']}: {cleaned}"
46
+
47
def transcribe(uploaded, nu_speakers):
    """Produce a speaker-attributed transcript of an uploaded audio file.

    Transcribes with Whisper (via :func:`audio_to_df`), diarizes with
    simple_diarizer (via :func:`segment`), assigns each transcript segment
    to the nearest speaker turn, then merges consecutive same-speaker
    segments into 'Speaker N: ...' paragraphs.

    Parameters
    ----------
    uploaded : file-like
        Uploaded audio object (must have a ``.name`` path).
    nu_speakers : int
        Number of distinct speakers expected.

    Returns
    -------
    str
        The formatted transcript, one speaker paragraph per line.
    """
    # NOTE: audio_to_df() already calls monotize(); the original called
    # monotize() here as well, running ffmpeg twice on the same input.
    tdf = audio_to_df(uploaded)
    sdf = segment(nu_speakers)

    # Tag the transcript line whose start time is nearest to the start of
    # each diarized speaker turn.  (Renamed the originals' `input`/`id`
    # locals, which shadowed builtins.)
    for turn in sdf[['start', 'speaker']].to_dict(orient='records'):
        turn_start = turn['start']
        nearest = tdf.iloc[(tdf['start'] - turn_start).abs().argsort()[:1]]['id'].values[0]
        tdf.loc[tdf['id'] == nearest, 'speaker'] = turn['speaker']

    # Propagate speaker labels to untagged lines: forward fill, then
    # backward fill for any lines before the first tagged one.
    # ffill()/bfill() replace the deprecated fillna(method=...) form.
    tdf['speaker'] = tdf['speaker'].ffill().bfill()

    # Merge consecutive rows spoken by the same speaker into one block:
    # a new block starts whenever the speaker changes.
    tdf['turn_change'] = tdf['speaker'] != tdf['speaker'].shift(1)
    tdf['speech'] = tdf['turn_change'].cumsum()
    binned_df = tdf.groupby(['speech', 'speaker'])['text'].apply('\n'.join).reset_index()

    binned_df['speaker'] = binned_df['speaker'].astype(int)
    binned_df['output'] = binned_df.apply(add_preface, axis=1)

    return '\n'.join(binned_df['output'].values)
76
+
77
+
78
# --- Streamlit UI -----------------------------------------------------------

# App description shown to the user (fixed typos: "a cap", "requires").
descript = ("This web app creates transcripts using OpenAI's [Whisper](https://github.com/openai/whisper) to transcribe "
            "audio files combined with [Chau](https://github.com/cvqluu)'s [Simple Diarizer](https://github.com/cvqluu/simple_diarizer) "
            "to partition the text by speaker.\n"
            "* Creating the transcript takes some time. "
            "Using the default base transcription model, the process takes approximately 20% of the length of the audio file.\n "
            "* There seems to be a cap on the uploaded file size of about 20MBs. My [colab](https://colab.research.google.com/drive/18AD-mb3bT4s8k3UNhZu-ghPq2DT5il3V?usp=sharing) version "
            "can handle any file size, but requires some Python knowledge.\n"
            "* After uploading the file, **be sure to select the number of speakers**.")


with st.form(key='my_form'):
    uploaded = st.file_uploader("Choose a file")
    nu_speakers = st.slider('Number of speakers in audio file:',
                            min_value=1, max_value=6, value=2, step=1)
    submit = st.form_submit_button("Transcribe!")


if submit:
    # Guard: the form can be submitted before any file is uploaded, which
    # previously crashed on uploaded.getvalue().
    if uploaded is None:
        st.error('Please upload an audio file before transcribing.')
    else:
        # Persist the upload so the speech-to-text pipeline can read it
        # from disk.
        with open('temp_audio', 'wb') as outfile:
            outfile.write(uploaded.getvalue())
        # Display the result — the original discarded the return value, so
        # the user never saw any output.
        st.write(speech_to_text('temp_audio'))