Neal Caren committed on
Commit
1ac8b8b
1 Parent(s): 9908ddd

Multilingual.

Browse files
Files changed (1) hide show
  1. app.py +11 -9
app.py CHANGED
@@ -14,15 +14,9 @@ def create_download_link(val, filename, label):
14
  return f'<a href="data:application/octet-stream;base64,{b64.decode()}" download="{filename}">{label}</a>'
15
 
16
 
17
- def speech_to_text(uploaded):
18
- st.write(f'Using {model_size} model.')
19
- model = whisper.load_model(model_size)
20
- result = model.transcribe(uploaded,verbose=True)
21
- return f'You said: {result["text"]}'
22
-
23
  def segment(nu_speakers):
24
 
25
- diar = Diarizer(embed_model='xvec',cluster_method='sc')
26
  segments = diar.diarize('mono.wav', num_speakers=nu_speakers)
27
 
28
  sdf = pd.DataFrame(segments)
@@ -38,7 +32,8 @@ def audio_to_df(uploaded):
38
  monotize(uploaded)
39
  model = whisper.load_model(model_size)
40
  result = model.transcribe('mono.wav',verbose=True,
41
- without_timestamps=False)
 
42
  tdf = pd.DataFrame(result['segments'])
43
  return tdf
44
 
@@ -99,6 +94,8 @@ descript = ("This web app creates transcripts using OpenAI's [Whisper](https://g
99
  "* You can upload an audio or video file of up to 200MBs.\n"
100
  "* Creating the transcript takes some time. "
101
  "The process takes approximately 20% of the length of the audio file using the base Whisper model.\n "
 
 
102
  "* After uploading the file, be sure to select the number of speakers." )
103
 
104
  st.title("Automated Transcription")
@@ -110,7 +107,7 @@ nu_speakers = form.slider('Number of speakers in recording:', min_value=1, max_v
110
  models = form.selectbox(
111
  'Which Whisper model?',
112
  ('Tiny (fast)', 'Base (good)', 'Small (great but slow)'), index=1)
113
-
114
  submit = form.form_submit_button("Transcribe!")
115
 
116
 
@@ -122,6 +119,11 @@ if submit:
122
  elif models == 'Small (great but slow)':
123
  model_size = 'small'
124
 
 
 
 
 
 
125
  bytes_data = uploaded.getvalue()
126
  with open('temp_audio', 'wb') as outfile:
127
  outfile.write(bytes_data)
 
14
  return f'<a href="data:application/octet-stream;base64,{b64.decode()}" download="{filename}">{label}</a>'
15
 
16
 
 
 
 
 
 
 
17
  def segment(nu_speakers):
18
 
19
+ diar = Diarizer(embed_model='ecapa',cluster_method='sc')
20
  segments = diar.diarize('mono.wav', num_speakers=nu_speakers)
21
 
22
  sdf = pd.DataFrame(segments)
 
32
  monotize(uploaded)
33
  model = whisper.load_model(model_size)
34
  result = model.transcribe('mono.wav',verbose=True,
35
+ without_timestamps=False,
36
+ task = task)
37
  tdf = pd.DataFrame(result['segments'])
38
  return tdf
39
 
 
94
  "* You can upload an audio or video file of up to 200MBs.\n"
95
  "* Creating the transcript takes some time. "
96
  "The process takes approximately 20% of the length of the audio file using the base Whisper model.\n "
97
+ "* The transcription process handles a variety of languages, and can also translate the audio to English. The tiny model is not good at translating. \n"
98
+ "* Speaker segmentation seems to work best with the base model. The small model produces better transcripts, but something seems off with the timecodes, degrading the speaker attribution. \n"
99
  "* After uploading the file, be sure to select the number of speakers." )
100
 
101
  st.title("Automated Transcription")
 
107
  models = form.selectbox(
108
  'Which Whisper model?',
109
  ('Tiny (fast)', 'Base (good)', 'Small (great but slow)'), index=1)
110
+ translate = form.checkbox('Translate to English?')
111
  submit = form.form_submit_button("Transcribe!")
112
 
113
 
 
119
  elif models == 'Small (great but slow)':
120
  model_size = 'small'
121
 
122
+ if translate == True:
123
+ task = 'translate'
124
+ else:
125
+ task = 'transcribe'
126
+
127
  bytes_data = uploaded.getvalue()
128
  with open('temp_audio', 'wb') as outfile:
129
  outfile.write(bytes_data)