kobakhit committed on
Commit
cfcb5c6
1 Parent(s): b79e42d
Files changed (2)
  1. .streamlit/config.toml +1 -1
  2. app.py +48 -36
.streamlit/config.toml CHANGED
@@ -1,6 +1,6 @@
 [theme]
 primaryColor = "#696969s"
 backgroundColor = "#000000"
-secondaryBackgroundColor = "#282828"
+secondaryBackgroundColor = "#1b1b1b"
 textColor = "#fafafa"
 font = "sans serif"
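For reference, Streamlit reads the `[theme]` block of `.streamlit/config.toml` at startup; this hunk only darkens the secondary background from `#282828` to `#1b1b1b`. Note that `primaryColor = "#696969s"` (unchanged here) carries a stray trailing `s` and is not a valid hex color, so Streamlit will likely fall back to the default. A minimal sketch, assuming it runs inside this app, for checking which theme values actually took effect via the public `st.get_option` API:

```python
import streamlit as st

# Inspect the theme options Streamlit resolved from .streamlit/config.toml.
# st.get_option is a public API; keys follow the "section.name" convention.
st.write("secondary background:", st.get_option("theme.secondaryBackgroundColor"))
st.write("primary color:", st.get_option("theme.primaryColor"))
```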
app.py CHANGED
@@ -22,7 +22,8 @@ from matplotlib import pyplot as plt
 
 st.set_page_config(
     page_title="Speech-to-chat",
-    page_icon = '🌊'
+    page_icon = '🌊',
+    layout='wide'
 )
 
 # Set your OpenAI, Hugging Face API keys
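`st.set_page_config` must be the first Streamlit command executed in the script and may only be called once per run, which is why the icon and the new wide layout land in this one call. A minimal sketch of the full call; `initial_sidebar_state` is an assumption included only to show the remaining knob:

```python
import streamlit as st

# Must be the first Streamlit call in the script; calling it later or twice
# raises a StreamlitAPIException.
st.set_page_config(
    page_title="Speech-to-chat",
    page_icon="🌊",
    layout="wide",                 # full browser width instead of the centered column
    initial_sidebar_state="auto",  # illustrative; not part of this commit
)
```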
@@ -106,30 +107,48 @@ initial_prompt = [{"role": "system", "content": "You are helping to analyze and
     {"role": 'user', "content": 'Please summarize briefly the following transcript\n{}'}]
 if "messages" not in st.session_state:
     st.session_state.messages = initial_prompt
-
-
 
 
 st.title("Speech to Chat")
 reddit_thread = 'https://www.reddit.com/r/dataisbeautiful/comments/17413bq/oc_speech_diarization_app_that_transcribes_audio'
-with st.expander('About', expanded=True):
+
+with st.sidebar:
+    st.markdown('''
+    # How to Use
+
+    1. Enter a youtube link or upload an audio file.
+    2. "Chat" with the file.
+
+    Example prompts:
+    - Which speaker spoke the most?
+    - What are important keywords in the transcript for SEO?
+    ''')
+
+    st.divider()
+
 st.markdown(f'''
-Given an audio file this app will
-- [x] 1. Identify and diarize the speakers using `pyannote` [HuggingFace Speaker Diarization api](https://huggingface.co/pyannote/speaker-diarization-3.0)
-- [x] 2. Transcribe the audio and attribute to speakers using [OpenAi Whisper API](https://platform.openai.com/docs/guides/speech-to-text/quickstart)
-- [x] 3. Set up an LLM chat with the transcript loaded into its knowledge database, so that a user can "talk" to the transcript of the audio file
+# About
 
-This version will only process up to the first 6 minutes of an audio file due to the limited resources of Streamlit.io apps.
+Given an audio file or a youtube link this app will
+- [x] 1. Partition the audio according to the identity of each speaker (diarization) using `pyannote` [HuggingFace Speaker Diarization api](https://huggingface.co/pyannote/speaker-diarization-3.0)
+- [x] 2. Transcribe each audio segment using [OpenAi Whisper API](https://platform.openai.com/docs/guides/speech-to-text/quickstart)
+- [x] 3. Set up an LLM chat with the transcript loaded into its knowledge database, so that a user can "talk" to the transcript of the audio file.
+
+This version will only process up to the first 6 minutes of an audio file due to the limited resources of free tier Streamlit.io/HuggingFace Spaces.
 A local version with access to a GPU can process 1 hour of audio in 1 to 5 minutes.
 If you would like to use this app at scale reach out directly by creating an issue on [github🤖](https://github.com/KobaKhit/speech-to-text-app/issues)!
 
-Rule of thumb: for this Streamlit.io hosted app it takes half the duration of the audio to complete processing, e.g. a 6 minute youtube video will take 3 minutes to diarize.
+Rule of thumb: for this free tier hosted app it takes half the duration of the audio to complete processing, e.g. a 6 minute youtube video will take 3 minutes to diarize.
 
-[github repo](https://github.com/KobaKhit/speech-to-text-app)
+Made by [kobakhit](https://github.com/KobaKhit/speech-to-text-app)
 ''')
 
 
-option = st.radio("Select source:", ["Upload an audio file", "Use YouTube link","See Example"], index=2)
+# Chat container
+container_transcript_chat = st.container()
+
+# Source Selection
+option = st.radio("Select source:", ["Upload an audio file", "Use YouTube link","Example"], index=2)
 
 # Upload audio file
 if option == "Upload an audio file":
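Two layout changes are merged above: the usage instructions move into `st.sidebar`, and `container_transcript_chat` is now created before the source radio so the chat UI can render near the top of the page even though its contents are written much later in the script. A short sketch of that placeholder pattern, with illustrative names:

```python
import streamlit as st

# A container reserves a slot where it is declared; anything written into it
# later still renders at that slot, not where the write happens in the script.
chat_slot = st.container()  # reserved near the top of the page
st.radio("Select source:", ["Upload an audio file", "Use YouTube link", "Example"])

# ... heavy processing happens here ...

with chat_slot:  # rendered above the radio even though it runs last
    st.write("Chat messages go here")
```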
@@ -172,7 +191,7 @@ elif option == "Use YouTube link":
     # audio = audio.set_frame_rate(sample_rate)
     # except Exception as e:
     #     st.write(f"Error: {str(e)}")
-elif option == 'See Example':
+elif option == 'Example':
    youtube_link = 'https://www.youtube.com/watch?v=TamrOZX9bu8'
    audio_name = 'Stephen A. Smith has JOKES with Shannon Sharpe'
    st.write(f'Loaded audio file from {youtube_link} - {audio_name} 👏😂')
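The `Example` branch reuses the YouTube code path with a fixed link. The loader itself is not part of this diff; a hedged sketch of how audio could be pulled from a link with `pytube` and `pydub` (names are illustrative, and pytube is sensitive to YouTube-side changes):

```python
from pytube import YouTube
from pydub import AudioSegment

def load_youtube_audio(url: str) -> AudioSegment:
    # Download the audio-only stream to a local file, then load it with pydub.
    # Illustrative only: the app's real loader is outside this diff.
    stream = YouTube(url).streams.filter(only_audio=True).first()
    path = stream.download(filename="audio.mp4")
    return AudioSegment.from_file(path)

audio = load_youtube_audio('https://www.youtube.com/watch?v=TamrOZX9bu8')
print(audio.duration_seconds)
```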
@@ -191,11 +210,9 @@ elif option == 'See Example':
     st.session_state.transcript_file = 'example/steve a smith jokes.json'
 
 st.audio(create_audio_stream(audio), format="audio/mp4", start_time=0)
-
-
+
 # Diarize
 if "audio" in locals():
-    st.write('Performing Diarization...')
    # create stream
    duration = audio.duration_seconds
    if duration > 360:
@@ -205,26 +222,25 @@ if "audio" in locals():
 
 
     # Perform diarization with PyAnnote
-    # "pyannote/speaker-diarization-3.0",
-    # use_auth_token=hf_api_key
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.0", use_auth_token=hf_api_key)
    if torch.cuda.device_count() > 0: # use gpu if available
        pipeline.to(torch.device('cuda'))
 
    # run the pipeline on an audio file
-    if 'rttm' in st.session_state and st.session_state.rttm != None:
-        st.write(f'Loading {st.session_state.rttm}')
-        diarization = load_rttm_file(st.session_state.rttm)
-    else:
-        # with ProgressHook() as hook:
-        audio_ = create_audio_stream(audio)
-        # diarization = pipeline(audio_, hook=hook)
-        diarization = pipeline(audio_)
-        # dump the diarization output to disk using RTTM format
-        with open(f'{audio_name.split(".")[0]}.rttm', "w") as f:
-            diarization.write_rttm(f)
-        st.session_state.rttm = f'{audio_name.split(".")[0]}.rttm'
+    with st.spinner('Performing Diarization...'):
+        if 'rttm' in st.session_state and st.session_state.rttm != None:
+            st.write(f'Loading {st.session_state.rttm}')
+            diarization = load_rttm_file(st.session_state.rttm)
+        else:
+            # with ProgressHook() as hook:
+            audio_ = create_audio_stream(audio)
+            # diarization = pipeline(audio_, hook=hook)
+            diarization = pipeline(audio_)
+            # dump the diarization output to disk using RTTM format
+            with open(f'{audio_name.split(".")[0]}.rttm', "w") as f:
+                diarization.write_rttm(f)
+            st.session_state.rttm = f'{audio_name.split(".")[0]}.rttm'
 
    # Display the diarization results
    st.write("Diarization Results:")
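The refactor above wraps the whole load-or-compute branch in `st.spinner` (replacing the bare `st.write('Performing Diarization...')` removed in the previous hunk) and keeps caching the written RTTM path in `st.session_state`, so a rerun reloads the previous diarization instead of re-running the pipeline. A sketch of the same pattern as a self-contained helper; `load_rttm` from pyannote stands in for the app's `load_rttm_file`:

```python
import streamlit as st
from pyannote.database.util import load_rttm

# Streamlit re-executes the whole script on every widget interaction, so the
# RTTM path stashed in session_state lets a rerun reload the previous
# diarization from disk instead of invoking the pyannote pipeline again.
def diarize_with_cache(pipeline, audio_stream, rttm_path: str):
    if st.session_state.get("rttm"):
        # load_rttm returns {uri: Annotation}; take the first annotation.
        return next(iter(load_rttm(st.session_state["rttm"]).values()))
    with st.spinner("Performing Diarization..."):
        diarization = pipeline(audio_stream)
        with open(rttm_path, "w") as f:
            diarization.write_rttm(f)  # RTTM is pyannote's on-disk format
        st.session_state["rttm"] = rttm_path
        return diarization
```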
@@ -256,7 +272,7 @@ if "audio" in locals():
     st.pyplot(figure)
 
    st.write('Speakers and Audio Samples')
-    with st.expander('Samples', expanded=False):
+    with st.expander('Samples', expanded=True):
        for speaker in set(s['speaker'] for s in sp_chunks):
            temp = max(filter(lambda d: d['speaker'] == speaker, sp_chunks), key=lambda x: x['duration'])
            speak_time = sum(c['duration'] for c in filter(lambda d: d['speaker'] == speaker, sp_chunks))
@@ -266,16 +282,12 @@ if "audio" in locals():
             speaker_summary += f" {add_query_parameter(youtube_link, {'t':str(int(temp['start']))})}"
             st.write(speaker_summary)
             st.audio(create_audio_stream(temp['audio']))
-
-
-    # st.write("Transcription with Whisper ASR:")
 
    st.divider()
    # # Perform transcription with Whisper ASR
 
 
    # Transcript containers
-    container_transcript_chat = st.container()
    st.write('Transcribing using Whisper API (150 requests limit)...')
    container_transcript_completed = st.container()
 
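The transcription step behind these containers calls the OpenAI Whisper API once per diarized segment. A minimal sketch of such a call, assuming the pre-1.0 `openai` client and pydub for segment export (the app's actual helper is outside this diff):

```python
import openai

# Export one diarized segment to a file and send it to the Whisper API.
# Illustrative only: function name and file handling are assumptions.
def transcribe_segment(segment_audio) -> str:
    segment_audio.export("segment.mp3", format="mp3")  # pydub AudioSegment
    with open("segment.mp3", "rb") as f:
        result = openai.Audio.transcribe("whisper-1", f)
    return result["text"]
```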
@@ -359,7 +371,7 @@ if "audio" in locals():
 
     # chat field
    with st.form("Chat",clear_on_submit=True):
-        prompt = st.text_input("Chat with the Transcript (2 prompts limit)")
+        prompt = st.text_input('Chat with the Transcript (2 prompts limit)')
        st.form_submit_button()
 
    # message list
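This hunk only swaps the quote style, but the surrounding form is worth a note: `clear_on_submit=True` empties the text box after each send, and `st.form` batches input so the script reruns only on submit. A sketch of how the pieces typically fit together (the `submitted` gate is an assumption; the app's handling is outside this hunk):

```python
import streamlit as st

# form_submit_button returns True only on the rerun triggered by the user
# submitting the form, which gates the downstream LLM call.
with st.form("Chat", clear_on_submit=True):
    prompt = st.text_input("Chat with the Transcript (2 prompts limit)")
    submitted = st.form_submit_button("Send")

if submitted and prompt:
    st.session_state.messages.append({"role": "user", "content": prompt})
```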
 