NealCaren committed on
Commit
159d2d1
1 Parent(s): 4e5530a

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +71 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import whisperx
2
+ import streamlit as st
3
+ import torch
4
+ import tempfile
5
+ import subprocess
6
+
7
+
8
def _convert_to_mono_wav(uploaded, out_dir):
    """Write *uploaded* into *out_dir* and convert it to 16 kHz mono PCM WAV.

    Args:
        uploaded: The object returned by ``st.file_uploader``; it must provide
            ``getvalue()`` (raw bytes) and ``name`` (original file name).
        out_dir: Existing directory that receives both the raw upload and the
            converted file.

    Returns:
        Path of the converted ``mono.wav`` file inside *out_dir*.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable is not on PATH.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    import os

    # Keep the original extension as a hint for ffmpeg's format probing.
    # splitext() never returns a path separator in the extension, so the
    # user-supplied name cannot escape out_dir.
    suffix = os.path.splitext(uploaded.name or "")[1]
    raw_path = os.path.join(out_dir, "upload" + suffix)
    with open(raw_path, "wb") as raw_file:
        raw_file.write(uploaded.getvalue())

    wav_path = os.path.join(out_dir, "mono.wav")
    # Argument list + no shell: file names are never interpreted by a shell.
    cmd = [
        "ffmpeg", "-y",
        "-i", raw_path,
        "-acodec", "pcm_s16le",
        "-ar", "16000",
        "-ac", "1",
        wav_path,
    ]
    subprocess.run(cmd, check=True, capture_output=True)
    return wav_path


def transcribe(audio_file):
    """Transcribe, align and diarize *audio_file*, writing each stage to the page.

    Args:
        audio_file: Path to an audio file readable by ``whisperx.load_audio``.

    The Hugging Face token needed by the diarization pipeline is read from the
    ``HF_TOKEN`` environment variable. When it is not set, the transcription
    and alignment stages still run and diarization is skipped with a warning.
    Nothing is returned; all output goes through ``st.write``.
    """
    import os

    # torch / ctranslate2 call the GPU backend "cuda"; "gpu" is not a valid device.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    batch_size = 16  # reduce if low on GPU mem
    compute_type = "int8"  # change to "float16" if high on GPU mem (may reduce accuracy)

    # 1. Transcribe with original whisper (batched)
    model = whisperx.load_model("tiny", device=device, compute_type=compute_type)
    audio = whisperx.load_audio(audio_file)
    result = model.transcribe(audio, batch_size=batch_size)
    st.write("Transcribed! Here's what we have so far:")
    st.write(result["segments"])  # before alignment

    # 2. Align whisper output
    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"], device=device
    )
    result = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=False,
    )
    st.write("Aligned! Here's what we have so far:")
    st.write(result["segments"])  # after alignment

    # 3. Assign speaker labels
    # The token must never be committed to the repository; supply it through
    # the environment (e.g. a Space secret) instead.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        st.warning(
            "HF_TOKEN is not set; skipping speaker diarization."
        )
        return

    diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)

    # add min/max number of speakers if known, e.g.
    # diarize_model(audio_file, min_speakers=min_speakers, max_speakers=max_speakers)
    diarize_segments = diarize_model(audio_file)

    result = whisperx.assign_word_speakers(diarize_segments, result)
    st.write(diarize_segments)
    st.write(result["segments"])  # segments are now assigned speaker IDs


st.title("Automated Transcription")

form = st.form(key='my_form')
uploaded_file = form.file_uploader("Choose a file")

submit = form.form_submit_button("Transcribe!")


if submit:
    if uploaded_file is None:
        # The form can be submitted without a file; nothing to process then.
        st.warning("Please choose an audio file before transcribing.")
    else:
        # The directory (raw upload + converted WAV) is removed when the
        # with-block exits, even if transcription raises.
        with tempfile.TemporaryDirectory() as tmp_dir:
            try:
                temp_file = _convert_to_mono_wav(uploaded_file, tmp_dir)
            except FileNotFoundError:
                temp_file = None
                st.error("ffmpeg is not installed or not on PATH.")
            except subprocess.CalledProcessError as err:
                temp_file = None
                st.error("ffmpeg could not convert the uploaded file.")
                st.code(err.stderr.decode("utf-8", errors="replace"))

            if temp_file is not None:
                transcribe(temp_file)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ git+https://github.com/m-bain/whisperx.git
2
+ streamlit
3
+ pandas