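# Streamlit app that transcribes an uploaded audio/video file with OpenAI's Whisper
# and attributes each speech turn to a speaker using simple_diarizer.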
import base64
import subprocess
import tempfile

import pandas as pd
import streamlit as st
import whisper
from simple_diarizer.diarizer import Diarizer

def create_download_link(val, filename, label):
    '''Hack to have a stable download link in Streamlit.'''
    b64 = base64.b64encode(val)
    return f'<a href="data:application/octet-stream;base64,{b64.decode()}" download="{filename}">{label}</a>'

def segment(nu_speakers):
    '''Segment the audio using simple_diarizer.
    Defaults to the speechbrain ECAPA-TDNN embeddings.'''
    diar = Diarizer(embed_model='ecapa', cluster_method='sc')
    segments = diar.diarize(temp_file, num_speakers=nu_speakers)
    sdf = pd.DataFrame(segments)
    # Renumber the labels so the first speaker to talk is always speaker 1.
    speaker_s = sdf['label'].drop_duplicates().reset_index()['label']
    speaker_d = dict((v, k + 1) for k, v in speaker_s.items())
    sdf['speaker'] = sdf['label'].replace(speaker_d)
    return sdf

def monotize(uploaded):
    '''Convert the uploaded file to a mono 16 kHz WAV file with ffmpeg.'''
    cmd = f"ffmpeg -y -i {uploaded} -acodec pcm_s16le -ar 16000 -ac 1 {temp_file}"
    subprocess.Popen(cmd, shell=True).wait()

def audio_to_df(uploaded):
    '''Turn the uploaded file into a segmented dataframe.'''
    model = whisper.load_model(model_size)
    result = model.transcribe(temp_file,
                              without_timestamps=False,
                              task=task)
    tdf = pd.DataFrame(result['segments'])
    return tdf

def add_preface(row):
    '''Add the speaker prefix to each line of the transcript during transcribe().'''
    text = row['text'].replace('\n', '')
    speaker = row['speaker']
    return f'Speaker {speaker}: {text}'
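
# transcribe() runs the full pipeline: convert the upload to mono WAV, transcribe it
# with Whisper, diarize it with simple_diarizer, align each diarized speaker turn with
# the nearest Whisper segment, and return the speaker-attributed transcript.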
def transcribe(uploaded, nu_speakers):
    # Convert file to mono
    with st.spinner(text="Converting file..."):
        monotize('temp_audio')
    # Make the audio available to play in the UI
    with open(temp_file, 'rb') as audio_file:
        audio_bytes = audio_file.read()
    st.audio(audio_bytes, format='audio/wav')
    # Transcribe file
    with st.spinner(text=f"Transcribing using {model_size} model..."):
        tdf = audio_to_df(uploaded)
    # Segment file
    with st.spinner(text="Segmenting..."):
        sdf = segment(nu_speakers)
    # Find the nearest transcript line to the start of each speaker turn
    ns_list = sdf[['start', 'speaker']].to_dict(orient='records')
    for row in ns_list:
        start = row['start']
        seg_id = tdf.iloc[(tdf['start'] - start).abs().argsort()[:1]]['id'].values[0]
        tdf.loc[tdf['id'] == seg_id, 'speaker'] = row['speaker']
    # Fill speaker labels forward, then backward, to cover unlabeled segments
    tdf['speaker'] = tdf['speaker'].ffill().bfill()
    # Mark each change of speaker and number the speech turns
    tdf['n1'] = tdf['speaker'] != tdf['speaker'].shift(1)
    tdf['speech'] = tdf['n1'].cumsum()
    # Collapse the dataframe by speech turn
    binned_df = tdf.groupby(['speech', 'speaker'])['text'].apply('\n'.join).reset_index()
    binned_df['speaker'] = binned_df['speaker'].astype(int)
    binned_df['output'] = binned_df.apply(add_preface, axis=1)
    # Display the transcript and prepare for export
    lines = []
    for row in binned_df['output'].values:
        st.write(row)
        lines.append(row)
    tdf['speaker'] = tdf['speaker'].astype(int)
    tdf_cols = ['speaker', 'start', 'end', 'text']
    #st.dataframe(tdf[tdf_cols])
    return {'text': lines, 'df': tdf[tdf_cols]}

descript = ("This web app creates transcripts using OpenAI's [Whisper](https://github.com/openai/whisper) to transcribe "
            "audio files, combined with [Chau](https://github.com/cvqluu)'s [Simple Diarizer](https://github.com/cvqluu/simple_diarizer) "
            "to partition the text by speaker.\n"
            "* You can upload an audio or video file of up to 200 MB.\n"
            "* Creating the transcript takes some time: "
            "the process takes approximately 20% of the length of the audio file when using the base Whisper model.\n"
            "* The transcription process handles a variety of languages and can also translate the audio to English. "
            "The tiny model is not good at translating.\n"
            "* Speaker segmentation seems to work best with the base model. The small model produces better transcripts, "
            "but something seems off with the timecodes, degrading the speaker attribution.\n"
            "* After uploading the file, be sure to select the number of speakers.")
st.title("Automated Transcription")
st.markdown(descript)
form = st.form(key='my_form')
uploaded = form.file_uploader("Choose a file")
nu_speakers = form.slider('Number of speakers in recording:', min_value=1, max_value=8, value=2, step=1)
models = form.selectbox(
    'Which Whisper model?',
    ('Tiny (fast)', 'Base (good)', 'Small (great but slow)', 'Medium (greater but slower)'),
    index=1)
translate = form.checkbox('Translate to English?')
submit = form.form_submit_button("Transcribe!")
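
# When the form is submitted: map the UI selections to a Whisper model size and task,
# write the upload to a temporary file, run the pipeline, and offer the results for download.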
if submit:
    if models == 'Tiny (fast)':
        model_size = 'tiny'
    elif models == 'Base (good)':
        model_size = 'base'
    elif models == 'Small (great but slow)':
        model_size = 'small'
    elif models == 'Medium (greater but slower)':
        model_size = 'medium'
    if translate:
        task = 'translate'
    else:
        task = 'transcribe'
    # Temporary file to store the converted mono audio
    tmp_dir = tempfile.TemporaryDirectory()
    temp_file = tmp_dir.name + '/mono.wav'
    # Write the uploaded bytes to disk so ffmpeg can read them
    bytes_data = uploaded.getvalue()
    with open('temp_audio', 'wb') as outfile:
        outfile.write(bytes_data)
    # Transcribe/translate and segment
    transcript = transcribe('temp_audio', nu_speakers)
    # Prepare text file for export
    text = '\n'.join(transcript['text']).encode('utf-8')
    download_url = create_download_link(text, 'transcript.txt', 'Download transcript as plain text.')
    st.markdown(download_url, unsafe_allow_html=True)
    # Prepare CSV file for export
    csv = transcript['df'].to_csv(float_format='%.2f', index=False).encode('utf-8')
    download_url = create_download_link(csv, 'transcript.csv', 'Download transcript as CSV (with time codes)')
    st.markdown(download_url, unsafe_allow_html=True)
    tmp_dir.cleanup()