import base64
import subprocess
import tempfile

import pandas as pd
import streamlit as st
import whisper
from simple_diarizer.diarizer import Diarizer


def create_download_link(val, filename, label):
    '''Hack to have a stable download link in Streamlit,
    using a base64-encoded data URI in an anchor tag.'''
    b64 = base64.b64encode(val)
    return f'<a href="data:file/txt;base64,{b64.decode()}" download="{filename}">{label}</a>'


def segment(nu_speakers):
    '''Segment the audio using simple_diarizer.
    Defaults to the speechbrain ECAPA-TDNN embeddings.'''
    diar = Diarizer(embed_model='ecapa', cluster_method='sc')
    segments = diar.diarize(temp_file, num_speakers=nu_speakers)
    sdf = pd.DataFrame(segments)

    # Renumber labels so the first speaker to talk is always speaker 1.
    speaker_s = sdf['label'].drop_duplicates().reset_index()['label']
    speaker_d = dict((v, k + 1) for k, v in speaker_s.items())
    sdf['speaker'] = sdf['label'].replace(speaker_d)
    return sdf


def monotize(uploaded):
    '''Convert the uploaded file to a 16 kHz mono WAV file.'''
    cmd = f"ffmpeg -y -i {uploaded} -acodec pcm_s16le -ar 16000 -ac 1 {temp_file}"
    subprocess.Popen(cmd, shell=True).wait()


def audio_to_df(uploaded):
    '''Turn the uploaded file into a segmented dataframe.'''
    model = whisper.load_model(model_size)
    result = model.transcribe(temp_file, without_timestamps=False, task=task)
    tdf = pd.DataFrame(result['segments'])
    return tdf


def add_preface(row):
    '''Add a speaker prefix to the transcript during transcribe().'''
    text = row['text'].replace('\n', '')
    speaker = row['speaker']
    return f'Speaker {speaker}: {text}'


def transcribe(uploaded, nu_speakers):
    # Convert the file to mono.
    with st.spinner(text="Converting file..."):
        monotize(uploaded)

    # Make the audio available to play in the UI.
    with open(temp_file, 'rb') as audio_file:
        audio_bytes = audio_file.read()
    st.audio(audio_bytes, format='audio/wav')

    # Transcribe the file.
    with st.spinner(text=f"Transcribing using {model_size} model..."):
        tdf = audio_to_df(uploaded)

    # Segment the file.
    with st.spinner(text="Segmenting..."):
        sdf = segment(nu_speakers)

    # Find the transcript line nearest to the start of each speaker turn
    # and assign that turn's speaker to it.
    ns_list = sdf[['start', 'speaker']].to_dict(orient='records')
    for row in ns_list:
        start = row['start']
        seg_id = tdf.iloc[(tdf['start'] - start).abs().argsort()[:1]]['id'].values[0]
        tdf.loc[tdf['id'] == seg_id, 'speaker'] = row['speaker']

    # Propagate speaker labels forward, then backward for any leading gap.
    tdf['speaker'] = tdf['speaker'].ffill().bfill()

    # Flag each change of speaker, then number the speech turns.
    tdf['n1'] = tdf['speaker'] != tdf['speaker'].shift(1)
    tdf['speech'] = tdf['n1'].cumsum()

    # Collapse the dataframe by speech turn.
    binned_df = tdf.groupby(['speech', 'speaker'])['text'].apply('\n'.join).reset_index()
    binned_df['speaker'] = binned_df['speaker'].astype(int)
    binned_df['output'] = binned_df.apply(add_preface, axis=1)

    # Display the transcript and collect the lines for export.
    lines = []
    for row in binned_df['output'].values:
        st.write(row)
        lines.append(row)

    tdf['speaker'] = tdf['speaker'].astype(int)
    tdf_cols = ['speaker', 'start', 'end', 'text']
    return {'text': lines, 'df': tdf[tdf_cols]}
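# Note: temp_file, model_size, and task are module-level globals assigned in the
# form-handling block below. Streamlit reruns this script top to bottom on every
# interaction, so they exist before any of the functions above are called.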
descript = ("This web app creates transcripts using OpenAI's [Whisper](https://github.com/openai/whisper) to transcribe "
            "audio files combined with [Chau](https://github.com/cvqluu)'s [Simple Diarizer](https://github.com/cvqluu/simple_diarizer) "
            "to partition the text by speaker.\n"
            "* You can upload an audio or video file of up to 200 MB.\n"
            "* Creating the transcript takes some time. "
            "The process takes approximately 20% of the length of the audio file using the base Whisper model.\n"
            "* The transcription process handles a variety of languages, and can also translate the audio to English. "
            "The tiny model is not good at translating.\n"
            "* Speaker segmentation seems to work best with the base model. The small model produces better transcripts, "
            "but something seems off with the timecodes, degrading the speaker attribution.\n"
            "* After uploading the file, be sure to select the number of speakers.")

st.title("Automated Transcription")
st.markdown(descript)

form = st.form(key='my_form')
uploaded = form.file_uploader("Choose a file")
nu_speakers = form.slider('Number of speakers in recording:',
                          min_value=1, max_value=8, value=2, step=1)
models = form.selectbox(
    'Which Whisper model?',
    ('Tiny (fast)', 'Base (good)', 'Small (great but slow)', 'Medium (greater but slower)'),
    index=1)
translate = form.checkbox('Translate to English?')
submit = form.form_submit_button("Transcribe!")

if submit:
    if models == 'Tiny (fast)':
        model_size = 'tiny'
    elif models == 'Base (good)':
        model_size = 'base'
    elif models == 'Small (great but slow)':
        model_size = 'small'
    elif models == 'Medium (greater but slower)':
        model_size = 'medium'

    if translate:
        task = 'translate'
    else:
        task = 'transcribe'

    # Temporary directory to hold the converted mono audio file.
    tmp_dir = tempfile.TemporaryDirectory()
    temp_file = tmp_dir.name + '/mono.wav'

    # Write the upload to disk so ffmpeg can read it.
    bytes_data = uploaded.getvalue()
    with open('temp_audio', 'wb') as outfile:
        outfile.write(bytes_data)

    # Transcribe/translate and segment.
    transcript = transcribe('temp_audio', nu_speakers)

    # Prepare a text file for export.
    text = '\n'.join(transcript['text']).encode('utf-8')
    download_url = create_download_link(text, 'transcript.txt',
                                        'Download transcript as plain text.')
    st.markdown(download_url, unsafe_allow_html=True)

    # Prepare a CSV file for export.
    csv = transcript['df'].to_csv(float_format='%.2f', index=False).encode('utf-8')
    download_url = create_download_link(csv, 'transcript.csv',
                                        'Download transcript as CSV (with time codes)')
    st.markdown(download_url, unsafe_allow_html=True)

    tmp_dir.cleanup()
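# To run the app locally (assuming this script is saved as app.py and ffmpeg is
# available on the PATH):
#
#   streamlit run app.py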