"""Streamlit app that transcribes an uploaded audio file with speaker labels.

The upload is converted to a 16 kHz mono WAV file (``mono.wav``) with ffmpeg,
transcribed with Whisper, diarized with simple_diarizer, and the two results
are merged into ``Speaker N: text`` lines.
"""
import subprocess

import pandas as pd
import streamlit as st
import whisper
from simple_diarizer.diarizer import Diarizer


def speech_to_text(uploaded):
    """Transcribe *uploaded* with the Whisper ``tiny`` model.

    Args:
        uploaded: Audio input handed straight to ``model.transcribe``.

    Returns:
        The transcript text prefixed with ``'You said: '``.
    """
    model = whisper.load_model('tiny')
    result = model.transcribe(uploaded, verbose=True)
    return f'You said: {result["text"]}'


def segment(nu_speakers):
    """Diarize ``mono.wav`` and number speakers by order of first appearance.

    Args:
        nu_speakers: Number of speakers passed to the diarizer.

    Returns:
        A DataFrame built from the diarizer's segments, with an added
        ``speaker`` column holding 1-based speaker numbers.
    """
    diar = Diarizer(embed_model='xvec', cluster_method='sc')
    segments = diar.diarize('mono.wav', num_speakers=nu_speakers)
    sdf = pd.DataFrame(segments)

    # Reorganize so the first speaker heard is always speaker 1: factorize
    # assigns codes 0, 1, 2, ... in order of first appearance of each label.
    codes, _ = pd.factorize(sdf['label'])
    sdf['speaker'] = codes + 1
    return sdf


def audio_to_df(uploaded):
    """Convert *uploaded* to ``mono.wav`` and transcribe it into segments.

    Args:
        uploaded: Path of the audio file to convert and transcribe.

    Returns:
        A DataFrame built from the ``segments`` entry of the Whisper result.
    """
    monotize(uploaded)
    model = whisper.load_model('tiny')
    result = model.transcribe(
        'mono.wav', verbose=True, without_timestamps=False
    )
    return pd.DataFrame(result['segments'])


def monotize(uploaded):
    """Convert *uploaded* to 16 kHz mono 16-bit PCM, written to ``mono.wav``.

    The command is passed as an argument list (no shell), so paths containing
    spaces or shell metacharacters are handled safely.

    Args:
        uploaded: Path of the input audio file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    cmd = [
        'ffmpeg', '-y',
        '-i', str(uploaded),
        '-acodec', 'pcm_s16le',
        '-ar', '16000',
        '-ac', '1',
        'mono.wav',
    ]
    # check=True: fail loudly instead of continuing with a stale mono.wav.
    subprocess.run(cmd, check=True)


def add_preface(row):
    """Format one binned row as ``'Speaker N: text'`` with newlines removed.

    Args:
        row: Mapping-like row with ``text`` and ``speaker`` entries.

    Returns:
        The formatted line.
    """
    text = row['text'].replace('\n', '')
    speaker = row['speaker']
    return f'Speaker {speaker}: {text}'


def transcribe(uploaded, nu_speakers):
    """Transcribe *uploaded* and attribute each stretch of text to a speaker.

    Each line is also written to the Streamlit page as it is produced.

    Args:
        uploaded: Path of the audio file to transcribe.
        nu_speakers: Number of speakers passed to the diarizer.

    Returns:
        All ``Speaker N: text`` lines joined with newlines, or an empty
        string when the transcription produced no segments.
    """
    # audio_to_df converts the input to mono.wav itself, so no separate
    # monotize call is needed before it.
    tdf = audio_to_df(uploaded)
    if tdf.empty:
        return ''
    sdf = segment(nu_speakers)

    # Start with no speaker known for any transcript line.
    tdf['speaker'] = float('nan')

    # Tag the transcript line whose start is nearest to the start of each
    # diarized segment with that segment's speaker.
    for seg in sdf[['start', 'speaker']].to_dict(orient='records'):
        distance = (tdf['start'] - seg['start']).abs()
        nearest_id = tdf.loc[distance.idxmin(), 'id']
        tdf.loc[tdf['id'] == nearest_id, 'speaker'] = seg['speaker']

    # Untagged lines inherit the previous tagged speaker; lines before the
    # first tag take the following one.  Assigning the result back (instead
    # of an in-place fillna on a column selection) is reliable under pandas
    # Copy-on-Write.
    tdf['speaker'] = tdf['speaker'].ffill().bfill()

    # A new turn starts on every line whose speaker differs from the previous
    # line's; the running count of those changes is the turn id.
    tdf['turn'] = (tdf['speaker'] != tdf['speaker'].shift(1)).cumsum()

    binned_df = (
        tdf.groupby(['turn', 'speaker'])['text']
        .apply('\n'.join)
        .reset_index()
    )
    binned_df['speaker'] = binned_df['speaker'].astype(int)
    binned_df['output'] = binned_df.apply(add_preface, axis=1)

    lines = binned_df['output'].tolist()
    for line in lines:
        st.write(line)
    return '\n'.join(lines)


form = st.form(key='my_form')
uploaded = form.file_uploader("Choose a file")
nu_speakers = form.slider(
    'Number of speakers in audio file:',
    min_value=1,
    max_value=6,
    value=2,
    step=1,
)
submit = form.form_submit_button("Transcribe!")

if submit:
    if uploaded is None:
        # The form can be submitted without a file; there is nothing to read.
        st.error('Please choose an audio file before transcribing.')
    else:
        bytes_data = uploaded.getvalue()
        with open('temp_audio', 'wb') as outfile:
            outfile.write(bytes_data)
        st.write('Converting audio file.')
        # transcribe() performs the mono conversion as its first step.
        text = transcribe('temp_audio', nu_speakers)