# transcript / app.py
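"""Streamlit app: transcribe an uploaded audio file with OpenAI Whisper and
label speaker turns with simple_diarizer."""
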
import subprocess

import pandas as pd
import streamlit as st
import whisper
from simple_diarizer.diarizer import Diarizer

def speech_to_text(uploaded):
    # One-shot transcription helper; not called by the form flow below.
    model = whisper.load_model('tiny')
    result = model.transcribe(uploaded, verbose=True)
    return f'You said: {result["text"]}'

def segment(nu_speakers):
    # Diarize the 16 kHz mono file written by monotize().
    diar = Diarizer(embed_model='xvec', cluster_method='sc')
    segments = diar.diarize('mono.wav', num_speakers=nu_speakers)
    sdf = pd.DataFrame(segments)
    # Renumber labels so the first speaker to talk is always speaker 1.
    speaker_s = sdf['label'].drop_duplicates().reset_index()['label']
    speaker_d = dict((v, k + 1) for k, v in speaker_s.items())
    sdf['speaker'] = sdf['label'].replace(speaker_d)
    return sdf

def audio_to_df(uploaded):
    # Convert the upload to mono WAV, then transcribe with segment-level timestamps.
    monotize(uploaded)
    model = whisper.load_model('tiny')
    result = model.transcribe('mono.wav', verbose=True,
                              without_timestamps=False)
    tdf = pd.DataFrame(result['segments'])
    return tdf

def monotize(uploaded):
    # Convert the input to a 16 kHz mono PCM WAV named mono.wav.
    cmd = f"ffmpeg -y -i {uploaded} -acodec pcm_s16le -ar 16000 -ac 1 mono.wav"
    subprocess.Popen(cmd, shell=True).wait()

def add_preface(row):
    # Prefix each speech turn with its speaker number.
    text = row['text'].replace('\n', '')
    speaker = row['speaker']
    return f'Speaker {speaker}: {text}'

def transcribe(uploaded, nu_speakers):
    monotize(uploaded)
    tdf = audio_to_df(uploaded)
    sdf = segment(nu_speakers)
    ns_list = sdf[['start', 'speaker']].to_dict(orient='records')

    # Assign each diarized speaker turn to the transcript segment whose
    # start time is closest to the turn's start time.
    for row in ns_list:
        start = row['start']
        seg_id = tdf.iloc[(tdf['start'] - start).abs().argsort()[:1]]['id'].values[0]
        tdf.loc[tdf['id'] == seg_id, 'speaker'] = row['speaker']

    # Fill unlabeled segments from the surrounding speaker labels.
    tdf['speaker'] = tdf['speaker'].ffill().bfill()

    # Group consecutive segments from the same speaker into one speech turn.
    tdf['n1'] = tdf['speaker'] != tdf['speaker'].shift(1)
    tdf['speech'] = tdf['n1'].cumsum()
    binned_df = tdf.groupby(['speech', 'speaker'])['text'].apply('\n'.join).reset_index()
    binned_df['speaker'] = binned_df['speaker'].astype(int)
    binned_df['output'] = binned_df.apply(add_preface, axis=1)

    # Display each turn as it is assembled and return the full transcript.
    lines = []
    for row in binned_df['output'].values:
        st.write(row)
        lines.append(row)
    return '\n'.join(lines)

form = st.form(key='my_form')
uploaded = form.file_uploader("Choose a file")
nu_speakers = form.slider('Number of speakers in audio file:',
                          min_value=1, max_value=6, value=2, step=1)
submit = form.form_submit_button("Transcribe!")

if submit:
    # Write the upload to disk so ffmpeg and Whisper can read it by path.
    bytes_data = uploaded.getvalue()
    with open('temp_audio', 'wb') as outfile:
        outfile.write(bytes_data)
    st.write('Converting audio file.')
    monotize('temp_audio')
    text = transcribe('temp_audio', nu_speakers)
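
# To run locally (assumes ffmpeg is on PATH and the openai-whisper,
# simple_diarizer, pandas, and streamlit packages are installed):
#   streamlit run app.py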