import base64
import subprocess
import tempfile

import pandas as pd
import streamlit as st
import whisper
from simple_diarizer.diarizer import Diarizer



def create_download_link(val, filename, label):
    '''Hack to have a stable download link in Streamlit'''
    b64 = base64.b64encode(val)
    return f'<a href="data:application/octet-stream;base64,{b64.decode()}" download="{filename}">{label}</a>'
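
# Note: newer Streamlit releases provide st.download_button, which could
# replace this base64 data-URI workaround.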


def segment(nu_speakers):
    '''Segment the audio using simple_diarizer.
    Defaults to the speechbrain ECAPA-TDNN embeddings.'''

    # 'ecapa' = speechbrain ECAPA-TDNN embeddings; 'sc' = spectral clustering
    diar = Diarizer(embed_model='ecapa', cluster_method='sc')
    segments = diar.diarize(temp_file, num_speakers=nu_speakers)
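    # Each returned segment is a dict with (at least) 'start'/'end' times
    # and a cluster 'label' for the speaker it was assigned to.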

    sdf = pd.DataFrame(segments)

    # Renumber labels so the first speaker to talk is always speaker 1
    speaker_s = sdf['label'].drop_duplicates().reset_index()['label']
    speaker_d = {v: k + 1 for k, v in speaker_s.items()}

    sdf['speaker'] = sdf['label'].replace(speaker_d)
    return sdf


def monotize(uploaded):
    '''Convert the uploaded file to a 16 kHz mono WAV with ffmpeg.'''
    # -acodec pcm_s16le: 16-bit PCM; -ar 16000: 16 kHz; -ac 1: mono
    cmd = f"ffmpeg -y -i {uploaded} -acodec pcm_s16le -ar 16000 -ac 1 {temp_file}"
    subprocess.run(cmd, shell=True)

def audio_to_df(uploaded):
    '''Transcribe the converted audio into a dataframe of time-stamped segments.'''
    #monotize(uploaded)
    model = whisper.load_model(model_size)
    result = model.transcribe(temp_file,
                              without_timestamps=False,
                              task=task)
    # result['segments'] is a list of dicts with 'id', 'start', 'end', and 'text'
    tdf = pd.DataFrame(result['segments'])
    return tdf



def add_preface(row):
    '''Add a speaker prefix to each row's text (used in transcribe()).'''
    text = row['text'].replace('\n','')
    speaker = row['speaker']
    return f'Speaker {speaker}: {text}'

def transcribe(uploaded, nu_speakers):
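    '''Convert the upload to mono, transcribe it with Whisper, diarize it,
    and merge the two outputs into speaker-labeled turns.'''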
    # Convert file to mono
    with st.spinner(text="Converting file..."):
        monotize('temp_audio')

    # Make the audio available to play in the UI
    with open(temp_file, 'rb') as audio_file:
        audio_bytes = audio_file.read()
    st.audio(audio_bytes, format='audio/wav')

    # Transcribe the file
    with st.spinner(text=f"Transcribing using {model_size} model..."):
        tdf = audio_to_df(uploaded)
    # Segment the file by speaker
    with st.spinner(text="Segmenting..."):
        sdf = segment(nu_speakers)

    # Find the transcript line nearest to the start of each speaker segment
    ns_list = sdf[['start', 'speaker']].to_dict(orient='records')
    for row in ns_list:
        start = row['start']
        seg_id = tdf.iloc[(tdf['start'] - start).abs().argsort()[:1]]['id'].values[0]
        tdf.loc[tdf['id'] == seg_id, 'speaker'] = row['speaker']
    # Fill speaker labels forward, then backward, to cover unlabeled lines
    tdf['speaker'] = tdf['speaker'].ffill().bfill()
    # A new turn starts wherever the speaker changes; cumsum numbers the turns
    tdf['n1'] = tdf['speaker'] != tdf['speaker'].shift(1)
    tdf['speech'] = tdf['n1'].cumsum()

    # Collapse the dataframe by speech turn
    binned_df = tdf.groupby(['speech', 'speaker'])['text'].apply('\n'.join).reset_index()
    binned_df['speaker'] = binned_df['speaker'].astype(int)
    binned_df['output'] = binned_df.apply(add_preface, axis=1)

    # Display the transcript and prepare for export
    lines = []
    for row in binned_df['output'].values:
        st.write(row)
        lines.append(row)
    tdf['speaker'] = tdf['speaker'].astype(int)

    tdf_cols = ['speaker','start','end','text']
    #st.dataframe(tdf[tdf_cols])
    return {'text':lines, 'df': tdf[tdf_cols]}


descript = ("This web app creates transcripts using OpenAI's [Whisper](https://github.com/openai/whisper) to transcribe "
            "audio files, combined with [Chau](https://github.com/cvqluu)'s [Simple Diarizer](https://github.com/cvqluu/simple_diarizer) "
            "to partition the text by speaker.\n"
            "* You can upload an audio or video file of up to 200 MB.\n"
            "* Creating the transcript takes some time: "
            "roughly 20% of the length of the audio file with the base Whisper model.\n"
            "* The transcription process handles a variety of languages and can also translate the audio "
            "to English, although the tiny model translates poorly.\n"
            "* Speaker segmentation seems to work best with the base model. The small model produces better "
            "transcripts, but something seems off with its timecodes, degrading the speaker attribution.\n"
            "* After uploading the file, be sure to select the number of speakers.")

st.title("Automated Transcription")
st.markdown(descript)

form = st.form(key='my_form')
uploaded = form.file_uploader("Choose a file")
nu_speakers = form.slider('Number of speakers in recording:', min_value=1, max_value=8, value=2, step=1)
models = form.selectbox(
    'Which Whisper model?',
    ('Tiny (fast)', 'Base (good)', 'Small (great but slow)', 'Medium (greater but slower)'), index=1)
translate = form.checkbox('Translate to English?')
submit = form.form_submit_button("Transcribe!")


if submit:
    # Map the menu choice to a Whisper model name
    model_sizes = {'Tiny (fast)': 'tiny',
                   'Base (good)': 'base',
                   'Small (great but slow)': 'small',
                   'Medium (greater but slower)': 'medium'}
    model_size = model_sizes[models]

    if translate:
        task = 'translate'
    else:
        task = 'transcribe'

    # Temporary directory holding the converted mono audio file
    tmp_dir = tempfile.TemporaryDirectory()
    temp_file = tmp_dir.name + '/mono.wav'
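    # The raw upload is saved as 'temp_audio' in the working directory;
    # monotize() later converts it into temp_file inside the temp directory.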

    bytes_data = uploaded.getvalue()
    with open('temp_audio', 'wb') as outfile:
        outfile.write(bytes_data)


    # Transcribe/translate and segment
    transcript = transcribe('temp_audio', nu_speakers)

    # Prepare text file for export.
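    # create_download_link() expects bytes, hence the explicit utf-8 encode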
    text = '\n'.join(transcript['text']).encode('utf-8')
    download_url = create_download_link(text, 'transcript.txt', 'Download transcript as plain text.')
    st.markdown(download_url, unsafe_allow_html=True)

    # Prepare CSV file for export.
    csv = transcript['df'].to_csv(float_format='%.2f', index=False).encode('utf-8')
    download_url = create_download_link(csv, 'transcript.csv', 'Download transcript as CSV (with time codes)')
    st.markdown(download_url, unsafe_allow_html=True)
    tmp_dir.cleanup()