RASMUS committed on
Commit
6c3d109
1 Parent(s): dceab61

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +203 -0
app.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from pathlib import Path
4
+ import pysrt
5
+ import pandas as pd
6
+
7
def _run_setup_command(command):
    """Run a shell command and raise RuntimeError if it exits non-zero."""
    if os.system(command) != 0:
        raise RuntimeError(f'Setup command failed: {command}')


# One-time environment setup. Each resource is checked on its own so that a
# start-up which failed half-way (e.g. the model clone or the build) is
# repaired on the next start instead of being skipped because the whisper.cpp
# directory already exists.
_models_dir = 'Finnish-finetuned-whisper-models-ggml-format'
_whisper_binary = os.path.join('whisper.cpp', 'main')

if (os.path.isdir('whisper.cpp')
        and os.path.isdir(_models_dir)
        and os.path.isfile(_whisper_binary)):
    print("Models already loaded")
else:
    if not os.path.isdir('whisper.cpp'):
        _run_setup_command('git clone https://github.com/ggerganov/whisper.cpp.git')
    if not os.path.isdir(_models_dir):
        _run_setup_command('git clone https://huggingface.co/Finnish-NLP/Finnish-finetuned-whisper-models-ggml-format')
    if not os.path.isfile(_whisper_binary):
        _run_setup_command('make -C ./whisper.cpp')


# Model names offered in the UI dropdown.
whisper_models = ["medium", "large"]

# Maps a dropdown model name to the ggml weight file inside the cloned model
# repository.
# NOTE(review): "ggml-model-model-large-v3.bin" repeats "model" - confirm the
# file name against the model repository.
whisper_modelpath_translator = {
    "medium": "./Finnish-finetuned-whisper-models-ggml-format/ggml-model-fi-medium.bin",
    "large": "./Finnish-finetuned-whisper-models-ggml-format/ggml-model-model-large-v3.bin"
}
21
+
22
+
23
+
24
def _format_timestamp(moment):
    """Return a pysrt time value as ``HH:MM:SS.mmm`` with zero-padded fields."""
    return (
        f'{int(moment.hours):02d}:{int(moment.minutes):02d}:'
        f'{int(moment.seconds):02d}.{int(moment.milliseconds):03d}'
    )


def speech_to_text(audio_path, whisper_model):
    """Transcribe an audio file with whisper.cpp and return the cues as a table.

    The input is converted with ffmpeg to a 16 kHz, mono, 16-bit PCM wav file,
    ``./whisper.cpp/main`` is run on that file with ``-osrt``, and the
    ``<wav file>.srt`` file is then parsed with pysrt.

    Args:
        audio_path: Path of the audio file to transcribe, or None when no
            audio was supplied.
        whisper_model: Key into ``whisper_modelpath_translator`` selecting the
            model weights to use.

    Returns:
        pandas.DataFrame with the columns ``start``, ``end`` and ``text``, one
        row per subtitle; ``start``/``end`` are ``HH:MM:SS.mmm`` strings. The
        frame is empty when the SRT file could not be read.

    Raises:
        ValueError: If ``audio_path`` is None or ``whisper_model`` is unknown.
        RuntimeError: If ffmpeg or whisper.cpp cannot be run or exits with an
            error.
    """
    # Imported locally so this function does not depend on a file-level import.
    import subprocess

    if audio_path is None:
        raise ValueError("Error no audio input")
    print(audio_path)

    model_path = whisper_modelpath_translator.get(whisper_model)
    if model_path is None:
        raise ValueError(f'Unknown whisper model: {whisper_model}')

    source = Path(audio_path)
    print(f'file enging is {source.suffix}')

    # Swap only the final extension. When the upload is already a .wav file,
    # write to a different name so ffmpeg never overwrites its own input.
    wav_path = source.with_suffix('.wav')
    if wav_path == source:
        wav_path = source.with_name(f'{source.stem}_16k.wav')
    srt_path = Path(f'{wav_path}.srt')

    try:
        print("starting conversion to wav")
        # Argument list instead of a shell string: file names containing
        # quotes or shell metacharacters are passed through verbatim.
        subprocess.run(
            ['ffmpeg', '-i', str(source), '-ar', '16000', '-y', '-ac', '1',
             '-c:a', 'pcm_s16le', str(wav_path)],
            check=True,
        )
        print("conversion to wav ready")
    except (OSError, subprocess.CalledProcessError) as e:
        raise RuntimeError(f'Error converting audio to wav with ffmpeg: {e}') from e

    try:
        print("starting whisper c++")
        # Remove a transcript left over from an earlier run so a failed run
        # cannot be mistaken for a fresh result.
        try:
            srt_path.unlink()
        except FileNotFoundError:
            pass
        subprocess.run(
            ['./whisper.cpp/main', str(wav_path), '-t', '4',
             '-m', model_path, '-osrt'],
            check=True,
        )
        print("starting whisper done with whisper")
    except (OSError, subprocess.CalledProcessError) as e:
        raise RuntimeError(f'Error running Whisper cpp model: {e}') from e

    try:
        rows = [
            (_format_timestamp(sub.start), _format_timestamp(sub.end), sub.text)
            for sub in pysrt.open(str(srt_path))
        ]
    except Exception as e:
        # Best effort, as before: report the problem and return an empty table.
        print(f"Error creating srt df with error: {e}")
        rows = []

    # Build the frame once instead of concatenating one row at a time.
    return pd.DataFrame(rows, columns=['start', 'end', 'text'])
85
+
86
def output_to_files(df):
    """Write the transcription table to ``subtitles.vtt`` and ``subtitles.srt``.

    Both files are written to the current working directory with UTF-8
    encoding and contain one numbered cue per row of ``df``. The WebVTT file
    starts with the ``WEBVTT`` header followed by a blank line; the SRT file
    uses a comma as the millisecond separator.

    Args:
        df: pandas DataFrame with the columns ``start``, ``end`` and ``text``.
            ``start`` and ``end`` are expected to be ``HH:MM:SS.mmm`` strings.
            The frame is not modified.

    Returns:
        list[str]: ``['subtitles.vtt', 'subtitles.srt']``.
    """
    print("Starting subtitle file creation")
    print(df.head())

    # Read the three columns directly; the caller's frame is left untouched.
    cues = [
        (str(start).strip(), str(stop).strip(), str(text).strip())
        for start, stop, text in zip(df['start'], df['end'], df['text'])
    ]

    with open('subtitles.vtt', 'w', encoding="utf-8") as file:
        print("Starting WEBVTT-file creation")
        # The header must be followed by a blank line before the first cue.
        file.write('WEBVTT\n\n')
        vtt_blocks = [
            f'{number}\n{start} --> {stop}\n{text}'
            for number, (start, stop, text) in enumerate(cues, start=1)
        ]
        if vtt_blocks:
            file.write('\n\n'.join(vtt_blocks))
            file.write('\n')

    print("WEBVTT DONE")

    with open('subtitles.srt', 'w', encoding="utf-8") as file:
        print("Starting SRT-file creation")
        # SubRip separates seconds and milliseconds with a comma, not a dot.
        srt_blocks = [
            f'{number}\n{start.replace(".", ",")} --> {stop.replace(".", ",")}\n{text}'
            for number, (start, stop, text) in enumerate(cues, start=1)
        ]
        if srt_blocks:
            file.write('\n\n'.join(srt_blocks))
            file.write('\n')

    print("SRT DONE")
    subtitle_files_out = ['subtitles.vtt', 'subtitles.srt']

    return subtitle_files_out
147
+
148
+ # ---- Gradio Layout -----
149
+
150
+
151
+
152
+
153
+
154
# Top-level Blocks container; the css argument carries page-level overrides.
demo = gr.Blocks(css='''
#cut_btn, #reset_btn { align-self:stretch; }
#\\31 3 { max-width: 540px; }
.output-markdown {max-width: 65ch !important;}
''')
demo.encrypt = False


with demo:
    # Introduction text.
    with gr.Row(), gr.Column():
        gr.Markdown('''
            # FINNISH Audio --> TEXT APP
            ### This space allows you to:
            1. Insert audio file or record with microphone
            2. Run audio through transcription process using speech recognition models
            3. Download generated transcriptions in .vtt and .srt formats
            ''')

    # Step 1 inputs: the audio file, the trigger button and the model choice.
    with gr.Row(), gr.Column():
        audio_in = gr.Audio(
            label="Audio file",
            type='filepath',
        )
        transcribe_btn = gr.Button("Step 1. Transcribe audio")
        selected_whisper_model = gr.Dropdown(
            choices=whisper_models,
            type="value",
            value="large",
            label="Selected Whisper model",
            interactive=True,
        )

    # Table that receives the output of speech_to_text.
    with gr.Row(), gr.Column():
        transcription_df = gr.DataFrame(
            headers=['start', 'end', 'text'],
            label="Transcription dataframe",
        )

    # Step 2 trigger.
    with gr.Row(), gr.Column():
        translate_transcriptions_button = gr.Button("Step 2. Create subtitle files")

    # Download area for the files returned by output_to_files.
    with gr.Row(), gr.Column():
        gr.Markdown('''##### From here you can download subtitles in .srt or .vtt format''')
        subtitle_files = gr.File(
            label="Download files",
            file_count="multiple",
            type="filepath",
            interactive=False,
        )

    # Event wiring: step 1 fills the table, step 2 turns the table into files.
    transcribe_btn.click(
        speech_to_text,
        [audio_in, selected_whisper_model],
        [transcription_df],
    )
    translate_transcriptions_button.click(
        output_to_files,
        transcription_df,
        [subtitle_files],
    )

demo.launch()