Spaces:
Runtime error
Runtime error
import os | |
import time | |
import gradio as gr | |
from pathlib import Path | |
import pysrt | |
import pandas as pd | |
# One-time environment setup: fetch and build whisper.cpp and fetch the
# Finnish fine-tuned ggml models when they are not present in the working
# directory. Each directory is checked on its own so that a missing model
# repository is still cloned when whisper.cpp already exists.
def _run_setup_command(command):
    """Run a shell setup command and report a non-zero exit status.

    Returns the raw status from ``os.system`` so callers may inspect it.
    Failures are printed instead of raised so the app can still start.
    """
    status = os.system(command)
    if status != 0:
        print(f"Setup command failed with status {status}: {command}")
    return status


_whisper_cpp_dir = os.path.join(os.getcwd(), 'whisper.cpp')
_finnish_models_dir = os.path.join(
    os.getcwd(), 'Finnish-finetuned-whisper-models-ggml-format'
)

if os.path.isdir(_whisper_cpp_dir) and os.path.isdir(_finnish_models_dir):
    print("Models already loaded")
else:
    if not os.path.isdir(_whisper_cpp_dir):
        _run_setup_command('git clone https://github.com/ggerganov/whisper.cpp.git')
        _run_setup_command('make -C ./whisper.cpp')
    if not os.path.isdir(_finnish_models_dir):
        _run_setup_command(
            'git clone https://huggingface.co/Finnish-NLP/Finnish-finetuned-whisper-models-ggml-format'
        )

# Model names offered in the UI dropdown.
whisper_models = ["medium", "large"]

# Maps a UI model name to the ggml model file passed to whisper.cpp.
whisper_modelpath_translator = {
    "medium": "./Finnish-finetuned-whisper-models-ggml-format/ggml-model-fi-medium.bin",
    "large": "./Finnish-finetuned-whisper-models-ggml-format/ggml-model-fi-large-v3.bin",
}
def _format_timestamp(stamp):
    """Format a pysrt time object as ``HH:MM:SS.mmm`` with zero padding."""
    return (
        f'{int(stamp.hours):02d}:{int(stamp.minutes):02d}:'
        f'{int(stamp.seconds):02d}.{int(stamp.milliseconds):03d}'
    )


def speech_to_text(audio_path, whisper_model):
    """Transcribe an audio file with whisper.cpp and return the subtitles.

    The input is first converted with ffmpeg to a 16 kHz mono 16-bit PCM WAV
    file written next to the input as ``<name>_converted.wav``. whisper.cpp is
    then run on that file with ``-osrt`` and the resulting
    ``<name>_converted.wav.srt`` is parsed into a table.

    Args:
        audio_path: Path of the audio file to transcribe.
        whisper_model: Key of ``whisper_modelpath_translator`` selecting the
            ggml model file.

    Returns:
        A pandas DataFrame with the string columns ``start``, ``end`` and
        ``text``; timestamps are formatted ``HH:MM:SS.mmm``. The frame is
        empty when the SRT file cannot be parsed.

    Raises:
        ValueError: If ``audio_path`` is None or ``whisper_model`` is unknown.
        RuntimeError: If ffmpeg or whisper.cpp cannot be started or exits
            with a non-zero status.
    """
    # Imported here so the block does not depend on a file-level import.
    import subprocess

    if audio_path is None:
        raise ValueError("Error no audio input")
    print(audio_path)

    audio_path = str(audio_path)
    # Build the output name from the split stem; str.replace on the extension
    # would corrupt paths without an extension or with the extension text
    # occurring elsewhere in the path.
    stem, file_ending = os.path.splitext(audio_path)
    print(f'file enging is {file_ending}')
    print("starting conversion to wav")
    new_path = stem + "_converted.wav"
    try:
        # Argument list (no shell) so characters in the file name cannot be
        # interpreted by a shell. A failed conversion is reported immediately.
        subprocess.run(
            ['ffmpeg', '-i', audio_path, '-ar', '16000', '-y', '-ac', '1',
             '-c:a', 'pcm_s16le', new_path],
            check=True,
        )
    except (OSError, subprocess.CalledProcessError) as e:
        raise RuntimeError(f'Error Running inference with local model: {e}') from e
    print("conversion to wav ready")

    model_path = whisper_modelpath_translator.get(whisper_model)
    if model_path is None:
        raise ValueError(f'Unknown whisper model: {whisper_model}')

    srt_path = new_path + ".srt"
    try:
        print("starting whisper c++")
        # Remove a stale SRT so an old transcript is never read back.
        try:
            os.remove(srt_path)
        except FileNotFoundError:
            pass
        subprocess.run(
            ['./whisper.cpp/main', new_path, '-t', '4', '-m', model_path, '-osrt'],
            check=True,
        )
        print("starting whisper done with whisper")
    except (OSError, subprocess.CalledProcessError) as e:
        raise RuntimeError(f'Error running Whisper cpp model: {e}') from e

    try:
        rows = [
            {
                'start': _format_timestamp(sub.start),
                'end': _format_timestamp(sub.end),
                'text': sub.text,
            }
            for sub in pysrt.open(srt_path)
        ]
    except Exception as e:
        # Best effort: a parse failure is reported and yields an empty table.
        print(f"Error creating srt df with error: {e}")
        rows = []
    # Build the frame in one step instead of concatenating row by row.
    return pd.DataFrame(rows, columns=['start', 'end', 'text'])
def _write_cue_file(path, cues, header=None, millis_separator='.'):
    """Write numbered subtitle cues to ``path`` as UTF-8 text.

    ``cues`` is a list of ``(start, end, text)`` tuples. When ``header`` is
    given it is written first, followed by a blank line. The ``.`` in each
    timestamp is replaced by ``millis_separator``.
    """
    blocks = []
    for number, (start, end, text) in enumerate(cues, start=1):
        start = str(start).strip().replace('.', millis_separator)
        end = str(end).strip().replace('.', millis_separator)
        blocks.append(f'{number}\n{start} --> {end}\n{text}')
    with open(path, 'w', encoding="utf-8") as file:
        if header is not None:
            file.write(header + '\n\n')
        file.write('\n\n'.join(blocks))
        if blocks:
            file.write('\n')


def output_to_files(df):
    """Write the transcription table to ``subtitles.vtt`` and ``subtitles.srt``.

    Args:
        df: DataFrame with ``start``, ``end`` and ``text`` columns, where the
            timestamps are formatted ``HH:MM:SS.mmm``. It is not modified.

    Returns:
        ``['subtitles.vtt', 'subtitles.srt']``, the files written to the
        current working directory.
    """
    print("Starting SRT-file creation")
    print(df.head())
    # Read the columns positionally so the caller's frame and index are left
    # untouched.
    cues = list(zip(df['start'], df['end'], df['text']))

    print("Starting WEBVTT-file creation")
    # WebVTT needs the "WEBVTT" header followed by a blank line, and every
    # row (including the first) must be emitted as a cue.
    _write_cue_file('subtitles.vtt', cues, header='WEBVTT')
    print("WEBVTT DONE")

    print("Starting SRT-file creation")
    # SubRip timestamps use a comma before the milliseconds.
    _write_cue_file('subtitles.srt', cues, millis_separator=',')
    print("SRT DONE")

    subtitle_files_out = ['subtitles.vtt', 'subtitles.srt']
    return subtitle_files_out
# ---- Gradio Layout -----
# Page-level CSS handed to gr.Blocks.
_PAGE_CSS = '''
#cut_btn, #reset_btn { align-self:stretch; }
#\\31 3 { max-width: 540px; }
.output-markdown {max-width: 65ch !important;}
'''

# Introductory text shown at the top of the page.
_INTRO_MARKDOWN = '''
# Simple Finnish Audio --> Text app
### This space allows you to:
1. Insert audio file or record with microphone
2. Run audio through transcription process using speech recognition models
3. Download generated transcriptions in .vtt and .srt formats
'''

demo = gr.Blocks(css=_PAGE_CSS)
demo.encrypt = False

with demo:
    # Intro text.
    with gr.Row():
        with gr.Column():
            gr.Markdown(_INTRO_MARKDOWN)

    # Audio input, transcription trigger and model choice.
    with gr.Row():
        with gr.Column():
            audio_in = gr.Audio(label="Audio file", type='filepath')
            transcribe_btn = gr.Button("Step 1. Transcribe audio")
            selected_whisper_model = gr.Dropdown(
                choices=whisper_models,
                type="value",
                value="large",
                label="Selected Whisper model",
                interactive=True,
            )

    # Table holding the transcription result.
    with gr.Row():
        with gr.Column():
            transcription_df = gr.DataFrame(
                headers=['start', 'end', 'text'],
                label="Transcription dataframe",
            )

    # Trigger for subtitle file generation.
    with gr.Row():
        with gr.Column():
            translate_transcriptions_button = gr.Button("Step 2. Create subtitle files")

    # Download area for the generated files.
    with gr.Row():
        with gr.Column():
            gr.Markdown('''##### From here you can download subtitles in .srt or .vtt format''')
            subtitle_files = gr.File(
                label="Download files",
                file_count="multiple",
                type="filepath",
                interactive=False,
            )

    # Functionalities
    transcribe_btn.click(
        fn=speech_to_text,
        inputs=[audio_in, selected_whisper_model],
        outputs=[transcription_df],
    )
    translate_transcriptions_button.click(
        fn=output_to_files,
        inputs=transcription_df,
        outputs=[subtitle_files],
    )

demo.launch()