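# Gradio app for Finnish speech-to-text: transcribe audio with whisper.cpp using
# Finnish fine-tuned Whisper models and export the result as .vtt and .srt subtitle files.
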
import os
import time
import gradio as gr
from pathlib import Path
import pysrt
import pandas as pd

# Clone and build whisper.cpp and fetch the Finnish ggml-format models on first start
if os.path.isdir(f'{os.getcwd() + os.sep}whisper.cpp'):
    print("Models already loaded")
else:
    os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
    os.system('git clone https://huggingface.co/Finnish-NLP/Finnish-finetuned-whisper-models-ggml-format')
    os.system('make -C ./whisper.cpp')

whisper_models = ["medium", "large"]
whisper_modelpath_translator = {
    "medium": "./Finnish-finetuned-whisper-models-ggml-format/ggml-model-fi-medium.bin",
    "large": "./Finnish-finetuned-whisper-models-ggml-format/ggml-model-fi-large-v3.bin"
}


def speech_to_text(audio_path, whisper_model):
    if audio_path is None:
        # The path cannot change once the function has been called, so fail fast
        raise ValueError("Error: no audio input")
    print(audio_path)
    try:
        _, file_ending = os.path.splitext(audio_path)
        print(f'file ending is {file_ending}')
        print("starting conversion to wav")
        # whisper.cpp expects 16 kHz, mono, 16-bit PCM wav input
        new_path = audio_path.replace(file_ending, "_converted.wav")
        for retry_cnt in range(3):
            return_code = os.system(f'ffmpeg -i "{audio_path}" -ar 16000 -y -ac 1 -c:a pcm_s16le "{new_path}"')
            if return_code == 0:
                break
            time.sleep(0.5)
        print("conversion to wav ready")
    except Exception as e:
        raise RuntimeError(f'Error converting audio to wav: {e}') from e

    try:
        print("starting whisper c++")
        # whisper.cpp writes its SRT output next to the input wav when -osrt is given
        srt_path = new_path + ".srt"
        os.system(f'rm -f {srt_path}')
        os.system(f'./whisper.cpp/main "{new_path}" -t 4 -m {whisper_modelpath_translator.get(whisper_model)} -osrt')
        print("done with whisper")
    except Exception as e:
        raise RuntimeError(f'Error running Whisper cpp model: {e}') from e

    try:
        # Parse the generated SRT into a dataframe with zero-padded HH:MM:SS.mmm timestamps
        subs = pysrt.open(srt_path)
        rows = []
        for sub in subs:
            start = f'{sub.start.hours:02d}:{sub.start.minutes:02d}:{sub.start.seconds:02d}.{sub.start.milliseconds:03d}'
            end = f'{sub.end.hours:02d}:{sub.end.minutes:02d}:{sub.end.seconds:02d}.{sub.end.milliseconds:03d}'
            rows.append({'start': start, 'end': end, 'text': sub.text})
        df = pd.DataFrame(rows, columns=['start', 'end', 'text'])
    except Exception as e:
        print(f"Error creating srt df with error: {e}")
        df = pd.DataFrame(columns=['start', 'end', 'text'])
    return df
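

# Write the transcription dataframe to WebVTT and SRT subtitle files and
# return the file paths for the Gradio File component.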
def output_to_files(df):
    df.reset_index(inplace=True)
    print("Starting subtitle file creation")
    print(df.head())
    with open('subtitles.vtt', 'w', encoding="utf-8") as file:
        print("Starting WEBVTT-file creation")
        # The WEBVTT header must be followed by a blank line before the first cue
        file.write('WEBVTT\n\n')
        for i in range(len(df)):
            file.write(str(i + 1))
            file.write('\n')
            start = df.iloc[i]['start'].strip()
            stop = df.iloc[i]['end']
            file.write(f'{start} --> {stop}\n')
            file.write(df.iloc[i]['text'])
            if i != len(df) - 1:
                file.write('\n\n')
        print("WEBVTT DONE")
    with open('subtitles.srt', 'w', encoding="utf-8") as file:
        print("Starting SRT-file creation")
        for i in range(len(df)):
            file.write(str(i + 1))
            file.write('\n')
            # SRT timestamps use a comma as the millisecond separator
            start = df.iloc[i]['start'].strip().replace('.', ',')
            stop = df.iloc[i]['end'].replace('.', ',')
            file.write(f'{start} --> {stop}\n')
            file.write(df.iloc[i]['text'])
            if i != len(df) - 1:
                file.write('\n\n')
        print("SRT DONE")
    subtitle_files_out = ['subtitles.vtt', 'subtitles.srt']
    return subtitle_files_out
# ---- Gradio Layout -----
demo = gr.Blocks(css='''
#cut_btn, #reset_btn { align-self:stretch; }
#\\31 3 { max-width: 540px; }
.output-markdown {max-width: 65ch !important;}
''')
demo.encrypt = False

with demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown('''
# Simple Finnish Audio --> Text app
### This space allows you to:
1. Upload an audio file or record with the microphone
2. Run the audio through the transcription process using speech recognition models
3. Download the generated transcriptions in .vtt and .srt formats
''')
    with gr.Row():
        with gr.Column():
            audio_in = gr.Audio(label="Audio file", type='filepath')
            transcribe_btn = gr.Button("Step 1. Transcribe audio")
            selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="large", label="Selected Whisper model", interactive=True)
    with gr.Row():
        with gr.Column():
            transcription_df = gr.DataFrame(headers=['start', 'end', 'text'], label="Transcription dataframe")
    with gr.Row():
        with gr.Column():
            translate_transcriptions_button = gr.Button("Step 2. Create subtitle files")
    with gr.Row():
        with gr.Column():
            gr.Markdown('''##### From here you can download subtitles in .srt or .vtt format''')
            subtitle_files = gr.File(
                label="Download files",
                file_count="multiple",
                type="filepath",
                interactive=False,
            )
    # Functionalities
    transcribe_btn.click(speech_to_text, [audio_in, selected_whisper_model], [transcription_df])
    translate_transcriptions_button.click(output_to_files, transcription_df, [subtitle_files])
demo.launch()