Spaces:

RASMUS
/

Finnish-Audio-to-Text

Sleeping

App Files Files Community

RASMUS committed on Nov 29, 2023

Commit

6c3d109

•

1 Parent(s): dceab61

Create app.py

Browse files

Files changed (1) hide show

app.py +203 -0

app.py ADDED Viewed

	@@ -0,0 +1,203 @@

+import os
+import gradio as gr
+from pathlib import Path
+import pysrt
+import pandas as pd
+if os.path.isdir(f'{os.getcwd() + os.sep}whisper.cpp'):
+    print("Models already loaded")
+else:
+    os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
+    os.system('git clone https://huggingface.co/Finnish-NLP/Finnish-finetuned-whisper-models-ggml-format')
+    os.system('make -C ./whisper.cpp')
+whisper_models = ["medium", "large"]
+whisper_modelpath_translator= {
+    "medium": "./Finnish-finetuned-whisper-models-ggml-format/ggml-model-fi-medium.bin",
+    "large": "./Finnish-finetuned-whisper-models-ggml-format/ggml-model-model-large-v3.bin"
+    }
+def speech_to_text(audio_path, whisper_model):
+    if(audio_path is None):
+        raise ValueError("Error no audio input")
+    print(audio_path)
+    try:
+        _,file_ending = os.path.splitext(f'{audio_path}')
+        print(f'file enging is {file_ending}')
+        print("starting conversion to wav")
+        os.system(f'ffmpeg -i "{audio_path}" -ar 16000 -y -ac 1 -c:a pcm_s16le "{audio_path.replace(file_ending, ".wav")}"')
+        print("conversion to wav ready")
+    except Exception as e:
+        raise RuntimeError(f'Error Running inference with local model: {e}') from e
+    try:
+        print("starting whisper c++")
+        srt_path = str(audio_path.replace(file_ending, ".wav")) + ".srt"
+        os.system(f'rm -f {srt_path}')
+        os.system(f'./whisper.cpp/main "{audio_path.replace(file_ending, ".wav")}" -t 4 -m ./{whisper_modelpath_translator.get(whisper_model)} -osrt')
+        print("starting whisper done with whisper")
+    except Exception as e:
+        raise RuntimeError(f'Error running Whisper cpp model: {e}') from e
+    try:
+        df = pd.DataFrame(columns = ['start','end','text'])
+        srt_path = str(audio_path.replace(file_ending, ".wav")) + ".srt"
+        subs = pysrt.open(srt_path)
+        rows = []
+        for sub in subs:
+            start_hours = str(str(sub.start.hours) + "00")[0:2] if len(str(sub.start.hours)) == 2 else str("0" + str(sub.start.hours) + "00")[0:2]
+            end_hours = str(str(sub.end.hours) + "00")[0:2] if len(str(sub.end.hours)) == 2 else str("0" + str(sub.end.hours) + "00")[0:2]
+            start_minutes = str(str(sub.start.minutes) + "00")[0:2] if len(str(sub.start.minutes)) == 2 else str("0" + str(sub.start.minutes) + "00")[0:2]
+            end_minutes = str(str(sub.end.minutes) + "00")[0:2] if len(str(sub.end.minutes)) == 2 else str("0" + str(sub.end.minutes) + "00")[0:2]
+            start_seconds = str(str(sub.start.seconds) + "00")[0:2] if len(str(sub.start.seconds)) == 2 else str("0" + str(sub.start.seconds) + "00")[0:2]
+            end_seconds = str(str(sub.end.seconds) + "00")[0:2] if len(str(sub.end.seconds)) == 2 else str("0" + str(sub.end.seconds) + "00")[0:2]
+            start_millis = str(str(sub.start.milliseconds) + "000")[0:3]
+            end_millis = str(str(sub.end.milliseconds) + "000")[0:3]
+            rows.append([sub.text, f'{start_hours}:{start_minutes}:{start_seconds}.{start_millis}', f'{end_hours}:{end_minutes}:{end_seconds}.{end_millis}'])
+        for row in rows:
+            srt_to_df = {
+            'start': [row[1]],
+            'end': [row[2]],
+            'text': [row[0]]
+            }
+            df = pd.concat([df, pd.DataFrame(srt_to_df)])
+    except Exception as e:
+        print(f"Error creating srt df with error: {e}")
+    return df
+def output_to_files(df):
+    df.reset_index(inplace=True)
+    print("Starting SRT-file creation")
+    print(df.head())
+    with open('subtitles.vtt','w', encoding="utf-8") as file:
+        print("Starting WEBVTT-file creation")
+        for i in range(len(df)):
+            if i == 0:
+                file.write('WEBVTT')
+                file.write('\n')
+            else:
+                file.write(str(i+1))
+                file.write('\n')
+                start = df.iloc[i]['start']
+                file.write(f"{start.strip()}")
+                stop = df.iloc[i]['end']
+                file.write(' --> ')
+                file.write(f"{stop}")
+                file.write('\n')
+                file.writelines(df.iloc[i]['text'])
+                if int(i) != len(df)-1:
+                    file.write('\n\n')
+    print("WEBVTT DONE")
+    with open('subtitles.srt','w', encoding="utf-8") as file:
+        print("Starting SRT-file creation")
+        for i in range(len(df)):
+            file.write(str(i+1))
+            file.write('\n')
+            start = df.iloc[i]['start']
+            file.write(f"{start.strip()}")
+            stop = df.iloc[i]['end']
+            file.write(' --> ')
+            file.write(f"{stop}")
+            file.write('\n')
+            file.writelines(df.iloc[i]['text'])
+            if int(i) != len(df)-1:
+                file.write('\n\n')
+    print("SRT DONE")
+    subtitle_files_out = ['subtitles.vtt','subtitles.srt']
+    return subtitle_files_out
+# ---- Gradio layout: builds the Blocks UI, wires the two buttons to their handlers, and launches the app -----
+demo = gr.Blocks(css='''
+#cut_btn, #reset_btn { align-self:stretch; }
+#\\31 3 { max-width: 540px; }
+.output-markdown {max-width: 65ch !important;}
+''')  # NOTE(review): no elements named cut_btn, reset_btn or "13" (the '#\\31 3' rule) are created in this section - confirm these selectors are still needed
+demo.encrypt = False  # NOTE(review): sets an attribute on the Blocks object; whether the installed Gradio version reads it is not visible here - confirm
+with demo:  # components created inside this context are added to the demo layout
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown('''
+            # FINNISH Audio --> TEXT APP
+            ### This space allows you to:
+            1. Insert audio file or record with microphone
+            2. Run audio through transcription process using speech recognition models
+            3. Download generated transcriptions in .vtt and .srt formats
+            ''')
+    with gr.Row():
+        with gr.Column():
+            audio_in = gr.Audio(label="Audio file", type='filepath')  # type='filepath': speech_to_text treats this input as a path string
+            transcribe_btn = gr.Button("Step 1. Transcribe audio")  # runs speech_to_text (see event wiring at the end of this section)
+            selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="large", label="Selected Whisper model", interactive=True)  # choices come from whisper_models; "large" is preselected
+    with gr.Row():
+        with gr.Column():
+            transcription_df = gr.DataFrame(headers = ['start','end','text'], label="Transcription dataframe")#, row_count=(1, "dynamic"))
+    with gr.Row():
+        with gr.Column():
+            translate_transcriptions_button = gr.Button("Step 2. Create subtitle files")  # despite the name, this button is wired to output_to_files, not to a translation step
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown('''##### From here you can download subtitles in .srt or .vtt format''')
+            subtitle_files = gr.File(
+                label="Download files",
+                file_count="multiple",
+                type="filepath",
+                interactive=False,
+            )
+    # Event wiring: each click calls the handler with the listed input components and sends its return value to the listed outputs
+    transcribe_btn.click(speech_to_text, [audio_in, selected_whisper_model], [transcription_df])  # audio path + model name -> transcription table
+    translate_transcriptions_button.click(output_to_files, transcription_df, [subtitle_files])  # transcription table -> downloadable .vtt/.srt files
+demo.launch()  # runs at module level; there is no __main__ guard