# Hugging Face Space: RASMUS/Finnish-Audio-to-Text
# Space status when this copy was captured: Runtime error
# File size: 7,313 Bytes

import os
import subprocess
import time
from pathlib import Path

import gradio as gr
import pandas as pd
import pysrt

# One-time bootstrap: unless a whisper.cpp checkout already exists in the
# working directory, clone whisper.cpp and the Finnish ggml models, then build.
if not os.path.isdir(f'{os.getcwd() + os.sep}whisper.cpp'):
    for setup_command in (
        'git clone https://github.com/ggerganov/whisper.cpp.git',
        'git clone https://huggingface.co/Finnish-NLP/Finnish-finetuned-whisper-models-ggml-format',
        'make -C ./whisper.cpp',
    ):
        os.system(setup_command)
else:
    print("Models already loaded")



# Maps each selectable model name to the ggml weights file passed to whisper.cpp.
whisper_modelpath_translator = dict(
    medium="./Finnish-finetuned-whisper-models-ggml-format/ggml-model-fi-medium.bin",
    large="./Finnish-finetuned-whisper-models-ggml-format/ggml-model-fi-large-v3.bin",
)

# Model names offered in the dropdown, in the order they are declared above.
whisper_models = list(whisper_modelpath_translator)



def _format_timestamp(srt_time):
    """Render a pysrt time value as zero-padded ``HH:MM:SS.mmm``."""
    return (
        f'{int(srt_time.hours):02d}:{int(srt_time.minutes):02d}:'
        f'{int(srt_time.seconds):02d}.{int(srt_time.milliseconds):03d}'
    )


def speech_to_text(audio_path, whisper_model):
    """Transcribe an audio file with whisper.cpp and return the cues as a table.

    The audio is first converted with ffmpeg to a 16 kHz mono 16-bit PCM wav
    file stored next to the input, then ``./whisper.cpp/main`` is run on it
    with the ``-osrt`` flag and the resulting ``.srt`` file is parsed.

    Args:
        audio_path: Path of the uploaded or recorded audio file, or ``None``
            when no audio was supplied.
        whisper_model: Key of ``whisper_modelpath_translator`` selecting the
            ggml model file.

    Returns:
        A ``pandas.DataFrame`` with the string columns ``start``, ``end`` and
        ``text``; timestamps are formatted as ``HH:MM:SS.mmm``. If the ``.srt``
        file cannot be read or parsed, the error is printed and an empty frame
        with the same columns is returned.

    Raises:
        ValueError: If ``audio_path`` is ``None`` or ``whisper_model`` is not
            a known model name.
        RuntimeError: If ffmpeg or whisper.cpp cannot be started or exits with
            a non-zero status.
    """
    if audio_path is None:
        # Waiting cannot make a missing input appear, so fail immediately.
        raise ValueError("Error no audio input")

    model_path = whisper_modelpath_translator.get(whisper_model)
    if model_path is None:
        raise ValueError(f'Unknown whisper model: {whisper_model}')

    print(audio_path)
    audio_path = str(audio_path)

    # Only swap the trailing extension; str.replace would also rewrite any
    # earlier occurrence and mangles the path when there is no extension.
    stem, file_ending = os.path.splitext(audio_path)
    print(f'file enging is {file_ending}')
    new_path = f'{stem}_converted.wav'

    try:
        print("starting conversion to wav")
        # Argument list instead of a shell string: no quoting problems with
        # unusual file names, and check=True surfaces a failed conversion.
        subprocess.run(
            ['ffmpeg', '-i', audio_path, '-ar', '16000', '-y', '-ac', '1',
             '-c:a', 'pcm_s16le', new_path],
            check=True,
        )
        print("conversion to wav ready")
    except (OSError, subprocess.CalledProcessError) as e:
        raise RuntimeError(f'Error converting audio to wav: {e}') from e

    # The parsing step below reads the transcript from this path.
    srt_path = new_path + ".srt"
    try:
        print("starting whisper c++")
        try:
            # Drop a stale transcript so an old result is never parsed.
            os.remove(srt_path)
        except FileNotFoundError:
            pass
        subprocess.run(
            ['./whisper.cpp/main', new_path, '-t', '4', '-m', model_path, '-osrt'],
            check=True,
        )
        print("starting whisper done with whisper")
    except (OSError, subprocess.CalledProcessError) as e:
        raise RuntimeError(f'Error running Whisper cpp model: {e}') from e

    try:
        rows = [
            [_format_timestamp(sub.start), _format_timestamp(sub.end), sub.text]
            for sub in pysrt.open(srt_path)
        ]
    except Exception as e:
        # Best effort: report the problem and hand back an empty table.
        print(f"Error creating srt df with error: {e}")
        rows = []

    return pd.DataFrame(rows, columns=['start', 'end', 'text'])

def _normalise_timestamp(value, decimal_mark):
    """Strip *value* and force its fractional-second separator to *decimal_mark*."""
    return str(value).strip().replace(',', '.').replace('.', decimal_mark)


def _write_subtitle_file(path, cues, decimal_mark, header=''):
    """Write *header* followed by the numbered *cues*, blank-line separated, to *path*."""
    blocks = [
        f'{number}\n'
        f'{_normalise_timestamp(start, decimal_mark)} --> '
        f'{_normalise_timestamp(stop, decimal_mark)}\n'
        f'{text}'
        for number, (start, stop, text) in enumerate(cues, start=1)
    ]
    with open(path, 'w', encoding="utf-8") as file:
        file.write(header)
        if blocks:
            file.write('\n\n'.join(blocks) + '\n')


def output_to_files(df):
    """Write the transcription table to ``subtitles.vtt`` and ``subtitles.srt``.

    Both files are created in the current working directory and contain one
    cue per row of ``df``, numbered from 1. The WebVTT file starts with the
    ``WEBVTT`` header line followed by a blank line and uses ``.`` before the
    milliseconds; the SRT file uses ``,``. ``df`` itself is not modified.

    Args:
        df: ``pandas.DataFrame`` with the columns ``start``, ``end`` and
            ``text``.

    Returns:
        ``['subtitles.vtt', 'subtitles.srt']`` - the paths of the written files.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    print("Starting SRT-file creation")
    print(df.head())

    # Read the columns positionally so the caller's frame and index stay untouched.
    cues = list(zip(df['start'], df['end'], df['text']))

    print("Starting WEBVTT-file creation")
    # The header is written unconditionally and every row, including the
    # first one, becomes a cue.
    _write_subtitle_file('subtitles.vtt', cues, '.', header='WEBVTT\n\n')
    print("WEBVTT DONE")

    print("Starting SRT-file creation")
    _write_subtitle_file('subtitles.srt', cues, ',')
    print("SRT DONE")

    subtitle_files_out = ['subtitles.vtt', 'subtitles.srt']
    return subtitle_files_out

# ---- Gradio Layout -----





# Stylesheet passed to the Blocks container below.
custom_css = '''
#cut_btn, #reset_btn { align-self:stretch; }
#\\31 3 { max-width: 540px; }
.output-markdown {max-width: 65ch !important;}
'''

# Root container of the app; the layout is attached to it further down.
demo = gr.Blocks(css=custom_css)
demo.encrypt = False


with demo:
    # Introductory text.
    with gr.Row(), gr.Column():
        gr.Markdown('''
            # Simple Finnish Audio --> Text app
            ### This space allows you to: 
            1. Insert audio file or record with microphone
            2. Run audio through transcription process using speech recognition models
            3. Download generated transcriptions in .vtt and .srt formats
            ''')

    # Audio input, transcription trigger and model selection.
    with gr.Row(), gr.Column():
        audio_in = gr.Audio(label="Audio file", type='filepath')
        transcribe_btn = gr.Button("Step 1. Transcribe audio")
        selected_whisper_model = gr.Dropdown(
            choices=whisper_models,
            type="value",
            value="large",
            label="Selected Whisper model",
            interactive=True,
        )

    # Table that receives the transcription.
    with gr.Row(), gr.Column():
        transcription_df = gr.DataFrame(
            headers=['start', 'end', 'text'],
            label="Transcription dataframe",
        )

    # Button that turns the table into subtitle files.
    with gr.Row(), gr.Column():
        translate_transcriptions_button = gr.Button("Step 2. Create subtitle files")

    # Download area for the generated files.
    with gr.Row(), gr.Column():
        gr.Markdown('''##### From here you can download subtitles in .srt or .vtt format''')
        subtitle_files = gr.File(
            label="Download files",
            file_count="multiple",
            type="filepath",
            interactive=False,
        )

    # Event wiring: step 1 fills the table, step 2 writes the files from it.
    transcribe_btn.click(
        fn=speech_to_text,
        inputs=[audio_in, selected_whisper_model],
        outputs=[transcription_df],
    )
    translate_transcriptions_button.click(
        fn=output_to_files,
        inputs=transcription_df,
        outputs=[subtitle_files],
    )

demo.launch()