|
import os

import gradio as gr
import pandas as pd
import torch
from faster_whisper import WhisperModel
|
|
|
# Name of the Whisper checkpoint passed to faster-whisper.
model_size = "large-v2"

# "cuda:0" when PyTorch reports an available CUDA device, otherwise "cpu".
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Choose backend and numeric precision as a pair: float16 on the GPU,
# int8 on the CPU.
_backend, _precision = ("cuda", "float16") if device == "cuda:0" else ("cpu", "int8")

# Single shared model instance used by every transcription request.
model_whisper = WhisperModel(model_size, device=_backend, compute_type=_precision)
|
|
|
def get_filename(file_obj):
    """Return the base name of the file behind *file_obj*.

    Args:
        file_obj: Any object whose ``name`` attribute holds a filesystem
            path (presumably the temp-file wrapper handed over by
            ``gr.File`` -- confirm against the installed Gradio version).

    Returns:
        The final path component, e.g. ``"audio.mp3"`` for
        ``"/tmp/upload/audio.mp3"``. An empty string is returned when the
        path ends with a separator.

    Raises:
        AttributeError: If *file_obj* has no ``name`` attribute.
    """
    # os.path.basename honours the platform's separators; splitting on a
    # hard-coded "/" returned the whole path for Windows-style paths.
    return os.path.basename(file_obj.name)
|
|
|
def audio_to_transcript(file_obj):
    """Transcribe an audio file and export the timed segments as a CSV file.

    Args:
        file_obj: Either a path string, or an object exposing the path via
            a ``name`` attribute (presumably what ``gr.File`` passes in --
            confirm against the installed Gradio version).

    Returns:
        A ``(filename, path_to_csv, df)`` tuple:

        * ``filename`` -- base name of the audio file.
        * ``path_to_csv`` -- ``gr.File`` update pointing at the written CSV
          and making the component visible.
        * ``df`` -- DataFrame with ``start``, ``end`` and ``text`` columns,
          one row per transcribed segment.

    Raises:
        AttributeError: If *file_obj* is neither a string nor an object
            with a ``name`` attribute.
    """
    # Resolve the path explicitly. The previous bare ``except`` used to
    # detect string input also swallowed real transcription failures (and
    # KeyboardInterrupt), then re-ran the transcription and masked the
    # original error.
    if isinstance(file_obj, str):
        audio_path = file_obj
        filename = os.path.basename(audio_path)
    else:
        audio_path = file_obj.name
        filename = get_filename(file_obj)

    segments, _ = model_whisper.transcribe(audio_path, beam_size=5, vad_filter=True)

    # ``segments`` is consumed exactly once here; each item contributes one
    # (start, end, text) row.
    rows = [(segment.start, segment.end, segment.text) for segment in segments]
    df = pd.DataFrame(rows, columns=["start", "end", "text"])

    # splitext strips only the final extension, so "talk.part1.mp3" becomes
    # "talk.part1.csv" rather than colliding with other "talk.*" uploads.
    csv_file = os.path.splitext(filename)[0] + ".csv"
    df.to_csv(csv_file, encoding="utf-8", index=False)
    path_to_csv = gr.File.update(value=csv_file, visible=True)

    return filename, path_to_csv, df
|
|
|
|
|
# Column titles shown in the transcript table.
headers = ["start", "end", "text"]

# Components are created in the same order as before: the upload widget
# first, then the three outputs matching audio_to_transcript's return tuple
# (file name, CSV download, segment table).
_audio_input = gr.File(label="Audio file")
_transcript_outputs = [
    gr.Textbox(label="Audio file name"),
    gr.File(label="Transcript csv file"),
    gr.DataFrame(label="Transcript", headers=headers),
]

iface = gr.Interface(
    fn=audio_to_transcript,
    inputs=_audio_input,
    outputs=_transcript_outputs,
    allow_flagging="never",
    title="Audio to Transcript",
    description="Just paste any audio file and get its corresponding transcript with timeline.",
)

# Start the web UI as soon as the script is executed.
iface.launch()