# Spaces: Running on Zero
import os
import time

import gradio as gr
from pydub import AudioSegment
from transformers import pipeline
# True when the SYSTEM environment variable equals "spaces"
# (presumably set by Hugging Face Spaces -- verify against the deployment).
is_hf = os.environ.get("SYSTEM") == "spaces"

# Generation options forwarded to the ASR pipeline on every transcription.
generate_kwargs = dict(
    language="Japanese",
    do_sample=False,
    num_beams=1,
    no_repeat_ngram_size=3,
)

# Display name -> model identifier (or local path) given to `pipeline`.
model_dict = {
    "whisper-large-v2": "openai/whisper-large-v2",
    "whisper-large-v3": "openai/whisper-large-v3",
    "whisper-large-v3-turbo": "openai/whisper-large-v3-turbo",
    "kotoba-whisper-v1.0": "kotoba-tech/kotoba-whisper-v1.0",
    "kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
}
# The work-in-progress model comes from the hub when `is_hf` is set and from
# a local fine-tuning directory otherwise.
if is_hf:
    model_dict["galgame-whisper-wip"] = "litagin/galgame-whisper-wip"
else:
    model_dict["galgame-whisper-wip"] = "../whisper_finetune/galgame-whisper"
# Download models: build each pipeline once at import time and discard it,
# in the same order the models are registered in `model_dict`.
for model_id in model_dict.values():
    pipeline(task="automatic-speech-recognition", model=model_id)
def transcribe_common(audio: str, model: str) -> tuple[str, float]:
    """Transcribe a short audio file with one ASR model and time the inference.

    Args:
        audio: Path to the audio file. May be ``None`` or empty when no audio
            has been provided, in which case a message is returned instead of
            raising.
        model: Model identifier or local path handed to ``pipeline``.

    Returns:
        A ``(text, seconds)`` tuple. ``text`` is the transcription, or a
        human-readable message when the input is rejected. ``seconds`` is the
        wall-clock time spent in the transcription call (0 when rejected).
    """
    # Without this guard, a missing recording/upload reaches
    # `AudioSegment.from_file` as None and raises.
    if not audio:
        return "No audio provided", 0

    # Reject clips longer than the 15-second limit before loading any model.
    duration = AudioSegment.from_file(audio).duration_seconds
    if duration > 15:
        return "Audio too long, limit is 15 seconds", 0

    # Build the pipeline before starting the clock so the reported time
    # covers transcription only, not model loading.
    pipe = pipeline("automatic-speech-recognition", model=model)

    start_time = time.perf_counter()
    text = pipe(audio, generate_kwargs=generate_kwargs)["text"]
    end_time = time.perf_counter()
    return text, end_time - start_time
def transcribe_large_v2(audio) -> tuple[str, float]:
    """Run `transcribe_common` with the model registered as "whisper-large-v2"."""
    model_id = model_dict["whisper-large-v2"]
    return transcribe_common(audio, model_id)
def transcribe_large_v3(audio) -> tuple[str, float]:
    """Run `transcribe_common` with the model registered as "whisper-large-v3"."""
    model_id = model_dict["whisper-large-v3"]
    return transcribe_common(audio, model_id)
def transcribe_large_v3_turbo(audio) -> tuple[str, float]:
    """Run `transcribe_common` with the model registered as "whisper-large-v3-turbo"."""
    model_id = model_dict["whisper-large-v3-turbo"]
    return transcribe_common(audio, model_id)
def transcribe_kotoba_v1(audio) -> tuple[str, float]:
    """Run `transcribe_common` with the model registered as "kotoba-whisper-v1.0"."""
    model_id = model_dict["kotoba-whisper-v1.0"]
    return transcribe_common(audio, model_id)
def transcribe_kotoba_v2(audio) -> tuple[str, float]:
    """Run `transcribe_common` with the model registered as "kotoba-whisper-v2.0"."""
    model_id = model_dict["kotoba-whisper-v2.0"]
    return transcribe_common(audio, model_id)
def transcribe_galgame_whisper(audio) -> tuple[str, float]:
    """Run `transcribe_common` with the model registered as "galgame-whisper-wip"."""
    model_id = model_dict["galgame-whisper-wip"]
    return transcribe_common(audio, model_id)
# Introductory Markdown rendered at the top of the page. In English:
#   - Japanese only
#   - Several other models are included so transcriptions can be compared
#   - Currently at roughly 0.1 epoch
#   - Speeds are measured on CPU
#   - Audio is limited to 15 seconds
initial_md = """
# Galgame-Whisper (WIP) Demo
- 日本語のみ対応
- 他の書き起こしとついでに比較できるようにいろいろ入れた
- 現在0.1エポックくらい
- 速度はCPUです
- 音声は15秒まで
"""

# One shared audio input; each model gets its own column with a button,
# a transcription box and a time box.
with gr.Blocks() as app:
    # Render the intro text (previously defined but never displayed; a stray
    # duplicate "Kotoba-Whisper-V1.0" heading was shown here instead).
    gr.Markdown(initial_md)
    audio = gr.Audio(type="filepath")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Whisper-Large-V2")
            button_v2 = gr.Button("Transcribe with Whisper-Large-V2")
            output_v2 = gr.Textbox()
            time_v2 = gr.Textbox("Time taken")
        with gr.Column():
            gr.Markdown("### Whisper-Large-V3")
            button_v3 = gr.Button("Transcribe with Whisper-Large-V3")
            output_v3 = gr.Textbox()
            time_v3 = gr.Textbox("Time taken")
        with gr.Column():
            gr.Markdown("### Whisper-Large-V3-Turbo")
            button_v3_turbo = gr.Button("Transcribe with Whisper-Large-V3-Turbo")
            output_v3_turbo = gr.Textbox()
            # Same initial value as every other time box.
            time_v3_turbo = gr.Textbox("Time taken")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Kotoba-Whisper-V1.0")
            button_kotoba_v1 = gr.Button("Transcribe with Kotoba-Whisper-V1.0")
            output_kotoba_v1 = gr.Textbox()
            time_kotoba_v1 = gr.Textbox("Time taken")
        with gr.Column():
            gr.Markdown("### Kotoba-Whisper-V2.0")
            button_kotoba_v2 = gr.Button("Transcribe with Kotoba-Whisper-V2.0")
            output_kotoba_v2 = gr.Textbox()
            time_kotoba_v2 = gr.Textbox("Time taken")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Galgame-Whisper (WIP)")
            button_galgame = gr.Button("Transcribe with Galgame-Whisper (WIP)")
            output_galgame = gr.Textbox()
            time_galgame = gr.Textbox("Time taken")

    # Wire each button to its transcription function; every handler returns
    # (text, seconds), mapped onto the matching output and time boxes.
    button_v2.click(transcribe_large_v2, inputs=audio, outputs=[output_v2, time_v2])
    button_v3.click(transcribe_large_v3, inputs=audio, outputs=[output_v3, time_v3])
    button_v3_turbo.click(
        transcribe_large_v3_turbo,
        inputs=audio,
        outputs=[output_v3_turbo, time_v3_turbo],
    )
    button_kotoba_v1.click(
        transcribe_kotoba_v1, inputs=audio, outputs=[output_kotoba_v1, time_kotoba_v1]
    )
    button_kotoba_v2.click(
        transcribe_kotoba_v2, inputs=audio, outputs=[output_kotoba_v2, time_kotoba_v2]
    )
    button_galgame.click(
        transcribe_galgame_whisper,
        inputs=audio,
        outputs=[output_galgame, time_galgame],
    )

app.launch(inbrowser=True)