# NOTE: Hugging Face file-viewer chrome ("litagin's picture / init / bef66de /
# raw / history blame / 4.64 kB") was accidentally captured at the top of this
# file; kept here as a comment so the module remains valid Python.
import os
import time
import gradio as gr
from pydub import AudioSegment
from transformers import pipeline
# True when running on Hugging Face Spaces (the platform sets SYSTEM=spaces).
is_hf = os.getenv("SYSTEM") == "spaces"
# Decoding options shared by every model: greedy search (no sampling, single
# beam) restricted to Japanese, with 3-gram repetition blocked.
generate_kwargs = {
"language": "Japanese",
"do_sample": False,
"num_beams": 1,
"no_repeat_ngram_size": 3,
}
# Display name -> model identifier. On Spaces the WIP model is pulled from the
# Hub; locally it is loaded from a relative fine-tuning checkout instead.
model_dict = {
"whisper-large-v2": "openai/whisper-large-v2",
"whisper-large-v3": "openai/whisper-large-v3",
"whisper-large-v3-turbo": "openai/whisper-large-v3-turbo",
"kotoba-whisper-v1.0": "kotoba-tech/kotoba-whisper-v1.0",
"kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
"galgame-whisper-wip": (
"litagin/galgame-whisper-wip"
if is_hf
else "../whisper_finetune/galgame-whisper"
),
}
# Download models
# Instantiating each pipeline once at startup forces the model weights into
# the local HF cache, so per-request loads in transcribe_common() hit the
# cache instead of the network. The pipeline objects themselves are discarded.
for model in model_dict.values():
pipeline("automatic-speech-recognition", model=model)
def transcribe_common(audio: str, model: str) -> tuple[str, float]:
    """Transcribe an audio file with the given ASR model.

    Args:
        audio: Path to the audio file (Gradio supplies a filepath).
        model: Hugging Face model id or local path for the Whisper variant.

    Returns:
        A ``(text, seconds)`` tuple. On over-long input the first element is
        an error message and the second is ``0.0``.
    """
    # Reject clips longer than 15 s up front to bound CPU inference time.
    duration = AudioSegment.from_file(audio).duration_seconds
    if duration > 15:
        return "Audio too long, limit is 15 seconds", 0.0
    start_time = time.time()
    # Weights are already in the local cache (pre-downloaded at startup), so
    # this mostly measures model load + inference, not network transfer.
    pipe = pipeline("automatic-speech-recognition", model=model)
    text = pipe(audio, generate_kwargs=generate_kwargs)["text"]
    # BUG FIX: the timer previously stopped before pipe(audio) ran, so the
    # reported time covered only pipeline construction, not transcription.
    end_time = time.time()
    return text, end_time - start_time
def transcribe_large_v2(audio) -> tuple[str, float]:
    """Transcribe *audio* with OpenAI Whisper large-v2."""
    model_id = model_dict["whisper-large-v2"]
    return transcribe_common(audio, model_id)
def transcribe_large_v3(audio) -> tuple[str, float]:
    """Transcribe *audio* with OpenAI Whisper large-v3."""
    model_id = model_dict["whisper-large-v3"]
    return transcribe_common(audio, model_id)
def transcribe_large_v3_turbo(audio) -> tuple[str, float]:
    """Transcribe *audio* with OpenAI Whisper large-v3-turbo."""
    model_id = model_dict["whisper-large-v3-turbo"]
    return transcribe_common(audio, model_id)
def transcribe_kotoba_v1(audio) -> tuple[str, float]:
    """Transcribe *audio* with Kotoba-Whisper v1.0."""
    model_id = model_dict["kotoba-whisper-v1.0"]
    return transcribe_common(audio, model_id)
def transcribe_kotoba_v2(audio) -> tuple[str, float]:
    """Transcribe *audio* with Kotoba-Whisper v2.0."""
    model_id = model_dict["kotoba-whisper-v2.0"]
    return transcribe_common(audio, model_id)
def transcribe_galgame_whisper(audio) -> tuple[str, float]:
    """Transcribe *audio* with the work-in-progress Galgame-Whisper model."""
    model_id = model_dict["galgame-whisper-wip"]
    return transcribe_common(audio, model_id)
# Intro blurb (Japanese) for the demo page: Japanese-only, compares against
# several baseline Whisper models, ~0.1 epoch trained, CPU inference,
# 15-second audio limit. Kept verbatim — it is user-facing markdown.
initial_md = """
# Galgame-Whisper (WIP) Demo
- 日本語のみ対応
- 他の書き起こしとついでに比較できるようにいろいろ入れた
- 現在0.1エポックくらい
- 速度はCPUです
- 音声は15秒まで
"""
with gr.Blocks() as app:
    # BUG FIX: this slot previously showed a stray "### Kotoba-Whisper-V1.0"
    # heading (duplicated below in its own column) while the intro text
    # `initial_md` was never rendered at all — show the intro here instead.
    gr.Markdown(initial_md)
    # One shared audio input; every button transcribes it with its own model.
    audio = gr.Audio(type="filepath")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Whisper-Large-V2")
            button_v2 = gr.Button("Transcribe with Whisper-Large-V2")
            output_v2 = gr.Textbox()
            time_v2 = gr.Textbox("Time taken")
        with gr.Column():
            gr.Markdown("### Whisper-Large-V3")
            button_v3 = gr.Button("Transcribe with Whisper-Large-V3")
            output_v3 = gr.Textbox()
            time_v3 = gr.Textbox("Time taken")
        with gr.Column():
            gr.Markdown("### Whisper-Large-V3-Turbo")
            button_v3_turbo = gr.Button("Transcribe with Whisper-Large-V3-Turbo")
            output_v3_turbo = gr.Textbox()
            # BUG FIX: was a bare gr.Textbox(); every sibling column shows the
            # "Time taken" placeholder value, so match it for consistency.
            time_v3_turbo = gr.Textbox("Time taken")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Kotoba-Whisper-V1.0")
            button_kotoba_v1 = gr.Button("Transcribe with Kotoba-Whisper-V1.0")
            output_kotoba_v1 = gr.Textbox()
            time_kotoba_v1 = gr.Textbox("Time taken")
        with gr.Column():
            gr.Markdown("### Kotoba-Whisper-V2.0")
            button_kotoba_v2 = gr.Button("Transcribe with Kotoba-Whisper-V2.0")
            output_kotoba_v2 = gr.Textbox()
            time_kotoba_v2 = gr.Textbox("Time taken")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Galgame-Whisper (WIP)")
            button_galgame = gr.Button("Transcribe with Galgame-Whisper (WIP)")
            output_galgame = gr.Textbox()
            time_galgame = gr.Textbox("Time taken")
    # Wire each button to its model-specific wrapper; every handler returns
    # (transcription text, elapsed seconds) into the column's two textboxes.
    button_v2.click(transcribe_large_v2, inputs=audio, outputs=[output_v2, time_v2])
    button_v3.click(transcribe_large_v3, inputs=audio, outputs=[output_v3, time_v3])
    button_v3_turbo.click(
        transcribe_large_v3_turbo,
        inputs=audio,
        outputs=[output_v3_turbo, time_v3_turbo],
    )
    button_kotoba_v1.click(
        transcribe_kotoba_v1, inputs=audio, outputs=[output_kotoba_v1, time_kotoba_v1]
    )
    button_kotoba_v2.click(
        transcribe_kotoba_v2, inputs=audio, outputs=[output_kotoba_v2, time_kotoba_v2]
    )
    button_galgame.click(
        transcribe_galgame_whisper,
        inputs=audio,
        outputs=[output_galgame, time_galgame],
    )
app.launch(inbrowser=True)