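# easy-whisper / app.py
# Web file-viewer header captured together with the source (not part of the program):
#   yachimat's picture
#   Update app.py
#   17e13cb verified
#   raw
#   history blame contribute delete
#   No virus
#   2.5 kB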
import os
from io import BytesIO

import gradio as gr
import imageio_ffmpeg
from docx import Document
from openai import OpenAI
from pydub import AudioSegment
# Setting up paths and initial configurations
# Path of the ffmpeg executable as reported by the imageio_ffmpeg package.
ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
# Use that executable as pydub's converter.
AudioSegment.converter = ffmpeg_path
# NOTE(review): the same ffmpeg path is also assigned as `ffprobe`. Whether
# pydub reads this attribute, and whether an ffmpeg binary can stand in for
# ffprobe, is not visible from this file -- confirm.
AudioSegment.ffprobe = ffmpeg_path
# Module-level OpenAI client used by process_audio. It is constructed without
# explicit arguments, so credentials are presumably picked up from the
# environment (e.g. OPENAI_API_KEY) -- verify against the OpenAI SDK docs.
client = OpenAI()
def is_valid_mp3(data):
    """Report whether a binary stream appears to start with MP3 data.

    The stream is accepted when its first bytes are either an ID3v2 tag
    marker (``b"ID3"``) or the first two bytes of an MPEG audio Layer III
    frame header. Any MPEG version (1, 2 or 2.5) is accepted, with or
    without CRC protection, so streams that start with e.g. ``\\xff\\xf3``
    or ``\\xff\\xfa`` are recognised in addition to ``\\xff\\xfb``.

    Args:
        data: Seekable binary file-like object. Up to 10 bytes are read from
            its current position, after which it is rewound to offset 0.

    Returns:
        bool: True if the header looks like MP3, False otherwise (including
        for empty or too-short input).
    """
    header = data.read(10)
    data.seek(0)  # leave the stream ready to be read from the beginning

    if header.startswith(b"ID3"):
        return True
    if len(header) < 2 or header[0] != 0xFF:
        return False

    second = header[1]
    # Frame sync is 11 set bits: all of byte 0 plus the top 3 bits of byte 1.
    has_sync = (second & 0xE0) == 0xE0
    # Bits 4-3: MPEG version id; 0b01 is reserved and never valid.
    version_bits = (second >> 3) & 0b11
    # Bits 2-1: layer description; 0b01 denotes Layer III (i.e. MP3).
    layer_bits = (second >> 1) & 0b11
    return has_sync and version_bits != 0b01 and layer_bits == 0b01
# Per-request size ceiling used to size the chunks: 25 MiB (25 * 1024 * 1024).
_MAX_PART_BYTES = 26_214_400


def process_audio(audio_file_path, output_format=None, progress=None):
    """Transcribe an audio file with the ``whisper-1`` model, chunk by chunk.

    The file is decoded with pydub, cut into consecutive chunks, and each
    chunk is exported as MP3 and sent to the transcription endpoint. The
    chunk length is chosen so that the *decoded* (raw PCM) size of a chunk
    stays within ``_MAX_PART_BYTES``; the exported MP3 is normally much
    smaller, so this is a conservative bound.

    Args:
        audio_file_path: Path to the audio file. A falsy value (``None`` or
            ``""``) yields an explanatory message instead of a transcript.
        output_format: Unused; kept (now optional) so existing positional
            callers keep working.
        progress: Optional callable taking a float in ``(0, 1]``. It is
            invoked once per chunk, after that chunk has been handled.

    Returns:
        str: The chunk transcripts in order, each followed by a newline;
        ``""`` when the audio has no content; or the message
        ``"No audio file provided."`` when no path was given.
    """
    if not audio_file_path:
        # A plain string, matching the success path and the single text output.
        return "No audio file provided."

    # splitext copes with extension-less paths and dots in directory names.
    extension = os.path.splitext(audio_file_path)[1].lstrip(".").lower()
    audio = AudioSegment.from_file(audio_file_path, format=extension or None)

    duration_ms = len(audio)
    raw_size = len(audio.raw_data)
    if duration_ms == 0 or raw_size == 0:
        # Nothing to transcribe; also avoids dividing by zero below.
        return ""

    # Longest chunk (whole milliseconds, at least 1) whose raw size fits.
    chunk_ms = max(1, int(duration_ms * _MAX_PART_BYTES / raw_size))
    chunk_starts = range(0, duration_ms, chunk_ms)
    total_parts = len(chunk_starts)

    transcripts = []
    for part_index, start_ms in enumerate(chunk_starts):
        end_ms = min(start_ms + chunk_ms, duration_ms)

        part_data = BytesIO()
        audio[start_ms:end_ms].export(part_data, format="mp3")
        part_data.seek(0)

        # Chunks whose export does not look like MP3 are skipped, as before.
        if is_valid_mp3(part_data):
            # The in-memory buffer is given a file name ending in .mp3
            # before it is passed as the upload.
            part_data.name = f"part_{part_index}.mp3"
            transcript_part = client.audio.transcriptions.create(
                model="whisper-1",
                file=part_data,
            )
            transcripts.append(transcript_part.text)

        if progress:
            progress((part_index + 1) / total_parts)

    return "".join(f"{text}\n" for text in transcripts)
# Gradio UI: a single audio input wired to process_audio, with one text output.
# The user-facing strings below are Japanese; English glosses are given in the
# comments next to them.
iface = gr.Interface(
fn=process_audio,
# The audio component is configured with type="filepath".
# Label: "Upload an audio file".
# NOTE(review): process_audio also declares `output_format` and `progress`
# parameters that no input component feeds -- confirm how Gradio fills them.
inputs=gr.Audio(type="filepath", label="音声ファイルをアップロード"),
outputs=[
# Label: "Output". A copy button is shown next to the text.
gr.Textbox(label="出力", show_copy_button=True),
# Disabled second output, a file component labelled ".txt download":
# gr.File(label=".txtダウンロード", type='binary')
],
# Title: "Convert an audio file to text".
title="音声ファイルをテキストに変換",
# Description: "As a rough guide, 10 minutes of audio takes about 1 minute."
description="目安として10分の音声は1分程度かかります。",
# Flagging is set to "never".
allow_flagging="never"
)
# Start the app at import/run time (there is no __main__ guard).
iface.launch()