"""Gradio app that transcribes an uploaded audio file with OpenAI Whisper."""

import logging
import math
from io import BytesIO
from pathlib import Path

import gradio as gr
import imageio_ffmpeg
from docx import Document
from openai import OpenAI
from pydub import AudioSegment

logger = logging.getLogger(__name__)

# Setting up paths and initial configurations
ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
AudioSegment.converter = ffmpeg_path
AudioSegment.ffprobe = ffmpeg_path
client = OpenAI()

# Size budget (25 * 1024 * 1024 bytes) used to decide how long each chunk
# sent for transcription may be.
# NOTE(review): presumably mirrors the transcription API's upload limit --
# confirm against the current API documentation.
MAX_PART_BYTES = 26_214_400


def is_valid_mp3(data):
    """Return True if the stream *data* starts like an MP3 file.

    Reads the first 10 bytes and rewinds the stream to position 0, so the
    caller can pass the same object on afterwards.

    A stream is accepted when it begins with an ``ID3`` tag or with an MPEG
    audio frame sync (eleven set bits: ``0xFF`` followed by a byte whose top
    three bits are set). The frame-sync test is a superset of the previous
    ``b'\\xff\\xfb'`` comparison, so other frame header variants are no
    longer rejected.
    """
    header = data.read(10)
    data.seek(0)
    if header.startswith(b"ID3"):
        return True
    return len(header) >= 2 and header[0] == 0xFF and (header[1] & 0xE0) == 0xE0


def _load_audio(audio_file_path):
    """Decode the file at *audio_file_path* into an ``AudioSegment``."""
    # Take the extension from the final path component only, lower-cased.
    # When there is none, pass ``None`` so the decoder detects the format
    # instead of being handed the whole path as a format name.
    file_type = Path(audio_file_path).suffix.lstrip(".").lower() or None
    audio = AudioSegment.from_file(audio_file_path, format=file_type)
    if file_type != "mp3":
        # NOTE(review): the segment is already decoded at this point; this
        # MP3 round trip is kept from the original implementation -- confirm
        # whether it is still needed.
        audio_data = BytesIO()
        audio.export(audio_data, format="mp3")
        audio_data.seek(0)
        audio = AudioSegment.from_file(audio_data, format="mp3")
    return audio


def _split_audio(audio, max_bytes, progress=None):
    """Slice *audio* into consecutive parts of at most *max_bytes* raw bytes."""
    duration_ms = len(audio)
    raw_size = len(audio.raw_data)
    if not duration_ms or not raw_size:
        # Nothing to slice; also avoids dividing by zero below.
        return []

    # The chunk length is derived from the *decoded* byte rate
    # (raw bytes per millisecond), not from the size of the exported MP3.
    max_duration_per_part = duration_ms * max_bytes / raw_size
    total_parts = math.ceil(raw_size / max_bytes)

    parts = []
    start_ms = 0
    while start_ms < duration_ms:
        end_ms = min(start_ms + max_duration_per_part, duration_ms)
        parts.append(audio[start_ms:end_ms])
        start_ms = end_ms
        if progress is not None:
            # Clamp: float accumulation may yield one more slice than the
            # integer estimate.
            progress(min(len(parts) / total_parts, 1.0))
    return parts


def process_audio(audio_file_path, output_format=None, progress=None):
    """Transcribe the audio file at *audio_file_path* and return the text.

    The audio is decoded, cut into consecutive chunks sized against
    ``MAX_PART_BYTES``, and each chunk is exported as MP3 and sent to the
    ``whisper-1`` transcription model. The chunk transcripts are returned in
    order, each followed by a newline.

    Args:
        audio_file_path: Path of the uploaded audio file. A falsy value
            yields the message ``"No audio file provided."``.
        output_format: Unused. Optional so that the single-input interface
            defined below can call this function.
        progress: Optional callable invoked with a fraction in ``(0, 1]``
            after each chunk is cut.

    Returns:
        The transcript as a single string. Always a plain string, matching
        the single Textbox output of the interface.
    """
    if not audio_file_path:
        return "No audio file provided."

    audio = _load_audio(audio_file_path)
    parts = _split_audio(audio, MAX_PART_BYTES, progress)

    transcripts = []
    for part_index, part in enumerate(parts):
        part_data = BytesIO()
        part.export(part_data, format="mp3")
        part_data.seek(0)
        if not is_valid_mp3(part_data):
            # Previously dropped silently; leave a trace so gaps in the
            # transcript can be explained.
            logger.warning(
                "Skipping part %d: exported data does not look like MP3.",
                part_index,
            )
            continue
        # Give the in-memory buffer a file name for the upload
        # (presumably used by the client to infer the file type).
        part_data.name = f"part_{part_index}.mp3"
        transcript_part = client.audio.transcriptions.create(
            model="whisper-1",
            file=part_data,
        )
        transcripts.append(transcript_part.text)

    return "".join(f"{text}\n" for text in transcripts)


iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="音声ファイルをアップロード"),
    outputs=[
        gr.Textbox(label="出力", show_copy_button=True),
    ],
    title="音声ファイルをテキストに変換",
    description="目安として10分の音声は1分程度かかります。",
    allow_flagging="never",
)
iface.launch()