Spaces:
Sleeping
Sleeping
File size: 2,495 Bytes
2cbdc24 caab576 2cbdc24 2584122 caab576 2cbdc24 caab576 2cbdc24 7680995 caab576 7680995 f3aa15e 15502cd caab576 15502cd 3c8cfdd 2cbdc24 4956c3c caab576 3c8cfdd 2cbdc24 4956c3c b060537 4956c3c b060537 f3aa15e caab576 18c82d0 4956c3c caab576 4956c3c caab576 4956c3c cdaa362 18c82d0 2cbdc24 ea30c4b b74e8c1 cdaa362 06f3209 2cbdc24 17e13cb b060537 2cbdc24 b060537 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
import gradio as gr
from pydub import AudioSegment
from io import BytesIO
import imageio_ffmpeg
from docx import Document
from openai import OpenAI
# Module-level configuration: point pydub at the ffmpeg executable located by
# imageio_ffmpeg, and create the OpenAI client used for transcription.
ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
# NOTE(review): ffprobe is set to the *ffmpeg* executable as well - confirm
# that this is intended.
AudioSegment.converter = AudioSegment.ffprobe = ffmpeg_path
# No explicit arguments are passed; credentials presumably come from the
# environment - verify against the deployment.
client = OpenAI()
def is_valid_mp3(data):
    """Return True when the stream appears to start with MP3 data.

    The first bytes are accepted when they are either an ID3v2 tag (``ID3``)
    or an MPEG audio Layer III frame header. Any MPEG version (1, 2 or 2.5)
    and either setting of the CRC-protection bit is accepted, not only the
    single ``FF FB`` byte pair.

    Args:
        data: Binary file-like object supporting ``read`` and ``seek``.

    Returns:
        bool: Whether the bytes read look like the beginning of an MP3 file.

    Side effects:
        Reads up to 10 bytes from the current position and then rewinds the
        stream to offset 0.
    """
    headers = data.read(10)
    data.seek(0)

    if headers.startswith(b'ID3'):
        return True
    if len(headers) < 2:
        # Too short to contain a frame header (covers the empty stream).
        return False

    first, second = headers[0], headers[1]
    # Frame sync: eleven set bits (0xFF followed by the top three bits).
    has_sync = first == 0xFF and (second & 0xE0) == 0xE0
    # Version bits 0b01 are reserved and therefore never valid.
    version_ok = (second & 0x18) != 0x08
    # Layer bits 0b01 denote Layer III, i.e. MP3.
    is_layer3 = (second & 0x06) == 0x02
    return has_sync and version_ok and is_layer3
def process_audio(audio_file_path, output_format=None, progress=None):
    """Transcribe an audio file with the OpenAI ``whisper-1`` model.

    The file is decoded with pydub, cut into consecutive parts, and each part
    is exported as MP3 and sent to the transcription endpoint. The texts of
    the parts are concatenated, each followed by a newline.

    Args:
        audio_file_path: Path of the audio file. A falsy value means that no
            file was provided.
        output_format: Not used by this function. It defaults to ``None`` so
            the function can be called with the audio path alone.
        progress: Optional callable. It is called with a fraction in
            ``(0, 1]`` after each part has been cut from the audio.

    Returns:
        str: The transcript, ``""`` when the audio contains nothing to
        transcribe, or ``"No audio file provided."`` when no path was given.
    """
    if not audio_file_path:
        # Return a plain string like every other path does; the early exit
        # used to return a (message, None) tuple.
        return "No audio file provided."

    # Imported locally so that the block does not rely on a file-level import.
    import os

    # Use the real extension (lower-cased). A path without one yields None,
    # which lets pydub/ffmpeg detect the container on its own.
    extension = os.path.splitext(audio_file_path)[1]
    file_type = extension[1:].lower() or None

    audio = AudioSegment.from_file(audio_file_path, format=file_type)
    if file_type != "mp3":
        # Non-MP3 input is re-encoded to MP3 in memory and decoded again
        # before it is split.
        audio_data = BytesIO()
        audio.export(audio_data, format="mp3")
        audio_data.seek(0)
        audio = AudioSegment.from_file(audio_data, format="mp3")

    duration_ms = len(audio)
    raw_size = len(audio.raw_data)
    if duration_ms == 0 or raw_size == 0:
        # Nothing to transcribe; this also avoids a division by zero below.
        return ""

    # 25 MiB - presumably the upload limit of the transcription API (confirm).
    max_size_bytes = 26_214_400
    # The part length is derived from the size of the decoded samples, so the
    # MP3 export of each part is not expected to exceed the limit.
    max_duration_per_part = duration_ms * max_size_bytes / raw_size

    # Work out the boundaries first so the number of parts is known exactly
    # and the reported progress ends at 1.0.
    boundaries = []
    start_ms = 0
    while start_ms < duration_ms:
        end_ms = min(start_ms + max_duration_per_part, duration_ms)
        boundaries.append((start_ms, end_ms))
        start_ms = end_ms

    parts = []
    for done, (part_start, part_end) in enumerate(boundaries, start=1):
        parts.append(audio[part_start:part_end])
        if progress:
            progress(done / len(boundaries))

    texts = []
    for part_index, part in enumerate(parts):
        part_data = BytesIO()
        part.export(part_data, format="mp3")
        part_data.seek(0)
        if not is_valid_mp3(part_data):
            # Best effort, as before: a part whose export does not look like
            # MP3 is left out of the transcript.
            continue
        # Give the in-memory file a name with an .mp3 extension (presumably
        # used by the API to recognise the format).
        part_data.name = f"part_{part_index}.mp3"
        transcript_part = client.audio.transcriptions.create(
            model="whisper-1",
            file=part_data,
        )
        texts.append(transcript_part.text)

    return "".join(f"{text}\n" for text in texts)
# Input component: the uploaded audio is handed to the callback as a file path.
audio_input = gr.Audio(type="filepath", label="音声ファイルをアップロード")
# Output component: a text box with a copy button.
transcript_box = gr.Textbox(label="出力", show_copy_button=True)

# Gradio interface wiring the audio input to process_audio; flagging is off.
iface = gr.Interface(
    fn=process_audio,
    inputs=audio_input,
    outputs=[transcript_box],
    title="音声ファイルをテキストに変換",
    description="目安として10分の音声は1分程度かかります。",
    allow_flagging="never",
)
# Launch the app when the module is executed.
iface.launch()
|