import gradio as gr
import openai
import yt_dlp
import os
import tempfile
from pydub import AudioSegment
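# Note: pydub decodes the downloaded audio via ffmpeg, so ffmpeg must be available on the
# PATH, and OPENAI_API_KEY must be set in the environment before launching the app.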

def split_audio(file_path, chunk_length_ms):
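    """Split an audio file into consecutive chunks of at most chunk_length_ms milliseconds."""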
    audio = AudioSegment.from_file(file_path)
    duration = len(audio)
    chunks = []
    start_time = 0
    while start_time < duration:
        end_time = start_time + chunk_length_ms
        if end_time > duration:
            end_time = duration
        chunk = audio[start_time:end_time]
        chunks.append(chunk)
        start_time += chunk_length_ms
    return chunks

def split_string_by_tokens(text, max_tokens=500):
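    """Split text into chunks of at most max_tokens whitespace-separated words.

    Word count is used as a rough stand-in for tokens; no real tokenizer is involved.
    """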
    words = text.split()
    chunks = []
    current_chunk = []

    for word in words:
        current_chunk.append(word)
        if len(current_chunk) >= max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = []

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

openai.api_key = os.environ['OPENAI_API_KEY']

def asr(url):
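    """Download the audio of a video URL, transcribe it with Whisper, then translate it with ChatGPT.

    This is a generator: it yields (status or translation text, transcript text) pairs,
    which Gradio streams into the two output textboxes.
    """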
    # remove any leftover audio files from a previous run
    os.system("rm *audio_download*")
    # yt-dlp options: download the best available audio stream
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'audio_downloaded.%(ext)s',
        'no_continue': True,
    }

    # Create a yt-dlp downloader
    ydl = yt_dlp.YoutubeDL(ydl_opts)

    # Download the audio and collect its metadata
    info_dict = ydl.extract_info(url, download=True)
    if info_dict is not None:
        audio_file_name = "audio_downloaded.{}".format(info_dict["ext"])
    else:
        # asr is a generator, so the error must be yielded; a plain "return <value>" would never reach Gradio
        yield "下载音频发生错误,请确认链接再试一次。", "Error downloading the audio. Check the URL and try again."
        return
    
    yield "下载视频完成. 开始分割视频...", ""
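    # 30-second chunks keep each Whisper request short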
    chunks = split_audio(audio_file_name, chunk_length_ms=30 * 1000)
    transcripts = []

    for idx, chunk in enumerate(chunks):
        # export the chunk to a temporary wav file, then send it to Whisper
        temp_file_path = None
        with tempfile.NamedTemporaryFile(mode="wb", suffix=".wav", delete=False) as temp_file:
            temp_file_path = temp_file.name
            chunk.export(temp_file.name, format="wav")

        with open(temp_file_path, "rb") as temp_file:
            transcript = openai.Audio.transcribe("whisper-1", temp_file)

        os.remove(temp_file_path)
        transcripts.append(transcript["text"])

        # stream progress plus the transcript accumulated so far
        yield "请耐心等待语音识别完成...({}/{})".format(idx + 1, len(chunks)), " ".join(transcripts)
    
    # remove the downloaded audio file
    os.system("rm {}".format(audio_file_name))

    translations = []
    full_transcript = " ".join(transcripts)
    # split the transcript into chunks of roughly 500 words so each translation request stays small
    transcript_chunks = split_string_by_tokens(full_transcript, max_tokens=500)
    yield "语音识别完成, 开始翻译...(0/{})".format(len(transcript_chunks)), full_transcript
    # translate each chunk with ChatGPT, streaming partial results back to the UI
    for idx, transcript in enumerate(transcript_chunks):
        output = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user",
                 "content": "Transcript: {transcript}. \n Translate the video conversation transcript into fluent Chinese. Chinese: ".format(transcript=transcript)},
            ],
            stream=True,
        )
        for event in output:
            # streamed deltas may omit "content", so fall back to an empty string
            translations.append(event["choices"][0].get("delta", {}).get("content", ""))
            yield "请耐心等候翻译:({}/{})...".format(idx + 1, len(transcript_chunks)) + "".join(translations), " ".join(transcripts)

    full_translation = "".join(translations)
    yield full_translation, full_transcript

title = """
轻声细译"""
# Description HTML shown under the title: "Video translation: paste a video link (Twitter and YouTube supported) and click Submit to get a Chinese translation -- powered by OpenAI Whisper & ChatGPT."
instruction = """
<div style="border: 2px solid #000; padding: 10px; border-radius: 5px;">
视频翻译 (video-translation):输入视频链接,进行中文翻译 <span style="color: grey;">-- powered by OpenAI Whisper & ChatGPT.</span>.<br>

1.将视频链接(支持Twitter、YouTube)复制粘贴至输入框,点击提交(Submit)即可;
</div>"""

# The two outputs line up with the (status/translation, transcript) pairs yielded by asr.
# Labels: "粘贴视频链接" = paste video link, "翻译" = translation, "音频转录" = audio transcript.
demo = gr.Interface(fn=asr,
                    inputs=gr.Textbox(label="粘贴视频链接"),
                    outputs=[
                        gr.Textbox(label="翻译"),
                        gr.Textbox(label="音频转录")
                    ],
                    title=title,
                    description=instruction,
                    theme="JohnSmith9982/small_and_pretty")

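# queue() is required for generator functions like asr so that intermediate yields stream to the UI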
demo.queue()
demo.launch()