import gradio as gr
import openai
import yt_dlp
import os
import glob
import tempfile
from pydub import AudioSegment
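# NOTE: pydub requires ffmpeg/ffprobe on PATH to decode the downloaded audio formats.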

def split_audio(file_path, chunk_length_ms):
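    """Split an audio file into consecutive chunks of at most chunk_length_ms milliseconds."""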
    audio = AudioSegment.from_file(file_path)
    duration = len(audio)
    chunks = []
    start_time = 0
    while start_time < duration:
        end_time = start_time + chunk_length_ms
        if end_time > duration:
            end_time = duration
        chunk = audio[start_time:end_time]
        chunks.append(chunk)
        start_time += chunk_length_ms
    return chunks

def split_string_by_tokens(text, max_tokens=500):
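    """Split text into chunks of at most max_tokens whitespace-separated words.

    Note: this counts words, not model tokens, so it is only a rough proxy.
    """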
    words = text.split()
    chunks = []
    current_chunk = []

    for word in words:
        current_chunk.append(word)
        if len(current_chunk) >= max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = []

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Read the API key from the environment; raises KeyError if OPENAI_API_KEY is not set
openai.api_key = os.environ['OPENAI_API_KEY']

def asr(url):
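    """Download audio from url, transcribe it with Whisper, and translate it into Chinese.

    Implemented as a generator so Gradio can stream progress: every yield is a
    (Chinese text, English text) pair for the two output textboxes.
    """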
    # Remove any leftover audio files from a previous run
    for leftover in glob.glob("audio_downloaded.*"):
        os.remove(leftover)
    # Download the best available audio stream with yt-dlp
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'audio_downloaded.%(ext)s',
        'no_continue': True,
    }

    # Create a yt-dlp downloader object
    ydl = yt_dlp.YoutubeDL(ydl_opts)

    # Download the audio; extract_info raises DownloadError on a bad or unsupported URL
    try:
        info_dict = ydl.extract_info(url, download=True)
    except yt_dlp.utils.DownloadError:
        info_dict = None

    if info_dict is not None:
        audio_file_name = "audio_downloaded.{}".format(info_dict["ext"])
    else:
        # A `return value` inside a generator never reaches Gradio, so yield the
        # error message for both output boxes and then stop.
        yield "下载音频发生错误,请确认链接再试一次。", "Error downloading the audio. Check the URL and try again."
        return
    
    yield "下载视频完成. 开始分割视频...", ""
    chunks = split_audio(audio_file_name, chunk_length_ms=30 * 1000)
    transcripts = []

    # Transcribe each chunk with the Whisper API, streaming partial results back to the UI
    for idx, chunk in enumerate(chunks):
        temp_file_path = None
        with tempfile.NamedTemporaryFile(mode="wb", suffix=".wav", delete=False) as temp_file:
            temp_file_path = temp_file.name
            chunk.export(temp_file.name, format="wav")

        with open(temp_file_path, "rb") as temp_file:
            transcript = openai.Audio.transcribe("whisper-1", temp_file)

        os.remove(temp_file_path)
        transcripts.append(transcript["text"])
        
        yield "请耐心等待语音识别完成...({}/{})".format(idx + 1, len(chunks)), " ".join(transcripts)
    
    # Delete the downloaded audio file now that transcription is finished
    os.remove(audio_file_name)

    translations = []
    full_transcript = " ".join(transcripts)
    # Split the transcript into ~500-word chunks (a rough token proxy) so each
    # translation request stays within the model's context window.
    transcript_chunks = split_string_by_tokens(full_transcript, max_tokens=500)
    yield "语音识别完成, 开始翻译...(0/{})".format(len(transcript_chunks)), full_transcript  # "Transcription done, starting translation..."
    # Translate each chunk into Chinese with ChatGPT
    for idx, transcript in enumerate(transcript_chunks):
        output = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": "Transcript: {transcript}. \n Translate the video conversation transcript into fluent Chinese. Chinese: ".format(transcript=transcript)},
            ],
        )
        translation = output['choices'][0]['message']['content']
        translations.append(translation)
        
        yield "请耐心等候翻译:({}/{})...".format(idx+1, len(transcript_chunks)) + " ".join(translations), " ".join(transcripts)

    full_translation = " ".join(translations)
    yield full_translation, full_transcript

title = """
轻声细译"""
# HTML description shown below the title
instruction = """
<div style="border: 2px solid #000; padding: 10px; border-radius: 5px;">
一键输入视频链接,轻松实现中文翻译,畅享视频无障碍沟通 <span style="color: grey;">-- powered by OpenAI Whisper & ChatGPT.</span><br>

1.将视频链接(支持Twitter、YouTube)复制粘贴至输入框,点击提交(Submit)即可;
</div>"""
# Text input component for the video URL ("Paste the video link")
text_input = gr.inputs.Textbox(label="粘贴视频链接")

demo = gr.Interface(fn=asr,
                    inputs=text_input,
                    outputs=[
                        gr.outputs.Textbox(label="中文"),  # Chinese translation
                        gr.outputs.Textbox(label="英文")   # English transcript
                    ],
                    title=title,
                    description=instruction,
                    theme='huggingface')
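# queue() is required so the generator's intermediate yields stream to the UI; launch() starts the app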
demo.queue()
demo.launch()