File size: 3,161 Bytes
c494b72
efc0dfb
6ddb846
8647b8a
 
d7d5bfb
00cf38b
b0b323a
8647b8a
 
 
 
 
 
 
 
 
 
b0b323a
 
 
 
 
 
 
d7d5bfb
 
5d955d6
 
0987717
5d955d6
 
6ddb846
 
 
 
 
 
 
 
 
 
 
 
 
5d955d6
6ddb846
 
8647b8a
5d955d6
d7d5bfb
 
 
 
 
 
 
 
8647b8a
 
 
 
 
 
 
 
 
 
 
 
 
d7d5bfb
 
8647b8a
 
 
 
a9a2aff
8647b8a
 
 
 
 
 
 
 
 
 
 
6ddb846
8647b8a
6ddb846
 
 
d7d5bfb
8647b8a
d7d5bfb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from transformers import pipeline
import gradio as gr
from pytube import YouTube
import os
import openai

# Load the ASR pipeline from the Hugging Face Hub (downloads the model on
# first run; the task is inferred from the model's config).
model = pipeline(model="SofiaK/dataset")

# The OpenAI key is injected via an environment variable (a "secret" when
# running on Hugging Face Spaces) rather than hard-coded.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Check if the API key is available
if openai_api_key is None:
    raise ValueError(
        "OpenAI API key is not set. Make sure to set it as a secret variable in Hugging Face Spaces."
    )

openai.api_key = openai_api_key


def youtube_to_text(youtube_url):
    """Download the audio track of a YouTube video and transcribe it.

    Parameters
    ----------
    youtube_url : str
        Full URL of the YouTube video.

    Returns
    -------
    str
        The transcription produced by the ASR pipeline.

    Raises
    ------
    ValueError
        If the video exposes no audio-only stream.
    """
    # ``first()`` replaces the deprecated ``Query.all()[0]`` idiom and avoids
    # materialising every stream just to keep one.
    stream = YouTube(youtube_url).streams.filter(only_audio=True).first()
    if stream is None:
        raise ValueError(f"No audio stream found for {youtube_url}")
    audio_path = stream.download()
    try:
        return model(audio_path)["text"]
    finally:
        # Remove the temporary download so repeated calls don't fill the disk
        # (the original leaked one file per transcription).
        os.remove(audio_path)


with gr.Blocks() as demo:
    # --- Page header -------------------------------------------------------
    with gr.Row():
        gr.HTML(
            value="<h1 style='text-align: center;'>Speech-To-Text transcription for Russian Language</h1>"
        )

    # --- Left column: inputs / Right column: transcription + chatbot -------
    with gr.Row():
        with gr.Column():
            # Toggle between an uploaded/recorded audio file and a YouTube
            # link; only the matching input widget is visible at a time.
            radio = gr.Radio(
                choices=["Audio", "Youtube"],
                label="Choose your input type: an audio or a youtube link",
                value="Audio",
            )
            audio_input = gr.Audio(
                sources=["upload", "microphone"], type="filepath", visible=True
            )
            youtube_input = gr.Textbox(
                value="https://www.youtube.com/", label="Youtube Link", visible=False
            )
            btn = gr.Button("Transcript")
        with gr.Column():
            output = gr.Text(label="Model Output")
            chatbot = gr.Chatbot()
            msg = gr.Textbox(label="Ask a question!")

    def make_visible(val):
        """Show the input widget matching the radio choice, hide the other.

        Returns legacy-style gradio update dicts keyed by component, as
        expected by the ``radio.change`` wiring below.
        """
        audio_visible = val == "Audio"
        return {
            audio_input: {"visible": audio_visible, "__type__": "update"},
            youtube_input: {"visible": not audio_visible, "__type__": "update"},
        }

    # Conversation state forwarded to OpenAI on every question.
    # NOTE(review): this list lives at module level, so it is shared by every
    # user/session of the app — confirm that cross-session sharing is intended.
    history_gpt = []

    def respond(msg, chat_history):
        """Send the user's question to gpt-3.5-turbo with the running context.

        Appends both the question and the answer to ``history_gpt`` and to the
        visible chatbot history; returns "" to clear the textbox.
        """
        history_gpt.append({"role": "user", "content": msg})
        response = openai.chat.completions.create(
            model="gpt-3.5-turbo", messages=history_gpt
        )
        history_gpt.append(
            {"role": "assistant", "content": response.choices[0].message.content}
        )
        chat_history.append((msg, response.choices[0].message.content))
        return "", chat_history

    radio.change(make_visible, inputs=radio, outputs=[audio_input, youtube_input])

    msg.submit(respond, [msg, chatbot], [msg, chatbot])

    def transcript(audio_input, youtube_input, radio):
        """Transcribe the selected input and seed the chat context with it.

        The parameters shadow the module-level components on purpose: gradio
        passes in their current values (file path, URL, selected choice).
        """
        if radio == "Audio":
            txt = model(audio_input)["text"]
        else:
            txt = youtube_to_text(youtube_input)
        # Prime the LLM with the transcription so follow-up chatbot questions
        # (translate, summarise, ...) can refer to it.
        history_gpt.append(
            {
                "role": "system",
                "content": "Here is a text in Russian that was transcripted from an audio or a video. The user will ask questions about this text such as to translate it to another language, to summarize it, or to get relevant information. By default respond in english, apart if the user tells you otherwise. Here is the text"
                + txt,
            }
        )
        return txt

    btn.click(
        fn=transcript,
        inputs=[audio_input, youtube_input, radio],
        outputs=output,
    )


# Start the Gradio server (blocks until the app is stopped).
demo.launch()