File size: 2,804 Bytes
03ce9f7
eeaef84
19ae03c
 
eeaef84
 
19ae03c
 
ab18fd9
03ce9f7
9ce0232
eeaef84
9ce0232
 
eeaef84
 
9ce0232
eeaef84
9ce0232
19ae03c
eeaef84
ab18fd9
eeaef84
 
 
9ce0232
eeaef84
 
ab18fd9
eeaef84
 
 
 
19ae03c
6ede7a5
eeaef84
19ae03c
9ce0232
eeaef84
9ce0232
 
eeaef84
ce406e3
9ce0232
 
19ae03c
eeaef84
9ce0232
03ce9f7
e2a6612
 
 
03ce9f7
9ce0232
 
eeaef84
 
 
 
 
7037773
eeaef84
 
9ce0232
 
eeaef84
9ce0232
 
 
 
 
 
e2a6612
9ce0232
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import gradio as gr
from transformers import Conversation, ConversationalPipeline, pipeline, AlbertTokenizerFast
import tempfile
import gradio as gr
from ukrainian_tts.tts import TTS, Voices, Stress
from enum import Enum


# Hub id shared by the chat model and its tokenizer.
_CHAT_MODEL_ID = "robinhad/gpt2-uk-conversational"

# Text-to-speech engine (from the ukrainian_tts package) used to voice replies.
tts = TTS()  # can try device=cpu|gpu|mps

# Speech-to-text pipeline; called in `transcribe` and read through its "text" key.
p = pipeline(
    task="automatic-speech-recognition",
    model="robinhad/wav2vec2-xls-r-300m-uk",
)

# The tokenizer is loaded explicitly and handed to the conversational pipeline;
# `transcribe` also uses it directly to count tokens of the chat history.
tokenizer = AlbertTokenizerFast.from_pretrained(_CHAT_MODEL_ID)
conv: ConversationalPipeline = pipeline(
    task="conversational",
    model=_CHAT_MODEL_ID,
    tokenizer=tokenizer,
)

class VoiceOption(Enum):
    """Voices offered in the UI; each value is the label displayed to the user.

    The values are used as the radio-button choices and as the keys of
    ``voice_mapping``, so they must stay in sync with both.
    """

    # Labels are Ukrainian: "<name> (female|male) <emoji>".
    Tetiana = "Тетяна (жіночий) 👩"
    Mykyta = "Микита (чоловічий) 👨"
    Lada = "Лада (жіночий) 👩"
    Dmytro = "Дмитро (чоловічий) 👨"


# Translates the label selected in the UI into the value of the identically
# named member of the TTS library's `Voices`; entries follow the declaration
# order of `VoiceOption`.
voice_mapping = {
    option.value: getattr(Voices, option.name).value for option in VoiceOption
}


def transcribe(audio, selected_voice, history):
    """Run one voice-chat turn: recognize speech, generate a reply, voice it.

    Args:
        audio: Input handed unchanged to the speech-recognition pipeline ``p``
            (the interface in this file supplies it with ``type="filepath"``).
        selected_voice: Display label of the chosen voice; must be a key of
            ``voice_mapping``.
        history: List of ``(user_text, bot_reply)`` pairs from earlier turns,
            or a falsy value when the conversation is just starting.

    Returns:
        A 4-tuple ``(recognized_text, reply_audio_path, history, history)``.
        The updated history appears twice because the interface routes it to
        two separate outputs.

    Raises:
        KeyError: If ``selected_voice`` is not a key of ``voice_mapping``.
    """
    text = p(audio)["text"]
    history = history or []
    voice = voice_mapping[selected_voice]

    past_user_inputs = [user_text for user_text, _ in history]
    generated_responses = [bot_reply for _, bot_reply in history]

    # Generation budget: tokens of the whole previous dialogue plus 60.
    history_tokens = tokenizer.encode("".join(generated_responses + past_user_inputs))
    next_output_length = len(history_tokens) + 60

    # The history must be passed by keyword: the second positional parameter
    # of Conversation is `conversation_id`, so positional history would be
    # shifted into the wrong slots and the model would lose its context.
    conversation = Conversation(
        text,
        past_user_inputs=past_user_inputs,
        generated_responses=generated_responses,
    )
    reply = conv(conversation, max_length=next_output_length).generated_responses[-1]
    history.append((text, reply))

    # delete=False keeps the file on disk after it is closed, since only its
    # path is returned to the caller.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        tts.tts(reply, voice, Stress.Dictionary.value, fp)
    return text, fp.name, history, history

# Load README.md as the article shown with the demo. The encoding is given
# explicitly so the result does not depend on the platform's default locale.
with open("README.md", encoding="utf-8") as file:
    article = file.read()

# Drop the leading block delimited by "---" lines (presumably the YAML front
# matter at the top of the README - confirm against the actual file). The
# search starts at index 4 to skip the opening delimiter itself. If the file
# does not start with a delimiter, or has no closing one, the text is kept
# whole instead of being cut at an arbitrary offset.
_DELIMITER = "---\n"
if article.startswith(_DELIMITER):
    _closing = article.find(_DELIMITER, len(_DELIMITER))
    if _closing != -1:
        # Strip blank lines after the delimiter without eating real content.
        article = article[_closing + len(_DELIMITER):].lstrip("\n")

# Input widgets, created in the order `transcribe` receives its arguments.
microphone_input = gr.inputs.Audio(source="microphone", type="filepath")
voice_selector = gr.components.Radio(
    label="Голос",
    choices=[member.value for member in VoiceOption],
    value=VoiceOption.Tetiana.value,
)

# Output widgets, created in the order `transcribe` returns its values.
recognized_text_output = gr.outputs.Textbox(label="Recognized text")
reply_audio_output = gr.outputs.Audio(label="Output", type="filepath")
chat_output = gr.outputs.Chatbot(label="Chat")

# The trailing "state" entries carry the chat history between calls: it is
# the third argument of `transcribe` and the last element of its return value.
iface = gr.Interface(
    fn=transcribe,
    inputs=[microphone_input, voice_selector, "state"],
    outputs=[recognized_text_output, reply_audio_output, chat_output, "state"],
    description="""Це альфа-версія end-to-end розмовного бота, з яким можна поспілкуватися голосом.  
    Перейдіть сюди для доступу до текстової версії: [https://huggingface.co/robinhad/gpt2-uk-conversational](https://huggingface.co/robinhad/gpt2-uk-conversational)  
    """,
    article=article,
)
iface.launch()