Yurii Paniv committed on
Commit
eeaef84
1 Parent(s): 95bbd8f

Speed up demo

Browse files
Files changed (4) hide show
  1. .gitignore +1 -1
  2. README.md +3 -1
  3. app.py +38 -46
  4. requirements.txt +7 -7
.gitignore CHANGED
@@ -131,5 +131,5 @@ dmypy.json
131
  .DS_Store
132
 
133
  #models
134
- config.json
135
  *.pth
 
131
  .DS_Store
132
 
133
  #models
134
+ config.yaml
135
  *.pth
README.md CHANGED
@@ -4,6 +4,8 @@ emoji: 🇺🇦
4
  colorFrom: blue
5
  colorTo: yellow
6
  sdk: gradio
 
 
7
  app_file: app.py
8
  pinned: false
9
  ---
@@ -17,7 +19,7 @@ Link to speaking demo: [https://huggingface.co/spaces/robinhad/ukrainian-ai](htt
17
  Link to text demo: [https://huggingface.co/robinhad/gpt2-uk-conversational](https://huggingface.co/robinhad/gpt2-uk-conversational)
18
  # Technologies used:
19
 
20
- - [Wav2Vec2 XLS-R 300M fine-tuned to Ukrainian language](https://huggingface.co/Yehor/wav2vec2-xls-r-300m-uk-with-small-lm) for speech recognition.
21
  - [Ukrainian VITS TTS](https://github.com/robinhad/ukrainian-tts) for text-to-speech generation.
22
  - Conversational pipeline (this repository)
23
 
 
4
  colorFrom: blue
5
  colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 3.16
8
+ python_version: "3.10"
9
  app_file: app.py
10
  pinned: false
11
  ---
 
19
  Link to text demo: [https://huggingface.co/robinhad/gpt2-uk-conversational](https://huggingface.co/robinhad/gpt2-uk-conversational)
20
  # Technologies used:
21
 
22
+ - [Wav2Vec2 XLS-R 300M fine-tuned to Ukrainian language](https://huggingface.co/robinhad/wav2vec2-xls-r-300m-uk) for speech recognition.
23
  - [Ukrainian VITS TTS](https://github.com/robinhad/ukrainian-tts) for text-to-speech generation.
24
  - Conversational pipeline (this repository)
25
 
app.py CHANGED
@@ -1,83 +1,75 @@
1
  import gradio as gr
2
- from transformers import Conversation, ConversationalPipeline, pipeline
3
  import tempfile
4
- import torch
5
- from os.path import exists
6
- import requests
7
- from TTS.utils.synthesizer import Synthesizer
8
  import gradio as gr
 
 
9
 
10
- def download(url, file_name):
11
- if not exists(file_name):
12
- print(f"Downloading {file_name}")
13
- r = requests.get(url, allow_redirects=True)
14
- with open(file_name, "wb") as file:
15
- file.write(r.content)
16
- else:
17
- print(f"Found {file_name}. Skipping download...")
18
 
19
-
20
- print("downloading uk/mykyta/vits-tts")
21
- release_number = "v2.0.0-beta"
22
- model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model-inference.pth"
23
- config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.json"
24
-
25
- model_path = "model.pth"
26
- config_path = "config.json"
27
-
28
- download(model_link, model_path)
29
- download(config_link, config_path)
30
 
31
  p = pipeline(
32
- "automatic-speech-recognition", "Yehor/wav2vec2-xls-r-300m-uk-with-small-lm"
33
  )
34
 
 
 
35
  conv: ConversationalPipeline = pipeline(
36
- "conversational", "robinhad/gpt2-uk-conversational"
37
  )
38
 
39
- synthesizer = Synthesizer(
40
- model_path,
41
- config_path,
42
- None,
43
- None,
44
- None,
45
- )
46
 
47
- badge = (
48
- "https://visitor-badge-reloaded.herokuapp.com/badge?page_id=robinhad.ukrainian-ai"
49
- )
 
 
 
 
 
50
 
51
 
52
- def transcribe(audio, history):
53
  text = p(audio)["text"]
54
  history = history or []
 
55
  past_user_inputs = [i[0] for i in history]
56
  generated_responses = [i[1] for i in history]
57
- response = conv(Conversation(text, past_user_inputs, generated_responses))
 
58
  response = response.generated_responses[-1]
59
  history.append((text, response))
60
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
61
- with torch.no_grad():
62
- wavs = synthesizer.tts(response)
63
- synthesizer.save_wav(wavs, fp)
64
  return text, fp.name, history, history
65
 
66
 
67
  iface = gr.Interface(
68
  fn=transcribe,
69
- inputs=[gr.inputs.Audio(source="microphone", type="filepath"), "state"],
 
 
 
 
 
 
 
70
  outputs=[
71
  gr.outputs.Textbox(label="Recognized text"),
72
- gr.outputs.Audio(label="Output"),
73
  gr.outputs.Chatbot(label="Chat"),
74
  "state",
75
  ],
76
  description="""Це альфа-версія end-to-end розмовного бота, з яким можна поспілкуватися голосом.
77
  Перейдіть сюди для доступу до текстової версії: [https://huggingface.co/robinhad/gpt2-uk-conversational](https://huggingface.co/robinhad/gpt2-uk-conversational)
78
  """,
79
- article=f"""Розпізнавання української: [https://huggingface.co/Yehor/wav2vec2-xls-r-300m-uk-with-small-lm](https://huggingface.co/Yehor/wav2vec2-xls-r-300m-uk-with-small-lm)
80
- Синтез української: [https://huggingface.co/spaces/robinhad/ukrainian-tts](https://huggingface.co/spaces/robinhad/ukrainian-tts)
81
- <center><img src="{badge}" alt="visitors badge"/></center>""",
82
  )
83
  iface.launch()
 
1
  import gradio as gr
2
+ from transformers import Conversation, ConversationalPipeline, pipeline, AlbertTokenizerFast
3
  import tempfile
 
 
 
 
4
  import gradio as gr
5
+ from ukrainian_tts.tts import TTS, Voices, Stress
6
+ from enum import Enum
7
 
 
 
 
 
 
 
 
 
8
 
9
+ tts = TTS(device="cpu") # can try gpu, mps
 
 
 
 
 
 
 
 
 
 
10
 
11
  p = pipeline(
12
+ "automatic-speech-recognition", "robinhad/wav2vec2-xls-r-300m-uk"
13
  )
14
 
15
+
16
+ tokenizer = AlbertTokenizerFast.from_pretrained("robinhad/gpt2-uk-conversational")
17
  conv: ConversationalPipeline = pipeline(
18
+ "conversational", "robinhad/gpt2-uk-conversational", tokenizer=tokenizer
19
  )
20
 
21
+ class VoiceOption(Enum):
22
+ Olena = "Олена (жіночий) 👩"
23
+ Mykyta = "Микита (чоловічий) 👨"
24
+ Lada = "Лада (жіночий) 👩"
25
+ Dmytro = "Дмитро (чоловічий) 👨"
26
+ Olga = "Ольга (жіночий) 👩"
 
27
 
28
+
29
+ voice_mapping = {
30
+ VoiceOption.Olena.value: Voices.Olena.value,
31
+ VoiceOption.Mykyta.value: Voices.Mykyta.value,
32
+ VoiceOption.Lada.value: Voices.Lada.value,
33
+ VoiceOption.Dmytro.value: Voices.Dmytro.value,
34
+ VoiceOption.Olga.value: Voices.Olga.value,
35
+ }
36
 
37
 
38
+ def transcribe(audio, selected_voice, history):
39
  text = p(audio)["text"]
40
  history = history or []
41
+ selected_voice = voice_mapping[selected_voice]
42
  past_user_inputs = [i[0] for i in history]
43
  generated_responses = [i[1] for i in history]
44
+ next_output_length = len(tokenizer.encode("".join(generated_responses + past_user_inputs))) + 60
45
+ response = conv(Conversation(text, past_user_inputs, generated_responses), max_length=next_output_length, penalty_alpha=0.6, top_k=4)
46
  response = response.generated_responses[-1]
47
  history.append((text, response))
48
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
49
+ _, output_text = tts.tts(response, selected_voice, Stress.Dictionary.value, fp)
 
 
50
  return text, fp.name, history, history
51
 
52
 
53
  iface = gr.Interface(
54
  fn=transcribe,
55
+ inputs=[
56
+ gr.inputs.Audio(source="microphone", type="filepath"),
57
+ gr.components.Radio(
58
+ label="Голос",
59
+ choices=[option.value for option in VoiceOption],
60
+ value=VoiceOption.Olena.value,
61
+ ),
62
+ "state"],
63
  outputs=[
64
  gr.outputs.Textbox(label="Recognized text"),
65
+ gr.outputs.Audio(label="Output", type="filepath"),
66
  gr.outputs.Chatbot(label="Chat"),
67
  "state",
68
  ],
69
  description="""Це альфа-версія end-to-end розмовного бота, з яким можна поспілкуватися голосом.
70
  Перейдіть сюди для доступу до текстової версії: [https://huggingface.co/robinhad/gpt2-uk-conversational](https://huggingface.co/robinhad/gpt2-uk-conversational)
71
  """,
72
+ article=f"""Розпізнавання української: [https://huggingface.co/robinhad/wav2vec2-xls-r-300m-uk](https://huggingface.co/robinhad/wav2vec2-xls-r-300m-uk)
73
+ Синтез української: [https://huggingface.co/spaces/robinhad/ukrainian-tts](https://huggingface.co/spaces/robinhad/ukrainian-tts)""",
 
74
  )
75
  iface.launch()
requirements.txt CHANGED
@@ -1,7 +1,7 @@
1
- gradio
2
- transformers==4.19.4
3
- TTS==0.6.2
4
- torch
5
- pyctcdecode
6
- https://github.com/kpu/kenlm/archive/master.zip
7
- sentencepiece==0.1.96
 
1
+ gradio==3.16
2
+ transformers==4.26
3
+ git+https://github.com/robinhad/ukrainian-tts.git@d3459a5e8a78dd95bfd1b43a4a659637a12a61d7
4
+ # these would be needed if the model had an LM
5
+ #pyctcdecode
6
+ #https://github.com/kpu/kenlm/archive/master.zip
7
+ #sentencepiece==0.1.96