Kuznetsov AV committed on
Commit
6feeeab
1 Parent(s): c910ab2

text-to-speech module completed

Browse files
kuznetsov_av/__init__.py ADDED
File without changes
kuznetsov_av/kuznetsov_av.py DELETED
@@ -1,23 +0,0 @@
1
- from transformers import pipeline
2
- from datasets import load_dataset
3
- import torch
4
- import streamlit as st
5
-
6
- @st.cache_resource
7
- def load_model():
8
- synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts")
9
-
10
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
11
- speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
12
-
13
- return synthesiser, speaker_embedding
14
-
15
- synthesiser, speaker_embedding = load_model()
16
-
17
- text = st.text_area('Enter English text here')
18
- st.write(f'You wrote {len(text)} characters.')
19
-
20
- if st.button('Speech'):
21
- speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
22
-
23
- st.audio(speech['audio'], sample_rate=speech['sampling_rate'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
kuznetsov_av/requirements.txt DELETED
@@ -1,4 +0,0 @@
1
- datasets==2.14.6
2
- streamlit==1.28.1
3
- torch==2.1.0
4
- transformers==4.35.0
 
 
 
 
 
kuznetsov_av/text_to_speech_converter.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ import transformers.pipelines.text_to_audio
3
+ from datasets import load_dataset
4
+ import datasets.arrow_dataset
5
+ import torch
6
+ import numpy as np
7
+
8
+
9
def load_model() -> transformers.pipelines.text_to_audio.TextToAudioPipeline:
    """
    Load the text-to-speech model.

    Builds a ``"text-to-speech"`` pipeline for the
    ``microsoft/speecht5_tts`` checkpoint.

    :return: the pipeline object created by ``transformers.pipeline``
    """
    tts_pipeline = pipeline(
        task="text-to-speech",
        model="microsoft/speecht5_tts",
    )
    return tts_pipeline
15
+
16
+
17
def load_speaker_dataset() -> datasets.arrow_dataset.Dataset:
    """
    Load the dataset of speaker embeddings used for voicing text.

    Requests the ``validation`` split of ``Matthijs/cmu-arctic-xvectors``.

    :return: the dataset object returned by ``datasets.load_dataset``
    """
    speaker_vectors = load_dataset(
        path="Matthijs/cmu-arctic-xvectors",
        split="validation",
    )
    return speaker_vectors
23
+
24
+
25
def text_to_speech(
        text: str,
        synthesiser: transformers.pipelines.text_to_audio.TextToAudioPipeline,
        embeddings_dataset: datasets.arrow_dataset.Dataset,
        speaker_index: int = 7306,
) -> tuple[np.ndarray, int]:
    """
    Convert text to speech.

    :param text: text to synthesise
    :param synthesiser: text-to-speech pipeline; it is called as
        ``synthesiser(text, forward_params={"speaker_embeddings": ...})``
        and must return a mapping with ``"audio"`` and ``"sampling_rate"``
    :param embeddings_dataset: dataset of speaker embeddings; each row is
        expected to provide an ``"xvector"`` entry
    :param speaker_index: row of ``embeddings_dataset`` whose ``"xvector"``
        selects the voice; defaults to 7306, the row that was previously
        hard-coded, so existing callers are unaffected
    :return: tuple (audio data, sampling rate)
    """
    xvector = embeddings_dataset[speaker_index]["xvector"]
    # unsqueeze(0) prepends a dimension of size 1 to the embedding
    # (presumably the batch dimension the model expects - confirm).
    speaker_embedding = torch.tensor(xvector).unsqueeze(0)

    speech = synthesiser(
        text,
        forward_params={"speaker_embeddings": speaker_embedding},
    )

    return speech["audio"], speech["sampling_rate"]
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  datasets==2.14.6
 
2
  streamlit==1.28.1
3
  torch==2.1.0
4
  transformers==4.35.0
5
- sentencepiece=0.1.99
6
- sacremoses=0.1.1
 
1
  datasets==2.14.6
2
+ numpy==1.26.2
3
  streamlit==1.28.1
4
  torch==2.1.0
5
  transformers==4.35.0
6
+ sentencepiece==0.1.99
7
+ sacremoses==0.1.1
run.py CHANGED
@@ -2,9 +2,12 @@ import streamlit as st
2
 
3
  from mulyavin_aa import langdetector
4
  from mulyavin_aa import translator
 
5
 
6
  LANG_DETECTOR = "LANG_DETECTOR"
7
  TRANSLATOR = "TRANSLATOR"
 
 
8
 
9
 
10
  @st.cache_resource
@@ -16,6 +19,8 @@ def load_models() -> dict:
16
  models = dict()
17
  models[LANG_DETECTOR] = langdetector.load_text_detection_model()
18
  models[TRANSLATOR] = translator.load_text_translator_model()
 
 
19
 
20
  return models
21
 
@@ -49,7 +54,10 @@ def main_app():
49
  tab1, tab2, tab3 = st.tabs(['Озвученный текст', 'Таб 2', 'Таб 3'])
50
  with tab1:
51
  st.header("Озвученный текст на английском языке")
52
- # st.audio()
 
 
 
53
 
54
  with tab2:
55
  st.header("Таб 2")
 
2
 
3
  from mulyavin_aa import langdetector
4
  from mulyavin_aa import translator
5
+ from kuznetsov_av import text_to_speech_converter
6
 
7
  LANG_DETECTOR = "LANG_DETECTOR"
8
  TRANSLATOR = "TRANSLATOR"
9
+ TEXT_TO_SPEECH = "TEXT_TO_SPEECH"
10
+ SPEAKER_DATASET = "SPEAKER_DATASET"
11
 
12
 
13
  @st.cache_resource
 
19
  models = dict()
20
  models[LANG_DETECTOR] = langdetector.load_text_detection_model()
21
  models[TRANSLATOR] = translator.load_text_translator_model()
22
+ models[TEXT_TO_SPEECH] = text_to_speech_converter.load_model()
23
+ models[SPEAKER_DATASET] = text_to_speech_converter.load_speaker_dataset()
24
 
25
  return models
26
 
 
54
  tab1, tab2, tab3 = st.tabs(['Озвученный текст', 'Таб 2', 'Таб 3'])
55
  with tab1:
56
  st.header("Озвученный текст на английском языке")
57
+ # Преобразование текста в речь
58
+ audio_data, sampling_rate = text_to_speech_converter.text_to_speech(
59
+ input_text, models[TEXT_TO_SPEECH], models[SPEAKER_DATASET])
60
+ st.audio(data=audio_data, sample_rate=sampling_rate)
61
 
62
  with tab2:
63
  st.header("Таб 2")