Libra7578 and julien-c committed
Commit cdc4ccc
0 Parent(s)

Duplicate from CVMX-jaca-tonos/Generate-Gender-Neutralized-Audios


Co-authored-by: Julien Chaumond <julien-c@users.noreply.huggingface.co>

Files changed (13)
  1. .gitattributes +27 -0
  2. Example1.wav +0 -0
  3. Example2.wav +0 -0
  4. Example3.wav +0 -0
  5. README.md +13 -0
  6. app.py +155 -0
  7. audio1.wav +0 -0
  8. example2.wav +0 -0
  9. example3.wav +0 -0
  10. packages.txt +2 -0
  11. requirements.txt +6 -0
  12. travel.mp3 +0 -0
  13. travel.wav +0 -0
.gitattributes ADDED
@@ -0,0 +1,27 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Example1.wav ADDED
Binary file (479 kB).
 
Example2.wav ADDED
Binary file (250 kB).
 
Example3.wav ADDED
Binary file (834 kB).
 
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Generate Gender Neutralized Audios
+ emoji: 🦀
+ colorFrom: pink
+ colorTo: red
+ sdk: gradio
+ sdk_version: 2.9.4
+ app_file: app.py
+ pinned: false
+ duplicated_from: CVMX-jaca-tonos/Generate-Gender-Neutralized-Audios
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
app.py ADDED
@@ -0,0 +1,155 @@
+ import torch
+ import gradio as gr
+ import librosa
+ import tempfile
+ from typing import Optional
+ from TTS.config import load_config
+ from transformers import AutoFeatureExtractor, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
+ from TTS.utils.manage import ModelManager
+ from TTS.utils.synthesizer import Synthesizer
+
+
+ first_generation = True  # True selects greedy decoding below; False would use beam search
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+
+ def load_and_fix_data(input_file, model_sampling_rate):
+     speech, sample_rate = librosa.load(input_file)
+     if len(speech.shape) > 1:  # stereo input: collapse the two channels
+         speech = speech[:, 0] + speech[:, 1]
+     if sample_rate != model_sampling_rate:  # resample to the ASR model's rate
+         speech = librosa.resample(speech, sample_rate, model_sampling_rate)
+     return speech
+
+
+ feature_extractor = AutoFeatureExtractor.from_pretrained("jonatasgrosman/wav2vec2-xls-r-1b-spanish")
+ sampling_rate = feature_extractor.sampling_rate
+
+ asr = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-xls-r-1b-spanish")
+
+ prefix = ''  # optional text prepended to the transcription before neutralization
+ model_checkpoint = "hackathon-pln-es/es_text_neutralizer"
+ tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+
+
+ manager = ModelManager()
+ MODEL_NAMES = manager.list_tts_models()
+
+
+ def postproc(input_sentence, preds):
+     try:
+         preds = preds.replace('De el', 'Del').replace('de el', 'del').replace('  ', ' ')
+         if preds[0].islower():
+             preds = preds.capitalize()
+         preds = preds.replace(' . ', '. ').replace(' , ', ', ')
+
+         # Restore capitalization of proper nouns
+         prev_letter = ''
+         for word in input_sentence.split(' '):
+             if word:
+                 if word[0].isupper():
+                     if word.lower() in preds and word != input_sentence.split(' ')[0]:
+                         if prev_letter == '.':
+                             preds = preds.replace('. ' + word.lower() + ' ', '. ' + word + ' ')
+                         else:
+                             if word[-1] == '.':
+                                 preds = preds.replace(word.lower(), word)
+                             else:
+                                 preds = preds.replace(word.lower() + ' ', word + ' ')
+                 prev_letter = word[-1]
+         preds = preds.strip()  # drop the trailing space
+     except Exception:
+         pass
+     return preds
+
+ model_name = "es/mai/tacotron2-DDC"
+ MAX_TXT_LEN = 100
+
+ def predict_and_ctc_lm_decode(input_file, speaker_idx: Optional[str] = None):
+     speech = load_and_fix_data(input_file, sampling_rate)
+     transcribed_text = asr(speech, chunk_length_s=10, stride_length_s=1)
+     transcribed_text = transcribed_text["text"]
+     inputs = tokenizer([prefix + transcribed_text], return_tensors="pt", padding=True)
+     with torch.no_grad():
+         if first_generation:
+             output_sequence = model.generate(
+                 input_ids=inputs["input_ids"].to(device),
+                 attention_mask=inputs["attention_mask"].to(device),
+                 do_sample=False,  # disable sampling to test if batching affects output
+             )
+         else:
+
+             output_sequence = model.generate(
+                 input_ids=inputs["input_ids"].to(device),
+                 attention_mask=inputs["attention_mask"].to(device),
+                 do_sample=False,
+                 num_beams=2,
+                 repetition_penalty=2.5,
+                 # length_penalty=1.0,
+                 early_stopping=True,  # disable sampling to test if batching affects output
+             )
+     text = postproc(transcribed_text,
+                     preds=tokenizer.decode(output_sequence[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+     if len(text) > MAX_TXT_LEN:
+         text = text[:MAX_TXT_LEN]
+         print(f"Input text was cut off since it went over the {MAX_TXT_LEN} character limit.")
+     print(text, model_name)
+     # download the TTS model
+     model_path, config_path, model_item = manager.download_model(f"tts_models/{model_name}")
+     vocoder_name: Optional[str] = model_item["default_vocoder"]
+     # download the vocoder
+     vocoder_path = None
+     vocoder_config_path = None
+     if vocoder_name is not None:
+         vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name)
+     # init the synthesizer
+     synthesizer = Synthesizer(
+         model_path, config_path, None, None, vocoder_path, vocoder_config_path,
+     )
+     # synthesize
+     if synthesizer is None:
+         raise NameError("model not found")
+     wavs = synthesizer.tts(text, speaker_idx)
+     # write the output to a temporary wav file and return its path
+     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+         synthesizer.save_wav(wavs, fp)
+     return fp.name
+
+ description = """This is a Gradio demo for generating gender-neutralized audio. To use it, provide an audio input (via microphone or an audio recording), which is then transcribed and gender-neutralized using pre-trained models. Finally, Coqui's TTS model synthesizes the gender-neutralized audio.
+
+ Pre-trained model used for Spanish ASR: [jonatasgrosman/wav2vec2-xls-r-1b-spanish](https://huggingface.co/jonatasgrosman/wav2vec2-xls-r-1b-spanish)
+
+ Pre-trained model used for Gender Neutralization: [hackathon-pln-es/es_text_neutralizer](https://huggingface.co/hackathon-pln-es/es_text_neutralizer)
+
+ Pre-trained model used for TTS: 🐸💬 CoquiTTS => model_name = "es/mai/tacotron2-DDC"
+
+ """
+
+
+ article = """**ACKNOWLEDGEMENT:**
+
+ **This project is based on the following Spaces:**
+
+ [CoquiTTS](https://huggingface.co/spaces/coqui/CoquiTTS)
+
+ [es_nlp_gender_neutralizer](https://huggingface.co/spaces/hackathon-pln-es/es_nlp_gender_neutralizer)
+
+ [Hindi_ASR](https://huggingface.co/spaces/anuragshas/Hindi_ASR)
+
+ """
+
+
+ gr.Interface(
+     predict_and_ctc_lm_decode,
+     inputs=[
+         gr.inputs.Audio(source="microphone", type="filepath", label="Record your audio")
+     ],
+     outputs=gr.outputs.Audio(label="Output"),
+     examples=[["Example1.wav"], ["Example2.wav"], ["Example3.wav"]],
+     title="Generate-Gender-Neutralized-Audios",
+     description=description,
+     article=article,
+     layout="horizontal",
+     theme="huggingface",
+ ).launch(enable_queue=True, cache_examples=True)
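
The description string above summarizes the pipeline: Spanish ASR, text gender-neutralization, and Coqui TTS chained inside predict_and_ctc_lm_decode. As a minimal sketch (not part of this commit), the same function could be exercised outside the Gradio UI, assuming the dependencies from requirements.txt and packages.txt are installed and the definitions from app.py are available in the current session (for example pasted into a notebook, so the trailing .launch() call is not triggered):

# Illustrative sketch only; reuses predict_and_ctc_lm_decode as defined in app.py above.
# Example1.wav ships with this commit; the return value is the path to a temporary wav file.
out_path = predict_and_ctc_lm_decode("Example1.wav")  # speaker_idx defaults to None
print("Gender-neutralized audio written to:", out_path)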
audio1.wav ADDED
Binary file (348 kB).
 
example2.wav ADDED
Binary file (68 kB).
 
example3.wav ADDED
Binary file (235 kB).
 
packages.txt ADDED
@@ -0,0 +1,2 @@
+ libsndfile1
+ espeak-ng
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ transformers
+ torch
+ librosa==0.8.0
+ pyctcdecode
+ pypi-kenlm
+ git+https://github.com/coqui-ai/TTS@dev#egg=TTS
travel.mp3 ADDED
Binary file (6.63 kB).
 
travel.wav ADDED
Binary file (48.5 kB).