camilosegura commited on
Commit
7d873e2
1 Parent(s): d1c8358

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ audio_output.mp3 filter=lfs diff=lfs merge=lfs -text
37
+ video.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Created by .ignore support plugin (hsz.mobi)
2
+ ### Python template
3
+ # Byte-compiled / optimized / DLL files
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+
8
+ # C extensions
9
+ *.so
10
+
11
+ # Distribution / packaging
12
+ .Python
13
+ data/
14
+ flagged/
15
+ env/
16
+ venv/
17
+ build/
18
+ develop-eggs/
19
+ dist/
20
+ downloads/
21
+ eggs/
22
+ .eggs/
23
+ lib/
24
+ lib64/
25
+ parts/
26
+ sdist/
27
+ var/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+
32
+ # PyInstaller
33
+ # Usually these files are written by a python script from a template
34
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
35
+ *.manifest
36
+ *.spec
37
+
38
+ # Installer logs
39
+ pip-log.txt
40
+ pip-delete-this-directory.txt
41
+
42
+ # Unit test / coverage reports
43
+ htmlcov/
44
+ .tox/
45
+ .coverage
46
+ .coverage.*
47
+ .cache
48
+ nosetests.xml
49
+ coverage.xml
50
+ *,cover
51
+ .hypothesis/
52
+
53
+ # Translations
54
+ *.mo
55
+ *.pot
56
+
57
+ # Django stuff:
58
+ *.log
59
+ local_settings.py
60
+
61
+ # Flask stuff:
62
+ instance/
63
+ .webassets-cache
64
+
65
+ # Scrapy stuff:
66
+ .scrapy
67
+
68
+ # Sphinx documentation
69
+ docs/_build/
70
+
71
+ # PyBuilder
72
+ target/
73
+
74
+ # IPython Notebook
75
+ .ipynb_checkpoints
76
+
77
+ # pyenv
78
+ .python-version
79
+
80
+ # celery beat schedule file
81
+ celerybeat-schedule
82
+
83
+ # dotenv
84
+ .env
85
+
86
+ # virtualenv
87
+ venv/
88
+ ENV/
89
+
90
+ # Spyder project settings
91
+ .spyderproject
92
+
93
+ # Rope project settings
94
+ .ropeproject
95
+ ### VirtualEnv template
96
+ # Virtualenv
97
+ # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
98
+ [Bb]in
99
+ [Ii]nclude
100
+ [Ll]ib
101
+ [Ll]ib64
102
+ [Ll]ocal
103
+ [Ss]cripts
104
+ pyvenv.cfg
105
+ .venv
106
+ pip-selfcheck.json
107
+
108
+ ### JetBrains template
109
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
110
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
111
+
112
+ # User-specific stuff
113
+ .idea/**/workspace.xml
114
+ .idea/**/tasks.xml
115
+ .idea/**/usage.statistics.xml
116
+ .idea/**/dictionaries
117
+ .idea/**/shelf
118
+
119
+ # AWS User-specific
120
+ .idea/**/aws.xml
121
+
122
+ # Generated files
123
+ .idea/**/contentModel.xml
124
+
125
+ # Sensitive or high-churn files
126
+ .idea/**/dataSources/
127
+ .idea/**/dataSources.ids
128
+ .idea/**/dataSources.local.xml
129
+ .idea/**/sqlDataSources.xml
130
+ .idea/**/dynamic.xml
131
+ .idea/**/uiDesigner.xml
132
+ .idea/**/dbnavigator.xml
133
+
134
+ # Gradle
135
+ .idea/**/gradle.xml
136
+ .idea/**/libraries
137
+
138
+ # Gradle and Maven with auto-import
139
+ # When using Gradle or Maven with auto-import, you should exclude module files,
140
+ # since they will be recreated, and may cause churn. Uncomment if using
141
+ # auto-import.
142
+ # .idea/artifacts
143
+ # .idea/compiler.xml
144
+ # .idea/jarRepositories.xml
145
+ # .idea/modules.xml
146
+ # .idea/*.iml
147
+ # .idea/modules
148
+ # *.iml
149
+ # *.ipr
150
+
151
+ # CMake
152
+ cmake-build-*/
153
+
154
+ # Mongo Explorer plugin
155
+ .idea/**/mongoSettings.xml
156
+
157
+ # File-based project format
158
+ *.iws
159
+
160
+ # IntelliJ
161
+ out/
162
+
163
+ # mpeltonen/sbt-idea plugin
164
+ .idea_modules/
165
+
166
+ # JIRA plugin
167
+ atlassian-ide-plugin.xml
168
+
169
+ # Cursive Clojure plugin
170
+ .idea/replstate.xml
171
+
172
+ # SonarLint plugin
173
+ .idea/sonarlint/
174
+
175
+ # Crashlytics plugin (for Android Studio and IntelliJ)
176
+ com_crashlytics_export_strings.xml
177
+ crashlytics.properties
178
+ crashlytics-build.properties
179
+ fabric.properties
180
+
181
+ # Editor-based Rest Client
182
+ .idea/httpRequests
183
+
184
+ # Android studio 3.1+ serialized cache file
185
+ .idea/caches/build_file_checksums.ser
186
+
187
+ # idea folder, uncomment if you don't need it
188
+ .idea
189
+ *.gz
190
+ *.wav
191
+
192
+ main.py
193
+ mms_ars.py
194
+ .DS_Store
Lenguajes soportados Deep Translator.txt ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ 'afrikaans': 'af',
3
+ 'albanian': 'sq',
4
+ 'amharic': 'am',
5
+ 'arabic': 'ar',
6
+ 'armenian': 'hy',
7
+ 'assamese': 'as',
8
+ 'aymara': 'ay',
9
+ 'azerbaijani': 'az',
10
+ 'bambara': 'bm',
11
+ 'basque': 'eu',
12
+ 'belarusian': 'be',
13
+ 'bengali': 'bn',
14
+ 'bhojpuri': 'bho',
15
+ 'bosnian': 'bs',
16
+ 'bulgarian': 'bg',
17
+ 'catalan': 'ca',
18
+ 'cebuano': 'ceb',
19
+ 'chichewa': 'ny',
20
+ 'chinese (simplified)': 'zh-CN',
21
+ 'chinese (traditional)': 'zh-TW',
22
+ 'corsican': 'co',
23
+ 'croatian': 'hr',
24
+ 'czech': 'cs',
25
+ 'danish': 'da',
26
+ 'dhivehi': 'dv',
27
+ 'dogri': 'doi',
28
+ 'dutch': 'nl',
29
+ 'english': 'en',
30
+ 'esperanto': 'eo',
31
+ 'estonian': 'et',
32
+ 'ewe': 'ee',
33
+ 'filipino': 'tl',
34
+ 'finnish': 'fi',
35
+ 'french': 'fr',
36
+ 'frisian': 'fy',
37
+ 'galician': 'gl',
38
+ 'georgian': 'ka',
39
+ 'german': 'de',
40
+ 'greek': 'el',
41
+ 'guarani': 'gn',
42
+ 'gujarati': 'gu',
43
+ 'haitian creole': 'ht',
44
+ 'hausa': 'ha',
45
+ 'hawaiian': 'haw',
46
+ 'hebrew': 'iw',
47
+ 'hindi': 'hi',
48
+ 'hmong': 'hmn',
49
+ 'hungarian': 'hu',
50
+ 'icelandic': 'is',
51
+ 'igbo': 'ig',
52
+ 'ilocano': 'ilo',
53
+ 'indonesian': 'id',
54
+ 'irish': 'ga',
55
+ 'italian': 'it',
56
+ 'japanese': 'ja',
57
+ 'javanese': 'jw',
58
+ 'kannada': 'kn',
59
+ 'kazakh': 'kk',
60
+ 'khmer': 'km',
61
+ 'kinyarwanda': 'rw',
62
+ 'konkani': 'gom',
63
+ 'korean': 'ko',
64
+ 'krio': 'kri',
65
+ 'kurdish (kurmanji)': 'ku',
66
+ 'kurdish (sorani)': 'ckb',
67
+ 'kyrgyz': 'ky',
68
+ 'lao': 'lo',
69
+ 'latin': 'la',
70
+ 'latvian': 'lv',
71
+ 'lingala': 'ln',
72
+ 'lithuanian': 'lt',
73
+ 'luganda': 'lg',
74
+ 'luxembourgish': 'lb',
75
+ 'macedonian': 'mk',
76
+ 'maithili': 'mai',
77
+ 'malagasy': 'mg',
78
+ 'malay': 'ms',
79
+ 'malayalam': 'ml',
80
+ 'maltese': 'mt',
81
+ 'maori': 'mi',
82
+ 'marathi': 'mr',
83
+ 'meiteilon (manipuri)': 'mni-Mtei',
84
+ 'mizo': 'lus',
85
+ 'mongolian': 'mn',
86
+ 'myanmar': 'my',
87
+ 'nepali': 'ne',
88
+ 'norwegian': 'no',
89
+ 'odia (oriya)': 'or',
90
+ 'oromo': 'om',
91
+ 'pashto': 'ps',
92
+ 'persian': 'fa',
93
+ 'polish': 'pl',
94
+ 'portuguese': 'pt',
95
+ 'punjabi': 'pa',
96
+ 'quechua': 'qu',
97
+ 'romanian': 'ro',
98
+ 'russian': 'ru',
99
+ 'samoan': 'sm',
100
+ 'sanskrit': 'sa',
101
+ 'scots gaelic': 'gd',
102
+ 'sepedi': 'nso',
103
+ 'serbian': 'sr',
104
+ 'sesotho': 'st',
105
+ 'shona': 'sn',
106
+ 'sindhi': 'sd',
107
+ 'sinhala': 'si',
108
+ 'slovak': 'sk',
109
+ 'slovenian': 'sl',
110
+ 'somali': 'so',
111
+ 'spanish': 'es',
112
+ 'sundanese': 'su',
113
+ 'swahili': 'sw',
114
+ 'swedish': 'sv',
115
+ 'tajik': 'tg',
116
+ 'tamil': 'ta',
117
+ 'tatar': 'tt',
118
+ 'telugu': 'te',
119
+ 'thai': 'th',
120
+ 'tigrinya': 'ti',
121
+ 'tsonga': 'ts',
122
+ 'turkish': 'tr',
123
+ 'turkmen': 'tk',
124
+ 'twi': 'ak',
125
+ 'ukrainian': 'uk',
126
+ 'urdu': 'ur',
127
+ 'uyghur': 'ug',
128
+ 'uzbek': 'uz',
129
+ 'vietnamese': 'vi',
130
+ 'welsh': 'cy',
131
+ 'xhosa': 'xh',
132
+ 'yiddish': 'yi',
133
+ 'yoruba': 'yo',
134
+ 'zulu': 'zu'
135
+ }
README.md CHANGED
@@ -1,12 +1,42 @@
1
  ---
2
- title: Traductor Multilenguaje
3
- emoji: 🔥
4
- colorFrom: yellow
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 3.38.0
8
  app_file: app.py
9
- pinned: false
 
10
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
1
  ---
2
+ title: traductor-multilenguaje
 
 
 
 
 
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 3.35.2
6
  ---
7
+ # Web App of Meta's Massively Multilingual Speech (MMS)
8
+
9
+ This repository contains a Python code that implements a [META's Massively Multilingual Speech (MMS)](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) using the Gradio library. The application allows users to record audio and convert it to text, or enter text and generate corresponding local speech output.
10
+
11
+ ## Step 1: Clone repo
12
+ ```shell
13
+ git clone https://github.com/ikequan/meta-mms.git
14
+ cd meta-mms
15
+ ```
16
+ ## Step 2: Prerequisites
17
+
18
+ Before running the code, make sure you have the following requirements installed:
19
+
20
+ - Python 3.x
21
+ - gradio
22
+ - speech_recognition
23
+ - ttsmms
24
+ - deep_translator
25
+
26
+ You can install the required packages using the following command:
27
+
28
+ ```shell
29
+ pip install gradio SpeechRecognition ttsmms deep_translator
30
+ ```
31
+
32
+ ## Step 3: Download language model
33
+ Check [here](https://github.com/wannaphong/ttsmms/blob/main/support_list.txt) for supported languages and their iso code for this step.
34
+ ```shell
35
+ curl https://dl.fbaipublicfiles.com/mms/tts/{put your language iso code here}.tar.gz --output {put your language iso code here}.tar.gz # Update lang
36
+ mkdir -p data && tar -xzf {put your language iso code here}.tar.gz -C data/ # Update langcode
37
+ ```
38
 
39
+ ## Step 4: Run code
40
+ ```shell
41
+ python app.py
42
+ ```
Tutorial meta-mms.txt ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ https://www.youtube.com/watch?v=7K4b2S7X99w
2
+ https://github.com/ikequan/meta-mms
3
+
4
+ #Github proyecto
5
+ https://github.com/AYTECOL/traductor-multilenguaje.git
6
+
7
+ #Las librerias se instalan en:
8
+ C:\Users\jorge\AppData\Local\Programs\Python\Python311\Scripts
9
+
10
+ # Salida de Audio de video
11
+ C:\Users\jorge\AppData\Local\Temp\gradio\04300dd9108b391bd8a7984ab530b47d54bfec91\
12
+
13
+ #Lenguajes soportados:
14
+ https://github.com/wannaphong/ttsmms/blob/main/support_list.txt
15
+
16
+ # Descargar e instalar Python si no está instalado
17
+ Instalar en C:\Users\jorge\AppData\Local\Programs\Python\Python311\
18
+ Pasar al directorio \Scripts para que tome el comando .\pip
19
+
20
+ # Comprobar la instalacion de PIP:
21
+ .\pip help
22
+
23
+ # Si PIP no está instalado descargarlo de:
24
+ curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
25
+ # Instalar PIP, pasar a la carpeta donde se descargó el archivo get-pip.py:
26
+ py get-pip.py
27
+ # Agregar el PATH de la instalación en las variables de entorno y colocarlo al inicio de la lista
28
+
29
+ # Si no está instalado Microsoft Visual C++ 14.0 or greater instalarlo de:
30
+ https://visualstudio.microsoft.com/visual-cpp-build-tools/
31
+ seleccionando las utilidades para desktop windows
32
+
33
+ # Instalar transformers:
34
+ .\pip install torch datasets[audio]
35
+ .\pip install --upgrade transformers
36
+
37
+ # Instalar los complementos necesarios para la aplicación:
38
+ .\pip install gradio SpeechRecognition ttsmms deep_translator
39
+
40
+ # instalar ffmpeg mediante cmd como administrador:
41
+ choco install ffmpeg
42
+
43
+ # Comprobar ISO de idiomas disponibles:
44
+ https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html
45
+
46
+ # descargar los idiomas a soportar cambiando el ISO del idioma correspondiente:
47
+ https://dl.fbaipublicfiles.com/mms/tts/full_model/eng.tar.gz #Inglés (eng)
48
+ https://dl.fbaipublicfiles.com/mms/tts/full_model/spa.tar.gz #Español (spa)
49
+ https://dl.fbaipublicfiles.com/mms/tts/full_model/gum.tar.gz #Misak (gum)
50
+ https://dl.fbaipublicfiles.com/mms/tts/full_model/quz.tar.gz #Quechua Cuzco (quz)
51
+
52
+ # crear carpeta "data" y descomprimir ahi los lenguajes dentro del proyecto:
53
+ /meta-mms/data/spa/
54
+ /meta-mms/data/eng/
55
+
56
+ # Comandos procesamiento de Audio y Video
57
+ # extraer audio de un video
58
+ ffmpeg -y -i input.mp4 -ar 16000 -ac 1 output_audio.wav
59
+
60
+ # dejar un video sin audio
61
+ ffmpeg -y -i input.mp4 -t 43 output_muted.webm
62
+ ffmpeg -y -i input.mp4 -shortest output_muted.webm
63
+
64
+ # unir audio con video
65
+ ffmpeg -y -i input.mp4 -i audio.wav -an output_muted.webm
66
+
67
+ # subtitulos
68
+ ffmpeg -y -copyts -i input.webm -vf subtitles=subtitle.srt output_srt.webm
69
+ ffmpeg -y -copyts -i noticias_caracol_tv.mp4 -vf subtitles=noticias_caracol_tv_subtitles.srt output_srt.webm
70
+
71
+ # Para ejecutar la aplicación:
72
+ py app.py
app.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import speech_recognition as sr # Libreria de Audio
3
+ from ttsmms import TTS
4
+ from deep_translator import GoogleTranslator
5
+
6
+ import subprocess # Libreria para procesamiento de comandos cmd para video
7
+ import os
8
+ import time # Libreria para manejo de tiempos
9
+ import math # Libreria matemática, usada para redondeo de cifras
10
+ from threading import Thread # Librería para manejo de Hilos de procesamiento
11
+
12
# Input language used by the speech recognizer (BCP-47 code; 'es-ES' = Spanish)
input_language = 'es-ES'
output_audio_format = 'mp3'
output_video_format = 'mp4'

# Load one TTS model per supported output language.
# NOTE(review): each TTS(...) call assumes the matching model was downloaded
# and unpacked under data/ beforehand (see README step 3) — it will fail at
# import time otherwise.
spanish = TTS("data/spa")  # Spanish
english = TTS("data/eng")  # English
misak = TTS("data/gum")    # Misak
quechua = TTS("data/quz")  # Quechua

# Supported target languages paired with their TTS model; the "lang" names
# must match deep_translator's language names (see the language list file).
#langs = [{"lang": 'spanish', "tts": spanish}, {"lang": 'english', "tts": english}, {"lang": 'guarani', "tts": misak}, {"lang": 'quechua', "tts": quechua}]
langs = [{"lang": 'english', "tts": english}, {"lang": 'quechua', "tts": quechua}, {"lang": 'spanish', "tts": spanish}]
26
+
27
+ # *************************** MÉTODOS ***************************
28
+ # TEXT TO TEXT: Función que convierte texto a texto
29
# TEXT TO TEXT: translate plain text into the selected target language.
def text_to_text(text, lang, logs_file):
    """Translate `text` into `lang` with Google Translator.

    `lang` must be one of the names declared in the module-level `langs`
    list, otherwise ValueError is raised.  Progress messages are printed
    and appended to `logs_file`.  Returns the translated text.
    """
    stamp = time.ctime().split()
    print(stamp[3] + " - Traduciendo el texto a texto en el idioma seleccionado...")
    logs_file.write(stamp[3] + " - Traduciendo el texto a texto en el idioma seleccionado...\n")
    # Reject languages we have no entry for.
    if not any(entry["lang"] == lang for entry in langs):
        raise ValueError(f"Lenguaje '{lang}' no disponible.")
    text_translated = GoogleTranslator(source='auto', target=lang).translate(text)
    stamp = time.ctime().split()
    print(stamp[3] + " - Texto traducido: ", text_translated)
    logs_file.write(stamp[3] + " - Texto traducido: " + text_translated + "\n")
    return text_translated
41
+
42
+ # TEXT TO AUDIO: Función que convierte texto a audio
43
# TEXT TO AUDIO: translate text, then synthesize the translation as speech.
def text_to_audio(text, lang, logs_file):
    """Translate `text` into `lang` and synthesize it with the matching
    TTS model.

    Returns (audio_path, translated_text).  Raises ValueError when
    `lang` is not in the supported `langs` list.

    FIX: removed a leftover debug print ("wav_path!!!!!!!!!!!!!!!!1").
    """
    tiempo = time.ctime().split()
    print(tiempo[3] + " - Convirtiendo el texto extraido a audio en el idioma seleccionado...")
    logs_file.write(tiempo[3] + " - Convirtiendo el texto extraido a audio en el idioma seleccionado...\n")
    # Look the language up in the supported list to get its TTS model.
    selected_lang = next((lang_item for lang_item in langs if lang_item["lang"] == lang), None)
    if selected_lang is None:
        raise ValueError(f"Lenguaje '{lang}' no disponible.")
    selected_tts = selected_lang["tts"]
    text_translated = text_to_text(text, lang, logs_file)
    wav_path = "audio_output." + output_audio_format
    # Synthesize the translated text straight into the output file.
    selected_tts.synthesis(text_translated, wav_path=wav_path)
    tiempo = time.ctime().split()
    print(tiempo[3] + " - Audio traducido generado: ", wav_path)
    logs_file.write(tiempo[3] + " - Audio traducido generado: " + wav_path + "\n")
    return wav_path, text_translated
59
+
60
+ # AUDIO TO TEXT: Función que convierte audio a texto usando Google's speech recognition API
61
# AUDIO TO TEXT: transcribe an audio file via Google's speech recognition API.
def audio_to_text(audio_file, logs_file):
    """Transcribe `audio_file`, assuming speech in `input_language`.

    Returns the recognized text, or None when the speech could not be
    understood or the recognition service is unreachable.
    """
    stamp = time.ctime().split()
    print(stamp[3] + " - Convirtiendo el audio a texto...")
    logs_file.write(stamp[3] + " - Convirtiendo el audio a texto...\n")
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        recording = recognizer.record(source)
    try:
        text = recognizer.recognize_google(recording, language=input_language)
    except sr.UnknownValueError:
        # The audio was readable but no speech could be transcribed.
        print("Google Speech Recognition no pudo transcribir el audio.")
        logs_file.write("Google Speech Recognition no pudo transcribir el audio.\n")
        return None
    except sr.RequestError:
        # The remote recognition service could not be reached.
        print("Reconocimiento de audio no disponible.")
        logs_file.write("Reconocimiento de audio no disponible.\n")
        return None
    stamp = time.ctime().split()
    print(stamp[3] + " - Reconocimiento de texto obtenido del audio: ", text)
    logs_file.write(stamp[3] + " - Reconocimiento de texto obtenido del audio: " + text + "\n")
    return text
82
+
83
+ # VIDEO TO AUDIO: Función que extrae el audio del video
84
# VIDEO TO AUDIO: extract the audio track of a video with ffmpeg.
def video_to_audio(video_file, output_audio_ext, logs_file):
    """Extract `video_file`'s audio into `<name>_audio.<output_audio_ext>`
    and return that path."""
    stamp = time.ctime().split()
    print(stamp[3] + " - Extrayendo el audio del video...")
    logs_file.write(stamp[3] + " - Extrayendo el audio del video...\n")
    filename, _ = os.path.splitext(video_file)
    audio_video = filename + "_audio." + output_audio_ext
    # -ar 16000 -ac 1: resample to 16 kHz mono before recognition.
    subprocess.call(
        ["ffmpeg", "-y", "-i", video_file, "-ar", "16000", "-ac", "1", audio_video],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )
    stamp = time.ctime().split()
    print(stamp[3] + " - Audio extraido: ", audio_video)
    logs_file.write(stamp[3] + " - Audio extraido: " + audio_video + "\n")
    return audio_video
97
+
98
+ # VIDEO TO VIDEO: Función que concatena audio con el video traducido
99
# VIDEO TO VIDEO: dub a video with a translated audio track.
def video_to_video(video_file, audio_file_traslated, output_video_ext, logs_file):
    """Strip the original audio from `video_file`, then mux in
    `audio_file_traslated`.  Returns the dubbed video's path."""
    stamp = time.ctime().split()
    print(stamp[3] + " - Procesando el video para obtenerlo sin audio...")
    logs_file.write(stamp[3] + " - Procesando el video para obtenerlo sin audio...\n")
    filename, _ = os.path.splitext(video_file)
    video_mute = filename + "_muted." + output_video_ext
    # -an drops the audio stream entirely.
    subprocess.call(
        ["ffmpeg", "-y", "-i", video_file, "-an", video_mute],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )

    stamp = time.ctime().split()
    print(stamp[3] + " - Doblando el video con el audio traducido...")
    logs_file.write(stamp[3] + " - Doblando el video con el audio traducido...\n")
    video_traslated = filename + "_traslated." + output_video_ext
    # -shortest trims the result to the shorter of the two inputs.
    subprocess.call(
        ["ffmpeg", "-y", "-i", video_mute, "-i", audio_file_traslated,
         "-shortest", video_traslated],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )
    stamp = time.ctime().split()
    print(stamp[3] + " - Video traducido: ", video_traslated)
    logs_file.write(stamp[3] + " - Video traducido: " + video_traslated + "\n")
    return video_traslated
120
+
121
+ # VIDEO TO VIDEO SUBTITULADO: Función que coloca subtitulos traducidos al video
122
# VIDEO TO VIDEO SUBTITULADO: burn translated subtitles into a video.
def video_to_video_subtitled(video_file, text_traslated, output_video_ext, logs_file):
    """Generate a SubRip (.srt) file from `text_traslated`, showing one
    subtitle line per second of video, and burn it into `video_file`.
    Returns the subtitled video's path.

    FIXES: the original timing line ended with a stray " --> "
    (``start,001 --> end,000 --> ``), which is invalid SubRip; and an
    empty translation caused a division/step-by-zero crash.
    """
    tiempo = time.ctime().split()
    print(tiempo[3] + " - Procesando el video subtitulado...")
    logs_file.write(tiempo[3] + " - Procesando el video subtitulado...\n")
    words = text_traslated.split()
    filename, ext = os.path.splitext(video_file)
    length_video = get_length_video(video_file)
    # Words shown per subtitle line: total words spread over the duration.
    # max(1, ...) guards against an empty translation or a zero rate.
    length_line_subtitle = max(1, math.ceil(len(words) / length_video))
    subtitles_line = [
        ' '.join(words[j:j + length_line_subtitle])
        for j in range(0, len(words), length_line_subtitle)
    ]

    # Write one cue per second: index, "start --> end" timing, bold text.
    with open("video_subtitles.srt", "w+") as subtitles_file:
        for i, line in enumerate(subtitles_line):
            start = time.strftime('%H:%M:%S', time.gmtime(i))
            end = time.strftime('%H:%M:%S', time.gmtime(i + 1))
            subtitles_file.write(
                f"{i + 1}\n{start},001 --> {end},000\n<b>{line}</b>\n\n"
            )

    video_subtitled = filename + "_subtitled." + output_video_ext
    # Burn the subtitles onto the original (still audible) video.
    subprocess.call(
        ["ffmpeg", "-y", "-copyts", "-i", video_file, "-vf",
         "subtitles=video_subtitles.srt:force_style='Fontname=Futura,Fontsize=20,MarginV=50,Shadow=1'",
         video_subtitled],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )

    tiempo = time.ctime().split()
    print(tiempo[3] + " - Video subtitulado: ", video_subtitled)
    logs_file.write(tiempo[3] + " - Video subtitulado: " + video_subtitled + "\n")
    return video_subtitled
165
+
166
def get_length_video(filename):
    """Return the duration of `filename` in seconds, probed with ffprobe.

    FIX: the original merged ffprobe's stderr into stdout
    (stderr=subprocess.STDOUT), so any warning text corrupted the string
    handed to float(); stderr is now discarded instead.
    """
    result = subprocess.run(
        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", filename],
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )
    return float(result.stdout)
173
+
174
+ # *************************** MAIN ***************************
175
+ # ************************** ROUTER **************************
176
+ # ROUTER: Función para transcribir video, audio y texto al lenguaje seleccionado
177
# ROUTER: route whichever input the user supplied to the matching pipeline.
def multimedia_to_multimedia_app(lang_input, video_file_upload, audio_file_upload, video_file_webcam, audio_file_microphone, text_input):
    """Translate the first available input (webcam video, microphone
    audio, uploaded video, uploaded audio, or plain text) into
    `lang_input`.

    Returns a 5-tuple (text_transcribed, text_translated,
    audio_traslated, video_subtitled, video_traslated); the two video
    outputs are None for audio/text inputs, and all-None when no input
    was given.  Raises gr.Error when no target language is selected.

    FIXES: the log file is now closed on every path (it previously
    leaked when no language was selected or an exception escaped a
    pipeline), and the duplicated webcam/upload video handling is
    factored into _translate_video_input.
    """
    tiempo = time.ctime().split()
    if not lang_input:
        print("Error - Lenguaje no ingresado")
        raise gr.Error("Debes ingresar el idioma a traducir")  # UI alert
    with open("logs.txt", "w+") as logs_file:
        logs_file.write("LOGS TRADUCTOR MULTILENGUAJE\n")
        if video_file_webcam:
            print("PROCESANDO GRABACIÓN VIDEO DE LA WEBCAM")
            logs_file.write("PROCESANDO GRABACIÓN VIDEO DE LA WEBCAM\n")
            print(tiempo[3] + " - Traduciendo el video grabado: " + video_file_webcam + " al idioma " + lang_input)
            logs_file.write(tiempo[3] + " - Traduciendo el video grabado: " + video_file_webcam + " al idioma " + lang_input + "\n")
            result = _translate_video_input(lang_input, video_file_webcam, logs_file)
            print("FIN PROCESO GRABACIÓN VIDEO DE LA WEBCAM")
            logs_file.write("FIN PROCESO GRABACIÓN VIDEO DE LA WEBCAM\n")
            return result
        if audio_file_microphone:
            print("PROCESANDO GRABACIÓN AUDIO DEL MICRÓFONO")
            logs_file.write("PROCESANDO GRABACIÓN AUDIO DEL MICRÓFONO\n")
            print(tiempo[3] + " - Traduciendo el audio grabado " + audio_file_microphone + " al idioma " + lang_input)
            logs_file.write(tiempo[3] + " - Traduciendo el audio grabado " + audio_file_microphone + " al idioma " + lang_input + "\n")
            text_translated, text_transcribed, audio_traslated = convert_audio_to_audio_app(lang_input, audio_file_microphone, logs_file)
            print("FIN PROCESO GRABACIÓN AUDIO DEL MICRÓFONO")
            logs_file.write("FIN PROCESO GRABACIÓN AUDIO DEL MICRÓFONO\n")
            return text_transcribed, text_translated, audio_traslated, None, None
        if video_file_upload:
            print("PROCESANDO ARCHIVO DE VIDEO")
            logs_file.write("PROCESANDO ARCHIVO DE VIDEO\n")
            print(tiempo[3] + " - Traduciendo el video ingresado " + video_file_upload + " al idioma " + lang_input)
            logs_file.write(tiempo[3] + " - Traduciendo el video ingresado " + video_file_upload + " al idioma " + lang_input + "\n")
            result = _translate_video_input(lang_input, video_file_upload, logs_file)
            print("FIN PROCESO ARCHIVO DE VIDEO")
            logs_file.write("FIN PROCESO ARCHIVO DE VIDEO\n")
            return result
        if audio_file_upload:
            print("PROCESANDO ARCHIVO DE AUDIO")
            logs_file.write("PROCESANDO ARCHIVO DE AUDIO\n")
            print(tiempo[3] + " - Traduciendo el audio ingresado " + audio_file_upload + " al idioma " + lang_input)
            logs_file.write(tiempo[3] + " - Traduciendo el audio ingresado " + audio_file_upload + " al idioma " + lang_input + "\n")
            text_translated, text_transcribed, audio_traslated = convert_audio_to_audio_app(lang_input, audio_file_upload, logs_file)
            print("FIN PROCESO ARCHIVO DE AUDIO")
            logs_file.write("FIN PROCESO ARCHIVO DE AUDIO\n")
            return text_transcribed, text_translated, audio_traslated, None, None
        if text_input:
            print("PROCESANDO TEXTO INGRESADO")
            logs_file.write("PROCESANDO TEXTO INGRESADO\n")
            print(tiempo[3] + " - Traduciendo el texto ingresado " + text_input + " al idioma " + lang_input)
            logs_file.write(tiempo[3] + " - Traduciendo el texto ingresado " + text_input + " al idioma " + lang_input + "\n")
            audio_traslated, text_translated = text_to_audio(text_input, lang_input, logs_file)
            print("FIN PROCESO TEXTO INGRESADO")
            logs_file.write("FIN PROCESO TEXTO INGRESADO\n")
            return text_input, text_translated, audio_traslated, None, None
    # No input supplied: mirror the original's implicit None return.
    return None

# Helper: full video pipeline — transcribe, translate, then subtitle and
# dub concurrently.  Returns the 5-tuple the router hands back to Gradio.
def _translate_video_input(lang_input, video_file, logs_file):
    text_transcribed = convert_video_to_text_app(lang_input, video_file, logs_file)
    audio_traslated, text_translated = text_to_audio(text_transcribed, lang_input, logs_file)
    # Subtitling and dubbing are independent, so run them in parallel;
    # Thread targets cannot return values, hence the one-slot lists.
    return_video_subtitled = [None]
    return_video_traslated = [None]
    hilo_video_subtitled = Thread(target=convert_video_to_video_subtitled_app,
                                  args=(video_file, text_translated, logs_file, return_video_subtitled))
    hilo_video_traslated = Thread(target=convert_video_to_video_app,
                                  args=(video_file, audio_traslated, logs_file, return_video_traslated))
    hilo_video_subtitled.start()
    hilo_video_traslated.start()
    hilo_video_subtitled.join()
    hilo_video_traslated.join()
    return (text_transcribed, text_translated, audio_traslated,
            return_video_subtitled[0], return_video_traslated[0])
267
+
268
+ # *************************** SERVICIOS ***************************
269
+ # t2t: Traducir el texto a texto en el idioma deseado
270
# t2t: translate plain text into the requested language.
def convert_text_to_text_app(lang_input, text_to_translate, logs_file):
    """Thin wrapper over text_to_text; returns None when no text is given."""
    if not text_to_translate:
        return None
    print("Traduciendo texto " + text_to_translate + " al idioma " + lang_input)
    logs_file.write("Traduciendo texto " + text_to_translate + " al idioma " + lang_input + "\n")
    return text_to_text(text_to_translate, lang_input, logs_file)
276
+
277
+ # a2t: Transcribir el audio a texto
278
# a2t: transcribe an audio file to text.
def convert_audio_to_text_app(lang_input, audio_file, logs_file):
    """Thin wrapper over audio_to_text; returns None when no file is given."""
    if not audio_file:
        return None
    print("Convirtiendo audio " + audio_file + " al idioma " + lang_input)
    logs_file.write("Convirtiendo audio " + audio_file + " al idioma " + lang_input + "\n")
    return audio_to_text(audio_file, logs_file)
284
+
285
+ # a2a: Transcribir el audio a texto y de texto al audio traducido
286
# a2a: transcribe audio to text, then synthesize the translated text as audio.
def convert_audio_to_audio_app(lang_input, audio_file, logs_file):
    """Returns (text_translated, text_transcribed, audio_traslated),
    or None when no audio file is given."""
    if not audio_file:
        return None
    print("Traduciendo audio " + audio_file + " al idioma deseado...")
    logs_file.write("Traduciendo audio " + audio_file + " al idioma deseado...\n")
    text_transcribed = audio_to_text(audio_file, logs_file)
    audio_traslated, text_translated = text_to_audio(text_transcribed, lang_input, logs_file)
    return text_translated, text_transcribed, audio_traslated
293
+
294
+ # v2t: Convertir video a audio usando 'ffmpeg' con módulo 'subprocess'
295
# v2t: extract a video's audio with ffmpeg, then transcribe it.
def convert_video_to_text_app(lang_input, video_file, logs_file, output_audio_ext="wav"):
    """Return the transcription of `video_file`'s audio track."""
    print("Procesando video " + video_file + " para convertirlo a texto...")
    logs_file.write("Procesando video " + video_file + " para convertirlo a texto...\n")
    extracted_audio = video_to_audio(video_file, output_audio_ext, logs_file)
    return convert_audio_to_text_app(lang_input, extracted_audio, logs_file)
301
+
302
+ # v2v: Convertir video a video
303
+ def convert_video_to_video_app(video_file, audio_file_traslated, logs_file, return_video_traslated, output_video_ext=output_video_format):
304
+ print("Procesando video " + video_file + " para traducirlo...")
305
+ logs_file.write("Procesando video " + video_file + " para traducirlo...\n")
306
+ video_traslated = video_to_video(video_file, audio_file_traslated, output_video_ext,logs_file)
307
+ return_video_traslated[0] = video_traslated
308
+ #return video_traslated
309
+
310
+ # v2vs: Convertir video a video subtitulado
311
+ def convert_video_to_video_subtitled_app(video_file, text_translated, logs_file, return_video_subtitled, output_video_ext=output_video_format):
312
+ print("Procesando video " + video_file + " para subtitularlo...")
313
+ logs_file.write("Procesando video " + video_file + " para subtitularlo...\n")
314
+ video_subtitled = video_to_video_subtitled(video_file, text_translated, output_video_ext, logs_file)
315
+ return_video_subtitled[0] = video_subtitled
316
+ #return video_subtitled
317
+
318
+ # *************************** INTERFAZ ***************************
319
+ # Entradas y salidas en la interfaz Gradio
320
+ lang_input = gr.components.Dropdown(choices=[lang["lang"] for lang in langs], label="Selecciona el idioma al cual deseas traducir:*")
321
+
322
+ #video_input_file = gr.Video(label= "Noticias Caracol", value="D:/Noticias/noticias_caracol_long.mp4")
323
+ video_input_file = gr.Video()
324
+ video_input_file = gr.Video(label= "Noticias Caracol", source="upload")
325
+ video_input_webcam = gr.Video(label= "Noticias Caracol en vivo", source="webcam", include_audio=1)
326
+ #audio_input_file = gr.Audio(label="Blue Radio", value="D:/Noticias/caracol_radio.mp3")
327
+ audio_input_file = gr.Audio(label="Blue Radio", source="upload", type="filepath")
328
+ audio_input_microphone = gr.Audio(label="Blue Radio en vivo", source="microphone", type="filepath")
329
+ text_input = gr.components.Textbox(label="Noticia a traducir:")
330
+ output_text_transcribed = gr.components.Textbox(label="Transcripción")
331
+ output_text_traslated = gr.components.Textbox(label="Traducción")
332
+ output_audio = gr.components.Audio(label="Audio traducido", format=output_audio_format)
333
+ output_video_subtitled = gr.components.Video(label="Noticia subtitulada", format=output_video_format)
334
+ output_video_traslated = gr.components.Video(label="Noticia traducida", format=output_video_format)
335
+
336
+ """""""""
337
+ embed_html = '<iframe width="560" height="315" src="https://www.youtube.com/embed/EngW7tLk6R8" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'
338
+ with gr.Blocks() as interface:
339
+ gr.HTML(embed_html)
340
+ """""""""
341
+
342
+ # Crea la interfaz Gradio para multimedia_to_multimedia_app
343
+ interface = gr.Interface(
344
+ fn=multimedia_to_multimedia_app,
345
+ inputs=[lang_input, video_input_file, audio_input_file, video_input_webcam, audio_input_microphone, text_input],
346
+ outputs=[output_text_transcribed, output_text_traslated, output_audio, output_video_subtitled, output_video_traslated],
347
+ title="TRADUCTOR MULTILENGUA DE NOTICIAS | AYTÉ - CARACOL",
348
+ description="Ingresa la noticia que deseas traducir:",
349
+ #theme = gr.themes.Soft()
350
+ theme=gr.themes.Default(primary_hue="blue")
351
+ )
352
+ #interface.launch() # Lanza la interfaz
353
+ #interface.launch(share=True, auth=("caracol", "caracol"), server_name=("127.0.0.1"), server_port=(7860), favicon_path=())
354
+ interface.launch(share=True, auth=("caracol", "caracol"), server_name=("127.0.0.1"), server_port=(7860))
assets/images/favico.ico ADDED
assets/images/icono.png ADDED
assets/images/logo.png ADDED
assets/styles/css.css ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ h1 {
2
+ color: orange;
3
+ }
audio_output.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46d0e34a258a63efae8713b1b054d0875819a225531f988aac50e3751c6a394a
3
+ size 1763386
data/eng/D_100000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04b1d7a2726b3cb27c18604ace828556d9c17c09f65eb041f690a89c99d7aea4
3
+ size 561110135
data/eng/G_100000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d276cee0f8291de23c8ed4f4a2ed15e3e4cff7b2d6af43660cd6b5e6e1149110
3
+ size 436618116
data/eng/config.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 1234,
6
+ "epochs": 20000,
7
+ "learning_rate": 0.0002,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 64,
14
+ "fp16_run": true,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 8192,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0
21
+ },
22
+ "data": {
23
+ "training_files": "train.ltr",
24
+ "validation_files": "dev.ltr",
25
+ "text_cleaners": [
26
+ "transliteration_cleaners"
27
+ ],
28
+ "max_wav_value": 32768.0,
29
+ "sampling_rate": 16000,
30
+ "filter_length": 1024,
31
+ "hop_length": 256,
32
+ "win_length": 1024,
33
+ "n_mel_channels": 80,
34
+ "mel_fmin": 0.0,
35
+ "mel_fmax": null,
36
+ "add_blank": true,
37
+ "n_speakers": 0,
38
+ "cleaned_text": true
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2
76
+ ],
77
+ "upsample_initial_channel": 512,
78
+ "upsample_kernel_sizes": [
79
+ 16,
80
+ 16,
81
+ 4,
82
+ 4
83
+ ],
84
+ "n_layers_q": 3,
85
+ "use_spectral_norm": false
86
+ }
87
+ }
data/eng/vocab.txt ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ k
2
+ '
3
+ z
4
+ y
5
+ u
6
+ d
7
+ h
8
+ e
9
+ s
10
+ w
11
+
12
+ 3
13
+ c
14
+ p
15
+ -
16
+ 1
17
+ j
18
+ m
19
+ i
20
+
21
+ f
22
+ l
23
+ o
24
+ 0
25
+ b
26
+ r
27
+ a
28
+ 4
29
+ 2
30
+ n
31
+ _
32
+ x
33
+ v
34
+ t
35
+ q
36
+ 5
37
+ 6
38
+ g
data/gum/D_100000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0755e583b0b39fe2cc3cf7dfd5c4c9d184de3c83bf562281c7fa23a272bcf9d2
3
+ size 561109839
data/gum/G_100000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea406973d9699d994463477d4adfeada83625459e1fa606b7cc7e0593f4c31c2
3
+ size 436625202
data/gum/config.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 1234,
6
+ "epochs": 20000,
7
+ "learning_rate": 0.0002,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 64,
14
+ "fp16_run": true,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 8192,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0
21
+ },
22
+ "data": {
23
+ "training_files": "train.ltr",
24
+ "validation_files": "dev.ltr",
25
+ "text_cleaners": [
26
+ "transliteration_cleaners"
27
+ ],
28
+ "max_wav_value": 32768.0,
29
+ "sampling_rate": 16000,
30
+ "filter_length": 1024,
31
+ "hop_length": 256,
32
+ "win_length": 1024,
33
+ "n_mel_channels": 80,
34
+ "mel_fmin": 0.0,
35
+ "mel_fmax": null,
36
+ "add_blank": true,
37
+ "n_speakers": 0,
38
+ "cleaned_text": true
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2
76
+ ],
77
+ "upsample_initial_channel": 512,
78
+ "upsample_kernel_sizes": [
79
+ 16,
80
+ 16,
81
+ 4,
82
+ 4
83
+ ],
84
+ "n_layers_q": 3,
85
+ "use_spectral_norm": false
86
+ }
87
+ }
data/gum/vocab.txt ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ |
2
+ a
3
+ i
4
+ r
5
+ n
6
+ e
7
+ ø
8
+ u
9
+ g
10
+ m
11
+ b
12
+ t
13
+ s
14
+ k
15
+ h
16
+ c
17
+ l
18
+ w
19
+ p
20
+ y
21
+ d
22
+ o
23
+ ñ
24
+ ú
25
+ j
26
+
27
+ í
28
+ z
29
+ é
30
+ á
31
+ '
32
+ f
33
+ v
34
+ -
35
+ ó
36
+ q
37
+ 0
38
+ x
39
+ 1
40
+ 2
41
+ 4
42
+ 3
43
+
data/quz/D_100000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22fd86a89725af83c7faf37d3824db296563871f8d357e07578f6183a992ffb0
3
+ size 561078748
data/quz/G_100000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1723774e696a2f11c58ce5e89a2ee2b47aad65955b0abae3d8865af28adf9364
3
+ size 436378676
data/quz/config.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 1234,
6
+ "epochs": 20000,
7
+ "learning_rate": 0.0002,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 64,
14
+ "fp16_run": true,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 8192,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0
21
+ },
22
+ "data": {
23
+ "training_files": "train.ltr",
24
+ "validation_files": "dev.ltr",
25
+ "text_cleaners": [
26
+ "transliteration_cleaners"
27
+ ],
28
+ "max_wav_value": 32768.0,
29
+ "sampling_rate": 16000,
30
+ "filter_length": 1024,
31
+ "hop_length": 256,
32
+ "win_length": 1024,
33
+ "n_mel_channels": 80,
34
+ "mel_fmin": 0.0,
35
+ "mel_fmax": null,
36
+ "add_blank": true,
37
+ "n_speakers": 0,
38
+ "cleaned_text": true
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2
76
+ ],
77
+ "upsample_initial_channel": 512,
78
+ "upsample_kernel_sizes": [
79
+ 16,
80
+ 16,
81
+ 4,
82
+ 4
83
+ ],
84
+ "n_layers_q": 3,
85
+ "use_spectral_norm": false
86
+ }
87
+ }
data/quz/vocab.txt ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ a
2
+ |
3
+ n
4
+ i
5
+ u
6
+ q
7
+ k
8
+ s
9
+ h
10
+ p
11
+ y
12
+ c
13
+ t
14
+ m
15
+ r
16
+ l
17
+ o
18
+ w
19
+ e
20
+ ñ
21
+ '
22
+ d
23
+ j
24
+ g
25
+ b
26
+ -
27
+
28
+ v
29
+ f
30
+ í
31
+ z
32
+ é
33
+ á
34
+ ó
35
+ ú
36
+ x
37
+
data/spa/D_100000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07ab22829d36992fc47d7fde4d9e1313f2a8108d2442d489a0953b1910628d7a
3
+ size 561110151
data/spa/G_100000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8feb91089b706e231efb18d0038f5827f1a9d1e45c57c61fba7ebe2198a7c1e6
3
+ size 436635085
data/spa/config.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 1234,
6
+ "epochs": 20000,
7
+ "learning_rate": 0.0002,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 64,
14
+ "fp16_run": true,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 8192,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0
21
+ },
22
+ "data": {
23
+ "training_files": "train.ltr",
24
+ "validation_files": "dev.ltr",
25
+ "text_cleaners": [
26
+ "transliteration_cleaners"
27
+ ],
28
+ "max_wav_value": 32768.0,
29
+ "sampling_rate": 16000,
30
+ "filter_length": 1024,
31
+ "hop_length": 256,
32
+ "win_length": 1024,
33
+ "n_mel_channels": 80,
34
+ "mel_fmin": 0.0,
35
+ "mel_fmax": null,
36
+ "add_blank": true,
37
+ "n_speakers": 0,
38
+ "cleaned_text": true
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2
76
+ ],
77
+ "upsample_initial_channel": 512,
78
+ "upsample_kernel_sizes": [
79
+ 16,
80
+ 16,
81
+ 4,
82
+ 4
83
+ ],
84
+ "n_layers_q": 3,
85
+ "use_spectral_norm": false
86
+ }
87
+ }
data/spa/vocab.txt ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 7
2
+ a
3
+ v
4
+ c
5
+
6
+ 0
7
+ 5
8
+ ó
9
+ 8
10
+ p
11
+ y
12
+ z
13
+ 4
14
+ m
15
+ ü
16
+ k
17
+ s
18
+ á
19
+ q
20
+ h
21
+ n
22
+ é
23
+ _
24
+ 9
25
+ 1
26
+ f
27
+ t
28
+
29
+ x
30
+ d
31
+ í
32
+ b
33
+ 3
34
+ j
35
+ g
36
+ l
37
+ 2
38
+ i
39
+ u
40
+ e
41
+ ú
42
+ o
43
+ ñ
44
+ r
45
+ 6
logs.txt ADDED
File without changes
output.wav ADDED
Binary file (359 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio
2
+ SpeechRecognition
3
+ ttsmms
4
+ deep_translator
video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5508d3536f55aa02a9cde9a8326799e72f0b148003d50936e735dd23c40cd3ba
3
+ size 2476504
video_subtitles.srt ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 1
2
+ 00:00:00,001 --> 00:00:01,000 -->
3
+ <b>Can you</b>
4
+ 2
5
+ 00:00:01,001 --> 00:00:02,000 -->
6
+ <b>imagine leaving</b>
7
+ 3
8
+ 00:00:02,001 --> 00:00:03,000 -->
9
+ <b>home and</b>
10
+ 4
11
+ 00:00:03,001 --> 00:00:04,000 -->
12
+ <b>finding pieces</b>
13
+ 5
14
+ 00:00:04,001 --> 00:00:05,000 -->
15
+ <b>of a</b>
16
+ 6
17
+ 00:00:05,001 --> 00:00:06,000 -->
18
+ <b>plane? Well,</b>
19
+ 7
20
+ 00:00:06,001 --> 00:00:07,000 -->
21
+ <b>that happened</b>
22
+ 8
23
+ 00:00:07,001 --> 00:00:08,000 -->
24
+ <b>in a</b>
25
+ 9
26
+ 00:00:08,001 --> 00:00:09,000 -->
27
+ <b>Chicago neighborhood,</b>
28
+ 10
29
+ 00:00:09,001 --> 00:00:10,000 -->
30
+ <b>where an</b>
31
+ 11
32
+ 00:00:10,001 --> 00:00:11,000 -->
33
+ <b>emergency evacuation</b>
34
+ 12
35
+ 00:00:11,001 --> 00:00:12,000 -->
36
+ <b>slide was</b>
37
+ 13
38
+ 00:00:12,001 --> 00:00:13,000 -->
39
+ <b>found that</b>
40
+ 14
41
+ 00:00:13,001 --> 00:00:14,000 -->
42
+ <b>had detached</b>
43
+ 15
44
+ 00:00:14,001 --> 00:00:15,000 -->
45
+ <b>from an</b>
46
+ 16
47
+ 00:00:15,001 --> 00:00:16,000 -->
48
+ <b>aircraft that</b>
49
+ 17
50
+ 00:00:16,001 --> 00:00:17,000 -->
51
+ <b>hit you</b>
52
+ 18
53
+ 00:00:17,001 --> 00:00:18,000 -->
54
+ <b>at the</b>
55
+ 19
56
+ 00:00:18,001 --> 00:00:19,000 -->
57
+ <b>International Airport.</b>
58
+ 20
59
+ 00:00:19,001 --> 00:00:20,000 -->
60
+ <b>Authorities confirmed</b>
61
+ 21
62
+ 00:00:20,001 --> 00:00:21,000 -->
63
+ <b>that there</b>
64
+ 22
65
+ 00:00:21,001 --> 00:00:22,000 -->
66
+ <b>were no</b>
67
+ 23
68
+ 00:00:22,001 --> 00:00:23,000 -->
69
+ <b>injuries. The</b>
70
+ 24
71
+ 00:00:23,001 --> 00:00:24,000 -->
72
+ <b>large piece</b>
73
+ 25
74
+ 00:00:24,001 --> 00:00:25,000 -->
75
+ <b>of plastic</b>
76
+ 26
77
+ 00:00:25,001 --> 00:00:26,000 -->
78
+ <b>was removed</b>
79
+ 27
80
+ 00:00:26,001 --> 00:00:27,000 -->
81
+ <b>and later</b>
82
+ 28
83
+ 00:00:27,001 --> 00:00:28,000 -->
84
+ <b>it was</b>
85
+ 29
86
+ 00:00:28,001 --> 00:00:29,000 -->
87
+ <b>determined that</b>
88
+ 30
89
+ 00:00:29,001 --> 00:00:30,000 -->
90
+ <b>it belonged</b>
91
+ 31
92
+ 00:00:30,001 --> 00:00:31,000 -->
93
+ <b>to a</b>
94
+ 32
95
+ 00:00:31,001 --> 00:00:32,000 -->
96
+ <b>United Airlines</b>
97
+ 33
98
+ 00:00:32,001 --> 00:00:33,000 -->
99
+ <b>plane from</b>
100
+ 34
101
+ 00:00:33,001 --> 00:00:34,000 -->
102
+ <b>Switzerland that</b>
103
+ 35
104
+ 00:00:34,001 --> 00:00:35,000 -->
105
+ <b>landed safely</b>
106
+ 36
107
+ 00:00:35,001 --> 00:00:36,000 -->
108
+ <b>with 155</b>
109
+ 37
110
+ 00:00:36,001 --> 00:00:37,000 -->
111
+ <b>passengers and</b>
112
+ 38
113
+ 00:00:37,001 --> 00:00:38,000 -->
114
+ <b>10 crew</b>
115
+ 39
116
+ 00:00:38,001 --> 00:00:39,000 -->
117
+ <b>members.</b>