victan committed on
Commit
fd3583e
1 Parent(s): c9be1e7

Upload app.py with huggingface_hub

Files changed (1)
  app.py +567 -0
app.py ADDED
@@ -0,0 +1,567 @@
#!/usr/bin/env python
# Copyright (c) Meta Platforms, Inc. and affiliates
# All rights reserved.
#
# This source code is licensed under the license found in the
# MIT_LICENSE file in the root directory of this source tree.

import os
import pathlib
import tempfile

from pydub import AudioSegment, silence
import gradio as gr
import torch
import torchaudio
from fairseq2.assets import InProcAssetMetadataProvider, asset_store
from fairseq2.data import Collater, SequenceData, VocabularyInfo
from fairseq2.data.audio import (
    AudioDecoder,
    WaveformToFbankConverter,
    WaveformToFbankOutput,
)
from fairseq2.generation import NGramRepeatBlockProcessor
from fairseq2.memory import MemoryBlock
from fairseq2.typing import DataType, Device
from huggingface_hub import snapshot_download
from seamless_communication.inference import (
    BatchedSpeechOutput,
    SequenceGeneratorOptions,
    Translator,
)
from seamless_communication.models.generator.loader import load_pretssel_vocoder_model
from seamless_communication.models.unity import (
    UnitTokenizer,
    load_gcmvn_stats,
    load_unity_text_tokenizer,
    load_unity_unit_tokenizer,
)
from torch.nn import Module
from seamless_communication.cli.expressivity.evaluate.pretssel_inference_helper import PretsselGenerator

from utils import LANGUAGE_CODE_TO_NAME

DESCRIPTION = """\
# Seamless Expressive
[SeamlessExpressive](https://github.com/facebookresearch/seamless_communication) is a speech-to-speech translation model that captures certain underexplored aspects of prosody, such as speech rate and pauses, while preserving the style of one's voice and high content translation quality.
"""

CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1" and torch.cuda.is_available()

CHECKPOINTS_PATH = pathlib.Path(os.getenv("CHECKPOINTS_PATH", "/workspace/seamless_communication/demo/expressive/models"))
if not CHECKPOINTS_PATH.exists():
    snapshot_download(repo_id="facebook/seamless-expressive", repo_type="model", local_dir=CHECKPOINTS_PATH)
    snapshot_download(repo_id="facebook/seamless-m4t-v2-large", repo_type="model", local_dir=CHECKPOINTS_PATH)

# Ensure that we do not have any other environment resolvers and always return
# "demo" for demo purposes.
asset_store.env_resolvers.clear()
asset_store.env_resolvers.append(lambda: "demo")

# Construct an `InProcAssetMetadataProvider` with environment-specific metadata
# that overrides the regular metadata for the "demo" environment. Note the "@demo" suffix.
demo_metadata = [
    {
        "name": "seamless_expressivity@demo",
        "checkpoint": f"file://{CHECKPOINTS_PATH}/m2m_expressive_unity.pt",
        "char_tokenizer": f"file://{CHECKPOINTS_PATH}/spm_char_lang38_tc.model",
    },
    {
        "name": "vocoder_pretssel@demo",
        "checkpoint": f"file://{CHECKPOINTS_PATH}/pretssel_melhifigan_wm-final.pt",
    },
    {
        "name": "seamlessM4T_v2_large@demo",
        "checkpoint": f"file://{CHECKPOINTS_PATH}/seamlessM4T_v2_large.pt",
        "char_tokenizer": f"file://{CHECKPOINTS_PATH}/spm_char_lang38_tc.model",
    },
]

asset_store.metadata_providers.append(InProcAssetMetadataProvider(demo_metadata))

LANGUAGE_NAME_TO_CODE = {v: k for k, v in LANGUAGE_CODE_TO_NAME.items()}


if torch.cuda.is_available():
    device = torch.device("cuda:0")
    dtype = torch.float16
else:
    device = torch.device("cpu")
    dtype = torch.float32


MODEL_NAME = "seamless_expressivity"
VOCODER_NAME = "vocoder_pretssel"

# Used for ASR: produces the source transcription fed to the mintox toxicity check.
m4t_translator = Translator(
    model_name_or_card="seamlessM4T_v2_large",
    vocoder_name_or_card=None,
    device=device,
    dtype=dtype,
)
unit_tokenizer = load_unity_unit_tokenizer(MODEL_NAME)

_gcmvn_mean, _gcmvn_std = load_gcmvn_stats(VOCODER_NAME)
gcmvn_mean = torch.tensor(_gcmvn_mean, device=device, dtype=dtype)
gcmvn_std = torch.tensor(_gcmvn_std, device=device, dtype=dtype)

translator = Translator(
    MODEL_NAME,
    vocoder_name_or_card=None,
    device=device,
    dtype=dtype,
    apply_mintox=False,
)

text_generation_opts = SequenceGeneratorOptions(
    beam_size=5,
    unk_penalty=torch.inf,
    soft_max_seq_len=(0, 200),
    step_processor=NGramRepeatBlockProcessor(
        ngram_size=10,
    ),
)
m4t_text_generation_opts = SequenceGeneratorOptions(
    beam_size=5,
    unk_penalty=torch.inf,
    soft_max_seq_len=(1, 200),
    step_processor=NGramRepeatBlockProcessor(
        ngram_size=10,
    ),
)

pretssel_generator = PretsselGenerator(
    VOCODER_NAME,
    vocab_info=unit_tokenizer.vocab_info,
    device=device,
    dtype=dtype,
)
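
# How the pieces above fit together (per the calls made later in this file):
#   1. m4t_translator (SeamlessM4T v2) runs S2TT on each segment to obtain a
#      source transcription, passed as `src_text` for the mintox check.
#   2. translator (SeamlessExpressive) runs S2ST, producing target text and
#      discrete speech units conditioned on `prosody_encoder_input`.
#   3. pretssel_generator vocodes the units back into a waveform (expressive
#      PRETSSEL vocoder, checkpoint pretssel_melhifigan_wm-final.pt).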

decode_audio = AudioDecoder(dtype=torch.float32, device=device)

convert_to_fbank = WaveformToFbankConverter(
    num_mel_bins=80,
    waveform_scale=2**15,
    channel_last=True,
    standardize=False,
    device=device,
    dtype=dtype,
)


def normalize_fbank(data: WaveformToFbankOutput) -> WaveformToFbankOutput:
    fbank = data["fbank"]
    std, mean = torch.std_mean(fbank, dim=0)
    data["fbank"] = fbank.subtract(mean).divide(std)
    data["gcmvn_fbank"] = fbank.subtract(gcmvn_mean).divide(gcmvn_std)
    return data
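
# `normalize_fbank` keeps two views of the same features: "fbank" is normalized
# with utterance-level statistics and fed to the translators, while
# "gcmvn_fbank" uses the global cepstral mean/variance stats loaded for the
# vocoder and is fed to the prosody encoder as `prosody_encoder_input`.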

collate = Collater(pad_value=0, pad_to_multiple=1)


AUDIO_SAMPLE_RATE = 16000
MAX_INPUT_AUDIO_LENGTH = 10  # in seconds


def adjust_audio_duration(input_audio_path, output_audio_path):
    input_audio = AudioSegment.from_file(input_audio_path)
    output_audio = AudioSegment.from_file(output_audio_path)

    input_duration = len(input_audio)
    output_duration = len(output_audio)

    # Compute the duration difference
    duration_diff = input_duration - output_duration

    # Pad the end with silence if the output audio is shorter
    if duration_diff > 0:
        print("Duration diff:", duration_diff)
        silence_pad = AudioSegment.silent(duration=duration_diff)
        output_audio += silence_pad

    # Save the adjusted audio
    output_audio.export(output_audio_path, format="wav")

    return output_audio_path
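
# Note: only the shorter case is handled; if the translated audio comes out
# longer than the input, it is returned unchanged rather than trimmed.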


import yt_dlp


def downloadYoutubeAudio(url):
    print("Downloading the YouTube audio...")
    ydl_opts = {
        'format': 'm4a/bestaudio/best',
        'outtmpl': os.getcwd() + "/audio",  # Output path (the postprocessor appends .wav)
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',  # Convert to WAV
        }]
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        error_code = ydl.download([url])

    if error_code == 0:
        print("Saving the audio file...")
        print("download_finished:", os.getcwd() + "/audio.wav")
    else:
        print("error: download failed...")

    return os.getcwd() + "/audio.wav"
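
# Assumption: the FFmpegExtractAudio postprocessor requires an ffmpeg binary on
# PATH; without it yt-dlp can download the m4a but will not produce audio.wav.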


def split_audio(input_audio_path):
    print("Start Split Audio")
    audio = AudioSegment.from_file(input_audio_path)
    silence_thresh = -20  # Silence threshold in dBFS
    min_silence_len = 300  # Minimum silence duration in ms

    chunks = []
    current_chunk = AudioSegment.silent(duration=0)
    for ms in range(0, len(audio), 10):  # Step through the audio in 10 ms increments
        segment = audio[ms:ms + 10]
        current_chunk += segment

        if len(current_chunk) >= 8000:  # Once the current chunk exceeds 8 seconds
            # Check whether the tail contains a silence
            if silence.detect_silence(current_chunk[-min_silence_len:], min_silence_len=min_silence_len, silence_thresh=silence_thresh):
                # Cut at the silence
                print("Silence detected, splitting the segment")
                chunks.append(current_chunk)
                current_chunk = AudioSegment.silent(duration=0)

        if len(current_chunk) >= 8900:  # Hard cap once the duration exceeds 8.9 seconds
            print("Maximum duration reached, splitting the segment")
            chunks.append(current_chunk)
            current_chunk = AudioSegment.silent(duration=0)

    # Append the last chunk if it is not empty
    if len(current_chunk) > 0:
        chunks.append(current_chunk)

    print("Number of valid segments:", len(chunks))
    return chunks
+
246
+
247
+
248
+
249
+ def remove_prosody_tokens_from_text(text):
250
+ # filter out prosody tokens, there is only emphasis '*', and pause '='
251
+ text = text.replace("*", "").replace("=", "")
252
+ text = " ".join(text.split())
253
+ return text
254
+
255
+
256
+
257
+
258
+
259
+
260
+ import torchaudio
261
+
262
+ AUDIO_SAMPLE_RATE = 16000 # Taux d'échantillonnage standard
263
+
264
+ def preprocess_audio(input_audio_path: str):
265
+ print("preprocess_audio start")
266
+ print("Audio Path :", input_audio_path)
267
+ audio_segments = split_audio(input_audio_path)
268
+ temp_folder = os.path.join(os.getcwd(), "path_to_temp_folder")
269
+ os.makedirs(temp_folder, exist_ok=True)
270
+ segment_paths = []
271
+
272
+ for i, segment in enumerate(audio_segments):
273
+ segment_path = os.path.join(temp_folder, f"segment_{i}.wav")
274
+ segment_audio = segment.get_array_of_samples()
275
+ segment_tensor = torch.tensor(segment_audio).unsqueeze(0).float()
276
+
277
+ # Rééchantillonnage
278
+ segment_tensor = torchaudio.functional.resample(segment_tensor, orig_freq=segment.frame_rate, new_freq=AUDIO_SAMPLE_RATE)
279
+
280
+ torchaudio.save(segment_path, segment_tensor, sample_rate=AUDIO_SAMPLE_RATE)
281
+ segment_paths.append(segment_path)
282
+ print("path for :", segment_path)
283
+
284
+ return segment_paths
285
+
286
+
287
+
288
+ import os
289
+ import torchaudio
290
+
291
+ # Constante pour le taux d'échantillonnage
292
+ AUDIO_SAMPLE_RATE = 16000
293
+
294
+ def preprocess_audio22(input_audio_path: str):
295
+ print("preprocess_audio start")
296
+ print("Audio Path :", input_audio_path)
297
+
298
+ # Appeler split_audio et obtenir les segments
299
+ audio_segments = split_audio(input_audio_path)
300
+
301
+ # Créer un dossier temporaire pour stocker les segments
302
+ temp_folder = os.path.join(os.getcwd(), "path_to_temp_folder")
303
+ os.makedirs(temp_folder, exist_ok=True)
304
+
305
+ segment_paths = []
306
+ for i, segment in enumerate(audio_segments):
307
+ # Exporter chaque segment dans un fichier temporaire
308
+ temp_segment_path = os.path.join(temp_folder, f"temp_segment_{i}.wav")
309
+ segment.export(temp_segment_path, format="wav")
310
+
311
+ # Charger et traiter le segment audio
312
+ arr, org_sr = torchaudio.load(temp_segment_path)
313
+ new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
314
+
315
+ # Enregistrer le segment traité
316
+ segment_path = os.path.join(temp_folder, f"segment_{i}.wav")
317
+ torchaudio.save(segment_path, new_arr, sample_rate=AUDIO_SAMPLE_RATE)
318
+
319
+ # Ajouter le chemin du segment traité à la liste
320
+ segment_paths.append(segment_path)
321
+ print("Path for :", segment_path)
322
+
323
+ return segment_paths
324
+
325
+
326
+ def preprocess_audio222(input_audio_path: str):
327
+ # Appeler split_audio et obtenir les segments
328
+ print("preprocess_audio start")
329
+ print("Audio Path :",input_audio_path)
330
+ audio_segments = split_audio(input_audio_path)
331
+ temp_folder = os.getcwd()+"/path_to_temp_folder"
332
+ os.makedirs(temp_folder, exist_ok=True)
333
+ segment_paths = []
334
+ for i, segment in enumerate(audio_segments):
335
+ segment_path = os.path.join(temp_folder, f"segment_{i}.wav")
336
+ segment.export(segment_path, format="wav")
337
+ segment_paths.append(segment_path)
338
+ print("path for : ",segment_path)
339
+
340
+ return segment_paths
341
+
342
+
343
+
344
+
345
+ def process_segment(segment_path, source_language_code, target_language_code):
346
+ # preprocess_audio(segment_path) - cette ligne peut ne pas être nécessaire si le segment est déjà prétraité
347
+
348
+ with pathlib.Path(segment_path).open("rb") as fb:
349
+ block = MemoryBlock(fb.read())
350
+ example = decode_audio(block)
351
+
352
+ example = convert_to_fbank(example)
353
+ example = normalize_fbank(example)
354
+ example = collate(example)
355
+
356
+ # Transcription pour mintox
357
+ source_sentences, _ = m4t_translator.predict(
358
+ input=example["fbank"],
359
+ task_str="S2TT",
360
+ tgt_lang=source_language_code,
361
+ text_generation_opts=m4t_text_generation_opts,
362
+ )
363
+ source_text = str(source_sentences[0])
364
+
365
+ prosody_encoder_input = example["gcmvn_fbank"]
366
+ text_output, unit_output = translator.predict(
367
+ example["fbank"],
368
+ "S2ST",
369
+ tgt_lang=target_language_code,
370
+ src_lang=source_language_code,
371
+ text_generation_opts=text_generation_opts,
372
+ unit_generation_ngram_filtering=False,
373
+ duration_factor=1.0,
374
+ prosody_encoder_input=prosody_encoder_input,
375
+ src_text=source_text,
376
+ )
377
+ speech_output = pretssel_generator.predict(
378
+ unit_output.units,
379
+ tgt_lang=target_language_code,
380
+ prosody_encoder_input=prosody_encoder_input,
381
+ )
382
+
383
+ # Chemin pour enregistrer l'audio du segment
384
+ segment_output_audio_path = os.path.join(os.getcwd(), "result", f"segment_audio_{os.path.basename(segment_path)}")
385
+ os.makedirs(os.path.dirname(segment_output_audio_path), exist_ok=True)
386
+
387
+ # Enregistrer l'audio du segment
388
+ torchaudio.save(
389
+ segment_output_audio_path,
390
+ speech_output.audio_wavs[0][0].to(torch.float32).cpu(),
391
+ sample_rate=speech_output.sample_rate,
392
+ )
393
+ segment_output_audio_path = adjust_audio_duration(segment_path, segment_output_audio_path)
394
+
395
+
396
+ text_out = remove_prosody_tokens_from_text(str(text_output[0]))
397
+ print("Audio ici : ",segment_output_audio_path)
398
+ return segment_output_audio_path, text_out
399
+
400
+
401
+ #---------------------------_#
402
+
403
+
404
+ from typing import Tuple
405
+
406
+ def run2(
407
+ input_audio_path: str,
408
+ source_language: str,
409
+ target_language: str,
410
+ ) -> Tuple[str, str]:
411
+ target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
412
+ source_language_code = LANGUAGE_NAME_TO_CODE[source_language]
413
+
414
+ preprocess_audio(input_audio_path)
415
+
416
+ with pathlib.Path(input_audio_path).open("rb") as fb:
417
+ block = MemoryBlock(fb.read())
418
+ example = decode_audio(block)
419
+
420
+ example = convert_to_fbank(example)
421
+ example = normalize_fbank(example)
422
+ example = collate(example)
423
+
424
+ # get transcription for mintox
425
+ source_sentences, _ = m4t_translator.predict(
426
+ input=example["fbank"],
427
+ task_str="S2TT", # get source text
428
+ tgt_lang=source_language_code,
429
+ text_generation_opts=m4t_text_generation_opts,
430
+ )
431
+ source_text = str(source_sentences[0])
432
+
433
+ prosody_encoder_input = example["gcmvn_fbank"]
434
+ text_output, unit_output = translator.predict(
435
+ example["fbank"],
436
+ "S2ST",
437
+ tgt_lang=target_language_code,
438
+ src_lang=source_language_code,
439
+ text_generation_opts=text_generation_opts,
440
+ unit_generation_ngram_filtering=False,
441
+ duration_factor=1.0,
442
+ prosody_encoder_input=prosody_encoder_input,
443
+ src_text=source_text, # for mintox check
444
+ )
445
+ speech_output = pretssel_generator.predict(
446
+ unit_output.units,
447
+ tgt_lang=target_language_code,
448
+ prosody_encoder_input=prosody_encoder_input,
449
+ )
450
+
451
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
452
+ torchaudio.save(
453
+ f.name,
454
+ speech_output.audio_wavs[0][0].to(torch.float32).cpu(),
455
+ sample_rate=speech_output.sample_rate,
456
+ )
457
+
458
+ text_out = remove_prosody_tokens_from_text(str(text_output[0]))
459
+
460
+ return f.name, text_out
461
+
462
+
463
+
464
+
465
+
466
+
467
+
468
+
469
+
470
+
471
+ #---------------------------------------------------------_#
472
+ #----------------------------------------------------------#
473
+
474
+
475
+
476
+
477
+
478
+
479
+
480
+
481
+
482
+ #----------------------------------------------__#------
483
+
484
+
485
+
486
+
487
+
488
+
489
+
490
+
491
+
492
+
493
+ #-----------------------#
494
+
495
+
496
+ def run(input_audio_path: str, source_language: str, target_language: str) -> tuple[str, str]:
497
+ target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
498
+ source_language_code = LANGUAGE_NAME_TO_CODE[source_language]
499
+
500
+ segment_paths = preprocess_audio22(input_audio_path)
501
+ print("preprocess_audio end")
502
+ final_text = ""
503
+ final_audio = AudioSegment.silent(duration=0)
504
+
505
+
506
+ for segment_path in segment_paths:
507
+ segment_audio_path, segment_text = process_segment(segment_path, source_language_code, target_language_code)
508
+ final_text += segment_text + " "
509
+ segment_audio = AudioSegment.from_file(segment_audio_path)
510
+ final_audio += segment_audio
511
+
512
+ output_audio_path = os.path.join(os.getcwd(), "result", "audio.wav")
513
+ os.makedirs(os.path.dirname(output_audio_path), exist_ok=True)
514
+ final_audio.export(output_audio_path, format="wav")
515
+
516
+ text_out = remove_prosody_tokens_from_text(final_text.strip())
517
+
518
+ return output_audio_path, text_out
519
+
520
+
521
+
522
+
523
+
524
+ TARGET_LANGUAGE_NAMES = [
525
+ "English",
526
+ "French",
527
+ "German",
528
+ "Spanish",
529
+ ]
530
+
531
+
532
+ from flask import Flask, request, jsonify
533
+ import torch
534
+ import torchaudio
535
+
536
+ app = Flask(__name__)
537
+ # Fonction run adaptée pour Flask
538
+ @app.route('/translate', methods=['POST'])
539
+ def translate():
540
+ # Récupérer les données de la requête
541
+ data = request.json
542
+ input_audio_path = data['input_audio_path']
543
+ source_language = data['source_language']
544
+ target_language = data['target_language']
545
+
546
+ # Exécution de la fonction de traduction
547
+ output_audio_path, output_text = run(input_audio_path, source_language, target_language)
548
+
549
+ # Retourner la réponse
550
+ return jsonify({
551
+ 'output_audio_path': output_audio_path,
552
+ 'output_text': output_text
553
+ })
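
# The Flask server is defined but never started below; the script instead runs a
# one-off translation at import time. A minimal sketch of serving and calling the
# endpoint (hypothetical host/port, assumes the `requests` package is installed):
#
#   app.run(host="0.0.0.0", port=5000)
#
#   import requests
#   resp = requests.post(
#       "http://localhost:5000/translate",
#       json={
#           "input_audio_path": "/path/to/input.wav",
#           "source_language": "French",
#           "target_language": "English",
#       },
#   )
#   print(resp.json()["output_audio_path"], resp.json()["output_text"])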


url = "https://youtu.be/qb_tHWGJOp8?si=10qB2JApy0q3XY76"
input_audio_path = downloadYoutubeAudio(url)

# input_audio_path = os.getcwd() + "/au1min_Vocals_finale.wav"
source_language = "French"
target_language = "English"
print("Audio to process:", input_audio_path)
output_audio_path, output_text = run(input_audio_path, source_language, target_language)

print("output_audio_path:", output_audio_path)