Pengwei Li committed on
Commit
9adb123
1 Parent(s): 49a4ef8

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +51 -15
README.md CHANGED
@@ -17,17 +17,34 @@ widget:
17
  src: https://huggingface.co/facebook/xm_transformer_600m-es_en-multi_domain/resolve/main/common_voice_es_19966634.flac
18
  ---
19
 
 
 
 
 
 
 
 
20
  ## Usage
21
  ```python
22
- from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
23
- from fairseq.models.text_to_speech.hub_interface import S2THubInterface
24
- from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
 
25
  import IPython.display as ipd
 
 
 
 
 
 
 
 
 
26
  import torchaudio
27
 
28
 
29
  models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
30
- "facebook/xm_transformer_600m-es_en-multi_domain",
31
  arg_overrides={"config_yaml": "config.yaml"},
32
  )
33
  model = models[0]
@@ -38,22 +55,41 @@ generator = task.build_generator(model, cfg)
38
  audio, _ = torchaudio.load("/path/to/an/audio/file")
39
 
40
  sample = S2THubInterface.get_model_input(task, audio)
41
- text = S2THubInterface.get_prediction(task, model, generator, sample)
42
 
43
  # speech synthesis
44
- tts_models, tts_cfg, tts_task = load_model_ensemble_and_task_from_hf_hub(
45
- f"facebook/fastspeech2-en-ljspeech",
46
- arg_overrides={"vocoder": "griffin_lim", "fp16": False},
 
 
 
 
 
47
  )
48
- tts_model = tts_models[0]
49
- TTSHubInterface.update_cfg_with_data_cfg(tts_cfg, tts_task.data_cfg)
50
- tts_generator = tts_task.build_generator([tts_model], tts_cfg)
51
 
52
- tts_sample = TTSHubInterface.get_model_input(tts_task, text)
53
- wav, sr = TTSHubInterface.get_prediction(
54
- tts_task, tts_model, tts_generator, tts_sample
 
 
 
 
 
55
  )
56
 
57
- ipd.Audio(wav, rate=rate)
 
 
 
 
 
 
 
 
 
 
 
 
58
  ```
59
 
 
17
  src: https://huggingface.co/facebook/xm_transformer_600m-es_en-multi_domain/resolve/main/common_voice_es_19966634.flac
18
  ---
19
 
20
+ ## xm_transformer_s2ut_800m-es-en-st-asr-bt_h1_2022
21
+ Speech-to-speech translation model from fairseq S2UT (paper/code):
22
+
23
+ - Spanish-English
24
+ - Trained on
25
+ - Speech synthesis with facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur
26
+
27
  ## Usage
28
  ```python
29
+ import json
30
+ import os
31
+ from pathlib import Path
32
+
33
  import IPython.display as ipd
34
+ from fairseq import hub_utils
35
+ from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
36
+ from fairseq.models.speech_to_text.hub_interface import S2THubInterface
37
+ from fairseq.models.text_to_speech import CodeHiFiGANVocoder
38
+ from fairseq.models.text_to_speech.hub_interface import (
39
+ TTSHubInterface,
40
+ VocoderHubInterface,
41
+ )
42
+ from huggingface_hub import snapshot_download
43
  import torchaudio
44
 
45
 
46
  models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
47
+ "facebook/xm_transformer_s2ut_800m-es-en-st-asr-bt_h1_2022",
48
  arg_overrides={"config_yaml": "config.yaml"},
49
  )
50
  model = models[0]
 
55
  audio, _ = torchaudio.load("/path/to/an/audio/file")
56
 
57
  sample = S2THubInterface.get_model_input(task, audio)
58
+ unit = S2THubInterface.get_prediction(task, model, generator, sample)
59
 
60
  # speech synthesis
61
+ cache_dir = os.getenv("HUGGINGFACE_HUB_CACHE")
62
+
63
+ library_name = "fairseq"
64
+ cache_dir = (
65
+ cache_dir or (Path.home() / ".cache" / library_name).as_posix()
66
+ )
67
+ cache_dir = snapshot_download(
68
+ f"facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur", cache_dir=cache_dir, library_name=library_name
69
  )
 
 
 
70
 
71
+ x = hub_utils.from_pretrained(
72
+ cache_dir,
73
+ "model.pt",
74
+ ".",
75
+ archive_map=CodeHiFiGANVocoder.hub_models(),
76
+ config_yaml="config.json",
77
+ fp16=False,
78
+ is_vocoder=True,
79
  )
80
 
81
+ with open(f"{x['args']['data']}/config.json") as f:
82
+ vocoder_cfg = json.load(f)
83
+ assert (
84
+ len(x["args"]["model_path"]) == 1
85
+ ), "Too many vocoder models in the input"
86
+
87
+ vocoder = CodeHiFiGANVocoder(x["args"]["model_path"][0], vocoder_cfg)
88
+ tts_model = VocoderHubInterface(vocoder_cfg, vocoder)
89
+
90
+ tts_sample = tts_model.get_model_input(unit)
91
+ wav, sr = tts_model.get_prediction(tts_sample)
92
+
93
+ ipd.Audio(wav, rate=sr)
94
  ```
95