Pengwei Li committed on
Commit
7a51771
1 Parent(s): 24d73d1

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +11 -13
README.md CHANGED
@@ -16,13 +16,12 @@ widget:
16
  - example_title: Common Voice sample 1
17
  src: https://huggingface.co/facebook/xm_transformer_600m-es_en-multi_domain/resolve/main/common_voice_es_19966634.flac
18
  ---
19
-
20
  ## xm_transformer_s2ut_800m-es-en-st-asr-bt_h1_2022
21
 
22
- speech-to-speech translation model from fairseq S2UT (paper/code):
23
  - Spanish-English
24
  - Trained on
25
- - Speech synthesis with facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur
26
 
27
  ## Usage
28
  ```python
@@ -35,20 +34,21 @@ from fairseq import hub_utils
35
  from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
36
  from fairseq.models.speech_to_text.hub_interface import S2THubInterface
37
  from fairseq.models.text_to_speech import CodeHiFiGANVocoder
38
- from fairseq.models.text_to_speech.hub_interface import (
39
- TTSHubInterface,
40
- VocoderHubInterface,
41
- )
42
  from huggingface_hub import snapshot_download
43
  import torchaudio
44
 
 
45
 
46
  models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
47
  "facebook/xm_transformer_s2ut_800m-es-en-st-asr-bt_h1_2022",
48
- arg_overrides={"config_yaml": "config.yaml"},
 
49
  )
50
- model = models[0]
51
- generator = task.build_generator(model, cfg)
 
52
 
53
 
54
  # requires 16000Hz mono channel audio
@@ -57,9 +57,7 @@ audio, _ = torchaudio.load("/path/to/an/audio/file")
57
  sample = S2THubInterface.get_model_input(task, audio)
58
  unit = S2THubInterface.get_prediction(task, model, generator, sample)
59
 
60
- # speech synthesis
61
- cache_dir = os.getenv("HUGGINGFACE_HUB_CACHE")
62
-
63
  library_name = "fairseq"
64
  cache_dir = (
65
  cache_dir or (Path.home() / ".cache" / library_name).as_posix()
 
16
  - example_title: Common Voice sample 1
17
  src: https://huggingface.co/facebook/xm_transformer_600m-es_en-multi_domain/resolve/main/common_voice_es_19966634.flac
18
  ---
 
19
  ## xm_transformer_s2ut_800m-es-en-st-asr-bt_h1_2022
20
 
21
+ Speech-to-speech translation model from fairseq S2UT ([paper](https://arxiv.org/abs/2204.02967)/[code](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/enhanced_direct_s2st_discrete_units.md)):
22
  - Spanish-English
23
  - Trained on
24
+ - Speech synthesis with [facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur](https://huggingface.co/facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur)
25
 
26
  ## Usage
27
  ```python
 
34
  from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
35
  from fairseq.models.speech_to_text.hub_interface import S2THubInterface
36
  from fairseq.models.text_to_speech import CodeHiFiGANVocoder
37
+ from fairseq.models.text_to_speech.hub_interface import VocoderHubInterface
38
+
 
 
39
  from huggingface_hub import snapshot_download
40
  import torchaudio
41
 
42
+ cache_dir = os.getenv("HUGGINGFACE_HUB_CACHE")
43
 
44
  models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
45
  "facebook/xm_transformer_s2ut_800m-es-en-st-asr-bt_h1_2022",
46
+ arg_overrides={"config_yaml": "config.yaml", "task": "speech_to_text"},
47
+ cache_dir=cache_dir,
48
  )
49
+ model = models[0].cpu()
50
+ cfg["task"].cpu = True
51
+ generator = task.build_generator([model], cfg)
52
 
53
 
54
  # requires 16000Hz mono channel audio
 
57
  sample = S2THubInterface.get_model_input(task, audio)
58
  unit = S2THubInterface.get_prediction(task, model, generator, sample)
59
 
60
+ # speech synthesis
 
 
61
  library_name = "fairseq"
62
  cache_dir = (
63
  cache_dir or (Path.home() / ".cache" / library_name).as_posix()