lpw committed
Commit 5648027
1 Parent(s): 7e5d48f

Update README.md

Files changed (1): README.md (+2 -21)
README.md CHANGED
@@ -12,11 +12,8 @@ datasets:
 - covost2
 - europarl_st
 - voxpopuli
-widget:
-- example_title: Common Voice sample 1
-  src: https://huggingface.co/facebook/xm_transformer_600m-es_en-multi_domain/resolve/main/common_voice_es_19966634.flac
 ---
-## unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur
+## unit_hifigan_HK_layer12.km2500_frame_TAT-TTS
 
 Speech-to-speech translation model from fairseq S2UT ([paper](https://arxiv.org/abs/2204.02967)/[code](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/enhanced_direct_s2st_discrete_units.md)):
 - Spanish-English
@@ -41,29 +38,13 @@ import torchaudio
 
 cache_dir = os.getenv("HUGGINGFACE_HUB_CACHE")
 
-#models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
-# "facebook/xm_transformer_s2ut_800m-es-en-st-asr-bt_h1_2022",
-# arg_overrides={"config_yaml": "config.yaml", "task": "speech_to_text"},
-# cache_dir=cache_dir,
-# )
-# model = models[0].cpu()
-# cfg["task"].cpu = True
-# generator = task.build_generator([model], cfg)
-
-
-# # requires 16000Hz mono channel audio
-# audio, _ = torchaudio.load("/Users/lpw/git/api-inference-community/docker_images/fairseq/tests/samples/sample2.flac")
-
-# sample = S2THubInterface.get_model_input(task, audio)
-# unit = S2THubInterface.get_prediction(task, model, generator, sample)
-
 # speech synthesis
 library_name = "fairseq"
 cache_dir = (
     cache_dir or (Path.home() / ".cache" / library_name).as_posix()
 )
 cache_dir = snapshot_download(
-    f"facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur", cache_dir=cache_dir, library_name=library_name
+    f"facebook/unit_hifigan_HK_layer12.km2500_frame_TAT-TTS", cache_dir=cache_dir, library_name=library_name
 )
 
 x = hub_utils.from_pretrained(
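
The diff window stops at `x = hub_utils.from_pretrained(`, so the README's speech-synthesis snippet is cut off mid-call. For orientation only, the sketch below follows the usual fairseq S2UT model-card pattern for what comes next: wrapping the downloaded checkpoint in a `CodeHiFiGANVocoder` and a `VocoderHubInterface`, then synthesizing a waveform from predicted discrete units. The checkpoint filename (`model.pt`), the `config.json` name, and the placeholder `unit` string are assumptions for illustration and are not part of this commit.

```python
import json
from pathlib import Path

from fairseq import hub_utils
from fairseq.models.text_to_speech import CodeHiFiGANVocoder
from fairseq.models.text_to_speech.hub_interface import VocoderHubInterface
from huggingface_hub import snapshot_download

# Download the vocoder checkpoint renamed in this commit.
library_name = "fairseq"
cache_dir = snapshot_download(
    "facebook/unit_hifigan_HK_layer12.km2500_frame_TAT-TTS",
    cache_dir=(Path.home() / ".cache" / library_name).as_posix(),
    library_name=library_name,
)

# Load it through fairseq's hub utilities; "model.pt" and "config.json" are
# the filenames used by comparable fairseq vocoder cards (assumption).
x = hub_utils.from_pretrained(
    cache_dir,
    "model.pt",
    ".",
    archive_map=CodeHiFiGANVocoder.hub_models(),
    config_yaml="config.json",
    fp16=False,
    is_vocoder=True,
)

with open(f"{x['args']['data']}/config.json") as f:
    vocoder_cfg = json.load(f)

# Build the unit-to-waveform vocoder and its hub interface.
vocoder = CodeHiFiGANVocoder(x["args"]["model_path"][0], vocoder_cfg)
tts_model = VocoderHubInterface(vocoder_cfg, vocoder)

# `unit` would normally hold the space-separated discrete units predicted by
# the S2UT model (see the commented-out block removed above); this is a
# placeholder purely for illustration.
unit = "1 2 3 4"
tts_sample = tts_model.get_model_input(unit)
wav, sr = tts_model.get_prediction(tts_sample)
```

In the model cards this is typically followed by `ipd.Audio(wav, rate=sr)` to listen to the synthesized audio.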