lpw committed
Commit 5648027
1 Parent(s): 7e5d48f

Update README.md

Files changed (1): README.md (+2 -21)
README.md CHANGED
@@ -12,11 +12,8 @@ datasets:
 - covost2
 - europarl_st
 - voxpopuli
-widget:
-- example_title: Common Voice sample 1
-  src: https://huggingface.co/facebook/xm_transformer_600m-es_en-multi_domain/resolve/main/common_voice_es_19966634.flac
 ---
-## unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur
+## unit_hifigan_HK_layer12.km2500_frame_TAT-TTS
 
 Speech-to-speech translation model from fairseq S2UT ([paper](https://arxiv.org/abs/2204.02967)/[code](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/enhanced_direct_s2st_discrete_units.md)):
 - Spanish-English
@@ -41,29 +38,13 @@ import torchaudio
 
 cache_dir = os.getenv("HUGGINGFACE_HUB_CACHE")
 
-#models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
-# "facebook/xm_transformer_s2ut_800m-es-en-st-asr-bt_h1_2022",
-# arg_overrides={"config_yaml": "config.yaml", "task": "speech_to_text"},
-# cache_dir=cache_dir,
-# )
-# model = models[0].cpu()
-# cfg["task"].cpu = True
-# generator = task.build_generator([model], cfg)
-
-
-# # requires 16000Hz mono channel audio
-# audio, _ = torchaudio.load("/Users/lpw/git/api-inference-community/docker_images/fairseq/tests/samples/sample2.flac")
-
-# sample = S2THubInterface.get_model_input(task, audio)
-# unit = S2THubInterface.get_prediction(task, model, generator, sample)
-
 # speech synthesis
 library_name = "fairseq"
 cache_dir = (
     cache_dir or (Path.home() / ".cache" / library_name).as_posix()
 )
 cache_dir = snapshot_download(
-    f"facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur", cache_dir=cache_dir, library_name=library_name
+    f"facebook/unit_hifigan_HK_layer12.km2500_frame_TAT-TTS", cache_dir=cache_dir, library_name=library_name
 )
 
 x = hub_utils.from_pretrained(
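
The diff window stops at `x = hub_utils.from_pretrained(`, so the README's speech-synthesis snippet is cut off mid-call. For orientation only, the sketch below follows the usual fairseq S2UT model-card pattern for what comes next: wrapping the downloaded checkpoint in a `CodeHiFiGANVocoder` and a `VocoderHubInterface`, then synthesizing a waveform from predicted discrete units. The checkpoint filename (`model.pt`), the `config.json` name, and the placeholder `unit` string are assumptions for illustration and are not part of this commit.

```python
import json
from pathlib import Path

from fairseq import hub_utils
from fairseq.models.text_to_speech import CodeHiFiGANVocoder
from fairseq.models.text_to_speech.hub_interface import VocoderHubInterface
from huggingface_hub import snapshot_download

# Download the vocoder checkpoint renamed in this commit.
library_name = "fairseq"
cache_dir = snapshot_download(
    "facebook/unit_hifigan_HK_layer12.km2500_frame_TAT-TTS",
    cache_dir=(Path.home() / ".cache" / library_name).as_posix(),
    library_name=library_name,
)

# Load it through fairseq's hub utilities; "model.pt" and "config.json" are
# the filenames used by comparable fairseq vocoder cards (assumption).
x = hub_utils.from_pretrained(
    cache_dir,
    "model.pt",
    ".",
    archive_map=CodeHiFiGANVocoder.hub_models(),
    config_yaml="config.json",
    fp16=False,
    is_vocoder=True,
)

with open(f"{x['args']['data']}/config.json") as f:
    vocoder_cfg = json.load(f)

# Build the unit-to-waveform vocoder and its hub interface.
vocoder = CodeHiFiGANVocoder(x["args"]["model_path"][0], vocoder_cfg)
tts_model = VocoderHubInterface(vocoder_cfg, vocoder)

# `unit` would normally hold the space-separated discrete units predicted by
# the S2UT model (see the commented-out block removed above); this is a
# placeholder purely for illustration.
unit = "1 2 3 4"
tts_sample = tts_model.get_model_input(unit)
wav, sr = tts_model.get_prediction(tts_sample)
```

In the model cards this is typically followed by `ipd.Audio(wav, rate=sr)` to listen to the synthesized audio.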