gmdnn committed on
Commit
bac761c
1 Parent(s): 79b9fb7

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +86 -0
README.md CHANGED
@@ -1,3 +1,89 @@
1
  ---
2
  license: cc-by-nc-4.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: cc-by-nc-4.0
3
  ---
4
+ ---
5
+ library_name: fairseq
6
+ task: audio-to-audio
7
+ tags:
8
+ - fairseq
9
+ - audio
10
+ - audio-to-audio
11
+ - speech-to-speech-translation
12
+
13
+ widget:
14
+ - example_title: Common Voice sample 1
15
+ src: https://huggingface.co/facebook/xm_transformer_600m-es_en-multi_domain/resolve/main/common_voice_es_19966634.flac
16
+ ---
17
+ ## xm_transformer_sm_all-en TODO: Add paper reference etc.
18
+ - Speech synthesis with [facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur](https://huggingface.co/facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur)
19
+
20
+ ## Usage
21
+ ```python
22
+ import json
23
+ import os
24
+ from pathlib import Path
25
+
26
+ import IPython.display as ipd
27
+ from fairseq import hub_utils
28
+ from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
29
+ from fairseq.models.speech_to_text.hub_interface import S2THubInterface
30
+ from fairseq.models.text_to_speech import CodeHiFiGANVocoder
31
+ from fairseq.models.text_to_speech.hub_interface import VocoderHubInterface
32
+
33
+ from huggingface_hub import snapshot_download
34
+ import torchaudio
35
+
36
+ cache_dir = os.getenv("HUGGINGFACE_HUB_CACHE")
37
+
38
+ models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
39
+ "facebook/xm_transformer_sm_all-en",
40
+ arg_overrides={"config_yaml": "config.yaml", "task": "speech_to_text"},
41
+ cache_dir=cache_dir,
42
+ )
43
+ model = models[0]  # append .cpu() to force CPU inference
44
+ #cfg["task"].cpu = True
45
+ generator = task.build_generator([model], cfg)
46
+
47
+
48
+ # requires 16000Hz mono channel audio
49
+ audio, _ = torchaudio.load("/path/to/an/audio/file")
50
+
51
+ sample = S2THubInterface.get_model_input(task, audio)
52
+ unit = S2THubInterface.get_prediction(task, model, generator, sample)
53
+
54
+ # speech synthesis
55
+ library_name = "fairseq"
56
+ cache_dir = (
57
+ cache_dir or (Path.home() / ".cache" / library_name).as_posix()
58
+ )
59
+ cache_dir = snapshot_download(
60
+ f"facebook/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur", cache_dir=cache_dir, library_name=library_name
61
+ )
62
+
63
+ x = hub_utils.from_pretrained(
64
+ cache_dir,
65
+ "model.pt",
66
+ ".",
67
+ archive_map=CodeHiFiGANVocoder.hub_models(),
68
+ config_yaml="config.json",
69
+ fp16=False,
70
+ is_vocoder=True,
71
+ )
72
+
73
+ with open(f"{x['args']['data']}/config.json") as f:
74
+ vocoder_cfg = json.load(f)
75
+ assert (
76
+ len(x["args"]["model_path"]) == 1
77
+ ), "Too many vocoder models in the input"
78
+
79
+ vocoder = CodeHiFiGANVocoder(x["args"]["model_path"][0], vocoder_cfg)
80
+ tts_model = VocoderHubInterface(vocoder_cfg, vocoder)
81
+
82
+ tts_sample = tts_model.get_model_input(unit)
83
+ wav, sr = tts_model.get_prediction(tts_sample)
84
+
85
+ ipd.Audio(wav, rate=sr)
86
+ ```
87
+
88
+ ## Citation
89
+ ```