SohomToom committed on
Commit 16c7cf3 · verified · 1 Parent(s): 03e45b8

Update app.py

Files changed (1)
  1. app.py +123 -121
app.py CHANGED
@@ -1,125 +1,60 @@
-# import os
-# import uuid
-# import time
-# import torch
-# import gradio as gr
-# os.environ["NUMBA_DISABLE_CACHE"] = "1"
-# import mecab_patch
-# import english_patch
-# from melo.api import TTS
-# from openvoice.api import ToneColorConverter
-
-# # Set temporary cache locations for Hugging Face Spaces
-# os.environ["TORCH_HOME"] = "/tmp/torch"
-# os.environ["HF_HOME"] = "/tmp/huggingface"
-# os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
-# os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
-# os.environ["MPLCONFIGDIR"] = "/tmp"
-# os.environ["XDG_CACHE_HOME"] = "/tmp"
-# os.environ["XDG_CONFIG_HOME"] = "/tmp"
-# os.environ["NUMBA_DISABLE_CACHE"] = "1"
-# os.makedirs("/tmp/torch", exist_ok=True)
-# os.makedirs("/tmp/huggingface", exist_ok=True)
-# os.makedirs("/tmp/flagged", exist_ok=True)
-
-# # Output folder
-# output_dir = "/tmp/outputs"
-# os.makedirs(output_dir, exist_ok=True)
-
-# # Initialize tone converter
-# ckpt_converter = "checkpoints/converter/config.json"
-# tone_color_converter = ToneColorConverter(ckpt_converter)
-
-# # Device setting
-# device = "cuda" if torch.cuda.is_available() else "cpu"
-
-# def clone_and_speak(text, speaker_wav):
-#     if not speaker_wav:
-#         return "Please upload a reference .wav file."
-
-#     # import melo.text.english as english
-#     # original_g2p = english.g2p
-
-#     # def patched_g2p(text):
-#     #     phones, tones, word2ph = original_g2p(text)
-#     #     # Fix: wrap ints in list to avoid TypeError
-#     #     word2ph_fixed = []
-#     #     for item in word2ph:
-#     #         if isinstance(item, int):
-#     #             word2ph_fixed.append([item])
-#     #         else:
-#     #             word2ph_fixed.append(item)
-#     #     return phones, tones, word2ph_fixed
-
-#     # english.g2p = patched_g2p
-
-#     base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
-#     tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
-#     final_output_path = f"{output_dir}/{base_name}_converted.wav"
-
-#     # Use English speaker model
-#     model = TTS(language="EN", device=device)
-#     speaker_ids = model.hps.data.spk2id
-#     default_speaker_id = next(iter(speaker_ids.values()))
-
-#     # Generate base TTS voice
-#     speed = 1.0
-#     model.tts_to_file(text, default_speaker_id, tmp_melo_path,speed=speed)
-
-#     # Use speaker_wav as reference to extract style embedding
-#     from openvoice import se_extractor
-#     ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)
-
-#     # Run the tone conversion
-#     tone_color_converter.convert(
-#         audio_src_path=tmp_melo_path,
-#         src_se=ref_se,
-#         tgt_se=ref_se,
-#         output_path=final_output_path,
-#         message="@HuggingFace",
-#     )
-
-#     return final_output_path
-
-# # Gradio interface
-# gr.Interface(
-#     fn=clone_and_speak,
-#     inputs=[
-#         gr.Textbox(label="Enter Text"),
-#         gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)")
-#     ],
-#     outputs=gr.Audio(label="Synthesized Output"),
-#     flagging_dir="/tmp/flagged",
-#     title="Text to Voice using Melo TTS + OpenVoice",
-#     description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
-# ).launch()
-
-
 import os
-import time
 import uuid
+import time
+import torch
 import gradio as gr
-
+os.environ["NUMBA_DISABLE_CACHE"] = "1"
+import mecab_patch
+import english_patch
+#from melo.api import TTS
 from TTS.api import TTS
-from openvoice import se_extractor
 from openvoice.api import ToneColorConverter
-
-# Import your local english.py logic
 from meloTTS import english
 
-# Paths
-device = "cuda" if os.system("nvidia-smi") == 0 else "cpu"
-output_dir = "outputs"
+# Set temporary cache locations for Hugging Face Spaces
+os.environ["TORCH_HOME"] = "/tmp/torch"
+os.environ["HF_HOME"] = "/tmp/huggingface"
+os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
+os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
+os.environ["MPLCONFIGDIR"] = "/tmp"
+os.environ["XDG_CACHE_HOME"] = "/tmp"
+os.environ["XDG_CONFIG_HOME"] = "/tmp"
+os.environ["NUMBA_DISABLE_CACHE"] = "1"
+os.makedirs("/tmp/torch", exist_ok=True)
+os.makedirs("/tmp/huggingface", exist_ok=True)
+os.makedirs("/tmp/flagged", exist_ok=True)
+
+# Output folder
+output_dir = "/tmp/outputs"
 os.makedirs(output_dir, exist_ok=True)
 
-# Load OpenVoice tone converter
-tone_color_converter = ToneColorConverter(f"{os.getcwd()}/checkpoints", device=device)
-tone_color_converter.load_model()
+# Initialize tone converter
+ckpt_converter = "checkpoints/converter/config.json"
+tone_color_converter = ToneColorConverter(ckpt_converter)
+
+# Device setting
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
 def clone_and_speak(text, speaker_wav):
     if not speaker_wav:
         return "Please upload a reference .wav file."
 
+    # import melo.text.english as english
+    # original_g2p = english.g2p
+
+    # def patched_g2p(text):
+    #     phones, tones, word2ph = original_g2p(text)
+    #     # Fix: wrap ints in list to avoid TypeError
+    #     word2ph_fixed = []
+    #     for item in word2ph:
+    #         if isinstance(item, int):
+    #             word2ph_fixed.append([item])
+    #         else:
+    #             word2ph_fixed.append(item)
+    #     return phones, tones, word2ph_fixed
+
+    # english.g2p = patched_g2p
+
     base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
     tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
     final_output_path = f"{output_dir}/{base_name}_converted.wav"
@@ -130,32 +65,99 @@ def clone_and_speak(text, speaker_wav):
     default_speaker_id = next(iter(speaker_ids.values()))
 
     # Generate base TTS voice
-    model.tts_to_file(text, speaker_id=default_speaker_id, file_path=tmp_melo_path, speed=1.0)
+    speed = 1.0
+    model.tts_to_file(text, default_speaker_id, tmp_melo_path,speed=speed)
 
-    # Extract style embedding
+    # Use speaker_wav as reference to extract style embedding
+    from openvoice import se_extractor
     ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)
 
-    # Convert tone
+    # Run the tone conversion
     tone_color_converter.convert(
         audio_src_path=tmp_melo_path,
         src_se=ref_se,
        tgt_se=ref_se,
         output_path=final_output_path,
-        message="@HuggingFace"
+        message="@HuggingFace",
     )
 
     return final_output_path
 
-# Gradio Interface
-demo = gr.Interface(
+# Gradio interface
+gr.Interface(
     fn=clone_and_speak,
     inputs=[
-        gr.Textbox(label="Text to Synthesize"),
-        gr.Audio(label="Reference Voice (WAV)", type="filepath")
+        gr.Textbox(label="Enter Text"),
+        gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)")
     ],
-    outputs=gr.Audio(label="Cloned Voice Output"),
-    title="Voice Cloner with MeloTTS + OpenVoice"
-)
+    outputs=gr.Audio(label="Synthesized Output"),
+    flagging_dir="/tmp/flagged",
+    title="Text to Voice using Melo TTS + OpenVoice",
+    description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
+).launch()
+
+
+# import os
+# import time
+# import uuid
+# import gradio as gr
+
+# from TTS.api import TTS
+# from openvoice import se_extractor
+# from openvoice.api import ToneColorConverter
+
+# # Import your local english.py logic
+# from meloTTS import english
+
+# # Paths
+# device = "cuda" if os.system("nvidia-smi") == 0 else "cpu"
+# output_dir = "outputs"
+# os.makedirs(output_dir, exist_ok=True)
+
+# # Load OpenVoice tone converter
+# tone_color_converter = ToneColorConverter(f"{os.getcwd()}/checkpoints", device=device)
+# tone_color_converter.load_model()
+
+# def clone_and_speak(text, speaker_wav):
+#     if not speaker_wav:
+#         return "Please upload a reference .wav file."
+
+#     base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
+#     tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
+#     final_output_path = f"{output_dir}/{base_name}_converted.wav"
+
+#     # Use English speaker model
+#     model = TTS(language="EN", device=device)
+#     speaker_ids = model.hps.data.spk2id
+#     default_speaker_id = next(iter(speaker_ids.values()))
+
+#     # Generate base TTS voice
+#     model.tts_to_file(text, speaker_id=default_speaker_id, file_path=tmp_melo_path, speed=1.0)
+
+#     # Extract style embedding
+#     ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False)
+
+#     # Convert tone
+#     tone_color_converter.convert(
+#         audio_src_path=tmp_melo_path,
+#         src_se=ref_se,
+#         tgt_se=ref_se,
+#         output_path=final_output_path,
+#         message="@HuggingFace"
+#     )
+
+#     return final_output_path
+
+# # Gradio Interface
+# demo = gr.Interface(
+#     fn=clone_and_speak,
+#     inputs=[
+#         gr.Textbox(label="Text to Synthesize"),
+#         gr.Audio(label="Reference Voice (WAV)", type="filepath")
+#     ],
+#     outputs=gr.Audio(label="Cloned Voice Output"),
+#     title="Voice Cloner with MeloTTS + OpenVoice"
+# )
 
-if __name__ == "__main__":
-    demo.launch()
+# if __name__ == "__main__":
+#     demo.launch()
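
For a quick check of the updated interface, the endpoint can also be exercised programmatically. The sketch below is a hypothetical client-side smoke test, not part of this commit: it assumes the Space is running locally on Gradio's default port, that gradio_client (>= 1.0) is installed, and that a short reference recording named sample_reference.wav exists.

# Hypothetical smoke test for the updated Gradio interface (assumptions noted above).
from gradio_client import Client, handle_file

client = Client("http://127.0.0.1:7860")  # adjust for a hosted Space URL
result = client.predict(
    "Hello from the updated Space.",      # maps to the "Enter Text" textbox
    handle_file("sample_reference.wav"),  # maps to the reference-voice audio input
    api_name="/predict",                  # default endpoint name for a single gr.Interface
)
print("Converted audio saved at:", result)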