SohomToom committed
Commit cbb34e3 · verified · 1 Parent(s): 878264e

Update app.py

Files changed (1)
  1. app.py +29 -68
app.py CHANGED
@@ -39,41 +39,9 @@ tone_color_converter = ToneColorConverter(ckpt_converter)
  # Device setting
  device = "cuda" if torch.cuda.is_available() else "cpu"

- # def clone_and_speak(text, speaker_wav):
- #     if not speaker_wav:
- #         return "Please upload a reference .wav file."
-
- #     base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
- #     tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
- #     final_output_path = f"{output_dir}/{base_name}_converted.wav"
-
- #     # Use English speaker model
- #     model = TTS(language="EN", device=device)
- #     speaker_ids = model.hps.data.spk2id
- #     default_speaker_id = next(iter(speaker_ids.values()))
-
- #     # Generate base TTS voice
- #     speed = 1.0
- #     model.tts_to_file(text, default_speaker_id, tmp_melo_path,speed=speed)
-
- #     # Use speaker_wav as reference to extract style embedding
- #     from openvoice import se_extractor
- #     ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=True)
-
- #     # Run the tone conversion
- #     tone_color_converter.convert(
- #         audio_src_path=tmp_melo_path,
- #         src_se=ref_se,
- #         tgt_se=ref_se,
- #         output_path=final_output_path,
- #         message="@HuggingFace",
- #     )
-
- #     return final_output_path
-
- def clone_and_speak(text, selected_speaker_key):
-     if not text or not selected_speaker_key:
-         return "Please enter text and select a speaker."
+ def clone_and_speak(text, speaker_wav):
+     if not speaker_wav:
+         return "Please upload a reference .wav file."

      base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
      tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
@@ -82,34 +50,25 @@ def clone_and_speak(text, selected_speaker_key):
      # Use English speaker model
      model = TTS(language="EN", device=device)
      speaker_ids = model.hps.data.spk2id
+     #default_speaker_id = next(iter(speaker_ids.values()))

-     # Map speaker_key to speaker_id (model-specific)
-     if selected_speaker_key not in speaker_ids:
-         return f"Speaker '{selected_speaker_key}' not found in model."
-
-     speaker_id = speaker_ids[selected_speaker_key]
+     for speaker_key in speaker_ids.keys():
+         speaker_id = speaker_ids[speaker_key]
+         speaker_key = speaker_key.lower().replace('_', '-')

      # Generate base TTS voice
-     speed = 1.0
-     model.tts_to_file(text, speaker_id, tmp_melo_path, speed=speed)
-
-     # Load pre-saved speaker embedding
-     normalized_key = selected_speaker_key.lower().replace("_", "-")
-     se_path = f'checkpoints_v2/base_speakers/ses/{normalized_key}.pth'
-
-     if not os.path.isfile(se_path):
-         return f"SE file not found for speaker '{normalized_key}'."
+     speed = 1.0
+     source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth', map_location=device)
+     model.tts_to_file(text, speaker_id, tmp_melo_path,speed=speed)

-     ref_se = torch.load(se_path, map_location=device)
-
-     # Disable MPS if present but device is CPU
-     if torch.backends.mps.is_available() and device == 'cpu':
-         torch.backends.mps.is_available = lambda: False
+     # Use speaker_wav as reference to extract style embedding
+     from openvoice import se_extractor
+     ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=True)

      # Run the tone conversion
-     tone_color_converter.convert(
+     tone_color_converter.convert(
          audio_src_path=tmp_melo_path,
-         src_se=ref_se,
+         src_se=source_se,
          tgt_se=ref_se,
          output_path=final_output_path,
          message="@HuggingFace",
@@ -118,18 +77,20 @@ def clone_and_speak(text, selected_speaker_key):
      return final_output_path


- # Gradio interface
- # gr.Interface(
- #     fn=clone_and_speak,
- #     inputs=[
- #         gr.Textbox(label="Enter Text"),
- #         gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)")
- #     ],
- #     outputs=gr.Audio(label="Synthesized Output"),
- #     flagging_dir="/tmp/flagged",
- #     title="Text to Voice using Melo TTS + OpenVoice",
- #     description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
- # ).launch()
+
+
+ # Gradio interface
+ gr.Interface(
+     fn=clone_and_speak,
+     inputs=[
+         gr.Textbox(label="Enter Text"),
+         gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)")
+     ],
+     outputs=gr.Audio(label="Synthesized Output"),
+     flagging_dir="/tmp/flagged",
+     title="Text to Voice using Melo TTS + OpenVoice",
+     description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
+ ).launch()

  iface = gr.Interface(
      fn=clone_with_base_speaker,
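Read as a whole, the commit re-enables the reference-wav workflow: clone_and_speak now takes the input text plus an uploaded speaker_wav, synthesizes a base voice with MeloTTS, and converts its tone toward the reference with OpenVoice. Below is a sketch of how the updated function reads once the added lines are stitched back together. It is reconstructed from the hunks above rather than copied from the repository, so the loop indentation and the spk2id key naming (for example "EN_INDIA" becoming "en-india") are assumptions, and it relies on the imports, device, output_dir, and tone_color_converter defined earlier in app.py.

def clone_and_speak(text, speaker_wav):
    # Reconstructed from the diff above; setup (TTS, torch, time, uuid,
    # output_dir, device, tone_color_converter) comes from earlier in app.py.
    if not speaker_wav:
        return "Please upload a reference .wav file."

    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
    final_output_path = f"{output_dir}/{base_name}_converted.wav"

    # Use English speaker model
    model = TTS(language="EN", device=device)
    speaker_ids = model.hps.data.spk2id

    # As written, this loop only keeps the last spk2id entry; the key is
    # normalized to match the .pth filenames (assumed naming, e.g. "EN_INDIA" -> "en-india").
    for speaker_key in speaker_ids.keys():
        speaker_id = speaker_ids[speaker_key]
        speaker_key = speaker_key.lower().replace('_', '-')

    # Generate the base TTS voice with that base speaker
    speed = 1.0
    source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth',
                           map_location=device)
    model.tts_to_file(text, speaker_id, tmp_melo_path, speed=speed)

    # Use speaker_wav as reference to extract the target style embedding
    from openvoice import se_extractor
    ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=True)

    # Run the tone conversion from the base-speaker embedding to the reference embedding
    tone_color_converter.convert(
        audio_src_path=tmp_melo_path,
        src_se=source_se,
        tgt_se=ref_se,
        output_path=final_output_path,
        message="@HuggingFace",
    )

    return final_output_path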
 
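For a quick check outside the Gradio interface, the function can be called directly once the setup in app.py has run. This is a hypothetical smoke test, not part of the commit; the reference path ref.wav is a placeholder for any short recording of the voice to imitate.

# Hypothetical smoke test (not part of the commit): synthesize a short line with the
# base speaker, convert its tone toward the reference recording, and print the result.
out = clone_and_speak(
    "Testing Melo TTS with OpenVoice tone conversion.",
    "ref.wav",  # placeholder path to a short reference .wav
)
print(out)  # the converted .wav path on success, or an error message string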