SohomToom committed on
Commit
6d77b5b
·
verified ·
1 Parent(s): 711ecc3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -24
app.py CHANGED
@@ -39,25 +39,41 @@ tone_color_converter = ToneColorConverter(ckpt_converter)
39
  # Device setting
40
  device = "cuda" if torch.cuda.is_available() else "cpu"
41
 
42
- def clone_and_speak(text, speaker_wav):
43
- if not speaker_wav:
44
- return "Please upload a reference .wav file."
45
-
46
- # import melo.text.english as english
47
- # original_g2p = english.g2p
48
-
49
- # def patched_g2p(text):
50
- # phones, tones, word2ph = original_g2p(text)
51
- # # Fix: wrap ints in list to avoid TypeError
52
- # word2ph_fixed = []
53
- # for item in word2ph:
54
- # if isinstance(item, int):
55
- # word2ph_fixed.append([item])
56
- # else:
57
- # word2ph_fixed.append(item)
58
- # return phones, tones, word2ph_fixed
59
-
60
- # english.g2p = patched_g2p
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
63
  tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
@@ -66,15 +82,29 @@ def clone_and_speak(text, speaker_wav):
66
  # Use English speaker model
67
  model = TTS(language="EN", device=device)
68
  speaker_ids = model.hps.data.spk2id
69
- default_speaker_id = next(iter(speaker_ids.values()))
 
 
 
 
 
70
 
71
  # Generate base TTS voice
72
  speed = 1.0
73
- model.tts_to_file(text, default_speaker_id, tmp_melo_path,speed=speed)
74
 
75
- # Use speaker_wav as reference to extract style embedding
76
- from openvoice import se_extractor
77
- ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=True)
 
 
 
 
 
 
 
 
 
78
 
79
  # Run the tone conversion
80
  tone_color_converter.convert(
@@ -87,6 +117,7 @@ def clone_and_speak(text, speaker_wav):
87
 
88
  return final_output_path
89
 
 
90
  # Gradio interface
91
  gr.Interface(
92
  fn=clone_and_speak,
 
39
  # Device setting
40
  device = "cuda" if torch.cuda.is_available() else "cpu"
41
 
42
+ # def clone_and_speak(text, speaker_wav):
43
+ # if not speaker_wav:
44
+ # return "Please upload a reference .wav file."
45
+
46
+ # base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
47
+ # tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
48
+ # final_output_path = f"{output_dir}/{base_name}_converted.wav"
49
+
50
+ # # Use English speaker model
51
+ # model = TTS(language="EN", device=device)
52
+ # speaker_ids = model.hps.data.spk2id
53
+ # default_speaker_id = next(iter(speaker_ids.values()))
54
+
55
+ # # Generate base TTS voice
56
+ # speed = 1.0
57
+ # model.tts_to_file(text, default_speaker_id, tmp_melo_path,speed=speed)
58
+
59
+ # # Use speaker_wav as reference to extract style embedding
60
+ # from openvoice import se_extractor
61
+ # ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=True)
62
+
63
+ # # Run the tone conversion
64
+ # tone_color_converter.convert(
65
+ # audio_src_path=tmp_melo_path,
66
+ # src_se=ref_se,
67
+ # tgt_se=ref_se,
68
+ # output_path=final_output_path,
69
+ # message="@HuggingFace",
70
+ # )
71
+
72
+ # return final_output_path
73
+
74
+ def clone_and_speak(text, selected_speaker_key):
75
+ if not text or not selected_speaker_key:
76
+ return "Please enter text and select a speaker."
77
 
78
  base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
79
  tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
 
82
  # Use English speaker model
83
  model = TTS(language="EN", device=device)
84
  speaker_ids = model.hps.data.spk2id
85
+
86
+ # Map speaker_key to speaker_id (model-specific)
87
+ if selected_speaker_key not in speaker_ids:
88
+ return f"Speaker '{selected_speaker_key}' not found in model."
89
+
90
+ speaker_id = speaker_ids[selected_speaker_key]
91
 
92
  # Generate base TTS voice
93
  speed = 1.0
94
+ model.tts_to_file(text, speaker_id, tmp_melo_path, speed=speed)
95
 
96
+ # Load pre-saved speaker embedding
97
+ normalized_key = selected_speaker_key.lower().replace("_", "-")
98
+ se_path = f'checkpoints_v2/base_speakers/ses/{normalized_key}.pth'
99
+
100
+ if not os.path.isfile(se_path):
101
+ return f"SE file not found for speaker '{normalized_key}'."
102
+
103
+ ref_se = torch.load(se_path, map_location=device)
104
+
105
+ # Disable MPS if present but device is CPU
106
+ if torch.backends.mps.is_available() and device == 'cpu':
107
+ torch.backends.mps.is_available = lambda: False
108
 
109
  # Run the tone conversion
110
  tone_color_converter.convert(
 
117
 
118
  return final_output_path
119
 
120
+
121
  # Gradio interface
122
  gr.Interface(
123
  fn=clone_and_speak,