Artificial-superintelligence committed on
Commit
f034b93
·
verified ·
1 Parent(s): 91cc0de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +164 -155
app.py CHANGED
@@ -1,183 +1,192 @@
1
  import streamlit as st
2
- import torch
3
- import torchaudio
4
- import numpy as np
5
  import librosa
6
  import soundfile as sf
7
- from TTS.api import TTS
8
- from fairseq import checkpoint_utils
9
- import wget
10
- import os
11
  from io import BytesIO
12
  import tempfile
13
- import huggingface_hub
14
-
15
- class VoiceConverter:
16
- def __init__(self):
17
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
18
- self.load_models()
19
-
20
- def load_models(self):
21
- # Download pre-trained models if not exists
22
- models_dir = "pretrained_models"
23
- os.makedirs(models_dir, exist_ok=True)
24
-
25
- # Load Coqui TTS model
26
- self.tts = TTS("tts_models/multilingual/multi-dataset/your_tts", progress_bar=False)
27
-
28
- # Load VITS model
29
- vits_path = os.path.join(models_dir, "vits_female.pth")
30
- if not os.path.exists(vits_path):
31
- # Download VITS pre-trained model
32
- wget.download(
33
- "https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/G_953000.pth",
34
- vits_path
35
- )
36
-
37
- self.vits_model = torch.load(vits_path, map_location=self.device)
38
- self.vits_model.eval()
39
 
40
- def convert_voice(self, audio_path, speaker_id=1, emotion="Happy"):
41
- # Load audio
42
- wav, sr = librosa.load(audio_path)
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- # Resample if needed
45
- if sr != 22050:
46
- wav = librosa.resample(wav, orig_sr=sr, target_sr=22050)
47
- sr = 22050
48
-
49
- # Convert to tensor
50
- wav_tensor = torch.FloatTensor(wav).unsqueeze(0).to(self.device)
51
-
52
- # Process with VITS
53
- with torch.no_grad():
54
- converted = self.vits_model.voice_conversion(
55
- wav_tensor,
56
- speaker_id=speaker_id
57
- )
58
-
59
- # Process with Coqui TTS for emotion
60
- wav_path = "temp.wav"
61
- sf.write(wav_path, converted.cpu().numpy(), sr)
62
 
63
- emotional_wav = self.tts.tts_with_vc(
64
- wav_path,
65
- speaker_wav=wav_path,
66
- emotion=emotion
67
- )
68
-
69
- return emotional_wav, sr
70
-
71
- def save_audio(audio_data, sr):
72
- buffer = BytesIO()
73
- sf.write(buffer, audio_data, sr, format='WAV')
74
- return buffer
75
-
76
- # Streamlit Interface
77
- st.title("AI Voice Converter - Female Voice Transformation")
78
-
79
- # Model selection
80
- model_type = st.selectbox(
81
- "Select Voice Model",
82
- ["VITS Female", "YourTTS Female", "Mixed Model"]
83
- )
84
-
85
- # Voice character selection
86
- voice_character = st.selectbox(
87
- "Select Voice Character",
88
- ["Anime Female", "Natural Female", "Young Female", "Mature Female"]
89
- )
90
-
91
- # Emotion selection
92
- emotion = st.selectbox(
93
- "Select Emotion",
94
- ["Happy", "Sad", "Angry", "Neutral", "Excited"]
95
- )
96
-
97
- # Additional parameters
98
- with st.expander("Advanced Settings"):
99
- pitch_adjust = st.slider("Pitch Adjustment", -10, 10, 0)
100
- clarity = st.slider("Voice Clarity", 0.0, 1.0, 0.8)
101
- speed = st.slider("Speaking Speed", 0.5, 2.0, 1.0)
102
-
103
- # File upload
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])
105
 
106
  if uploaded_file is not None:
107
- # Initialize converter
108
- converter = VoiceConverter()
109
-
110
  # Save uploaded file temporarily
111
  with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
112
  tmp_file.write(uploaded_file.getvalue())
113
  tmp_path = tmp_file.name
114
 
115
- if st.button("Convert Voice"):
116
- try:
117
- with st.spinner("Converting voice... This may take a few moments."):
118
- # Get speaker ID based on voice character
119
- speaker_id = {
120
- "Anime Female": 0,
121
- "Natural Female": 1,
122
- "Young Female": 2,
123
- "Mature Female": 3
124
- }[voice_character]
 
 
 
 
 
125
 
126
- # Convert voice
127
- converted_audio, sr = converter.convert_voice(
128
- tmp_path,
129
- speaker_id=speaker_id,
130
- emotion=emotion
 
 
 
 
 
 
131
  )
132
-
133
- # Create audio buffer
134
- audio_buffer = save_audio(converted_audio, sr)
135
-
 
136
  # Display audio player
137
- st.audio(audio_buffer, format='audio/wav')
138
-
139
  # Download button
140
  st.download_button(
141
  label="Download Converted Audio",
142
- data=audio_buffer,
143
- file_name="ai_converted_voice.wav",
144
  mime="audio/wav"
145
  )
 
 
 
146
 
147
- except Exception as e:
148
- st.error(f"Error during conversion: {str(e)}")
149
-
150
- # Add information about the models
151
  st.markdown("""
152
- ### Model Information:
153
- 1. **VITS Female**: Pre-trained on a large dataset of female voices
154
- 2. **YourTTS**: Multi-speaker, multi-lingual voice conversion model
155
- 3. **Mixed Model**: Combination of multiple models for better quality
156
-
157
- ### Voice Characters:
158
- - **Anime Female**: High-pitched, animated style voice
159
- - **Natural Female**: Realistic female voice
160
- - **Young Female**: Young adult female voice
161
- - **Mature Female**: Mature female voice
162
 
163
  ### Tips for Best Results:
164
- - Use clear audio input with minimal background noise
165
- - Short audio clips (5-30 seconds) work best
166
- - Experiment with different emotions and voice characters
167
- - Adjust advanced settings for fine-tuning
168
- """)
169
-
170
- # Requirements
171
- """
172
- pip install requirements:
173
- TTS
174
- fairseq
175
- torch
176
- torchaudio
177
- streamlit
178
- librosa
179
- soundfile
180
- numpy
181
- wget
182
- huggingface_hub
183
- """
 
1
import os
import tempfile
from io import BytesIO

import librosa
import numpy as np
import scipy.signal as signal
import soundfile as sf
import streamlit as st
from scipy.io import wavfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
def _shift_lpc_formants(a, formant_shift_factor, max_radius=0.99):
    """Return LPC coefficients with resonance frequencies scaled by a factor.

    The denominator polynomial ``a`` is factored into poles. Each complex
    pole keeps its radius (resonance bandwidth) while its angle (resonance
    frequency) is multiplied by ``formant_shift_factor``.

    Args:
        a: LPC coefficients with ``a[0] == 1``.
        formant_shift_factor: Multiplier applied to each resonance frequency.
        max_radius: Upper bound for pole radii; must be below 1.

    Returns:
        Real-valued coefficient array of the same length as ``a``.
    """
    poles = np.roots(a)
    # Keep every pole strictly inside the unit circle so the synthesis filter
    # can never become unstable.
    radius = np.minimum(np.abs(poles), max_radius)
    angle = np.angle(poles)
    # Real poles have no conjugate partner; rotating one off the real axis
    # would yield complex filter coefficients, so they are left in place.
    is_complex = np.abs(np.imag(poles)) > 1e-12
    # Clip at +/- pi (the Nyquist frequency); the clip is symmetric, so
    # conjugate pairs stay conjugate.
    scaled = np.clip(angle * formant_shift_factor, -np.pi, np.pi)
    angle = np.where(is_complex, scaled, angle)
    return np.real(np.poly(radius * np.exp(1j * angle)))


def modify_formants(y, sr, formant_shift_factor=1.2):
    """Shift the formants of a signal with frame-wise LPC analysis/resynthesis.

    Each frame is inverse-filtered with its own LPC polynomial to obtain the
    excitation residual, which is then re-filtered through a polynomial whose
    pole angles were scaled by ``formant_shift_factor``. Frames are recombined
    by Hann-weighted overlap-add.

    The previous implementation multiplied coefficient ``i`` by
    ``factor ** i``. That scales the pole radii instead of their angles, so
    for any factor above 1 the synthesis filter became unstable. It also
    filtered the frame rather than the residual and spliced frames without
    overlap.

    Args:
        y: Audio samples; treated as a 1-D (mono) sequence.
        sr: Sample rate. Unused; kept for interface compatibility.
        formant_shift_factor: Multiplier applied to formant frequencies.

    Returns:
        Peak-normalized array with the same number of samples as ``y``.
        An empty input is returned as an empty array.
    """
    frame_length = 2048
    hop_length = 512
    lpc_order = 12

    y = np.asarray(y, dtype=np.float64)
    n_samples = len(y)
    if n_samples == 0:
        return y

    # Zero-pad the tail so frames tile the whole signal, including inputs
    # shorter than a single frame.
    n_hops = max(0, -(-(n_samples - frame_length) // hop_length))
    padded_length = frame_length + n_hops * hop_length
    padded = np.pad(y, (0, padded_length - n_samples))

    window = signal.get_window("hann", frame_length)
    shifted = np.zeros(padded_length)
    weight = np.zeros(padded_length)

    for start in range(0, padded_length - frame_length + 1, hop_length):
        stop = start + frame_length
        frame = padded[start:stop]
        weight[start:stop] += window

        if not np.any(frame):
            # Silent frame: there is no spectral envelope to move.
            continue

        try:
            a = librosa.lpc(frame, order=lpc_order)
        except FloatingPointError:
            a = None
        if a is None or not np.all(np.isfinite(a)):
            # LPC estimation failed; pass the frame through unchanged.
            shifted[start:stop] += frame * window
            continue

        # Inverse filter A(z) removes the envelope; 1/A'(z) imposes the
        # shifted envelope on the same excitation.
        residual = signal.lfilter(a, [1.0], frame)
        new_a = _shift_lpc_formants(a, formant_shift_factor)
        shifted[start:stop] += signal.lfilter([1.0], new_a, residual) * window

    # Weighted average of overlapping frames; samples where the window sum is
    # zero (the very first sample) stay at zero.
    covered = weight > np.finfo(float).eps
    shifted[covered] /= weight[covered]

    return librosa.util.normalize(shifted[:n_samples])
41
+
42
def enhance_harmonics(y, sr):
    """Boost the harmonic component of a signal and peak-normalize it.

    Args:
        y: Audio samples.
        sr: Sample rate. Unused; kept for interface compatibility.

    Returns:
        The normalized mix of 1.2x the harmonic component from
        harmonic-percussive separation and 0.3x the input signal.
    """
    harmonic_part, _ = librosa.effects.hpss(y)
    blended = 1.2 * harmonic_part + 0.3 * y
    return librosa.util.normalize(blended)
49
+
50
def process_audio_advanced(audio_file, settings):
    """Run the full voice-conversion chain on an audio file.

    Steps, in order: load, pitch shift, formant modification, harmonic
    enhancement, time stretch, light smoothing, peak normalization.

    Args:
        audio_file: Path (or other source accepted by ``librosa.load``).
        settings: Mapping with the keys ``'pitch_shift'`` (semitones),
            ``'formant_shift'`` and ``'vtln_factor'``. An optional
            ``'smoothing_window'`` (odd sample count, default 7) sets the
            Savitzky-Golay window; values below 3 disable smoothing.

    Returns:
        Tuple ``(samples, sample_rate)`` with the processed audio and the
        sample rate reported by ``librosa.load``.
    """
    savgol_polyorder = 2

    y, sr = librosa.load(audio_file)

    # NOTE: librosa's pitch_shift moves the formants together with the
    # pitch; the separate formant stage below adjusts the envelope further.
    y_shifted = librosa.effects.pitch_shift(
        y,
        sr=sr,
        n_steps=settings['pitch_shift']
    )

    y_formant = modify_formants(
        y_shifted,
        sr,
        settings['formant_shift']
    )

    y_harmonic = enhance_harmonics(y_formant, sr)

    # NOTE: time_stretch changes the speaking rate by 'vtln_factor'; it does
    # not resample the signal or alter the spectral envelope.
    y_vtln = librosa.effects.time_stretch(
        y_harmonic,
        rate=settings['vtln_factor']
    )

    # The former fixed 1001-sample window averaged over roughly 45 ms at
    # 22.05 kHz, which removes almost all speech content, and it raised on
    # clips shorter than 1001 samples. Use a short window, clamped to the
    # signal length and forced odd.
    window_length = min(int(settings.get('smoothing_window', 7)), len(y_vtln))
    if window_length % 2 == 0:
        window_length -= 1
    if window_length > savgol_polyorder:
        y_smooth = signal.savgol_filter(y_vtln, window_length, savgol_polyorder)
    else:
        y_smooth = y_vtln

    y_final = librosa.util.normalize(y_smooth)

    return y_final, sr
84
+
85
def create_voice_preset(preset_name):
    """Look up the processing settings for a named voice preset.

    Args:
        preset_name: One of ``'Young Female'``, ``'Mature Female'`` or
            ``'Soft Female'``.

    Returns:
        A new dict with the keys ``'pitch_shift'``, ``'formant_shift'``,
        ``'vtln_factor'`` and ``'breathiness'``, or ``None`` when the name
        is not a known preset.
    """
    setting_names = ('pitch_shift', 'formant_shift', 'vtln_factor', 'breathiness')
    preset_values = {
        'Young Female': (8.0, 1.3, 1.1, 0.3),
        'Mature Female': (6.0, 1.2, 1.05, 0.2),
        'Soft Female': (7.0, 1.25, 1.15, 0.4),
    }

    values = preset_values.get(preset_name)
    if values is None:
        return None
    return dict(zip(setting_names, values))
107
+
108
def add_breathiness(y, sr, amount=0.3):
    """Mix filtered random noise into a signal and peak-normalize the result.

    Args:
        y: Audio samples.
        sr: Sample rate. Unused; kept for interface compatibility.
        amount: Mixing weight of the noise; the signal is weighted by
            ``1 - amount``.

    Returns:
        The normalized mix of signal and noise.
    """
    n_samples = len(y)

    # Gaussian noise (sigma 0.01) shaped by a one-pole recursive filter.
    white = np.random.normal(0, 0.01, n_samples)
    breath = signal.lfilter([1], [1, -0.98], white)

    mixed = (1 - amount) * y + amount * breath
    return librosa.util.normalize(mixed)
116
+
117
# ---------------------------------------------------------------------------
# Streamlit UI: upload a clip, choose a preset or custom settings, convert it,
# then play and download the result.
# ---------------------------------------------------------------------------
st.title("Advanced Female Voice Converter")

# File uploader
uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])

if uploaded_file is not None:
    # Voice preset selector
    preset_name = st.selectbox(
        "Select Voice Preset",
        ['Young Female', 'Mature Female', 'Soft Female', 'Custom']
    )

    if preset_name == 'Custom':
        settings = {
            'pitch_shift': st.slider("Pitch Shift", 0.0, 12.0, 8.0, 0.5),
            'formant_shift': st.slider("Formant Shift", 1.0, 1.5, 1.2, 0.05),
            'vtln_factor': st.slider("Vocal Tract Length", 0.9, 1.2, 1.1, 0.05),
            'breathiness': st.slider("Breathiness", 0.0, 1.0, 0.3, 0.1)
        }
    else:
        settings = create_voice_preset(preset_name)

    if st.button("Convert Voice"):
        with st.spinner("Processing audio..."):
            # Streamlit reruns this script on every widget interaction, so
            # the upload is written to disk only when a conversion is
            # requested, and the file is always removed afterwards.
            # The upload's own extension is kept so an mp3 is not mislabeled
            # as a wav.
            suffix = os.path.splitext(uploaded_file.name)[1] or '.wav'
            tmp_path = None
            try:
                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                    tmp_file.write(uploaded_file.getvalue())
                    tmp_path = tmp_file.name

                # Process audio
                processed_audio, sr = process_audio_advanced(tmp_path, settings)

                # Add breathiness
                processed_audio = add_breathiness(
                    processed_audio,
                    sr,
                    settings['breathiness']
                )

                # Encode as WAV in memory. Passing raw bytes avoids relying
                # on how each widget treats the buffer's stream position.
                buffer = BytesIO()
                sf.write(buffer, processed_audio, sr, format='WAV')
                audio_bytes = buffer.getvalue()

                # Display audio player
                st.audio(audio_bytes, format='audio/wav')

                # Download button
                st.download_button(
                    label="Download Converted Audio",
                    data=audio_bytes,
                    file_name="female_voice_converted.wav",
                    mime="audio/wav"
                )

            except Exception as e:
                # Top-level UI boundary: report the failure to the user
                # instead of crashing the app.
                st.error(f"Error processing audio: {e}")
            finally:
                if tmp_path is not None and os.path.exists(tmp_path):
                    os.remove(tmp_path)

st.markdown("""
### Voice Conversion Features:
- Pitch shifting with formant preservation
- Harmonic enhancement
- Vocal tract length modification
- Natural breathiness addition
- Multiple voice presets
- Custom parameter controls

### Tips for Best Results:
1. Start with a clear audio recording
2. Try different presets to find the best match
3. For custom settings:
   - Pitch shift: 6-8 for natural female voice
   - Formant shift: 1.1-1.3 for feminine resonance
   - Vocal tract length: 1.05-1.15 for realistic results
   - Breathiness: 0.2-0.4 for natural sound
""")