Spaces:

Artificial-superintelligence
/

Algorithmvoice

Running

App Files Files Community

Artificial-superintelligence committed on Oct 17, 2024

Commit

f034b93

verified ·

1 Parent(s): 91cc0de

Update app.py

Browse files

Files changed (1) hide show

app.py +164 -155

app.py CHANGED Viewed

@@ -1,183 +1,192 @@
 import streamlit as st
-import torch
-import torchaudio
-import numpy as np
 import librosa
 import soundfile as sf
-from TTS.api import TTS
-from fairseq import checkpoint_utils
-import wget
-import os
 from io import BytesIO
 import tempfile
-import huggingface_hub
-class VoiceConverter:
-    def __init__(self):
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.load_models()
-    def load_models(self):
-        # Download pre-trained models if not exists
-        models_dir = "pretrained_models"
-        os.makedirs(models_dir, exist_ok=True)
-        # Load Coqui TTS model
-        self.tts = TTS("tts_models/multilingual/multi-dataset/your_tts", progress_bar=False)
-        # Load VITS model
-        vits_path = os.path.join(models_dir, "vits_female.pth")
-        if not os.path.exists(vits_path):
-            # Download VITS pre-trained model
-            wget.download(
-                "https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/G_953000.pth",
-                vits_path
-            )
-        self.vits_model = torch.load(vits_path, map_location=self.device)
-        self.vits_model.eval()
-    def convert_voice(self, audio_path, speaker_id=1, emotion="Happy"):
-        # Load audio
-        wav, sr = librosa.load(audio_path)
-        # Resample if needed
-        if sr != 22050:
-            wav = librosa.resample(wav, orig_sr=sr, target_sr=22050)
-            sr = 22050
-        # Convert to tensor
-        wav_tensor = torch.FloatTensor(wav).unsqueeze(0).to(self.device)
-        # Process with VITS
-        with torch.no_grad():
-            converted = self.vits_model.voice_conversion(
-                wav_tensor,
-                speaker_id=speaker_id
-            )
-        # Process with Coqui TTS for emotion
-        wav_path = "temp.wav"
-        sf.write(wav_path, converted.cpu().numpy(), sr)
-        emotional_wav = self.tts.tts_with_vc(
-            wav_path,
-            speaker_wav=wav_path,
-            emotion=emotion
-        )
-        return emotional_wav, sr
-def save_audio(audio_data, sr):
-    buffer = BytesIO()
-    sf.write(buffer, audio_data, sr, format='WAV')
-    return buffer
-# Streamlit Interface
-st.title("AI Voice Converter - Female Voice Transformation")
-# Model selection
-model_type = st.selectbox(
-    "Select Voice Model",
-    ["VITS Female", "YourTTS Female", "Mixed Model"]
-)
-# Voice character selection
-voice_character = st.selectbox(
-    "Select Voice Character",
-    ["Anime Female", "Natural Female", "Young Female", "Mature Female"]
-)
-# Emotion selection
-emotion = st.selectbox(
-    "Select Emotion",
-    ["Happy", "Sad", "Angry", "Neutral", "Excited"]
-)
-# Additional parameters
-with st.expander("Advanced Settings"):
-    pitch_adjust = st.slider("Pitch Adjustment", -10, 10, 0)
-    clarity = st.slider("Voice Clarity", 0.0, 1.0, 0.8)
-    speed = st.slider("Speaking Speed", 0.5, 2.0, 1.0)
-# File upload
 uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])
 if uploaded_file is not None:
-    # Initialize converter
-    converter = VoiceConverter()
     # Save uploaded file temporarily
     with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
         tmp_file.write(uploaded_file.getvalue())
         tmp_path = tmp_file.name
-    if st.button("Convert Voice"):
-        try:
-            with st.spinner("Converting voice... This may take a few moments."):
-                # Get speaker ID based on voice character
-                speaker_id = {
-                    "Anime Female": 0,
-                    "Natural Female": 1,
-                    "Young Female": 2,
-                    "Mature Female": 3
-                }[voice_character]
-                # Convert voice
-                converted_audio, sr = converter.convert_voice(
-                    tmp_path,
-                    speaker_id=speaker_id,
-                    emotion=emotion
                 )
-                # Create audio buffer
-                audio_buffer = save_audio(converted_audio, sr)
                 # Display audio player
-                st.audio(audio_buffer, format='audio/wav')
                 # Download button
                 st.download_button(
                     label="Download Converted Audio",
-                    data=audio_buffer,
-                    file_name="ai_converted_voice.wav",
                     mime="audio/wav"
                 )
-        except Exception as e:
-            st.error(f"Error during conversion: {str(e)}")
-# Add information about the models
 st.markdown("""
-### Model Information:
-1. **VITS Female**: Pre-trained on a large dataset of female voices
-2. **YourTTS**: Multi-speaker, multi-lingual voice conversion model
-3. **Mixed Model**: Combination of multiple models for better quality
-### Voice Characters:
-- **Anime Female**: High-pitched, animated style voice
-- **Natural Female**: Realistic female voice
-- **Young Female**: Young adult female voice
-- **Mature Female**: Mature female voice
 ### Tips for Best Results:
-- Use clear audio input with minimal background noise
-- Short audio clips (5-30 seconds) work best
-- Experiment with different emotions and voice characters
-- Adjust advanced settings for fine-tuning
-""")
-# Requirements
-"""
-pip install requirements:
-TTS
-fairseq
-torch
-torchaudio
-streamlit
-librosa
-soundfile
-numpy
-wget
-huggingface_hub
-"""

 import streamlit as st
 import librosa
 import soundfile as sf
+import numpy as np
+import scipy.signal as signal
+from scipy.io import wavfile
 from io import BytesIO
 import tempfile
+def _shift_lpc_formants(a, factor, max_radius=0.999):
+    """Return LPC coefficients whose resonance frequencies are scaled by ``factor``.
+
+    The roots of the prediction polynomial are found, the angle of each
+    complex-conjugate pole pair is multiplied by ``factor`` (capped just below
+    pi), and the polynomial is rebuilt. Pole radii are capped at
+    ``max_radius`` so the resulting all-pole filter is always stable.
+    Real-valued poles carry no resonance frequency and are kept (clipped only).
+    """
+    roots = np.roots(a)
+    imag_tol = 1e-8
+    # Keep one member of every conjugate pair; its mirror image is rebuilt below.
+    upper = roots[roots.imag > imag_tol]
+    real_roots = np.clip(
+        roots[np.abs(roots.imag) <= imag_tol].real, -max_radius, max_radius
+    )
+    radius = np.minimum(np.abs(upper), max_radius)
+    angle = np.clip(np.angle(upper) * factor, 0.0, 0.999 * np.pi)
+    moved = radius * np.exp(1j * angle)
+    new_roots = np.concatenate([moved, np.conj(moved), real_roots])
+    if len(new_roots) != len(roots):
+        # Rounding split a conjugate pair unevenly; leave this frame's filter as-is.
+        return np.asarray(a, dtype=float)
+    return a[0] * np.real(np.poly(new_roots))
+def modify_formants(y, sr, formant_shift_factor=1.2):
+    """Shift the formants of ``y`` by ``formant_shift_factor`` using frame-wise LPC.
+
+    Each 2048-sample frame (hop 512) is inverse-filtered with its own LPC
+    polynomial to obtain the excitation, which is then re-synthesised through
+    an all-pole filter whose pole angles have been scaled. Frames are
+    recombined by Hann-windowed overlap-add.
+
+    Args:
+        y: Audio samples; assumed to be a 1-D array (TODO confirm for callers
+            other than ``process_audio_advanced``).
+        sr: Sample rate. Unused; kept for interface compatibility.
+        formant_shift_factor: Multiplier applied to every resonance frequency
+            (values above 1 raise the formants).
+
+    Returns:
+        Peak-normalized array with the same length as ``y``.
+    """
+    frame_length = 2048
+    hop_length = 512
+    lpc_order = 12
+    y = np.asarray(y)
+    out_dtype = y.dtype if np.issubdtype(y.dtype, np.floating) else np.float64
+    n_samples = len(y)
+    if n_samples == 0:
+        return y.astype(out_dtype)
+    # Zero-pad so short clips still get one frame and the tail is not dropped.
+    n_frames = 1 + max(0, -(-(n_samples - frame_length) // hop_length))
+    padded_length = (n_frames - 1) * hop_length + frame_length
+    padded = np.zeros(padded_length, dtype=np.float64)
+    padded[:n_samples] = y
+    window = signal.get_window('hann', frame_length)
+    mixed = np.zeros(padded_length)
+    weight = np.zeros(padded_length)
+    for index in range(n_frames):
+        start = index * hop_length
+        stop = start + frame_length
+        frame = padded[start:stop]
+        shifted = frame
+        if np.any(frame):  # silent frames have no envelope to move
+            try:
+                a = librosa.lpc(frame, order=lpc_order)
+            except FloatingPointError:
+                # librosa reports ill-conditioned frames this way; pass them through.
+                a = None
+            if a is not None:
+                new_a = _shift_lpc_formants(a, formant_shift_factor)
+                residual = signal.lfilter(a, [1.0], frame)
+                shifted = signal.lfilter([1.0], new_a, residual)
+        # Windowing after filtering also fades out each frame's filter start-up transient.
+        mixed[start:stop] += window * shifted
+        weight[start:stop] += window
+    y_formant = mixed / np.maximum(weight, 1e-8)
+    return librosa.util.normalize(y_formant[:n_samples]).astype(out_dtype)
+def enhance_harmonics(y, sr):
+    """Emphasise the harmonic part of ``y`` and peak-normalize the result.
+
+    The signal is split with librosa's harmonic/percussive separation; the
+    harmonic component (gain 1.2) is mixed with the untouched input
+    (gain 0.3). ``sr`` is accepted but not used.
+    """
+    harmonic_part, _percussive_part = librosa.effects.hpss(y)
+    blended = 1.2 * harmonic_part + 0.3 * y
+    return librosa.util.normalize(blended)
+def process_audio_advanced(audio_file, settings):
+    """Run the full voice-conversion chain on one audio file.
+
+    Args:
+        audio_file: Path (or anything ``librosa.load`` accepts) of the input.
+        settings: Dict with the keys ``'pitch_shift'`` (semitones),
+            ``'formant_shift'`` and ``'vtln_factor'``. An optional
+            ``'smooth_window'`` key (odd sample count, default 7) sets the
+            width of the final smoothing filter.
+
+    Returns:
+        Tuple ``(samples, sample_rate)`` with peak-normalized samples.
+    """
+    # Load audio with librosa's defaults
+    y, sr = librosa.load(audio_file)
+    # Pitch shift; formants move with the pitch here and are adjusted separately below
+    y_shifted = librosa.effects.pitch_shift(
+        y,
+        sr=sr,
+        n_steps=settings['pitch_shift']
+    )
+    # Modify formants
+    y_formant = modify_formants(
+        y_shifted,
+        sr,
+        settings['formant_shift']
+    )
+    # Enhance harmonics
+    y_harmonic = enhance_harmonics(y_formant, sr)
+    # NOTE: time_stretch changes speaking rate/duration (rate > 1 is faster);
+    # it is the step driven by the 'vtln_factor' setting.
+    y_vtln = librosa.effects.time_stretch(
+        y_harmonic,
+        rate=settings['vtln_factor']
+    )
+    # Light smoothing only. A 1001-sample Savitzky-Golay window behaves like a
+    # low-pass with a cutoff of roughly a few tens of Hz at 22.05 kHz and would
+    # strip nearly all speech content; it also fails on clips shorter than
+    # the window.
+    polyorder = 2
+    smooth_window = int(settings.get('smooth_window', 7))
+    if smooth_window % 2 == 0:
+        smooth_window -= 1  # keep the window centred on a sample
+    if polyorder < smooth_window <= len(y_vtln):
+        y_smooth = signal.savgol_filter(y_vtln, smooth_window, polyorder)
+    else:
+        y_smooth = y_vtln
+    # Final normalization
+    y_final = librosa.util.normalize(y_smooth)
+    return y_final, sr
+def create_voice_preset(preset_name):
+    """Look up the settings dict for a named voice preset.
+
+    Returns a freshly built dict with the keys ``'pitch_shift'``,
+    ``'formant_shift'``, ``'vtln_factor'`` and ``'breathiness'``, or ``None``
+    when ``preset_name`` is not one of the known presets.
+    """
+    field_names = ('pitch_shift', 'formant_shift', 'vtln_factor', 'breathiness')
+    # One row per preset, values in the same order as field_names.
+    table = {
+        'Young Female': (8.0, 1.3, 1.1, 0.3),
+        'Mature Female': (6.0, 1.2, 1.05, 0.2),
+        'Soft Female': (7.0, 1.25, 1.15, 0.4),
+    }
+    row = table.get(preset_name)
+    if row is None:
+        return None
+    return dict(zip(field_names, row))
+def add_breathiness(y, sr, amount=0.3):
+    """Blend filtered random noise into ``y`` and peak-normalize the mix.
+
+    ``amount`` is the noise weight; the input is weighted by ``1 - amount``.
+    The noise comes from NumPy's global random generator, so the output
+    differs between calls. ``sr`` is accepted but not used.
+    """
+    white = np.random.normal(0, 0.01, len(y))
+    # One-pole recursive filter: out[n] = white[n] + 0.98 * out[n - 1]
+    breath = signal.lfilter([1], [1, -0.98], white)
+    dry_gain = 1 - amount
+    return librosa.util.normalize(y * dry_gain + breath * amount)
+# Streamlit page: upload a clip, pick a preset (or custom sliders), convert,
+# then play and download the result.
+st.title("Advanced Female Voice Converter")
+# File uploader
 uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])
 if uploaded_file is not None:
+    # Voice preset selector
+    preset_name = st.selectbox(
+        "Select Voice Preset",
+        ['Young Female', 'Mature Female', 'Soft Female', 'Custom']
+    )
+    if preset_name == 'Custom':
+        settings = {
+            'pitch_shift': st.slider("Pitch Shift", 0.0, 12.0, 8.0, 0.5),
+            'formant_shift': st.slider("Formant Shift", 1.0, 1.5, 1.2, 0.05),
+            'vtln_factor': st.slider("Vocal Tract Length", 0.9, 1.2, 1.1, 0.05),
+            'breathiness': st.slider("Breathiness", 0.0, 1.0, 0.3, 0.1)
+        }
+    else:
+        settings = create_voice_preset(preset_name)
+    if st.button("Convert Voice"):
+        with st.spinner("Processing audio..."):
+            try:
+                # Write the upload to disk only when a conversion is requested.
+                # Streamlit reruns this script on every widget change, so an
+                # unconditional delete=False temp file would pile up on disk.
+                # The directory (and the file in it) is removed on exit.
+                suffix = '.mp3' if uploaded_file.name.lower().endswith('.mp3') else '.wav'
+                with tempfile.TemporaryDirectory() as tmp_dir:
+                    tmp_path = f"{tmp_dir}/input{suffix}"
+                    with open(tmp_path, 'wb') as tmp_file:
+                        tmp_file.write(uploaded_file.getvalue())
+                    # Process audio
+                    processed_audio, sr = process_audio_advanced(tmp_path, settings)
+                # Add breathiness
+                processed_audio = add_breathiness(
+                    processed_audio,
+                    sr,
+                    settings['breathiness']
                 )
+                # Save to buffer
+                buffer = BytesIO()
+                sf.write(buffer, processed_audio, sr, format='WAV')
+                # Hand plain bytes to both widgets so neither depends on the
+                # buffer's current read position.
+                audio_bytes = buffer.getvalue()
                 # Display audio player
+                st.audio(audio_bytes, format='audio/wav')
                 # Download button
                 st.download_button(
                     label="Download Converted Audio",
+                    data=audio_bytes,
+                    file_name="female_voice_converted.wav",
                     mime="audio/wav"
                 )
+            except Exception as e:
+                st.error(f"Error processing audio: {str(e)}")
 st.markdown("""
+### Voice Conversion Features:
+- Pitch shifting with formant preservation
+- Harmonic enhancement
+- Vocal tract length modification
+- Natural breathiness addition
+- Multiple voice presets
+- Custom parameter controls
 ### Tips for Best Results:
+1. Start with a clear audio recording
+2. Try different presets to find the best match
+3. For custom settings:
+   - Pitch shift: 6-8 for natural female voice
+   - Formant shift: 1.1-1.3 for feminine resonance
+   - Vocal tract length: 1.05-1.15 for realistic results
+   - Breathiness: 0.2-0.4 for natural sound
+""")