# Source: Vaishnavi0404's Hugging Face Space — "Update app.py" (commit 1973e0f, verified)
import os
import gradio as gr
import torch
import numpy as np
import librosa
import text2emotion as te
import nltk
import soundfile as sf
from pydub import AudioSegment
from transformers import pipeline
from music_generator import generate_accompaniment
from text_processor import TextProcessor
from voice_synthesizer import VoiceSynthesizer
from singing_converter import SingingConverter
import setup
import sys
import subprocess
# One-time environment setup: NLTK corpora, vendored package paths, and
# speaker embeddings must be ready before the pipeline components below load.
# (Previously 'punkt' and 'omw-1.4' were downloaded twice; deduplicated here.)
_NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'vader_lexicon',
)
for _resource in _NLTK_RESOURCES:
    nltk.download(_resource)

# Make vendored repositories importable (g2pM for phonemization,
# DiffSinger for the singing-voice model).
sys.path.append('./g2pM')
sys.path.append('./DiffSinger')

# Download/prepare the speaker-embedding files used by the voice synthesizer.
setup.setup_speaker_embeddings()

# Initialize the processing pipeline components (module-level singletons).
text_processor = TextProcessor()
voice_synthesizer = VoiceSynthesizer()
singing_converter = SingingConverter()

# Hugging Face sentiment pipeline (default model), used alongside text2emotion.
sentiment_analyzer = pipeline("sentiment-analysis")
def create_placeholder_audio(output_path, duration=5, sample_rate=22050):
    """Write a silent WAV file to *output_path* and return the path.

    Used as a stand-in wherever a real audio render is unavailable.

    Args:
        output_path (str): Destination path for the WAV file.
        duration (int | float): Length of the silence in seconds.
        sample_rate (int): Sample rate of the generated file in Hz.

    Returns:
        str: The same *output_path* that was written.
    """
    total_samples = int(duration * sample_rate)
    sf.write(output_path, np.zeros(total_samples), sample_rate)
    return output_path
def convert_midi_to_wav(midi_path, wav_path, soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2'):
    """Render a MIDI file to WAV via the external ``fluidsynth`` binary.

    Degrades gracefully: when the MIDI file is missing, fluidsynth is not
    installed, or rendering fails, a silent placeholder WAV is produced
    instead so downstream mixing always has an audio file to load.

    Args:
        midi_path (str): Path of the MIDI file to render.
        wav_path (str): Destination path of the rendered WAV file.
        soundfont_path (str): SoundFont used for synthesis.

    Returns:
        str: Path of the WAV produced (real render or placeholder).
    """
    if not os.path.exists(midi_path):
        print(f"MIDI file not found: {midi_path}")
        print("Creating placeholder audio file instead")
        return create_placeholder_audio(wav_path)

    # '-a file' selects the file audio driver; '-F' names the output file.
    command = ['fluidsynth', '-a', 'file', '-F', wav_path, soundfont_path, midi_path]
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error converting MIDI to WAV: {e}")
        return create_placeholder_audio(wav_path)
    except FileNotFoundError:
        print("fluidsynth not found. Using placeholder audio instead.")
        return create_placeholder_audio(wav_path)
    return wav_path
def _detect_emotion(text):
    """Return ``(dominant_emotion, sentiment_score)`` for *text*.

    The dominant emotion comes from text2emotion (defaulting to "Happy"
    when nothing is detected); the sentiment score is the transformer
    pipeline's probability, signed by its POSITIVE/NEGATIVE label.
    """
    emotions = te.get_emotion(text)
    dominant_emotion = max(emotions.items(), key=lambda x: x[1])[0] if emotions else "Happy"
    sentiment_result = sentiment_analyzer(text)[0]
    sentiment_score = sentiment_result['score'] * (1 if sentiment_result['label'] == 'POSITIVE' else -1)
    return dominant_emotion, sentiment_score


def _render_accompaniment(text, singing_audio_path, emotion, tempo):
    """Generate a mood-matched accompaniment WAV and return its path.

    Any failure in MIDI generation degrades to a silent placeholder track
    (via convert_midi_to_wav) rather than aborting the whole conversion.
    """
    accompaniment_midi_path = "temp_accompaniment.mid"
    # Map the detected emotion to a musical key; unknown emotions fall
    # back to C major.
    emotion_key_map = {
        "Happy": "C",
        "Sad": "Am",
        "Angry": "Em",
        "Fear": "Dm",
        "Surprise": "G",
    }
    key = emotion_key_map.get(emotion, "C")
    try:
        generate_accompaniment(
            lyrics=text,
            melody_path=singing_audio_path,
            output_path=accompaniment_midi_path,
            tempo_value=tempo,
            key=key,
            time_signature="4/4",
            style="pop",  # default style
        )
    except Exception as e:
        # Best-effort: convert_midi_to_wav substitutes a placeholder when
        # the MIDI file is missing or unreadable.
        print(f"Error generating accompaniment: {e}")
    accompaniment_path = "temp_accompaniment.wav"
    convert_midi_to_wav(accompaniment_midi_path, accompaniment_path)
    return accompaniment_path


def _mix_tracks(singing_audio_path, accompaniment_path, output_path):
    """Overlay the accompaniment under the singing track and export WAV."""
    singing = AudioSegment.from_file(singing_audio_path)
    try:
        accompaniment = AudioSegment.from_file(accompaniment_path)
    except Exception as e:
        print(f"Error loading accompaniment: {e}")
        create_placeholder_audio(accompaniment_path)
        accompaniment = AudioSegment.from_file(accompaniment_path)
    # Duck both tracks so the mix does not clip; the voice stays on top.
    singing = singing - 3
    accompaniment = accompaniment - 10
    # Loop the accompaniment until it covers the vocal, then trim.  The
    # zero-length guard avoids a ZeroDivisionError on a degenerate track.
    if 0 < len(accompaniment) < len(singing):
        times_to_repeat = (len(singing) // len(accompaniment)) + 1
        accompaniment = accompaniment * times_to_repeat
    accompaniment = accompaniment[:len(singing)]
    mixed = singing.overlay(accompaniment)
    mixed.export(output_path, format="wav")
    return output_path


def process_text_to_singing(text, voice_type="neutral", tempo=100, pitch_shift=0):
    """
    Convert text to singing voice with accompaniment based on mood.

    Args:
        text (str): Input text to be converted to singing
        voice_type (str): Type of voice (neutral, feminine, masculine)
        tempo (int): Speed of the singing (60-180 BPM)
        pitch_shift (int): Pitch adjustment (-12 to 12 semitones)

    Returns:
        tuple: (input_audio_path, output_audio_path)
    """
    # Step 1: analyze the lyrics for emotion/mood.
    dominant_emotion, sentiment_score = _detect_emotion(text)
    print(f"Detected emotion: {dominant_emotion}")
    print(f"Sentiment score: {sentiment_score}")

    # Step 2: phonemes/durations/stress drive the speech-to-singing pass.
    phonemes, durations, stress_markers = text_processor.process(text)

    # Step 3: synthesize plain speech first.
    speech_audio_path = "temp_speech.wav"
    voice_synthesizer.synthesize(
        text=text,
        output_path=speech_audio_path,
        voice_type=voice_type
    )

    # Step 4: convert the speech into singing.
    singing_audio_path = "temp_singing.wav"
    singing_converter.convert(
        speech_path=speech_audio_path,
        output_path=singing_audio_path,
        emotion=dominant_emotion,
        phonemes=phonemes,
        durations=durations,
        stress_markers=stress_markers,
        pitch_shift=pitch_shift,
        tempo=tempo
    )

    # Step 5: musical accompaniment matched to the detected mood.
    accompaniment_path = _render_accompaniment(
        text, singing_audio_path, dominant_emotion, tempo
    )

    # Step 6: mix voice and accompaniment into the final song.
    final_output_path = _mix_tracks(
        singing_audio_path, accompaniment_path, "output_song.wav"
    )
    return speech_audio_path, final_output_path
# ---------------------------------------------------------------------------
# Gradio UI: lyrics + voice controls on the left, audio results on the right.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Text2Sing-DiffSinger") as demo:
    gr.Markdown("# Text2Sing-DiffSinger")
    gr.Markdown("Convert text into singing voice with musical accompaniment based on emotional content")

    with gr.Row():
        with gr.Column():
            lyrics_box = gr.Textbox(
                label="Enter text to convert to singing",
                placeholder="Type your lyrics here...",
                lines=5
            )
            with gr.Row():
                voice_choice = gr.Dropdown(
                    label="Voice Type",
                    choices=["neutral", "feminine", "masculine"],
                    value="neutral"
                )
                tempo_slider = gr.Slider(
                    label="Tempo (BPM)",
                    minimum=60,
                    maximum=180,
                    value=100,
                    step=5
                )
                pitch_slider = gr.Slider(
                    label="Pitch Adjustment",
                    minimum=-12,
                    maximum=12,
                    value=0,
                    step=1
                )
            sing_button = gr.Button("Convert to Singing")

        with gr.Column():
            speech_player = gr.Audio(label="Original Speech")
            song_player = gr.Audio(label="Singing Output")

    # Wire the button to the text-to-singing pipeline.
    sing_button.click(
        fn=process_text_to_singing,
        inputs=[lyrics_box, voice_choice, tempo_slider, pitch_slider],
        outputs=[speech_player, song_player]
    )

# Launch the app only when run as a script.
if __name__ == "__main__":
    demo.launch()