# Hugging Face Hub page residue (kept for provenance):
# Seqath's picture
# Create app.py
# bc8665f verified
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import soundfile as sf
import librosa
import numpy as np
from flask import Flask, request, jsonify
import gradio as gr
# Flask application used for the /clone-voice HTTP endpoint below.
app = Flask(__name__)
# Load pre-trained model and tokenizer from Hugging Face
# NOTE(review): Wav2Vec2Tokenizer is deprecated in recent transformers
# releases in favor of Wav2Vec2Processor — confirm against the pinned
# transformers version before upgrading.
model_name = "facebook/wav2vec2-large-960h"
tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
def load_audio(file_path):
    """Load an audio file and return a mono float waveform at 16 kHz.

    Parameters
    ----------
    file_path : str
        Path to any audio file librosa can read.

    Returns
    -------
    np.ndarray
        1-D float32 samples in [-1, 1], resampled to 16 kHz (the rate
        wav2vec2-large-960h was trained on).
    """
    # Fix: the original body had lost its indentation and was not valid Python.
    # librosa.load handles resampling and mono down-mixing in one call.
    audio, _ = librosa.load(file_path, sr=16000)
    return audio
def clone_voice(audio):
    """Transcribe *audio* with wav2vec2 and write the (placeholder) converted audio.

    Parameters
    ----------
    audio : np.ndarray
        1-D float waveform sampled at 16 kHz (see ``load_audio``).

    Returns
    -------
    str
        Path of the WAV file written under ``song_output/``.
    """
    import os  # local import so this fix is self-contained

    input_values = tokenizer(audio, return_tensors="pt").input_values
    # Fix: inference only — disable autograd to avoid building a graph.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    # NOTE(review): the transcription is computed but unused by the placeholder
    # conversion below — presumably intended for a future text-conditioned
    # voice-conversion step.
    transcription = tokenizer.decode(predicted_ids[0])
    # Placeholder for voice conversion logic
    converted_audio = np.array(audio)  # Replace with actual conversion logic
    output_path = "song_output/output.wav"
    # Fix: sf.write raises if the target directory does not exist.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    sf.write(output_path, converted_audio, 16000)
    return output_path
@app.route('/clone-voice', methods=['POST'])
def clone_voice_endpoint():
    """Flask endpoint: accept an uploaded audio file and run the cloning pipeline.

    Expects the upload under the multipart field ``file``. Returns JSON
    ``{"output_path": ...}`` with 200 on success, or ``{"error": ...}`` with
    400 when no file was attached.
    """
    # Fix: the original body had lost its indentation and was not valid Python.
    if 'file' not in request.files:
        return jsonify({"error": "No file provided"}), 400
    file = request.files['file']
    # NOTE(review): a fixed path means concurrent requests overwrite each
    # other's input — consider tempfile.NamedTemporaryFile if that matters.
    file_path = "input.wav"
    file.save(file_path)
    audio = load_audio(file_path)
    output_path = clone_voice(audio)
    return jsonify({"output_path": output_path}), 200
def main_interface(audio):
    """Gradio callback: process an uploaded recording and return the output path.

    Fix: ``gr.Audio(type="numpy")`` delivers a ``(sample_rate, samples)``
    tuple of integer PCM, not a bare float array, so the original call passed
    an unusable value straight to ``clone_voice``. Unpack, normalize, and
    resample to the 16 kHz the model expects before delegating.
    """
    if isinstance(audio, tuple):
        sample_rate, samples = audio
        samples = np.asarray(samples, dtype=np.float32)
        # Gradio yields int16 PCM; scale into [-1, 1] like librosa.load does.
        if samples.size and np.abs(samples).max() > 1.0:
            samples = samples / 32768.0
        if sample_rate != 16000:
            samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
        audio = samples
    return clone_voice(audio)
# Gradio UI: upload an audio clip, get the processed file back.
# NOTE(review): `source="upload"` and output `type="file"` follow the Gradio
# 3.x API; Gradio 4.x renamed these to `sources=[...]` and `type="filepath"`
# — confirm against the installed gradio version.
iface = gr.Interface(fn=main_interface,
inputs=gr.Audio(source="upload", type="numpy"),
outputs=gr.Audio(type="file"))
if __name__ == "__main__":
    # Fix: the launch call had lost its indentation under the guard.
    # NOTE(review): only the Gradio UI is served here; the Flask `app` above
    # is defined but never run, so the /clone-voice endpoint is unreachable
    # unless the Flask app is launched (or mounted) separately.
    iface.launch(server_name="0.0.0.0", server_port=5000)