from fastapi import FastAPI, Response
import io

import chardet
import soundfile as sf
import torch
from transformers import AutoProcessor, BarkModel


def pre_process_text(text):
    # Normalize incoming text to UTF-8, detecting the source encoding with chardet
    raw = text.encode() if isinstance(text, str) else text
    result = chardet.detect(raw)
    encoding = result["encoding"] or "utf-8"
    if encoding.lower() != "utf-8":
        # Decode with the detected encoding so the result is a clean UTF-8 string
        return raw.decode(encoding, errors="replace")
    return raw.decode("utf-8")


device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = BarkModel.from_pretrained("suno/bark-small").to(device)
processor = AutoProcessor.from_pretrained("suno/bark")

app = FastAPI()


@app.get("/")
def root():
    return "ttsapi"


@app.get("/infer")
def tts(input: str = "Let's try generating speech, with Bark, a text-to-speech model"):
    # Clean up the query text, then run it through the Bark processor and model
    text_prompt = pre_process_text(input)
    inputs = processor(text_prompt, return_tensors="pt")
    speech_output = model.generate(**inputs.to(device))
    sampling_rate = model.generation_config.sample_rate

    # Encode the generated waveform as WAV into an in-memory buffer
    buffer = io.BytesIO()
    sf.write(buffer, speech_output[0].cpu().numpy(), samplerate=sampling_rate, format="WAV")
    audio_bytes = buffer.getvalue()

    return Response(
        content=audio_bytes,
        media_type="audio/wav",
        headers={"Content-Disposition": "attachment; filename=audio.wav"},
    )
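
# A minimal sketch of how this service might be launched and exercised locally
# (assumes the file is saved as main.py and uvicorn is installed; the filename,
# port, and query text below are illustrative, not part of the original script):
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#   curl "http://localhost:8000/infer?input=Hello%20from%20Bark" --output bark.wav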