from fastapi import FastAPI, Response
import io

import chardet
import soundfile as sf
import torch
from transformers import AutoProcessor, BarkModel


def pre_process_text(text):
    # Normalize incoming text to UTF-8, detecting the source encoding with chardet
    raw = text.encode() if isinstance(text, str) else text
    result = chardet.detect(raw)
    encoding = result["encoding"] or "utf-8"
    if encoding.lower() != "utf-8":
        # Decode with the detected encoding so the result is a clean UTF-8 string
        return raw.decode(encoding, errors="replace")
    return raw.decode("utf-8")


device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = BarkModel.from_pretrained("suno/bark-small").to(device)
processor = AutoProcessor.from_pretrained("suno/bark")

app = FastAPI()


@app.get("/")
def root():
    return "ttsapi"


@app.get("/infer")
def tts(input: str = "Let's try generating speech, with Bark, a text-to-speech model"):
    # Clean up the query text, then run it through the Bark processor and model
    text_prompt = pre_process_text(input)
    inputs = processor(text_prompt, return_tensors="pt")
    speech_output = model.generate(**inputs.to(device))
    sampling_rate = model.generation_config.sample_rate

    # Encode the generated waveform as WAV into an in-memory buffer
    buffer = io.BytesIO()
    sf.write(buffer, speech_output[0].cpu().numpy(), samplerate=sampling_rate, format="WAV")
    audio_bytes = buffer.getvalue()

    return Response(
        content=audio_bytes,
        media_type="audio/wav",
        headers={"Content-Disposition": "attachment; filename=audio.wav"},
    )
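
# A minimal sketch of how this service might be launched and exercised locally
# (assumes the file is saved as main.py and uvicorn is installed; the filename,
# port, and query text below are illustrative, not part of the original script):
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#   curl "http://localhost:8000/infer?input=Hello%20from%20Bark" --output bark.wav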