Spaces:

wolf1997
/

text_to_speech_api

Sleeping

App Files Files Community

text_to_speech_api / app.py

wolf1997

Upload 3 files

157a15c verified 7 months ago

raw

history blame contribute delete

2.73 kB

	from fastapi import FastAPI, HTTPException
	from fastapi.responses import Response
	from pydantic import BaseModel
	from kokoro import KPipeline

	import soundfile as sf
	import torch
	import os
	import uuid
	import numpy as np
	import io
	from typing import Optional



	# Kokoro synthesis pipeline, built once at import time and reused by every
	# call to the /tts handler defined later in this file.
	# NOTE(review): lang_code='a' presumably selects American English — confirm
	# against the Kokoro documentation.
	pipeline = KPipeline(lang_code='a')
	# FastAPI application object; the route decorators in this file register on it.
	app = FastAPI(title="Text to Speech API")

	class TextToSpeechRequest(BaseModel):
	    """JSON body accepted by ``POST /tts``.

	    Only ``text`` is required. ``language`` and ``slow`` are part of the
	    schema, but the ``/tts`` handler in this file reads only ``text``.
	    """

	    # Text to be converted to speech (required).
	    text: str
	    # Language code; defaults to "en" when omitted.
	    language: Optional[str] = "en"
	    # Slow-speech flag; defaults to False when omitted.
	    slow: Optional[bool] = False



	def tensor_to_audio_bytes(audio_tensor: torch.Tensor, sample_rate: int = 24000) -> bytes:
	    """Encode an audio tensor as an in-memory WAV file.

	    Args:
	        audio_tensor: Audio samples shaped ``(samples,)`` or
	            ``(channels, samples)``. It is detached and moved to the CPU
	            before encoding.
	        sample_rate: Sample rate in Hz passed to the WAV writer.
	            Default is 24000.

	    Returns:
	        The complete WAV file contents as bytes.

	    Raises:
	        ValueError: If the tensor is not 1-D or 2-D (this includes 0-D
	            scalars, which carry no sample axis).
	    """
	    # Ensure the data is on the CPU and detached from autograd before
	    # handing it to numpy.
	    audio_np = audio_tensor.detach().cpu().numpy()

	    # Validate the rank up front so that scalars and >2-D inputs fail with a
	    # clear error instead of an obscure one from the WAV writer.
	    if audio_np.ndim not in (1, 2):
	        raise ValueError(f"Expected 1D or 2D tensor, got shape {audio_np.shape}")
	    if audio_np.ndim == 1:
	        # Mono audio (samples,) -> (1, samples)
	        audio_np = audio_np.reshape(1, -1)

	    # The array is transposed to (samples, channels) for the writer. The
	    # context manager releases the buffer even if writing fails.
	    with io.BytesIO() as buffer:
	        sf.write(buffer, audio_np.T, sample_rate, format='WAV')
	        return buffer.getvalue()

	@app.post("/tts")
	async def text_to_speech(request: TextToSpeechRequest):
	    """Synthesize speech for ``request.text`` and return it as a WAV download.

	    The pipeline is iterated to completion and every audio chunk it yields
	    is joined in order, so the response covers the whole input rather than
	    only the final chunk.

	    Args:
	        request: Parsed JSON body. Only ``text`` is used; ``language`` and
	            ``slow`` are accepted but not passed to the pipeline.

	    Returns:
	        A ``Response`` with media type ``audio/wav`` and a
	        ``Content-Disposition`` header naming the file ``speech.wav``.

	    Raises:
	        HTTPException: 400 when the text is blank or no audio was produced;
	            500 for any other failure during synthesis or encoding.
	    """
	    if not request.text.strip():
	        raise HTTPException(status_code=400, detail="'text' must not be empty")

	    try:
	        # Keep every chunk: the generator can yield more than one item, and
	        # retaining only the last one would truncate the audio.
	        # NOTE(review): skipping None chunks is defensive — confirm whether
	        # KPipeline can actually yield them.
	        chunks = [
	            audio
	            for _, _, audio in pipeline(request.text, voice='af_heart')
	            if audio is not None
	        ]
	        if not chunks:
	            raise HTTPException(
	                status_code=400,
	                detail="No audio was generated for the given text",
	            )

	        # Join along the sample axis (the last one for both 1-D and
	        # (channels, samples) layouts); a single chunk is used as is.
	        audio_tensor = chunks[0] if len(chunks) == 1 else torch.cat(chunks, dim=-1)
	        audio_bytes = tensor_to_audio_bytes(audio_tensor)
	    except HTTPException:
	        # Deliberate client errors must not be rewritten as 500s below.
	        raise
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=str(e)) from e

	    # Return audio bytes directly with appropriate headers
	    return Response(
	        content=audio_bytes,
	        media_type="audio/wav",
	        headers={
	            "Content-Disposition": "attachment; filename=speech.wav"
	        },
	    )

	@app.get("/")
	async def root():
	    """Return a welcome message describing how to call ``POST /tts``."""
	    usage = "Welcome to the Text to Speech API. Use POST /tts to convert text to speech. the body should be a json with the following fields: {'text': 'text to convert to speech', 'language': 'language code (optional, default is en)', 'slow': 'boolean (optional, default is False)'}"
	    payload = {"message": usage}
	    return payload

	if __name__ == "__main__":
	    # Start the server only when this file is executed as a script; uvicorn
	    # is imported here so that importing the module does not require it.
	    import uvicorn

	    # Listen on all interfaces, port 8000.
	    uvicorn.run(app, host="0.0.0.0", port=8000)