"""FastAPI service with a Whisper speech-to-text endpoint and an XTTS/gTTS
text-to-speech endpoint that translates the text with googletrans first."""

import io
import json
import os
import tempfile

# Set before the Coqui TTS package is imported, as in the original script.
os.environ["COQUI_TOS_AGREED"] = "1"

from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, HTMLResponse, StreamingResponse
import torch
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import googletrans
from googletrans import Translator
from TTS.api import TTS

# Disabled workaround kept from the original file:
# import httpcore
# setattr(httpcore, 'SyncHTTPTransport', 'AsyncHTTPProxy')

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ============ speech_to_text ================================================

MODEL_NAME = "openai/whisper-base"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)


@app.post("/speech_to_text_whispher")
async def speech_to_text_whispher(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file with the Whisper pipeline.

    Args:
        file: The uploaded audio file.

    Returns:
        A dict ``{"transcribe": <text>}`` holding the ``"text"`` field of the
        pipeline result.

    Raises:
        HTTPException: 400 when the upload contains no data.
    """
    audio_bytes = await file.read()
    if not audio_bytes:
        raise HTTPException(status_code=400, detail="Uploaded audio file is empty.")

    # The raw bytes are handed to the pipeline directly instead of being
    # written to a fixed "inputvoice.mp3" first: every request used to share
    # (and overwrite) that one file, and it was never cleaned up.
    result = pipe(
        audio_bytes,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
    )
    return {"transcribe": result["text"]}


# ============ text_to_speech block ==========================================

translator = Translator()

lan = googletrans.LANGUAGES
keys = list(lan.keys())
vals = list(lan.values())

# Language name -> language code. ``setdefault`` keeps the first code listed
# for a name, which is what ``keys[vals.index(name)]`` returned.
_LANGUAGE_CODES = {}
for _code, _name in lan.items():
    _LANGUAGE_CODES.setdefault(_name, _code)

# Reference voice sample passed to XTTS as ``speaker_wav``.
SPEAKER_WAV = "hell fast 20x.mp3"


def _language_code(language):
    """Return the googletrans code for a language name; ValueError if unknown."""
    try:
        return _LANGUAGE_CODES[language]
    except KeyError:
        raise ValueError(f"Unsupported language: {language!r}") from None


def translate(text, language):
    """Translate ``text`` into ``language`` and return the translated string.

    Args:
        text: The text to translate.
        language: A language name as found in the values of
            ``googletrans.LANGUAGES`` (e.g. ``"vietnamese"``).

    Raises:
        ValueError: If ``language`` is not a known language name.
    """
    return translator.translate(text, dest=_language_code(language)).text


# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"

# List available 🐸TTS models
print(TTS().list_models())

# Init TTS
xtts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

starting_text = " This is [SDR’s name] with Neural Leap. I saw you schedule a call with us for [Insert day and time] to learn more about our AI engineering services. Does that ring a bell?"
# Synthesize one utterance at import time (writes "output.wav").
xtts.tts_to_file(starting_text, speaker_wav=SPEAKER_WAV, language="en", file_path="output.wav")


@app.get("/text-to-speech/")
async def text_to_speech(text: str, language: str, mode: str):
    """Translate ``text`` into ``language`` and return it as synthesized audio.

    Args:
        text: The source text.
        language: Target language name (a value of ``googletrans.LANGUAGES``).
            ``"vietnamese"`` is synthesized with gTTS, everything else with
            XTTS.
        mode: When ``"both"``, the spoken text is the translation followed by
            the original text; any other value speaks the translation only.

    Returns:
        A ``FileResponse`` serving the generated audio file.

    Raises:
        HTTPException: 400 when ``language`` is not a known language name.
    """
    print(text)

    # Resolve the language once up front so an unknown name is reported as a
    # client error instead of surfacing as an unhandled ValueError (HTTP 500).
    try:
        language_code = _language_code(language)
    except ValueError as err:
        raise HTTPException(status_code=400, detail=str(err)) from err

    translated = str(translate(text, language))
    spoken_text = f"{translated} {text}" if mode == "both" else translated

    # NOTE(review): every request writes to this same path, so overlapping
    # requests (e.g. multiple workers) can overwrite each other's audio.
    audio_file = "text_to_speech.wav"

    if language == "vietnamese":
        # Imported lazily so gTTS is only required when this branch is used.
        from gtts import gTTS

        # NOTE(review): gTTS is called without ``lang``, so its default voice
        # language is used for the Vietnamese text -- confirm this is intended.
        gTTS(spoken_text).save(audio_file)
        # gTTS writes MP3 data regardless of the ".wav" file name.
        media_type = "audio/mpeg"
    else:
        xtts.tts_to_file(
            spoken_text,
            speaker_wav=SPEAKER_WAV,
            language=language_code,
            file_path=audio_file,
        )
        # XTTS writes WAV data; previously this was mislabeled as audio/mpeg.
        media_type = "audio/wav"

    return FileResponse(audio_file, media_type=media_type)