import os
import tempfile

import edge_tts
import gradio as gr
import numpy as np
from transformers import pipeline

# Load the speech-to-text and chatbot pipelines once at startup:
# whisper-small is multilingual (it handles Farsi), and gpt2-fa is a
# Persian GPT-2 checkpoint.
stt = pipeline("automatic-speech-recognition", model="openai/whisper-small")
chatbot = pipeline("text-generation", model="HooshvareLab/gpt2-fa")

async def tts(text: str, voice: str = "fa-IR-FaridNeural") -> str:
    """Synthesize `text` with Edge TTS and return the path to an MP3 file."""
    communicate = edge_tts.Communicate(text, voice)
    audio_data = b""
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_data += chunk["data"]
    # The stream is MP3-encoded, not raw PCM, so reading it back with
    # np.frombuffer(..., dtype=np.int16) would yield garbage; write the
    # bytes to a file instead and let Gradio decode the MP3.
    tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    tmp.write(audio_data)
    tmp.close()
    return tmp.name
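
# Standalone check of the TTS helper (illustrative only, not used by the app):
#   import asyncio
#   mp3_path = asyncio.run(tts("سلام دنیا"))  # "Hello, world" in Farsi
#   print(mp3_path)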

async def audio_to_audio(audio_input):
    if audio_input is None:
        raise gr.Error("No audio input received.")
    sample_rate_in, data_in = audio_input
    # Whisper expects a mono float32 waveform; the microphone delivers int16.
    audio = data_in.astype(np.float32) / 32768.0
    if audio.ndim > 1:
        audio = audio.mean(axis=1)  # downmix stereo to mono
    # 1. ASR → text
    text = stt({"raw": audio, "sampling_rate": sample_rate_in})["text"]
    # 2. Generate a response with the Persian GPT-2 model
    response = chatbot(text, max_length=50, num_return_sequences=1)[0]["generated_text"]
    # 3. TTS → path to the synthesized MP3
    return await tts(response)
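
# Optional tweak (assumption; the exact kwargs depend on the installed
# transformers version): pin Whisper to Persian instead of relying on
# language auto-detection:
#   text = stt({"raw": audio, "sampling_rate": sample_rate_in},
#              generate_kwargs={"language": "persian"})["text"]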

# Gradio interface
demo = gr.Interface(
    fn=audio_to_audio,
    inputs=gr.Audio(
        sources=["microphone"],  # 'sources' replaces the deprecated 'source' argument
        type="numpy",
        label="Speak in Farsi",
    ),
    outputs=gr.Audio(type="filepath", label="Response in Farsi"),
    title="Farsi Audio Chatbot",
    description="Speak in Farsi, and the app will respond in Farsi audio.",
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", 7860)),
    )
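
# This Space also needs a requirements.txt; a plausible one (assumed, not
# taken from the original repo) would be:
#   gradio
#   transformers
#   torch
#   edge-tts
#   numpy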