import gradio as gr from asr import transcribe_auto from huggingface_hub import InferenceClient from ttsmms import download, TTS from langdetect import detect # Initialize text generation client client = InferenceClient("Futuresony/future_ai_12_10_2024.gguf") # Download and load TTS models for Swahili and English swahili_dir = download("swh", "./data/swahili") english_dir = download("eng", "./data/english") # Ensure an English TTS model is available swahili_tts = TTS(swahili_dir) english_tts = TTS(english_dir) def is_uncertain(question, response): """Check if the model's response is unreliable.""" if len(response.split()) < 4 or response.lower() in question.lower(): return True uncertain_phrases = ["Kulingana na utafiti", "Inaaminika kuwa", "Ninadhani", "It is believed that", "Some people say"] return any(phrase.lower() in response.lower() for phrase in uncertain_phrases) def generate_text(prompt): """Generate a response from the text generation model.""" messages = [{"role": "user", "content": prompt}] response = "" for message in client.chat_completion(messages, max_tokens=512, stream=True, temperature=0.7, top_p=0.95): token = message.choices[0].delta.content response += token if is_uncertain(prompt, response): return "AI is uncertain about the response." return response # Function to detect language and generate speech def text_to_speech(text): lang = detect(text) # Detect language wav_path = "./output.wav" if lang == "sw": # Swahili swahili_tts.synthesis(text, wav_path=wav_path) else: # Default to English if not Swahili english_tts.synthesis(text, wav_path=wav_path) return wav_path def process_audio(audio): # Step 1: Transcribe the audio transcription = transcribe_auto(audio) # Step 2: Generate text based on the transcription generated_text = generate_text(transcription) # Step 3: Convert the generated text to speech speech = text_to_speech(generated_text) return transcription, generated_text, speech # Gradio Interface with gr.Blocks() as demo: gr.Markdown("
End-to-End ASR, Text Generation, and TTS
") gr.HTML("