import gradio as gr
from asr import transcribe_auto
from huggingface_hub import InferenceClient
from ttsmms import download, TTS
from langdetect import detect

# Initialize text generation client
client = InferenceClient("Futuresony/future_ai_12_10_2024.gguf")

# Download and load TTS models for Swahili and English
swahili_dir = download("swh", "./data/swahili")
english_dir = download("eng", "./data/english")  # Ensure an English TTS model is available

swahili_tts = TTS(swahili_dir)
english_tts = TTS(english_dir)


def is_uncertain(question, response):
    """Check if the model's response is unreliable."""
    if len(response.split()) < 4 or response.lower() in question.lower():
        return True
    uncertain_phrases = [
        "Kulingana na utafiti", "Inaaminika kuwa", "Ninadhani",
        "It is believed that", "Some people say",
    ]
    return any(phrase.lower() in response.lower() for phrase in uncertain_phrases)


def generate_text(prompt):
    """Generate a response from the text generation model."""
    messages = [{"role": "user", "content": prompt}]
    response = ""
    for message in client.chat_completion(messages, max_tokens=512, stream=True, temperature=0.7, top_p=0.95):
        token = message.choices[0].delta.content
        if token:  # streamed chunks may carry an empty delta
            response += token
    if is_uncertain(prompt, response):
        return "AI is uncertain about the response."
    return response


# Function to detect language and generate speech
def text_to_speech(text):
    lang = detect(text)  # Detect language
    wav_path = "./output.wav"
    if lang == "sw":  # Swahili
        swahili_tts.synthesis(text, wav_path=wav_path)
    else:  # Default to English if not Swahili
        english_tts.synthesis(text, wav_path=wav_path)
    return wav_path


def process_audio(audio):
    # Step 1: Transcribe the audio
    transcription = transcribe_auto(audio)
    # Step 2: Generate text based on the transcription
    generated_text = generate_text(transcription)
    # Step 3: Convert the generated text to speech
    speech = text_to_speech(generated_text)
    return transcription, generated_text, speech


# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("<h2 style='text-align: center;'>End-to-End ASR, Text Generation, and TTS</h2>")
    gr.HTML("<p style='text-align: center;'>Upload or record audio. The model will transcribe, generate a response, and read it out.</p>")

    audio_input = gr.Audio(label="Input Audio", type="filepath")
    text_output = gr.Textbox(label="Transcription")
    generated_text_output = gr.Textbox(label="Generated Text")
    audio_output = gr.Audio(label="Output Speech")

    submit_btn = gr.Button("Submit")
    submit_btn.click(
        fn=process_audio,
        inputs=audio_input,
        outputs=[text_output, generated_text_output, audio_output],
    )

if __name__ == "__main__":
    demo.launch()