Futuresony committed on
Commit
4cc3c9c
·
verified ·
1 Parent(s): a2dcce7

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -0
app.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from asr import transcribe_auto
3
+ from huggingface_hub import InferenceClient
4
+ from ttsmms import download, TTS
5
+ from langdetect import detect
6
+
7
# Client for the remote text-generation model used by generate_text().
client = InferenceClient("Futuresony/future_ai_12_10_2024.gguf")

# Fetch the Swahili ("swh") and English ("eng") TTS models into ./data,
# then build one synthesizer per language from the downloaded directories.
swahili_dir = download("swh", "./data/swahili")
english_dir = download("eng", "./data/english")
swahili_tts, english_tts = TTS(swahili_dir), TTS(english_dir)
16
+
17
def is_uncertain(question, response):
    """Return True when the model's reply looks unreliable.

    A reply is flagged when it has fewer than four whitespace-separated
    words, when it appears (case-insensitively) inside the question, or
    when it contains one of a fixed set of Swahili/English hedging phrases.
    """
    # Very short replies are treated as unreliable outright.
    if len(response.split()) < 4:
        return True

    reply = response.lower()

    # A reply that is just a fragment of the question adds no information.
    if reply in question.lower():
        return True

    hedges = (
        "Kulingana na utafiti",
        "Inaaminika kuwa",
        "Ninadhani",
        "It is believed that",
        "Some people say",
    )
    for hedge in hedges:
        if hedge.lower() in reply:
            return True
    return False
23
+
24
def generate_text(prompt):
    """Generate a reply to *prompt* with the remote chat model.

    The prompt is sent as a single user message; the reply is streamed back
    chunk by chunk and concatenated.

    Args:
        prompt: User text to send to the model.

    Returns:
        The full generated reply, or the fixed string
        "AI is uncertain about the response." when ``is_uncertain`` flags it.
    """
    messages = [{"role": "user", "content": prompt}]

    pieces = []
    for chunk in client.chat_completion(
        messages, max_tokens=512, stream=True, temperature=0.7, top_p=0.95
    ):
        # A streamed chunk may carry no choices, or a delta whose content is
        # None (typically the closing chunk); concatenating None would raise
        # TypeError, so such chunks are skipped.
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        if token:
            pieces.append(token)
    response = "".join(pieces)

    if is_uncertain(prompt, response):
        return "AI is uncertain about the response."

    return response
37
+
38
def text_to_speech(text):
    """Synthesize *text* to a WAV file and return the file's path.

    The language is detected with ``langdetect``: text detected as Swahili
    ("sw") is spoken with the Swahili voice, everything else with the
    English voice.

    Args:
        text: Text to speak.

    Returns:
        Path of the written WAV file ("./output.wav", reused on every call).
    """
    # Function-scope import keeps the file's top-level import list untouched.
    from langdetect.lang_detect_exception import LangDetectException

    wav_path = "./output.wav"

    try:
        lang = detect(text)
    except LangDetectException:
        # langdetect raises when the text has no usable features (empty, or
        # only digits/punctuation); fall back to the default voice instead
        # of failing the whole request.
        lang = None

    # Default to English for anything not detected as Swahili.
    tts = swahili_tts if lang == "sw" else english_tts
    tts.synthesis(text, wav_path=wav_path)

    return wav_path
49
+
50
def process_audio(audio):
    """Run the full pipeline: transcribe, generate a reply, speak the reply.

    Args:
        audio: Audio input as delivered by the Gradio audio component, or
            ``None`` when nothing was provided.

    Returns:
        A ``(transcription, generated_text, speech)`` tuple, where ``speech``
        is the value returned by ``text_to_speech``. When ``audio`` is
        ``None`` the tuple is ``("", "", None)``.
    """
    # Submit can be clicked before any audio is supplied; return empty
    # outputs rather than handing None to the transcriber.
    if audio is None:
        return "", "", None

    # Step 1: transcribe the audio.
    transcription = transcribe_auto(audio)

    # Step 2: generate a reply to the transcription.
    generated_text = generate_text(transcription)

    # Step 3: convert the reply to speech.
    speech = text_to_speech(generated_text)

    return transcription, generated_text, speech
61
+
62
# Gradio interface: one audio input, three outputs (transcription, generated
# reply, synthesized speech), wired to process_audio by the Submit button.
with gr.Blocks() as demo:
    gr.Markdown("<p align='center' style='font-size: 20px;'>End-to-End ASR, Text Generation, and TTS</p>")
    gr.HTML("<center>Upload or record audio. The model will transcribe, generate a response, and read it out.</center>")

    # type="filepath" is a documented Audio type and hands process_audio a
    # path string; the previous "file" value is not among the documented ones.
    # The `source=` keyword is omitted because it exists only in Gradio 3.x
    # (where "upload" is already the default); later releases use `sources=`.
    # NOTE(review): confirm transcribe_auto accepts a file path.
    audio_input = gr.Audio(label="Input Audio", type="filepath")
    text_output = gr.Textbox(label="Transcription")
    generated_text_output = gr.Textbox(label="Generated Text")
    audio_output = gr.Audio(label="Output Speech")

    submit_btn = gr.Button("Submit")

    submit_btn.click(
        fn=process_audio,
        inputs=audio_input,
        outputs=[text_output, generated_text_output, audio_output],
    )

if __name__ == "__main__":
    demo.launch()