WWMachine committed on
Commit
5a6a6ff
·
verified ·
1 Parent(s): 29a7fe7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +121 -70
app.py CHANGED
@@ -2,107 +2,158 @@ import gradio as gr
2
  from llama_cpp import Llama
3
  from huggingface_hub import hf_hub_download
4
  import os
 
5
 
6
  # --- Configuration ---
7
- MODEL_REPO = "Kezovic/iris-q4gguf-v2"
8
- MODEL_FILE = "llama-3.2-1b-instruct.Q4_K_M.gguf"
 
 
 
 
 
9
  CONTEXT_WINDOW = 4096
10
  MAX_NEW_TOKENS = 512
11
  TEMPERATURE = 0.7
12
 
 
 
 
 
 
 
13
  # --- Model Loading Function ---
14
- # Initialize llm as None to avoid the Llama.__del__ 'NoneType' error
15
- llm = None
16
  def load_llm():
17
  """Downloads the GGUF model and initializes LlamaCPP."""
18
- global llm # Use the global variable
19
- print("Downloading model...")
20
  try:
21
  model_path = hf_hub_download(
22
- repo_id=MODEL_REPO,
23
- filename=MODEL_FILE
24
  )
25
-
26
  llm = Llama(
27
  model_path=model_path,
28
  n_ctx=CONTEXT_WINDOW,
29
  n_threads=2,
30
- verbose=False
31
  )
32
- print("Model loaded successfully!")
33
  return llm
34
  except Exception as e:
35
  print(f"Error loading model: {e}")
36
  return None
37
 
38
- # Load the model only once
39
- llm = load_llm()
40
-
41
- # --- Inference Function ---
42
- def generate_and_speak(text_prompt):
43
- """
44
- Generates a text response using the Llama model.
45
- The output text is automatically synthesized into speech by Gradio's Audio component.
46
- """
47
- if llm is None:
48
- return "Error: LLM failed to load. Please check model configuration.", None
49
 
50
- if not text_prompt or text_prompt.strip() == "":
51
- return "Please enter a query.", None
52
-
53
- # Use a basic prompt template
54
- full_prompt = f"### Human: {text_prompt}\n### Assistant:"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
 
 
 
56
  try:
57
- output = llm(
58
- prompt=full_prompt,
59
- max_tokens=MAX_NEW_TOKENS,
60
- temperature=TEMPERATURE,
61
- stop=["### Human:"],
62
- echo=False
63
  )
64
-
65
- response_text = output['choices'][0]['text'].strip()
66
- # Return the text. It will update the Textbox AND the Audio component.
67
- return response_text, response_text
68
  except Exception as e:
69
- return f"LLM Generation Error: {e}", None
 
70
 
71
- # --- Gradio Interface (TTS Flow) ---
72
- with gr.Blocks(title=f"Audio Chat with {MODEL_FILE}") as demo:
73
- gr.Markdown(f"## 🗣️ LLM Chat with Text-to-Speech (TTS)")
74
- gr.Markdown("Type your query (Text Input) and the LLM will reply in both text and auto-generated audio (TTS).")
 
 
 
 
 
75
 
76
- with gr.Row():
77
- text_input = gr.Textbox(
78
- label="Your Query (Text Input)",
79
- lines=2,
80
- scale=3
81
- )
82
- audio_btn = gr.Button("Generate and Speak")
83
-
84
- # Outputs
85
- text_output = gr.Textbox(label="LLM Response Text")
86
- audio_output = gr.Audio(
87
- label="Assistant Audio Playback (TTS)",
88
- autoplay=True,
89
- # Gradio automatically synthesizes the text output received by this Audio component
90
- # into speech. We set it as an 'update' target.
91
- interactive=False
92
- )
93
 
94
- # Set up the event listener: Button click triggers the function.
95
- audio_btn.click(
96
- fn=generate_and_speak,
97
- inputs=[text_input],
98
- outputs=[text_output, audio_output]
 
 
 
 
 
99
  )
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
- # Enable enter key to submit
102
- text_input.submit(
103
- fn=generate_and_speak,
104
- inputs=[text_input],
105
- outputs=[text_output, audio_output]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  )
107
 
108
- demo.launch()
 
 
2
  from llama_cpp import Llama
3
  from huggingface_hub import hf_hub_download
4
  import os
5
+ from deepgram import DeepgramClient, PrerecordedOptions, SpeakOptions
6
 
7
  # --- Configuration ---
8
+ # 1. API KEY: Ensure you have your Deepgram API Key ready
9
+ # Ideally, set this in your environment variables as DEEPGRAM_API_KEY
10
+ DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY", "YOUR_DEEPGRAM_KEY_HERE")
11
+
12
+ # 2. Model Config
13
+ REPO_ID = "Kezovic/iris-q4gguf-v2"
14
+ FILENAME = "llama-3.2-1b-instruct.Q4_K_M.gguf"
15
  CONTEXT_WINDOW = 4096
16
  MAX_NEW_TOKENS = 512
17
  TEMPERATURE = 0.7
18
 
19
+ # --- Initialize Deepgram ---
20
+ if DEEPGRAM_API_KEY == "YOUR_DEEPGRAM_KEY_HERE":
21
+ print("WARNING: Please set your DEEPGRAM_API_KEY.")
22
+
23
+ deepgram = DeepgramClient(DEEPGRAM_API_KEY)
24
+
25
# --- Model Loading Function ---
# Module-level handle; stays None until load_llm() succeeds.
llm = None

def load_llm():
    """Download the GGUF model from the Hub and initialize LlamaCPP.

    Returns the loaded Llama instance, or None if download/init failed.
    """
    global llm
    print("Downloading LLM...")
    try:
        gguf_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
        # n_threads=2 is good for free Hugging Face CPU tiers
        llm = Llama(
            model_path=gguf_path,
            n_ctx=CONTEXT_WINDOW,
            n_threads=2,
            verbose=False
        )
        print("LLM loaded successfully!")
        return llm
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

# Load model on startup
load_llm()
 
 
 
 
 
 
 
 
 
51
 
52
# --- 1. Speech-to-Text (Deepgram) ---
def transcribe_audio(audio_filepath):
    """Send a recorded audio file to Deepgram and return its transcript.

    Returns an empty string when no file is given or transcription fails.
    """
    if not audio_filepath:
        return ""

    try:
        with open(audio_filepath, "rb") as audio_file:
            request_payload = {"buffer": audio_file}
            stt_options = PrerecordedOptions(
                smart_format=True,
                model="nova-2",
                language="en-US"
            )
            result = deepgram.listen.rest.v("1").transcribe_file(
                request_payload, stt_options
            )
            return result.results.channels[0].alternatives[0].transcript
    except Exception as e:
        print(f"STT Error: {e}")
        return ""
71
 
72
# --- 2. Text-to-Speech (Deepgram) ---
def text_to_speech(text):
    """Synthesize `text` with Deepgram TTS and return the saved audio path.

    Returns None when synthesis fails (the Gradio Audio output accepts None).
    """
    try:
        # Bug fix: the options below request linear16 audio in a WAV
        # container, so the file extension must be .wav — it was previously
        # saved as "output_response.mp3", mislabeling the container for
        # downstream players.
        filename = "output_response.wav"
        options = SpeakOptions(
            model="aura-asteria-en",  # Choices: aura-asteria-en, aura-helios-en, etc.
            encoding="linear16",
            container="wav"
        )
        # Save the synthesized audio to a file for Gradio to serve.
        deepgram.speak.rest.v("1").save(filename, {"text": text}, options)
        return filename
    except Exception as e:
        print(f"TTS Error: {e}")
        return None
88
 
89
# --- 3. Main Pipeline Function ---
def process_conversation(audio_input):
    """Run the full voice pipeline: STT -> LLM -> TTS.

    Args:
        audio_input: Filepath of the recorded audio (from gr.Audio).

    Returns:
        A 3-tuple (user_transcript, output_audio_path, response_text) for
        the three Gradio output components; the audio path is None on error.
    """
    if llm is None:
        return "Model not loaded.", None, "System Error: Model failed to load."

    # Step A: Transcribe
    user_text = transcribe_audio(audio_input)
    if not user_text:
        return "Could not hear audio.", None, ""

    print(f"User said: {user_text}")

    # Step B: LLM Inference
    # Using the prompt format from your original code
    full_prompt = f"### Human: {user_text}\n### Assistant:"

    # Bug fix: generation was unguarded, so any llama_cpp exception crashed
    # the Gradio handler; the pre-refactor code caught it and reported the
    # error in the UI — restore that behavior.
    try:
        output = llm(
            prompt=full_prompt,
            max_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            stop=["### Human:"],
            echo=False
        )
        response_text = output['choices'][0]['text'].strip()
    except Exception as e:
        return user_text, None, f"LLM Generation Error: {e}"

    print(f"LLM said: {response_text}")

    # Step C: Speak Response
    output_audio_path = text_to_speech(response_text)

    # Return: Transcription (for display), Audio (for playback), LLM Text (for display)
    return user_text, output_audio_path, response_text
125
+
126
# --- Gradio UI ---
with gr.Blocks(title=f"Voice Chat with {FILENAME}") as demo:
    gr.Markdown(f"## 🗣️ Deepgram Voice Chat with {FILENAME}")

    with gr.Row():
        # Input column: microphone capture plus a submit button.
        with gr.Column():
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="Speak Now"
            )
            submit_btn = gr.Button("Submit Audio", variant="primary")

        # Output column: synthesized voice plus transcripts for debugging.
        with gr.Column():
            audio_output = gr.Audio(
                label="Assistant Voice",
                autoplay=True,  # play the response as soon as it arrives
                interactive=False
            )
            user_transcript = gr.Textbox(label="You said:")
            ai_response_text = gr.Textbox(label="AI Response:")

    # Wire the button to the STT -> LLM -> TTS pipeline.
    submit_btn.click(
        fn=process_conversation,
        inputs=[audio_input],
        outputs=[user_transcript, audio_output, ai_response_text]
    )

if __name__ == "__main__":
    demo.launch()