HolyMorphsim committed on
Commit
5df9c39
·
1 Parent(s): eb1d08d
Files changed (4) hide show
  1. .env +1 -0
  2. model.py +24 -0
  3. stt.py +28 -0
  4. tts.py +8 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ GROQ_API_KEY=""
model.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import InferenceClient
2
+
3
def process_image_question(image, question="What do you see in this image?"):
    """
    Answer a question about an image with the hosted BLIP-2 model.

    Args:
        image: The image to analyze. An object exposing ``save`` (e.g. a
            PIL Image) is serialized to PNG bytes first; any other value
            is handed to the client unchanged.
        question: The question to ask about the image.

    Returns:
        str: The answer with the highest score, or an empty string when
        the service returns no answers.
    """
    import io  # local import: model.py does not import io at module level

    client = InferenceClient("Salesforce/blip2-flan-t5-xl")

    # NOTE(review): serializing here avoids depending on whether the
    # installed client accepts PIL objects directly -- confirm against the
    # pinned huggingface_hub version.
    if hasattr(image, "save"):
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        image = buffer.getvalue()

    # visual_question_answering() has no ``max_new_tokens`` parameter;
    # passing it raised TypeError before any request was sent.
    answers = client.visual_question_answering(
        image=image,
        question=question,
    )

    if not answers:
        return ""

    # The call yields a list of scored candidates; return the best one so
    # the result matches the documented ``str`` contract.
    best = max(answers, key=lambda candidate: candidate.score)
    return best.answer
stt.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ import os
3
+ from groq import Groq
4
+ client = Groq()
5
+
6
# Whisper ASR via Groq API
def transcribe_audio(audio_bytes):
    """
    Transcribe an uploaded audio clip to English text.

    Args:
        audio_bytes: File-like upload exposing a ``name`` attribute and a
            ``getvalue()`` method returning the raw audio bytes
            (presumably a Streamlit ``UploadedFile`` -- verify against
            the caller).

    Returns:
        The ``text`` attribute of the transcription response.
    """
    # The SDK accepts a (filename, content) tuple, so the bytes are sent
    # directly instead of being written to a temp file and read back.
    # This also removes the temp file that was leaked when the write
    # failed before the cleanup ``finally`` was entered.
    # The filename is kept so its extension still travels with the upload.
    filename = os.path.basename(audio_bytes.name)

    transcription = client.audio.transcriptions.create(
        file=(filename, audio_bytes.getvalue()),
        model="whisper-large-v3-turbo",
        # NOTE(review): this looks like the placeholder text from the API
        # example rather than real context -- confirm it is intended.
        prompt="Specify context or spelling",
        response_format="json",
        language="en",
        temperature=0.0,
    )

    return transcription.text
tts.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from gtts import gTTS
2
+ from tempfile import NamedTemporaryFile
3
+
4
def text_to_speech(text: str) -> str:
    """
    Synthesize ``text`` to an MP3 file and return the file's path.

    The file is created with ``delete=False``, so it outlives this call;
    the caller is responsible for removing it when done.

    Args:
        text: The text to speak.

    Returns:
        Path of the temporary ``.mp3`` file holding the synthesized audio.
    """
    import os  # local import: tts.py does not import os at module level

    tts = gTTS(text=text)

    # Only reserve a unique path here and close the handle right away:
    # the original kept it open forever (descriptor leak), and an open
    # handle prevents save() from reopening the path on Windows.
    with NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
        audio_path = temp_audio.name

    try:
        tts.save(audio_path)
    except Exception:
        # Do not leave an empty/partial file behind when synthesis fails.
        os.remove(audio_path)
        raise

    return audio_path