Commit 5df9c39 · Parent(s): eb1d08d
vlm added
.env
ADDED
@@ -0,0 +1 @@
GROQ_API_KEY=""
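The Groq client in stt.py is constructed with no arguments, so it looks for GROQ_API_KEY in the process environment. A minimal sketch of loading this .env at startup, assuming the python-dotenv package (not part of this commit):

# Hypothetical startup wiring; python-dotenv is an assumed dependency.
from dotenv import load_dotenv

load_dotenv()  # copies GROQ_API_KEY from .env into os.environ

from stt import transcribe_audio  # Groq() can now authenticate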
model.py
ADDED
@@ -0,0 +1,24 @@
from huggingface_hub import InferenceClient


def process_image_question(image, question="What do you see in this image?"):
    """
    Process an image with a visual question using the BLIP-2 model.

    Args:
        image: The image to analyze (raw bytes, a local file path, or a URL)
        question: The question to ask about the image

    Returns:
        str: The model's answer
    """
    client = InferenceClient("Salesforce/blip2-flan-t5-xl")

    # Process the visual question
    response = client.visual_question_answering(
        image=image,
        question=question,
    )

    # The client returns a list of scored answers; keep the top one
    return response[0].answer
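A minimal usage sketch for process_image_question; the InferenceClient accepts the image as raw bytes, a local file path, or a URL, and "photo.jpg" here is a hypothetical input file:

# Hypothetical usage, assuming a local file "photo.jpg" exists.
from model import process_image_question

with open("photo.jpg", "rb") as f:
    answer = process_image_question(f.read(), question="How many people are in this image?")
print(answer)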
stt.py
ADDED
@@ -0,0 +1,28 @@
import tempfile
import os

from groq import Groq

# The Groq client reads GROQ_API_KEY from the environment
client = Groq()


# Whisper ASR via the Groq API
def transcribe_audio(audio_bytes):
    # audio_bytes is an uploaded file-like object exposing .name and .getvalue()
    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(audio_bytes.name)[1]) as tmp_file:
        tmp_file.write(audio_bytes.getvalue())
        tmp_file_path = tmp_file.name

    try:
        with open(tmp_file_path, "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=(tmp_file_path, file.read()),
                model="whisper-large-v3-turbo",
                prompt="Specify context or spelling",
                response_format="json",
                language="en",
                temperature=0.0,
            )

        return transcription.text

    finally:
        # Clean up the temporary audio file
        if os.path.exists(tmp_file_path):
            os.remove(tmp_file_path)
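transcribe_audio expects an upload-style object with .name and .getvalue() (the shape Streamlit file uploads have). A minimal stand-in for testing from disk, where sample.wav and the _Upload wrapper are hypothetical:

# Hypothetical local test for transcribe_audio (requires GROQ_API_KEY).
import io

from stt import transcribe_audio

class _Upload(io.BytesIO):
    """Wraps a file on disk in the .name/.getvalue() interface used above."""
    def __init__(self, path: str):
        with open(path, "rb") as f:
            super().__init__(f.read())
        self.name = path

print(transcribe_audio(_Upload("sample.wav")))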
tts.py
ADDED
@@ -0,0 +1,8 @@
from gtts import gTTS
from tempfile import NamedTemporaryFile


def text_to_speech(text: str) -> str:
    # Synthesize the text with Google TTS and save it as a temporary MP3
    tts = gTTS(text=text)
    temp_audio = NamedTemporaryFile(delete=False, suffix=".mp3")
    tts.save(temp_audio.name)
    return temp_audio.name
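A usage note for text_to_speech: gTTS calls Google's public TTS endpoint over the network, and because the temp file is created with delete=False, the caller owns cleanup. A hypothetical check:

# Hypothetical usage: synthesize a phrase, then clean up the temp file.
import os

from tts import text_to_speech

path = text_to_speech("Hello from the demo app!")
print(path)      # a temporary .mp3 path
os.remove(path)  # caller is responsible for deleting the file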