Commit 5df9c39 · Parent(s): eb1d08d
vlm added
.env
ADDED
@@ -0,0 +1 @@
GROQ_API_KEY=""
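The Groq client in stt.py is constructed with no arguments, so it looks for GROQ_API_KEY in the process environment. A minimal sketch of loading this .env at startup, assuming the python-dotenv package (not part of this commit):

# Hypothetical startup wiring; python-dotenv is an assumed dependency.
from dotenv import load_dotenv

load_dotenv()  # copies GROQ_API_KEY from .env into os.environ

from stt import transcribe_audio  # Groq() can now authenticate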
model.py
ADDED
@@ -0,0 +1,24 @@
from huggingface_hub import InferenceClient


def process_image_question(image, question="What do you see in this image?"):
    """
    Process an image with a visual question using the BLIP-2 model.

    Args:
        image: The image to analyze (raw bytes, a local file path, or a URL)
        question: The question to ask about the image

    Returns:
        str: The model's answer
    """
    client = InferenceClient("Salesforce/blip2-flan-t5-xl")

    # Process the visual question
    response = client.visual_question_answering(
        image=image,
        question=question,
    )

    # The client returns a list of scored answers; keep the top one
    return response[0].answer
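A minimal usage sketch for process_image_question; the InferenceClient accepts the image as raw bytes, a local file path, or a URL, and "photo.jpg" here is a hypothetical input file:

# Hypothetical usage, assuming a local file "photo.jpg" exists.
from model import process_image_question

with open("photo.jpg", "rb") as f:
    answer = process_image_question(f.read(), question="How many people are in this image?")
print(answer)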
stt.py
ADDED
@@ -0,0 +1,28 @@
import tempfile
import os

from groq import Groq

# The Groq client reads GROQ_API_KEY from the environment
client = Groq()


# Whisper ASR via the Groq API
def transcribe_audio(audio_bytes):
    # audio_bytes is an uploaded file-like object exposing .name and .getvalue()
    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(audio_bytes.name)[1]) as tmp_file:
        tmp_file.write(audio_bytes.getvalue())
        tmp_file_path = tmp_file.name

    try:
        with open(tmp_file_path, "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=(tmp_file_path, file.read()),
                model="whisper-large-v3-turbo",
                prompt="Specify context or spelling",
                response_format="json",
                language="en",
                temperature=0.0,
            )

        return transcription.text

    finally:
        # Clean up the temporary audio file
        if os.path.exists(tmp_file_path):
            os.remove(tmp_file_path)
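transcribe_audio expects an upload-style object with .name and .getvalue() (the shape Streamlit file uploads have). A minimal stand-in for testing from disk, where sample.wav and the _Upload wrapper are hypothetical:

# Hypothetical local test for transcribe_audio (requires GROQ_API_KEY).
import io

from stt import transcribe_audio

class _Upload(io.BytesIO):
    """Wraps a file on disk in the .name/.getvalue() interface used above."""
    def __init__(self, path: str):
        with open(path, "rb") as f:
            super().__init__(f.read())
        self.name = path

print(transcribe_audio(_Upload("sample.wav")))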
tts.py
ADDED
@@ -0,0 +1,8 @@
from gtts import gTTS
from tempfile import NamedTemporaryFile


def text_to_speech(text: str) -> str:
    # Synthesize the text with Google TTS and save it as a temporary MP3
    tts = gTTS(text=text)
    temp_audio = NamedTemporaryFile(delete=False, suffix=".mp3")
    tts.save(temp_audio.name)
    return temp_audio.name
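A usage note for text_to_speech: gTTS calls Google's public TTS endpoint over the network, and because the temp file is created with delete=False, the caller owns cleanup. A hypothetical check:

# Hypothetical usage: synthesize a phrase, then clean up the temp file.
import os

from tts import text_to_speech

path = text_to_speech("Hello from the demo app!")
print(path)      # a temporary .mp3 path
os.remove(path)  # caller is responsible for deleting the file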