Spaces:

ikraamkb
/

Summarization

Sleeping

App Files Community

ikraamkb committed on Apr 11

Commit

bdeddcb

verified ·

1 Parent(s): 47942ca

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -27

app.py CHANGED Viewed

@@ -70,7 +70,7 @@ from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
 import os
 import shutil
 from PIL import Image
-from transformers import ViltProcessor, ViltForQuestionAnswering, AutoTokenizer, AutoModelForCausalLM
 from gtts import gTTS
 import torch
 import tempfile
@@ -82,25 +82,6 @@ app = FastAPI()
 vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
 vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
-# Load GPT model to rewrite answers (Phi-1.5)
-gpt_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
-gpt_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5")
-def rewrite_answer(question, short_answer):
-    prompt = f"Question: {question}\nShort Answer: {short_answer}\nFull Sentence:"
-    inputs = gpt_tokenizer(prompt, return_tensors="pt")
-    with torch.no_grad():
-        outputs = gpt_model.generate(
-            **inputs,
-            max_new_tokens=50,
-            do_sample=True,
-            top_p=0.9,
-            temperature=0.7,
-            pad_token_id=gpt_tokenizer.eos_token_id
-        )
-    generated = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
-    return generated.split("Full Sentence:")[-1].strip()
 def answer_question_from_image(image, question):
     if image is None or not question.strip():
         return "Please upload an image and ask a question.", None
@@ -111,18 +92,15 @@ def answer_question_from_image(image, question):
     predicted_id = outputs.logits.argmax(-1).item()
     short_answer = vqa_model.config.id2label[predicted_id]
-    # Rewrite short answer to full sentence
-    full_answer = rewrite_answer(question, short_answer)
     try:
-        tts = gTTS(text=full_answer)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
             tts.save(tmp.name)
             audio_path = tmp.name
     except Exception as e:
-        return f"Answer: {full_answer}\n\n⚠️ Audio generation error: {e}", None
-    return full_answer, audio_path
 def process_image_question(image: Image.Image, question: str):
     answer, audio_path = answer_question_from_image(image, question)
@@ -139,7 +117,7 @@ gui = gr.Interface(
         gr.Audio(label="Answer (Audio)", type="filepath")
     ],
     title="🧠 Image QA with Voice",
-    description="Upload an image and ask a question. You'll get a full-sentence spoken answer."
 )
 app = gr.mount_gradio_app(app, gui, path="/")

 import os
 import shutil
 from PIL import Image
+from transformers import ViltProcessor, ViltForQuestionAnswering
 from gtts import gTTS
 import torch
 import tempfile
 vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
 vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
 def answer_question_from_image(image, question):
     if image is None or not question.strip():
         return "Please upload an image and ask a question.", None
     predicted_id = outputs.logits.argmax(-1).item()
     short_answer = vqa_model.config.id2label[predicted_id]
     try:
+        tts = gTTS(text=short_answer)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
             tts.save(tmp.name)
             audio_path = tmp.name
     except Exception as e:
+        return f"Answer: {short_answer}\n\n⚠️ Audio generation error: {e}", None
+    return short_answer, audio_path
 def process_image_question(image: Image.Image, question: str):
     answer, audio_path = answer_question_from_image(image, question)
         gr.Audio(label="Answer (Audio)", type="filepath")
     ],
     title="🧠 Image QA with Voice",
+    description="Upload an image and ask a question. You'll get an answer spoken out loud."
 )
 app = gr.mount_gradio_app(app, gui, path="/")