# HuggingFace Spaces app — Image QA with OCR, captioning, visual QA, and TTS.
from fastapi import FastAPI
from fastapi.responses import RedirectResponse
import gradio as gr
from PIL import Image
import numpy as np
from transformers import pipeline
from gtts import gTTS
import tempfile
import os
import pytesseract  # OCR engine (replaces easyocr)

app = FastAPI()

# Models are loaded once at import time so every request reuses the same pipelines.
caption_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
vqa_model = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
def process_image_question(image: Image.Image, question: str):
    """Answer a free-form question about an uploaded image.

    Runs OCR (pytesseract), image captioning, and visual question answering
    on the image, then synthesizes the VQA answer to speech with gTTS.

    Args:
        image: Uploaded PIL image, or None when nothing was uploaded.
        question: Question about the image.

    Returns:
        Tuple of (text, audio_path): a formatted summary string and the path
        to an MP3 file containing the spoken answer, or (message, None) when
        no image was given or an error occurred.
    """
    if image is None:
        return "No image uploaded.", None
    try:
        # OCR text using pytesseract.
        extracted_text = pytesseract.image_to_string(image)

        # Caption describing the whole image.
        caption = caption_model(image)[0]['generated_text']

        # Visual QA; take the top-ranked answer.
        vqa_result = vqa_model(image=image, question=question)
        answer = vqa_result[0]['answer']

        # Speak the answer. delete=False keeps the temp file alive after the
        # context manager exits so Gradio can serve it to the browser.
        tts = gTTS(text=answer)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            tts.save(tmp.name)
            audio_path = tmp.name

        final_output = f"🖼️ Caption: {caption}\n\n📜 OCR Text:\n{extracted_text}\n\n❓ Answer: {answer}"
        return final_output, audio_path
    except Exception as e:
        # UI boundary: surface the error in the result box instead of
        # crashing the worker.
        return f"❌ Error: {e}", None
# Gradio front end: image + question in, formatted text + spoken answer out.
gui = gr.Interface(
    fn=process_image_question,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question"),
    ],
    outputs=[
        gr.Textbox(label="Result", lines=10),
        gr.Audio(label="Answer (Audio)", type="filepath"),
    ],
    title="🧠 Image QA with Voice",
    description="Upload an image and ask any question — even if there's no readable text. The app will use OCR, captioning, visual QA, and read answers aloud.",
)
# Serve the Gradio UI at the application root.
app = gr.mount_gradio_app(app, gui, path="/")


# NOTE(review): this handler has no @app.get(...) decorator, so FastAPI never
# routes to it; and with Gradio mounted at "/", a redirect to "/" would be
# circular anyway. Confirm the intended layout (e.g. mount Gradio at
# "/gradio" and register this handler at "/") before wiring it up.
def home():
    return RedirectResponse(url="/")