# exam-notes-ocr / main.py
# (Hugging Face Space by Ayan8901 — commit 511b6c3, "Update main.py")
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.middleware.cors import CORSMiddleware
from PIL import Image, ImageOps
import io
import os
import base64
import httpx
app = FastAPI()
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_methods=["*"],
allow_headers=["*"],
)
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
GROQ_URL = "https://api.groq.com/openai/v1/chat/completions"
def fix_image_orientation(img: Image.Image) -> Image.Image:
try:
img = ImageOps.exif_transpose(img)
except Exception:
pass
w, h = img.size
if w > h:
img = img.rotate(90, expand=True)
return img
def resize_for_ocr(img: Image.Image, max_width: int = 1600) -> Image.Image:
w, h = img.size
if w > max_width:
ratio = max_width / w
img = img.resize((max_width, int(h * ratio)), Image.LANCZOS)
return img
def image_to_base64(img: Image.Image) -> str:
buf = io.BytesIO()
img.save(buf, format="JPEG", quality=90)
return base64.b64encode(buf.getvalue()).decode("utf-8")
@app.get("/")
def root():
return {"status": "OCR running"}
@app.post("/ocr")
async def ocr_images(
file: UploadFile = File(...),
mode: str = Form("print")
):
try:
if not GROQ_API_KEY:
return {"success": False, "error": "GROQ_API_KEY not set in Space secrets."}
contents = await file.read()
pil_image = Image.open(io.BytesIO(contents)).convert("RGB")
pil_image = fix_image_orientation(pil_image)
pil_image = resize_for_ocr(pil_image)
b64 = image_to_base64(pil_image)
prompt = (
"Extract ALL the text from this image exactly as it appears. "
"Preserve paragraph structure and line breaks. "
"Do not summarize, translate, or add anything. "
"Only output the raw extracted text, nothing else."
)
async with httpx.AsyncClient(timeout=60.0) as client:
response = await client.post(
GROQ_URL,
headers={
"Authorization": f"Bearer {GROQ_API_KEY}",
"Content-Type": "application/json",
},
json={
"model": "meta-llama/llama-4-scout-17b-16e-instruct",
"messages": [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{b64}"
}
},
{
"type": "text",
"text": prompt
}
]
}
],
"max_tokens": 4096,
"temperature": 0.0,
},
)
result = response.json()
if "error" in result:
error_msg = result["error"].get("message", "Groq vision error")
print(f"Groq vision error: {error_msg}")
return {"success": False, "error": error_msg}
if not result.get("choices"):
print(f"Groq empty choices: {result}")
return {"success": False, "error": "No response from vision model"}
text = result["choices"][0]["message"]["content"].strip()
if not text or len(text) < 10:
return {
"success": False,
"error": "No text found in image. Try a clearer photo."
}
return {"success": True, "text": text}
except Exception as e:
return {"success": False, "error": str(e)}