import base64
import json
import os
import re

from dotenv import load_dotenv
from flask import Flask, jsonify
from langchain_groq import ChatGroq
from langgraph.prebuilt import create_react_agent
from transformers import BlipForConditionalGeneration, BlipProcessor

load_dotenv()  # loads GROQ_API_KEY (and any other settings) from .env

app = Flask(__name__)

static_image_path = os.path.join("images", "page2_print.jfif")

# Multimodal Groq-hosted model used by the ReAct agent below.
llm = ChatGroq(
    model="meta-llama/llama-4-maverick-17b-128e-instruct",
    temperature=0,
    max_tokens=None,
)

# Local BLIP captioning model on CPU. The helper below is not called by the
# route, but is kept as an offline captioning fallback.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to("cpu")


def analyze_with_blip(image_pil):
    """Generate a short caption for a PIL image with BLIP."""
    inputs = processor(image_pil, return_tensors="pt").to("cpu")
    out = model.generate(**inputs, max_new_tokens=100)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption


@app.route("/", methods=["GET"])
def analyze_static_image():
    if not os.path.exists(static_image_path):
        return jsonify({"error": "Image not found"}), 404

    # Load the image and encode it as base64 for the inline data URL.
    with open(static_image_path, "rb") as image_file:
        image_bytes = image_file.read()
    img_base64 = base64.b64encode(image_bytes).decode("utf-8")

    # System prompt: constrains the model to return sprites/backdrop as pure JSON.
    system_prompt = """
You are an expert in visual scene understanding. Your job is to analyze an image and respond with structured JSON like this:

- Any number of "Sprites": distinct characters, animals, or objects in the image that are **in front of the background** (e.g., cat, ball, crab, person, etc.).

{
    "Sprite 1": {
        "name": "Cat",
        "description": "An orange cartoon cat with a cheerful expression, shown jumping playfully."
    },
    "Backdrop": {
        "name": "Beach Scene",
        "description": "A serene beach with sand, blue water, and a clear sky."
    }
}

Guidelines:
- Focus only on the images given in square shape.
- Do not treat blank areas of the image as the "Backdrop".
- Do NOT classify the background scene as a sprite.
- All characters or objects placed in the foreground should be "Sprites".
- Use "Sprite 1", "Sprite 2", etc. for characters or figures.
- Use "Backdrop" for the environmental setting or background behind the sprites.
- Do not include any generic summary or explanation outside the fields.
Return only valid JSON.
"""

    # Compose the multimodal message content for LangChain.
    content = [
        {
            "type": "text",
            "text": "Analyze the image and describe the backdrops and characters as per instruction.",
        },
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
        },
    ]
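    # These content blocks use the OpenAI-style multimodal format that LangChain
    # chat models accept. Illustratively (base64 value abbreviated), the user
    # message sent to the agent looks like:
    #
    #   {"role": "user",
    #    "content": [
    #        {"type": "text", "text": "Analyze the image ..."},
    #        {"type": "image_url",
    #         "image_url": {"url": "data:image/jpeg;base64,/9j/..."}}]}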
    # Tool-free ReAct agent: the system prompt alone drives the behavior.
    agent = create_react_agent(model=llm, tools=[], prompt=system_prompt)

    # Call the LLM through the agent.
    try:
        response = agent.invoke({"messages": [{"role": "user", "content": content}]})
        print(response)
        raw_response = response["messages"][-1].content

        # Strip an optional ```json ... ``` fence before parsing.
        cleaned_json_str = re.sub(
            r"^```json\s*|\s*```$", "", raw_response.strip(), flags=re.DOTALL
        )
        try:
            detected_info = json.loads(cleaned_json_str)
        except json.JSONDecodeError as e:
            # If parsing fails, fall back to the raw string.
            print("JSON parsing error:", e)
            detected_info = cleaned_json_str
    except Exception as e:
        return jsonify({"error": str(e)}), 500

    result = {
        "image_path": static_image_path,
        "detected_info": detected_info,
    }

    # Save the detection result to a JSON file next to the app.
    with open("detected_image_info.json", "w") as f:
        json.dump(result, f, indent=4)

    print("Detection results saved to detected_image_info.json")
    return jsonify(result)


if __name__ == "__main__":
    app.run(debug=True)
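
# Example usage (a minimal sketch, assuming the server is running locally on
# Flask's default port 5000 and that the `requests` package is installed):
#
#   import requests
#   resp = requests.get("http://127.0.0.1:5000/")
#   print(resp.json()["detected_info"])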