import json import io import base64 import os, re from langchain_google_vertexai.vision_models import VertexAIVisualQnAChat from PIL import Image from langchain_core.messages import HumanMessage, SystemMessage from langchain_groq import ChatGroq from dotenv import load_dotenv from groq import Groq from flask import Flask, jsonify from langgraph.prebuilt import create_react_agent load_dotenv() # os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY") groq_api_key = os.getenv("GROQ_API_KEY") app = Flask(__name__) '''#initialize groq client client = Groq(api_key=groq_api_key) print(f"client:{client}") ''' static_image_path = os.path.join("images", "page2_print.jfif") llm = ChatGroq( model="meta-llama/llama-4-maverick-17b-128e-instruct", temperature=0, max_tokens=None, ) @app.route("/", methods=["GET"]) def analyze_static_image(): if not os.path.exists(static_image_path): return jsonify({"error": f"Image not found"}) # Load image and convert to base64 string image_path = r"images\page2_print.jfif" with open(image_path, "rb") as image_file: image_bytes = image_file.read() img_base64 = base64.b64encode(image_bytes).decode("utf-8") # # Construct image content block # image_content_block = { # "type": "image_url", # "image_url": { # # "url": f"data:image/jpeg;base64,{image_data_url}" # "url": f"data:image/jpeg;base64,{img_base64}" # } # } # SET A SYSTEM PROMPT system_prompt = """ You are an expert in visual scene understanding. Your Job is to analyze an image and respond with structured json like This : - Any number of "Sprites": These refer to distinct characters, animals, or objects in the image that are **in front of the background** (e.g., cat, ball, crab, person, etc.). { "Sprite 1": { "name": "Cat", "description":"An orange cartoon cat with a cheerful expression, shown jumping playfully." }, "Backdrop":{ "name":"Beach Scene", "description":"A serene beach with sand, blue water, and a clear sky." } } Guidelines: - Focus only the images given in Square Shape. - Don't Consider Blank areas in Image as "Backdrop". - Do NOT classify the background scene as a sprite. - All characters or objects placed in the foreground should be "Sprites". - Use 'Sprite 1', 'Sprite 2', etc. for character or figures. - Use 'Backdrop' for environmental setting or Background behind Sprite. - Don't include generic summary or explanation outside the fields. Return only valid JSON. """ # Compose message using LangChain's HumanMessage content = [ { "type": "text", "text": "Analyze the image and describe the backdrops and characters as per instruction." }, { "type": "image_url", "image_url": { "url": f"data:image/jpeg;base64,{img_base64}" } } ] agent = create_react_agent( model = llm, tools = [], prompt = system_prompt ) # agent_executor = AgentExecutor(agent=agent, tools=[]) # Pass the human prompt + system message # messages = [system_prompt, *human_prompt] # messages = [system_prompt, *human_prompt] # call the LLM try: # response = llm.invoke(messages) # response = agent.invoke({"input":human_prompt}) response = agent.invoke({"messages": [{"role": "user", "content":content}]}) print(response) raw_response = response["messages"][-1].content cleaned_json_str = re.sub(r"^```json\s*|\s*```$", "", raw_response.strip(), flags=re.DOTALL) try: detected_info = json.loads(cleaned_json_str) except json.JSONDecodeError as e: # If parsing fails, fallback to raw string or handle error print("JSON parsing error:", e) detected_info = cleaned_json_str # or handle as needed # Extract the answer text from the response # detected_info = response.content # detected_info = raw_response except Exception as e: return jsonify({"error": str(e)}), 500 # Save the detected information to a JSON file result = { "image_path": image_path, "detected_info": detected_info, } # Save JSON result with open("detected_image_info.json", "w") as f: json.dump(result, f, indent=4) print("Detection results saved to detected_image_info.json") return jsonify(result) if __name__ == "__main__": app.run(debug=True) '''#build the chat messages messages = [ { "role":"system", "content":"you are an expert image analyzer. Describe backdrops and sprite/character in the image." }, { "role":"user", "content":[ { "type":"text", "text":"Describe image in detail. What backdrops and characters are present ?" }, image_content_block ] } ]''' '''# create completion with Groq response = client.chat.completions.create( model = "meta-llama/llama-4-maverick-17b-128e-instruct", messages=messages, temperature=0, max_tokens=1024, top_p=1, stream=False ) print(f"\n\n========RESPONSE CHOICES : {response}\n\n") # extract the result detected_info = response.choices[0].message.content print(f"DETECTED_INFO : {detected_info}") # save output to json output_data = { "image_path":image_path, "detected_info":detected_info } print(f"output_data : {output_data}") with open("detected_image_info.json", "w") as f: json.dump(output_data, f, indent=4) print("✅ Detection results saved to detected_image_info.json")''' # # Define the question to detect objects and characters in the image # question = "What objects and characters are present in this image?" # messages = [HumanMessage(content=[image_content_block, question])] # print(messages) # Invoke the model with the image and question # response = llm.invoke({"image": image_content_block, "question": question})