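"""Flask service that sends a static image to a Groq-hosted Llama 4 vision model
through a LangGraph ReAct agent and returns structured JSON describing the image's
"Sprites" (foreground characters/objects) and its "Backdrop"."""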
import json
import base64
import os
import re

from dotenv import load_dotenv
from flask import Flask, jsonify
from groq import Groq  # only needed by the commented-out direct Groq experiment below
from langchain_core.messages import HumanMessage  # only needed by the commented-out experiments below
from langchain_groq import ChatGroq
from langgraph.prebuilt import create_react_agent

load_dotenv()
# os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")  # ChatGroq also reads GROQ_API_KEY from the environment

app = Flask(__name__)
'''# initialize groq client
client = Groq(api_key=groq_api_key)
print(f"client:{client}") '''

static_image_path = os.path.join("images", "page2_print.jfif")

llm = ChatGroq(
    model="meta-llama/llama-4-maverick-17b-128e-instruct",
    temperature=0,
    max_tokens=None,
)
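# Note: temperature=0 keeps the output as deterministic as possible, which makes the
# model's JSON easier to parse; max_tokens=None leaves the response length to the API default.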
@app.route("/analyze", methods=["GET"])  # route path is an assumption; the original script never registers the view
def analyze_static_image():
    if not os.path.exists(static_image_path):
        return jsonify({"error": "Image not found"}), 404

    # Load the image and encode it as a base64 string
    with open(static_image_path, "rb") as image_file:
        image_bytes = image_file.read()
    img_base64 = base64.b64encode(image_bytes).decode("utf-8")

    # # Construct image content block
    # image_content_block = {
    #     "type": "image_url",
    #     "image_url": {
    #         # "url": f"data:image/jpeg;base64,{image_data_url}"
    #         "url": f"data:image/jpeg;base64,{img_base64}"
    #     }
    # }
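    # Multimodal chat models on Groq accept images as data URLs of the form
    # "data:image/jpeg;base64,<...>" inside an "image_url" content block.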
    # Set the system prompt
    system_prompt = """
You are an expert in visual scene understanding.
Your job is to analyze an image and respond with structured JSON.
The output may contain any number of "Sprites" (distinct characters, animals, or objects that appear **in front of the background**, e.g. a cat, ball, crab, or person) plus one "Backdrop", like this:
{
    "Sprite 1": {
        "name": "Cat",
        "description": "An orange cartoon cat with a cheerful expression, shown jumping playfully."
    },
    "Backdrop": {
        "name": "Beach Scene",
        "description": "A serene beach with sand, blue water, and a clear sky."
    }
}

Guidelines:
- Focus only on the images given in square shapes.
- Do not treat blank areas of the image as the "Backdrop".
- Do NOT classify the background scene as a sprite.
- All characters or objects placed in the foreground should be "Sprites".
- Use 'Sprite 1', 'Sprite 2', etc. for characters or figures.
- Use 'Backdrop' for the environmental setting or background behind the sprites.
- Do not include any generic summary or explanation outside these fields.

Return only valid JSON.
"""
    # Compose the user message content (text + image) in LangChain's multimodal format
    content = [
        {
            "type": "text",
            "text": "Analyze the image and describe the backdrops and characters as per the instructions."
        },
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{img_base64}"
            }
        }
    ]

    agent = create_react_agent(
        model=llm,
        tools=[],
        prompt=system_prompt,
    )
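    # create_react_agent builds a prebuilt LangGraph agent whose state is a list of
    # messages. With no tools supplied it behaves like a single system-prompted chat
    # call, and the model's final reply is the last entry in response["messages"].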
    # agent_executor = AgentExecutor(agent=agent, tools=[])
    # Pass the human prompt + system message
    # messages = [system_prompt, *human_prompt]

    # Call the LLM
    try:
        # response = llm.invoke(messages)
        # response = agent.invoke({"input": human_prompt})
        response = agent.invoke({"messages": [{"role": "user", "content": content}]})
        print(response)

        raw_response = response["messages"][-1].content
        # Strip a ```json ... ``` code fence if the model wrapped its answer in one
        cleaned_json_str = re.sub(r"^```json\s*|\s*```$", "", raw_response.strip(), flags=re.DOTALL)
        try:
            detected_info = json.loads(cleaned_json_str)
        except json.JSONDecodeError as e:
            # If parsing fails, fall back to the raw string (or handle the error as needed)
            print("JSON parsing error:", e)
            detected_info = cleaned_json_str

        # Extract the answer text from the response
        # detected_info = response.content
        # detected_info = raw_response
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    # Collect the detected information
    result = {
        "image_path": static_image_path,
        "detected_info": detected_info,
    }

    # Save the JSON result to disk
    with open("detected_image_info.json", "w") as f:
        json.dump(result, f, indent=4)
    print("Detection results saved to detected_image_info.json")

    return jsonify(result)


if __name__ == "__main__":
    app.run(debug=True)
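# Minimal usage sketch (assumes this file is saved as app.py and the route above is
# registered at /analyze; both names are illustrative, not part of the original script):
#
#   $ python app.py
#   $ curl http://127.0.0.1:5000/analyze
#
# The endpoint returns JSON shaped like
#   {"image_path": "images/page2_print.jfif",
#    "detected_info": {"Sprite 1": {...}, "Backdrop": {...}}}
# and also writes the same payload to detected_image_info.json.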
'''# build the chat messages
messages = [
    {
        "role": "system",
        "content": "you are an expert image analyzer. Describe backdrops and sprite/character in the image."
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Describe image in detail. What backdrops and characters are present ?"
            },
            image_content_block
        ]
    }
]'''
'''# create completion with Groq
response = client.chat.completions.create(
    model="meta-llama/llama-4-maverick-17b-128e-instruct",
    messages=messages,
    temperature=0,
    max_tokens=1024,
    top_p=1,
    stream=False
)
print(f"\n\n========RESPONSE CHOICES : {response}\n\n")

# extract the result
detected_info = response.choices[0].message.content
print(f"DETECTED_INFO : {detected_info}")

# save output to json
output_data = {
    "image_path": image_path,
    "detected_info": detected_info
}
print(f"output_data : {output_data}")

with open("detected_image_info.json", "w") as f:
    json.dump(output_data, f, indent=4)

print("✅ Detection results saved to detected_image_info.json")'''
# # Define the question to detect objects and characters in the image
# question = "What objects and characters are present in this image?"
# messages = [HumanMessage(content=[image_content_block, question])]
# print(messages)

# Invoke the model with the image and question
# response = llm.invoke({"image": image_content_block, "question": question})