import json
import io
import base64
import os, re
from langchain_google_vertexai.vision_models import VertexAIVisualQnAChat
from PIL import Image
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from groq import Groq
from flask import Flask, jsonify
from langgraph.prebuilt import create_react_agent
load_dotenv()
# os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")
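# load_dotenv() above reads GROQ_API_KEY from a local .env file, and ChatGroq below
# picks it up from the environment. A minimal sketch of that file (the value is a
# placeholder, not a real credential):
#
#   GROQ_API_KEY=your-groq-api-key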
app = Flask(__name__)
'''#initialize groq client
client = Groq(api_key=groq_api_key)
print(f"client:{client}") '''
static_image_path = os.path.join("images", "page2_print.jfif")
llm = ChatGroq(
    model="meta-llama/llama-4-maverick-17b-128e-instruct",
    temperature=0,
    max_tokens=None,
)
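
# Optional helper (a sketch; validate_detection is a name introduced here and is not
# called by the route below): loosely checks that a parsed model response follows the
# "Sprite N" / "Backdrop" schema requested in the system prompt inside the route.
def validate_detection(data) -> bool:
    """Return True if data looks like {"Sprite 1": {...}, "Backdrop": {...}, ...}."""
    if not isinstance(data, dict) or not data:
        return False
    for key, value in data.items():
        # Every top-level key must be either "Backdrop" or a "Sprite ..." entry
        if key != "Backdrop" and not key.startswith("Sprite"):
            return False
        # Every entry must carry a name and a description
        if not isinstance(value, dict) or "name" not in value or "description" not in value:
            return False
    return True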
@app.route("/", methods=["GET"])
def analyze_static_image():
    if not os.path.exists(static_image_path):
        return jsonify({"error": f"Image not found at {static_image_path}"}), 404

    # Load the image and convert it to a base64 string
    with open(static_image_path, "rb") as image_file:
        image_bytes = image_file.read()
    img_base64 = base64.b64encode(image_bytes).decode("utf-8")
    # # Construct image content block
    # image_content_block = {
    #     "type": "image_url",
    #     "image_url": {
    #         # "url": f"data:image/jpeg;base64,{image_data_url}"
    #         "url": f"data:image/jpeg;base64,{img_base64}"
    #     }
    # }

    # Set the system prompt
    system_prompt = """
    You are an expert in visual scene understanding.
    Your job is to analyze an image and respond with structured JSON like this:
    - Any number of "Sprites": these refer to distinct characters, animals, or objects in the image that are **in front of the background** (e.g., cat, ball, crab, person, etc.).
    {
        "Sprite 1": {
            "name": "Cat",
            "description": "An orange cartoon cat with a cheerful expression, shown jumping playfully."
        },
        "Backdrop": {
            "name": "Beach Scene",
            "description": "A serene beach with sand, blue water, and a clear sky."
        }
    }
    Guidelines:
    - Focus only on the images given in square shapes.
    - Don't consider blank areas of the image as the "Backdrop".
    - Do NOT classify the background scene as a sprite.
    - All characters or objects placed in the foreground should be "Sprites".
    - Use 'Sprite 1', 'Sprite 2', etc. for characters or figures.
    - Use 'Backdrop' for the environmental setting or background behind the sprites.
    - Don't include a generic summary or explanation outside the fields.
    Return only valid JSON.
    """
    # Compose the multimodal message content (text + base64 image) for the agent
    content = [
        {
            "type": "text",
            "text": "Analyze the image and describe the backdrops and characters as per instruction."
        },
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{img_base64}"
            }
        }
    ]
    # Build a ReAct-style agent with no tools; the system prompt steers the model
    agent = create_react_agent(
        model=llm,
        tools=[],
        prompt=system_prompt
    )
    # agent_executor = AgentExecutor(agent=agent, tools=[])

    # Pass the human prompt + system message
    # messages = [system_prompt, *human_prompt]

    # Call the LLM
    try:
        # response = llm.invoke(messages)
        # response = agent.invoke({"input": human_prompt})
        response = agent.invoke({"messages": [{"role": "user", "content": content}]})
        print(response)
        raw_response = response["messages"][-1].content

        # Strip a Markdown ```json ... ``` fence if the model wrapped its output in one
        cleaned_json_str = re.sub(r"^```json\s*|\s*```$", "", raw_response.strip(), flags=re.DOTALL)
        try:
            detected_info = json.loads(cleaned_json_str)
        except json.JSONDecodeError as e:
            # If parsing fails, fall back to the raw string (or handle the error as needed)
            print("JSON parsing error:", e)
            detected_info = cleaned_json_str
        # Extract the answer text from the response
        # detected_info = response.content
        # detected_info = raw_response
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    # Collect the detected information alongside the image path
    result = {
        "image_path": static_image_path,
        "detected_info": detected_info,
    }

    # Save the JSON result to disk
    with open("detected_image_info.json", "w") as f:
        json.dump(result, f, indent=4)
    print("Detection results saved to detected_image_info.json")

    return jsonify(result)

if __name__ == "__main__":
    app.run(debug=True)
'''# Build the chat messages (earlier Groq-client approach, kept for reference)
messages = [
    {
        "role": "system",
        "content": "You are an expert image analyzer. Describe backdrops and sprites/characters in the image."
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Describe the image in detail. What backdrops and characters are present?"
            },
            image_content_block
        ]
    }
]'''
'''# Create a completion with the Groq client
response = client.chat.completions.create(
    model="meta-llama/llama-4-maverick-17b-128e-instruct",
    messages=messages,
    temperature=0,
    max_tokens=1024,
    top_p=1,
    stream=False
)
print(f"\n\n========RESPONSE CHOICES : {response}\n\n")

# Extract the result
detected_info = response.choices[0].message.content
print(f"DETECTED_INFO : {detected_info}")

# Save the output to JSON
output_data = {
    "image_path": image_path,
    "detected_info": detected_info
}
print(f"output_data : {output_data}")

with open("detected_image_info.json", "w") as f:
    json.dump(output_data, f, indent=4)

print("✅ Detection results saved to detected_image_info.json")'''
# # Define the question to detect objects and characters in the image
# question = "What objects and characters are present in this image?"
# messages = [HumanMessage(content=[image_content_block, question])]
# print(messages)
# Invoke the model with the image and question
# response = llm.invoke({"image": image_content_block, "question": question})
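
# Example client call (a sketch, assuming the dev server is running on Flask's
# default http://127.0.0.1:5000 and that images/page2_print.jfif exists):
#
#   import requests
#   resp = requests.get("http://127.0.0.1:5000/")
#   print(resp.json())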