Tj
/

SmolVLM_Proxy

Text Generation

vision-language

Model card Files Files and versions

SmolVLM_Proxy / test_ui_agent.py

Tj's picture

Tj

Upload SmolVLM final merged model

baa41dd verified about 2 months ago

history blame contribute delete

3.31 kB

	"""
	SmolVLM UI Automation Agent - Test Script
	Your trained model is ready!
	"""

	import torch
	from transformers import Idefics3ForConditionalGeneration, AutoProcessor
	from PIL import Image
	import os

	def load_model(model_path=None):
	    """Load the fine-tuned SmolVLM model together with its processor.

	    Args:
	        model_path: Location of the merged checkpoint to load. When omitted,
	            the script's original hard-coded local directory is used, so
	            existing ``load_model()`` calls behave exactly as before.

	    Returns:
	        A ``(model, processor)`` tuple.
	    """
	    if model_path is None:
	        # Historical default: the machine-specific directory the script shipped with.
	        model_path = r"C:\Users\keith\OneDrive\Desktop\admin.trac.jobs-DATA\LLaMA-Factory_local\smolvlm_final_merged"

	    print("Loading your trained SmolVLM UI automation agent...")
	    model = Idefics3ForConditionalGeneration.from_pretrained(
	        model_path,
	        torch_dtype=torch.bfloat16,
	        device_map="auto",
	        # NOTE(review): trust_remote_code permits execution of Python code shipped
	        # with the checkpoint; kept for compatibility, only load trusted checkpoints.
	        trust_remote_code=True,
	    )

	    processor = AutoProcessor.from_pretrained(model_path)
	    print("Model loaded successfully!")
	    return model, processor

	def analyze_screenshot(image_path: str, model, processor):
	    """Ask the model to describe UI automation targets in a screenshot.

	    Args:
	        image_path: Path of the screenshot image file.
	        model: Generation model exposing ``generate`` (see ``load_model``).
	        processor: Matching processor used to build inputs and decode output.

	    Returns:
	        The generated analysis text, stripped of surrounding whitespace and
	        without the echoed prompt.
	    """
	    # The context manager releases the file handle; convert() returns a
	    # fully loaded copy that stays valid after the file is closed.
	    with Image.open(image_path) as raw_image:
	        image = raw_image.convert("RGB")
	    prompt = "<image>\nAnalyze this interface for UI automation opportunities. Identify clickable elements and automation targets."

	    inputs = processor(text=prompt, images=[image], return_tensors="pt")

	    # The processor builds CPU tensors; move them next to the model so
	    # generation does not hit a device mismatch when the model is on a GPU.
	    target_device = getattr(model, "device", None)
	    if target_device is not None:
	        inputs = inputs.to(target_device)
	    prompt_length = inputs["input_ids"].shape[-1]

	    with torch.no_grad():
	        outputs = model.generate(
	            **inputs,
	            max_new_tokens=150,
	            do_sample=True,
	            temperature=0.7,
	            top_p=0.9,
	        )

	    # generate() returns prompt tokens followed by the continuation; decode
	    # only the continuation so the prompt is not repeated in the result.
	    generated_tokens = outputs[0][prompt_length:]
	    response = processor.decode(generated_tokens, skip_special_tokens=True)

	    # Fallback for outputs that still carry a chat-style role marker.
	    if "Assistant:" in response:
	        response = response.split("Assistant:")[-1]

	    return response.strip()

	def main():
	    """Run the interactive console loop for analyzing screenshots.

	    Loads the model once, then repeatedly offers a two-item menu: analyze a
	    screenshot at a user-supplied path, or quit. A failure while loading the
	    model is reported and ends the program; a failure while analyzing one
	    screenshot is reported and the menu is shown again. End-of-input or
	    Ctrl+C at a prompt exits cleanly.
	    """
	    print("🤖 SmolVLM UI Automation Agent")
	    print("=" * 50)
	    print("Your custom-trained model for TRAC administration!")
	    print()

	    # Only the load step is guarded here, so the "loading" message is never
	    # printed for unrelated failures later in the session.
	    try:
	        model, processor = load_model()
	    except Exception as e:
	        print(f"❌ Error loading model: {e}")
	        print("Make sure the model was merged successfully.")
	        return

	    while True:
	        print("\nOptions:")
	        print("1. Analyze a screenshot")
	        print("2. Quit")

	        image_path = ""
	        try:
	            choice = input("\nEnter choice (1-2): ").strip()
	            if choice == "1":
	                # Paths pasted from a file manager are often wrapped in quotes.
	                image_path = input("Enter path to screenshot: ").strip().strip('"')
	        except (EOFError, KeyboardInterrupt):
	            # Closed stdin or Ctrl+C at a prompt is treated as a request to quit.
	            print("\n👋 Goodbye!")
	            break

	        if choice == "2":
	            print("👋 Goodbye!")
	            break
	        if choice != "1":
	            print("❌ Invalid choice!")
	            continue

	        # isfile() also rejects directories, which exists() would let through.
	        if not os.path.isfile(image_path):
	            print("❌ Image file not found!")
	            continue

	        print("\n🔍 Analyzing screenshot...")
	        try:
	            result = analyze_screenshot(image_path, model, processor)
	        except Exception as e:
	            # Keep the session alive: one bad image should not end the loop.
	            print(f"❌ Analysis error: {e}")
	            continue

	        print("\n🎯 Analysis Result:")
	        print("-" * 30)
	        print(result)
	        print("-" * 30)

	# Start the interactive session only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()