Image-Text-to-Text · Safetensors · openvla · custom_code
emrys-hong committed · Commit 3debc13 · verified · 1 Parent(s): 1b2ebf2

Update README.md

Files changed (1): README.md (+4 -6)
README.md CHANGED
@@ -72,18 +72,16 @@ vla = AutoModelForVision2Seq.from_pretrained(
     low_cpu_mem_usage=True,
     trust_remote_code=True
 ).to("cuda:0")
+processor = AutoProcessor.from_pretrained("declare-lab/Emma-X", trust_remote_code=True)
 
-# Grab image input & format prompt of size 224x224
 image: Image.Image = get_from_camera(...)
 prompt = "In: What action should the robot take to achieve the instruction\nINSTRUCTION: \n{<Instruction here>}\n\nOut: "
 
 # Predict Action (action is a 7 dimensional vector to control the robot)
-action, grounded_reasoning = vla.generate_actions(
-    image=image, prompt_text=prompt, type="act", do_sample=False,
-    max_new_tokens=512, do_sample=False
-)
+inputs = processor(prompt, image).to("cuda:0", dtype=torch.bfloat16)
+action, _ = vla.generate_actions(inputs, do_sample=False, max_new_tokens=512)
 
-print("Grounded Reasoning:", grounded_reasoning)
+print("action", action)
 # Execute...
 robot.act(action, ...)
 ```
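
For reference, a minimal sketch of how the full inference snippet reads after this change, assembled from the diff above. The `from_pretrained` arguments outside the shown hunk (the model id and `torch_dtype`) and the import lines are assumptions based on the surrounding README, not part of this commit; `get_from_camera` and `robot.act` are placeholders carried over from the original example.

```python
# Sketch of the updated usage example; anything outside the diff hunk is assumed.
import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

# Load Emma-X with its custom modeling code (torch_dtype here is an assumption)
vla = AutoModelForVision2Seq.from_pretrained(
    "declare-lab/Emma-X",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).to("cuda:0")
processor = AutoProcessor.from_pretrained("declare-lab/Emma-X", trust_remote_code=True)

# Grab image input & format the prompt (get_from_camera is a placeholder)
image: Image.Image = get_from_camera(...)
prompt = "In: What action should the robot take to achieve the instruction\nINSTRUCTION: \n{<Instruction here>}\n\nOut: "

# Predict Action (action is a 7 dimensional vector to control the robot)
inputs = processor(prompt, image).to("cuda:0", dtype=torch.bfloat16)
action, _ = vla.generate_actions(inputs, do_sample=False, max_new_tokens=512)
print("action", action)

# Execute on the robot (robot.act is a placeholder)
robot.act(action, ...)
```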