XFFXFF committed
Commit 918d8ec
Parent: ad9ba0d

Revert "update readme"


This reverts commit ad9ba0dc580ec9c0b76db76e799ea527caa21f55.

Files changed (1)
  README.md: +24 -26
README.md CHANGED
@@ -60,10 +60,7 @@ base_model:
 ## Quick Start
 ### Installation
 ```
-# Install transformers from GitHub until the next release includes the Aria model
-pip install git+https://github.com/huggingface/transformers.git
-
-pip install accelerate sentencepiece torchvision requests torch Pillow
+pip install transformers==4.45.0 accelerate==0.34.1 sentencepiece==0.2.0 torchvision requests torch Pillow
 pip install flash-attn --no-build-isolation
 
 # For better inference performance, you can install grouped-gemm, which may take 3-5 minutes to install
@@ -80,24 +77,23 @@ Here is a code snippet to show you how to use Aria.
 import requests
 import torch
 from PIL import Image
-
-from transformers import AriaProcessor, AriaForConditionalGeneration
-
-
-model_id_or_path = "rhymes-ai/Aria"
-model = AriaForConditionalGeneration.from_pretrained(
-    model_id_or_path, device_map="auto", torch_dtype=torch.bfloat16
-)
-
-processor = AriaProcessor.from_pretrained(model_id_or_path)
-
-image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+from transformers import AutoModelForCausalLM, AutoProcessor
+
+model_id_or_path = "rhymes-ai/Aria"
+
+model = AutoModelForCausalLM.from_pretrained(model_id_or_path, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)
+
+processor = AutoProcessor.from_pretrained(model_id_or_path, trust_remote_code=True)
+
+image_path = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"
+
+image = Image.open(requests.get(image_path, stream=True).raw)
 
 messages = [
     {
         "role": "user",
         "content": [
-            {"type": "image"},
+            {"text": None, "type": "image"},
             {"text": "what is the image?", "type": "text"},
         ],
     }
@@ -105,20 +101,22 @@ messages = [
 
 text = processor.apply_chat_template(messages, add_generation_prompt=True)
 inputs = processor(text=text, images=image, return_tensors="pt")
-inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
-inputs.to(model.device)
-
-output = model.generate(
-    **inputs,
-    max_new_tokens=15,
-    stop_strings=["<|im_end|>"],
-    tokenizer=processor.tokenizer,
-    do_sample=True,
-    temperature=0.9,
-)
-output_ids = output[0][inputs["input_ids"].shape[1]:]
-response = processor.decode(output_ids, skip_special_tokens=True)
-print(response)
+inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)
+inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.bfloat16):
+    output = model.generate(
+        **inputs,
+        max_new_tokens=500,
+        stop_strings=["<|im_end|>"],
+        tokenizer=processor.tokenizer,
+        do_sample=True,
+        temperature=0.9,
+    )
+    output_ids = output[0][inputs["input_ids"].shape[1]:]
+    result = processor.decode(output_ids, skip_special_tokens=True)
+
+print(result)
 ```
 
 ### Advanced Inference and Fine-tuning
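
The restored install line pins exact versions rather than tracking transformers from GitHub. A quick sanity check (a suggested snippet, not part of the README) confirms the pins resolved before running the model:

```
# Suggested check, not from the README: verify the pinned versions resolved.
import accelerate
import transformers

print(transformers.__version__)  # expect "4.45.0", per the pin in the install command
print(accelerate.__version__)    # expect "0.34.1", per the pin in the install command
```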
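
For reference, the `+` lines above assemble into the following self-contained script, i.e. the Quick Start snippet as it reads after the revert. It assumes a CUDA-capable GPU with enough memory to hold rhymes-ai/Aria in bfloat16. `trust_remote_code=True` is needed because transformers 4.45.0 predates built-in Aria support (which is why the removed variant installed transformers from GitHub instead), so the modeling code is fetched from the Hub.

```
import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

model_id_or_path = "rhymes-ai/Aria"

# Load model and processor from the Hub; trust_remote_code pulls Aria's custom
# modeling/processing code, since transformers 4.45.0 has no built-in Aria class.
model = AutoModelForCausalLM.from_pretrained(
    model_id_or_path, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True
)
processor = AutoProcessor.from_pretrained(model_id_or_path, trust_remote_code=True)

# Test image from the Hugging Face documentation-images dataset, as in the README.
image_path = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"
image = Image.open(requests.get(image_path, stream=True).raw)

messages = [
    {
        "role": "user",
        "content": [
            {"text": None, "type": "image"},
            {"text": "what is the image?", "type": "text"},
        ],
    }
]

text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=text, images=image, return_tensors="pt")
inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)
inputs = {k: v.to(model.device) for k, v in inputs.items()}

with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.bfloat16):
    output = model.generate(
        **inputs,
        max_new_tokens=500,
        stop_strings=["<|im_end|>"],
        tokenizer=processor.tokenizer,  # stop_strings requires a tokenizer
        do_sample=True,
        temperature=0.9,
    )
    output_ids = output[0][inputs["input_ids"].shape[1]:]
    result = processor.decode(output_ids, skip_special_tokens=True)

print(result)
```

Note that `generate` raises an error if `stop_strings` is passed without a tokenizer, hence the explicit `tokenizer=processor.tokenizer` argument in the snippet.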