Seungyoun committed on
Commit
7e83c59
1 Parent(s): 0d6aaac

Update "Quick Start"

Browse files
Files changed (1) hide show
  1. README.md +24 -10
README.md CHANGED
@@ -21,33 +21,47 @@ from transformers import (
21
  LlavaProcessor,
22
  LlavaForConditionalGeneration,
23
  )
24
- import torch
25
  from PIL import Image
26
  import requests
27
 
28
  MODEL_NAME = "Seungyoun/llava-llama-3-8b-hf"
29
 
30
  processor = LlavaProcessor.from_pretrained(MODEL_NAME)
31
- # add 128257 <image> , <pad>
32
- processor.tokenizer.add_tokens(["<|image|>", "<pad>"], special_tokens=True)
 
33
 
34
  model = LlavaForConditionalGeneration.from_pretrained(MODEL_NAME).to("cuda:0")
35
- # resize embeddings
36
- model.resize_token_embeddings(len(processor.tokenizer))
 
37
 
38
 
39
  # prepare image and text prompt, using the appropriate prompt template
40
- url = "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTd4g61TSw890IYKBbPMgXPyWAKdVOpWWUAF0-FGzgX2Q&s"
41
  image = Image.open(requests.get(url, stream=True).raw)
42
- prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <|image|>\nWhat is shown in this image? ASSISTANT:" # FIX : Chat template
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
45
 
46
  # autoregressively complete prompt
47
- output = model.generate(**inputs, max_new_tokens=100)
48
 
49
- print(processor.decode(output[0], skip_special_tokens=True))
50
- # What is shown in this image? ASSISTANT: The image shows a heartwarming scene of two dogs sitting together on a couch. The dogs are of different breeds, one being a golden retriever and the other being a tabby cat. The dogs are sitting close together, indicating a strong bond between them. The image captures a beautiful moment of companionship between two different species. sit on couch. golden retriever and tabby cat. dogs are sitting together. companionship between two different species.
51
  ```
52
  ---
53
 
 
21
  LlavaProcessor,
22
  LlavaForConditionalGeneration,
23
  )
 
24
  from PIL import Image
25
  import requests
26
 
27
  MODEL_NAME = "Seungyoun/llava-llama-3-8b-hf"
28
 
29
  processor = LlavaProcessor.from_pretrained(MODEL_NAME)
30
+ processor.tokenizer.add_tokens(
31
+ ["<|image|>", "<pad>"], special_tokens=True
32
+ ) # add 128257 <|image|> , <pad>
33
 
34
  model = LlavaForConditionalGeneration.from_pretrained(MODEL_NAME).to("cuda:0")
35
+ model.resize_token_embeddings(
36
+ len(processor.tokenizer)
37
+ ) # resize embeddings for new tokens
38
 
39
 
40
  # prepare image and text prompt, using the appropriate prompt template
41
+ url = "https://upload.wikimedia.org/wikipedia/commons/1/18/Kochendes_wasser02.jpg"
42
  image = Image.open(requests.get(url, stream=True).raw)
43
+
44
+ template = """<|start_header_id|>system<|end_header_id|>{system_prompt}<|eot_id|>
45
+ <|start_header_id|>user<|end_header_id|>{user_msg_1}<|eot_id|>
46
+ <|start_header_id|>assistant<|end_header_id|>"""
47
+
48
+ terminators = [
49
+ processor.tokenizer.eos_token_id,
50
+ processor.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
51
+ ]
52
+
53
+ prompt = template.format(
54
+ system_prompt="As a vision-llm, your task is to analyze and describe the contents of the image presented to you. Examine the photograph closely and provide a comprehensive, detailed caption. You should identify and describe the various food items and their arrangement, as well as any discernible textures, colors, and specific features of the containers they are in. Highlight the variety and how these contribute to the overall visual appeal of the meal. Your description should help someone who cannot see the image to visualize its contents accurately.",
55
+ user_msg_1="<|image|>\nGive me detailed description of the image.",
56
+ )
57
 
58
  inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
59
 
60
  # autoregressively complete prompt
61
+ output = model.generate(**inputs, max_new_tokens=1024, eos_token_id=terminators)
62
 
63
+ print(processor.decode(output[0], skip_special_tokens=False))
64
+ # The image captures a moment in a kitchen. The main focus is a white electric kettle, which is plugged in and resting on a black stovetop. The stovetop has four burners, although only one is occupied by the kettle. The background is blurred, drawing attention to the kettle and stovetop. The image does not contain any text or additional objects. The relative position of the objects is such that the kettle is on the stovetop, and the background is blurred.
65
  ```
66
  ---
67