sundevil45 committed
Commit 9177d69 · verified · 1 parent: cb3d86f

Update README.md

Update num_beams to default (drop `num_beams=5` from the generation call, so sampling runs with the default `num_beams=1`)
Files changed (1): README.md (+66 -67)
README.md CHANGED
 
---
license: apache-2.0
---

We open-sourced Flame-Waterfall-7B, a model built by connecting DeepSeek-Coder-7B-Instruct and the SigLIP vision encoder with a 2-layer MLP and instruction-tuning it on the Flame-Code-VLM/Flame-Waterfall-React dataset.

This model is released to showcase the value of the synthesized dataset; it is not intended for general-purpose tasks, so please use it with caution.
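For context on the connector described above: in LLaVA-style models, a 2-layer MLP projector is just two linear layers with a GELU in between, mapping vision-encoder patch features into the language model's embedding space. The sketch below is illustrative only; the feature sizes are assumptions (SigLIP-SO400M-style encoders emit 1152-dim features and 7B LLaMA-family decoders use 4096-dim embeddings), not values read from the released checkpoint.

```python
import torch.nn as nn

class VisionProjector(nn.Module):
    """Sketch of a LLaVA-style 2-layer MLP connector (dims are assumptions)."""

    def __init__(self, vision_dim: int = 1152, lm_dim: int = 4096):
        super().__init__()
        self.proj = nn.Sequential(
            nn.Linear(vision_dim, lm_dim),  # lift vision features to LM width
            nn.GELU(),
            nn.Linear(lm_dim, lm_dim),      # refine within the LM embedding space
        )

    def forward(self, vision_features):
        # vision_features: (batch, num_patches, vision_dim)
        return self.proj(vision_features)  # (batch, num_patches, lm_dim)
```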
### Generation

The following is sample code for inference.
```python
# Setup:
#   pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
#   Replace the corresponding code files in the original repository with those in
#   https://github.com/Flame-Code-VLM/Flame-Code-VLM/tree/main/model
#   export PYTHONPATH="/your_path_to_LLaVA-NeXT_repo:$PYTHONPATH"

import warnings

import torch
from PIL import Image

from llava.model.builder import load_pretrained_model
from llava.mm_utils import process_images, tokenizer_image_token
from llava.constants import DEFAULT_IMAGE_TOKEN  # the literal "<image>" placeholder

warnings.filterwarnings("ignore")

pretrained = "Flame-Code-VLM/flame_waterfall_7b"
model_name = "flame"
device = "cuda"
device_map = "auto"
llava_model_args = {
    "multimodal": True,
    "attn_implementation": None,
}
tokenizer, model, image_processor, max_length = load_pretrained_model(
    pretrained, None, model_name, device_map=device_map, **llava_model_args
)
model.config.tokenizer_padding_side = "left"  # use left padding for batch processing
model.eval()

image_path = "path_to_your_screenshot_image_file"
image = Image.open(image_path)
image_tensor = process_images([image], image_processor, model.config)
image_tensor = [_image.to(dtype=torch.float16, device=device) for _image in image_tensor]

# DEFAULT_IMAGE_TOKEN ("<image>") marks where the image features are spliced into
# the prompt; tokenizer_image_token splits the prompt on this placeholder.
prompt = f"Below is an image of the page to create. Generate React code and styles to replicate the design, including layout, typography, and styling. Format your response as follows:'// CSS\n[CSS/SCSS code]\n\n// [React Implementation (JS/TS/JSX/TSX)]\n[Component code]'.\n\n ### Input Image:\n{DEFAULT_IMAGE_TOKEN}\n\n### Response:\n"

input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
input_ids = input_ids.unsqueeze(0).to(device)  # add a batch dimension
image_sizes = [image.size]
modalities = ["image"]

cont = model.generate(
    input_ids,
    images=image_tensor,
    image_sizes=image_sizes,
    modalities=modalities,
    do_sample=True,  # nucleus sampling with the default num_beams=1
    temperature=0.1,
    max_new_tokens=4096,
    top_p=0.95,
    repetition_penalty=1.05,
)

text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)
print(text_outputs[0])
```
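The prompt asks the model to emit the stylesheet first and the component second, separated by `// CSS` and `// [React Implementation ...]` markers. If you want to write the two parts to disk, a minimal sketch along these lines can split the decoded text; the `split_response` helper and the output filenames are illustrative, not part of the released code, and real outputs may deviate from the requested format.

```python
import re

def split_response(text: str) -> tuple[str, str]:
    """Split a generated response into (css, component) source strings."""
    # Match any '// [React Implementation ...]' variant the model may produce.
    parts = re.split(r"//\s*\[React Implementation[^\]]*\]", text, maxsplit=1)
    if len(parts) != 2:
        raise ValueError("response does not contain the expected component marker")
    css = parts[0].replace("// CSS", "", 1).strip()
    component = parts[1].strip()
    return css, component

css_code, component_code = split_response(text_outputs[0])
with open("App.scss", "w") as f:
    f.write(css_code)
with open("App.jsx", "w") as f:
    f.write(component_code)
```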