MiyamizuMitsuha committed
Commit 2bec297
1 Parent(s): a58826c

Update space

Files changed (2)
  1. app.py +136 -59
  2. requirements.txt +4 -1
app.py CHANGED
@@ -1,64 +1,141 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
     ],
 )


-if __name__ == "__main__":
-    demo.launch()
 import gradio as gr
+import os
+import numpy as np
+import torch
+import torchvision.transforms as T
+# from decord import VideoReader, cpu
+from PIL import Image
+from torchvision.transforms.functional import InterpolationMode
+from transformers import AutoModel, AutoTokenizer
+import matplotlib.pyplot as plt
+import glob
+
+
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+
+def build_transform(input_size):
+    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
+    transform = T.Compose([
+        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+        T.ToTensor(),
+        T.Normalize(mean=MEAN, std=STD)
+    ])
+    return transform
+
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    best_ratio_diff = float('inf')
+    best_ratio = (1, 1)
+    area = width * height
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+        if ratio_diff < best_ratio_diff:
+            best_ratio_diff = ratio_diff
+            best_ratio = ratio
+        elif ratio_diff == best_ratio_diff:
+            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                best_ratio = ratio
+    return best_ratio
+
+def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+
+    # calculate the existing image aspect ratio
+    target_ratios = set(
+        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
+        i * j <= max_num and i * j >= min_num)
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+    # find the closest aspect ratio to the target
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+
+    # calculate the target width and height
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+    # resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size
+        )
+        # split the image
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+    assert len(processed_images) == blocks
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+    return processed_images
+
+def load_image(image_file, input_size=448, max_num=12):
+    image = Image.open(image_file).convert('RGB')
+    transform = build_transform(input_size=input_size)
+    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+    pixel_values = [transform(image) for image in images]
+    pixel_values = torch.stack(pixel_values)
+    return pixel_values
+
+
+model_name = "YuukiAsuna/Vintern-1B-v2-ViTable-docvqa"
+
+
+model = AutoModel.from_pretrained(
+    model_name,
+    torch_dtype=torch.bfloat16,
+    low_cpu_mem_usage=True,
+    trust_remote_code=True
+).eval().cuda()
+
+
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
+
+
+
+
+def Vintern_1B_v2_ViTable_docvqa(image, question, chat_history=[]):
+    pixel_values = load_image(image, max_num=12).to(torch.bfloat16).cuda()
+
+    generation_config = dict(max_new_tokens=1024, do_sample=False, num_beams=3, repetition_penalty=2.0)
+
+    # question = input("Question: ")
+    question = '<image>\n' + question
+    response = model.chat(tokenizer, pixel_values, question, generation_config)
+    print(f'User: {question}\nAssistant: {response}')
+    print("="*30)
+
+
+    # Update the chat history
+    chat_history.append((image, None))
+    chat_history.append((question, None))
+    chat_history.append((None, response))
+
+    return chat_history
+
+
+
+interface = gr.Interface(
+    fn=Vintern_1B_v2_ViTable_docvqa,
+    inputs=[
+        gr.Image(label="Upload Image", type="filepath", optional=True),  # Image input
+        gr.Textbox(label="Enter your question", optional=True),  # Text input
     ],
+    outputs=gr.Chatbot(label="Chat History"),  # Chatbot-style output
+    title="Vintern-1B-v2-ViTable-docvqa",
+    # description="A chatbot that accepts both images and text, displays images, and provides conversational responses.",
+    allow_flagging="never",
 )

+# Launch the chatbot
+interface.launch()
+
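Note on the preprocessing in this commit: dynamic_preprocess cuts each uploaded document image into at most max_num=12 tiles of 448x448 pixels, picking the tile grid whose aspect ratio best matches the original image. The sketch below is a minimal standalone illustration of that grid-selection step; it mirrors the arithmetic of find_closest_aspect_ratio/dynamic_preprocess rather than importing app.py, and the helper name pick_grid is hypothetical.

# Minimal sketch of the tile-grid selection used by dynamic_preprocess() above.
# pick_grid is an illustrative name, not part of app.py.
def pick_grid(width, height, min_num=1, max_num=12, image_size=448):
    aspect_ratio = width / height
    # enumerate every (cols, rows) grid whose tile count lies in [min_num, max_num]
    ratios = sorted(
        {(i, j)
         for n in range(min_num, max_num + 1)
         for i in range(1, n + 1)
         for j in range(1, n + 1)
         if min_num <= i * j <= max_num},
        key=lambda r: r[0] * r[1],
    )
    best, best_diff, area = (1, 1), float("inf"), width * height
    for cols, rows in ratios:
        diff = abs(aspect_ratio - cols / rows)
        if diff < best_diff:
            best_diff, best = diff, (cols, rows)
        elif diff == best_diff and area > 0.5 * image_size * image_size * cols * rows:
            # tie-break: prefer the larger grid only when the source image is big enough
            best = (cols, rows)
    return best

cols, rows = pick_grid(896, 448)
print(cols, rows)  # 2 1 -> two 448x448 tiles

For an 896x448 page this selects a 2x1 grid, so load_image returns two 448x448 tiles plus the extra 448x448 thumbnail appended when use_thumbnail=True.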
requirements.txt CHANGED
@@ -1 +1,4 @@
-huggingface_hub==0.25.2
+huggingface_hub==0.25.2
+!pip install timm einops
+!wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+!pip install --no-dependencies --upgrade flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
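Note: pip reads requirements.txt as a list of requirement specifiers, so the `!pip install ...` and `!wget ...` lines added here are notebook-style shell commands that pip will not execute when the Space builds. A sketch of an equivalent requirements file, assuming unpinned timm and einops and reusing the wheel URL from this commit:

huggingface_hub==0.25.2
timm
einops
https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl

Listing the wheel URL directly installs flash-attn from that prebuilt wheel; unlike the `--no-dependencies` invocation above, pip would also try to resolve the wheel's declared dependencies.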