Files changed (2) hide show
  1. app.py +68 -58
  2. requirements.txt +7 -6
app.py CHANGED
@@ -1,58 +1,68 @@
1
- import gradio as gr
2
- import torch
3
- from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
4
- from qwen_vl_utils import process_vision_info
5
-
6
- # Load the model and processor on available device(s)
7
- model = Qwen2VLForConditionalGeneration.from_pretrained(
8
- "Qwen/Qwen2-VL-72B-Instruct-AWQ",
9
- torch_dtype=torch.float16,
10
- #device_map="auto"
11
- )
12
-
13
- processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-72B-Instruct-AWQ")
14
-
15
- @spaces.GPU(duration=60)
16
- def generate_caption(image, prompt):
17
- messages = [
18
- {
19
- "role": "user",
20
- "content": [
21
- {
22
- "type": "image",
23
- "image": image, # The uploaded image
24
- },
25
- {"type": "text", "text": prompt},
26
- ],
27
- }
28
- ]
29
-
30
- # Prepare the input
31
- text = processor.apply_chat_template(
32
- messages, tokenize=False, add_generation_prompt=True
33
- )
34
- image_inputs, video_inputs = process_vision_info(messages)
35
- inputs = processor(
36
- text=[text],
37
- images=image_inputs,
38
- videos=video_inputs,
39
- padding=True,
40
- return_tensors="pt"
41
- )
42
- device = "cuda" if torch.cuda.is_available() else "cpu"
43
- inputs = inputs.to(device)
44
-
45
- # Generate the output
46
- generated_ids = model.generate(**inputs, max_new_tokens=128)
47
- generated_ids_trimmed = [
48
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
49
- ]
50
- output_text = processor.batch_decode(
51
- generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
52
- )
53
- return output_text[0]
54
-
55
-
56
- # Launch the Gradio interface with the updated inference function and title
57
- demo = gr.ChatInterface(fn=generate_caption, title="Qwen2-VL-72B-Instruct-OCR", multimodal=True, description="Upload your Image and get the best possible insights out of the Image")
58
- demo.queue().launch()
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import gradio as gr
3
+ import torch
4
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
5
+ from qwen_vl_utils import process_vision_info
6
+
7
# Hub repository that provides both the model weights and the processor.
MODEL_REPO = "Qwen/Qwen2-VL-72B-Instruct-AWQ"
# Alternative checkpoint kept for reference: "Qwen/Qwen2-VL-7B-Instruct"

# Device the inference inputs are moved to: CUDA when torch reports it, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the half-precision weights once at import time.
# NOTE(review): no device_map is passed and the model is not moved to `device`
# here -- confirm the model ends up on the same device as the inputs.
model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_REPO, torch_dtype=torch.float16)

# Processor that renders the chat template and prepares the model inputs.
processor = AutoProcessor.from_pretrained(MODEL_REPO)
19
+
20
def _build_messages(message, system_prompt=""):
    """Turn one multimodal chat message into the chat-template message list.

    Args:
        message: Dict with an optional "text" string and an optional "files"
            list holding the uploaded images.
        system_prompt: Optional instruction text; when non-blank it is
            prepended as a "system" turn.

    Returns:
        A list of role/content dicts: an optional system turn followed by a
        single user turn containing the text entry and one image entry per file.
    """
    content = [{"type": "text", "text": message.get("text", "")}]
    for uploaded in message.get("files") or []:
        # Entries are used as-is; a dict entry is unwrapped to its "path" value.
        # NOTE(review): presumably files arrive as path strings -- confirm
        # against the installed Gradio version.
        image = uploaded.get("path", uploaded) if isinstance(uploaded, dict) else uploaded
        content.append({"type": "image", "image": image})

    messages = []
    if system_prompt and system_prompt.strip():
        messages.append(
            {"role": "system", "content": [{"type": "text", "text": system_prompt}]}
        )
    messages.append({"role": "user", "content": content})
    return messages


@spaces.GPU(duration=60)
def generate_caption(message, history, system_prompt, max_new_tokens):
    """Generate the model's reply to one multimodal chat message.

    Args:
        message: Dict with the user's "text" and a "files" list of uploaded images.
        history: Previous chat turns; accepted for the chat callback signature
            but not used, so every request is answered without prior context.
        system_prompt: Instruction text sent as the system turn (skipped when blank).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded reply text, without the prompt tokens.
    """
    # The user turn lives at index 0 of a list, so images must be appended to
    # that turn's "content" list; the helper builds it in one place.
    messages = _build_messages(message, system_prompt)

    # Prepare the input
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Rebind so the moved batch is used even if `.to` returns a new object.
    inputs = inputs.to(device)
    # NOTE(review): the model itself is not moved to `device` anywhere in this
    # file -- confirm it is on the same device as `inputs`.

    # Generate the output; the UI slider value is coerced to an int token count.
    generated_ids = model.generate(**inputs, max_new_tokens=int(max_new_tokens))

    # Strip the prompt tokens so only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return output_text[0]
58
+
59
+ # Launch the Gradio interface with the updated inference function and title
60
# Build the chat UI and start serving it.
with gr.Blocks() as demo:
    # Extra controls are created unrendered (render=False) and handed to the
    # chat interface through additional_inputs; their values reach
    # generate_caption as system_prompt and max_new_tokens.
    system_prompt = gr.Textbox(
        value="You are helpful AI.",
        label="System Prompt",
        render=False,
    )
    tokens = gr.Slider(
        label="Max new tokens",
        minimum=1,
        maximum=4096,
        step=1,
        value=128,
        render=False,
    )

    gr.ChatInterface(
        fn=generate_caption,
        multimodal=True,
        title="Qwen2-VL-72B-Instruct-OCR",
        description="Upload your Image and get the best possible insights out of the Image",
        additional_inputs=[system_prompt, tokens],
    )

# Enable request queuing, then launch the app.
demo.queue().launch()
requirements.txt CHANGED
@@ -1,7 +1,8 @@
1
- huggingface_hub
2
- #torch==2.3.1
3
- torchvision==0.18.1
4
- accelerate
5
- qwen-vl-utils
6
- autoawq
 
7
  git+https://github.com/huggingface/transformers
 
1
+ spaces>=0.30.3
2
+ huggingface_hub
3
+ torch
4
+ torchvision
5
+ accelerate
6
+ qwen-vl-utils
7
+ git+https://github.com/casper-hansen/AutoAWQ
8
  git+https://github.com/huggingface/transformers