amberborici committed on
Commit 9c9f973
·
1 Parent(s): f03a25b
Files changed (1)
  1. app.py +155 -188
app.py CHANGED
@@ -1,211 +1,178 @@
  import gradio as gr
- import requests
- import base64
- import io
  from PIL import Image
- import os
 
- def encode_image_to_base64(image):
-     """Convert PIL image to base64 string"""
-     buffered = io.BytesIO()
-     image.save(buffered, format="JPEG")
-     img_str = base64.b64encode(buffered.getvalue()).decode()
-     return f"data:image/jpeg;base64,{img_str}"
 
- def process_images_with_api(images, prompt, api_key):
-     """
-     Process multiple images using Hugging Face Inference API
-
-     Args:
-         images: List of uploaded images
-         prompt: User-provided prompt
-         api_key: Hugging Face API key
-
-     Returns:
-         Generated descriptions
-     """
      if not images:
          return "Please upload at least one image."
 
-     if not api_key:
-         return "Please provide your Hugging Face API key."
-
-     # API endpoint for Qwen2-VL model
-     api_url = "https://api-inference.huggingface.co/models/Qwen/Qwen2-VL-7B-Instruct"
-
-     headers = {
-         "Authorization": f"Bearer {api_key}",
-         "Content-Type": "application/json"
-     }
-
      results = []
-
      for i, image in enumerate(images):
-         if image is None:
-             continue
-
-         try:
-             # Convert numpy array to PIL Image
-             pil_image = Image.fromarray(image)
-
-             # Encode image to base64
-             base64_image = encode_image_to_base64(pil_image)
-
-             # Prepare the request payload
-             payload = {
-                 "inputs": [
-                     {
-                         "role": "user",
-                         "content": [
-                             {
-                                 "type": "text",
-                                 "text": prompt
-                             },
-                             {
-                                 "type": "image_url",
-                                 "image_url": {
-                                     "url": base64_image
-                                 }
-                             }
-                         ]
-                     }
-                 ]
-             }
-
-             # Make API request
-             response = requests.post(api_url, headers=headers, json=payload, timeout=60)
-
-             if response.status_code == 200:
-                 result = response.json()
-                 if "choices" in result and len(result["choices"]) > 0:
-                     description = result["choices"][0]["message"]["content"]
-                     results.append(f"Image {i+1}: {description}")
-                 else:
-                     results.append(f"Image {i+1}: ❌ No response from API")
-             else:
-                 error_msg = f"API Error (Status {response.status_code}): {response.text}"
-                 results.append(f"Image {i+1}: ❌ {error_msg}")
-
-         except Exception as e:
-             results.append(f"Image {i+1}: ❌ Error - {str(e)}")
-
-     if not results:
-         return "No valid images processed."
 
      return "\n\n".join(results)
 
- def create_gradio_interface():
-     """Create the Gradio interface for Hugging Face Spaces"""
-
-     with gr.Blocks(
-         title="Multi-Image AI Processor",
-         theme=gr.themes.Soft(),
-         fill_height=True
-     ) as demo:
-
-         gr.Markdown("# 🖼️ Multi-Image AI Processor")
-         gr.Markdown("Upload multiple images and get AI-generated descriptions using the Qwen2-VL model via Hugging Face Inference API.")
-
-         with gr.Row():
-             with gr.Column(scale=2):
-                 # Image upload area
-                 images_input = gr.File(
-                     file_count="multiple",
-                     file_types=["image"],
-                     label="Upload Images",
-                     height=300
-                 )
-
-                 # Prompt input
-                 prompt_input = gr.Textbox(
-                     label="Prompt",
-                     placeholder="Describe this image in detail...",
-                     value="Describe this image in detail.",
-                     lines=3
-                 )
-
-                 # API key input (required)
-                 api_key_input = gr.Textbox(
-                     label="Hugging Face API Key",
-                     placeholder="hf_...",
-                     type="password",
-                     info="Required: Get your API key from https://huggingface.co/settings/tokens"
-                 )
-
-                 # Process button
-                 process_btn = gr.Button(
-                     "🚀 Process Images",
-                     variant="primary",
-                     size="lg"
-                 )
 
-             with gr.Column(scale=2):
-                 # Results area
-                 results_output = gr.Textbox(
-                     label="Results",
-                     lines=15,
-                     max_lines=25,
-                     interactive=False
-                 )
-
-         # Examples
-         with gr.Accordion("Example Prompts", open=False):
-             gr.Examples(
-                 examples=[
-                     [
-                         "Describe the architectural style and features of this building.",
-                         "Upload images of buildings to analyze their architectural style."
-                     ],
-                     [
-                         "What are the key features and amenities shown in this property?",
-                         "Upload property images to get detailed descriptions of features and amenities."
-                     ],
-                     [
-                         "Describe the interior design and layout of this space.",
-                         "Upload interior photos to get detailed descriptions of design and layout."
-                     ],
-                     [
-                         "What type of property is this and what are its main characteristics?",
-                         "Upload property images to identify type and characteristics."
-                     ],
-                     [
-                         "Describe the condition and quality of this property.",
-                         "Upload property images to assess condition and quality."
-                     ]
-                 ],
-                 inputs=[prompt_input],
-                 outputs=[results_output],
-                 label="Example Prompts"
              )
 
-         # Footer
-         gr.Markdown("---")
-         gr.Markdown("""
-         **How to use:**
-         1. Get your Hugging Face API key from https://huggingface.co/settings/tokens
-         2. Upload one or more images
-         3. Enter a prompt describing what you want to know about the images
-         4. Paste your API key
-         5. Click "Process Images" to get AI-generated descriptions
-
-         **Tips:**
-         - Use specific prompts for better results
-         - The model works best with clear, high-quality images
-         - You can process multiple images at once
-         - Each image is processed individually with the same prompt
-         """)
-
-         # Connect the process button
-         process_btn.click(
-             fn=process_images_with_api,
-             inputs=[images_input, prompt_input, api_key_input],
-             outputs=[results_output]
          )
 
-     return demo
-
- # Create and launch the interface
- demo = create_gradio_interface()
 
  if __name__ == "__main__":
      demo.launch()
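The removed version above called Hugging Face's hosted Inference API over raw HTTP; the added version below loads the model locally with transformers instead. Note that the removed code POSTed an OpenAI-style messages array (wrapped in an "inputs" key) to api-inference.huggingface.co and then parsed a "choices" field from the response; "choices" belongs to the chat-completion response schema, not the classic Inference API one, so that parse would most likely never succeed, which may be why the code was replaced. For reference, a minimal sketch of the same remote call through huggingface_hub's InferenceClient, which does speak the chat-completion schema; this assumes a provider actually serving Qwen/Qwen2-VL-7B-Instruct, and the token and image URL are placeholders:

# Sketch only: the removed remote call, redone with huggingface_hub's
# InferenceClient, which returns the `choices` structure the removed
# code tried to parse. Token and image URL below are placeholders.
from huggingface_hub import InferenceClient

client = InferenceClient(token="hf_...")  # placeholder token

response = client.chat_completion(
    model="Qwen/Qwen2-VL-7B-Instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image in detail."},
            {"type": "image_url", "image_url": {"url": "https://example.com/photo.jpg"}},
        ],
    }],
    max_tokens=512,
)
print(response.choices[0].message.content)

Either way, the trade-off is the usual one: the remote call needs no GPU in the Space, while the local path below avoids per-request network latency and API-schema drift.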
 
  import gradio as gr
+ from transformers import AutoProcessor, AutoModelForCausalLM
+ import torch
  from PIL import Image
+ import io
+
+ # Load the model and processor
+ def load_model():
+     """Load the Qwen2-VL model"""
+     model_id = "Qwen/Qwen2-VL-7B-Instruct"
+     processor = AutoProcessor.from_pretrained(model_id)
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         torch_dtype=torch.float16,
+         device_map="auto"
+     )
+     return model, processor
 
+ # Initialize model and processor
+ model, processor = load_model()
 
+ def process_single_image(image, prompt):
+     """Process a single image with the model"""
+     if image is None:
+         return "Please upload an image."
 
+     try:
+         # Convert Gradio image to PIL Image
+         if hasattr(image, 'name'):  # Gradio file object
+             pil_image = Image.open(image.name)
+         else:  # Numpy array
+             pil_image = Image.fromarray(image)
+
+         # Prepare the prompt
+         text = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
+
+         # Process the image and text
+         inputs = processor(
+             text=text,
+             images=pil_image,
+             return_tensors="pt"
+         )
+
+         # Generate response
+         with torch.no_grad():
+             generated_ids = model.generate(
+                 **inputs,
+                 max_new_tokens=512,
+                 do_sample=True,
+                 temperature=0.7,
+                 top_p=0.9
+             )
+
+         # Decode the response
+         generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+         # Extract only the assistant's response
+         response = generated_text.split("<|im_start|>assistant\n")[-1].split("<|im_end|>")[0].strip()
+
+         return response
 
+     except Exception as e:
+         return f"Error processing image: {str(e)}"
+
+ def process_multiple_images(images, prompt):
+     """Process multiple images with the same prompt"""
      if not images:
          return "Please upload at least one image."
 
      results = []
      for i, image in enumerate(images):
+         if image is not None:
+             result = process_single_image(image, prompt)
+             results.append(f"Image {i+1}: {result}")
+         else:
+             results.append(f"Image {i+1}: No image provided")
 
      return "\n\n".join(results)
 
+ # Create the Gradio interface
+ with gr.Blocks(
+     title="Multi-Image AI Processor",
+     theme=gr.themes.Soft(),
+     fill_height=True
+ ) as demo:
 
+     gr.Markdown("# 🖼️ Multi-Image AI Processor")
+     gr.Markdown("Upload multiple images and get AI-generated descriptions using the Qwen2-VL model.")
+
+     with gr.Row():
+         with gr.Column(scale=2):
+             # Image upload area
+             images_input = gr.File(
+                 file_count="multiple",
+                 file_types=["image"],
+                 label="Upload Images",
+                 height=300
+             )
 
+             # Prompt input
+             prompt_input = gr.Textbox(
+                 label="Prompt",
+                 placeholder="Describe this image in detail...",
+                 value="Describe this image in detail.",
+                 lines=3
+             )
+
+             # Process button
+             process_btn = gr.Button(
+                 "🚀 Process Images",
+                 variant="primary",
+                 size="lg"
              )
 
+         with gr.Column(scale=2):
+             # Results area
+             results_output = gr.Textbox(
+                 label="Results",
+                 lines=15,
+                 max_lines=25,
+                 interactive=False
+             )
+
+     # Examples
+     with gr.Accordion("Example Prompts", open=False):
+         gr.Examples(
+             examples=[
+                 [
+                     "Describe the architectural style and features of this building.",
+                     "Upload images of buildings to analyze their architectural style."
+                 ],
+                 [
+                     "What are the key features and amenities shown in this property?",
+                     "Upload property images to get detailed descriptions of features and amenities."
+                 ],
+                 [
+                     "Describe the interior design and layout of this space.",
+                     "Upload interior photos to get detailed descriptions of design and layout."
+                 ],
+                 [
+                     "What type of property is this and what are its main characteristics?",
+                     "Upload property images to identify type and characteristics."
+                 ],
+                 [
+                     "Describe the condition and quality of this property.",
+                     "Upload property images to assess condition and quality."
+                 ]
+             ],
+             inputs=[prompt_input],
+             outputs=[results_output],
+             label="Example Prompts"
          )
 
+     # Footer
+     gr.Markdown("---")
+     gr.Markdown("""
+     **How to use:**
+     1. Upload one or more images
+     2. Enter a prompt describing what you want to know about the images
+     3. Click "Process Images" to get AI-generated descriptions
+
+     **Tips:**
+     - Use specific prompts for better results
+     - The model works best with clear, high-quality images
+     - You can process multiple images at once
+     - Each image is processed individually with the same prompt
+     """)
+
+     # Connect the process button
+     process_btn.click(
+         fn=process_multiple_images,
+         inputs=[images_input, prompt_input],
+         outputs=[results_output]
+     )
 
+ # Launch the app
  if __name__ == "__main__":
      demo.launch()
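A caveat on the added version: it loads Qwen/Qwen2-VL-7B-Instruct through AutoModelForCausalLM and hand-writes the ChatML prompt, but transformers exposes this model through the dedicated Qwen2VLForConditionalGeneration class, and the hand-written string leaves out the vision placeholder tokens that Qwen2-VL's processor expects, so it has nowhere to splice in the image features. The later split on "<|im_start|>assistant\n" also seems unable to match, since batch_decode was called with skip_special_tokens=True, which strips those markers. A minimal sketch of the local path as it would more plausibly work, assuming transformers 4.45+ (describe is an illustrative helper, not part of the commit):

# Sketch only: local Qwen2-VL inference via the dedicated model class
# and the processor's chat template. Assumes transformers 4.45+.
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
)

def describe(pil_image: Image.Image, prompt: str) -> str:
    # The chat template inserts the vision placeholder tokens that the
    # hand-written ChatML string in the commit leaves out.
    messages = [{
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": prompt}],
    }]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(text=[text], images=[pil_image], return_tensors="pt").to(model.device)
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=512)
    # Drop the prompt tokens so only the new completion is decoded.
    trimmed = generated_ids[:, inputs["input_ids"].shape[1]:]
    return processor.batch_decode(trimmed, skip_special_tokens=True)[0].strip()

Slicing off the first inputs["input_ids"].shape[1] tokens before decoding isolates the completion directly, so no string splitting on chat markers is needed at all.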