Spaces: Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -142,33 +142,6 @@ model_x = Qwen2VLForConditionalGeneration.from_pretrained(
|
|
| 142 |
torch_dtype=torch.float16
|
| 143 |
).to(device).eval()
|
| 144 |
|
| 145 |
-
# Load Aya-Vision-8b
|
| 146 |
-
MODEL_ID_A = "CohereForAI/aya-vision-8b"
|
| 147 |
-
processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
|
| 148 |
-
model_a = AutoModelForImageTextToText.from_pretrained(
|
| 149 |
-
MODEL_ID_A,
|
| 150 |
-
trust_remote_code=True,
|
| 151 |
-
torch_dtype=torch.float16
|
| 152 |
-
).to(device).eval()
|
| 153 |
-
|
| 154 |
-
# Load olmOCR-7B-0725
|
| 155 |
-
MODEL_ID_W = "allenai/olmOCR-7B-0725"
|
| 156 |
-
processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
|
| 157 |
-
model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 158 |
-
MODEL_ID_W,
|
| 159 |
-
trust_remote_code=True,
|
| 160 |
-
torch_dtype=torch.float16
|
| 161 |
-
).to(device).eval()
|
| 162 |
-
|
| 163 |
-
# Load RolmOCR
|
| 164 |
-
MODEL_ID_M = "reducto/RolmOCR"
|
| 165 |
-
processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
|
| 166 |
-
model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 167 |
-
MODEL_ID_M,
|
| 168 |
-
trust_remote_code=True,
|
| 169 |
-
torch_dtype=torch.float16
|
| 170 |
-
).to(device).eval()
|
| 171 |
-
|
| 172 |
|
| 173 |
@spaces.GPU
|
| 174 |
def generate_image(model_name: str, text: str, image: Image.Image,
|
|
@@ -178,21 +151,12 @@ def generate_image(model_name: str, text: str, image: Image.Image,
|
|
| 178 |
Generates responses using the selected model for image input.
|
| 179 |
Yields raw text and Markdown-formatted text.
|
| 180 |
"""
|
| 181 |
-
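if model_name == "RolmOCR-7B":  # [string truncated in extraction; label reconstructed from the processor_m/model_m (reducto/RolmOCR) branch — confirm exact text]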
|
| 182 |
-
processor = processor_m
|
| 183 |
-
model = model_m
|
| 184 |
-
elif model_name == "Qwen2-VL-OCR-2B":
|
| 185 |
processor = processor_x
|
| 186 |
model = model_x
|
| 187 |
elif model_name == "Nanonets-OCR2-3B":
|
| 188 |
processor = processor_v
|
| 189 |
model = model_v
|
| 190 |
-
elif model_name == "Aya-Vision-8B":
|
| 191 |
-
processor = processor_a
|
| 192 |
-
model = model_a
|
| 193 |
-
elif model_name == "olmOCR-7B-0725":
|
| 194 |
-
processor = processor_w
|
| 195 |
-
model = model_w
|
| 196 |
else:
|
| 197 |
yield "Invalid model selected.", "Invalid model selected."
|
| 198 |
return
|
|
@@ -237,17 +201,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
|
|
| 237 |
yield buffer, buffer
|
| 238 |
|
| 239 |
|
| 240 |
-
# Define examples for image inference
|
| 241 |
-
image_examples = [
|
| 242 |
-
["Extract the full page.", "images/ocr.png"],
|
| 243 |
-
["Extract the content.", "images/4.png"],
|
| 244 |
-
["Convert this page to doc [table] precisely for markdown.", "images/0.png"]
|
| 245 |
-
]
|
| 246 |
-
|
| 247 |
-
|
| 248 |
# Create the Gradio Interface
|
| 249 |
with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
|
| 250 |
-
gr.Markdown("# **
|
| 251 |
with gr.Row():
|
| 252 |
with gr.Column(scale=2):
|
| 253 |
image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
|
|
@@ -273,8 +229,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
|
|
| 273 |
markdown_output = gr.Markdown(label="(Result.Md)")
|
| 274 |
|
| 275 |
model_choice = gr.Radio(
|
| 276 |
-
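choices=["Nanonets-OCR2-3B", "olmOCR-7B-0725", "RolmOCR-7B",  # [tail truncated in extraction; entries and order reconstructed from the removed model branches — confirm]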
|
| 277 |
-
"Aya-Vision-8B", "Qwen2-VL-OCR-2B"],
|
| 278 |
label="Select Model",
|
| 279 |
value="Nanonets-OCR2-3B"
|
| 280 |
)
|
|
|
|
| 142 |
torch_dtype=torch.float16
|
| 143 |
).to(device).eval()
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
@spaces.GPU
|
| 147 |
def generate_image(model_name: str, text: str, image: Image.Image,
|
|
|
|
| 151 |
Generates responses using the selected model for image input.
|
| 152 |
Yields raw text and Markdown-formatted text.
|
| 153 |
"""
|
| 154 |
+
if model_name == "Qwen2-VL-OCR-2B":
|
|
|
|
|
|
|
|
|
|
| 155 |
processor = processor_x
|
| 156 |
model = model_x
|
| 157 |
elif model_name == "Nanonets-OCR2-3B":
|
| 158 |
processor = processor_v
|
| 159 |
model = model_v
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
else:
|
| 161 |
yield "Invalid model selected.", "Invalid model selected."
|
| 162 |
return
|
|
|
|
| 201 |
yield buffer, buffer
|
| 202 |
|
| 203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
# Create the Gradio Interface
|
| 205 |
with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
|
| 206 |
+
gr.Markdown("# **Angel's Eye - Copilot**", elem_id="main-title")
|
| 207 |
with gr.Row():
|
| 208 |
with gr.Column(scale=2):
|
| 209 |
image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
|
|
|
|
| 229 |
markdown_output = gr.Markdown(label="(Result.Md)")
|
| 230 |
|
| 231 |
model_choice = gr.Radio(
|
| 232 |
+
choices=["Nanonets-OCR2-3B", "Qwen2-VL-OCR-2B"],
|
|
|
|
| 233 |
label="Select Model",
|
| 234 |
value="Nanonets-OCR2-3B"
|
| 235 |
)
|