Spaces:

davanstrien
/

vllm-index-card-extractor

Running on Zero

App Files Files Community

davanstrien HF Staff committed on Oct 6

Commit

caebeb2

1 Parent(s): bbe7feb

Refactor app.py for improved readability and consistency; streamline model loading and update extraction prompt.

Browse files

Files changed (1) hide show

app.py +16 -30

app.py CHANGED Viewed

@@ -12,9 +12,7 @@ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 # Load model and processor
 print("Loading Qwen3-VL-30B-A3B-Instruct model...")
 model = AutoModelForImageTextToText.from_pretrained(
-    "Qwen/Qwen3-VL-30B-A3B-Instruct",
-    torch_dtype=torch.bfloat16,
-    device_map="auto"
 )
 processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-30B-A3B-Instruct")
 print("Model loaded successfully!")
@@ -25,10 +23,10 @@ EXTRACTION_PROMPT = """Extract all metadata from this library catalog card and r
 - date: Any dates mentioned (publication, creation, or coverage dates)
 - call_number: Library classification or call number
 - physical_description: Details about the physical item (size, extent, format)
-- subjects: Subject headings or topics
 - notes: Any additional notes or information
-Return ONLY the JSON object, nothing else. If a field is not present on the card, use null for that field."""
 @spaces.GPU
 def extract_metadata(image):
@@ -47,8 +45,8 @@ def extract_metadata(image):
                 "role": "user",
                 "content": [
                     {"type": "image", "image": image},
-                    {"type": "text", "text": EXTRACTION_PROMPT}
-                ]
             }
         ]
@@ -63,29 +61,27 @@ def extract_metadata(image):
             images=image_inputs,
             videos=video_inputs,
             padding=True,
-            return_tensors="pt"
         )
         inputs = inputs.to(model.device)
         # Generate
         with torch.inference_mode():
             generated_ids = model.generate(
-                **inputs,
-                max_new_tokens=512,
-                temperature=0.1,
-                do_sample=False
             )
         # Trim input tokens from output
         generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
         # Decode output
         output_text = processor.batch_decode(
             generated_ids_trimmed,
             skip_special_tokens=True,
-            clean_up_tokenization_spaces=False
         )[0]
         # Try to parse as JSON for pretty formatting
@@ -99,11 +95,12 @@ def extract_metadata(image):
     except Exception as e:
         return f"Error during extraction: {str(e)}"
 # Create Gradio interface
 with gr.Blocks(title="Library Card Metadata Extractor") as demo:
     gr.Markdown("# 📇 Library Card Metadata Extractor")
     gr.Markdown(
-        "Extract structured metadata from library catalog cards using **Qwen3-VL-30B**. "
         "Upload an image of a catalog card and get JSON-formatted metadata including title, author, dates, "
         "call numbers, and more.\n\n"
         "This demo works with catalog cards from libraries and archives, such as the "
@@ -116,25 +113,14 @@ with gr.Blocks(title="Library Card Metadata Extractor") as demo:
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 📤 Upload Catalog Card")
-            image_input = gr.Image(
-                label="Library Catalog Card",
-                type="pil"
-            )
             submit_btn = gr.Button("🔍 Extract Metadata", variant="primary", size="lg")
         with gr.Column(scale=1):
             gr.Markdown("### 📋 Extracted Metadata (JSON)")
-            output = gr.Code(
-                label="Metadata",
-                language="json",
-                lines=15
-            )
-    submit_btn.click(
-        fn=extract_metadata,
-        inputs=image_input,
-        outputs=output
-    )
     gr.Markdown("---")
@@ -152,7 +138,7 @@ with gr.Blocks(title="Library Card Metadata Extractor") as demo:
         inputs=image_input,
         outputs=output,
         fn=extract_metadata,
-        cache_examples=False
     )
     gr.Markdown("---")

 # Load model and processor
 print("Loading Qwen3-VL-30B-A3B-Instruct model...")
 model = AutoModelForImageTextToText.from_pretrained(
+    "Qwen/Qwen3-VL-30B-A3B-Instruct", torch_dtype=torch.bfloat16, device_map="auto"
 )
 processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-30B-A3B-Instruct")
 print("Model loaded successfully!")
 - date: Any dates mentioned (publication, creation, or coverage dates)
 - call_number: Library classification or call number
 - physical_description: Details about the physical item (size, extent, format)
 - notes: Any additional notes or information
+Return ONLY the JSON object, nothing else. If a field is not present on the card, use null for that field."""
 @spaces.GPU
 def extract_metadata(image):
                 "role": "user",
                 "content": [
                     {"type": "image", "image": image},
+                    {"type": "text", "text": EXTRACTION_PROMPT},
+                ],
             }
         ]
             images=image_inputs,
             videos=video_inputs,
             padding=True,
+            return_tensors="pt",
         )
         inputs = inputs.to(model.device)
         # Generate
         with torch.inference_mode():
             generated_ids = model.generate(
+                **inputs, max_new_tokens=512, temperature=0.1, do_sample=False
             )
         # Trim input tokens from output
         generated_ids_trimmed = [
+            out_ids[len(in_ids) :]
+            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
         # Decode output
         output_text = processor.batch_decode(
             generated_ids_trimmed,
             skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
         )[0]
         # Try to parse as JSON for pretty formatting
     except Exception as e:
         return f"Error during extraction: {str(e)}"
 # Create Gradio interface
 with gr.Blocks(title="Library Card Metadata Extractor") as demo:
     gr.Markdown("# 📇 Library Card Metadata Extractor")
     gr.Markdown(
+        "Extract structured metadata from library catalog cards using **Qwen/Qwen3-VL-30B-A3B-Instruct**. "
         "Upload an image of a catalog card and get JSON-formatted metadata including title, author, dates, "
         "call numbers, and more.\n\n"
         "This demo works with catalog cards from libraries and archives, such as the "
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 📤 Upload Catalog Card")
+            image_input = gr.Image(label="Library Catalog Card", type="pil")
             submit_btn = gr.Button("🔍 Extract Metadata", variant="primary", size="lg")
         with gr.Column(scale=1):
             gr.Markdown("### 📋 Extracted Metadata (JSON)")
+            output = gr.Code(label="Metadata", language="json", lines=15)
+    submit_btn.click(fn=extract_metadata, inputs=image_input, outputs=output)
     gr.Markdown("---")
         inputs=image_input,
         outputs=output,
         fn=extract_metadata,
+        cache_examples=False,
     )
     gr.Markdown("---")