vkuropiatnyk committed on
Commit 82d5545 · verified · 1 Parent(s): 493c102

Upload folder using huggingface_hub

Files changed (4):
  1. README.md +264 -0
  2. config.json +145 -0
  3. model.onnx +3 -0
  4. model.safetensors +3 -0
README.md ADDED
@@ -0,0 +1,264 @@
---
license: mit
base_model:
- google/efficientnet-b0
datasets:
- docling-project/HF-CC-v0-00001-00010-images-filtered-new-class
tags:
- image-classification
- document-analysis
- figure-classification
---

# EfficientNet-B0 Document Figure Classifier v2.5

This is an image classification model based on **Google EfficientNet-B0**, fine-tuned on a [filtered subset of the HuggingFace finepdfs dataset](https://huggingface.co/datasets/docling-project/HF-CC-v0-00001-00010-images-filtered-new-class) to classify document figures into one of the following 26 categories:

1. **logo**
2. **photograph**
3. **icon**
4. **engineering_drawing**
5. **line_chart**
6. **bar_chart**
7. **other**
8. **table**
9. **flow_chart**
10. **screenshot_from_computer**
11. **signature**
12. **screenshot_from_manual**
13. **geographical_map**
14. **pie_chart**
15. **page_thumbnail**
16. **stamp**
17. **music**
18. **calendar**
19. **qr_code**
20. **bar_code**
21. **full_page_image**
22. **scatter_plot**
23. **chemistry_structure**
24. **topographical_map**
25. **crossword_puzzle**
26. **box_plot**

## Model Performance

The model was evaluated on a held-out test set from the finepdfs dataset with the following metrics:

| Metric | Score |
|--------|-------|
| **Accuracy** | 0.90703 |
| **Balanced Accuracy** | 0.68836 |
| **Macro F1** | 0.68942 |
| **Weighted F1** | 0.90716 |
| **Cohen's Kappa** | 0.87449 |

### Per-Label Performance

| Label | Precision | Recall |
|-------|-----------|--------|
| **logo** | 0.92807 | 0.91816 |
| **photograph** | 0.90966 | 0.96029 |
| **icon** | 0.83605 | 0.82678 |
| **engineering_drawing** | 0.71689 | 0.81172 |
| **line_chart** | 0.73055 | 0.92117 |
| **bar_chart** | 0.88599 | 0.92720 |
| **other** | 0.41893 | 0.38213 |
| **table** | 0.98636 | 0.96765 |
| **flow_chart** | 0.75926 | 0.82425 |
| **screenshot_from_computer** | 0.85952 | 0.71980 |
| **signature** | 0.89020 | 0.85971 |
| **screenshot_from_manual** | 0.48559 | 0.34543 |
| **geographical_map** | 0.86780 | 0.85219 |
| **pie_chart** | 0.96880 | 0.94220 |
| **page_thumbnail** | 0.52008 | 0.35188 |
| **stamp** | 0.71269 | 0.41794 |
| **music** | 0.48037 | 0.57778 |
| **calendar** | 0.52880 | 0.28775 |
| **qr_code** | 0.95694 | 0.93240 |
| **bar_code** | 0.34244 | 0.84305 |
| **full_page_image** | 0.40323 | 0.65789 |
| **scatter_plot** | 0.66848 | 0.67213 |
| **chemistry_structure** | 0.72781 | 0.65426 |
| **topographical_map** | 0.83333 | 0.38462 |
| **crossword_puzzle** | 0.57143 | 0.21622 |
| **box_plot** | 0.85714 | 0.64286 |

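The gap between accuracy (0.907) and balanced accuracy (0.688) reflects the class imbalance: frequent classes like `photograph` dominate plain accuracy, while rare classes like `crossword_puzzle` pull the macro averages down. As a self-contained illustration of how these metrics are derived from per-label precision and recall, the sketch below computes them from scratch on a tiny invented label set (the data is made up, not from the model's test set):

```python
from collections import Counter

def classification_metrics(y_true, y_pred):
    """Per-label precision/recall/F1, plus accuracy, balanced accuracy, macro F1."""
    labels = sorted(set(y_true) | set(y_pred))
    tp, fp, fn = Counter(), Counter(), Counter()
    for t, p in zip(y_true, y_pred):
        if t == p:
            tp[t] += 1
        else:
            fp[p] += 1  # predicted p, but true label was t
            fn[t] += 1  # true label t was missed
    stats = {}
    for lab in labels:
        prec = tp[lab] / (tp[lab] + fp[lab]) if tp[lab] + fp[lab] else 0.0
        rec = tp[lab] / (tp[lab] + fn[lab]) if tp[lab] + fn[lab] else 0.0
        f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
        stats[lab] = (prec, rec, f1)
    accuracy = sum(tp.values()) / len(y_true)
    balanced_accuracy = sum(s[1] for s in stats.values()) / len(labels)  # mean recall
    macro_f1 = sum(s[2] for s in stats.values()) / len(labels)
    return stats, accuracy, balanced_accuracy, macro_f1

# Invented, imbalanced two-class data: one missed "other" costs little
# accuracy but drags balanced accuracy well below it.
y_true = ["logo", "logo", "logo", "logo", "other", "other"]
y_pred = ["logo", "logo", "logo", "logo", "logo", "other"]
stats, acc, bal_acc, macro_f1 = classification_metrics(y_true, y_pred)
print(acc, bal_acc, macro_f1)  # 0.833..., 0.75, 0.777...
```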
## How to use - Transformers

Example of how to classify an image into one of the 26 classes using transformers:

```python
import torch
import torchvision.transforms as transforms

from transformers import EfficientNetForImageClassification
from PIL import Image
import requests


urls = [
    'http://images.cocodataset.org/val2017/000000039769.jpg',
    'http://images.cocodataset.org/test-stuff2017/000000001750.jpg',
    'http://images.cocodataset.org/test-stuff2017/000000000001.jpg'
]

image_processor = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.47853944, 0.4732864, 0.47434163],
        ),
    ]
)

images = []
for url in urls:
    image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
    image = image_processor(image)
    images.append(image)


model_id = 'docling-project/DocumentFigureClassifier-v2.5'

model = EfficientNetForImageClassification.from_pretrained(model_id)

labels = model.config.id2label

device = torch.device("cpu")

torch_images = torch.stack(images).to(device)

with torch.no_grad():
    logits = model(torch_images).logits  # (batch_size, num_classes)
    probs_batch = logits.softmax(dim=1)  # (batch_size, num_classes)
    probs_batch = probs_batch.cpu().numpy().tolist()

for idx, probs_image in enumerate(probs_batch):
    preds = [(labels[i], prob) for i, prob in enumerate(probs_image)]
    preds.sort(key=lambda t: t[1], reverse=True)
    print(f"{idx}: {preds}")
```

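The `transforms.Compose` pipeline above defines the model's preprocessing contract: a 224×224 RGB image, scaled to [0, 1], then normalized per channel with the mean/std shown. As a sketch of what `ToTensor` + `Normalize` do, the same steps can be mirrored in plain NumPy (this assumes the image is already resized to 224×224; the dummy array below is invented for illustration):

```python
import numpy as np

MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
STD = np.array([0.47853944, 0.4732864, 0.47434163], dtype=np.float32)

def preprocess(rgb_uint8: np.ndarray) -> np.ndarray:
    """Mimic ToTensor + Normalize: HWC uint8 -> normalized CHW float32."""
    x = rgb_uint8.astype(np.float32) / 255.0  # ToTensor: scale to [0, 1]
    x = (x - MEAN) / STD                      # Normalize, broadcast over channels
    return x.transpose(2, 0, 1)               # HWC -> CHW, as the model expects

# A dummy mid-gray 224x224 image stands in for a real document figure.
dummy = np.full((224, 224, 3), 128, dtype=np.uint8)
out = preprocess(dummy)
print(out.shape)  # (3, 224, 224)
```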
## How to use - ONNX

Example of how to classify an image into one of the 26 classes using ONNX Runtime:

```python
import onnxruntime

import numpy as np
import torchvision.transforms as transforms

from PIL import Image
import requests

LABELS = [
    "logo",
    "photograph",
    "icon",
    "engineering_drawing",
    "line_chart",
    "bar_chart",
    "other",
    "table",
    "flow_chart",
    "screenshot_from_computer",
    "signature",
    "screenshot_from_manual",
    "geographical_map",
    "pie_chart",
    "page_thumbnail",
    "stamp",
    "music",
    "calendar",
    "qr_code",
    "bar_code",
    "full_page_image",
    "scatter_plot",
    "chemistry_structure",
    "topographical_map",
    "crossword_puzzle",
    "box_plot"
]


urls = [
    'http://images.cocodataset.org/val2017/000000039769.jpg',
    'http://images.cocodataset.org/test-stuff2017/000000001750.jpg',
    'http://images.cocodataset.org/test-stuff2017/000000000001.jpg'
]

images = []
for url in urls:
    image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
    images.append(image)


image_processor = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.47853944, 0.4732864, 0.47434163],
        ),
    ]
)


processed_images_onnx = [image_processor(image).unsqueeze(0) for image in images]

# ONNX Runtime needs NumPy arrays as input
onnx_inputs = [item.numpy(force=True) for item in processed_images_onnx]

# pack into a batch
onnx_inputs = np.concatenate(onnx_inputs, axis=0)

ort_session = onnxruntime.InferenceSession(
    "./DocumentFigureClassifier-v2_5-onnx/model.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)


for item in ort_session.run(None, {'input': onnx_inputs}):
    for x in iter(item):
        pred = x.argmax()
        print(LABELS[pred])
```

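The ONNX example above prints only the argmax label. If per-class scores are wanted, as in the Transformers example, the raw logits can be turned into probabilities with a numerically stable softmax; a small self-contained sketch (the logit values below are invented for illustration):

```python
import numpy as np

def softmax(logits: np.ndarray) -> np.ndarray:
    """Numerically stable softmax over the last axis."""
    shifted = logits - logits.max(axis=-1, keepdims=True)  # avoid exp overflow
    exps = np.exp(shifted)
    return exps / exps.sum(axis=-1, keepdims=True)

# Invented logits for a batch of 2 images over 4 of the 26 classes.
logits = np.array([[2.0, 1.0, 0.1, -1.0],
                   [0.0, 3.0, 0.0, 0.0]])
probs = softmax(logits)
print(probs.argmax(axis=1))  # [0 1]  -- same winners as argmax on the logits
print(probs.sum(axis=1))     # each row sums to 1
```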
## Training Data

This model was trained on a [filtered subset of the HuggingFace finepdfs dataset](https://huggingface.co/datasets/docling-project/HF-CC-v0-00001-00010-images-filtered-new-class), a large-scale dataset for document understanding tasks.

## Citation

If you use this model in your work, please cite the following papers:

```bibtex
@article{Tan2019EfficientNetRM,
  title   = {EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks},
  author  = {Mingxing Tan and Quoc V. Le},
  journal = {ArXiv},
  year    = {2019},
  volume  = {abs/1905.11946}
}

@techreport{Docling,
  author  = {Deep Search Team},
  month   = {8},
  title   = {{Docling Technical Report}},
  url     = {https://arxiv.org/abs/2408.09869},
  eprint  = {2408.09869},
  doi     = {10.48550/arXiv.2408.09869},
  version = {1.0.0},
  year    = {2024}
}
```
config.json ADDED
@@ -0,0 +1,145 @@
{
  "architectures": [
    "EfficientNetForImageClassification"
  ],
  "batch_norm_eps": 0.001,
  "batch_norm_momentum": 0.99,
  "depth_coefficient": 1.0,
  "depth_divisor": 8,
  "depthwise_padding": [],
  "drop_connect_rate": 0.2,
  "dropout_rate": 0.2,
  "dtype": "float32",
  "expand_ratios": [
    1,
    6,
    6,
    6,
    6,
    6,
    6
  ],
  "hidden_act": "swish",
  "hidden_dim": 1280,
  "id2label": {
    "0": "logo",
    "1": "photograph",
    "10": "signature",
    "11": "screenshot_from_manual",
    "12": "geographical_map",
    "13": "pie_chart",
    "14": "page_thumbnail",
    "15": "stamp",
    "16": "music",
    "17": "calendar",
    "18": "qr_code",
    "19": "bar_code",
    "2": "icon",
    "20": "full_page_image",
    "21": "scatter_plot",
    "22": "chemistry_structure",
    "23": "topographical_map",
    "24": "crossword_puzzle",
    "25": "box_plot",
    "3": "engineering_drawing",
    "4": "line_chart",
    "5": "bar_chart",
    "6": "other",
    "7": "table",
    "8": "flow_chart",
    "9": "screenshot_from_computer"
  },
  "image_size": 224,
  "in_channels": [
    32,
    16,
    24,
    40,
    80,
    112,
    192
  ],
  "initializer_range": 0.02,
  "kernel_sizes": [
    3,
    3,
    5,
    3,
    5,
    5,
    3
  ],
  "label2id": {
    "bar_chart": "5",
    "bar_code": "19",
    "box_plot": "25",
    "calendar": "17",
    "chemistry_structure": "22",
    "crossword_puzzle": "24",
    "engineering_drawing": "3",
    "flow_chart": "8",
    "full_page_image": "20",
    "geographical_map": "12",
    "icon": "2",
    "line_chart": "4",
    "logo": "0",
    "music": "16",
    "other": "6",
    "page_thumbnail": "14",
    "photograph": "1",
    "pie_chart": "13",
    "qr_code": "18",
    "scatter_plot": "21",
    "screenshot_from_computer": "9",
    "screenshot_from_manual": "11",
    "signature": "10",
    "stamp": "15",
    "table": "7",
    "topographical_map": "23"
  },
  "model_type": "efficientnet",
  "num_block_repeats": [
    1,
    2,
    2,
    3,
    3,
    4,
    1
  ],
  "num_channels": 3,
  "num_hidden_layers": 64,
  "out_channels": [
    16,
    24,
    40,
    80,
    112,
    192,
    320
  ],
  "out_features": null,
  "pooling_type": "mean",
  "squeeze_expansion_ratio": 0.25,
  "stage_names": [
    "stem",
    "stage1",
    "stage2",
    "stage3",
    "stage4",
    "stage5",
    "stage6",
    "stage7"
  ],
  "strides": [
    1,
    2,
    2,
    2,
    1,
    2,
    1
  ],
  "transformers_version": "4.57.3",
  "width_coefficient": 1.0
}
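Note that the `id2label` keys in the config are JSON strings serialized in lexicographic order ("0", "1", "10", "11", …), so the file cannot be read in document order to recover class indices. A quick sketch for sanity-checking a loaded config: verify that `id2label` and `label2id` are mutual inverses and recover the numeric class order (the three-class excerpt below is illustrative; in practice the dicts come from `json.load(open("config.json"))`):

```python
import json

# Illustrative excerpt of the config above, inlined for self-containment.
config = json.loads("""
{
  "id2label": {"0": "logo", "1": "photograph", "2": "icon"},
  "label2id": {"icon": "2", "logo": "0", "photograph": "1"}
}
""")

id2label = config["id2label"]
label2id = config["label2id"]

# Every id maps to a label that maps back to the same id, and vice versa.
assert all(label2id[lab] == idx for idx, lab in id2label.items())
assert all(id2label[idx] == lab for lab, idx in label2id.items())

# Sort numerically, not lexicographically, to recover class order 0..N-1.
ordered = [id2label[str(i)] for i in range(len(id2label))]
print(ordered)  # ['logo', 'photograph', 'icon']
```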
model.onnx ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:27ffc48c27ae4e12c99b6f6de0dd730005245e47b70dd0c1339e62cbac3ec4c0
size 16940439
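The weight files are stored as Git LFS pointer files: plain text with one `key value` pair per line, as shown above, where `oid` carries the hash algorithm and content hash and `size` is the real file's byte count. A minimal parser sketch (the pointer text is inlined from the file above):

```python
def parse_lfs_pointer(text: str) -> dict:
    """Parse a Git LFS pointer file into a key -> value dict."""
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")  # split on the first space only
        fields[key] = value
    return fields

pointer = """\
version https://git-lfs.github.com/spec/v1
oid sha256:27ffc48c27ae4e12c99b6f6de0dd730005245e47b70dd0c1339e62cbac3ec4c0
size 16940439
"""
fields = parse_lfs_pointer(pointer)
print(fields["size"])                  # 16940439 (bytes of the real file)
print(fields["oid"].split(":", 1)[0])  # sha256 (hash algorithm)
```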
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6bf1e44d6bce316dcade6eb9929d8f8d23b6e8d9d29062b3b4011cff87c7c3cd
size 16378200