Mirror rafmacalaba/gliner2-datause-large-v1-hybrid-entities → ai4data/datause-extraction (two-pass hybrid adapter)
Browse files- README.md +57 -146
- adapter_config.json +2 -5
- adapter_weights.safetensors +2 -2
README.md
CHANGED
|
@@ -1,175 +1,86 @@
|
|
| 1 |
---
|
| 2 |
-
language:
|
| 3 |
-
- en
|
| 4 |
-
license: apache-2.0
|
| 5 |
tags:
|
| 6 |
- gliner2
|
| 7 |
- ner
|
| 8 |
-
-
|
| 9 |
- lora
|
| 10 |
-
-
|
| 11 |
base_model: fastino/gliner2-large-v1
|
| 12 |
library_name: gliner2
|
| 13 |
-
pipeline_tag: token-classification
|
| 14 |
-
datasets:
|
| 15 |
-
- rafmacalaba/datause-v8
|
| 16 |
-
model-index:
|
| 17 |
-
- name: datause-extraction
|
| 18 |
-
results:
|
| 19 |
-
- task:
|
| 20 |
-
type: token-classification
|
| 21 |
-
name: Dataset Mention Extraction
|
| 22 |
-
metrics:
|
| 23 |
-
- type: f1
|
| 24 |
-
value: 84.8
|
| 25 |
-
name: F1 (max_tokens=512)
|
| 26 |
-
- type: precision
|
| 27 |
-
value: 90.0
|
| 28 |
-
name: Precision
|
| 29 |
-
- type: recall
|
| 30 |
-
value: 80.2
|
| 31 |
-
name: Recall
|
| 32 |
---
|
| 33 |
|
| 34 |
-
#
|
| 35 |
|
| 36 |
-
|
|
|
|
| 37 |
|
| 38 |
-
|
| 39 |
|
| 40 |
-
|
|
|
|
| 41 |
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
-
|
| 45 |
|
| 46 |
-
|
| 47 |
|
| 48 |
-
|
| 49 |
-
| Metric | Value |
|---|---|
|
| 50 |
-
| **F1** | **84.8%** |
|
| 51 |
-
| Precision | 90.0% |
|
| 52 |
-
| Recall | 80.2% |
|
| 53 |
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
-
|
| 57 |
-
|---|---|---|---|
|
| 58 |
-
| Named | 394 | 317 | 80.5% |
|
| 59 |
-
| Descriptive | 135 | 108 | 80.0% |
|
| 60 |
-
| Vague | 87 | 70 | 80.5% |
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
| `dataset_name` | string | Name or description of the dataset |
|
| 69 |
-
| `acronym` | string | Abbreviation (e.g., "DHS", "LSMS") |
|
| 70 |
-
| `author` | string | Individual author(s) |
|
| 71 |
-
| `producer` | string | Organization that created the dataset |
|
| 72 |
-
| `publication_year` | string | Year published |
|
| 73 |
-
| `reference_year` | string | Year data was collected |
|
| 74 |
-
| `reference_population` | string | Target population |
|
| 75 |
-
| `geography` | string | Geographic coverage |
|
| 76 |
-
| `description` | string | Content description |
|
| 77 |
-
| `data_type` | choice | survey, census, database, administrative, indicator, geospatial, microdata, report, other |
|
| 78 |
-
| `dataset_tag` | choice | named, descriptive, vague |
|
| 79 |
-
| `usage_context` | choice | primary, supporting, background |
|
| 80 |
-
| `is_used` | choice | True, False |
|
| 81 |
|
| 82 |
## Usage
|
| 83 |
|
| 84 |
-
### With `ai4data` library (recommended)
|
| 85 |
-
|
| 86 |
-
```bash
|
| 87 |
-
pip install git+https://github.com/rafmacalaba/monitoring_of_datause.git
|
| 88 |
-
```
|
| 89 |
-
|
| 90 |
-
```python
|
| 91 |
-
from ai4data import extract_from_text, extract_from_document
|
| 92 |
-
|
| 93 |
-
# Extract from text
|
| 94 |
-
text = """We use the Demographic and Health Survey (DHS) from 2020 as our
|
| 95 |
-
primary data source to analyze outcomes in Ghana. For robustness checks,
|
| 96 |
-
we also reference the Ghana Living Standards Survey (GLSS) from 2012."""
|
| 97 |
-
|
| 98 |
-
results = extract_from_text(text)
|
| 99 |
-
for ds in results["datasets"]:
|
| 100 |
-
print(f" {ds['dataset_name']} [{ds['dataset_tag']}]")
|
| 101 |
-
|
| 102 |
-
# Extract from PDF (URL or local file)
|
| 103 |
-
url = "https://documents1.worldbank.org/curated/en/.../report.pdf"
|
| 104 |
-
results = extract_from_document(url)
|
| 105 |
-
```
|
| 106 |
-
|
| 107 |
-
### With GLiNER2 directly
|
| 108 |
-
|
| 109 |
```python
|
| 110 |
from gliner2 import GLiNER2
|
| 111 |
-
from huggingface_hub import snapshot_download
|
| 112 |
-
|
| 113 |
-
# Load base model + adapter
|
| 114 |
-
model = GLiNER2.from_pretrained("fastino/gliner2-large-v1")
|
| 115 |
-
adapter_path = snapshot_download("ai4data/datause-extraction")
|
| 116 |
-
model.load_adapter(adapter_path)
|
| 117 |
-
|
| 118 |
-
# Define extraction schema
|
| 119 |
-
schema = (
|
| 120 |
-
model.create_schema()
|
| 121 |
-
.structure("dataset_mention")
|
| 122 |
-
.field("dataset_name", dtype="str")
|
| 123 |
-
.field("acronym", dtype="str")
|
| 124 |
-
.field("producer", dtype="str")
|
| 125 |
-
.field("geography", dtype="str")
|
| 126 |
-
.field("description", dtype="str")
|
| 127 |
-
.field("data_type", dtype="str",
|
| 128 |
-
choices=["survey", "census", "database", "administrative",
|
| 129 |
-
"indicator", "geospatial", "microdata", "report", "other"])
|
| 130 |
-
.field("dataset_tag", dtype="str",
|
| 131 |
-
choices=["named", "descriptive", "vague"])
|
| 132 |
-
.field("usage_context", dtype="str",
|
| 133 |
-
choices=["primary", "supporting", "background"])
|
| 134 |
-
.field("is_used", dtype="str", choices=["True", "False"])
|
| 135 |
-
)
|
| 136 |
-
|
| 137 |
-
results = model.extract(text, schema)
|
| 138 |
-
for mention in results["dataset_mention"]:
|
| 139 |
-
print(mention)
|
| 140 |
-
```
|
| 141 |
-
|
| 142 |
-
## Training Details
|
| 143 |
-
|
| 144 |
-
- **Base model**: [fastino/gliner2-large-v1](https://huggingface.co/fastino/gliner2-large-v1) (DeBERTa-v3-large encoder)
|
| 145 |
-
- **Method**: LoRA (r=16, alpha=32)
|
| 146 |
-
- **Training data**: ~3,400 synthetic examples (v8 dataset) generated with GPT-4o and Gemini 2.5 Flash
|
| 147 |
-
- **Max context**: 512 tokens (aligned with DeBERTa-v3 position embeddings)
|
| 148 |
-
- **Data format**: Context-aware passages with markdown formatting, footnotes, and structured annotations
|
| 149 |
|
| 150 |
-
#
|
|
|
|
| 151 |
|
| 152 |
-
|
| 153 |
-
-
|
| 154 |
-
- May not generalize well to non-research text (news articles, social media, etc.)
|
| 155 |
-
- Requires the `fastino/gliner2-large-v1` base model
|
| 156 |
|
| 157 |
-
#
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
}
|
|
|
|
|
|
|
|
|
|
| 169 |
```
|
| 170 |
-
|
| 171 |
-
## Links
|
| 172 |
-
|
| 173 |
-
- **Library**: [ai4data](https://github.com/rafmacalaba/monitoring_of_datause)
|
| 174 |
-
- **Base model**: [fastino/gliner2-large-v1](https://huggingface.co/fastino/gliner2-large-v1)
|
| 175 |
-
- **Program**: [AI for Data—Data for AI](https://www.worldbank.org/en/programs/ai4data) (World Bank & UNHCR)
|
|
|
|
| 1 |
---
|
|
|
|
|
|
|
|
|
|
| 2 |
tags:
|
| 3 |
- gliner2
|
| 4 |
- ner
|
| 5 |
+
- data-mention-extraction
|
| 6 |
- lora
|
| 7 |
+
- two-pass-hybrid
|
| 8 |
base_model: fastino/gliner2-large-v1
|
| 9 |
library_name: gliner2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# GLiNER2 Data Mention Extractor (v1-hybrid-entities)
|
| 13 |
|
| 14 |
+
Fine-tuned GLiNER2 LoRA adapter for extracting structured data mentions from
|
| 15 |
+
development economics and humanitarian research documents.
|
| 16 |
|
| 17 |
+
## Architecture: Two-Pass Hybrid
|
| 18 |
|
| 19 |
+
This adapter uses a **two-pass** inference strategy to bypass the count_pred/count_embed
|
| 20 |
+
mode collapse that limits native `extract_json` to 1 mention per chunk:
|
| 21 |
|
| 22 |
+
- **Pass 1** (`extract_entities`): Finds ALL data mention spans using 3 entity types
|
| 23 |
+
(`named_mention`, `descriptive_mention`, `vague_mention`). Bypasses count_pred entirely.
|
| 24 |
+
- **Pass 2** (`extract_json`): Classifies each span individually using sentence-level context.
|
| 25 |
+
count=1 is always correct since each call contains exactly 1 mention.
|
| 26 |
|
| 27 |
+
See `finetuning/ARCHITECTURE.md` for the full rationale.
|
| 28 |
|
| 29 |
+
## Task
|
| 30 |
|
| 31 |
+
Given a document passage, extracts structured information about each dataset mentioned:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
+
- **Entity types** (Pass 1 — span detection):
|
| 34 |
+
- `named_mention`: Proper names and acronyms (DHS, LSMS, FAOSTAT)
|
| 35 |
+
- `descriptive_mention`: Described data with identifying detail but no formal name
|
| 36 |
+
- `vague_mention`: Generic data references with minimal identifying detail
|
| 37 |
+
- **Classification fields** (Pass 2 — fixed choices):
|
| 38 |
+
- `typology_tag`: survey / census / database / administrative / indicator / geospatial / microdata / report / other
|
| 39 |
+
- `is_used`: True / False
|
| 40 |
+
- `usage_context`: primary / supporting / background
|
| 41 |
|
| 42 |
+
## Training
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
+
- **Base model**: `fastino/gliner2-large-v1`
|
| 45 |
+
- **Method**: LoRA (r=16, alpha=32.0)
|
| 46 |
+
- **Target modules**: ['encoder', 'span_rep']
|
| 47 |
+
- **Training examples**: 8087
|
| 48 |
+
- **Val examples**: 563
|
| 49 |
+
- **Best val loss**: not recorded during training
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
## Usage
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
```python
|
| 54 |
from gliner2 import GLiNER2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
+
# Install the patched library first
|
| 57 |
+
# pip install git+https://github.com/rafmacalaba/GLiNER2.git@feat/main-mirror
|
| 58 |
|
| 59 |
+
extractor = GLiNER2.from_pretrained("fastino/gliner2-large-v1")
|
| 60 |
+
extractor.load_adapter("rafmacalaba/gliner2-datause-large-v1-hybrid-entities")
|
|
|
|
|
|
|
| 61 |
|
| 62 |
+
# Pass 1: Extract all mention spans
|
| 63 |
+
entity_schema = {
|
| 64 |
+
"entities": ["named_mention", "descriptive_mention", "vague_mention"],
|
| 65 |
+
"entity_descriptions": {
|
| 66 |
+
"named_mention": "A proper name or well-known acronym for a data source...",
|
| 67 |
+
"descriptive_mention": "A described data reference with enough detail...",
|
| 68 |
+
"vague_mention": "A generic or loosely specified reference to data...",
|
| 69 |
+
},
|
| 70 |
+
}
|
| 71 |
+
spans = extractor.extract(text, entity_schema, threshold=0.3)
|
| 72 |
+
|
| 73 |
+
# Pass 2: Classify each span
|
| 74 |
+
json_schema = {
|
| 75 |
+
"data_mention": {
|
| 76 |
+
"mention_name": "",
|
| 77 |
+
"typology_tag": {"choices": ["survey", "census", "administrative", "database",
|
| 78 |
+
"indicator", "geospatial", "microdata", "report", "other"]},
|
| 79 |
+
"is_used": {"choices": ["True", "False"]},
|
| 80 |
+
"usage_context": {"choices": ["primary", "supporting", "background"]},
|
| 81 |
+
},
|
| 82 |
}
|
| 83 |
+
for span in spans.get("named_mention", []):
|
| 84 |
+
context = extract_sentence_context(text, span)
|
| 85 |
+
tags = extractor.extract(context, json_schema)
|
| 86 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
adapter_config.json
CHANGED
|
@@ -2,14 +2,11 @@
|
|
| 2 |
"adapter_type": "lora",
|
| 3 |
"adapter_version": "1.0",
|
| 4 |
"lora_r": 16,
|
| 5 |
-
"lora_alpha": 32,
|
| 6 |
"lora_dropout": 0.1,
|
| 7 |
"target_modules": [
|
| 8 |
-
"classifier",
|
| 9 |
-
"count_embed",
|
| 10 |
-
"count_pred",
|
| 11 |
"encoder",
|
| 12 |
"span_rep"
|
| 13 |
],
|
| 14 |
-
"created_at": "2026-
|
| 15 |
}
|
|
|
|
| 2 |
"adapter_type": "lora",
|
| 3 |
"adapter_version": "1.0",
|
| 4 |
"lora_r": 16,
|
| 5 |
+
"lora_alpha": 32.0,
|
| 6 |
"lora_dropout": 0.1,
|
| 7 |
"target_modules": [
|
|
|
|
|
|
|
|
|
|
| 8 |
"encoder",
|
| 9 |
"span_rep"
|
| 10 |
],
|
| 11 |
+
"created_at": "2026-04-06T22:28:30.225894Z"
|
| 12 |
}
|
adapter_weights.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:651065028cbf29f7aa1cdb7dc3b85990189808be3c849e9e357030dbfa64c5d0
|
| 3 |
+
size 30380176
|