ophelielacroix commited on
Commit
4a030b9
1 Parent(s): 8c96148

initial commit

Browse files
README.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ---
3
+ language:
4
+ - da
5
+ tags:
6
+ - ned
7
+ - xlm-roberta
8
+ - pytorch
9
+ - transformers
10
+ license: cc-by-sa-4.0
11
+ datasets:
12
+ - DaNED
13
+ - DaWikiNED
14
+ metrics:
15
+ - f1
16
+ ---
17
+
18
+ # XLM-Roberta fine-tuned for Named Entity Disambiguation
19
+
20
+ Given a sentence and a knowledge graph context, the model detects whether a specific entity (represented by the knowledge graph context) is mentioned in the sentence (binary classification).
21
+ The base language model used is the [xlm-roberta-base](https://huggingface.co/xlm-roberta-base).
22
+
23
+ Here is how to use the model:
24
+
25
+ ```python
26
+ from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
27
+
28
+ model = XLMRobertaForSequenceClassification.from_pretrained("DaNLP/da-xlmr-ned")
29
+ tokenizer = XLMRobertaTokenizer.from_pretrained("DaNLP/da-xlmr-ned")
30
+ ```
31
+
32
+ The tokenizer takes 2 strings has input: the sentence and the knowledge graph (KG) context.
33
+ Here is an example:
34
+ ```python
35
+ sentence = "Karen Blixen vendte tilbage til Danmark, hvor hun boede resten af sit liv på Rungstedlund, som hun arvede efter sin mor i 1939"
36
+ kg_context = "udmærkelser modtaget Kritikerprisen udmærkelser modtaget Tagea Brandts Rejselegat udmærkelser modtaget Ingenio et arti udmærkelser modtaget Holbergmedaljen udmærkelser modtaget De Gyldne Laurbær mor Ingeborg Dinesen ægtefælle Bror von Blixen-Finecke køn kvinde Commons-kategori Karen Blixen LCAuth no95003722 VIAF 90663542 VIAF 121643918 GND-identifikator 118637878 ISNI 0000 0001 2096 6265 ISNI 0000 0003 6863 4408 ISNI 0000 0001 1891 0457 fødested Rungstedlund fødested Rungsted dødssted Rungstedlund dødssted København statsborgerskab Danmark NDL-nummer 00433530 dødsdato +1962-09-07T00:00:00Z dødsdato +1962-01-01T00:00:00Z fødselsdato +1885-04-17T00:00:00Z fødselsdato +1885-01-01T00:00:00Z AUT NKC jn20000600905 AUT NKC jo2015880827 AUT NKC xx0196181 emnets hovedkategori Kategori:Karen Blixen tilfælde af menneske billede Karen Blixen cropped from larger original.jpg IMDb-identifikationsnummer nm0227598 Freebase-ID /m/04ymd8w BNF 118857710 beskæftigelse skribent beskæftigelse selvbiograf beskæftigelse novelleforfatter ..."
37
+ ```
38
+
39
+ A KG context, for a specific entity, can be generated from its Wikidata page.
40
+ In the previous example, the KG context is a string representation of the Wikidata page of [Karen Blixen (QID=Q182804)](https://www.wikidata.org/wiki/Q182804).
41
+ See the [DaNLP documentation](https://danlp-alexandra.readthedocs.io/en/latest/docs/tasks/ned.html#xlmr) for more details about how to generate a KG context.
42
+
43
+
44
+ ## Training Data
45
+
46
+ The model has been trained on the [DaNED](https://danlp-alexandra.readthedocs.io/en/latest/docs/datasets.html#daned) and [DaWikiNED](https://danlp-alexandra.readthedocs.io/en/latest/docs/datasets.html#dawikined) datasets.
47
+
48
+
config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": ".",
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "eos_token_id": 2,
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "not mentioned",
15
+ "1": "mentioned"
16
+ },
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 3072,
19
+ "label2id": {
20
+ "mentioned": 1,
21
+ "not mentioned": 0
22
+ },
23
+ "layer_norm_eps": 1e-05,
24
+ "max_position_embeddings": 514,
25
+ "model_type": "xlm-roberta",
26
+ "num_attention_heads": 12,
27
+ "num_hidden_layers": 12,
28
+ "output_past": true,
29
+ "pad_token_id": 1,
30
+ "position_embedding_type": "absolute",
31
+ "transformers_version": "4.9.2",
32
+ "type_vocab_size": 1,
33
+ "use_cache": true,
34
+ "vocab_size": 250002
35
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:216f83dd3982d7a4674125737cca5999ccd95c61043fd144ac08572f4163c05a
3
+ size 1112269212
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0c5eb6ed9b89260cb34d1f39ca31447ac0a975c874c565f36b75bbdf0cb29ca
3
+ size 1112454872
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "sep_token": "</s>", "cls_token": "<s>", "unk_token": "<unk>", "pad_token": "<pad>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "do_lower_case": false, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "xlm-roberta-base"}