kyrgyzNER model (xlm-roberta-base) by The_Cramer_Project
- The original repository: https://github.com/Akyl-AI/KyrgyzNER
- The paper will be uploaded soon.
- The KyrgyzNER dataset and code will be uploaded soon.
This model is a fine-tuned version of xlm-roberta-base on the KyrgyzNER dataset. It achieves the following results on the evaluation set:
- Loss: 0.3273
- Precision: 0.7090
- Recall: 0.6946
- F1: 0.7017
- Accuracy: 0.9119
How to use
You can use this model with the Transformers pipeline for NER.
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig
from transformers import pipeline
# Lookup from the generic 'LABEL_<n>' names to the B-/I-/O tag names used by
# this model. The position of each tag in the tuple below is its numeric
# label id <n>, so the order must not be changed.
id2label = {
    f'LABEL_{label_id}': tag
    for label_id, tag in enumerate((
        'B-NATIONAL',       # 0
        'I-PLANT',          # 1
        'I-ORGANISATION',   # 2
        'B-ORGANISATION',   # 3
        'B-MEDIA',          # 4
        'I-ARTIFACT',       # 5
        'B-AWARD',          # 6
        'B-UNKNOWN',        # 7
        'I-LOCATION',       # 8
        'B-PERSON',         # 9
        'I-LEGAL',          # 10
        'B-BUSINESS',       # 11
        'B-ACRONYM',        # 12
        'I-PERIOD',         # 13
        'B-INSTITUTION',    # 14
        'I-MEASURE',        # 15
        'B-CREATION',       # 16
        'I-ACRONYM',        # 17
        'I-AWARD',          # 18
        'I-WEBSITE',        # 19
        'B-PERIOD',         # 20
        'I-PERSON',         # 21
        'I-PERSON_TYPE',    # 22
        'B-SUBSTANCE',      # 23
        'O',                # 24
        'B-PLANT',          # 25
        'I-INSTITUTION',    # 26
        'I-SUBSTANCE',      # 27
        'I-INSTALLATION',   # 28
        'B-CONCEPT',        # 29
        'B-TITLE',          # 30
        'I-EVENT',          # 31
        'B-ARTIFACT',       # 32
        'B-MEASURE',        # 33
        'B-LOCATION',       # 34
        'I-BUSINESS',       # 35
        'B-ANIMAL',         # 36
        'B-PERSON_TYPE',    # 37
        'B-INSTALLATION',   # 38
        'I-TITLE',          # 39
        'B-IDENTIFIER',     # 40
        'I-IDENTIFIER',     # 41
        'B-LEGAL',          # 42
        'I-MEDIA',          # 43
        'I-CONCEPT',        # 44
        'I-UNKNOWN',        # 45
        'B-EVENT',          # 46
        'B-WEBSITE',        # 47
        'I-NATIONAL',       # 48
        'I-CREATION',       # 49
        'I-ANIMAL',         # 50
    ))
}
# Checkpoint to load.
# NOTE(review): the "Model tree" section of this card names the repository as
# the-cramer-project/xlm-roberta-base-kyrgyzNER - confirm which id is current.
model_ckpt = "TTimur/xlm-roberta-base-kyrgyzNER"

# Load the configuration, tokenizer and token-classification model that
# belong to the checkpoint.
config = AutoConfig.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForTokenClassification.from_pretrained(model_ckpt, config=config)

# aggregation_strategy = "none": the pipeline returns one prediction per
# sub-word token.
nlp = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="none",
)
# Tag a sample sentence token by token.
example = "Кыргызстан Орто Азиянын түндүк-чыгышында орун алган мамлекет."
ner_results = nlp(example)

# Each prediction carries a generic 'LABEL_<n>' name under 'entity'; replace
# it in place with the readable tag from id2label, then show the prediction.
for result in ner_results:
    result['entity'] = id2label[result['entity']]
    print(result)
# output:
# {'entity': 'B-LOCATION', 'score': 0.95103735, 'index': 1, 'word': '▁Кыргызстан', 'start': 0, 'end': 10}
# {'entity': 'B-LOCATION', 'score': 0.79447913, 'index': 2, 'word': '▁Ор', 'start': 11, 'end': 13}
# {'entity': 'I-LOCATION', 'score': 0.8703734, 'index': 3, 'word': 'то', 'start': 13, 'end': 15}
# {'entity': 'I-LOCATION', 'score': 0.942387, 'index': 4, 'word': '▁Азия', 'start': 16, 'end': 20}
# {'entity': 'I-LOCATION', 'score': 0.8542615, 'index': 5, 'word': 'нын', 'start': 20, 'end': 23}
# {'entity': 'I-LOCATION', 'score': 0.70930535, 'index': 6, 'word': '▁түн', 'start': 24, 'end': 27}
# {'entity': 'I-LOCATION', 'score': 0.6540094, 'index': 7, 'word': 'дүк', 'start': 27, 'end': 30}
# {'entity': 'I-LOCATION', 'score': 0.63446337, 'index': 8, 'word': '-', 'start': 30, 'end': 31}
# {'entity': 'I-LOCATION', 'score': 0.6204858, 'index': 9, 'word': 'чы', 'start': 31, 'end': 33}
# {'entity': 'I-LOCATION', 'score': 0.6786872, 'index': 10, 'word': 'г', 'start': 33, 'end': 34}
# {'entity': 'I-LOCATION', 'score': 0.64190257, 'index': 11, 'word': 'ыш', 'start': 34, 'end': 36}
# {'entity': 'O', 'score': 0.64438057, 'index': 12, 'word': 'ында', 'start': 36, 'end': 40}
# {'entity': 'O', 'score': 0.9916931, 'index': 13, 'word': '▁орун', 'start': 41, 'end': 45}
# {'entity': 'O', 'score': 0.9953047, 'index': 14, 'word': '▁алган', 'start': 46, 'end': 51}
# {'entity': 'O', 'score': 0.9901377, 'index': 15, 'word': '▁мамлекет', 'start': 52, 'end': 60}
# {'entity': 'O', 'score': 0.99605453, 'index': 16, 'word': '.', 'start': 60, 'end': 61}
# Merge sub-word pieces back into whole words and print one "word<TAB>label"
# line per word. A piece that starts with the "▁" marker opens a new word;
# every other piece is glued onto the word currently being built. Each word
# is paired with the value found under 'entity' on its first piece.
token = ""
label_list = []
token_list = []
for result in ner_results:
    piece = result["word"]
    # `not label_list` also opens a word when the very first piece carries no
    # "▁" marker; otherwise that word would get a token but no label and all
    # following (token, label) pairs would be shifted by one.
    if piece.startswith("▁") or not label_list:
        if label_list:
            # A previous word is complete: store it without the marker.
            token_list.append(token.replace("▁", ""))
        token = piece
        label_list.append(result["entity"])
    else:
        token += piece
# Flush the last word. Skipped when there were no predictions at all, so no
# empty token is recorded.
if label_list:
    token_list.append(token.replace("▁", ""))

for token, label in zip(token_list, label_list):
    print(f"{token}\t{label}")
# output:
# Кыргызстан B-LOCATION
# Орто B-LOCATION
# Азиянын I-LOCATION
# түндүк-чыгышында I-LOCATION
# орун O
# алган O
# мамлекет. O
# aggregation_strategy = "simple": the pipeline returns grouped spans, each
# labelled under 'entity_group' instead of 'entity'.
nlp = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
example = "Кыргызстан Орто Азиянын түндүк-чыгышында орун алган мамлекет."
ner_results = nlp(example)

# Replace the generic 'LABEL_<n>' group name with the readable tag in place,
# then show the span.
for result in ner_results:
    result['entity_group'] = id2label[result['entity_group']]
    print(result)
# output:
# {'entity_group': 'B-LOCATION', 'score': 0.87275827, 'word': 'Кыргызстан Ор', 'start': 0, 'end': 13}
# {'entity_group': 'I-LOCATION', 'score': 0.73398614, 'word': 'то Азиянын түндүк-чыгыш', 'start': 13, 'end': 36}
# {'entity_group': 'O', 'score': 0.92351407, 'word': 'ында орун алган мамлекет.', 'start': 36, 'end': 61}
NE classes
PERSON, LOCATION, MEASURE, INSTITUTION, PERIOD, ORGANISATION, MEDIA, TITLE, BUSINESS, LEGAL, EVENT, ARTIFACT, INSTALLATION, PERSON_TYPE, NATIONAL, CONCEPT, CREATION, WEBSITE, SUBSTANCE, ACRONYM, IDENTIFIER, UNKNOWN, AWARD, ANIMAL, PLANT
- Downloads last month
- 7
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social
visibility and check back later, or deploy to Inference Endpoints (dedicated)
instead.
Model tree for the-cramer-project/xlm-roberta-base-kyrgyzNER
Base model
FacebookAI/xlm-roberta-base