# Streamlit demo app: Kazakh named-entity recognition.
# (Hugging Face Spaces page artifacts — status lines, commit hashes, and the
# line-number gutter — were scraped into this file and have been removed.)
from annotated_text import annotated_text, parameters, annotation
import razdel
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import streamlit as st
import torch
# NOTE: removed `nltk.download('punkt')` — `nltk` was never imported (this line
# raised NameError at startup) and tokenization is done with `razdel`, so the
# punkt model is not needed anywhere in this app.
# Cache the labelled output per input string and show custom spinner text.
@st.cache_resource(show_spinner = "Loading the model...")
def label_text(text):
    """Run Kazakh NER over *text* and format the result for annotated_text.

    Returns a list whose elements are plain strings (space-padded) for tokens
    labelled 'O', and (tokens, entity_class) tuples for named entities.
    For empty input, renders a warning in the app instead (and returns the
    DeltaGenerator produced by st.markdown).
    """
    if text == "":
        # No input: show a warning instead of running the model.
        return st.markdown("<p id = 'warning'>PLEASE INSERT YOUR TEXT</p>", unsafe_allow_html = True)

    tokenizer = AutoTokenizer.from_pretrained("yeshpanovrustem/xlm-roberta-large-ner-kazakh")
    model = AutoModelForTokenClassification.from_pretrained("yeshpanovrustem/xlm-roberta-large-ner-kazakh")
    # NOTE: the original also built a `pipeline("ner", ...)` here that was
    # never used — removed, since predictions are computed manually below.

    # Model output index -> BIO tag for this fine-tuned model's label set.
    labels_dict = {0: 'O',
                   1: 'B-ADAGE', 2: 'I-ADAGE',
                   3: 'B-ART', 4: 'I-ART',
                   5: 'B-CARDINAL', 6: 'I-CARDINAL',
                   7: 'B-CONTACT', 8: 'I-CONTACT',
                   9: 'B-DATE', 10: 'I-DATE',
                   11: 'B-DISEASE', 12: 'I-DISEASE',
                   13: 'B-EVENT', 14: 'I-EVENT',
                   15: 'B-FACILITY', 16: 'I-FACILITY',
                   17: 'B-GPE', 18: 'I-GPE',
                   19: 'B-LANGUAGE', 20: 'I-LANGUAGE',
                   21: 'B-LAW', 22: 'I-LAW',
                   23: 'B-LOCATION', 24: 'I-LOCATION',
                   25: 'B-MISCELLANEOUS', 26: 'I-MISCELLANEOUS',
                   27: 'B-MONEY', 28: 'I-MONEY',
                   29: 'B-NON_HUMAN', 30: 'I-NON_HUMAN',
                   31: 'B-NORP', 32: 'I-NORP',
                   33: 'B-ORDINAL', 34: 'I-ORDINAL',
                   35: 'B-ORGANISATION', 36: 'I-ORGANISATION',
                   37: 'B-PERSON', 38: 'I-PERSON',
                   39: 'B-PERCENTAGE', 40: 'I-PERCENTAGE',
                   41: 'B-POSITION', 42: 'I-POSITION',
                   43: 'B-PRODUCT', 44: 'I-PRODUCT',
                   45: 'B-PROJECT', 46: 'I-PROJECT',
                   47: 'B-QUANTITY', 48: 'I-QUANTITY',
                   49: 'B-TIME', 50: 'I-TIME'}

    # Word-level tokenization with razdel; the model tokenizer then splits
    # each word into sub-tokens.
    single_sentence_tokens = [token.text for token in razdel.tokenize(text)]
    tokenized_input = tokenizer(single_sentence_tokens, is_split_into_words = True, return_tensors = "pt")
    with torch.no_grad():  # inference only — no gradients needed
        logits = model(**tokenized_input).logits
    predictions = torch.argmax(logits, dim = 2)

    # One label per word: take the prediction of each word's FIRST sub-token.
    # Special tokens (word_id is None) and continuation sub-tokens are skipped.
    word_ids = tokenized_input.word_ids(batch_index = 0)
    previous_word_id = None
    labels = []
    for word_id, prediction in zip(word_ids, predictions[0].numpy()):
        if word_id is None or word_id == previous_word_id:
            continue
        labels.append(labels_dict[prediction])
        previous_word_id = word_id
    assert len(single_sentence_tokens) == len(labels), "Mismatch between input token and label sizes!"

    # Group consecutive B-/I- tagged tokens into multi-token entity spans;
    # every other token becomes a single-token span.
    sentence_tokens = []   # list of spans, each a list of word tokens
    sentence_labels = []   # parallel list of spans of BIO labels
    token_list = []
    label_list = []
    previous_token = ""
    previous_label = ""
    for current_token, current_label in zip(single_sentence_tokens, labels):
        if previous_label == "":
            pass  # first iteration: nothing to flush yet
        elif previous_label.startswith(("B-", "I-")) and current_label.startswith("I-"):
            # entity span continues
            token_list.append(previous_token)
            label_list.append(previous_label)
        elif previous_label.startswith("I-") and not current_label.startswith("I-"):
            # entity span ends: emit it
            token_list.append(previous_token)
            label_list.append(previous_label)
            sentence_tokens.append(token_list)
            sentence_labels.append(label_list)
            token_list = []
            label_list = []
        elif not previous_label.startswith("I-") and not current_label.startswith("I-"):
            # single-token span ('O', or a B- tag not followed by I-)
            token_list.append(previous_token)
            label_list.append(previous_label)
            sentence_tokens.append(token_list)
            sentence_labels.append(label_list)
            token_list = []
            label_list = []
        # NOTE(review): a previous 'O' followed by a current 'I-' tag matches
        # none of the branches and silently drops the 'O' token — kept as-is
        # to preserve the original behavior; confirm whether this can occur.
        previous_token = current_token
        previous_label = current_label
    # Flush the final pending token/span.
    token_list.append(previous_token)
    label_list.append(previous_label)
    sentence_tokens.append(token_list)
    sentence_labels.append(label_list)

    # Convert spans to annotated_text elements: (text, class) tuples for
    # entities, plain strings for 'O' tokens.
    output = []
    for span_tokens, span_labels in zip(sentence_tokens, sentence_labels):
        if len(span_labels[0]) > 1:  # an entity tag (len('O') == 1)
            entity_class = span_labels[0].split("-")[1]
            if len(span_labels) > 1:
                output.append((" ".join(span_tokens), entity_class))
            else:
                output.append((span_tokens[0], entity_class))
        else:
            # output.append((span_tokens[0], span_labels[0]))
            output.append(span_tokens[0])

    # Pad every element with spaces so adjacent tokens/chips do not touch.
    # (The original branched on element.isalnum() here, but both branches were
    # identical — collapsed into one.)
    modified_output = []
    for element in output:
        if isinstance(element, tuple):
            modified_output.append((f" {element[0]} ", element[1]))
        else:
            modified_output.append(' ' + element + ' ')
    return modified_output
#########################
#### CREATE SIDEBAR #####
#########################
# Inject the app's stylesheet into the sidebar.
with open("style.css") as f:
    css = f.read()
st.sidebar.markdown(f'<style>{css}</style>', unsafe_allow_html = True)
st.sidebar.markdown("<h1>Kazakh NER</h1>", unsafe_allow_html = True)
st.sidebar.markdown("<h2>Named entity classes</h2>", unsafe_allow_html = True)

# One expander per entity class. Keys match the model's label names exactly
# (fixed: the expander was titled "PERCENTAGES", which does not match the
# model's B-/I-PERCENTAGE tags).
NER_CLASS_DESCRIPTIONS = {
    "ADAGE": "Well-known Kazakh proverbs and sayings",
    "ART": "Titles of books, songs, television programmes, etc.",
    "CARDINAL": "Cardinal numbers, including whole numbers, fractions, and decimals",
    "CONTACT": "Addresses, emails, phone numbers, URLs",
    "DATE": "Dates or periods of 24 hours or more",
    "DISEASE": "Diseases or medical conditions",
    "EVENT": "Named events and phenomena",
    "FACILITY": "Names of man-made structures",
    "GPE": "Names of geopolitical entities",
    "LANGUAGE": "Named languages",
    "LAW": "Named legal documents",
    "LOCATION": "Names of geographical locations other than GPEs",
    "MISCELLANEOUS": "Entities of interest but hard to assign a proper tag to",
    "MONEY": "Monetary values",
    "NON_HUMAN": "Names of pets, animals or non-human creatures",
    "NORP": "Adjectival forms of GPE and LOCATION; named religions, etc.",
    "ORDINAL": "Ordinal numbers, including adverbials",
    "ORGANISATION": "Names of companies, government agencies, etc.",
    "PERCENTAGE": "Percentages",
    "PERSON": "Names of persons",
    "POSITION": "Names of posts and job titles",
    "PRODUCT": "Names of products",
    "PROJECT": "Names of projects, policies, plans, etc.",
    "QUANTITY": "Length, distance, etc. measurements",
    "TIME": "Times of day and time duration less than 24 hours",
}
for ner_class, description in NER_CLASS_DESCRIPTIONS.items():
    with st.sidebar.expander(ner_class):
        st.write(description)
######################
#### CREATE FORM #####
######################
text_field = st.form(key = 'text_field')
form_text = text_field.text_input('Insert your text here')
submit = text_field.form_submit_button('Submit')
st.markdown('Press **Submit** to have your text labelled')
if submit:
    result = label_text(form_text)
    # For empty input, label_text renders its own warning and returns the
    # DeltaGenerator from st.markdown, not a token list — feeding that to
    # annotated_text would fail, so only pass real results on.
    if isinstance(result, list):
        annotated_text(result)