vonewman committed on
Commit
08eb663
1 Parent(s): 447a922

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -0
app.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import re
4
+ import json
5
+ import transformers
6
+ import torch
7
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer
8
+
9
# Browser-tab title and icon for the app. Streamlit expects the page
# configuration to be set before other Streamlit commands are issued.
_PAGE_SETTINGS = {
    "page_title": "Named Entity Recognition Wolof",
    "page_icon": "📘",
}
st.set_page_config(**_PAGE_SETTINGS)
13
+
14
def convert_df(df: pd.DataFrame):
    """Serialize *df* to CSV text (header row, no index column) as UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
16
+
17
def convert_json(df: pd.DataFrame):
    """Return *df* as a JSON string keyed by row index label.

    pandas serializes the frame first; the result is then parsed and
    re-emitted with the standard ``json`` module, so the returned text uses
    ``json.dumps`` default formatting.
    """
    rows_by_index = json.loads(df.to_json(orient="index"))
    return json.dumps(rows_by_index)
22
+
23
# Checkpoint used for both the model weights and the tokenizer.
_MODEL_ID = "vonewman/wolof-finetuned-ner"
# Filled on the first load_model() call with (trainer, model, tokenizer).
_MODEL_BUNDLE = None


def load_model():
    """Return the ``(trainer, model, tokenizer)`` triple for the NER checkpoint.

    The triple is built once and then reused, so repeated calls while the
    script runs do not reload the weights and tokenizer each time.

    NOTE(review): Streamlit re-executes the script on each interaction, so
    this module-level cache presumably only lasts for one run; Streamlit's
    own resource caching could persist it across reruns if the installed
    version provides it — confirm before switching.

    Returns:
        tuple: ``(trainer, model, tokenizer)`` in that order.
    """
    global _MODEL_BUNDLE
    if _MODEL_BUNDLE is None:
        model = AutoModelForTokenClassification.from_pretrained(_MODEL_ID)
        trainer = Trainer(model=model)
        tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
        _MODEL_BUNDLE = (trainer, model, tokenizer)
    return _MODEL_BUNDLE
28
+
29
def align_word_ids(texts, tokenizer=None, label_all_tokens=False):
    """Build a per-token mask selecting which tokens get a prediction.

    The text is tokenized to a fixed length of 218 tokens (padded and
    truncated) and each token position is mapped to a marker:

    * ``-100`` for positions with no word (``word_ids()`` yields ``None``),
    * ``1`` for the first token of each word,
    * ``1`` or ``-100`` for the remaining tokens of a word, depending on
      ``label_all_tokens``.

    Args:
        texts: Input passed straight to the tokenizer.
        tokenizer: Tokenizer whose result exposes ``word_ids()``. When
            omitted, the tokenizer returned by ``load_model()`` is used, so
            existing single-argument callers keep working.
        label_all_tokens: If true, continuation tokens of a word are marked
            ``1`` as well. The default ``False`` matches what the previous
            code effectively did (the flag was undefined and the resulting
            error was swallowed, always yielding ``-100``).

    Returns:
        list[int]: One marker per token position.
    """
    if tokenizer is None:
        _, _, tokenizer = load_model()

    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=218, truncation=True)
    word_ids = tokenized_inputs.word_ids()

    continuation_marker = 1 if label_all_tokens else -100
    label_ids = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            # First token of a new word.
            label_ids.append(1)
        else:
            label_ids.append(continuation_marker)
        previous_word_idx = word_idx
    return label_ids
50
+
51
def predict_ner_labels(model, tokenizer, sentence):
    """Predict one NER tag per word of ``sentence``.

    The sentence is tokenized to a fixed length of 218 tokens, run through
    ``model`` on GPU when available (CPU otherwise), and the logits are
    filtered with the mask from ``align_word_ids`` so that only positions not
    marked ``-100`` contribute a prediction.

    Args:
        model: Token-classification model; it is moved to the selected device
            and put in evaluation mode.
        tokenizer: Tokenizer used to encode ``sentence`` into tensors.
        sentence: Raw input text.

    Returns:
        list[str]: Tag names looked up in ``id2tag``, one per kept position.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = model.to(device)
    # Inference only: make sure dropout is off.
    model.eval()

    encoded = tokenizer(sentence, padding='max_length', max_length=218, truncation=True, return_tensors="pt")
    attention_mask = encoded['attention_mask'].to(device)
    input_ids = encoded['input_ids'].to(device)

    # Integer mask with a leading batch dimension to line up with the logits.
    label_ids = torch.tensor(align_word_ids(sentence), dtype=torch.long).unsqueeze(0).to(device)

    # No gradients are needed for prediction; skipping them saves memory.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # outputs[0] holds the logits; keep only the positions selected by the mask.
    logits_clean = outputs[0][label_ids != -100]
    predictions = logits_clean.argmax(dim=1).tolist()
    return [id2tag[i] for i in predictions]
65
+
66
# Maps the model's predicted class index to its NER tag name.
id2tag = {
    0: 'O',
    1: 'B-LOC',
    2: 'B-PER',
    3: 'I-PER',
    4: 'B-ORG',
    5: 'I-DATE',
    6: 'B-DATE',
    7: 'I-ORG',
    8: 'I-LOC',
}
67
+
68
def tag_sentence(text):
    """Tag ``text`` and return a DataFrame with ``words`` and ``tags`` columns.

    Words are the whitespace-separated pieces of ``text``; tags come from
    ``predict_ner_labels``.

    Args:
        text: Sentence to tag.

    Returns:
        pd.DataFrame: One row per word with its predicted tag.
    """
    _, model, tokenizer = load_model()
    predictions = predict_ner_labels(model, tokenizer, text)
    words = text.split()

    # The tokenizer truncates long inputs, so there can be fewer tags than
    # words; pandas raises ValueError on columns of unequal length. Pair
    # words and tags up to the shorter length instead of crashing.
    # NOTE(review): assumes any mismatch is at the tail of the sentence — confirm.
    paired = min(len(words), len(predictions))

    # Build a DataFrame with the "words" and "tags" columns.
    return pd.DataFrame({'words': words[:paired], 'tags': predictions[:paired]})
74
+
75
# Page heading shown above the input form.
st.title("📘 Named Entity Recognition Wolof")

# The sentence is collected inside a form; tagging runs only on submit.
with st.form(key='my_form'):
    x1 = st.text_input(label='Enter a sentence:', max_chars=250)
    submit_button = st.form_submit_button(label='🏷️ Create tags')

if submit_button:
    if re.sub(r'\s+', '', x1) == '':
        # Nothing but whitespace was entered.
        st.error('Please enter a non-empty sentence.')
    elif re.match(r'\A\s*\w+\s*\Z', x1):
        # The input is exactly one word, so ask for more than one.
        st.error("Please enter a sentence with more than one word.")
    else:
        st.markdown("### Tagged Sentence")
        st.header("")
        results = tag_sentence(x1)

        # The CSV bytes back both the .csv and the .txt download.
        csv_bytes = convert_df(results)

        # Outer columns are left empty; the three inner ones hold the buttons.
        _, c1, c2, c3, _ = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])
        with c1:
            st.download_button(label="📥 Download .csv", data=csv_bytes,
                               file_name="results.csv", mime='text/csv', key='csv')
        with c2:
            st.download_button(label="📥 Download .txt", data=csv_bytes,
                               file_name="results.txt", mime='text/plain', key='text')
        with c3:
            st.download_button(label="📥 Download .json", data=convert_json(results),
                               file_name="results.json", mime='application/json', key='json')

        st.header("")
        # Show the results table in the wide middle column.
        _, table_col, _ = st.columns([1, 3, 1])
        with table_col:
            st.table(results[['words', 'tags']])
104
+
105
# Markdown shown in the "About this app" panel.
# NOTE(review): the last bullet says "byte-level BPE tokenizer"; XLM-RoBERTa
# tokenizers are SentencePiece-based — confirm the wording with the author.
_ABOUT_MARKDOWN = """
- The **Named Entity Recognition Wolof** app is a tool that performs named entity recognition in Wolof.
- The available entities are: *corporation*, *location*, *person*, and *date*.
- The app uses the [XLMRoberta model](https://huggingface.co/xlm-roberta-base), fine-tuned on the [masakhaNER](https://huggingface.co/datasets/masakhane/masakhaner2) dataset.
- The model uses the **byte-level BPE tokenizer**. Each sentence is first tokenized.
"""

# Three empty headers, presumably used as vertical spacing above the panel.
for _ in range(3):
    st.header("")

with st.expander("ℹ️ - About this app", expanded=True):
    st.write(_ABOUT_MARKDOWN)