File size: 4,339 Bytes
4d9dd77
cb0039e
 
 
 
 
 
4d9dd77
cb0039e
 
 
4d9dd77
 
cb0039e
 
 
4d9dd77
 
cb0039e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
967c296
cb0039e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4d9dd77
 
cb0039e
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import streamlit as st
import pandas as pd
import numpy as np
import re
import json
import base64
import uuid

import transformers
from datasets import Dataset,load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer


# Browser-tab title and favicon for the app.
# The icon literal used to be mojibake (the UTF-8 bytes of U+1F4D8 decoded with
# a single-byte codec), which is not a valid emoji for `page_icon`.
st.set_page_config(
    page_title="Named Entity Recognition Tagger",
    page_icon="📘",
)


def convert_df(df: pd.DataFrame) -> bytes:
    """Return *df* rendered as CSV (without the index column), UTF-8 encoded."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')

#@st.cache
def convert_json(df: pd.DataFrame) -> str:
    """Return *df* as a JSON string keyed by row index label.

    The pandas output is parsed and re-serialised with ``json.dumps`` so the
    result uses the ``json`` module's default separators.
    """
    return json.dumps(json.loads(df.to_json(orient="index")))

# Main page heading. The leading emoji was stored as mojibake and is restored
# to the intended U+1F4D8 character.
st.title("📘Named Entity Recognition Tagger")

# NOTE(review): `st.cache` is deprecated in newer Streamlit releases in favour
# of `st.cache_resource` — confirm against the Streamlit version this app pins.
@st.cache(allow_output_mutation=True)
def load_model():
    """Load the token-classification checkpoint used by the app.

    Returns:
        A ``(trainer, model, tokenizer)`` tuple, where ``trainer`` is a
        ``Trainer`` wrapping ``model`` and ``tokenizer`` is loaded from the
        same checkpoint as the model.
    """
    checkpoint = "vonewman/xlm-roberta-base-finetuned-wolof"

    model = AutoModelForTokenClassification.from_pretrained(checkpoint)
    trainer = Trainer(model=model)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    return trainer, model, tokenizer

# Lookup from predicted class index to entity tag; a tag's position in the
# tuple is its class index.
# NOTE(review): presumably mirrors the checkpoint's own label mapping — confirm.
id2tag = dict(enumerate((
    'O',
    'B-LOC',
    'B-PER',
    'I-PER',
    'B-ORG',
    'I-DATE',
    'B-DATE',
    'I-ORG',
    'I-LOC',
)))

def tag_sentence(text:str):
    """Tag every token of *text* and return the result as a DataFrame.

    Reads the module-level ``tokenizer``, ``model`` and ``id2tag``; they must
    be bound before this function is called.

    Returns:
        A DataFrame with columns ``word`` (the decoded token), ``tag`` (the
        most probable tag) and ``probability`` (that tag's probability as a
        percentage rounded to two decimals). There is one row per input id
        produced by the tokenizer; no ids are filtered out.
    """
    # Tokenize the sentence into model-ready tensors.
    encoded = tokenizer(text, truncation=True, return_tensors="pt")
    token_ids = encoded['input_ids'][0]

    # First output of the model for the first (only) sequence, softmaxed
    # across the tag dimension.
    tag_probs = model(**encoded)[0][0].softmax(1)
    best_tags = tag_probs.argmax(axis=1)

    rows = []
    for token_id, best_tag, token_probs in zip(token_ids, best_tags, tag_probs):
        word = tokenizer.decode(token_id.item())
        tag = id2tag[best_tag.item()]
        percent = np.round(token_probs[best_tag].item() * 100, 2)
        rows.append((word, tag, percent))

    return pd.DataFrame(rows, columns=['word', 'tag', 'probability'])


# Input form: the sentence is only processed once the form is submitted.
with st.form(key='my_form'):

    x1 = st.text_input(label='Enter a sentence:', max_chars=250)
    submit_button = st.form_submit_button(label='🏷️ Create tags')


if submit_button:
    if not x1.strip():
        # Nothing but whitespace was entered.
        st.error('Please enter a non-empty sentence.')

    elif re.match(r'\A\s*\w+\s*\Z', x1):
        # The whole input is a single run of word characters, i.e. one word.
        st.error("Please enter a sentence with more than one word.")

    else:
        st.markdown("### Tagged Sentence")
        st.header("")

        # `model` and `tokenizer` must remain module-level names because
        # tag_sentence() reads them as globals. The trainer goes to a
        # throwaway name so the imported `Trainer` class is not rebound.
        _trainer, model, tokenizer = load_model()
        results = tag_sentence(x1)

        # The .csv and .txt downloads share the same CSV payload.
        csv_bytes = convert_df(results)

        # NOTE(review): clicking a download button triggers a script rerun in
        # which `submit_button` is presumably False, so this branch may vanish
        # — confirm whether results should be kept in st.session_state.
        # Outer columns are empty spacers that centre the three buttons.
        _, csv_col, txt_col, json_col, _ = st.columns([0.75, 1.5, 1.5, 1.5, 0.75])

        with csv_col:
            st.download_button(
                label="📥 Download .csv",
                data=csv_bytes,
                file_name="results.csv",
                mime='text/csv',
                key='csv',
            )
        with txt_col:
            st.download_button(
                label="📥 Download .txt",
                data=csv_bytes,
                file_name="results.txt",
                mime='text/plain',
                key='text',
            )
        with json_col:
            st.download_button(
                label="📥 Download .json",
                data=convert_json(results),
                file_name="results.json",
                mime='application/json',
                key='json',
            )

        st.header("")

        # Centre the results table in the wide middle column.
        _, table_col, _ = st.columns([1, 3, 1])

        with table_col:
            styled = results.style.background_gradient(subset=['probability'])
            st.table(styled.format(precision=2))

# Three empty headers act as vertical spacing above the "About" section.
for _ in range(3):
    st.header("")

with st.expander("ℹ️ - About this app", expanded=True):

    # This text must describe the checkpoint loaded in load_model() and the
    # tags listed in id2tag; the previous wording described a different model,
    # dataset and entity set.
    st.write(
        """
-   The **Named Entity Recognition Tagger** app is a tool that performs named entity recognition.
-   The available entities are: *date* (DATE), *location* (LOC), *organization* (ORG) and *person* (PER).
-   The app uses the [xlm-roberta-base-finetuned-wolof](https://huggingface.co/vonewman/xlm-roberta-base-finetuned-wolof) model, an XLM-RoBERTa checkpoint fine-tuned for Wolof.
-   Each sentence is first tokenized by the model's tokenizer, and a tag is predicted for every token.
-   For more info regarding the data science part, check this [post](https://towardsdatascience.com/named-entity-recognition-with-deep-learning-bert-the-essential-guide-274c6965e2d?sk=c3c3699e329e45a8ed93d286ae04ef10).
        """
    )