# Gradio
import gradio as gr

# Hugging Face libraries
from transformers import pipeline
from transformers import AutoTokenizer

# Model checkpoint
model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"

# Instantiate the pipeline
ner_task = pipeline(model=model_checkpoint, task="ner",
        aggregation_strategy="simple")
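# Note: with aggregation_strategy="simple", sub-word pieces that belong to the
# same entity are merged, so each prediction is a dict roughly of the form
#   {'entity_group': 'PER', 'score': 0.99, 'word': 'Fred Astaire', 'start': 11, 'end': 23}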

# Instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
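# The tokenizer is loaded separately only so the app can display the sub-word
# tokens the model sees; the pipeline above handles its own tokenization internally.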

# Sample sentences
sentence1 = "Herbert Akroyd Stuart patented the first diesel engine, 1890"
sentence2 = "May 10 A delegation tells Leopold III his return would be \
ill-timed, 1945"
sentence3 = "Fri May 10 Fred Astaire (Frederick Austerlitz) born in Omaha, Nebraska, 1899"
sentence4 = "Fri May 10 Germany invades Low Countries, 1940"
sentence5 = "Fri May 10 Nazi bookburning, 1933"
sentence6 = "Fri May 10 Confederate Memorial Day in South Carolina"
sentence7 = "Fri May 10 Mothers Day in Guatemala"
sentence8 = "Fri May 10 Dave Mason is born in Worcester, England, 1945"


# Gradio interface
def predict(sentence):
    """
    Use the corresponding tokenizer to tokenize the sentence.
    Use the model to predict the entities.
    """
    # Get the tokens from the tokenizer
    processed_tokens = tokenizer(sentence)
    token_pieces = processed_tokens.tokens()
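    # .tokens() returns the WordPiece pieces, including the special [CLS]/[SEP]
    # markers, e.g. something like ['[CLS]', 'Fred', 'As', '##tai', '##re', ..., '[SEP]']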
    
    # Get the prediction of ner from the model
    result_ner = ner_task(sentence)
    formatted_ner = ""
    entities_count = 0
    # Format each entity; start the count from 1 for intuitive reading.
    for result in result_ner:
        # Only keep predictions whose score is at least 0.8
        if result['score'] < 0.8:
            continue
        entities_count += 1
        formatted_ner += f"Number: {entities_count}\n" \
                       + f"Entity: {result['entity_group']}\n" \
                       + f"Word group: {result['word']}\n" \
                       + f"Score: {result['score']}\n"
        formatted_ner += f"{result}\n\n"

    formatted_ner += f"Number of predicted entities: {entities_count}\n\n"
    
    return token_pieces, formatted_ner

# Main Gradio interface
demo = gr.Interface(
    fn = predict,
    inputs = [gr.TextArea(label="Place your sentence here", lines=10,
                          show_copy_button=True)],
    outputs =
        [
            gr.TextArea(label="Tokens input to the model", interactive=False, 
                        lines=10, show_copy_button=True),
            gr.TextArea(label="Prediction of entities", interactive=False, 
                        lines=10, show_copy_button=True)
        ],
    examples=[[sentence1], [sentence2], [sentence3], [sentence4], 
              [sentence5], [sentence6], [sentence7], [sentence8]],
    title = "NER (Named Entity Recognition)",
    description = f"""
         ## Using model {model_checkpoint} to predict entity types
         <p style="font-size: 1.2rem;">Notes: </p>
         <ul style="font-size: 1.2rem; list-style-type:square">
         <li>  The examples are from the calendar utility in Linux.
         <li>  The model cannot recognize dates or times.
         <li>  It can recognize PER (person), LOC (location), ORG (organization) and MISC (miscellaneous)
         entities.
         </ul>
         """
)
demo.launch()
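
# When running locally or in a notebook, Gradio can also create a temporary
# public link by calling demo.launch(share=True) instead.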