File size: 3,213 Bytes
102e437
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
887a2c0
 
 
 
 
102e437
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62090fa
 
 
 
 
102e437
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline


def convert_hf_ents_to_gradio(hf_ents):
    """Convert HF token-classification pipeline output to Gradio's entity format.

    Args:
        hf_ents: list of dicts as produced by a transformers "ner" pipeline run
            with an aggregation strategy; each dict must contain at least the
            keys 'start', 'end', and 'entity_group'. Extra keys (e.g. 'score',
            'word') are ignored.

    Returns:
        A list of dicts with keys 'start', 'end', and 'entity' — the shape
        gr.HighlightedText expects in its "entities" field.
    """
    # Comprehension replaces the original append loop; each output dict keeps
    # only the three fields Gradio reads, renaming 'entity_group' -> 'entity'.
    return [
        {"start": ent["start"], "end": ent["end"], "entity": ent["entity_group"]}
        for ent in hf_ents
    ]


def tag(text):
    """Run the food-NER pipeline on *text* and package the result for Gradio.

    Relies on the module-level `nlp` pipeline created in the __main__ block.

    Args:
        text: the raw input string to tag.

    Returns:
        A dict with "text" (the original input) and "entities" (a list of
        span dicts), the input format of gr.HighlightedText.
    """
    # "first" aggregation merges sub-word tokens into whole-word entity spans.
    entities = convert_hf_ents_to_gradio(nlp(text, aggregation_strategy="first"))
    return {"text": text, "entities": entities}


if __name__ == "__main__":
    # Load the fine-tuned food-NER model. The tokenizer is taken from the base
    # "roberta-base" checkpoint; add_prefix_space=True is required by RoBERTa's
    # tokenizer for token-classification use.
    model_ckpt = "carolanderson/roberta-base-food-ner"
    model = AutoModelForTokenClassification.from_pretrained(model_ckpt)
    tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
    # Module-level pipeline; tag() closes over this name.
    nlp = pipeline("ner", model=model, tokenizer=tokenizer)
    
    # Long-form write-up rendered as markdown further down the page.
    with open("app_text/blog_text.md", "r") as f:
        blog_text = f.read()
        
    # Click-to-fill example inputs; each inner list maps to the single Textbox.
    examples=[
            ["Saute the onions in olive oil until browned."],
            ["Add bourbon and sweet vermouth to the shaker."],
            ["Salt the water and butter the bread."],
            ["Add salt to the water and spread butter on the bread."]]

    # Page layout, top to bottom: title, header image, intro text, the
    # input/tag UI, then the blog post with before/after screenshots.
    with gr.Blocks() as demo:
        gr.Markdown("# Extracting Food Mentions from Text")
        # 'file=' src prefix is Gradio's syntax for serving local static files.
        html = ("<div style='max-width:100%; max-height:200px; overflow:auto'>"
            + "<img src='file=app_images/featured.jpg' alt='Cookbook'>"
            + "</div>"
        )
        gr.HTML(html)
        gr.Markdown("This is a model I trained to extract food terms from text. "
                    "I fine tuned RoBERTa base on a dataset I created by labeling a set of recipes.")
        gr.Markdown("Details about the training data and training process are below.")            
        with gr.Row():
            with gr.Column():
                inp = gr.Textbox(placeholder="Enter text here...", lines=4, label="Input text")
                btn = gr.Button("Tag food")
            gr.Examples(examples, inp, label="Examples (click to use)")
        out = gr.HighlightedText(label="Predictions")
        # Wire the button: tag() returns the {"text", "entities"} dict that
        # HighlightedText renders.
        btn.click(fn=tag, inputs=inp, outputs=out)
        gr.Markdown(blog_text)
        # Screenshot: errors made by the earlier (pre-RoBERTa) model.
        html_2 = ("<div style='max-width:100%; max-height:50px; overflow:auto'>"
        + "<img src='file=app_images/salt_butter_old.png' alt='Butter and Salt (old model)'>"
        + "</div>"
        )
        gr.HTML(html_2)
        gr.Markdown("I speculated then that these kinds of errors could probably be reduced by using"
                    " contextual word embeddings, such as ELMo or BERT embeddings, or by using BERT itself "
                    "(fine-tuning it on the NER task)."
                    " That turned out to be true -- the current, RoBERTa model correctly handles these cases:")
        # Screenshot: the same sentences handled correctly by the current model.
        html_3 = ("<div style='max-width:100%; max-height:50px; overflow:auto'>"
        + "<img src='file=app_images/salt_butter_new.png' alt='Butter and Salt (new model)'>"
        + "</div>"
        )
        gr.HTML(html_3)
        gr.Markdown("To use this model yourself, see the "
                    "[model card.](https://huggingface.co/carolanderson/roberta-base-food-ner)")



        
    demo.launch()