File size: 2,433 Bytes
733a1a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54e9b45
733a1a0
 
 
 
 
 
 
 
a5bc7b6
 
54e9b45
 
a5bc7b6
54e9b45
733a1a0
 
a5bc7b6
733a1a0
 
 
 
 
 
 
 
 
 
bfa5f32
 
54e9b45
 
e897615
54e9b45
 
 
733a1a0
 
54e9b45
733a1a0
 
 
54e9b45
 
 
733a1a0
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import gradio as gr
from spacy import displacy

# Checkpoint shared by the tokenizer and the token-classification model.
_CHECKPOINT = "lirondos/anglicisms-spanish-mbert"

tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(_CHECKPOINT)

# "ner" pipeline built from the model and tokenizer loaded above.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Sample of the dict layout handed to displaCy in manual mode: the text,
# a list of start/end/label spans over it, and a title.
diplacy_dict_template = dict(
    text="But Google is starting from behind.",
    ents=[dict(start=4, end=10, label="ORG")],
    title=None,
)


def infer(input_text):
    """Render the borrowings detected in ``input_text`` as displaCy HTML.

    The text is run through the module-level ``nlp`` pipeline. Every
    prediction it returns becomes one highlighted span whose label is the
    raw tag reported by the pipeline.

    Args:
        input_text: The text to analyse.

    Returns:
        The markup produced by ``displacy.render`` in manual mode with
        ``page=True``, as a string.
    """
    # Copy each prediction's start/end offsets and tag into the span dicts
    # that displaCy's manual mode consumes.
    displacy_ents = [
        {
            "start": borrowing["start"],
            "end": borrowing["end"],
            "label": borrowing["entity"],
        }
        for borrowing in nlp(input_text)
    ]

    # One gradient per tag so the four labels are visually distinct.
    colors = {
        "B-ENG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
        "I-ENG": "linear-gradient(90deg, #99bfff, #a57cf0)",
        "B-OTHER": "linear-gradient(90deg, #79d0a5, #f6e395)",
        "I-OTHER": "linear-gradient(90deg, #f79a76, #fb6d6d)",
    }
    options = {"ents": ["B-ENG", "I-ENG", "B-OTHER", "I-OTHER"], "colors": colors}
    displacy_dict_template = {"text": input_text, "ents": displacy_ents, "title": None}

    # jupyter=False: without it displaCy auto-detects a notebook kernel,
    # displays the markup itself and returns None, leaving this function
    # with nothing to return to the caller.
    return displacy.render(
        displacy_dict_template,
        style="ent",
        page=True,
        manual=True,
        jupyter=False,
        options=options,
    )


# Markdown text passed to the interface as its description.
description = """This space is a demo for the paper [Detecting Unassimilated Borrowings in Spanish:
An Annotated Corpus and Approaches to Modeling](https://arxiv.org/pdf/2203.16169.pdf)

The goal of the underlying model is to detect foreign words, e.g. anglicisms, in spanish texts.
In general it has two types of tags for foreign words: *ENG* and *OTHER*. The authors used [BIO-tagging](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)),
which is why in practice you will see a *B-* or *I-* in front of the tags.
"""

# Sample sentences offered as ready-made inputs.
_EXAMPLES = [
    "Buscamos data scientist para proyecto de machine learning.",
    "Las fake news sobre la celebrity se reprodujeron por los 'mass media' en prime time.",
    "Me gusta el cine noir y el anime.",
]

# Text in, HTML out; infer() turns the former into the latter.
demo = gr.Interface(
    fn=infer,
    inputs=gr.Text(),
    outputs=gr.HTML(),
    examples=_EXAMPLES,
    title="Borrowing Detection Español",
    description=description,
)

demo.launch()