File size: 7,656 Bytes
bc565d4
 
 
 
 
 
d04bf10
32163e9
bc565d4
c0dee52
 
bc565d4
32163e9
 
 
 
 
bc565d4
d04bf10
 
 
 
 
 
 
 
 
 
c0dee52
d04bf10
 
 
 
 
 
c0dee52
 
 
 
 
 
bc565d4
d04bf10
 
c0dee52
 
 
 
bc565d4
 
d04bf10
 
c0dee52
 
 
 
 
 
 
 
bc565d4
 
d04bf10
 
 
c0dee52
 
 
 
 
 
bc565d4
 
d04bf10
 
c0dee52
32163e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0dee52
 
 
bc565d4
 
32163e9
 
 
 
 
 
 
 
bc565d4
 
 
d04bf10
 
32163e9
 
 
bc565d4
 
 
c0dee52
bc565d4
 
32163e9
bc565d4
 
 
 
d04bf10
 
 
 
 
bc565d4
32163e9
 
 
c0dee52
 
32163e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d04bf10
32163e9
d04bf10
32163e9
 
 
d04bf10
bc565d4
c0dee52
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import spacy
from spacy import displacy
import random
from spacy.tokens import Span
import gradio as gr

# Base model name without a size suffix; the handlers in this file append
# "_sm" or "_md" to the selected model name before calling spacy.load.
DEFAULT_MODEL = "en_core_web"
# Sentence pre-filled in the input textbox (also the "en" sample text).
DEFAULT_TEXT = "Apple is looking at buying U.K. startup for $1 billion."
# Token attributes offered in the "Tokens" tab; also used as the table headers.
DEFAULT_TOK_ATTR = ['idx', 'text', 'pos_', 'lemma_', 'shape_', 'dep_']
# Entity labels offered (and all pre-selected) in the "Entity" tab.
DEFAULT_ENTS = ['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY',
                'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']

# One sample sentence per language code. get_text() looks a sentence up by the
# part of the selected model name that precedes the first underscore.
texts = {"en": DEFAULT_TEXT, "ca": "Apple està buscant comprar una startup del Regne Unit per mil milions de dòlars", "da": "Apple overvejer at købe et britisk startup for 1 milliard dollar.", "de": "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
         "el": "Η άνιση κατανομή του πλούτου και του εισοδήματος, η οποία έχει λάβει τρομερές διαστάσεις, δεν δείχνει τάσεις βελτίωσης.", "es": "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares.", "fi": "Itseajavat autot siirtävät vakuutusvastuun autojen valmistajille", "fr": "Apple cherche à acheter une start-up anglaise pour 1 milliard de dollars", "it": "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
         "ja": "アップルがイギリスの新興企業を10億ドルで購入を検討", "ko": "애플이 영국의 스타트업을 10억 달러에 인수하는 것을 알아보고 있다.", "lt": "Jaunikis pirmąją vestuvinę naktį iškeitė į areštinės gultą", "nb": "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar.", "nl": "Apple overweegt om voor 1 miljard een U.K. startup te kopen",
         "pl": "Poczuł przyjemną woń mocnej kawy.", "pt": "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares", "ro": "Apple plănuiește să cumpere o companie britanică pentru un miliard de dolari", "ru": "Apple рассматривает возможность покупки стартапа из Соединённого Королевства за $1 млрд", "sv": "Apple överväger att köpa brittisk startup för 1 miljard dollar.", "zh": "作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。"}


def get_all_models():
    """Return the base spaCy model names listed in ``requirements.txt``.

    Every non-comment line that mentions ``huggingface.co`` is split on ``/``.
    The fifth segment is taken as the package name and reduced to its first
    three underscore-separated parts (``en_core_web_sm`` -> ``en_core_web``).
    Names are de-duplicated while keeping first-seen order.

    Commented-out lines and lines too short to contain a fifth segment are
    skipped rather than raising ``IndexError`` or being offered as models.

    Returns:
        list[str]: Unique base model names, in file order.

    Raises:
        OSError: If ``requirements.txt`` cannot be opened.
    """
    # A dict keeps insertion order and gives O(1) duplicate detection.
    seen = {}
    with open("requirements.txt", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if line.startswith("#") or "huggingface.co" not in line:
                continue
            segments = line.split("/")
            if len(segments) < 5:
                # Mentions the host but carries no package path segment.
                continue
            base_name = "_".join(segments[4].split("_")[:3])
            if base_name:
                seen.setdefault(base_name, None)
    return list(seen)


# Base model names offered in the UI dropdown; computed once at import time
# by reading requirements.txt.
models = get_all_models()


def dependency(text, col_punct, col_phrase, compact, model):
    """Render the dependency parse of ``text`` as displaCy HTML.

    The small ("_sm") pipeline for ``model`` is loaded on every call. The three
    flags are forwarded to displaCy as ``collapse_punct``,
    ``collapse_phrases`` and ``compact``.

    Returns:
        The markup produced by ``displacy.render`` with ``style="dep"``.
    """
    pipeline = spacy.load(model + "_sm")
    parsed = pipeline(text)
    render_options = {
        "compact": compact,
        "collapse_phrases": col_phrase,
        "collapse_punct": col_punct,
    }
    return displacy.render(parsed, style="dep", options=render_options)


def entity(text, ents, model):
    """Render the named entities of ``text`` as displaCy HTML.

    The small ("_sm") pipeline for ``model`` is loaded on every call, and
    ``ents`` is passed to displaCy as the ``ents`` option.

    Returns:
        The markup produced by ``displacy.render`` with ``style="ent"``.
    """
    pipeline = spacy.load(model + "_sm")
    return displacy.render(
        pipeline(text),
        style="ent",
        options={"ents": ents},
    )


def token(text, attributes, model):
    """Build a table of token attributes for ``text``.

    The small ("_sm") pipeline for ``model`` is loaded on every call.

    Returns:
        list[list]: One row per token, holding the value of each name in
        ``attributes`` (read with ``getattr``) in the given order.
    """
    pipeline = spacy.load(model + "_sm")
    return [
        [getattr(tok, attr_name) for attr_name in attributes]
        for tok in pipeline(text)
    ]


def vectors(text, model):
    """Score the similarity of two randomly chosen pieces of ``text``.

    Candidates are the noun chunks of the parsed text plus every token that is
    neither a stop word nor tagged ``PUNCT`` / ``PROPN``. The medium ("_md")
    pipeline for ``model`` is loaded on every call.

    Edge cases handled here:
      * No candidates at all: returns ``(0.0, "", "")`` instead of letting
        ``random`` raise ``IndexError`` on an empty sequence.
      * Two or more candidates: two *different* candidates are drawn, so the
        score is never a candidate trivially compared with itself.
      * Languages whose pipeline has no noun-chunk iterator: noun chunks are
        skipped (``Doc.noun_chunks`` raises ``NotImplementedError`` there).

    Returns:
        tuple: ``(similarity rounded to 2 decimals, first text, second text)``.
    """
    nlp = spacy.load(model + "_md")
    doc = nlp(text)

    try:
        noun_chunks = list(doc.noun_chunks)
    except NotImplementedError:
        noun_chunks = []

    words = [
        tok for tok in doc
        if not tok.is_stop and tok.pos_ not in ("PUNCT", "PROPN")
    ]
    candidates = noun_chunks + words

    if not candidates:
        return 0.0, "", ""
    if len(candidates) == 1:
        # Only one thing to pick: comparing it with itself is all we can do.
        first = second = candidates[0]
    else:
        first, second = random.sample(candidates, k=2)

    return round(first.similarity(second), 2), first.text, second.text


def _locate_span(doc, phrase, label):
    """Return a labelled ``Span`` of ``doc`` matching ``phrase``, or ``None``."""
    phrase = (phrase or "").strip()
    if not phrase:
        return None

    # Preferred: a literal occurrence of the phrase that lines up with token
    # boundaries. char_span returns None for misaligned offsets, so keep
    # scanning later occurrences (e.g. "is" inside "This" is skipped).
    haystack = doc.text
    pos = haystack.find(phrase)
    while pos != -1:
        found = doc.char_span(pos, pos + len(phrase), label=label)
        if found is not None:
            return found
        pos = haystack.find(phrase, pos + 1)

    # Fallback: stretch from the last token equal to the phrase's first word
    # to the last token equal to its last word, but only when that yields a
    # non-empty, forward range.
    words = phrase.split(" ")
    start = end = None
    for i, tok in enumerate(doc):
        if tok.text == words[0]:
            start = i
        if tok.text == words[-1]:
            end = i + 1
    if start is not None and end is not None and start < end:
        return Span(doc, start, end, label)
    return None


def span(text, span1, span2, label1, label2, model):
    """Render up to two labelled spans of ``text`` as displaCy HTML.

    The small ("_sm") pipeline for ``model`` is loaded on every call.

    When ``span1`` is non-empty, ``span1`` and ``span2`` are each located in
    the parsed text and labelled ``label1`` / ``label2``. A phrase that cannot
    be located, or is empty, is left out instead of producing a zero-length or
    inverted span (the latter makes ``Span`` raise ``IndexError``).

    When ``span1`` is empty, ``span2`` is ignored and two default spans are
    shown: the first half of the tokens (``label1``) and the first token
    (``label2``). Bounds are clamped so very short or empty texts do not raise.

    Returns:
        The markup produced by ``displacy.render`` with ``style="span"`` for
        the spans stored under the ``"sc"`` key.
    """
    nlp = spacy.load(model + "_sm")
    doc = nlp(text)
    label1 = label1 or ""
    label2 = label2 or ""

    if span1:
        candidates = [
            _locate_span(doc, span1, label1),
            _locate_span(doc, span2, label2),
        ]
    else:
        n_tokens = len(doc)
        candidates = [
            Span(doc, 0, round(n_tokens / 2), label1),
            Span(doc, 0, min(1, n_tokens), label2),
        ]

    # Drop misses and zero-length spans; there is nothing to highlight in them.
    doc.spans["sc"] = [s for s in candidates if s is not None and len(s) > 0]

    html = displacy.render(doc, style="span")
    return html


def get_text(model):
    """Return the sample sentence for the language of ``model``.

    The language code is the part of ``model`` before the first underscore
    (``"en_core_web"`` -> ``"en"``). If ``model`` is empty, ``DEFAULT_MODEL``
    is used. If ``texts`` has no sentence for the language, ``DEFAULT_TEXT``
    is returned instead of raising ``KeyError``.

    Returns:
        str: The sample sentence to place in the input textbox.
    """
    lang = (model or DEFAULT_MODEL).split("_")[0]
    return texts.get(lang, DEFAULT_TEXT)


# Build the Gradio UI: shared model/text inputs on top, then one tab per
# visualisation. Components are laid out in the order they are created.
demo = gr.Blocks()

with demo:
    # Inputs shared by every tab.
    model_input = gr.Dropdown(
        choices=models, value=DEFAULT_MODEL, interactive=True)
    text_button = gr.Button("Get new text")
    text_input = gr.Textbox(value=DEFAULT_TEXT, interactive=True)
    button = gr.Button("Generate")
    with gr.Tabs():
        with gr.TabItem("Dependency"):
            # Rendering options passed to dependency().
            col_punct = gr.Checkbox(label="Collapse Punctuation", value=True)
            col_phrase = gr.Checkbox(label="Collapse Phrases", value=True)
            compact = gr.Checkbox(label="Compact", value=True)
            depen_output = gr.HTML()

        with gr.TabItem("Entity"):
            # Entity labels to show; all are selected initially.
            entity_input = gr.CheckboxGroup(DEFAULT_ENTS, value=DEFAULT_ENTS)
            entity_output = gr.HTML()
        with gr.TabItem("Tokens"):
            with gr.Column():
                # Token attributes to list; all are selected initially.
                tok_input = gr.CheckboxGroup(
                    DEFAULT_TOK_ATTR, value=DEFAULT_TOK_ATTR)
                tok_output = gr.Dataframe(
                    headers=DEFAULT_TOK_ATTR, overflow_row_behaviour="paginate")
        with gr.TabItem("Similarity"):
            # Hard-coded initial values; all three boxes are overwritten by
            # the outputs of vectors() on "Generate".
            with gr.Row():
                sim_text1 = gr.Textbox(value="David Bowie", label="Chosen")
                sim_text2 = gr.Textbox(value="the US", label="Chosen")
            sim_output = gr.Textbox(value="0.09", label="Similarity Score")
        with gr.TabItem("Spans"):
            with gr.Column():
                # Span text and label pairs passed to span().
                with gr.Row():
                    span1 = gr.Textbox(label="Span 1")
                    label1 = gr.Textbox(value="Label 1",
                                        label="Label for Span 1")
                with gr.Row():
                    span2 = gr.Textbox(label="Span 2")
                    label2 = gr.Textbox(value="Label 2",
                                        label="Label for Span 2")
                with gr.Row():
                    span_output = gr.HTML()
    # "Get new text" replaces the textbox content with the sample sentence
    # for the selected model.
    text_button.click(get_text, inputs=[model_input], outputs=text_input)
    # "Generate" is wired to all five handlers, one per tab.
    button.click(dependency, inputs=[
        text_input, col_punct, col_phrase, compact, model_input], outputs=depen_output)
    button.click(
        entity, inputs=[text_input, entity_input, model_input], outputs=entity_output)
    button.click(
        token, inputs=[text_input, tok_input, model_input], outputs=tok_output)
    button.click(vectors, inputs=[text_input, model_input], outputs=[
        sim_output, sim_text1, sim_text2])
    button.click(
        span, inputs=[text_input, span1, span2, label1, label2, model_input], outputs=span_output)

# Launched unconditionally at module level (there is no __main__ guard).
demo.launch()