File size: 4,022 Bytes
ce54c6a
 
 
6db2364
ce54c6a
0a66247
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9927ce5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce54c6a
 
 
 
 
 
9927ce5
 
 
ce54c6a
0555443
9927ce5
 
 
 
 
 
 
 
 
0555443
9927ce5
0a66247
9927ce5
 
 
 
6db2364
9927ce5
 
 
 
af7d479
9927ce5
 
 
 
0555443
 
 
0a66247
 
 
 
 
0555443
 
ce54c6a
f660872
0a66247
 
 
8f07179
af7d479
 
0a66247
8f07179
0a66247
af7d479
0a66247
f660872
af7d479
 
9927ce5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from transformers import pipeline
import requests
import json
import gradio as gr

# Browser-side JavaScript handed to gr.Blocks through its `js` argument.
# It defines two helpers and publishes them on globalThis so the inline
# onmouseover/onmouseout/onclick handlers emitted by predict() can call them:
#   showCard(event, title, content) - makes the #hovercard element visible and
#                                     fills #card_title / #card_content.
#   hideCard(event)                 - hides the #hovercard element again.
js = """
async () => {
    function showCard(event, title, content) {
        document.getElementById('hovercard').style.visibility = 'visible';
        document.getElementById('card_title').innerText = title;
        document.getElementById('card_content').innerText = content;
    }
    function hideCard(event) {
        document.getElementById('hovercard').style.visibility = 'hidden';
    }
    globalThis.showCard = showCard;
    globalThis.hideCard = hideCard;
}
"""

def get_matches(text):
    """Label *text* with the model and align the labels to character offsets.

    The module-level ``pipe`` is run on ``text``; its ``translation_text``
    output is parsed as ``surface:lexicon`` items separated by ``+`` and
    ``" = "``. Items that do not split into exactly two fields on ``:`` are
    ignored, as are items whose surface form is empty (a zero-length match
    carries no text and cannot be rendered).

    The remaining pairs are aligned greedily, left to right: each surface
    form is located at its first occurrence at or after the end of the
    previous match. Alignment stops at the first surface form that cannot be
    found; later pairs are dropped.

    Args:
        text: The raw input string.

    Returns:
        A tuple ``(text, pred, matches)`` where ``pred`` is the raw model
        output and ``matches`` is a list of dicts with the keys ``start``,
        ``end`` (exclusive), ``match`` (surface form) and ``lexicon``.
        ``matches`` is empty when the model output contains no usable pair.
    """
    pred = pipe(text, max_length=5000)[0]["translation_text"]

    # Flatten "a:1+b:2 = c:3" into [["a", "1"], ["b", "2"], ["c", "3"]].
    pairs = []
    for group in pred.split(" = "):
        for item in group.split("+"):
            fields = item.split(":")
            if len(fields) == 2 and fields[0]:
                pairs.append(fields)

    matches = []
    cursor = 0
    for surface, lexicon in pairs:
        # str.find searches in place, avoiding a fresh slice of the tail of
        # ``text`` at every character position.
        start = text.find(surface, cursor)
        if start == -1:
            break
        cursor = start + len(surface)
        matches.append(
            {"start": start, "end": cursor, "match": surface, "lexicon": lexicon}
        )

    return (text, pred, matches)

# Hugging Face translation pipeline whose output get_matches() parses.
# Created once at import time and reused for every request.
pipe = pipeline("translation", "guymorlan/TokenizerLabeller")

# Lexicon consulted by predict(), which indexes it by lexicon id and reads the
# 'translation' and 'features' fields of each entry.
# The timeout keeps a stalled connection from hanging start-up indefinitely,
# and raise_for_status() reports an HTTP failure directly instead of letting
# it surface as a JSON decoding error on the error page.
r = requests.get(
    "https://huggingface.co/guymorlan/TokenizerLabeller/raw/main/playaling_words.json",
    timeout=30,
)
r.raise_for_status()
data = json.loads(r.text)

def predict(input):
    """Render *input* as an HTML fragment with annotated, hoverable tokens.

    ``get_matches`` aligns the model's tokens to the text. A token whose
    lexicon id (with a trailing ``_R`` removed) is a key of the module-level
    ``data`` is wrapped in a highlighted ``<span>`` whose mouse handlers call
    the page-level ``showCard`` / ``hideCard`` functions with that entry's
    ``translation`` and ``features``. Every other character is emitted as
    plain text, with spaces written as ``&nbsp;``. A hidden ``hovercard``
    element, which the handlers fill in, is appended at the end.

    All text taken from the user or from the lexicon is HTML-escaped, and the
    handler arguments are encoded as JavaScript string literals, so quotes,
    apostrophes or angle brackets cannot break or inject markup.

    Args:
        input: Raw text entered by the user.

    Returns:
        The HTML markup as a single string.
    """
    # Imported locally on purpose: the UI code in this module assigns the
    # global name ``html`` to a Gradio component, which would clobber a
    # file-level ``import html``.
    from html import escape

    text, _pred, matches = get_matches(input)
    by_start = {m["start"]: m for m in matches}

    parts = [
        """
        <div style='direction: rtl; text-align: right; font-size: 18px; font-family: Arial, sans-serif; line-height: 1.5'>"""
    ]

    i = 0
    while i < len(text):
        token = by_start.get(i)
        # A zero-length match would never advance ``i``; treat it as no match.
        if token is not None and token["end"] > i:
            lexicon = token["lexicon"]
            # Lexicon ids may carry an ``_R`` suffix that is not part of the key.
            if lexicon.endswith("_R"):
                lexicon = lexicon[:-2]

            surface = escape(token["match"])
            if lexicon in data:
                entry = data[lexicon]
                # json.dumps produces a valid JS string literal; escape() then
                # makes the whole call safe inside a quoted HTML attribute.
                call = "showCard(event, {}, {})".format(
                    json.dumps(str(entry["translation"])),
                    json.dumps(str(entry["features"])),
                )
                handler = escape(call, quote=True)
                parts.append(
                    f"""
                        <span style='background-color: #4CAF50; color: #FFFFFF; border: 1px solid #4CAF50; border-radius: 5px; font-family: "Courier New", Courier, monospace;'
                        onmouseover='{handler}'
                        onmouseout='hideCard(event)' onclick='{handler}'>{surface}</span>
                        """
                )
            else:
                parts.append(surface)
            i = token["end"]
        else:
            ch = text[i]
            parts.append("&nbsp;" if ch == " " else escape(ch))
            i += 1

    parts.append("</div>")

    parts.append(
        """
    <div id='hovercard' style='position: absolute; visibility: hidden; background: #FFFFFF; padding: 10px; 
    border: 1px solid #9E9E9E; border-radius: 5px; font-family: Arial, sans-serif;'>
        <h3 id='card_title' style='color: #000000;'></h3>
        <p id='card_content' style='color: #000000;'></p>
    </div>
    """
    )
    return "".join(parts)

# Build the Gradio interface. The `js` snippet defined above is passed to
# Blocks so that the showCard/hideCard helpers used by predict()'s markup are
# available in the page (presumably run by Gradio when the page loads).
with gr.Blocks(theme=gr.themes.Soft(), title="Ammiya Tokenizer and Labeler", js = js) as demo:
    gr.HTML("<h2><span style='color: #2563eb'>Colloquial Arabic</span></h2> Tokenizer and Annotator")
    with gr.Row():
        # Left column: text input, clickable examples and the trigger button.
        with gr.Column():
            # NOTE: this assigns the module-level name `input`, shadowing the
            # builtin of the same name for the rest of the module.
            input = gr.Textbox(label="Input", placeholder="Enter Arabic Text", lines=1)
            gr.Examples(examples=["بديش اروح معك", "معملتش اشي"], inputs=input)
            btn = gr.Button("Analyze")
        # Right column: the rendered annotation.
        with gr.Column():
            # NOTE: this assigns the module-level name `html`; a file-level
            # `import html` of the stdlib module would be shadowed by it.
            html = gr.HTML()
    # Clicking the button and submitting the textbox both run predict() on
    # the textbox value and write the result into the HTML component.
    btn.click(predict, inputs=[input], outputs=[html])
    input.submit(predict, inputs=[input], outputs=[html])

    demo.load()

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()