Spaces:

guymorlan
/

TokenizerLabeller

Sleeping

File size: 4,022 Bytes

from transformers import pipeline
import requests
import json
import gradio as gr

# Client-side script passed to gr.Blocks through its ``js`` argument.
# It defines showCard/hideCard and publishes them on globalThis so that the
# inline onmouseover/onmouseout/onclick handlers emitted by predict() can
# call them by name.
#   * showCard fills the #card_title / #card_content elements and makes the
#     #hovercard element visible.
#   * hideCard hides #hovercard again.
# The ``event`` argument is accepted by both functions but is not used.
js = """
async () => {
    function showCard(event, title, content) {
        document.getElementById('hovercard').style.visibility = 'visible';
        document.getElementById('card_title').innerText = title;
        document.getElementById('card_content').innerText = content;
    }
    function hideCard(event) {
        document.getElementById('hovercard').style.visibility = 'hidden';
    }
    globalThis.showCard = showCard;
    globalThis.hideCard = hideCard;
}
"""

def get_matches(text):
    """Run the labeller on *text* and align its labelled pieces to the input.

    The model output is parsed as groups separated by ``" = "``, each group
    being ``"+"``-separated pieces of the form ``surface:lexicon``. Pieces
    that do not split into exactly two ``:``-separated fields are ignored, as
    are pieces whose surface is empty (a zero-length match carries no text
    and would stall any consumer that advances by ``end``).

    Each remaining piece is located in *text* by searching forward from the
    end of the previous match, so matches are ordered and never overlap.
    Alignment stops at the first piece that cannot be found.

    Args:
        text: The raw input string to analyse.

    Returns:
        A tuple ``(text, pred, matches)`` where ``pred`` is the raw model
        output string and ``matches`` is a list of dicts with the keys
        ``"start"``, ``"end"`` (exclusive), ``"match"`` (the surface text)
        and ``"lexicon"`` (the label attached to that surface).
    """
    pred = pipe(text, max_length=5000)[0]["translation_text"]

    # Flatten "a:x+b:y = c:z" into [["a", "x"], ["b", "y"], ["c", "z"]].
    pairs = []
    for group in pred.split(" = "):
        for piece in group.split("+"):
            fields = piece.split(":")
            if len(fields) == 2 and fields[0]:
                pairs.append(fields)

    matches = []
    cursor = 0
    for surface, lexicon in pairs:
        # First occurrence at or after the cursor; equivalent to scanning
        # character by character, without re-slicing the text each step.
        start = text.find(surface, cursor)
        if start == -1:
            # The remaining pieces cannot be aligned once one is missing.
            break
        end = start + len(surface)
        matches.append(
            {"start": start, "end": end, "match": surface, "lexicon": lexicon}
        )
        cursor = end

    return (text, pred, matches)

# Load the labelling model once at import time; get_matches() calls it for
# every request.
pipe = pipeline("translation", "guymorlan/TokenizerLabeller")

# Download the lexicon that predict() uses to look up the 'translation' and
# 'features' shown for each label. The timeout keeps start-up from hanging
# forever on a stalled connection, and raise_for_status() turns an HTTP error
# into a clear exception instead of a JSON decode failure on an error page.
r = requests.get(
    "https://huggingface.co/guymorlan/TokenizerLabeller/raw/main/playaling_words.json",
    timeout=30,
)
r.raise_for_status()
data = json.loads(r.text)

def predict(input):
    """Render *input* as an HTML fragment with hoverable, labelled tokens.

    The text is aligned with the model output via ``get_matches``. Every
    matched token whose label (with a trailing ``_R`` removed) is a key of
    the module-level ``data`` lexicon is wrapped in a highlighted ``<span>``
    whose mouse handlers call the page-level ``showCard`` / ``hideCard``
    functions with that entry's ``translation`` and ``features``. Matched
    tokens without a lexicon entry, and unmatched characters, are emitted as
    plain text (spaces as ``&nbsp;``).

    All text placed into the markup is HTML-escaped, and the values handed to
    ``showCard`` are encoded as JavaScript string literals before being
    attribute-escaped, so quotes or markup in the input or in the lexicon
    cannot break the generated HTML or the inline handlers.

    Args:
        input: The text to analyse. The name shadows the builtin but is kept
            so existing callers keep working.

    Returns:
        An HTML string: the annotated right-to-left text followed by the
        hidden ``#hovercard`` element that the handlers fill in.
    """
    # Imported here rather than at file level: the module-level name ``html``
    # is rebound to a Gradio component by the UI block further down, which
    # would shadow the standard-library module at call time.
    from html import escape

    text, _pred, matches = get_matches(input)

    # Index matches by start offset. Zero-length matches are dropped because
    # jumping to their ``end`` would never advance the cursor below.
    by_start = {m["start"]: m for m in matches if m["end"] > m["start"]}

    def js_arg(value):
        """Return *value* as a JS string literal safe inside an HTML attribute."""
        return escape(json.dumps(str(value)), quote=True)

    parts = ["""
        <div style='direction: rtl; text-align: right; font-size: 18px; font-family: Arial, sans-serif; line-height: 1.5'>"""]

    i = 0
    while i < len(text):
        match = by_start.get(i)
        if match is None:
            parts.append("&nbsp;" if text[i] == " " else escape(text[i]))
            i += 1
            continue

        lexicon = match["lexicon"]
        # Labels may carry an "_R" suffix that is not part of the lexicon key.
        if lexicon.endswith("_R"):
            lexicon = lexicon[:-2]

        token = escape(match["match"])
        if lexicon in data:
            entry = data[lexicon]
            call = (
                f"showCard(event, {js_arg(entry['translation'])}, "
                f"{js_arg(entry['features'])})"
            )
            parts.append(f"""
                        <span style='background-color: #4CAF50; color: #FFFFFF; border: 1px solid #4CAF50; border-radius: 5px; font-family: "Courier New", Courier, monospace;' 
                        onmouseover='{call}' 
                        onmouseout='hideCard(event)' onclick='{call}'>{token}</span>
                        """)
        else:
            parts.append(token)
        i = match["end"]

    parts.append("</div>")

    parts.append("""
    <div id='hovercard' style='position: absolute; visibility: hidden; background: #FFFFFF; padding: 10px; 
    border: 1px solid #9E9E9E; border-radius: 5px; font-family: Arial, sans-serif;'>
        <h3 id='card_title' style='color: #000000;'></h3>
        <p id='card_content' style='color: #000000;'></p>
    </div>
    """)
    return "".join(parts)

# Build the UI. The ``js`` string defined above is handed to gr.Blocks so the
# showCard/hideCard helpers used by predict()'s markup exist on the page.
# NOTE: ``with`` does not open a new scope, so ``input`` and ``html`` below
# become module-level names that shadow the builtin ``input`` and any
# module-level ``html`` import.
with gr.Blocks(theme=gr.themes.Soft(), title="Ammiya Tokenizer and Labeler", js = js) as demo:
    gr.HTML("<h2><span style='color: #2563eb'>Colloquial Arabic</span></h2> Tokenizer and Annotator")
    with gr.Row():
        # Left column: text input, clickable examples and the trigger button.
        with gr.Column():
            input = gr.Textbox(label="Input", placeholder="Enter Arabic Text", lines=1)
            gr.Examples(examples=["بديش اروح معك", "معملتش اشي"], inputs=input)
            btn = gr.Button("Analyze")
        # Right column: receives the HTML fragment returned by predict().
        with gr.Column():
            html = gr.HTML()
    # Both clicking the button and submitting the textbox run predict().
    btn.click(predict, inputs=[input], outputs=[html])
    input.submit(predict, inputs=[input], outputs=[html])

    # NOTE(review): load() is called with no function or components, so it
    # presumably registers nothing useful — confirm whether it is needed.
    demo.load()

# Start the app only when the file is executed directly.
if __name__ == "__main__":
    demo.launch()