"""Gradio demo: tokenize colloquial Arabic text and label each token.

The script loads a translation-style pipeline whose output encodes
``token:label`` pairs, aligns those pairs back onto the input text, and
renders the result as HTML inside a Gradio Blocks app.
"""

import json
from collections import deque
from html import escape

import gradio as gr
import requests
from transformers import pipeline

# JavaScript run on page load; it publishes showCard/hideCard on globalThis.
# The literal is split with implicit concatenation purely for line length;
# the resulting string is unchanged.
# NOTE(review): these helpers look up elements with ids 'hovercard',
# 'card_title' and 'card_content', but no markup with those ids is emitted
# anywhere in this file as seen -- the HTML literals below are
# whitespace-only. Confirm against the deployed version.
js = (
    " async () => { "
    "function showCard(event, title, content) { "
    "document.getElementById('hovercard').style.visibility = 'visible'; "
    "document.getElementById('card_title').innerText = title; "
    "document.getElementById('card_content').innerText = content; "
    "} "
    "function hideCard(event) { "
    "document.getElementById('hovercard').style.visibility = 'hidden'; "
    "} "
    "globalThis.showCard = showCard; "
    "globalThis.hideCard = hideCard; "
    "} "
)

LEXICON_URL = (
    "https://huggingface.co/guymorlan/TokenizerLabeller/raw/main/"
    "playaling_words.json"
)


def _parse_mapping(pred):
    """Split raw model output into ``[token, label]`` pairs.

    The output is cut on ``" = "``, then on ``"+"``, and each resulting piece
    is split on ``":"``. Only pieces that yield exactly two fields are kept.
    Pieces whose token is empty are dropped as well: an empty token would
    produce a zero-width match, which can never advance the render loop in
    ``predict``.
    """
    pieces = [piece for group in pred.split(" = ") for piece in group.split("+")]
    pairs = (piece.split(":") for piece in pieces)
    return [pair for pair in pairs if len(pair) == 2 and pair[0]]


def get_matches(text):
    """Run the model on *text* and locate each predicted token in it.

    Predicted tokens are searched for in order, scanning *text* left to
    right; a token that never occurs at or after the current position stops
    all later tokens from matching, as the scan simply runs to the end.

    Returns:
        A tuple ``(text, pred, matches)`` where ``pred`` is the raw model
        output string and ``matches`` is a list of dicts with the keys
        ``"start"``, ``"end"``, ``"match"`` and ``"lexicon"``.
    """
    pred = pipe(text, max_length=5000)[0]["translation_text"]

    # A deque gives O(1) removal from the front; an empty deque (no usable
    # pairs in the model output) simply yields no matches instead of raising.
    pending = deque(_parse_mapping(pred))
    matches = []
    i = 0
    while pending and i < len(text):
        token, label = pending[0]
        # startswith with an offset avoids copying the tail of the string.
        if text.startswith(token, i):
            end = i + len(token)
            matches.append(
                {"start": i, "end": end, "match": token, "lexicon": label}
            )
            i = end
            pending.popleft()
        else:
            i += 1
    return (text, pred, matches)


pipe = pipeline("translation", "guymorlan/TokenizerLabeller")

# Fail fast on a hung or failed download instead of parsing an error page.
r = requests.get(LEXICON_URL, timeout=30)
r.raise_for_status()
data = json.loads(r.text)


def predict(input):  # noqa: A002 - parameter name kept for compatibility
    """Render *input* as HTML, marking tokens whose label is in ``data``.

    Tokens whose label (with a trailing ``_R`` removed) is a key of ``data``
    are emitted surrounded by the marker literal; all other text is emitted
    as-is. Everything taken from the user's text is HTML-escaped because the
    result is displayed through ``gr.HTML``.
    """
    text, _pred, matches = get_matches(input)
    by_start = {m["start"]: m for m in matches}

    parts = ["\n"]
    i = 0
    while i < len(text):
        match = by_start.get(i)
        if match is None:
            parts.append(escape(text[i]))
            i += 1
            continue

        label = match["lexicon"]
        if label.endswith("_R"):
            label = label[:-2]

        surface = escape(match["match"])
        if label in data:
            parts.append(f" {surface} ")
        else:
            parts.append(surface)
        i = match["end"]

    parts.append("\n")
    parts.append(" ")
    return "".join(parts)


with gr.Blocks(theme=gr.themes.Soft(), title="Ammiya Tokenizer and Labeler") as demo:
    gr.HTML("\n\nColloquial Arabic\n\nTokenizer and Annotator")
    with gr.Row():
        with gr.Column():
            # NOTE(review): the placeholder says "English" while the examples
            # are Arabic -- confirm which is intended.
            text_input = gr.Textbox(
                label="Input", placeholder="Enter English Text", lines=1
            )
            gr.Examples(["بديش اروح معك", "معملتش اشي"], text_input)
            btn = gr.Button(label="Analyze")
        with gr.Column():
            with gr.Box():
                output_html = gr.HTML()
    btn.click(predict, inputs=[text_input], outputs=[output_html])
    text_input.submit(predict, inputs=[text_input], outputs=[output_html])
    demo.load(_js=js)

demo.launch()