guymorlan committed on
Commit
9927ce5
1 Parent(s): c6866fc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -19
app.py CHANGED
@@ -20,32 +20,79 @@ async () => {
20
  }
21
  """
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  pipe = pipeline("translation", "guymorlan/TokenizerLabeller")
24
 
25
  r = requests.get("https://huggingface.co/guymorlan/TokenizerLabeller/raw/main/playaling_words.json")
26
  data = json.loads(r.text)
27
 
28
  def predict(input):
29
- out = pipe(input)[0]['translation_text']
30
- raw = out
31
- out = [x.strip() for x in out.split(" + ")]
32
 
33
  output = f"""
34
- <div style='direction: rtl; text-align: right; font-size: 18px; font-family: Arial, sans-serif; line-height: 1.5'>{raw}<br><br>"""
 
 
 
 
 
 
 
 
 
35
 
36
- for o in out:
37
- oo = [x.strip() for x in o.split("+")]
38
- output += "<span style='background-color: #E0E0E0; border-radius: 5px; padding: 5px; margin-right: 5px; display: inline-block;'>"
39
- for ooo in oo:
40
- if ooo in data:
41
  output += f"""
42
- <span style='background-color: #4CAF50; color: #FFFFFF; border: 1px solid #4CAF50; border-radius: 5px; padding: 2px; margin-right: 2px; font-family: "Courier New", Courier, monospace;'
43
- onmouseover='showCard(event, "{data[ooo]['translation']}", "{data[ooo]['features']}")'
44
- onmouseout='hideCard(event)' onclick='showCard(event, "{data[ooo]['translation']}", "{data[ooo]['features']}")'>{data[ooo]['word']}</span>
45
- """
46
  else:
47
- output += ooo
48
- output += "</span> "
 
 
 
 
 
 
 
 
49
  output += "</div>"
50
 
51
  output += """
@@ -57,12 +104,12 @@ def predict(input):
57
  """
58
  return output
59
 
60
- with gr.Blocks(title="Ammiya Tokenizer and Annotator") as demo:
61
  gr.HTML("<h2><span style='color: #2563eb'>Colloquial Arabic</span></h2> Tokenizer and Annotator")
62
  with gr.Row():
63
  with gr.Column():
64
  input = gr.Textbox(label="Input", placeholder="Enter English Text", lines=1)
65
- gr.Examples(["بديش اروح معك", "مكنتش هون قبل ما جيت"], input)
66
  btn = gr.Button(label="Analyze")
67
  with gr.Column():
68
  with gr.Box():
@@ -71,5 +118,4 @@ with gr.Blocks(title="Ammiya Tokenizer and Annotator") as demo:
71
  input.submit(predict, inputs = [input], outputs=[html])
72
 
73
  demo.load(_js=js)
74
- demo.launch()
75
-
 
20
  }
21
  """
22
 
23
def _parse_mapping(pred):
    """Split raw model output into ``[surface, lexicon]`` pairs.

    The output is split on ``" = "``, each piece on ``"+"``, and each
    resulting segment on ``":"``.  Only segments that yield exactly two
    parts are kept, and segments whose surface part is empty are dropped:
    an empty surface would produce a zero-length match, which the caller
    cannot step over.
    """
    pairs = []
    for piece in pred.split(" = "):
        for segment in piece.split("+"):
            parts = segment.split(":")
            if len(parts) == 2 and parts[0]:
                pairs.append(parts)
    return pairs


def get_matches(text):
    """Run the labelling model on *text* and align its output to *text*.

    Args:
        text: The input string passed to the module-level ``pipe``.

    Returns:
        A tuple ``(text, pred, matches)`` where ``pred`` is the raw
        ``translation_text`` produced by the pipeline and ``matches`` is a
        list of dicts with keys ``"start"``, ``"end"`` (character offsets
        into ``text``, end exclusive), ``"match"`` (the surface string) and
        ``"lexicon"`` (the label paired with it in the model output).
        ``matches`` is empty when the output contains no usable pair.
    """
    pred = pipe(text, max_length=5000)[0]["translation_text"]

    matches = []
    pos = 0
    # Pairs are consumed in order; each one is anchored at its first
    # occurrence at or after the end of the previous match.
    for surface, lexicon in _parse_mapping(pred):
        start = text.find(surface, pos)
        if start == -1:
            # The surface form does not occur in the rest of the text, so
            # alignment stops here (later pairs are not attempted).
            break
        end = start + len(surface)
        matches.append(
            {"start": start, "end": end, "match": surface, "lexicon": lexicon}
        )
        pos = end

    return (text, pred, matches)
55
+
56
  pipe = pipeline("translation", "guymorlan/TokenizerLabeller")
57
 
58
  r = requests.get("https://huggingface.co/guymorlan/TokenizerLabeller/raw/main/playaling_words.json")
59
  data = json.loads(r.text)
60
 
61
  def predict(input):
62
+ text, pred, matches = get_matches(input)
63
+
64
+ matches = {x["start"]: x for x in matches}
65
 
66
  output = f"""
67
+ <div style='direction: rtl; text-align: right; font-size: 18px; font-family: Arial, sans-serif; line-height: 1.5'>"""
68
+
69
+ i = 0
70
+ while i < len(text):
71
+ if i in matches:
72
+
73
+ match = matches[i]["lexicon"]
74
+ # if match ends with _R, remove _R suffix
75
+ if match.endswith("_R"):
76
+ match = match[:-2]
77
 
78
+ if match in data:
79
+ # match = matches[i]["lexicon"]
 
 
 
80
  output += f"""
81
+ <span style='background-color: #4CAF50; color: #FFFFFF; border: 1px solid #4CAF50; border-radius: 5px; font-family: "Courier New", Courier, monospace;'
82
+ onmouseover='showCard(event, "{data[match]['translation']}", "{data[match]['features']}")'
83
+ onmouseout='hideCard(event)' onclick='showCard(event, "{data[match]['translation']}", "{data[match]['features']}")'>{matches[i]['match']}</span>
84
+ """
85
  else:
86
+ output += matches[i]["match"]
87
+ i = matches[i]["end"]
88
+ else:
89
+ print(f"'{text[i]}'")
90
+ if text[i] == " ":
91
+ output += "&nbsp"
92
+ else:
93
+ output += text[i]
94
+ i += 1
95
+
96
  output += "</div>"
97
 
98
  output += """
 
104
  """
105
  return output
106
 
107
+ with gr.Blocks(theme=gr.themes.Soft(), title="Ammiya Tokenizer and Labeler") as demo:
108
  gr.HTML("<h2><span style='color: #2563eb'>Colloquial Arabic</span></h2> Tokenizer and Annotator")
109
  with gr.Row():
110
  with gr.Column():
111
  input = gr.Textbox(label="Input", placeholder="Enter English Text", lines=1)
112
+ gr.Examples(["بديش اروح معك"], input)
113
  btn = gr.Button(label="Analyze")
114
  with gr.Column():
115
  with gr.Box():
 
118
  input.submit(predict, inputs = [input], outputs=[html])
119
 
120
  demo.load(_js=js)
121
+ demo.launch()