AMR-KELEG committed on
Commit
e42b2b2
1 Parent(s): ab3e62e

Adapt the demo to the Franco Arabic Transliteration

Browse files
Files changed (4) hide show
  1. README.md +2 -1
  2. app.py +19 -158
  3. constants.py +0 -4
  4. requirements.txt +2 -2
README.md CHANGED
@@ -1,10 +1,11 @@
1
  ---
2
- title: ALDi
3
  emoji: ☕
4
  colorFrom: indigo
5
  colorTo: purple
6
  sdk: streamlit
7
  sdk_version: 1.27.2
 
8
  app_file: app.py
9
  pinned: true
10
  tags: [Arabic]
 
1
  ---
2
+ title: Franco Arabic Transliterator
3
  emoji: ☕
4
  colorFrom: indigo
5
  colorTo: purple
6
  sdk: streamlit
7
  sdk_version: 1.27.2
8
+ python_version: 3.7
9
  app_file: app.py
10
  pinned: true
11
  tags: [Arabic]
app.py CHANGED
@@ -1,170 +1,31 @@
1
  # Hint: this cheatsheet is magic! https://cheat-sheet.streamlit.app/
2
- import constants
3
- import pandas as pd
4
  import streamlit as st
5
- import matplotlib.pyplot as plt
6
- from transformers import BertForSequenceClassification, AutoTokenizer
7
-
8
- import altair as alt
9
- from altair import X, Y, Scale
10
- import base64
11
-
12
- import re
13
-
14
-
15
- def preprocess_text(arabic_text):
16
- """Apply preprocessing to the given Arabic text.
17
-
18
- Args:
19
- arabic_text: The Arabic text to be preprocessed.
20
-
21
- Returns:
22
- The preprocessed Arabic text.
23
- """
24
- no_urls = re.sub(
25
- r"(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b",
26
- "",
27
- arabic_text,
28
- flags=re.MULTILINE,
29
- )
30
- no_english = re.sub(r"[a-zA-Z]", "", no_urls)
31
-
32
- return no_english
33
-
34
-
35
- @st.cache_data
36
- def render_svg(svg):
37
- """Renders the given svg string."""
38
- b64 = base64.b64encode(svg.encode("utf-8")).decode("utf-8")
39
- html = rf'<p align="center"> <img src="data:image/svg+xml;base64,{b64}"/> </p>'
40
- c = st.container()
41
- c.write(html, unsafe_allow_html=True)
42
-
43
-
44
- @st.cache_data
45
- def convert_df(df):
46
- # IMPORTANT: Cache the conversion to prevent computation on every rerun
47
- return df.to_csv(index=None).encode("utf-8")
48
 
 
49
 
50
  @st.cache_resource
51
- def load_model(model_name):
52
- model = BertForSequenceClassification.from_pretrained(model_name)
53
- return model
54
-
55
-
56
- tokenizer = AutoTokenizer.from_pretrained(constants.MODEL_NAME)
57
- model = load_model(constants.MODEL_NAME)
58
-
59
-
60
- def compute_ALDi(sentences):
61
- """Computes the ALDi score for the given sentences.
62
 
63
- Args:
64
- sentences: A list of Arabic sentences.
65
 
66
- Returns:
67
- A list of ALDi scores for the given sentences.
68
- """
69
- progress_text = "Computing ALDi..."
70
- my_bar = st.progress(0, text=progress_text)
71
 
72
- BATCH_SIZE = 4
73
- output_logits = []
74
 
75
- preprocessed_sentences = [preprocess_text(s) for s in sentences]
 
 
76
 
77
- for first_index in range(0, len(preprocessed_sentences), BATCH_SIZE):
78
- inputs = tokenizer(
79
- preprocessed_sentences[first_index : first_index + BATCH_SIZE],
80
- return_tensors="pt",
81
- padding=True,
82
- )
83
- outputs = model(**inputs).logits.reshape(-1).tolist()
84
- output_logits = output_logits + [max(min(o, 1), 0) for o in outputs]
85
- my_bar.progress(
86
- min((first_index + BATCH_SIZE) / len(preprocessed_sentences), 1),
87
- text=progress_text,
88
- )
89
- my_bar.empty()
90
- return output_logits
91
-
92
-
93
- render_svg(open("assets/ALDi_logo.svg").read())
94
-
95
- tab1, tab2 = st.tabs(["Input a Sentence", "Upload a File"])
96
-
97
- with tab1:
98
- sent = st.text_input(
99
- "Arabic Sentence:", placeholder="Enter an Arabic sentence.", on_change=None
100
  )
101
 
102
- # TODO: Check if this is needed!
103
- clicked = st.button("Submit")
104
-
105
- if sent:
106
- ALDi_score = compute_ALDi([sent])[0]
107
-
108
- ORANGE_COLOR = "#FF8000"
109
- fig, ax = plt.subplots(figsize=(8, 1))
110
- fig.patch.set_facecolor("none")
111
- ax.set_facecolor("none")
112
-
113
- ax.spines["left"].set_color(ORANGE_COLOR)
114
- ax.spines["bottom"].set_color(ORANGE_COLOR)
115
- ax.tick_params(axis="x", colors=ORANGE_COLOR)
116
-
117
- ax.spines[["right", "top"]].set_visible(False)
118
-
119
- ax.barh(y=[0], width=[ALDi_score], color=ORANGE_COLOR)
120
- ax.set_xlim(0, 1)
121
- ax.set_ylim(-1, 1)
122
- ax.set_title(f"ALDi score is: {round(ALDi_score, 3)}", color=ORANGE_COLOR)
123
- ax.get_yaxis().set_visible(False)
124
- ax.set_xlabel("ALDi score", color=ORANGE_COLOR)
125
- st.pyplot(fig)
126
-
127
- print(sent)
128
- with open("logs.txt", "a") as f:
129
- f.write(sent + "\n")
130
-
131
- with tab2:
132
- file = st.file_uploader("Upload a file", type=["txt"])
133
- if file is not None:
134
- df = pd.read_csv(file, sep="\t", header=None)
135
- df.columns = ["Sentence"]
136
- df.reset_index(drop=True, inplace=True)
137
-
138
- # TODO: Run the model
139
- df["ALDi"] = compute_ALDi(df["Sentence"].tolist())
140
-
141
- # A horizontal rule
142
- st.markdown("""---""")
143
-
144
- chart = (
145
- alt.Chart(df.reset_index())
146
- .mark_area(color="darkorange", opacity=0.5)
147
- .encode(
148
- x=X(field="index", title="Sentence Index"),
149
- y=Y("ALDi", scale=Scale(domain=[0, 1])),
150
- )
151
- )
152
- st.altair_chart(chart.interactive(), use_container_width=True)
153
-
154
- col1, col2 = st.columns([4, 1])
155
-
156
- with col1:
157
- # Display the output
158
- st.table(
159
- df,
160
- )
161
-
162
- with col2:
163
- # Add a download button
164
- csv = convert_df(df)
165
- st.download_button(
166
- label=":file_folder: Download predictions as CSV",
167
- data=csv,
168
- file_name="ALDi_scores.csv",
169
- mime="text/csv",
170
- )
 
1
  # Hint: this cheatsheet is magic! https://cheat-sheet.streamlit.app/
 
 
2
  import streamlit as st
3
+ import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ from franco_arabic_transliterator.franco_arabic_transliterator import FrancoArabicTransliterator
6
 
7
  @st.cache_resource
8
+ def load_model():
9
+ return FrancoArabicTransliterator()
 
 
 
 
 
 
 
 
 
10
 
11
+ transliterator = load_model()
 
12
 
13
+ sent = st.text_input(
14
+ "Franco Arabic (Arabizi) Sentence:", placeholder="Enter an Arabizi sentence.", on_change=None
15
+ )
 
 
16
 
17
+ # TODO: Check if this is needed!
18
+ clicked = st.button("Submit")
19
 
20
+ if sent:
21
+ lexicon_transliteration = transliterator.transliterate(sent, method="lexicon")
22
+ lm_transliteration = transliterator.transliterate(sent, method="language-model")
23
 
24
+ df = pd.DataFrame(
25
+ {"method": ["Lexicon", "Language Model"],
26
+ "transliteration": [lexicon_transliteration, lm_transliteration]})
27
+ st.table(
28
+ df,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  )
30
 
31
+ print(sent)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
constants.py DELETED
@@ -1,4 +0,0 @@
1
- CHOICE_TEXT = "Input Text"
2
- CHOICE_FILE = "Upload File"
3
- TITLE = "ALDi: Arabic Level of Dialectness"
4
- MODEL_NAME = "AMR-KELEG/Sentence-ALDi"
 
 
 
 
 
requirements.txt CHANGED
@@ -1,2 +1,2 @@
1
- transformers
2
- torch
 
1
+ franco-arabic-transliterator
2
+ pandas