AMR-KELEG committed on
Commit
ab3e62e
1 Parent(s): c464f06

Apply black

Browse files
Files changed (1) hide show
  1. app.py +38 -5
app.py CHANGED
@@ -9,6 +9,28 @@ import altair as alt
9
  from altair import X, Y, Scale
10
  import base64
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  @st.cache_data
14
  def render_svg(svg):
@@ -36,22 +58,33 @@ model = load_model(constants.MODEL_NAME)
36
 
37
 
38
  def compute_ALDi(sentences):
39
- # TODO: Perform inference in batches
 
 
 
 
 
 
 
40
  progress_text = "Computing ALDi..."
41
  my_bar = st.progress(0, text=progress_text)
42
 
43
  BATCH_SIZE = 4
44
  output_logits = []
45
- for first_index in range(0, len(sentences), BATCH_SIZE):
 
 
 
46
  inputs = tokenizer(
47
- sentences[first_index : first_index + BATCH_SIZE],
48
  return_tensors="pt",
49
  padding=True,
50
  )
51
  outputs = model(**inputs).logits.reshape(-1).tolist()
52
  output_logits = output_logits + [max(min(o, 1), 0) for o in outputs]
53
  my_bar.progress(
54
- min((first_index + BATCH_SIZE) / len(sentences), 1), text=progress_text
 
55
  )
56
  my_bar.empty()
57
  return output_logits
@@ -93,7 +126,7 @@ with tab1:
93
 
94
  print(sent)
95
  with open("logs.txt", "a") as f:
96
- f.write(sent+"\n")
97
 
98
  with tab2:
99
  file = st.file_uploader("Upload a file", type=["txt"])
 
9
  from altair import X, Y, Scale
10
  import base64
11
 
12
+ import re
13
+
14
+
15
+ def preprocess_text(arabic_text):
16
+ """Apply preprocessing to the given Arabic text.
17
+
18
+ Args:
19
+ arabic_text: The Arabic text to be preprocessed.
20
+
21
+ Returns:
22
+ The preprocessed Arabic text.
23
+ """
24
+ no_urls = re.sub(
25
+ r"(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b",
26
+ "",
27
+ arabic_text,
28
+ flags=re.MULTILINE,
29
+ )
30
+ no_english = re.sub(r"[a-zA-Z]", "", no_urls)
31
+
32
+ return no_english
33
+
34
 
35
  @st.cache_data
36
  def render_svg(svg):
 
58
 
59
 
60
  def compute_ALDi(sentences):
61
+ """Computes the ALDi score for the given sentences.
62
+
63
+ Args:
64
+ sentences: A list of Arabic sentences.
65
+
66
+ Returns:
67
+ A list of ALDi scores for the given sentences.
68
+ """
69
  progress_text = "Computing ALDi..."
70
  my_bar = st.progress(0, text=progress_text)
71
 
72
  BATCH_SIZE = 4
73
  output_logits = []
74
+
75
+ preprocessed_sentences = [preprocess_text(s) for s in sentences]
76
+
77
+ for first_index in range(0, len(preprocessed_sentences), BATCH_SIZE):
78
  inputs = tokenizer(
79
+ preprocessed_sentences[first_index : first_index + BATCH_SIZE],
80
  return_tensors="pt",
81
  padding=True,
82
  )
83
  outputs = model(**inputs).logits.reshape(-1).tolist()
84
  output_logits = output_logits + [max(min(o, 1), 0) for o in outputs]
85
  my_bar.progress(
86
+ min((first_index + BATCH_SIZE) / len(preprocessed_sentences), 1),
87
+ text=progress_text,
88
  )
89
  my_bar.empty()
90
  return output_logits
 
126
 
127
  print(sent)
128
  with open("logs.txt", "a") as f:
129
+ f.write(sent + "\n")
130
 
131
  with tab2:
132
  file = st.file_uploader("Upload a file", type=["txt"])