Mark7549 committed on
Commit
8e13e1c
1 Parent(s): 2605d63

updated dictionary readability

Browse files
Files changed (3) hide show
  1. app.py +3 -0
  2. lsj_dict.json +2 -2
  3. lsj_dict.py +40 -7
app.py CHANGED
@@ -227,8 +227,11 @@ elif active_tab == "Dictionary":
227
  # Put text in readable format
228
  text = format_text(data)
229
 
 
230
  st.markdown(format_text(data), unsafe_allow_html = True)
231
 
 
 
232
  st.markdown("""
233
  <style>
234
  .tab {
 
227
  # Put text in readable format
228
  text = format_text(data)
229
 
230
+
231
  st.markdown(format_text(data), unsafe_allow_html = True)
232
 
233
+
234
+
235
  st.markdown("""
236
  <style>
237
  .tab {
lsj_dict.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79d162dda9e22917970d9eb84e108fa180f228bb52fc7e1cf48904f84d4676f5
3
- size 132311884
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5f2d966dbeab082d776f146a1b5e42685e91363cbf7ce2df3835574687e2d37
3
+ size 138789469
lsj_dict.py CHANGED
@@ -3,6 +3,7 @@ from collections import defaultdict
3
  from autocomplete import load_compressed_word_list
4
  import json
5
  import streamlit as st
 
6
 
7
  def read_xml(file):
8
  """
@@ -40,7 +41,10 @@ def extract_entry_info(entry):
40
  definitions[lemma]['definitions'] = {'tr': definition}
41
 
42
 
43
- text = get_descendants_text(entry)
 
 
 
44
  cleaned_text = prettify_text(text)
45
 
46
  definitions[lemma]['definitions']['text'] = cleaned_text
@@ -48,7 +52,24 @@ def extract_entry_info(entry):
48
 
49
  return {'lemma': lemma, 'orthographies': orthographies, 'definitions': definitions[lemma]['definitions']}
50
 
51
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  def get_descendants_text(element):
54
  """
@@ -124,7 +145,11 @@ def format_text(data):
124
  text = data['definitions']['text']
125
 
126
  # Change <tr> tags to bold
127
- text = text.replace("<tr>", "<span class='tr'> ").replace("</tr>", "</span>").replace(",", ", ").replace(";", "; ").replace(":", ": ").replace("(", " (").replace(")", ") ").replace("[", " [").replace("]", "] ").replace(" ,", ", ").replace(" ; ", "; ").replace(" : ", ": ").replace(" ." , ". ")
 
 
 
 
128
 
129
  formatted_text = []
130
 
@@ -146,11 +171,16 @@ def format_text(data):
146
  "u", "v", "w", "x", "y", "z"
147
  ]
148
 
149
- for text_part in text.split("[SENSE_SEPARATOR]"):
150
- level = text_part.split(".")[0].strip()
151
- text_part = text_part.replace(level + ".", "")
 
 
 
152
  if level:
153
- if level in secondary_indicators:
 
 
154
  formatted_text.append(f"<div class='list-class secondary-class'><span class='secondary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
155
  elif level in tertiary_indicators:
156
  formatted_text.append(f"<div class='list-class tertiary-class'> <span class='tertiary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
@@ -163,6 +193,9 @@ def format_text(data):
163
 
164
 
165
 
 
 
 
166
  def main():
167
  # xml_info = read_xml("LSJ_GreekUnicode/grc.lsj.perseus-eng2.xml")
168
 
 
3
  from autocomplete import load_compressed_word_list
4
  import json
5
  import streamlit as st
6
+ import re
7
 
8
  def read_xml(file):
9
  """
 
41
  definitions[lemma]['definitions'] = {'tr': definition}
42
 
43
 
44
+ # text = get_descendants_text(entry)
45
+
46
+ text = get_all_text(entry)
47
+
48
  cleaned_text = prettify_text(text)
49
 
50
  definitions[lemma]['definitions']['text'] = cleaned_text
 
52
 
53
  return {'lemma': lemma, 'orthographies': orthographies, 'definitions': definitions[lemma]['definitions']}
54
 
55
+
56
def get_all_text(element):
    """Recursively collect text from an element and all of its descendants.

    The result is a flat string with two kinds of inline markers:

    * every ``<sense>`` child is announced by
      ``[SENSE_SEPARATOR][level=<n>]`` followed by a blank line, where
      ``<n>`` is the child's ``n`` attribute (``None`` if it is absent),
      and is then recursed into like any other child;
    * a ``<tr>`` that is a direct child of a ``<sense>`` is emitted as
      ``<tr>...</tr>`` on its own line and is not recursed into.

    A single space is appended after each child's text and after each
    child's tail.

    Args:
        element: An ElementTree-style element (must provide ``tag``,
            ``text``, ``tail``, ``get``, ``itertext`` and iteration over
            its children).

    Returns:
        The concatenated text of ``element`` and its descendants.
    """
    # Collect fragments and join once instead of growing a string with `+=`
    # at every level of the recursion.
    parts = [element.text or ""]

    for child in element:
        if child.tag == 'sense':
            # The marker precedes the sense's own text, which is appended
            # by the generic recursion below.
            parts.append(f"[SENSE_SEPARATOR][level={child.get('n')}]\n\n")
        elif child.tag == 'tr' and element.tag == 'sense':
            # Use itertext() rather than child.text alone so that text
            # inside nested markup (and the tails of those nested elements)
            # is kept instead of being silently dropped.
            translation = "".join(child.itertext()).strip()
            if translation:
                parts.append(f"<tr>{translation}</tr>\n")
            # The translation has been handled in full, so only the tail is
            # still needed; skip the generic recursion for this child.
            parts.append((child.tail or "") + " ")
            continue

        parts.append(get_all_text(child) + " ")
        parts.append((child.tail or "") + " ")

    return "".join(parts)
72
+
73
 
74
  def get_descendants_text(element):
75
  """
 
145
  text = data['definitions']['text']
146
 
147
  # Change <tr> tags to bold
148
+ text = text.replace("<tr>", "<span class='tr'> ").replace("</tr>", "</span>")
149
+
150
+ text = re.sub(r"\s+,\s+", ", ", text)
151
+
152
+ # .replace(",", ", ").replace(";", "; ").replace(":", ": ").replace("(", " (").replace(")", ") ").replace("[", " [").replace("]", "] ").replace(" ,", ", ").replace(" ; ", "; ").replace(" : ", ": ").replace(" ." , ". ")
153
 
154
  formatted_text = []
155
 
 
171
  "u", "v", "w", "x", "y", "z"
172
  ]
173
 
174
+ header = text.split("\n")[0]
175
+ formatted_text.append(header)
176
+
177
+ for text_part in text.split("[SENSE_SEPARATOR]")[1:]:
178
+ level = text_part.split("level=")[1].split("]")[0]
179
+ text_part = text_part.replace(f"[level={level}]", "")
180
  if level:
181
+ if level == "A":
182
+ formatted_text.append(f"<div class='list-class primary-class'> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
183
+ elif level in secondary_indicators:
184
  formatted_text.append(f"<div class='list-class secondary-class'><span class='secondary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
185
  elif level in tertiary_indicators:
186
  formatted_text.append(f"<div class='list-class tertiary-class'> <span class='tertiary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
 
193
 
194
 
195
 
196
+
197
+
198
+
199
  def main():
200
  # xml_info = read_xml("LSJ_GreekUnicode/grc.lsj.perseus-eng2.xml")
201