Mark7549 committed on
Commit
2605d63
1 Parent(s): 94e7f68

improved display of the library

Browse files
Files changed (3) hide show
  1. app.py +60 -1
  2. lsj_dict.json +2 -2
  3. lsj_dict.py +61 -12
app.py CHANGED
@@ -46,6 +46,18 @@ active_tab = option_menu(None, ["Nearest neighbours", "Cosine similarity", "3D g
46
  menu_icon="cast", default_index=0, orientation="horizontal")
47
 
48
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  # Nearest neighbours tab
50
  if active_tab == "Nearest neighbours":
51
 
@@ -215,7 +227,54 @@ elif active_tab == "Dictionary":
215
  # Put text in readable format
216
  text = format_text(data)
217
 
218
- st.markdown(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
 
221
 
 
46
  menu_icon="cast", default_index=0, orientation="horizontal")
47
 
48
 
49
+ # Adding CSS style to remove list-style-type
50
+ st.markdown("""
51
+ <style>
52
+ /* Define a class to remove list-style-type */
53
+ .no-list-style {
54
+ list-style-type: none;
55
+ }
56
+ </style>
57
+ """, unsafe_allow_html=True)
58
+
59
+
60
+
61
  # Nearest neighbours tab
62
  if active_tab == "Nearest neighbours":
63
 
 
227
  # Put text in readable format
228
  text = format_text(data)
229
 
230
+ st.markdown(format_text(data), unsafe_allow_html = True)
231
+
232
+ st.markdown("""
233
+ <style>
234
+ .tab {
235
+ display: inline-block;
236
+ margin-left: 4em;
237
+ }
238
+ .tr {
239
+ font-weight: bold;
240
+ }
241
+ .list-class {
242
+ list-style-type: none;
243
+ margin-top: 1em;
244
+ }
245
+ .primary-indicator {
246
+ font-weight: bold;
247
+ font-size: x-large;
248
+ }
249
+ .secondary-indicator {
250
+ font-weight: bold;
251
+ font-size: large;
252
+ }
253
+ .tertiary-indicator {
254
+ font-weight: bold;
255
+ font-size: medium;
256
+ }
257
+ .quaternary-indicator {
258
+ font-weight: bold;
259
+ font-size: medium;
260
+ }
261
+ .primary-class {
262
+ padding-left: 2em;
263
+ }
264
+ .secondary-class {
265
+ padding-left: 4em;
266
+ }
267
+ .tertiary-class {
268
+ padding-left: 6em;
269
+ }
270
+ .quaternary-class {
271
+ padding-left: 8em;
272
+ }
273
+ </style>
274
+ """, unsafe_allow_html=True)
275
+
276
+
277
+
278
 
279
 
280
 
lsj_dict.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30459c5cf72d067b38e5419903e202bb06cc38426bf8a42b58cc1472d2cc1320
3
- size 135538892
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79d162dda9e22917970d9eb84e108fa180f228bb52fc7e1cf48904f84d4676f5
3
+ size 132311884
lsj_dict.py CHANGED
@@ -2,7 +2,7 @@ import xml.etree.ElementTree as ET
2
  from collections import defaultdict
3
  from autocomplete import load_compressed_word_list
4
  import json
5
-
6
 
7
  def read_xml(file):
8
  """
@@ -39,33 +39,48 @@ def extract_entry_info(entry):
39
  definition = ' '.join(entry.itertext()).strip()
40
  definitions[lemma]['definitions'] = {'tr': definition}
41
 
 
42
  text = get_descendants_text(entry)
43
  cleaned_text = prettify_text(text)
44
 
45
  definitions[lemma]['definitions']['text'] = cleaned_text
46
 
47
 
48
- return {'lemma': lemma, 'orthographies': orthographies, 'definitions': definitions[lemma]['definitions']}
49
-
 
50
 
51
  def get_descendants_text(element):
52
  """
53
  Get all the text of the descendants of a given element, separating every 'sense' element.
54
  """
55
  text = ""
 
 
 
 
 
 
 
 
 
56
  for child in element:
57
  if child.tag == 'sense':
58
  # Add a separator before each 'sense' element
59
- text += "[SENSE_SEPARATOR]\n\n"
60
  if child.tag == 'tr' and element.tag == 'sense':
61
  # Add [tr] tags around text inside 'tr' tags within 'sense' tags
62
  if child.text is not None:
63
  text += f"<tr>{child.text.strip()}</tr>\n"
 
64
  else:
 
 
65
  text += child.text or ""
66
  text += get_descendants_text(child)
67
- text += child.tail or ""
68
  return text
 
 
69
 
70
 
71
  def prettify_text(text):
@@ -78,7 +93,8 @@ def prettify_text(text):
78
 
79
  # Prettify each part separately
80
  prettified_parts = []
81
- for part in parts:
 
82
  # Remove leading and trailing whitespace and join lines with a space
83
  cleaned_part = ' '.join(line.strip() for line in part.split('\n') if line.strip())
84
  prettified_parts.append(cleaned_part)
@@ -108,13 +124,43 @@ def format_text(data):
108
  text = data['definitions']['text']
109
 
110
  # Change <tr> tags to bold
111
- text = text.replace("<tr>", "**").replace("</tr>", "**").replace(",", ", ").replace(";", "; ").replace(":", ": ").replace("(", " (").replace(")", ") ").replace("[", " [").replace("]", "] ").replace(" ,", ", ").replace(" ; ", "; ").replace(" : ", ": ").replace(" ." , ". ")
112
 
113
- # Change [SENSE_SEPARATOR] to integers
114
- for i in range(len(text.split("[SENSE_SEPARATOR]"))):
115
- text = text.replace("[SENSE_SEPARATOR]", f"{i+1}.")
116
-
117
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
 
120
  def main():
@@ -163,3 +209,6 @@ def print_test(lemma_dict):
163
 
164
  if __name__ == "__main__":
165
  main()
 
 
 
 
2
  from collections import defaultdict
3
  from autocomplete import load_compressed_word_list
4
  import json
5
+ import streamlit as st
6
 
7
  def read_xml(file):
8
  """
 
39
  definition = ' '.join(entry.itertext()).strip()
40
  definitions[lemma]['definitions'] = {'tr': definition}
41
 
42
+
43
  text = get_descendants_text(entry)
44
  cleaned_text = prettify_text(text)
45
 
46
  definitions[lemma]['definitions']['text'] = cleaned_text
47
 
48
 
49
+ return {'lemma': lemma, 'orthographies': orthographies, 'definitions': definitions[lemma]['definitions']}
50
+
51
+
52
 
53
  def get_descendants_text(element):
54
  """
55
  Get all the text of the descendants of a given element, separating every 'sense' element.
56
  """
57
  text = ""
58
+ level_indicators = [
59
+ 'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X',
60
+ '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
61
+ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
62
+ 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
63
+ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
64
+ 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
65
+ ]
66
+
67
  for child in element:
68
  if child.tag == 'sense':
69
  # Add a separator before each 'sense' element
70
+ text += f"[SENSE_SEPARATOR]\n\n"
71
  if child.tag == 'tr' and element.tag == 'sense':
72
  # Add [tr] tags around text inside 'tr' tags within 'sense' tags
73
  if child.text is not None:
74
  text += f"<tr>{child.text.strip()}</tr>\n"
75
+ text += child.tail
76
  else:
77
+ if child.get('n') and len(child.get('n')) <= 2:
78
+ text += f"{child.get('n')}. "
79
  text += child.text or ""
80
  text += get_descendants_text(child)
 
81
  return text
82
+
83
+
84
 
85
 
86
  def prettify_text(text):
 
93
 
94
  # Prettify each part separately
95
  prettified_parts = []
96
+
97
+ for part in parts:
98
  # Remove leading and trailing whitespace and join lines with a space
99
  cleaned_part = ' '.join(line.strip() for line in part.split('\n') if line.strip())
100
  prettified_parts.append(cleaned_part)
 
124
  text = data['definitions']['text']
125
 
126
  # Change <tr> tags to bold
127
+ text = text.replace("<tr>", "<span class='tr'> ").replace("</tr>", "</span>").replace(",", ", ").replace(";", "; ").replace(":", ": ").replace("(", " (").replace(")", ") ").replace("[", " [").replace("]", "] ").replace(" ,", ", ").replace(" ; ", "; ").replace(" : ", ": ").replace(" ." , ". ")
128
 
129
+ formatted_text = []
130
+
131
+ primary_indicators = [
132
+ "A", "B", "C", "D", "E", "F", "G", "H", "I", "J"
133
+ ]
134
+
135
+ secondary_indicators = [
136
+ "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"
137
+ ]
138
+
139
+ tertiary_indicators = [
140
+ "2", "3", "4", "5", "6", "7", "8", "9", "10"
141
+ ]
142
+
143
+ quaternary_indicators = [
144
+ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
145
+ "k", "l", "m", "n", "o", "p", "q", "r", "s", "t",
146
+ "u", "v", "w", "x", "y", "z"
147
+ ]
148
+
149
+ for text_part in text.split("[SENSE_SEPARATOR]"):
150
+ level = text_part.split(".")[0].strip()
151
+ text_part = text_part.replace(level + ".", "")
152
+ if level:
153
+ if level in secondary_indicators:
154
+ formatted_text.append(f"<div class='list-class secondary-class'><span class='secondary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
155
+ elif level in tertiary_indicators:
156
+ formatted_text.append(f"<div class='list-class tertiary-class'> <span class='tertiary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
157
+ elif level in quaternary_indicators:
158
+ formatted_text.append(f"<div class='list-class quaternary-class'> <span class='quaternary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div> ")
159
+ elif level in primary_indicators:
160
+ formatted_text.append(f"<div class='list-class primary-class'> <span class='primary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
161
+
162
+ return '\n'.join(formatted_text)
163
+
164
 
165
 
166
  def main():
 
209
 
210
  if __name__ == "__main__":
211
  main()
212
+
213
+
214
+