updated dictionary readability
- app.py +3 -0
- lsj_dict.json +2 -2
- lsj_dict.py +40 -7
app.py CHANGED
@@ -227,8 +227,11 @@ elif active_tab == "Dictionary":
     # Put text in readable format
     text = format_text(data)
 
+
     st.markdown(format_text(data), unsafe_allow_html = True)
 
+
+
     st.markdown("""
     <style>
     .tab {
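Note on the surrounding code: format_text() returns an HTML string, and Streamlit only renders raw HTML when unsafe_allow_html=True is passed; the CSS for the custom classes is injected through a second st.markdown call. A minimal sketch of that pattern, with a stub formatter and placeholder CSS rules (not the repo's actual ones):

import streamlit as st

def format_text(data):
    # Placeholder for lsj_dict.format_text, which builds <div>/<span> markup
    # with classes like 'tr' and 'list-class' from a dictionary entry.
    return f"<div class='list-class'><span class='tr'>{data}</span></div>"

# Render the entry HTML; without unsafe_allow_html=True, Streamlit would
# escape the tags and show them as literal text.
st.markdown(format_text("computation, reckoning"), unsafe_allow_html=True)

# Style the custom classes the same way, via an injected <style> block.
st.markdown("""
    <style>
    .tr { font-weight: bold; }
    .list-class { margin-left: 1em; }
    </style>
""", unsafe_allow_html=True)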
lsj_dict.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:f5f2d966dbeab082d776f146a1b5e42685e91363cbf7ce2df3835574687e2d37
+size 138789469
lsj_dict.py CHANGED
@@ -3,6 +3,7 @@ from collections import defaultdict
 from autocomplete import load_compressed_word_list
 import json
 import streamlit as st
+import re
 
 def read_xml(file):
     """
@@ -40,7 +41,10 @@ def extract_entry_info(entry):
     definitions[lemma]['definitions'] = {'tr': definition}
 
 
-    text = get_descendants_text(entry)
+    # text = get_descendants_text(entry)
+
+    text = get_all_text(entry)
+
     cleaned_text = prettify_text(text)
 
     definitions[lemma]['definitions']['text'] = cleaned_text
@@ -48,7 +52,24 @@ def extract_entry_info(entry):
 
     return {'lemma': lemma, 'orthographies': orthographies, 'definitions': definitions[lemma]['definitions']}
 
-
+
+def get_all_text(element):
+    """Recursively collect text from an element and all its descendants."""
+    text = (element.text or "")
+    for child in element:
+        if child.tag == 'sense':
+            level = child.get('n')
+            text += f"[SENSE_SEPARATOR][level={level}]\n\n"
+        elif child.tag == 'tr' and element.tag == 'sense':
+            if child.text is not None:
+                text += f"<tr>{child.text.strip()}</tr>\n"
+            # Skip further recursion for this child since we are already handling its text
+            text += (child.tail or "") + " "
+            continue
+        text += get_all_text(child) + " "
+        text += (child.tail or "") + " "
+    return text
+
 
 def get_descendants_text(element):
     """
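Before the format_text() changes below, here is a quick way to see what the new get_all_text() produces. The toy XML entry is invented for illustration (real entries come from grc.lsj.perseus-eng2.xml), and importing lsj_dict assumes its dependencies such as streamlit and autocomplete are available:

import xml.etree.ElementTree as ET
from lsj_dict import get_all_text  # the function added in the hunk above

# Toy entry: a headword plus two senses, each with a <tr> translation.
sample = ET.fromstring(
    "<entry>"
    "<orth>λόγος</orth>"
    "<sense n='A'><tr>computation</tr>, reckoning</sense>"
    "<sense n='II'><tr>relation</tr>, correspondence</sense>"
    "</entry>"
)

print(get_all_text(sample))
# Whitespace aside, the marked-up text comes out roughly as:
#   λόγος [SENSE_SEPARATOR][level=A]
#   <tr>computation</tr>, reckoning
#   [SENSE_SEPARATOR][level=II]
#   <tr>relation</tr>, correspondence
# which is what prettify_text() and format_text() then consume.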
@@ -124,7 +145,11 @@ def format_text(data):
     text = data['definitions']['text']
 
     # Change <tr> tags to bold
-    text = text.replace("<tr>", "<span class='tr'> ").replace("</tr>", "</span>")
+    text = text.replace("<tr>", "<span class='tr'> ").replace("</tr>", "</span>")
+
+    text = re.sub(r"\s+,\s+", ", ", text)
+
+    # .replace(",", ", ").replace(";", "; ").replace(":", ": ").replace("(", " (").replace(")", ") ").replace("[", " [").replace("]", "] ").replace(" ,", ", ").replace(" ; ", "; ").replace(" : ", ": ").replace(" ." , ". ")
 
     formatted_text = []
 
@@ -146,11 +171,16 @@ def format_text(data):
         "u", "v", "w", "x", "y", "z"
     ]
 
-
-
-
+    header = text.split("\n")[0]
+    formatted_text.append(header)
+
+    for text_part in text.split("[SENSE_SEPARATOR]")[1:]:
+        level = text_part.split("level=")[1].split("]")[0]
+        text_part = text_part.replace(f"[level={level}]", "")
         if level:
-            if level
+            if level == "A":
+                formatted_text.append(f"<div class='list-class primary-class'> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
+            elif level in secondary_indicators:
                 formatted_text.append(f"<div class='list-class secondary-class'><span class='secondary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
             elif level in tertiary_indicators:
                 formatted_text.append(f"<div class='list-class tertiary-class'> <span class='tertiary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
@@ -163,6 +193,9 @@ def format_text(data):
 
 
 
+
+
+
 def main():
     # xml_info = read_xml("LSJ_GreekUnicode/grc.lsj.perseus-eng2.xml")
 
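The formatting side can be checked on its own. This is a self-contained sketch of the sense-splitting logic the new format_text() code uses; the sample text and the abbreviated indicator lists are illustrative, while the real function works through the full roman-numeral and letter lists and relies on the CSS classes styled in app.py:

import re

# Text in the shape produced by get_all_text(): a header line, then one
# "[SENSE_SEPARATOR][level=...]" block per sense.
marked_up = (
    "λόγος, ὁ, verbal noun of λέγω\n"
    "[SENSE_SEPARATOR][level=A]\n\n<tr>computation</tr> , reckoning "
    "[SENSE_SEPARATOR][level=II]\n\n<tr>relation</tr> , correspondence"
)

secondary_indicators = ["II", "III", "IV"]   # abbreviated
tertiary_indicators = ["b", "c", "d"]        # abbreviated

text = marked_up.replace("<tr>", "<span class='tr'> ").replace("</tr>", "</span>")
text = re.sub(r"\s+,\s+", ", ", text)        # tidy " , " back to ", "

formatted_text = [text.split("\n")[0]]       # keep the header line as-is

for text_part in text.split("[SENSE_SEPARATOR]")[1:]:
    level = text_part.split("level=")[1].split("]")[0]
    text_part = text_part.replace(f"[level={level}]", "").strip()
    if level == "A":
        formatted_text.append(f"<div class='list-class primary-class'> {text_part} </div>")
    elif level in secondary_indicators:
        formatted_text.append(f"<div class='list-class secondary-class'><span class='secondary-indicator'>{level}.</span> {text_part} </div>")
    elif level in tertiary_indicators:
        formatted_text.append(f"<div class='list-class tertiary-class'><span class='tertiary-indicator'>{level}.</span> {text_part} </div>")

print("\n".join(formatted_text))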