Spaces:

GroNLP
/

agalma

Sleeping

App Files Files Community

Mark7549 committed on May 16

Commit

8e13e1c

•

1 Parent(s): 2605d63

updated dictionary readability

Browse files

Files changed (3) hide show

app.py +3 -0
lsj_dict.json +2 -2
lsj_dict.py +40 -7

app.py CHANGED Viewed

@@ -227,8 +227,11 @@ elif active_tab == "Dictionary":
             # Put text in readable format
             text = format_text(data)
             st.markdown(format_text(data), unsafe_allow_html = True)
             st.markdown("""
                         <style>
                         .tab {

             # Put text in readable format
             text = format_text(data)
             st.markdown(format_text(data), unsafe_allow_html = True)
             st.markdown("""
                         <style>
                         .tab {

lsj_dict.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:79d162dda9e22917970d9eb84e108fa180f228bb52fc7e1cf48904f84d4676f5
-size 132311884

 version https://git-lfs.github.com/spec/v1
+oid sha256:f5f2d966dbeab082d776f146a1b5e42685e91363cbf7ce2df3835574687e2d37
+size 138789469

lsj_dict.py CHANGED Viewed

@@ -3,6 +3,7 @@ from collections import defaultdict
 from autocomplete import load_compressed_word_list
 import json
 import streamlit as st
 def read_xml(file):
     """
@@ -40,7 +41,10 @@ def extract_entry_info(entry):
     definitions[lemma]['definitions'] = {'tr': definition}
-    text = get_descendants_text(entry)
     cleaned_text = prettify_text(text)
     definitions[lemma]['definitions']['text'] = cleaned_text
@@ -48,7 +52,24 @@ def extract_entry_info(entry):
     return {'lemma': lemma, 'orthographies': orthographies, 'definitions': definitions[lemma]['definitions']}
 def get_descendants_text(element):
     """
@@ -124,7 +145,11 @@ def format_text(data):
     text = data['definitions']['text']
     # Change <tr> tags to bold
-    text = text.replace("<tr>", "<span class='tr'> ").replace("</tr>", "</span>").replace(",", ", ").replace(";", "; ").replace(":", ": ").replace("(", " (").replace(")", ") ").replace("[", " [").replace("]", "] ").replace(" ,", ", ").replace(" ; ", "; ").replace(" : ", ": ").replace(" ." , ". ")
     formatted_text = []
@@ -146,11 +171,16 @@ def format_text(data):
         "u", "v", "w", "x", "y", "z"
     ]
-    for text_part in text.split("[SENSE_SEPARATOR]"):
-        level = text_part.split(".")[0].strip()
-        text_part = text_part.replace(level + ".", "")
         if level:
-            if level in secondary_indicators:
                 formatted_text.append(f"<div class='list-class secondary-class'><span class='secondary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
             elif level in tertiary_indicators:
                 formatted_text.append(f"<div class='list-class tertiary-class'> <span class='tertiary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
@@ -163,6 +193,9 @@ def format_text(data):
 def main():
     # xml_info = read_xml("LSJ_GreekUnicode/grc.lsj.perseus-eng2.xml")

 from autocomplete import load_compressed_word_list
 import json
 import streamlit as st
+import re
 def read_xml(file):
     """
     definitions[lemma]['definitions'] = {'tr': definition}
+    # text = get_descendants_text(entry)
+    text = get_all_text(entry)
     cleaned_text = prettify_text(text)
     definitions[lemma]['definitions']['text'] = cleaned_text
     return {'lemma': lemma, 'orthographies': orthographies, 'definitions': definitions[lemma]['definitions']}
def get_all_text(element):
    """Recursively collect text from an element and all its descendants.

    The element's own leading text comes first, followed by the contribution
    of each child in document order:

    * A ``sense`` child is announced with a
      ``[SENSE_SEPARATOR][level=<n>]`` marker (``<n>`` is the child's ``n``
      attribute) followed by two newlines, and is then recursed into like
      any other child.
    * A ``tr`` child sitting directly under a ``sense`` element is emitted as
      ``<tr>stripped text</tr>`` plus a newline (nothing when it has no
      text); its own children are not visited.
    * Every other child contributes its recursively collected text.

    Each child's tail text is appended after it, and a single space follows
    every appended piece of child text or tail text.

    Args:
        element: An ElementTree-style node exposing ``tag``, ``text``,
            ``tail``, ``get()`` and iteration over its children.

    Returns:
        The concatenated text as a single string.
    """
    # Gather fragments and join once at the end instead of growing a string
    # with ``+=`` for every child, which re-copies the accumulated text.
    parts = [element.text or ""]
    for child in element:
        if child.tag == 'sense':
            # NOTE(review): a <sense> without an ``n`` attribute yields the
            # literal marker "[level=None]" because None is formatted as-is.
            level = child.get('n')
            parts.append(f"[SENSE_SEPARATOR][level={level}]\n\n")
        elif child.tag == 'tr' and element.tag == 'sense':
            if child.text is not None:
                parts.append(f"<tr>{child.text.strip()}</tr>\n")
            # The translation's text is already handled above, so only its
            # tail is added and the recursion below is skipped.
            parts.append((child.tail or "") + " ")
            continue
        parts.append(get_all_text(child) + " ")
        parts.append((child.tail or "") + " ")
    return "".join(parts)
 def get_descendants_text(element):
     """
     text = data['definitions']['text']
     # Change <tr> tags to bold
+    text = text.replace("<tr>", "<span class='tr'> ").replace("</tr>", "</span>")
+    text = re.sub(r"\s+,\s+", ", ", text)
+    # .replace(",", ", ").replace(";", "; ").replace(":", ": ").replace("(", " (").replace(")", ") ").replace("[", " [").replace("]", "] ").replace(" ,", ", ").replace(" ; ", "; ").replace(" : ", ": ").replace(" ." , ". ")
     formatted_text = []
         "u", "v", "w", "x", "y", "z"
     ]
+    header = text.split("\n")[0]
+    formatted_text.append(header)
+    for text_part in text.split("[SENSE_SEPARATOR]")[1:]:
+        level = text_part.split("level=")[1].split("]")[0]
+        text_part = text_part.replace(f"[level={level}]", "")
         if level:
+            if level == "A":
+                formatted_text.append(f"<div class='list-class primary-class'> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
+            elif level in secondary_indicators:
                 formatted_text.append(f"<div class='list-class secondary-class'><span class='secondary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
             elif level in tertiary_indicators:
                 formatted_text.append(f"<div class='list-class tertiary-class'> <span class='tertiary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
 def main():
     # xml_info = read_xml("LSJ_GreekUnicode/grc.lsj.perseus-eng2.xml")