import json
import xml.etree.ElementTree as ET


def read_xml(file):
    """
    Read an XML file of the Greek LSJ dictionary and return a
    dictionary mapping each lemma to its entry information.
    """
    tree = ET.parse(file)
    root = tree.getroot()

    xml_info = {}

    for entry in root.findall('.//entryFree'):
        entry_info = extract_entry_info(entry)
        xml_info[entry_info['lemma']] = entry_info

    return xml_info
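
# Example usage (illustrative; "some_lemma" is a placeholder and the path
# follows the pattern used in full_dictionary() below):
#
#   info = read_xml("LSJ_GreekUnicode/grc.lsj.perseus-eng1.xml")
#   print(info["some_lemma"]["definitions"]["text"])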


def extract_entry_info(entry):
    """
    Extract the lemma, orthographies, and definitions from a single
    entry in the LSJ dictionary.
    """
    # The 'key' attribute may carry a trailing homograph number;
    # strip digits to recover the bare lemma.
    lemma = ''.join(ch for ch in entry.get('key') if not ch.isdigit())

    # Alternative spellings are recorded in <orth> elements.
    orthographies = [orth.text for orth in entry.findall('orth')]

    # Flat, unstructured text of the whole entry.
    definition = ' '.join(entry.itertext()).strip()

    # Structured text with sense separators and <tr> markers, cleaned up.
    text = get_descendants_text(entry)
    cleaned_text = prettify_text(text)

    return {
        'lemma': lemma,
        'orthographies': orthographies,
        'definitions': {'tr': definition, 'text': cleaned_text},
    }
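
# For a hypothetical entry with key="X1" and a single <orth>X</orth>,
# the returned dict would look roughly like:
#
#   {'lemma': 'X',
#    'orthographies': ['X'],
#    'definitions': {'tr': '<flat entry text>', 'text': '<prettified text>'}}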


def get_descendants_text(element):
    """
    Get all the text of the descendants of a given element,
    inserting a separator before every 'sense' element.
    """
    text = ""
    for child in element:
        if child.tag == 'sense':
            # Mark the start of each sense so prettify_text() can
            # split the entry into senses later.
            text += "[SENSE_SEPARATOR]\n\n"
        if child.tag == 'tr' and element.tag == 'sense':
            # Translations sitting directly inside a sense are wrapped
            # in <tr> markers so they can be highlighted when formatted.
            if child.text is not None:
                text += f"<tr>{child.text.strip()}</tr>\n"
        else:
            text += child.text or ""
        text += get_descendants_text(child)
        text += child.tail or ""
    return text
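
# Illustrative sketch of the flattening, for a minimal made-up entry:
#
#   <entryFree key="X"><orth>X</orth>
#     <sense><tr>a translation</tr> some gloss</sense></entryFree>
#
# get_descendants_text() would return roughly:
#
#   "X[SENSE_SEPARATOR]\n\n<tr>a translation</tr>\n some gloss"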


def prettify_text(text):
    """
    Prettify the text of the definitions into a readable format,
    collapsing whitespace within each sense while keeping the
    [SENSE_SEPARATOR] markers between senses.
    """
    parts = text.split("[SENSE_SEPARATOR]")

    prettified_parts = []
    for part in parts:
        # Collapse each sense onto a single, single-spaced line.
        cleaned_part = ' '.join(line.strip() for line in part.split('\n') if line.strip())
        prettified_parts.append(cleaned_part)

    return "\n\n[SENSE_SEPARATOR] ".join(prettified_parts)
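
# E.g. prettify_text("X [SENSE_SEPARATOR]<tr>a translation</tr>\n gloss")
# returns "X\n\n[SENSE_SEPARATOR] <tr>a translation</tr> gloss".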


def full_dictionary():
    """
    Build the full LSJ dictionary by reading and merging all 27
    Perseus XML files.
    """
    merged_info = {}
    for i in range(1, 28):
        file = f"LSJ_GreekUnicode/grc.lsj.perseus-eng{i}.xml"
        xml_info = read_xml(file)
        for lemma, info in xml_info.items():
            # Merge entries for the same lemma across files.
            merged_info.setdefault(lemma, {}).update(info)

    return merged_info


def format_text(data):
    """
    Format an entry's prettified text for display: bold the
    translations and number each sense.
    """
    text = data['definitions']['text']

    # Turn <tr> markers into Markdown-style bold and normalise the
    # spacing around punctuation and brackets.
    text = (text.replace("<tr>", "**").replace("</tr>", "**")
                .replace(",", ", ").replace(";", "; ").replace(":", ": ")
                .replace("(", " (").replace(")", ") ")
                .replace("[", " [").replace("]", "] ")
                .replace(" ,", ", ").replace(" ; ", "; ")
                .replace(" : ", ": ").replace(" .", ". "))

    # Number the senses one separator at a time. The count argument is
    # essential: without it the first iteration would replace every
    # separator with "1.".
    for i in range(len(text.split("[SENSE_SEPARATOR]"))):
        text = text.replace("[SENSE_SEPARATOR]", f"{i+1}.", 1)

    return text
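
# E.g. (illustrative input) a two-sense entry reading
# "[SENSE_SEPARATOR] <tr>strike</tr>[SENSE_SEPARATOR] hum"
# comes out as "1. **strike**2. hum".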


def main():
    download = True

    if download:
        # Rebuild the merged dictionary from the XML sources and cache
        # it as JSON so later runs can skip the slow parsing step.
        merged_info = full_dictionary()

        with open("lsj_dict.json", "w", encoding="utf-8") as file:
            json.dump(merged_info, file, ensure_ascii=False, indent=4)

    with open("lsj_dict.json", "r", encoding="utf-8") as file:
        lemma_dict = json.load(file)

    print_test(lemma_dict)


def print_test(lemma_dict):
    # Quick sanity check: print one known entry.
    print(lemma_dict["βομβάζω"])


if __name__ == "__main__":
    main()