Mark7549 committed
Commit 74e30c6 · 1 parent: 00a8bfc

made some small cleanups of redundant code

Files changed (6)
  1. app.py +0 -1
  2. compress_words.py +48 -0
  3. lsj_dict.py +6 -29
  4. plots.py +0 -5
  5. vector_graph.py +1 -2
  6. word2vec.py +1 -2
app.py CHANGED
@@ -47,7 +47,6 @@ lemma_counts = load_lemma_count_dict()
 
 
 
-
 # Set styles for menu
 styles_horizontal = {
     "container": {"display": "flex", "justify-content": "center"},
 
compress_words.py ADDED
@@ -0,0 +1,48 @@
+import pickle
+from autocomplete import save_compressed_word_list
+import json
+
+
+def compress_words(words):
+    """
+    Compress a list of words.
+    """
+    with open(words, 'r') as f:
+        words = f.readlines()
+
+    compressed_words = []
+    for word in words:
+        compressed_word = word.strip()
+        compressed_words.append(compressed_word)
+
+    # Save the compressed words
+    save_compressed_word_list(compressed_words, 'all_lemmas.pkl.gz')
+
+
+def compress_word_list(words):
+    """
+    Compress a list of words.
+    """
+    compressed_words = []
+    for word in words:
+        compressed_word = word.strip()
+        compressed_words.append(compressed_word)
+
+    # Save the compressed words
+    save_compressed_word_list(compressed_words, 'all_lemmas.pkl.gz')
+
+
+def main():
+
+    lemma_dict = json.load(open('lsj_dict.json', 'r'))
+
+    # Get all lemmas
+    all_lemmas = list(lemma_dict.keys())
+
+    # Compress words
+    compress_word_list(all_lemmas)
+
+
+
+if __name__ == "__main__":
+    main()
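
The new script depends on save_compressed_word_list from autocomplete, which this commit does not show. Judging only from the all_lemmas.pkl.gz filename, it presumably pickles the word list and gzips the result; a minimal sketch of that assumed helper and a matching loader (names and behaviour are guesses, not the repository's actual code):

import gzip
import pickle


def save_compressed_word_list(words, path):
    # Assumed behaviour, inferred from the .pkl.gz suffix:
    # pickle the list and write it through a gzip stream.
    with gzip.open(path, 'wb') as f:
        pickle.dump(words, f)


def load_compressed_word_list(path):
    # Matching loader: gunzip and unpickle.
    with gzip.open(path, 'rb') as f:
        return pickle.load(f)

Note that compress_words() and compress_word_list() in the committed file do the same strip-and-save work; the only difference is that compress_words() first reads the words from a file, so one of the two could likely be folded into the other.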
lsj_dict.py CHANGED
@@ -76,14 +76,6 @@ def get_descendants_text(element):
     Get all the text of the descendants of a given element, separating every 'sense' element.
     """
     text = ""
-    level_indicators = [
-        'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X',
-        '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
-        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
-        'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
-        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
-        'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
-    ]
 
     for child in element:
         if child.tag == 'sense':
@@ -142,6 +134,9 @@ def full_dictionary():
 
 
 def format_text(data):
+    """
+    Modify text to desired template
+    """
     text = data['definitions']['text']
 
     # Change <tr> tags to bold
@@ -199,22 +194,10 @@ def format_text(data):
 
 
 
-def main():
-    # xml_info = read_xml("LSJ_GreekUnicode/grc.lsj.perseus-eng2.xml")
-
-    # for word, info in xml_info.items():
-    #     print(word)
-    #     print('Lemma: ', info['lemma'])
-    #     print('Orthographies: ', info['orthographies'])
-    #     print('Definitions: ', info['definitions'])
-    #     print('TEST', info['definitions']['tr'].split('\n')[0]) # First word in the definition
-    #     print('Text:', info['definitions']['text'])
-    #     if len(info['definitions']['tr'].split('\n')) > 1:
-    #         print('First definition: ', info['definitions']['tr'].split('\n')[1])
-    #     print(' ')
-
-    # full_dictionary()
+def main():
 
+    # This code is used to convert the .xml files into a .json file
+    # TO DO: Make separate function
     download = True
 
     if download is True:
@@ -234,12 +217,6 @@ def main():
 
 
     lemma_dict = json.load(open('lsj_dict.json', 'r'))
-
-    print_test(lemma_dict)
-
-
-def print_test(lemma_dict):
-    print(lemma_dict["βομβάζω"])
 
 
 
 
plots.py CHANGED
@@ -16,9 +16,6 @@ def make_3d_plot_tSNE(vectors_list, target_word, time_slice_model):
     List structure: [(word, model_name, vector, cosine_sim)]
     """
     word = target_word
-
-    # Load model
-    model = load_word2vec_model(f'models/{time_slice_model}.model')
 
     # Extract vectors and names from ./3d_models/{time_slice_model}.model
     all_vectors = {}
@@ -33,8 +30,6 @@ def make_3d_plot_tSNE(vectors_list, target_word, time_slice_model):
     # Only keep the vectors that are in vectors_list and their cosine similarities
     result_with_names = [(word, all_vectors[word], cosine_sim) for word, _, _, cosine_sim in vectors_list]
 
-
-
     # Create DataFrame from the transformed vectors
     df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])
 
 
vector_graph.py CHANGED
@@ -3,8 +3,7 @@ import numpy as np
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
 import pandas as pd
-import gensim
-import umap
+
 
 
 def create_3d_vectors(word, time_slice, nearest_neighbours_vectors):
 
word2vec.py CHANGED
@@ -4,11 +4,11 @@ import os
 import pickle
 import tempfile
 import pandas as pd
-import xlsxwriter
 from sklearn.preprocessing import StandardScaler
 from sklearn.manifold import TSNE
 import plotly.express as px
 from collections import Counter
+import streamlit as st
 
 
 
@@ -476,7 +476,6 @@ def count_lemmas(directory):
 
     return lemma_count_dict
 
-
 
 
 def main():
 