Mark7549 committed
Commit 74e30c6 · 1 parent: 00a8bfc

made some small cleanups of redundant code

Files changed (6)
  1. app.py +0 -1
  2. compress_words.py +48 -0
  3. lsj_dict.py +6 -29
  4. plots.py +0 -5
  5. vector_graph.py +1 -2
  6. word2vec.py +1 -2
app.py CHANGED
@@ -47,7 +47,6 @@ lemma_counts = load_lemma_count_dict()
 
 
 
-
 # Set styles for menu
 styles_horizontal = {
     "container": {"display": "flex", "justify-content": "center"},
 
compress_words.py ADDED
@@ -0,0 +1,48 @@
+import pickle
+from autocomplete import save_compressed_word_list
+import json
+
+
+def compress_words(words):
+    """
+    Compress a list of words.
+    """
+    with open(words, 'r') as f:
+        words = f.readlines()
+
+    compressed_words = []
+    for word in words:
+        compressed_word = word.strip()
+        compressed_words.append(compressed_word)
+
+    # Save the compressed words
+    save_compressed_word_list(compressed_words, 'all_lemmas.pkl.gz')
+
+
+def compress_word_list(words):
+    """
+    Compress a list of words.
+    """
+    compressed_words = []
+    for word in words:
+        compressed_word = word.strip()
+        compressed_words.append(compressed_word)
+
+    # Save the compressed words
+    save_compressed_word_list(compressed_words, 'all_lemmas.pkl.gz')
+
+
+def main():
+
+    lemma_dict = json.load(open('lsj_dict.json', 'r'))
+
+    # Get all lemmas
+    all_lemmas = list(lemma_dict.keys())
+
+    # Compress words
+    compress_word_list(all_lemmas)
+
+
+
+if __name__ == "__main__":
+    main()
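
The new script depends on save_compressed_word_list from autocomplete, which this commit does not show. Judging only from the all_lemmas.pkl.gz filename, it presumably pickles the word list and gzips the result; a minimal sketch of that assumed helper and a matching loader (names and behaviour are guesses, not the repository's actual code):

import gzip
import pickle


def save_compressed_word_list(words, path):
    # Assumed behaviour, inferred from the .pkl.gz suffix:
    # pickle the list and write it through a gzip stream.
    with gzip.open(path, 'wb') as f:
        pickle.dump(words, f)


def load_compressed_word_list(path):
    # Matching loader: gunzip and unpickle.
    with gzip.open(path, 'rb') as f:
        return pickle.load(f)

Note that compress_words() and compress_word_list() in the committed file do the same strip-and-save work; the only difference is that compress_words() first reads the words from a file, so one of the two could likely be folded into the other.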
lsj_dict.py CHANGED
@@ -76,14 +76,6 @@ def get_descendants_text(element):
     Get all the text of the descendants of a given element, separating every 'sense' element.
     """
     text = ""
-    level_indicators = [
-        'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X',
-        '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
-        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
-        'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
-        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
-        'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
-    ]
 
     for child in element:
         if child.tag == 'sense':
@@ -142,6 +134,9 @@ def full_dictionary():
 
 
 def format_text(data):
+    """
+    Modify text to desired template
+    """
     text = data['definitions']['text']
 
     # Change <tr> tags to bold
@@ -199,22 +194,10 @@ def format_text(data):
 
 
 
-def main():
-    # xml_info = read_xml("LSJ_GreekUnicode/grc.lsj.perseus-eng2.xml")
-
-    # for word, info in xml_info.items():
-    #     print(word)
-    #     print('Lemma: ', info['lemma'])
-    #     print('Orthographies: ', info['orthographies'])
-    #     print('Definitions: ', info['definitions'])
-    #     print('TEST', info['definitions']['tr'].split('\n')[0]) # First word in the definition
-    #     print('Text:', info['definitions']['text'])
-    #     if len(info['definitions']['tr'].split('\n')) > 1:
-    #         print('First definition: ', info['definitions']['tr'].split('\n')[1])
-    #     print(' ')
-
-    # full_dictionary()
+def main():
 
+    # This code is used to convert the .xml files into a .json file
+    # TO DO: Make separate function
     download = True
 
     if download is True:
@@ -234,12 +217,6 @@ def main():
 
 
     lemma_dict = json.load(open('lsj_dict.json', 'r'))
-
-    print_test(lemma_dict)
-
-
-def print_test(lemma_dict):
-    print(lemma_dict["βομβάζω"])
 
 
 
 
plots.py CHANGED
@@ -16,9 +16,6 @@ def make_3d_plot_tSNE(vectors_list, target_word, time_slice_model):
     List structure: [(word, model_name, vector, cosine_sim)]
     """
     word = target_word
-
-    # Load model
-    model = load_word2vec_model(f'models/{time_slice_model}.model')
 
     # Extract vectors and names from ./3d_models/{time_slice_model}.model
     all_vectors = {}
@@ -33,8 +30,6 @@ def make_3d_plot_tSNE(vectors_list, target_word, time_slice_model):
     # Only keep the vectors that are in vectors_list and their cosine similarities
     result_with_names = [(word, all_vectors[word], cosine_sim) for word, _, _, cosine_sim in vectors_list]
 
-
-
     # Create DataFrame from the transformed vectors
     df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])
 
 
vector_graph.py CHANGED
@@ -3,8 +3,7 @@ import numpy as np
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
 import pandas as pd
-import gensim
-import umap
+
 
 
 def create_3d_vectors(word, time_slice, nearest_neighbours_vectors):
 
word2vec.py CHANGED
@@ -4,11 +4,11 @@ import os
 import pickle
 import tempfile
 import pandas as pd
-import xlsxwriter
 from sklearn.preprocessing import StandardScaler
 from sklearn.manifold import TSNE
 import plotly.express as px
 from collections import Counter
+import streamlit as st
 
 
 
@@ -476,7 +476,6 @@ def count_lemmas(directory):
 
     return lemma_count_dict
 
-
 
 
 def main():
 