made some small cleanups of redundant code
Browse files- app.py +0 -1
- compress_words.py +48 -0
- lsj_dict.py +6 -29
- plots.py +0 -5
- vector_graph.py +1 -2
- word2vec.py +1 -2
app.py
CHANGED
@@ -47,7 +47,6 @@ lemma_counts = load_lemma_count_dict()
|
|
47 |
|
48 |
|
49 |
|
50 |
-
|
51 |
# Set styles for menu
|
52 |
styles_horizontal = {
|
53 |
"container": {"display": "flex", "justify-content": "center"},
|
|
|
47 |
|
48 |
|
49 |
|
|
|
50 |
# Set styles for menu
|
51 |
styles_horizontal = {
|
52 |
"container": {"display": "flex", "justify-content": "center"},
|
compress_words.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pickle
|
2 |
+
from autocomplete import save_compressed_word_list
|
3 |
+
import json
|
4 |
+
|
5 |
+
|
6 |
+
def compress_words(words):
|
7 |
+
"""
|
8 |
+
Compress a list of words.
|
9 |
+
"""
|
10 |
+
with open(words, 'r') as f:
|
11 |
+
words = f.readlines()
|
12 |
+
|
13 |
+
compressed_words = []
|
14 |
+
for word in words:
|
15 |
+
compressed_word = word.strip()
|
16 |
+
compressed_words.append(compressed_word)
|
17 |
+
|
18 |
+
# Save the compressed words
|
19 |
+
save_compressed_word_list(compressed_words, 'all_lemmas.pkl.gz')
|
20 |
+
|
21 |
+
|
22 |
+
def compress_word_list(words):
|
23 |
+
"""
|
24 |
+
Compress a list of words.
|
25 |
+
"""
|
26 |
+
compressed_words = []
|
27 |
+
for word in words:
|
28 |
+
compressed_word = word.strip()
|
29 |
+
compressed_words.append(compressed_word)
|
30 |
+
|
31 |
+
# Save the compressed words
|
32 |
+
save_compressed_word_list(compressed_words, 'all_lemmas.pkl.gz')
|
33 |
+
|
34 |
+
|
35 |
+
def main():
|
36 |
+
|
37 |
+
lemma_dict = json.load(open('lsj_dict.json', 'r'))
|
38 |
+
|
39 |
+
# Get all lemmas
|
40 |
+
all_lemmas = list(lemma_dict.keys())
|
41 |
+
|
42 |
+
# Compress words
|
43 |
+
compress_word_list(all_lemmas)
|
44 |
+
|
45 |
+
|
46 |
+
|
47 |
+
if __name__ == "__main__":
|
48 |
+
main()
|
lsj_dict.py
CHANGED
@@ -76,14 +76,6 @@ def get_descendants_text(element):
|
|
76 |
Get all the text of the descendants of a given element, separating every 'sense' element.
|
77 |
"""
|
78 |
text = ""
|
79 |
-
level_indicators = [
|
80 |
-
'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X',
|
81 |
-
'1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
|
82 |
-
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
|
83 |
-
'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
|
84 |
-
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
|
85 |
-
'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
|
86 |
-
]
|
87 |
|
88 |
for child in element:
|
89 |
if child.tag == 'sense':
|
@@ -142,6 +134,9 @@ def full_dictionary():
|
|
142 |
|
143 |
|
144 |
def format_text(data):
|
|
|
|
|
|
|
145 |
text = data['definitions']['text']
|
146 |
|
147 |
# Change <tr> tags to bold
|
@@ -199,22 +194,10 @@ def format_text(data):
|
|
199 |
|
200 |
|
201 |
|
202 |
-
def main():
|
203 |
-
# xml_info = read_xml("LSJ_GreekUnicode/grc.lsj.perseus-eng2.xml")
|
204 |
-
|
205 |
-
# for word, info in xml_info.items():
|
206 |
-
# print(word)
|
207 |
-
# print('Lemma: ', info['lemma'])
|
208 |
-
# print('Orthographies: ', info['orthographies'])
|
209 |
-
# print('Definitions: ', info['definitions'])
|
210 |
-
# print('TEST', info['definitions']['tr'].split('\n')[0]) # First word in the definition
|
211 |
-
# print('Text:', info['definitions']['text'])
|
212 |
-
# if len(info['definitions']['tr'].split('\n')) > 1:
|
213 |
-
# print('First definition: ', info['definitions']['tr'].split('\n')[1])
|
214 |
-
# print(' ')
|
215 |
-
|
216 |
-
# full_dictionary()
|
217 |
|
|
|
|
|
218 |
download = True
|
219 |
|
220 |
if download is True:
|
@@ -234,12 +217,6 @@ def main():
|
|
234 |
|
235 |
|
236 |
lemma_dict = json.load(open('lsj_dict.json', 'r'))
|
237 |
-
|
238 |
-
print_test(lemma_dict)
|
239 |
-
|
240 |
-
|
241 |
-
def print_test(lemma_dict):
|
242 |
-
print(lemma_dict["βομβάζω"])
|
243 |
|
244 |
|
245 |
|
|
|
76 |
Get all the text of the descendants of a given element, separating every 'sense' element.
|
77 |
"""
|
78 |
text = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
for child in element:
|
81 |
if child.tag == 'sense':
|
|
|
134 |
|
135 |
|
136 |
def format_text(data):
|
137 |
+
"""
|
138 |
+
Modify text to desired template
|
139 |
+
"""
|
140 |
text = data['definitions']['text']
|
141 |
|
142 |
# Change <tr> tags to bold
|
|
|
194 |
|
195 |
|
196 |
|
197 |
+
def main():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
198 |
|
199 |
+
# This code is used to convert the .xml files into a .json file
|
200 |
+
# TODO: Make this a separate function
|
201 |
download = True
|
202 |
|
203 |
if download is True:
|
|
|
217 |
|
218 |
|
219 |
lemma_dict = json.load(open('lsj_dict.json', 'r'))
|
|
|
|
|
|
|
|
|
|
|
|
|
220 |
|
221 |
|
222 |
|
plots.py
CHANGED
@@ -16,9 +16,6 @@ def make_3d_plot_tSNE(vectors_list, target_word, time_slice_model):
|
|
16 |
List structure: [(word, model_name, vector, cosine_sim)]
|
17 |
"""
|
18 |
word = target_word
|
19 |
-
|
20 |
-
# Load model
|
21 |
-
model = load_word2vec_model(f'models/{time_slice_model}.model')
|
22 |
|
23 |
# Extract vectors and names from ./3d_models/{time_slice_model}.model
|
24 |
all_vectors = {}
|
@@ -33,8 +30,6 @@ def make_3d_plot_tSNE(vectors_list, target_word, time_slice_model):
|
|
33 |
# Only keep the vectors that are in vectors_list and their cosine similarities
|
34 |
result_with_names = [(word, all_vectors[word], cosine_sim) for word, _, _, cosine_sim in vectors_list]
|
35 |
|
36 |
-
|
37 |
-
|
38 |
# Create DataFrame from the transformed vectors
|
39 |
df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])
|
40 |
|
|
|
16 |
List structure: [(word, model_name, vector, cosine_sim)]
|
17 |
"""
|
18 |
word = target_word
|
|
|
|
|
|
|
19 |
|
20 |
# Extract vectors and names from ./3d_models/{time_slice_model}.model
|
21 |
all_vectors = {}
|
|
|
30 |
# Only keep the vectors that are in vectors_list and their cosine similarities
|
31 |
result_with_names = [(word, all_vectors[word], cosine_sim) for word, _, _, cosine_sim in vectors_list]
|
32 |
|
|
|
|
|
33 |
# Create DataFrame from the transformed vectors
|
34 |
df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])
|
35 |
|
vector_graph.py
CHANGED
@@ -3,8 +3,7 @@ import numpy as np
|
|
3 |
from sklearn.decomposition import PCA
|
4 |
from sklearn.preprocessing import StandardScaler
|
5 |
import pandas as pd
|
6 |
-
|
7 |
-
import umap
|
8 |
|
9 |
|
10 |
def create_3d_vectors(word, time_slice, nearest_neighbours_vectors):
|
|
|
3 |
from sklearn.decomposition import PCA
|
4 |
from sklearn.preprocessing import StandardScaler
|
5 |
import pandas as pd
|
6 |
+
|
|
|
7 |
|
8 |
|
9 |
def create_3d_vectors(word, time_slice, nearest_neighbours_vectors):
|
word2vec.py
CHANGED
@@ -4,11 +4,11 @@ import os
|
|
4 |
import pickle
|
5 |
import tempfile
|
6 |
import pandas as pd
|
7 |
-
import xlsxwriter
|
8 |
from sklearn.preprocessing import StandardScaler
|
9 |
from sklearn.manifold import TSNE
|
10 |
import plotly.express as px
|
11 |
from collections import Counter
|
|
|
12 |
|
13 |
|
14 |
|
@@ -476,7 +476,6 @@ def count_lemmas(directory):
|
|
476 |
|
477 |
return lemma_count_dict
|
478 |
|
479 |
-
|
480 |
|
481 |
|
482 |
def main():
|
|
|
4 |
import pickle
|
5 |
import tempfile
|
6 |
import pandas as pd
|
|
|
7 |
from sklearn.preprocessing import StandardScaler
|
8 |
from sklearn.manifold import TSNE
|
9 |
import plotly.express as px
|
10 |
from collections import Counter
|
11 |
+
import streamlit as st
|
12 |
|
13 |
|
14 |
|
|
|
476 |
|
477 |
return lemma_count_dict
|
478 |
|
|
|
479 |
|
480 |
|
481 |
def main():
|