Mark7549 committed on
Commit
cdb0a70
1 Parent(s): ee625fc

removed forms for first 2 tabs and used cache to make program faster

Browse files
Files changed (4) hide show
  1. app.py +20 -12
  2. autocomplete.py +58 -1
  3. corpora/compass_filtered_v2.pkl.gz +3 -0
  4. word2vec.py +5 -5
app.py CHANGED
@@ -21,6 +21,11 @@ def load_lsj_dict():
21
  def load_all_models_words():
22
  return sorted(load_compressed_word_list('corpora/compass_filtered.pkl.gz'), key=custom_sort)
23
 
 
 
 
 
 
24
  # Load compressed word list
25
  all_models_words = load_all_models_words()
26
 
@@ -28,6 +33,9 @@ all_models_words = load_all_models_words()
28
  # Prepare lsj dictionary
29
  lemma_dict = load_lsj_dict()
30
 
 
 
 
31
 
32
  # Horizontal menu
33
  active_tab = option_menu(None, ["Nearest neighbours", "Cosine similarity", "3D graph", 'Dictionary'],
@@ -41,13 +49,13 @@ if active_tab == "Nearest neighbours":
41
  eligible_models = ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"]
42
  all_models_words = load_all_models_words()
43
 
44
- with st.form("nn_form"):
45
  st.markdown("## Nearest Neighbours")
46
  target_word = st.multiselect("Enter a word", options=all_models_words, max_selections=1)
47
  if len(target_word) > 0:
48
  target_word = target_word[0]
49
 
50
- eligible_models = check_word_in_models(target_word)
51
 
52
  models = st.multiselect(
53
  "Select models to search for neighbours",
@@ -55,8 +63,8 @@ if active_tab == "Nearest neighbours":
55
  )
56
  n = st.slider("Number of neighbours", 1, 50, 15)
57
 
58
- nearest_neighbours_button = st.form_submit_button("Find nearest neighbours")
59
-
60
  if nearest_neighbours_button:
61
  if validate_nearest_neighbours(target_word, n, models) == False:
62
  st.error('Please fill in all fields')
@@ -98,11 +106,11 @@ if active_tab == "Nearest neighbours":
98
 
99
  # Cosine similarity tab
100
  elif active_tab == "Cosine similarity":
101
- eligible_models_1 = []
102
- eligible_models_2 = []
103
  all_models_words = load_all_models_words()
104
 
105
- with st.form("cosine_similarity_form"):
 
 
106
  st.markdown("## Cosine similarity")
107
  col1, col2 = st.columns(2)
108
  col3, col4 = st.columns(2)
@@ -110,24 +118,24 @@ elif active_tab == "Cosine similarity":
110
  word_1 = st.multiselect("Enter a word", placeholder="πατήρ", max_selections=1, options=all_models_words)
111
  if len(word_1) > 0:
112
  word_1 = word_1[0]
113
- eligible_models_1 = check_word_in_models(word_1)
114
 
115
-
 
116
 
117
- time_slice_1 = st.selectbox("Time slice word 1", eligible_models_1)
118
 
119
  with st.container():
120
  with col3:
121
  word_2 = st.multiselect("Enter a word", placeholder="μήτηρ", max_selections=1, options=all_models_words)
122
  if len(word_2) > 0:
123
  word_2 = word_2[0]
124
- eligible_models_2 = check_word_in_models(word_2)
125
 
126
  with col4:
127
  time_slice_2 = st.selectbox("Time slice word 2", eligible_models_2)
128
 
129
  # Create button for calculating cosine similarity
130
- cosine_similarity_button = st.form_submit_button("Calculate cosine similarity")
131
 
132
  # If the button is clicked, execute calculation
133
  if cosine_similarity_button:
 
21
  def load_all_models_words():
22
  return sorted(load_compressed_word_list('corpora/compass_filtered.pkl.gz'), key=custom_sort)
23
 
24
+
25
@st.cache_data
def load_models_for_word_dict():
    """
    Build the lookup of words to the models they occur in

    Delegates to word_in_models_dict for the filtered compass word list;
    the result is wrapped by st.cache_data.
    """
    corpus_path = 'corpora/compass_filtered.pkl.gz'
    return word_in_models_dict(corpus_path)
28
+
29
  # Load compressed word list
30
  all_models_words = load_all_models_words()
31
 
 
33
  # Prepare lsj dictionary
34
  lemma_dict = load_lsj_dict()
35
 
36
+ # Load dictionary with words as keys and eligible models as values
37
+ models_for_word_dict = load_models_for_word_dict()
38
+
39
 
40
  # Horizontal menu
41
  active_tab = option_menu(None, ["Nearest neighbours", "Cosine similarity", "3D graph", 'Dictionary'],
 
49
  eligible_models = ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"]
50
  all_models_words = load_all_models_words()
51
 
52
+ with st.container():
53
  st.markdown("## Nearest Neighbours")
54
  target_word = st.multiselect("Enter a word", options=all_models_words, max_selections=1)
55
  if len(target_word) > 0:
56
  target_word = target_word[0]
57
 
58
+ eligible_models = models_for_word_dict[target_word]
59
 
60
  models = st.multiselect(
61
  "Select models to search for neighbours",
 
63
  )
64
  n = st.slider("Number of neighbours", 1, 50, 15)
65
 
66
+ nearest_neighbours_button = st.button("Find nearest neighbours")
67
+
68
  if nearest_neighbours_button:
69
  if validate_nearest_neighbours(target_word, n, models) == False:
70
  st.error('Please fill in all fields')
 
106
 
107
  # Cosine similarity tab
108
  elif active_tab == "Cosine similarity":
 
 
109
  all_models_words = load_all_models_words()
110
 
111
+ with st.container():
112
+ eligible_models_1 = []
113
+ eligible_models_2 = []
114
  st.markdown("## Cosine similarity")
115
  col1, col2 = st.columns(2)
116
  col3, col4 = st.columns(2)
 
118
  word_1 = st.multiselect("Enter a word", placeholder="πατήρ", max_selections=1, options=all_models_words)
119
  if len(word_1) > 0:
120
  word_1 = word_1[0]
121
+ eligible_models_1 = models_for_word_dict[word_1]
122
 
123
+ with col2:
124
+ time_slice_1 = st.selectbox("Time slice word 1", options = eligible_models_1)
125
 
 
126
 
127
  with st.container():
128
  with col3:
129
  word_2 = st.multiselect("Enter a word", placeholder="μήτηρ", max_selections=1, options=all_models_words)
130
  if len(word_2) > 0:
131
  word_2 = word_2[0]
132
+ eligible_models_2 = models_for_word_dict[word_2]
133
 
134
  with col4:
135
  time_slice_2 = st.selectbox("Time slice word 2", eligible_models_2)
136
 
137
  # Create button for calculating cosine similarity
138
+ cosine_similarity_button = st.button("Calculate cosine similarity")
139
 
140
  # If the button is clicked, execute calculation
141
  if cosine_similarity_button:
autocomplete.py CHANGED
@@ -1,5 +1,6 @@
1
  import pickle
2
  import gzip
 
3
 
4
 
5
  def get_unique_words(corpus_filename):
@@ -34,4 +35,60 @@ def get_autocomplete(input_word=" ", all_words=" "):
34
  """
35
  Get a list of words that start with the input word
36
  """
37
- return [word for word in all_words if word.startswith(input_word)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pickle
2
  import gzip
3
+ from word2vec import *
4
 
5
 
6
  def get_unique_words(corpus_filename):
 
35
  """
36
  Get a list of words that start with the input word
37
  """
38
+ return [word for word in all_words if word.startswith(input_word)]
39
+
40
+
41
def custom_sort(item):
    """
    Sort key that orders words alphabetically and puts numbers at the end

    Strings made up only of digits get rank 2 and are compared by their raw
    text; every other string gets rank 0 and is compared case-insensitively.
    Returns a (rank, text) tuple suitable for sorted(..., key=custom_sort).
    """
    # No debug output here: a sort key runs once per element, so printing
    # would flood stdout on every sort of a large word list.
    if item.isdigit():
        return (2, item)  # Place numbers last
    return (0, item.lower())
47
+
48
+
49
def order_compressed_list(filename):
    """
    Order the compressed list of words alphabetically and put numbers at the end

    filename: path of a gzip-compressed pickle file holding the words.
    Returns a new list sorted with custom_sort as the key; the file is only
    read, never written.
    """
    # NOTE: unpickling can execute arbitrary code, so only pass files that
    # come from a trusted source.
    with gzip.open(filename, 'rb') as file:
        words = pickle.load(file)

    # The former extension-stripping local was never used and has been dropped.
    return sorted(words, key=custom_sort)
63
+
64
+
65
def read_compressed_list(filename):
    """
    Read the compressed list of words

    filename: path of a gzip-compressed pickle file.
    Prints the unpickled content (as before) and also returns it, so callers
    can use the data instead of only seeing it on stdout.
    """
    # NOTE: unpickling can execute arbitrary code, so only pass files that
    # come from a trusted source.
    with gzip.open(filename, 'rb') as file:
        words = pickle.load(file)

    print(words)
    return words
71
+
72
+
73
def word_in_models_dict(words_file):
    """
    Create a dictionary with words as keys and models in which the word occurs as values

    words_file: path of a gzip-compressed pickle file holding the words.
    Returns a dict mapping every word to a list of time-slice names (as
    produced by convert_model_to_time_name), in the order the models are
    returned by load_all_models. Words found in no model map to an empty list.
    """
    # NOTE: unpickling can execute arbitrary code, so only pass files that
    # come from a trusted source.
    with gzip.open(words_file, 'rb') as file:
        words = pickle.load(file)

    models = load_all_models()

    word_models = {word: [] for word in words}  # Initialize word_models dictionary with empty lists

    for model in models:
        # Each entry is indexed as (name, model); the model's vocabulary is
        # looked up once per model instead of once per word.
        model_name = convert_model_to_time_name(model[0])
        vocabulary = model[1].wv.key_to_index

        # Walk the dictionary rather than the raw word list: a word that
        # appears twice in the file must not get the same model name twice.
        for word, found_in in word_models.items():
            if word in vocabulary:
                found_in.append(model_name)

    return word_models
91
+
92
+
93
+
94
+
corpora/compass_filtered_v2.pkl.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32818a420a9458c7e8be4919f78a2623ffca704cd93340b05b4825f209c01b61
3
+ size 127623
word2vec.py CHANGED
@@ -161,15 +161,15 @@ def convert_model_to_time_name(model_name):
161
  '''
162
  Convert the model name to the time slice name
163
  '''
164
- if model_name == 'archaic_cbow':
165
  return 'Archaic'
166
- elif model_name == 'classical_cbow':
167
  return 'Classical'
168
- elif model_name == 'early_roman_cbow':
169
  return 'Early Roman'
170
- elif model_name == 'hellen_cbow':
171
  return 'Hellenistic'
172
- elif model_name == 'late_roman_cbow':
173
  return 'Late Roman'
174
 
175
 
 
161
  '''
162
  Convert the model name to the time slice name
163
  '''
164
+ if model_name == 'archaic_cbow' or model_name == 'archaic':
165
  return 'Archaic'
166
+ elif model_name == 'classical_cbow' or model_name == 'classical':
167
  return 'Classical'
168
+ elif model_name == 'early_roman_cbow' or model_name == 'early_roman':
169
  return 'Early Roman'
170
+ elif model_name == 'hellen_cbow' or model_name == 'hellen':
171
  return 'Hellenistic'
172
+ elif model_name == 'late_roman_cbow' or model_name == 'late_roman':
173
  return 'Late Roman'
174
 
175