Mark7549 committed on
Commit
0d0f07a
·
1 Parent(s): 47e1289

Migrated from gradio to streamlit

Browse files
app.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from streamlit_option_menu import option_menu
3
+ from word2vec import *
4
+
5
# Streamlit front end for the Ancient Greek word2vec models.
# A horizontal menu selects which of the four views is rendered on each rerun.
st.set_page_config(page_title="Ancient Greek Word2Vec", layout="centered")

# Horizontal menu
active_tab = option_menu(
    None,
    ["Nearest neighbours", "Cosine similarity", "3D graph", 'Dictionary'],
    menu_icon="cast",
    default_index=0,
    orientation="horizontal",
)

# Nearest neighbours tab
if active_tab == "Nearest neighbours":
    st.write("### TO DO: add description of function")
    col1, col2 = st.columns(2)
    with st.container():
        with col1:
            word = st.text_input("Enter a word", placeholder="ἀνήρ")

        with col2:
            time_slice = st.multiselect(
                "Time slice",
                ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"],
            )

        # Keep the slider's value: the call result used to be discarded, so the
        # requested number of neighbours could never reach the search.
        n_neighbours = st.slider("Number of neighbours", 1, 50, 15)

        nearest_neighbours_button = st.button("Find nearest neighbours")

        if nearest_neighbours_button:
            # Placeholder: word, time_slice and n_neighbours are collected above
            # but are not yet passed to any search function.
            st.write("button pressed")

# Cosine similarity tab
elif active_tab == "Cosine similarity":
    with st.container():
        st.write("Cosine similarity tab")

# 3D graph tab
elif active_tab == "3D graph":
    with st.container():
        st.write("3D graph tab")

# Dictionary tab
elif active_tab == "Dictionary":
    with st.container():
        st.write("Dictionary tab")
+
models/.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ *.model filter=lfs diff=lfs merge=lfs -text
models/archaic_cbow.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdd1887db84078af826ae006bf11f884c808342f1ff9da93fd525052eef08204
3
+ size 1647899
models/classical_cbow.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a50d112100a49d901e45e798591d2040c53bc50c67a48da1e05294f207ed5e2e
3
+ size 6263363
models/early_roman_cbow.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f63942fae0974f4c3e39552d2d574a2f4b84e125c648d428a038e6192ec6f3f8
3
+ size 8483329
models/hellen_cbow.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:027f8bdad4555ad4a4821a65ab2d564275105dda2d02e598e1f5f3435aedd90a
3
+ size 5473215
models/late_roman_cbow.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53d66deaf1b14067cead5da52e46e75d0944c2140a9b36782e85f01f2ac454f4
3
+ size 3696190
word2vec.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from gensim.models import Word2Vec
2
+ from collections import defaultdict
3
+ import os
4
+ import tempfile
5
+
6
+
7
def load_all_models():
    '''
    Load every bundled word2vec model.

    Return: list of (time slice name, loaded model) tuples, in the order
    archaic, classical, early_roman, hellen, late_roman.
    '''
    slice_files = (
        ('archaic', 'models/archaic_cbow.model'),
        ('classical', 'models/classical_cbow.model'),
        ('early_roman', 'models/early_roman_cbow.model'),
        ('hellen', 'models/hellen_cbow.model'),
        ('late_roman', 'models/late_roman_cbow.model'),
    )

    # Models are loaded one after another, in the order listed above
    return [(name, load_word2vec_model(path)) for name, path in slice_files]
19
+
20
+
21
def load_word2vec_model(model_path):
    '''
    Read a saved Word2Vec model from model_path and return it
    '''
    loaded_model = Word2Vec.load(model_path)
    return loaded_model
26
+
27
+
28
def get_word_vector(model, word):
    '''
    Look up word in the model's keyed vectors (model.wv) and return its vector
    '''
    keyed_vectors = model.wv
    return keyed_vectors[word]
33
+
34
+
35
def iterate_over_words(model):
    '''
    Print every word in the vocabulary together with its index and vector

    model: a loaded word2vec model; its model.wv.key_to_index mapping
           supplies both the words and the index printed for each word
    '''
    # key_to_index already provides each word's index, so the separate
    # counter the loop used to maintain (and overwrite on every pass) is gone.
    for word, index in model.wv.key_to_index.items():
        vector = get_word_vector(model, word)
        print(f'{index} Word: {word}, Vector: {vector}')
44
+
45
+
46
def model_dictionary(model):
    '''
    Return the dictionary of the word2vec model
    Key is the word and value is the vector of the word

    The result is a defaultdict(list), so looking up a word that is not in
    the vocabulary yields an empty list instead of raising KeyError.
    '''
    # Named vectors_by_word rather than dict so the builtin is not shadowed
    vectors_by_word = defaultdict(list)

    # Only the words are needed here; the index values are not used
    for word in model.wv.key_to_index:
        vectors_by_word[word] = get_word_vector(model, word)

    return vectors_by_word
57
+
58
+
59
def dot_product(vector_a, vector_b):
    '''
    Multiply the two vectors element by element and return the sum.
    Pairs are formed with zip, so extra elements of the longer vector are ignored.
    '''
    total = 0
    for left, right in zip(vector_a, vector_b):
        total = total + left * right
    return total
64
+
65
+
66
def magnitude(vector):
    '''
    Return the Euclidean length of a vector
    (the square root of the sum of its squared components)
    '''
    squared_total = 0
    for component in vector:
        squared_total = squared_total + component ** 2
    return squared_total ** 0.5
71
+
72
+
73
def cosine_similarity(vector_a, vector_b):
    '''
    Return the cosine similarity of two vectors

    vector_a, vector_b: sequences of numbers

    Return: the similarity formatted as a string with two decimals
            (e.g. "0.87"). When either vector has zero magnitude the
            similarity is reported as "0.00".
    '''
    mag_a = magnitude(vector_a)
    mag_b = magnitude(vector_b)

    # Avoid division by zero
    if mag_a == 0 or mag_b == 0:
        similarity = 0.0
    else:
        similarity = dot_product(vector_a, vector_b) / (mag_a * mag_b)

    # Always return the same type: the zero-magnitude case used to return a
    # float while every other case returned a string, which made the results
    # impossible to compare or sort together.
    return "{:.2f}".format(similarity)
87
+
88
+
89
def get_cosine_similarity(word1, word2, time_slice):
    '''
    Return the cosine similarity of two words within one time slice

    time_slice: model file name without directory or extension; the model
                is read from models/<time_slice>.model
    Return: the result of cosine_similarity, or None when the model file
            does not exist
    '''
    # TODO: tidy this up
    model_path = f'models/{time_slice}.model'

    # Nothing to compare when the model file is missing
    if not os.path.exists(model_path):
        return

    vectors = model_dictionary(load_word2vec_model(model_path))
    return cosine_similarity(vectors[word1], vectors[word2])
102
+
103
+
104
def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
    '''
    Return the cosine similarity of one word between two time slices

    time_slice1, time_slice2: model file names without directory or
                              extension (models/<time_slice>.model)
    Return: the result of cosine_similarity, or None when either model
            file does not exist
    '''
    first_path = f'models/{time_slice1}.model'
    second_path = f'models/{time_slice2}.model'

    # Both model files must be present
    if not (os.path.exists(first_path) and os.path.exists(second_path)):
        return

    first_model = load_word2vec_model(first_path)
    second_model = load_word2vec_model(second_path)

    first_vectors = model_dictionary(first_model)
    second_vectors = model_dictionary(second_model)

    return cosine_similarity(first_vectors[word], second_vectors[word])
120
+
121
+
122
+
123
def validate_nearest_neighbours(word, time_slice_model, n):
    '''
    Check the input of the nearest neighbours function

    Return: False when word is an empty string, time_slice_model equals
            'models/None.model' or n is an empty string; True otherwise
    '''
    return not (word == '' or time_slice_model == 'models/None.model' or n == '')
130
+
131
+
132
def _default_models():
    '''
    Load the models from ./models on first use and reuse them afterwards
    '''
    if not hasattr(_default_models, 'loaded'):
        _default_models.loaded = load_all_models()
    return _default_models.loaded


def get_nearest_neighbours(word, time_slice_model, n=10, models=None):
    '''
    Return the nearest neighbours of a word

    word: the word for which the nearest neighbours are calculated
    time_slice_model: name of the model file (without directory or extension)
                      of the time slice of the input word; the model is
                      loaded from models/<time_slice_model>.model
    models: list of tuples with the name of the time slice and the word2vec model
            (default: None, meaning all models in ./models, loaded on first use)
    n: the number of nearest neighbours to return (default: 10)

    Return: list of tuples with the word, the time slice and
    the cosine similarity of the nearest neighbours, most similar first.
    When a parameter is missing, a single error row is returned instead.

    NOTE(review): a word that is absent from the time slice model is not
    handled here; the vector lookup presumably raises KeyError - confirm.
    '''

    # Check if all parameters are set
    if not validate_nearest_neighbours(word, time_slice_model, n):
        return [['Error: not all parameters are set', '', '']]

    # Nothing to collect; also avoids taking min() of an empty list below
    if n < 1:
        return []

    # Load the default models lazily so importing this module does not
    # read every model file from disk
    if models is None:
        models = _default_models()

    source_model = load_word2vec_model(f'models/{time_slice_model}.model')
    vector_1 = get_word_vector(source_model, word)
    nearest_neighbours = []

    def score(neighbour):
        # The similarity may be formatted text; rank on its numeric value so
        # that e.g. '-0.50' sorts below '-0.10' and mixed types do not clash
        return float(neighbour[2])

    # Iterate over all models
    for model_name, model in models:

        # Iterate over all words of the model
        for candidate in model.wv.key_to_index:

            # Vector of the current word
            vector_2 = get_word_vector(model, candidate)

            # Calculate the cosine similarity between current word and input word
            similarity = cosine_similarity(vector_1, vector_2)
            entry = (candidate, model_name, similarity)

            # If the list of nearest neighbours is not full yet, add the current word
            if len(nearest_neighbours) < n:
                nearest_neighbours.append(entry)
                continue

            # If the list is full, replace the entry with the smallest cosine similarity
            smallest_neighbour = min(nearest_neighbours, key=score)
            if score(entry) > score(smallest_neighbour):
                nearest_neighbours.remove(smallest_neighbour)
                nearest_neighbours.append(entry)

    return sorted(nearest_neighbours, key=score, reverse=True)
183
+
184
+
185
def write_to_file(data):
    '''
    Write the data to a new temporary file

    data: any object; its str() representation is written
    Return: path of the created file (temp_*.txt inside /tmp)
    '''
    # Create random tmp file name
    temp_file_descriptor, temp_file_path = tempfile.mkstemp(prefix="temp_", suffix=".txt", dir="/tmp")

    # Write through the descriptor mkstemp already opened instead of closing
    # it and re-opening the path; the with-block closes it even on error.
    # UTF-8 is set explicitly so Greek text does not depend on the locale.
    with os.fdopen(temp_file_descriptor, 'w', encoding='utf-8') as temp_file:
        temp_file.write(str(data))

    return temp_file_path
199
+
200
+
201
def main():
    '''
    Manual check: print the 5 nearest neighbours of 'πατήρ', taking its
    vector from the archaic model and searching across all models
    '''
    models = load_all_models()

    # time_slice_model is the model file name (models/archaic_cbow.model),
    # not a loaded model. models is passed by keyword: given positionally it
    # lands in the n slot and collides with n=5.
    nearest_neighbours = get_nearest_neighbours('πατήρ', 'archaic_cbow', n=5, models=models)
    print(nearest_neighbours)


if __name__ == "__main__":
    main()