Mark7549 committed
Commit 17c5755
1 Parent(s): 74e30c6

improved code quality

Files changed (1)
  1. word2vec.py +99 -168
word2vec.py CHANGED
@@ -1,16 +1,9 @@
 from gensim.models import Word2Vec
 from collections import defaultdict
 import os
-import pickle
 import tempfile
 import pandas as pd
-from sklearn.preprocessing import StandardScaler
-from sklearn.manifold import TSNE
-import plotly.express as px
 from collections import Counter
-import streamlit as st
-
-
 
 
 def load_all_models():
@@ -30,6 +23,8 @@ def load_all_models():
 def load_selected_models(selected_models):
     '''
     Load the selected word2vec models
+
+    selected_models: a list of models that should be loaded
     '''
     models = []
     for model in selected_models:
@@ -48,6 +43,8 @@ def load_selected_models(selected_models):
 def load_word2vec_model(model_path):
     '''
     Load a word2vec model from a file
+
+    model_path: relative path to model files
     '''
     return Word2Vec.load(model_path)
 
@@ -55,6 +52,9 @@ def load_word2vec_model(model_path):
 def get_word_vector(model, word):
     '''
     Return the word vector of a word
+
+    model: word2vec model object
+    word: word to extract vector from
     '''
     return model.wv[word]
 
@@ -62,6 +62,8 @@ def get_word_vector(model, word):
 def iterate_over_words(model):
     '''
     Iterate over all words in the vocabulary and print their vectors
+
+    model: word2vec model object
     '''
     index = 0
     for word, index in model.wv.key_to_index.items():
@@ -74,6 +76,8 @@ def model_dictionary(model):
     '''
     Return the dictionary of the word2vec model
     Key is the word and value is the vector of the word
+
+    model: word2vec model object
     '''
     dict = defaultdict(list)
     for word, index in model.wv.key_to_index.items():
@@ -86,13 +90,24 @@ def model_dictionary(model):
 def dot_product(vector_a, vector_b):
     '''
     Return the dot product of two vectors
+
+    vector_a: A list of numbers representing the first vector
+    vector_b: A list of numbers representing the second vector
+
+    Returns:
+    A single number representing the dot product of the two vectors
     '''
     return sum(a * b for a, b in zip(vector_a, vector_b))
 
 
 def magnitude(vector):
     '''
-    Return the magnitude of a vector
+    Returns the magnitude of a vector
+
+    vector: A list of numbers representing the vector
+
+    Returns:
+    A single number representing the magnitude of the vector.
     '''
     return sum(x**2 for x in vector) ** 0.5
 
@@ -100,6 +115,13 @@ def magnitude(vector):
 def cosine_similarity(vector_a, vector_b):
     '''
     Return the cosine similarity of two vectors
+
+    vector_a: A list of numbers representing the first vector
+    vector_b: A list of numbers representing the second vector
+
+    Returns:
+    A string representing the cosine similarity of the two vectors,
+    formatted to two decimals.
     '''
     dot_prod = dot_product(vector_a, vector_b)
     mag_a = magnitude(vector_a)
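For reference, a quick worked example of the three vector helpers documented above (input vectors chosen arbitrarily; the return value of cosine_similarity follows its new docstring):

    a = [1.0, 2.0, 3.0]
    b = [4.0, 5.0, 6.0]

    dot_product(a, b)        # 1*4 + 2*5 + 3*6 = 32.0
    magnitude(a)             # sqrt(14) ~ 3.74
    magnitude(b)             # sqrt(77) ~ 8.77
    cosine_similarity(a, b)  # 32.0 / (3.74 * 8.77) -> '0.97', formatted to two decimals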
@@ -116,10 +138,16 @@ def cosine_similarity(vector_a, vector_b):
 def get_cosine_similarity(word1, time_slice_1, word2, time_slice_2):
     '''
     Return the cosine similarity of two words
+
+    word1: The first word as a string.
+    time_slice_1: The time slice for the first word as a string.
+    word2: The second word as a string.
+    time_slice_2: The time slice for the second word as a string.
+
+    Returns:
+    A string representing the cosine similarity of the two words, formatted to two decimal places.
+
     '''
-    # TODO: needs cleanup
-
-    # Return if path does not exist
 
     time_slice_1 = convert_time_name_to_model(time_slice_1)
     time_slice_2 = convert_time_name_to_model(time_slice_2)
@@ -139,6 +167,14 @@ def get_cosine_similarity(word1, time_slice_1, word2, time_slice_2):
 def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
     '''
     Return the cosine similarity of one word in two different time slices
+
+    word: The word as a string.
+    time_slice1: The first time slice as a string.
+    time_slice2: The second time slice as a string.
+
+    Returns:
+    A string representing the cosine similarity of the word in the two time slices, formatted to two decimal places.
+
     '''
 
     # Return if path does not exist
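A usage sketch for the two similarity helpers above; the time slice names follow convert_time_name_to_model, while the word and the output values are illustrative only:

    print(get_cosine_similarity('λόγος', 'Archaic', 'λόγος', 'Classical'))  # e.g. '0.83'
    print(get_cosine_similarity_one_word('λόγος', 'Archaic', 'Classical'))  # e.g. '0.83'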
@@ -158,6 +194,14 @@ def get_cosine_similarity_one_word(word, time_slice1, time_slice2):
 def validate_nearest_neighbours(word, n, models):
     '''
     Validate the input of the nearest neighbours function
+
+    word: The word as a string.
+    n: The number of nearest neighbours to find as an integer.
+    models: A list of model names as strings.
+
+    Returns:
+    A boolean value: True if the inputs are valid, False otherwise.
+
     '''
     if word == '' or n == '' or models == []:
         return False
@@ -167,6 +211,11 @@ def validate_nearest_neighbours(word, n, models):
 def convert_model_to_time_name(model_name):
     '''
     Convert the model name to the time slice name
+
+    model_name: The model name as a string.
+
+    Returns:
+    A string representing the corresponding time slice name.
     '''
     if model_name == 'archaic_cbow' or model_name == 'archaic':
         return 'Archaic'
@@ -183,6 +232,12 @@ def convert_model_to_time_name(model_name):
 def convert_time_name_to_model(time_name):
     '''
     Convert the time slice name to the model name
+
+    time_name: The time slice name as a string.
+
+    Returns:
+    A string representing the corresponding model name.
+
     '''
     if time_name == 'Archaic':
         return 'archaic_cbow'
@@ -205,52 +260,6 @@ def convert_time_name_to_model(time_name):
     elif time_name == 'archaic':
         return 'Archaic'
 
-def get_nearest_neighbours2(word, n=10, models=load_all_models()):
-    '''
-    Return the nearest neighbours of a word
-
-    word: the word for which the nearest neighbours are calculated
-    time_slice_model: the word2vec model of the time slice of the input word
-    models: list of tuples with the name of the time slice and the word2vec model (default: all in ./models)
-    n: the number of nearest neighbours to return (default: 10)
-
-    Return: list of tuples with the word, the time slice and
-    the cosine similarity of the nearest neighbours
-    '''
-
-    time_slice_model = load_word2vec_model(f'models/{time_slice_model}.model')
-    vector_1 = get_word_vector(time_slice_model, word)
-    nearest_neighbours = []
-
-    # Iterate over all models
-    for model in models:
-        model_name = model[0]
-        time_name = convert_model_to_time_name(model_name)
-        model = model[1]
-
-        # Iterate over all words of the model
-        for word, index in model.wv.key_to_index.items():
-
-            # Vector of the current word
-            vector_2 = get_word_vector(model, word)
-
-            # Calculate the cosine similarity between current word and input word
-            cosine_similarity_vectors = cosine_similarity(vector_1, vector_2)
-
-            # If the list of nearest neighbours is not full yet, add the current word
-            if len(nearest_neighbours) < n:
-                nearest_neighbours.append((word, time_name, cosine_similarity_vectors))
-
-            # If the list of nearest neighbours is full, replace the word with the smallest cosine similarity
-            else:
-                smallest_neighbour = min(nearest_neighbours, key=lambda x: x[2])
-                if cosine_similarity_vectors > smallest_neighbour[2]:
-                    nearest_neighbours.remove(smallest_neighbour)
-                    nearest_neighbours.append((word, time_name, cosine_similarity_vectors))
-
-
-    return sorted(nearest_neighbours, key=lambda x: x[2], reverse=True)
-
 
 def get_nearest_neighbours(target_word, n=10, models=load_all_models()):
     """
@@ -298,9 +307,16 @@ def get_nearest_neighbours(target_word, n=10, models=load_all_models()):
 
 
 def get_nearest_neighbours_vectors(word, time_slice_model, n=15):
-    """
-    Returns the vectors of the nearest neighbours of a word
-    """
+    '''
+    Return the vectors of the nearest neighbours of a word
+
+    word: the word for which the nearest neighbours are calculated
+    time_slice_model: the word2vec model of the time slice of the input word
+    n: the number of nearest neighbours to return (default: 15)
+
+    Return: list of tuples with the word, the time slice, the vector, and the cosine similarity
+    of the nearest neighbours
+    '''
     model_name = convert_model_to_time_name(time_slice_model)
     time_slice_model = load_word2vec_model(f'models/{time_slice_model}.model')
     vector_1 = get_word_vector(time_slice_model, word)
@@ -327,6 +343,10 @@ def get_nearest_neighbours_vectors(word, time_slice_model, n=15):
 def write_to_file(data):
     '''
     Write the data to a file
+
+    data: the data to be written to the file
+
+    Return: the path to the temporary file
     '''
     # Create random tmp file name
     temp_file_descriptor, temp_file_path = tempfile.mkstemp(prefix="temp_", suffix=".txt", dir="/tmp")
@@ -342,7 +362,11 @@ def write_to_file(data):
 
 def store_df_in_temp_file(all_dfs):
     '''
-    Store the dataframe in a temporary file
+    Store the dataframes in a temporary file
+
+    all_dfs: list of tuples with the name of the time slice and the dataframe
+
+    Return: the path to the temporary Excel file
     '''
     # Define directory for temporary files
     temp_dir = "./downloads/nn"
@@ -350,37 +374,34 @@ def store_df_in_temp_file(all_dfs):
     # Create the directory if it doesn't exist
     os.makedirs(temp_dir, exist_ok=True)
 
-    # Create random tmp file name
+    # Create random temporary file name
     _, temp_file_path = tempfile.mkstemp(prefix="temp_", suffix=".xlsx", dir=temp_dir)
 
-
     # Concatenate all dataframes
     df = pd.concat([df for _, df in all_dfs], axis=1, keys=[model for model, _ in all_dfs])
 
-
     # Create an ExcelWriter object
     with pd.ExcelWriter(temp_file_path, engine='xlsxwriter') as writer:
         # Create a new sheet
         worksheet = writer.book.add_worksheet('Results')
 
-        # Write text before DataFrames
        start_row = 0
        for model, df in all_dfs:
-            # Write model name as text
            worksheet.write(start_row, 0, f"Model: {model}")
-            # Write DataFrame
            df.to_excel(writer, sheet_name='Results', index=False, startrow=start_row + 1, startcol=0)
-            # Update start_row for the next model
            start_row += df.shape[0] + 3  # Add some space between models
 
    return temp_file_path
 
 
-
 def check_word_in_models(word):
-    """
-    Check in which models a word occurs.
-    """
+    '''
+    Check in which models a word occurs
+
+    word: the word to check
+
+    Return: list of model names where the word occurs
+    '''
    all_models = load_all_models()
    eligible_models = []
 
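To illustrate the shape of all_dfs that store_df_in_temp_file expects (a list of (model name, DataFrame) tuples), a small sketch with invented contents:

    import pandas as pd

    all_dfs = [
        ('archaic_cbow', pd.DataFrame({'word': ['λόγος'], 'cosine_sim': ['0.91']})),
        ('classical_cbow', pd.DataFrame({'word': ['λόγος'], 'cosine_sim': ['0.88']})),
    ]
    path = store_df_in_temp_file(all_dfs)  # writes ./downloads/nn/temp_*.xlsx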
@@ -393,75 +414,16 @@ def check_word_in_models(word):
 
     return eligible_models
 
-
-def reduce_dimensions_tSNE():
-    '''
-    Reduce the dimensions of the data using t-SNE
-    '''
-    all_models = load_all_models()
-
-    for model in all_models:
-        model_name = model[0]
-        model = model[1]
-        model_dict = model_dictionary(model)
-
-        # Extract vectors and names from model_dict
-        all_vector_names = list(model_dict.keys())
-        all_vectors = list(model_dict.values())
-
-        print('Scaling', model_name)
-
-        # Scale vectors
-        scaler = StandardScaler()
-        vectors_scaled = scaler.fit_transform(all_vectors)
-
-        print('Fitting', model_name)
-
-        # Make t-SNE model and fit it to the scaled vectors
-        tsne_model = TSNE(n_components=3, random_state=42)
-        tsne_result = tsne_model.fit_transform(vectors_scaled)
-
-        print('Done fitting')
-
-        # Associate the names with the 3D representations
-        result_with_names = [(all_vector_names[i], tsne_result[i]) for i in range(len(all_vector_names))]
-
-        # Store all vectors in /3d_models/{model_name}.model
-        store_3d_model(result_with_names, model_name)
-
-
-def store_3d_model(result_with_names, model_name):
-    """
-    Store the 3D model data to a file.
-    """
-    output_dir = './3d_models'
-    os.makedirs(output_dir, exist_ok=True)
-    file_path = os.path.join(output_dir, f'{model_name}.model')
-
-    with open(file_path, 'wb') as f:
-        pickle.dump(result_with_names, f)
-    print(f"3D model for {model_name} stored at {file_path}")
-
-
-def print_3d_model(model_name):
-    """
-    Print the 3D model data.
-    """
-    file_path = f'./3d_models/{model_name}.model'
-
-    with open(file_path, 'rb') as f:
-        result_with_names = pickle.load(f)
-
-    for word, vector in result_with_names:
-        print(f'{word}: {vector}')
 
 
-def count_lemmas(directory):
-    """
-    Create a Counter with all words and their occurences for all models
-    """
+def count_lemmas(directory):
+    '''
+    Create a Counter with all words and their occurrences for all models
+
+    directory: the directory containing the text files for the models
+
+    Return: a dictionary where keys are model names and values are Counters of word occurrences
+    '''
     lemma_count_dict = {}
     for file in os.listdir(directory):
         model_name = file.split('.')[0].replace('_', ' ').capitalize()
@@ -475,34 +437,3 @@ def count_lemmas(directory):
         lemma_count_dict[model_name] = Counter(words)
 
     return lemma_count_dict
-
-
-
-def main():
-    # model = load_word2vec_model('models/archaic_cbow.model')
-    # archaic_cbow_dict = model_dictionary(model)
-
-    # score = cosine_similarity(archaic_cbow_dict['Πελοπόννησος'], archaic_cbow_dict['σπάργανον'])
-    # print(score)
-
-
-    # archaic = ('archaic', load_word2vec_model('models/archaic_cbow.model'))
-    # classical = ('classical', load_word2vec_model('models/classical_cbow.model'))
-    # early_roman = ('early_roman', load_word2vec_model('models/early_roman_cbow.model'))
-    # hellen = ('hellen', load_word2vec_model('models/hellen_cbow.model'))
-    # late_roman = ('late_roman', load_word2vec_model('models/late_roman_cbow.model'))
-
-    # models = [archaic, classical, early_roman, hellen, late_roman]
-    # nearest_neighbours = get_nearest_neighbours('πατήρ', 'archaic_cbow', n=5)
-    # print(nearest_neighbours)
-    # vector = get_word_vector(model, 'ἀνήρ')
-    # print(vector)
-
-    # Iterate over all words and print their vectors
-    # iterate_over_words(model)
-
-    print(count_lemmas('lemma_list_raw'))
-
-
-if __name__ == "__main__":
-    main()
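For orientation, a minimal end-to-end sketch of the helpers this commit documents, assuming the module is importable as word2vec; the model path follows the diff's models/ convention and the example word comes from the removed main():

    from word2vec import (load_word2vec_model, get_word_vector,
                          get_nearest_neighbours, check_word_in_models)

    model = load_word2vec_model('models/archaic_cbow.model')
    vector = get_word_vector(model, 'πατήρ')       # raw embedding for the word

    print(check_word_in_models('πατήρ'))           # time-slice models that contain the word
    print(get_nearest_neighbours('πατήρ', n=5))    # top-5 neighbours across all models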
 