Mark7549 committed on
Commit
6640785
1 Parent(s): 4dd8921

added lemma occurrences to nearest neighbours function

Browse files
Files changed (2) hide show
  1. app.py +5 -0
  2. word2vec.py +7 -2
app.py CHANGED
@@ -136,7 +136,12 @@ if active_tab == "Nearest neighbours":
136
  nearest_neighbours[model],
137
  columns = ['Word', 'Cosine Similarity']
138
  )
 
 
 
 
139
 
 
140
  all_dfs.append((model, df))
141
  st.table(df)
142
 
 
136
  nearest_neighbours[model],
137
  columns = ['Word', 'Cosine Similarity']
138
  )
139
+
140
+ # Add word occurrences to dataframe
141
+ df['Occurences'] = df['Word'].apply(lambda x: lemma_counts[model][x])
142
+
143
 
144
+
145
  all_dfs.append((model, df))
146
  st.table(df)
147
 
word2vec.py CHANGED
@@ -464,13 +464,18 @@ def count_lemmas(directory):
464
  """
465
  lemma_count_dict = {}
466
  for file in os.listdir(directory):
 
 
 
 
467
  if file.endswith(".txt"):
468
  with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
469
  text = f.read()
470
  words = text.split()
471
- lemma_count_dict[file] = Counter(words)
472
 
473
  return lemma_count_dict
 
474
 
475
 
476
 
@@ -497,7 +502,7 @@ def main():
497
  # Iterate over all words and print their vectors
498
  # iterate_over_words(model)
499
 
500
- count_lemmas('lemma_list_raw')
501
 
502
 
503
  if __name__ == "__main__":
 
464
  """
465
  lemma_count_dict = {}
466
  for file in os.listdir(directory):
467
+ model_name = file.split('.')[0].replace('_', ' ').capitalize()
468
+ if len(model_name.split()) == 2:
469
+ # Also capitalize second part of model name
470
+ model_name = ' '.join([word.capitalize() for word in model_name.split()])
471
  if file.endswith(".txt"):
472
  with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
473
  text = f.read()
474
  words = text.split()
475
+ lemma_count_dict[model_name] = Counter(words)
476
 
477
  return lemma_count_dict
478
+
479
 
480
 
481
 
 
502
  # Iterate over all words and print their vectors
503
  # iterate_over_words(model)
504
 
505
+ print(count_lemmas('lemma_list_raw'))
506
 
507
 
508
  if __name__ == "__main__":