added lemma occurrences to nearest neighbours function
Browse files- app.py +5 -0
- word2vec.py +7 -2
app.py
CHANGED
@@ -136,7 +136,12 @@ if active_tab == "Nearest neighbours":
|
|
136 |
nearest_neighbours[model],
|
137 |
columns = ['Word', 'Cosine Similarity']
|
138 |
)
|
|
|
|
|
|
|
|
|
139 |
|
|
|
140 |
all_dfs.append((model, df))
|
141 |
st.table(df)
|
142 |
|
|
|
136 |
nearest_neighbours[model],
|
137 |
columns = ['Word', 'Cosine Similarity']
|
138 |
)
|
139 |
+
|
140 |
+
# Add word occurrences to dataframe
|
141 |
+
df['Occurences'] = df['Word'].apply(lambda x: lemma_counts[model][x])
|
142 |
+
|
143 |
|
144 |
+
|
145 |
all_dfs.append((model, df))
|
146 |
st.table(df)
|
147 |
|
word2vec.py
CHANGED
@@ -464,13 +464,18 @@ def count_lemmas(directory):
|
|
464 |
"""
|
465 |
lemma_count_dict = {}
|
466 |
for file in os.listdir(directory):
|
|
|
|
|
|
|
|
|
467 |
if file.endswith(".txt"):
|
468 |
with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
|
469 |
text = f.read()
|
470 |
words = text.split()
|
471 |
-
lemma_count_dict[
|
472 |
|
473 |
return lemma_count_dict
|
|
|
474 |
|
475 |
|
476 |
|
@@ -497,7 +502,7 @@ def main():
|
|
497 |
# Iterate over all words and print their vectors
|
498 |
# iterate_over_words(model)
|
499 |
|
500 |
-
count_lemmas('lemma_list_raw')
|
501 |
|
502 |
|
503 |
if __name__ == "__main__":
|
|
|
464 |
"""
|
465 |
lemma_count_dict = {}
|
466 |
for file in os.listdir(directory):
|
467 |
+
model_name = file.split('.')[0].replace('_', ' ').capitalize()
|
468 |
+
if len(model_name.split()) == 2:
|
469 |
+
# Also capitalize second part of model name
|
470 |
+
model_name = ' '.join([word.capitalize() for word in model_name.split()])
|
471 |
if file.endswith(".txt"):
|
472 |
with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
|
473 |
text = f.read()
|
474 |
words = text.split()
|
475 |
+
lemma_count_dict[model_name] = Counter(words)
|
476 |
|
477 |
return lemma_count_dict
|
478 |
+
|
479 |
|
480 |
|
481 |
|
|
|
502 |
# Iterate over all words and print their vectors
|
503 |
# iterate_over_words(model)
|
504 |
|
505 |
+
print(count_lemmas('lemma_list_raw'))
|
506 |
|
507 |
|
508 |
if __name__ == "__main__":
|