Create README.md
Browse files
README.md
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
tags:
|
3 |
+
- word2vec
|
4 |
+
language: vls
|
5 |
+
license: gpl-3.0
|
6 |
+
---
|
7 |
+
|
8 |
+
## Description
|
9 |
+
Polyglot word embeddings trained by Al-Rfou et al. (2013); see the Citation section below for the full reference.
|
10 |
+
|
11 |
+
|
12 |
+
## How to use
|
13 |
+
|
14 |
+
```python
|
15 |
+
import pickle
|
16 |
+
from numpy import dot
|
17 |
+
from numpy.linalg import norm
|
18 |
+
from huggingface_hub import hf_hub_download
|
19 |
+
# Download the pickled (words, embeddings) pair from the Hugging Face Hub.
# SECURITY NOTE: unpickling can execute arbitrary code -- only load pickle
# files from a repository you trust.
embeddings_path = hf_hub_download(repo_id="Word2vec/polyglot_words_embeddings_en", filename="words_embeddings_en.pkl")
# The context manager closes the file handle once the data is loaded.
with open(embeddings_path, 'rb') as f:
    words, embeddings = pickle.load(f, encoding="latin1")

# Query word whose nearest neighbour (by cosine similarity) we want to find.
word = "Irish"
# Look the query word up once; list/tuple .index() is a linear scan, so doing
# it inside the loop would make the whole search quadratic.
word_index = words.index(word)
a = embeddings[word_index]
norm_a = norm(a)  # loop-invariant, computed once

# most_similar[i] holds the cosine similarity between `word` and words[i].
most_similar = []
for i, b in enumerate(embeddings):
    if i != word_index:
        cos_sim = dot(a, b) / (norm_a * norm(b))
        most_similar.append(cos_sim)
    else:
        # Placeholder keeps the indices aligned with `words`. -inf (rather
        # than 0) guarantees the query word itself is never selected, even
        # when every other similarity is negative.
        most_similar.append(float("-inf"))

# Print the vocabulary entry with the highest cosine similarity to `word`.
print(words[most_similar.index(max(most_similar))])
|
33 |
+
```
|
34 |
+
|
35 |
+
## Citation
|
36 |
+
|
37 |
+
```
|
38 |
+
@InProceedings{polyglot:2013:ACL-CoNLL,
|
39 |
+
author = {Al-Rfou, Rami and Perozzi, Bryan and Skiena, Steven},
|
40 |
+
title = {Polyglot: Distributed Word Representations for Multilingual NLP},
|
41 |
+
booktitle = {Proceedings of the Seventeenth Conference on Computational Natural Language Learning},
|
42 |
+
month = {August},
|
43 |
+
year = {2013},
|
44 |
+
address = {Sofia, Bulgaria},
|
45 |
+
publisher = {Association for Computational Linguistics},
|
46 |
+
pages = {183--192},
|
47 |
+
url = {http://www.aclweb.org/anthology/W13-3520}
|
48 |
+
}
|
49 |
+
```
|