luisespinosa committed on
Commit
7456b1b
1 Parent(s): bdba8b7

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +48 -0
README.md CHANGED
@@ -60,6 +60,54 @@ I am so <mask> 😢
60
  5) hungry 0.0232
61
  ```
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  ## Example Feature Extraction
64
 
65
  ```python
60
  5) hungry 0.0232
61
  ```
62
 
63
+ ## Example Tweet Embeddings
64
+ ```python
65
+ from transformers import AutoTokenizer, AutoModel, TFAutoModel
66
+ import numpy as np
67
+ from scipy.spatial.distance import cosine
68
+ from collections import defaultdict
69
+
70
+ tokenizer = AutoTokenizer.from_pretrained(MODEL)
71
+ model = AutoModel.from_pretrained(MODEL)
72
+
73
+ def get_embedding(text):
74
+ text = preprocess(text)
75
+ encoded_input = tokenizer(text, return_tensors='pt')
76
+ features = model(**encoded_input)
77
+ features = features[0].detach().cpu().numpy()
78
+ features_mean = np.mean(features[0], axis=0)
79
+ return features_mean
80
+
81
+ MODEL = "cardiffnlp/twitter-roberta-base"
82
+
83
+ query = "The book was awesome"
84
+
85
+ tweets = ["I just ordered fried chicken 🐣",
86
+ "The movie was great",
87
+ "What time is the next game?",
88
+ "Just finished reading 'Embeddings in NLP'"]
89
+
90
+ d = defaultdict(int)
91
+ for tweet in tweets:
92
+ sim = 1-cosine(get_embedding(query),get_embedding(tweet))
93
+ d[tweet] = sim
94
+
95
+ print('Most similar to: ',query)
96
+ print('----------------------------------------')
97
+ for idx,x in enumerate(sorted(d.items(), key=lambda x:x[1], reverse=True)):
98
+ print(idx+1,x[0])
99
+ ```
100
+ Output:
101
+
102
+ ```
103
+ Most similar to: The book was awesome
104
+ ----------------------------------------
105
+ 1 The movie was great
106
+ 2 Just finished reading 'Embeddings in NLP'
107
+ 3 I just ordered fried chicken 🐣
108
+ 4 What time is the next game?
109
+ ```
110
+
111
  ## Example Feature Extraction
112
 
113
  ```python