danlou committed on
Commit
54630de
1 Parent(s): 116c615

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +13 -12
README.md CHANGED
@@ -22,12 +22,13 @@ Replace usernames and links for placeholders: "@user" and "http".
22
  If you're interested in retaining verified users who were also retained during training, you may keep the users listed [here](https://github.com/cardiffnlp/timelms/tree/main/data).
23
  ```python
24
  def preprocess(text):
25
- new_text = []
26
- for t in text.split(" "):
27
- t = '@user' if t.startswith('@') and len(t) > 1 else t
28
- t = 'http' if t.startswith('http') else t
29
- new_text.append(t)
30
- return " ".join(new_text)
 
31
  ```
32
 
33
  ## Example Masked Language Model
@@ -39,8 +40,8 @@ MODEL = "cardiffnlp/twitter-roberta-base-2019-90m"
39
  fill_mask = pipeline("fill-mask", model=MODEL, tokenizer=MODEL)
40
  tokenizer = AutoTokenizer.from_pretrained(MODEL)
41
 
42
- def print_candidates():
43
- for i in range(5):
44
  token = tokenizer.decode(candidates[i]['token'])
45
  score = candidates[i]['score']
46
  print("%d) %.5f %s" % (i+1, score, token))
@@ -50,11 +51,12 @@ texts = [
50
  "I keep forgetting to bring a <mask>.",
51
  "Looking forward to watching <mask> Game tonight!",
52
  ]
 
53
  for text in texts:
54
  t = preprocess(text)
55
  print(f"{'-'*30}\n{t}")
56
  candidates = fill_mask(t)
57
- print_candidates()
58
  ```
59
 
60
  Output:
@@ -90,13 +92,12 @@ import numpy as np
90
  from scipy.spatial.distance import cosine
91
  from collections import Counter
92
 
93
- def get_embedding(text):
94
  text = preprocess(text)
95
  encoded_input = tokenizer(text, return_tensors='pt')
96
  features = model(**encoded_input)
97
  features = features[0].detach().cpu().numpy()
98
- features_mean = np.mean(features[0], axis=0)
99
- return features_mean
100
 
101
 
102
  MODEL = "cardiffnlp/twitter-roberta-base-2019-90m"
 
22
  If you're interested in retaining verified users who were also retained during training, you may keep the users listed [here](https://github.com/cardiffnlp/timelms/tree/main/data).
23
  ```python
24
  def preprocess(text):
25
+ preprocessed_text = []
26
+ for t in text.split(): # expects whitespace tokenization
27
+ if len(t) > 1:
28
+ t = '@user' if t[0] == '@' and t.count('@') == 1 else t
29
+ t = 'http' if t.startswith('http') else t
30
+ preprocessed_text.append(t)
31
+ return ' '.join(preprocessed_text)
32
  ```
33
 
34
  ## Example Masked Language Model
 
40
  fill_mask = pipeline("fill-mask", model=MODEL, tokenizer=MODEL)
41
  tokenizer = AutoTokenizer.from_pretrained(MODEL)
42
 
43
+ def pprint(candidates, n):
44
+ for i in range(n):
45
  token = tokenizer.decode(candidates[i]['token'])
46
  score = candidates[i]['score']
47
  print("%d) %.5f %s" % (i+1, score, token))
 
51
  "I keep forgetting to bring a <mask>.",
52
  "Looking forward to watching <mask> Game tonight!",
53
  ]
54
+
55
  for text in texts:
56
  t = preprocess(text)
57
  print(f"{'-'*30}\n{t}")
58
  candidates = fill_mask(t)
59
+ pprint(candidates, 5)
60
  ```
61
 
62
  Output:
 
92
  from scipy.spatial.distance import cosine
93
  from collections import Counter
94
 
95
+ def get_embedding(text): # naive approach for demonstration
96
  text = preprocess(text)
97
  encoded_input = tokenizer(text, return_tensors='pt')
98
  features = model(**encoded_input)
99
  features = features[0].detach().cpu().numpy()
100
+ return np.mean(features[0], axis=0)
 
101
 
102
 
103
  MODEL = "cardiffnlp/twitter-roberta-base-2019-90m"