efeperro committed on
Commit
57ba920
1 Parent(s): 1dee31e

Update functions_preprocess.py

Browse files
Files changed (1) hide show
  1. functions_preprocess.py +14 -0
functions_preprocess.py CHANGED
@@ -17,6 +17,7 @@ import torch
17
  import torch.nn as nn
18
  import torch.optim as optim
19
  import torch.nn.functional as F
 
20
 
21
  def download_if_non_existent(res_path, res_name):
22
  try:
@@ -132,3 +133,16 @@ class CNN(nn.Module):
132
  pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
133
  cat = self.dropout(torch.cat(pooled, dim=1))
134
  return self.fc1(cat)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  import torch.nn as nn
18
  import torch.optim as optim
19
  import torch.nn.functional as F
20
+ from torchtext.data.utils import get_tokenizer
21
 
22
  def download_if_non_existent(res_path, res_name):
23
  try:
 
133
  pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
134
  cat = self.dropout(torch.cat(pooled, dim=1))
135
  return self.fc1(cat)
136
+
137
def build_vocab(data_iter):
    """Build a vocabulary from the ``'text'`` field of every example.

    Args:
        data_iter: Iterable of examples; each one is indexed with the key
            ``'text'`` to obtain the raw string to process.

    Returns:
        A ``(vocab, tokenizer)`` tuple: the vocabulary built from the
        tokenized texts, and the tokenizer that produced those tokens.
    """
    # NOTE(review): `clean_text` and `build_vocab_from_iterator` are not
    # defined in this function; they are assumed to be provided elsewhere
    # in the module (the latter presumably from `torchtext.vocab`) -- confirm.
    tokenizer = get_tokenizer("basic_english")

    # Lazily clean and tokenize one example at a time so the whole corpus
    # never has to be held in memory as token lists.
    token_lists = (
        tokenizer(clean_text(example['text'])) for example in data_iter
    )

    vocab = build_vocab_from_iterator(token_lists, specials=["<unk>", "<pad>"])
    # Use the "<unk>" entry as the default index of the vocabulary.
    unk_index = vocab["<unk>"]
    vocab.set_default_index(unk_index)
    return vocab, tokenizer
148
+