Spaces:
Paused
Paused
Update functions_preprocess.py
Browse files- functions_preprocess.py +14 -0
functions_preprocess.py
CHANGED
@@ -17,6 +17,7 @@ import torch
|
|
17 |
import torch.nn as nn
|
18 |
import torch.optim as optim
|
19 |
import torch.nn.functional as F
|
|
|
20 |
|
21 |
def download_if_non_existent(res_path, res_name):
|
22 |
try:
|
@@ -132,3 +133,16 @@ class CNN(nn.Module):
|
|
132 |
pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
|
133 |
cat = self.dropout(torch.cat(pooled, dim=1))
|
134 |
return self.fc1(cat)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
import torch.nn as nn
|
18 |
import torch.optim as optim
|
19 |
import torch.nn.functional as F
|
20 |
+
from torchtext.data.utils import get_tokenizer
|
21 |
|
22 |
def download_if_non_existent(res_path, res_name):
|
23 |
try:
|
|
|
133 |
pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
|
134 |
cat = self.dropout(torch.cat(pooled, dim=1))
|
135 |
return self.fc1(cat)
|
136 |
+
|
137 |
+
def build_vocab(data_iter):
    """Build a token vocabulary over the cleaned text of ``data_iter``.

    Parameters
    ----------
    data_iter : iterable of dict
        Examples carrying a ``'text'`` key (e.g. a Hugging Face dataset
        split). Each text is passed through ``clean_text`` before
        tokenization.

    Returns
    -------
    tuple
        ``(vocab, tokenizer)`` — the torchtext vocabulary (with
        ``"<unk>"`` and ``"<pad>"`` specials, and ``"<unk>"`` as the
        default index for out-of-vocabulary tokens) and the
        ``basic_english`` tokenizer used to build it.
    """
    # NOTE(review): this change imported get_tokenizer at module level but
    # not build_vocab_from_iterator; import it locally so the function
    # cannot raise NameError even if the module-level import is absent.
    from torchtext.vocab import build_vocab_from_iterator

    tokenizer = get_tokenizer("basic_english")

    def yield_tokens():
        # clean_text is defined elsewhere in this module.
        for example in data_iter:
            cleaned_text = clean_text(example['text'])
            yield tokenizer(cleaned_text)

    vocab = build_vocab_from_iterator(yield_tokens(), specials=["<unk>", "<pad>"])
    # Map unseen tokens to "<unk>" instead of raising on lookup.
    vocab.set_default_index(vocab["<unk>"])
    return vocab, tokenizer
|
148 |
+
|