smhavens committed
Commit 96d7c96
1 Parent(s): 27f00a9

Preprocessing attempt 2

Files changed (1)
app.py +32 -8
app.py CHANGED
@@ -1,5 +1,6 @@
  import gradio as gr
  import math
+ import spacy
  from datasets import load_dataset
  from sentence_transformers import SentenceTransformer
  from sentence_transformers import InputExample
@@ -11,9 +12,14 @@ import torch.nn.functional as F
  from torch.utils.data import DataLoader
  import numpy as np
  import evaluate
+ import nltk
+ from nltk.corpus import stopwords


  tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+ nltk.download('stopwords')
+ nlp = spacy.load("en", disable=['parser', 'tagger', 'ner'])
+ stops = stopwords.words("english")

  # answer = "Pizza"
  guesses = []
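The setup added above relies on the spaCy 2.x model shortcut: spacy.load("en") only resolves if python -m spacy download en has created that shortcut link, and shortcut links were removed in spaCy 3.x, where pipelines are loaded by their package name. A minimal sketch of an equivalent setup on spaCy 3.x, assuming the en_core_web_sm package has been installed with python -m spacy download en_core_web_sm:

import nltk
import spacy
from nltk.corpus import stopwords

nltk.download('stopwords')                   # fetch the NLTK stopword list once per environment
nlp = spacy.load("en_core_web_sm",           # package name replaces the removed "en" shortcut
                 disable=['parser', 'ner'])  # keep the tagger: the spaCy 3 lemmatizer relies on POS tags
stops = set(stopwords.words("english"))      # a set makes the stopword check in normalize() a constant-time lookup

The committed call also disables the tagger, which is fine for the spaCy 2.x lookup lemmatizer but tends to degrade lemmas under spaCy 3.x.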
@@ -27,6 +33,19 @@ def mean_pooling(model_output, attention_mask):
      return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


+ def normalize(comment, lowercase, remove_stopwords):
+     if lowercase:
+         comment = comment.lower()
+     comment = nlp(comment)
+     lemmatized = list()
+     for word in comment:
+         lemma = word.lemma_.strip()
+         if lemma:
+             if not remove_stopwords or (remove_stopwords and lemma not in stops):
+                 lemmatized.append(lemma)
+     return " ".join(lemmatized)
+
+
  def tokenize_function(examples):
      return tokenizer(examples["text"])
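The new normalize helper lower-cases a comment, lemmatizes it with the spaCy pipeline, and drops NLTK stopwords before re-joining the tokens. A quick usage sketch; the sample sentence is made up here and the exact lemmas depend on the loaded spaCy model:

sample = "The cats were sitting on the mats"
cleaned = normalize(sample, lowercase=True, remove_stopwords=True)
print(cleaned)  # expected to be roughly "cat sit mat": "the", "be" and "on" are stopwords, the rest is lemmatized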
 
@@ -51,20 +70,25 @@ def training():
      # small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
      # small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

-     dataset = dataset["train"].map(tokenize_function, batched=True)
-     dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
-     dataset.format['type']
+     # dataset = dataset["train"].map(tokenize_function, batched=True)
+     # dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
+     # dataset.format['type']

-     print(dataset)
+     # print(dataset)

      train_examples = []
-     train_data = dataset
+     train_data = dataset["train"]
      # For agility we only 1/2 of our available data
-     n_examples = dataset.num_rows // 2
+     n_examples = dataset["train"].num_rows // 2
+
+     dataset_clean = {}
+     for i in range(n_examples):
+         dataset_clean[i]["text"] = train_data[i]["text"].apply(normalize, lowercase=True, remove_stopwords=True)
+         dataset_clean[i]["label"] = train_data[i]["label"]

      for i in range(n_examples):
-         example = train_data[i]
-         print(example["text"])
+         example = dataset_clean[i]
+         # print(example["text"])
          train_examples.append(InputExample(texts=example['text'], label=example['label']))

      train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)
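As committed, the cleaning loop cannot run: dataset_clean[i] is indexed before that key exists, and train_data[i]["text"] is a plain string, which has no pandas-style .apply method. A sketch of one way to get the intended effect, assuming dataset["train"] exposes "text" and "label" columns as in the code above (an illustration, not the committed implementation):

train_data = dataset["train"]
n_examples = train_data.num_rows // 2

dataset_clean = []
for i in range(n_examples):
    row = train_data[i]  # indexing a datasets split returns a plain dict, so call normalize() directly
    dataset_clean.append({
        "text": normalize(row["text"], lowercase=True, remove_stopwords=True),
        "label": row["label"],
    })

train_examples = [
    InputExample(texts=[ex["text"]], label=ex["label"])  # texts is normally a list of strings
    for ex in dataset_clean
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)

The same cleaning could also be written in the datasets idiom, for example dataset["train"].map(lambda ex: {"text": normalize(ex["text"], True, True)}), which keeps the cleaned split as a Dataset rather than a Python list.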
 