smhavens committed on
Commit
f4440c5
1 Parent(s): 45148d7

Separate cleaned data by label

Files changed (1)
  1. app.py +47 -5
app.py CHANGED
@@ -83,17 +83,59 @@ def training():
     train_data = dataset["train"]
     # For agility we only use 1/2 of our available data
     n_examples = dataset["train"].num_rows // 2
-
+    # n_remaining = dataset["train"].num_rows - n_examples
     dataset_clean = {}
+    dataset_0 = []
+    dataset_1 = []
+    dataset_2 = []
+    dataset_3 = []
     for i in range(n_examples):
         dataset_clean[i] = {}
         dataset_clean[i]["text"] = normalize(train_data[i]["text"], lowercase=True, remove_stopwords=True)
         dataset_clean[i]["label"] = train_data[i]["label"]
+        if train_data[i]["label"] == 0:
+            dataset_0.append(dataset_clean[i])
+        elif train_data[i]["label"] == 1:
+            dataset_1.append(dataset_clean[i])
+        elif train_data[i]["label"] == 2:
+            dataset_2.append(dataset_clean[i])
+        elif train_data[i]["label"] == 3:
+            dataset_3.append(dataset_clean[i])
+    n_0 = len(dataset_0) // 2
+    n_1 = len(dataset_1) // 2
+    n_2 = len(dataset_2) // 2
+    n_3 = len(dataset_3) // 2
+    print("Label lengths:", len(dataset_0), len(dataset_1), len(dataset_2), len(dataset_3))
 
-    for i in range(n_examples):
-        example = dataset_clean[i]
-        # print(example["text"])
-        train_examples.append(InputExample(texts=example['text'], label=example['label']))
+    # for i in range(n_examples):
+    #     example = dataset_clean[i]
+    #     example_opposite = dataset_clean[-(i)]
+    #     # print(example["text"])
+    #     train_examples.append(InputExample(texts=[example['text'], example_opposite["text"]]))
+
+    for i in range(n_0):
+        example = dataset_0[i]
+        example_opposite = dataset_0[-(i)]
+        # print(example["text"])
+        train_examples.append(InputExample(texts=[example['text'], example_opposite["text"]], label=0))
+
+    for i in range(n_1):
+        example = dataset_1[i]
+        example_opposite = dataset_1[-(i)]
+        # print(example["text"])
+        train_examples.append(InputExample(texts=[example['text'], example_opposite["text"]], label=1))
+
+    for i in range(n_2):
+        example = dataset_2[i]
+        example_opposite = dataset_2[-(i)]
+        # print(example["text"])
+        train_examples.append(InputExample(texts=[example['text'], example_opposite["text"]], label=2))
+
+    for i in range(n_3):
+        example = dataset_3[i]
+        example_opposite = dataset_3[-(i)]
+        # print(example["text"])
+        train_examples.append(InputExample(texts=[example['text'], example_opposite["text"]], label=3))
 
     train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)
 
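
For reference, the per-label grouping and pairing added in this commit can be written more compactly with `collections.defaultdict`. The sketch below is an illustration, not part of the commit: it assumes the same `train_data`, `normalize`, and `InputExample` names used in app.py, and it pairs index `i` with `-(i + 1)` so the first item is matched with the last rather than with itself (the diff's `-(i)` pairs index 0 with itself, since `-0 == 0`).

```python
from collections import defaultdict

from sentence_transformers import InputExample


def build_label_pairs(train_data, n_examples, normalize):
    """Hypothetical helper: group cleaned texts by label, then pair each text
    with one from the opposite end of its own label bucket."""
    by_label = defaultdict(list)
    for i in range(n_examples):
        text = normalize(train_data[i]["text"], lowercase=True, remove_stopwords=True)
        by_label[train_data[i]["label"]].append(text)

    train_examples = []
    for label, texts in sorted(by_label.items()):
        for i in range(len(texts) // 2):
            # Pair the i-th text with its mirror from the back of the bucket.
            train_examples.append(
                InputExample(texts=[texts[i], texts[-(i + 1)]], label=label)
            )
    return train_examples
```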
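
The diff stops at building `train_dataloader` and does not show which loss consumes these labelled pairs. Purely as an illustration (the base model and the SoftmaxLoss objective below are assumptions, not taken from this commit), a sentence-transformers fine-tuning call over such a DataLoader typically looks like:

```python
from sentence_transformers import SentenceTransformer, losses

# Assumed base model; the commit does not specify which model is fine-tuned.
model = SentenceTransformer("all-MiniLM-L6-v2")

# SoftmaxLoss is one loss that accepts text pairs with integer class labels (0-3 here).
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=4,
)

# train_dataloader is the DataLoader built at the end of the diff above.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
)
```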