thotran committed
Commit 9993f32
Parent(s): 952a624

reduced wait time
Files changed (4)
  1. .DS_Store +0 -0
  2. app.py +53 -37
  3. data/sub.csv +0 -0
  4. requirements.txt +2 -2
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app.py CHANGED
@@ -7,13 +7,15 @@ import torch.nn.functional as F
 from torch.utils.data import TensorDataset, DataLoader, Dataset
 from sklearn.metrics import roc_auc_score
 import re
-from tqdm import tqdm
+from stqdm import stqdm
 from typing import *
 import string
 from sklearn.model_selection import train_test_split
 from transformers import DistilBertTokenizer, AdamW
 from transformers import DistilBertModel, DistilBertConfig, DistilBertForSequenceClassification
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import streamlit as st
+
 st.write("Please be patient model training takes 20+ mins :P")
 #config constants
 SEED = 42
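
The two imports added here set up the commit's speedup: rather than retraining DistilBERT on every app load, the fine-tuned checkpoint is pulled from the Hub (see the last hunk of this file). A minimal sketch of that load path; the repo id comes from this diff, while the tokenizer source is an assumption, since the commit only pushes the tokenizer in a commented-out line:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Repo id as committed in the last hunk of this file.
model = AutoModelForSequenceClassification.from_pretrained("thotranexe/tweetbert")
# Assumption: the base DistilBERT tokenizer; the repo may not ship tokenizer files.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

model.eval()
enc = tokenizer("example tweet", return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
    logits = model(**enc).logits
probs = torch.sigmoid(logits)  # multi-label toxicity scores, matching the app's sigmoid head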
@@ -58,7 +60,7 @@ tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
 
 token_lens = []
 
-for txt in tqdm(data.comment_text):
+for txt in stqdm(data.comment_text, desc="tokenizing"):
     tokens = tokenizer.encode(txt, max_length=512)
     token_lens.append(len(tokens))
 
@@ -134,7 +136,7 @@ def train_epoch_for_hf(model, data_loader: DataLoader, device: torch.device, opt
     """
     model.train()
 
-    for batch in tqdm(data_loader):
+    for batch in stqdm(data_loader, desc="training"):
         input_ids = batch["input_ids"].to(device)
         attention_mask = batch["attention_mask"].to(device)
         targets = batch["targets"].float().to(device)
@@ -152,7 +154,7 @@ def evaluate_for_hf(model, data_loader: DataLoader, device: torch.device):
     losses = []
     score = None
 
-    for idx, batch in enumerate(tqdm(data_loader)):
+    for idx, batch in enumerate(stqdm(data_loader, desc="evaluating")):
         input_ids = batch["input_ids"].to(device)
         attention_mask = batch["attention_mask"].to(device)
         targets = batch["targets"].float().to(device)
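
Every long loop in app.py now goes through stqdm, so progress renders in the Streamlit page instead of only in the server log. A minimal self-contained sketch of the same pattern (the loop body is a placeholder):

import time
import streamlit as st
from stqdm import stqdm

st.write("stqdm mirrors tqdm's iterable-wrapping API but draws a Streamlit progress bar")

# Wrap any iterable; desc=... labels the bar, exactly as in the hunks above.
for _ in stqdm(range(50), desc="working"):
    time.sleep(0.05)  # stand-in for a tokenize/train/eval step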
@@ -169,38 +171,52 @@ optimizer = AdamW(model.parameters(), lr=2e-5)
 best_val_loss = 9999.
 print('====START TRAINING====')
 #training here
-for epoch in tqdm(range(EPOCHS)):
-    print('-' * 10)
-    train_epoch_for_hf(model=model, data_loader=train_dataloader, optimizer=optimizer, device=device)
-    _, tr_loss = evaluate_for_hf(model=model, data_loader=train_dataloader, device=device)
-    val_pred, val_loss = evaluate_for_hf(model=model, data_loader=val_dataloader, device=device)
-    y_pred_np = val_pred.numpy()
-    val_auc = roc_auc_score(df_val[labels].to_numpy(), y_pred_np)
-    if val_loss < best_val_loss:
-        best_val_loss = val_loss
+#for epoch in stqdm(range(EPOCHS)):
+#    print('-' * 10)
+#    train_epoch_for_hf(model=model, data_loader=train_dataloader, optimizer=optimizer, device=device)
+#    _, tr_loss = evaluate_for_hf(model=model, data_loader=train_dataloader, device=device)
+#    val_pred, val_loss = evaluate_for_hf(model=model, data_loader=val_dataloader, device=device)
+#    y_pred_np = val_pred.numpy()
+#    val_auc = roc_auc_score(df_val[labels].to_numpy(), y_pred_np)
+#    if val_loss < best_val_loss:
+#        best_val_loss = val_loss
 #torch.save(model.state_dict(), 'distill_bert.pt')
-    print(f'Epoch {epoch + 1}/{EPOCHS}', f'train loss: {tr_loss:.4},', f'val loss: {val_loss:.4},', f'val auc: {val_auc:.4}')
+#    print(f'Epoch {epoch + 1}/{EPOCHS}', f'train loss: {tr_loss:.4},', f'val loss: {val_loss:.4},', f'val auc: {val_auc:.4}')
 # once model is saved and generated no need to re run :)
-#model = DistilBertForSequenceClassification(config)
-#model.load_state_dict(torch.load('./distill_bert.pt'))
-#model = model.to(device)
-#test model here
-test_pred, test_loss = evaluate_for_hf(model=model, data_loader=test_dataloader, device=device)
-print('====TEST RESULT====')
-print(f'Log loss: {test_loss:.5}')
-y_pred_np = test_pred.numpy()
-test_auc = roc_auc_score(df_test[labels].to_numpy(), y_pred_np)
-print(f'ROC AUC: {test_auc:.5}')
-
-test_src_id = test.iloc[:, 0]
-test.drop(columns='id', inplace=True)
-test_labels.drop(columns='id', inplace=True)
-test_src = pd.concat((test, test_labels), axis=1)
-
-test_src_dataloader = create_data_loader(df=test_src, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
-prediction, _ = evaluate_for_hf(model=model, data_loader=test_src_dataloader, device=device)
-prediction = torch.sigmoid(prediction).numpy()
-
-sub[labels] = prediction
-sub.insert(1,"tweet",data.comment_text,True)
-st.daatframe(sub)
+
+#PUSH MODEL TO HF
+#from huggingface_hub import notebook_login
+#notebook_login()
+#model.push_to_hub("tweetbert")
+#tokenizer.push_to_hub("tweetbert")
+
+#LOAD MODEL
+model = AutoModelForSequenceClassification.from_pretrained("thotranexe/tweetbert")
+model = model.to(device)
+
+#TEST MODEL
+#test_pred, test_loss = evaluate_for_hf(model=model, data_loader=test_dataloader, device=device)
+#print('====TEST RESULT====')
+#print(f'Log loss: {test_loss:.5}')
+#y_pred_np = test_pred.numpy()
+#test_auc = roc_auc_score(df_test[labels].to_numpy(), y_pred_np)
+#print(f'ROC AUC: {test_auc:.5}')
+
+#test_src_id = test.iloc[:, 0]
+#test.drop(columns='id', inplace=True)
+#test_labels.drop(columns='id', inplace=True)
+#test_src = pd.concat((test, test_labels), axis=1)
+
+#MAKE PREDICTIONS
+#test_src_dataloader = create_data_loader(df=test_src, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
+#prediction, _ = evaluate_for_hf(model=model, data_loader=test_src_dataloader, device=device)
+#prediction = torch.sigmoid(prediction).numpy()
+
+#SAVE RESULTS INTO SUBMISSION DATAFRAME
+#sub[labels] = prediction
+#sub.insert(1,"tweet",data.comment_text,True)
+#sub.to_csv("sub.csv", encoding='utf-8', index=False)
+#^commented out the code above; results were saved to csv to reduce wait/compute time
+sub = pd.read_csv('./data/sub.csv', engine='python', encoding='utf-8', error_bad_lines=False)
+sub = sub.drop(columns="id")
+st.dataframe(sub)
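
Even with training skipped, the Hub checkpoint and the 71 MB CSV are re-read on every Streamlit rerun. A further wait-time saving, not part of this commit, would be to cache both; a sketch assuming a recent Streamlit (st.cache_resource / st.cache_data; older releases used st.cache) and a recent pandas (error_bad_lines was removed in 2.0 in favor of on_bad_lines):

import pandas as pd
import streamlit as st
from transformers import AutoModelForSequenceClassification

@st.cache_resource  # one model instance shared across reruns and sessions
def load_model():
    return AutoModelForSequenceClassification.from_pretrained("thotranexe/tweetbert")

@st.cache_data  # cache the parsed dataframe, not just the file download
def load_sub():
    return pd.read_csv("./data/sub.csv", encoding="utf-8", on_bad_lines="skip")

model = load_model()
sub = load_sub()
st.dataframe(sub.drop(columns="id"))  # "id" column dropped for display, as above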
data/sub.csv ADDED
Binary file (71.3 MB)
 
requirements.txt CHANGED
@@ -2,7 +2,7 @@ numpy
 pandas
 streamlit
 torch
-tdqm
+stqdm
 scikit-learn
 transformers
-ipywidgets
+ipywidgets