thotran committed
Commit e92903c
1 Parent(s): 33a51aa

final touches

Files changed (1)
  1. app.py +12 -12
app.py CHANGED
@@ -16,7 +16,7 @@ from transformers import DistilBertModel, DistilBertConfig, DistilBertForSequenc
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import streamlit as st
 
-st.write("Please be patient model training takes 20+ mins :P")
+st.markdown("### Welcome to toxicity! A showcase for the TweetBert Model!")
 #config constants
 SEED = 42
 EPOCHS = 2
@@ -57,13 +57,12 @@ data.comment_text=data.comment_text.map(cleanString)
 
 #tokenizer
 tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
-
 token_lens = []
 
-for txt in stqdm(data.comment_text,desc="tokenizing"):
-    tokens = tokenizer.encode(txt, max_length=512)
-    token_lens.append(len(tokens))
-
+#for txt in stqdm(data.comment_text,desc="tokenizing"):
+#    tokens = tokenizer.encode(txt, max_length=512)
+#    token_lens.append(len(tokens))
+#^code above commented out for HF runtime purposes; it tokenizes comment_text for the BERT model
 #test train split
 df_train, df_test = train_test_split(data, test_size=0.15, random_state=SEED)
 df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=SEED)
@@ -129,7 +128,7 @@ model = model.to(device)
 train_dataloader = create_data_loader(df=df_train, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=BATCH_SIZE)
 val_dataloader = create_data_loader(df=df_val, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
 test_dataloader = create_data_loader(df=df_test, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
-
+# training function
 def train_epoch_for_hf(model, data_loader: DataLoader, device: torch.device, optimizer):
     """
     hf = huggingface.
@@ -148,7 +147,7 @@ def train_epoch_for_hf(model, data_loader: DataLoader, device: torch.device, opt
         loss = outputs.loss
         loss.backward()
        optimizer.step()
-
+# evaluate and keep the best model
 def evaluate_for_hf(model, data_loader: DataLoader, device: torch.device):
     model.eval()
     losses = []
@@ -170,7 +169,7 @@ def evaluate_for_hf(model, data_loader: DataLoader, device: torch.device):
 optimizer = AdamW(model.parameters(), lr=2e-5)
 best_val_loss = 9999.
 print('====START TRAINING====')
-#training here
+# actual training here
 #for epoch in stqdm(range(EPOCHS)):
 #    print('-' * 10)
 #    train_epoch_for_hf(model=model, data_loader=train_dataloader, optimizer=optimizer, device=device)
@@ -216,7 +215,8 @@ model = model.to(device)
 #sub[labels] = prediction
 #sub.insert(1,"tweet",data.comment_text,True)
 #sub.to_csv("sub.csv", encoding='utf-8', index=False)
-#^commented above code, saved to csv to reduce wait/comput time
+#^commented out the above code; predictions were saved to CSV to reduce wait/compute time on HF
 sub=pd.read_csv('./data/sub.csv',engine='python',encoding='utf-8')
-#sub.drop(index="id")
-st.dataframe(sub)
+sub = sub.drop(columns="id")
+st.dataframe(sub)
+st.write("Here is a table of the tweets and the likelihood of each label :) loaded from a CSV out of respect for your time")