Spaces: Runtime error
thotran committed · Commit e92903c · 1 parent: 33a51aa
final touches

app.py CHANGED
@@ -16,7 +16,7 @@ from transformers import DistilBertModel, DistilBertConfig, DistilBertForSequenc
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import streamlit as st

-st.
+st.markdown("### Welcome to toxicity! A showcase for the TweetBert Model!")
 #config constants
 SEED = 42
 EPOCHS = 2
@@ -57,13 +57,12 @@ data.comment_text=data.comment_text.map(cleanString)

 #tokenizer
 tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
-
 token_lens = []

-for txt in stqdm(data.comment_text,desc="tokenizing"):
-    tokens = tokenizer.encode(txt, max_length=512)
-    token_lens.append(len(tokens))
-
+#for txt in stqdm(data.comment_text,desc="tokenizing"):
+#    tokens = tokenizer.encode(txt, max_length=512)
+#    token_lens.append(len(tokens))
+#^code above commented for HF runtime purposes, tokenizes comment_text for the bert model
 #test train split
 df_train, df_test = train_test_split(data, test_size=0.15, random_state=SEED)
 df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=SEED)
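An alternative to disabling the token-length pass outright (not what this commit does, just a sketch): wrap it in Streamlit's cache so it runs once instead of on every rerun. compute_token_lens is a hypothetical helper; PRE_TRAINED_MODEL_NAME and data are the objects already defined earlier in app.py.

import streamlit as st
from transformers import DistilBertTokenizer

@st.cache_data  # memoized: re-runs only when the input texts change
def compute_token_lens(texts: tuple) -> list:
    # hypothetical helper, not in the commit; PRE_TRAINED_MODEL_NAME comes from app.py
    tok = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
    return [len(tok.encode(t, max_length=512, truncation=True)) for t in texts]

token_lens = compute_token_lens(tuple(data.comment_text))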
@@ -129,7 +128,7 @@ model = model.to(device)
 train_dataloader = create_data_loader(df=df_train, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=BATCH_SIZE)
 val_dataloader = create_data_loader(df=df_val, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
 test_dataloader = create_data_loader(df=df_test, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
-
+# training function
 def train_epoch_for_hf(model, data_loader: DataLoader, device: torch.device, optimizer):
     """
     hf = huggingface.
@@ -148,7 +147,7 @@ def train_epoch_for_hf(model, data_loader: DataLoader, device: torch.device, opt
     loss = outputs.loss
     loss.backward()
     optimizer.step()
-
+#evaluate and keep best model
 def evaluate_for_hf(model, data_loader: DataLoader, device: torch.device):
     model.eval()
     losses = []
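The hunks above show only fragments of train_epoch_for_hf; for orientation, here is a hedged reconstruction of what such an epoch function typically looks like for a Hugging Face sequence-classification model. The batch keys (input_ids, attention_mask, labels) are assumptions about what create_data_loader yields, not taken from the repo.

import torch
from torch.utils.data import DataLoader

def train_epoch_for_hf(model, data_loader: DataLoader, device: torch.device, optimizer):
    """hf = huggingface: the model computes its own loss when labels are passed in."""
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        outputs = model(
            input_ids=batch["input_ids"].to(device),          # assumed batch keys
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["labels"].to(device),
        )
        loss = outputs.loss   # provided by HF models when labels are supplied
        loss.backward()
        optimizer.step()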
@@ -170,7 +169,7 @@ def evaluate_for_hf(model, data_loader: DataLoader, device: torch.device):
 optimizer = AdamW(model.parameters(), lr=2e-5)
 best_val_loss = 9999.
 print('====START TRAINING====')
-#training here
+# actual training here
 #for epoch in stqdm(range(EPOCHS)):
 #    print('-' * 10)
 #    train_epoch_for_hf(model=model, data_loader=train_dataloader, optimizer=optimizer, device=device)
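For reference, the disabled loop presumably continued along these lines, using best_val_loss to checkpoint the best weights. This is a sketch under the assumption that evaluate_for_hf returns a mean validation loss; the checkpoint filename is made up.

for epoch in stqdm(range(EPOCHS)):
    print('-' * 10)
    train_epoch_for_hf(model=model, data_loader=train_dataloader, optimizer=optimizer, device=device)
    val_loss = evaluate_for_hf(model=model, data_loader=val_dataloader, device=device)  # assumed to return mean loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.bin')  # hypothetical checkpoint path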
@@ -216,7 +215,8 @@ model = model.to(device)
 #sub[labels] = prediction
 #sub.insert(1,"tweet",data.comment_text,True)
 #sub.to_csv("sub.csv", encoding='utf-8', index=False)
-#^commented above code, saved to csv to reduce wait/comput time
+#^commented above code, saved to csv to reduce wait/compute time on HF
 sub=pd.read_csv('./data/sub.csv',engine='python',encoding='utf-8')
-
-st.dataframe(sub)
+sub.drop(columns="id")
+st.dataframe(sub)
+st.write("here is a table of the tweets and the likelihood of each label :) loaded from a csv out of respect for your time")
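One caveat in the added lines: pandas DataFrame.drop returns a new frame rather than mutating sub, so as committed the id column still appears in the table. A minimal corrected sketch of the display step:

import pandas as pd
import streamlit as st

sub = pd.read_csv('./data/sub.csv', engine='python', encoding='utf-8')
sub = sub.drop(columns="id")  # reassign (or use inplace=True) so the column actually goes away
st.dataframe(sub)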