Spaces: Runtime error
thotran committed · Commit e92903c · 1 parent: 33a51aa
final touches

app.py CHANGED
@@ -16,7 +16,7 @@ from transformers import DistilBertModel, DistilBertConfig, DistilBertForSequenc
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import streamlit as st

-st.
+st.markdown("### Welcome to toxicity! A showcase for the TweetBert Model!")
 #config constants
 SEED = 42
 EPOCHS = 2
@@ -57,13 +57,12 @@ data.comment_text=data.comment_text.map(cleanString)

 #tokenizer
 tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
-
 token_lens = []

-for txt in stqdm(data.comment_text,desc="tokenizing"):
-    tokens = tokenizer.encode(txt, max_length=512)
-    token_lens.append(len(tokens))
-
+#for txt in stqdm(data.comment_text,desc="tokenizing"):
+#    tokens = tokenizer.encode(txt, max_length=512)
+#    token_lens.append(len(tokens))
+#^code above commented for HF runtime purposes, tokenizes comment_text for the bert model
 #test train split
 df_train, df_test = train_test_split(data, test_size=0.15, random_state=SEED)
 df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=SEED)
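An alternative to disabling the token-length pass outright (not what this commit does, just a sketch): wrap it in Streamlit's cache so it runs once instead of on every rerun. compute_token_lens is a hypothetical helper; PRE_TRAINED_MODEL_NAME and data are the objects already defined earlier in app.py.

import streamlit as st
from transformers import DistilBertTokenizer

@st.cache_data  # memoized: re-runs only when the input texts change
def compute_token_lens(texts: tuple) -> list:
    # hypothetical helper, not in the commit; PRE_TRAINED_MODEL_NAME comes from app.py
    tok = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
    return [len(tok.encode(t, max_length=512, truncation=True)) for t in texts]

token_lens = compute_token_lens(tuple(data.comment_text))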
@@ -129,7 +128,7 @@ model = model.to(device)
 train_dataloader = create_data_loader(df=df_train, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=BATCH_SIZE)
 val_dataloader = create_data_loader(df=df_val, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
 test_dataloader = create_data_loader(df=df_test, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
-
+# training function
 def train_epoch_for_hf(model, data_loader: DataLoader, device: torch.device, optimizer):
     """
     hf = huggingface.
@@ -148,7 +147,7 @@ def train_epoch_for_hf(model, data_loader: DataLoader, device: torch.device, opt
     loss = outputs.loss
     loss.backward()
     optimizer.step()
-
+#evaluate and keep best model
 def evaluate_for_hf(model, data_loader: DataLoader, device: torch.device):
     model.eval()
     losses = []
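The hunks above show only fragments of train_epoch_for_hf; for orientation, here is a hedged reconstruction of what such an epoch function typically looks like for a Hugging Face sequence-classification model. The batch keys (input_ids, attention_mask, labels) are assumptions about what create_data_loader yields, not taken from the repo.

import torch
from torch.utils.data import DataLoader

def train_epoch_for_hf(model, data_loader: DataLoader, device: torch.device, optimizer):
    """hf = huggingface: the model computes its own loss when labels are passed in."""
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        outputs = model(
            input_ids=batch["input_ids"].to(device),          # assumed batch keys
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["labels"].to(device),
        )
        loss = outputs.loss   # provided by HF models when labels are supplied
        loss.backward()
        optimizer.step()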
@@ -170,7 +169,7 @@ def evaluate_for_hf(model, data_loader: DataLoader, device: torch.device):
 optimizer = AdamW(model.parameters(), lr=2e-5)
 best_val_loss = 9999.
 print('====START TRAINING====')
-#training here
+# actual training here
 #for epoch in stqdm(range(EPOCHS)):
 #    print('-' * 10)
 #    train_epoch_for_hf(model=model, data_loader=train_dataloader, optimizer=optimizer, device=device)
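For reference, the disabled loop presumably continued along these lines, using best_val_loss to checkpoint the best weights. This is a sketch under the assumption that evaluate_for_hf returns a mean validation loss; the checkpoint filename is made up.

for epoch in stqdm(range(EPOCHS)):
    print('-' * 10)
    train_epoch_for_hf(model=model, data_loader=train_dataloader, optimizer=optimizer, device=device)
    val_loss = evaluate_for_hf(model=model, data_loader=val_dataloader, device=device)  # assumed to return mean loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.bin')  # hypothetical checkpoint path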
@@ -216,7 +215,8 @@ model = model.to(device)
 #sub[labels] = prediction
 #sub.insert(1,"tweet",data.comment_text,True)
 #sub.to_csv("sub.csv", encoding='utf-8', index=False)
-#^commented above code, saved to csv to reduce wait/comput time
+#^commented above code, saved to csv to reduce wait/compute time on HF
 sub=pd.read_csv('./data/sub.csv',engine='python',encoding='utf-8')
-
-st.dataframe(sub)
+sub.drop(columns="id")
+st.dataframe(sub)
+st.write("here is a table of the tweets and the likelihood of each label :) loaded from a csv out of respect for your time")
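One caveat in the added lines: pandas DataFrame.drop returns a new frame rather than mutating sub, so as committed the id column still appears in the table. A minimal corrected sketch of the display step:

import pandas as pd
import streamlit as st

sub = pd.read_csv('./data/sub.csv', engine='python', encoding='utf-8')
sub = sub.drop(columns="id")  # reassign (or use inplace=True) so the column actually goes away
st.dataframe(sub)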