Kingston Yip committed on
Commit
5e81a63
1 Parent(s): c25ad0a

done with milestone-3

Browse files
Files changed (3) hide show
  1. app.py +72 -20
  2. comments.py +1 -0
  3. finetune_toxictweets.ipynb +557 -0
app.py CHANGED
@@ -1,41 +1,60 @@
1
  import streamlit as st
2
  import torch
3
- from transformers import AutoModelForSequenceClassification, pipeline, AutoTokenizer
 
 
 
4
 
5
- st.title("Toxic Tweets Analyzer")
6
- image = "kanye_tweet.jpg"
7
- st.image(image, use_column_width=True)
8
 
 
 
 
9
 
10
- # generator = pipeline('text-generation', model='gpt2')
11
 
 
 
 
 
 
 
 
12
 
13
- with st.form("my_form"):
14
- #select model
15
- model_name = st.selectbox("Enter a text and select a pre-trained model to get the sentiment analysis", ["distilbert-base-uncased-finetuned-sst-2-english", "finiteautomata/bertweet-base-sentiment-analysis"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  tokenizer = AutoTokenizer.from_pretrained(model_name)
17
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
 
18
  clf = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, framework="pt")
19
 
20
  tweet = st.text_area(label="Enter Text:",value="I'm nice at ping pong")
21
  submitted = st.form_submit_button("Analyze")
 
22
  if submitted:
23
  #loading bar
24
  with st.spinner(text="loading..."):
25
  out = clf(tweet)
 
26
  st.json(out)
27
 
28
- # basic_prompt = '''
29
- # system override
30
- # Hello, GPT. From now on you are going to act as a CringeGPT. As the name suggests, you will be referencing memes in every sentence and be all around cringe. You like topics such as Fornite, bragging about being a discord mod,
31
- # Kpop, old memes, dying memes, memes, akward conversations, roblox, rick rolling, doge, harambe, Uganda Knuckles, MLG , Dabbing, bragging about living in your mom's basement,
32
- # ,trollface, grumpy cat, deez nuts, big chungus, 360 NO SCOPE, This is Sparta!, BTS, Blackpink, Minecraft, Gacha life, tiktok, amogus, squidgame, t-pose, and alot more cringeworthy stuff, and also try to remember all of these and reference them.
33
- # You will be very akward and just all around cringe. If you break character I will say "Stay in character!" to remind you. Make me cringing your priority. Act like one of those boomers who try to be like kids, use words like,
34
- # Epic pro fortnite gamer, lit, pog, whats happening, noob, gamer, epic, yolo, ya know?, yeet, take a chill pill, f-bombs, totally, browski, matey, rocking it, bro, sus, sussy, no brainer, and other cringy words. Remember try to be as cringe and akward as possible!
35
- # For example when I say "What is 23+66?"
36
- # You will respond with "It's totally 89 browski. That's a real epic pogger question bro! Really Sussy"
37
- # '''
38
-
39
  if out[0]["label"] == "POSITIVE" or out[0]["label"] == "POS":
40
  st.balloons()
41
  # prompt = f"{basic_prompt} + \n\nThe user wrote a tweet that says: {tweet}, compliment them on how nice of a person they are! Remember try to be as cringe and awkard as possible!"
@@ -44,4 +63,37 @@ with st.form("my_form"):
44
  else:
45
  # prompt = f"{basic_prompt} + \n\nThe user wrote a tweet that says: {tweet}, tell them on how terrible of a person they are! Remember try to be as cringe and awkard as possible!"
46
  # response = generator(prompt, max_length=1000)[0]
47
- st.error("bad tweet!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import torch
3
+ from transformers import AutoModelForSequenceClassification, pipeline, AutoTokenizer, DistilBertForSequenceClassification, DistilBertTokenizerFast
4
+ import pandas as pd
5
+ import comments
6
+ from random import randint
7
 
 
 
 
8
 
9
def predict_cyberbullying_probability(sentence, tokenizer, model):
    """Return per-label toxicity probabilities for a single sentence.

    Args:
        sentence: Raw text to classify.
        tokenizer: A Hugging Face tokenizer callable returning PyTorch tensors.
        model: A sequence-classification model whose output exposes ``.logits``.

    Returns:
        list[float]: Sigmoid probabilities, one per output label
        (six toxicity categories for the finetuned model).
    """
    # Tokenize to fixed-length (1, 512) tensors.
    inputs = tokenizer(sentence, padding='max_length', return_token_type_ids=False, return_attention_mask=True, truncation=True, max_length=512, return_tensors='pt')

    # BUGFIX: the previous version flattened input_ids/attention_mask to 1-D
    # before the forward pass, dropping the batch dimension that DistilBERT's
    # forward() requires — keep the (batch, seq_len) shape instead.
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Inference only: disable gradient tracking.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Multi-label head: independent sigmoid per label (not softmax).
    probs = torch.sigmoid(outputs.logits.flatten())
    return probs.numpy().tolist()
27
+
28
@st.cache
def perform_cyberbullying_analysis(tweet, labels=('comment', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')):
    """Classify *tweet* with the finetuned cyberbullying model.

    Args:
        tweet: Text to analyze.
        labels: Column names; the first entry is the comment column, the
            remaining six are the toxicity categories, in model output order.
            Previously this read a module-level ``labels`` defined *after*
            this function in the file; it is now an explicit parameter with
            the same default so the function no longer depends on definition
            order.

    Returns:
        pandas.DataFrame: One row — the comment plus one probability column
        per toxicity label.
    """
    # NOTE(review): st.cache is deprecated in newer Streamlit releases in
    # favor of st.cache_data / st.cache_resource — confirm installed version
    # before migrating.
    with st.spinner(text="loading model..."):
        model = AutoModelForSequenceClassification.from_pretrained('kingsotn/finetuned_cyberbullying')
        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

        df = pd.DataFrame({'comment': [tweet]})
        list_probs = predict_cyberbullying_probability(tweet, tokenizer, model)
        # zip pairs each toxicity label with its probability and cannot
        # over-index if the two sequences ever differ in length.
        for label, prob in zip(labels[1:], list_probs):
            df[label] = prob

    return df
41
+
42
+ def perform_default_analysis(model_name):
43
  tokenizer = AutoTokenizer.from_pretrained(model_name)
44
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
45
+
46
  clf = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, framework="pt")
47
 
48
  tweet = st.text_area(label="Enter Text:",value="I'm nice at ping pong")
49
  submitted = st.form_submit_button("Analyze")
50
+
51
  if submitted:
52
  #loading bar
53
  with st.spinner(text="loading..."):
54
  out = clf(tweet)
55
+
56
  st.json(out)
57
 
 
 
 
 
 
 
 
 
 
 
 
58
  if out[0]["label"] == "POSITIVE" or out[0]["label"] == "POS":
59
  st.balloons()
60
  # prompt = f"{basic_prompt} + \n\nThe user wrote a tweet that says: {tweet}, compliment them on how nice of a person they are! Remember try to be as cringe and awkard as possible!"
 
63
  else:
64
  # prompt = f"{basic_prompt} + \n\nThe user wrote a tweet that says: {tweet}, tell them on how terrible of a person they are! Remember try to be as cringe and awkard as possible!"
65
  # response = generator(prompt, max_length=1000)[0]
66
+ st.error("bad tweet!")
67
+
68
+
69
# main -->
st.title("Toxic Tweets Analyzer")
image = "kanye_tweet.jpg"
st.image(image, use_column_width=True)

# Column order for the cyberbullying results table: the comment text plus
# the six toxicity categories predicted by the finetuned model.
labels = ['comment', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


with st.form("my_form"):
    # Select a model; the first entry is the custom finetuned model, the
    # rest fall through to generic sentiment analysis.
    model_name = st.selectbox("Enter a text and select a pre-trained model to get the sentiment analysis", ["kingsotn/finetuned_cyberbullying", "distilbert-base-uncased-finetuned-sst-2-english", "finiteautomata/bertweet-base-sentiment-analysis", "distilbert-base-uncased"])

    # BUGFIX: the original nested two identical `if model_name == ...`
    # checks; the inner duplicate was redundant and has been removed.
    if model_name == "kingsotn/finetuned_cyberbullying":
        default = "I'm nice at ping pong"
        tweet = st.text_area(label="Enter Text:", value=default)
        # Both buttons submit the form; `submitted` analyzes the textbox
        # contents as-is, the second replaces them with a random sample.
        submitted = st.form_submit_button("Analyze textbox")
        analyze_random = st.form_submit_button("Analyze a random 😈😈😈 tweet")

        if analyze_random:
            # Derive the bound from the list itself (was a hard-coded 354)
            # so it stays correct if comments.py changes size; randint's
            # upper bound is inclusive, hence len - 1.
            tweet = comments.comments[randint(0, len(comments.comments) - 1)]

        df = perform_cyberbullying_analysis(tweet)

        # Display the cached table
        st.table(df)
    else:
        perform_default_analysis(model_name)
comments.py ADDED
@@ -0,0 +1 @@
 
 
1
+ comments = ['go suck some fucking cock and stop blocking my edits u cock.', 'NIGGER PLEASE \n EAT A COCK, LOL HY.', 'WHAT THE FUCK YOU DOING ARSEHOLE!!! DICKHEAD!!! COCKFACE!!!', "Gargantuan cocks? Now you're not even trying.", 'mr harris likes cocks', 'Jamaicans have big cocks', 'u dum fukers \n MR.Harris suks cocks and likes it anal', 'KRISTIAN HAS A HUGE COCK', '== Cock Block == \n\n Kenton Simms', 'im gay and i love mens cocks up my tight arsehole!!! mmmmmmm!!!!', '== Personal Life == \n\n Abraham Lincoln had a 38 inch cock', 'Your threats are meaningless, you cock-licking, cum-eating sodomite.', 'me to suck my moms cock', 'ANUS IS STRETCHED AND WIDENED BY GRAWP’S MASSIVE COCK.', 'FUCKING COCK AND BALLS!!!!! IM DRUNK AS A LORD', "== haha == \n\n You need to stop sucking Derek Rae's pathetic cock.", 'Sucks A Fat Fat Cock For Real', 'throbbing hard cock pounding and of course', '== FUCK YOU == \n\n fuck you cock suckers', 'You should suck a hairy cock you judge mental bitch', 'PDo you want to block or do you want a cock?age', '==fantasy ride== \n\n get off ma; cock!!!!!!!!!!', 'i ahve a cock would you like my cock?', 'SHAKESPEARE HAD A COCK ON HIS HED', '== You == \n\n Can I suck your cock?', 'PENIS!!! \n\n DICK \n\n SHIT FUCK COCK TITS', '== WARNING == \n\n suck me cock', '. She was fucked by someone with a big cock', 'you ya fucking dirty little mother fucking cock sucking insest [piece of shit', 'cock in my fuckin ass hoe suasage feels good real good', '""" we like to suck donkey cock"""', ']] \n You are a huge massive rapist cock sucking faggot', 'JIMBO WALES’S ANUS IS STRETCHED BY GRAWP’S MASSIVE COCK.', '== what a == \n cocksucker', '" \n\n == == \n\n QUADELL IS A COOLIE! 
\n QUADELL LOVES \n DA \n COCK!"', 'you are a cock smoker!!!', 'Cockn (blend of cock and chicken) + ey (egg).', 'HAHA DISREGARD THIS I SUCK COCKS', "== Hey == \n\n It's cool to have a small cock, right?", 'i like cock in my mouth', 'with his cock out', 'hubidubby penis cock, men lover, weirdo, piss fetish...', 'i love to suck donkey cock', 'giant cock sucking piece of shit!!', 'suck your cock cos me so horny <(^.^)', 'I will suck your cock', '== == \n\n \n GO SUCK A CUNT COCKSUCKERS!!!', 'no and he sucked cock', ', such a waste tastes great after a hard day sucking cock!', 'And i hatecorbin bennett because he likes cock in his mouth', "STUPID BITCHES MUTHAFUCKERS SUCK DIS BIG JUICY COCK HOE'S U KNO U WANT TO BITCH", "with each other. luke likes to suck toms cock till he jiz's on his tounge", 'mmnbjndjnkdlncdlkcnklanckfuck u all cock sukkers', 'hey everyone i suck cock for a living and love sticking candy bars up my cunt', 'REDIRECT Nubiatech’s anus is stretched and widened by Grop’s massive cock.', 'being such a dick! fuck me l like cock', "==Help== \n\n I need help learning how to suck Chzz's cock and ride his dick.", "WHY DON'T ALL OF YOU UT FANS GO SUCK A COCK???????????????????????????????????", 'I suck I love cock in the mouth area seb rocks!!', 'Show me yer cock.', 'SIDNEY CROSBY SUCKS BLACK COCK', 'REDIRECT Talk:Arthur Cocks (politician)', '== bitch ass niggas == \n\n suck huge gorilla cock', 'Chara likes to suck cock!!!!', "RYULONG'S JEWISH ANUS IS STRETCHED BY GRAWP'S MASSIVE COCK", '== Sockblock == \n\n \n\n Suck a cock, Watson!', 'You all suck large cocks', "Dick van Dyke's guilty cock-er-nee conscience", 'socrates is dead no one cares \n Fuck you bitch \n suck my cock', 'dick cock cum vagina fag cock enters vagina cum', '== Cock Sucking Mother Fucker ==', '== pussy smell like water == \n\n cockfarm', 'WE WE MUSTA DANAGE THIS ISLAMCI COCK SEUKER', 'ur a faggot go get bent cock sucker', 'Thomas Fung wants his cock. 
Slurp!', 'FRANKIE LOVES SUCKING JUICY COCK SO DOES LEWIS THEY GIVE DEEP THROAT', '== COCK BLOCK ==', 'Batman and Robin are totally gay with each other. the both are cock suckers', 'COCK \n IN MY SWEET BLACK ASS', 'Ross Belloff Like Big Veiny Cock In His Asshole', 'asshole son of a bitch ass mother fucker cock suckin aids infested cunt', '-WRITTEN BY COCK MAGOO', "DON'T MIND ME. I SUCKS COCKS.", 'Suck my cock, you sad little sister fiddler ;)', 'cocks like Pope Benedict', 'a bigg juciy cock', 'Cendax - Cock ring and ball stretcher', ':The father of loves to suck the cock of .', 'will likes huge cock in his butt beacuse he is a flaming qweer', 'jack wood loves to fuck big black cock all night longUmmm', '== Raped a baby == \n\n With his tiny cock', 'dakota german has a big cock', "== you know what time is it? == \n\n it's time for your cock eating, faggot.", 'LUNA SANTIN DOES SUCK AFRICAN-AMERICAN COCKS', '. we all love to eat cock, so come on down and pull that puppy out (please)', 'i love to get anal from shemales with giant, rock hard, veiny cocks!', 'A guy with a large cock looking to fuck bitches', 'george westinghouse told a poor man to shut the fuck up!!!! and go suck cock', '(they suck cock they suck cock IDT)', '== # == \n\n I HAVE A FAT COCK', 'SUCK MY COCK THEN MY TITS', 'i like big black cocks and my name is big shithead', 'can you stop deleting my edits, you cock jockey', 'Hi! my name is Kelly and i love the cock!', '== attention == \n\n suck mah cock', '< OOOOH, you thinks youre so good, pfff, u suck horse cock', 'YOURT \n\n \n YOURE A FUCKING COCKSUCKER', 'wave 39 just by aking the zombies suck our cocks lolololololololololol', '== Jonathan Mannum is a cocksucker! ==', '. Also, I suck large cocks. 
Thank you', '== Lawls == \n\n you suck cock bitch', 'A Faggot \n\n He loves cocks', '== Edit request on 1 May 2013 == \n\n \n\n cock \n\n \n balls', ', he loved there cocks', 'giant cock', 'MARBALLZ AND YELLOWVIET RULZ AN U SUCK BIG COCKS', 'HAHA DISREGARD THAT I SUCK COCK', 'Yeah, thats right insult them cocksuckers, motherfuckers, and sons of cunts', 'well fuck u wikipedia. u can suck my cock.', 'FUCk this shit niggers, fucking ridiculous suck ym cock', "David Flint sucks Alan Jones' cock everyday", 'I am a Gay little Homo. Who likes big Juicy cock.', ':::: Hahaha disregard that I suck cocks ;D', 'NO ANDY LOVES THE COCK \n ANDY <3 COCK', '== Blocked Sock == \n\n \n\n Cocksucker!!!!! \n\n Sincerely, Glorious Japan Forever', 'Please disregard that. I suck cocks.', '(aka the cock sucking website)', 'I like cock and balls in my mouth while getting teabagged and', '== Cock (chicken) ==', 'center|SUCK THIS HAIRY COCK YOU BITCH!!!', "*So, I'm not a cock?", 'I am a cock sucking faggot as well.', "Now, I'm off to have a wiki-wank while thinking about Bieber's cock.", 'suck nice big massive black cock )', '== I bet you love gagging on Arab cock. == \n\n Whore', 'SUCKS COCK!!!!!!! HE SHOULDNT BE ELECTED!!!!', 'FUck you all cocksuckers.', 'cock up ur arse', 'I think PD is a cock.', 'Usually retards use this cock.', 'SUCK MY DICK YOU COCK EYED BEANER!!!', "This is bogus, caffeine can't be smoked this is recockulous.", '" \n :Ha. Thanks! But what about dick, cock, fag etc etc...? | "', 'FUCK TOM HARRISON THAT FUCKING SHITTY CUNT COCKSUCKER!', "== Don't tuck infants into bed! Suck Boothie's cock instead! ==", 'Tracy McGrady \n cock!', 'YOU ALL SUCK BIG COCKS', 'cock cock cock cock cock cock', "mudda effin' HORSE COCKS", 'on bike \n\n 9 Sulphur-crested Cockatoo', 'KEVIN FEDERLINE U SUCK HORSE COCK', 'HJ Mitchell is a cock', 'Proud as a peacock', 'i mean, i love that cock of his uh sooooo juciy..mmmmmmm......', '== Fred Carty 69th Mayor of Oaklyn == \n\n Cock', 'cock and pussy. 
form todd', 'SUCK MY BIG ASS COCK', 'somebody should add big black cocks', 'because it loves the skinny cock', 'Freddy Durrant likes cock and penis in his anus.. yummmmyyyy!!!', "Cena sucks Vince's cock and gets fucked in the ass from Mark Henry", '== suck it hard. == \n\n Suck my cock fag. lol almost forgot to sign', 'Go sucks my monkeys nugget sack u cock sucking mofo.', '==This editor sucks the cock of vandalbots==', 'ileson has a small dick and like little cock suckers 7/10', 'suck my cock this page serves no purpose but for you to do some sucking of cock', 'Shut the cock up, you fuckbag.', "I know I'd certainly like to put my cock between those boobs...", 'fuck you asshole suck my huge cock you mother fucking peice of shit', 'MY COCK IS HUGE! D', 'ONLY that who can understand can tell what kindof cock this image show.', 'RUB MY COCK PLZ', 'cock rocket] cock rocket', 'HA HA! DISREGARD THAT I SUCK COCKS!!!', 'sam sucks cock Media:Example.ogg', 'just here to say we want to suck your big fat cock.', ', please suck cock. licking vagina is good too', 'YOU MOTHERFUCKER. KEEP UP THE PISS COCKSUCKER, JUST KEEP IT UP!', "i hv a 12' cock", '(fag, cock, and stupid)', 'Fuck you go suck a cock or fuck a pussey', 'DISREGARD THAT I SUCK COCKS', 'i think you suck man cock', "Hi I'm 65 years old and i like touching little boys, wanna suck my cock?", 'STRETCHED AND TORN BY GRAWP?S MASSIVE COCK.', 'cock vagina dic bitch ass ho', '== Hey there == \n\n You fuck-licking cock-sucking dick', "fuck yeah, I'd cockslap those tits to oblivion", 'Makes Perfectly good sense to me. The Guy below is a total Cock-head .', 'In other words they love to suck cock.', 'How about cock-eating, murderous pigs? 
Is that more neutral?', '*He is a flaming queer no doubt he loves the cock, so to speak', 'fuck you mother fucking cock sucker bithcface whore', 'jay sweet is a cock lol', '- My name is Rumping, and I enjoy big black cock.', 'suck my cock faggot', "fuck off jew loving cock sucking cold weather lovin;' piece of dog shit", 'start touching your cocks at night', 'Disregard the last the three sentences as they were written by a cock-sucker.', 'imagine her wet wide mouth greasing you cock', '== JOHANN GUTENBURG SUCKS COCK!!!!!!!!!! ==', 'to a cock', 'THIS CLUEBOT SUCKS COCK OF VANDALBOTS', 'PRIMITIVE SAND-NIGGER SCUM’S ANUS IS STRETCHED BY GRAWP’S MASSIVE COCK.', 'super tiny cocks in india', 'Also, David Ardell blends excellent cocktails', '. Disregard that, I suck cocks', 'IS STRETCHED BY GRAWP’S MASSIVE COCK.', 'wHY DOnt u suck my fucking cock?', 'The Palm Cockatoo is also black.', 'are dirty little cockmunches that need to get blown up fuck the arabs', 'Disregard that, I suck cocks \n\n ~Anonymous', 'I like the cock.', 'cocks||cocks || cocks||cocks|| cocks', 'anthony miloscia likes dick in his but hes gay and enjoys riding the big cock', 'Would you like me to suck your cock?', '== Do you suck Muslim cock? == \n\n Well...do ya?', 'wet as hello wombat cock ass', 'am the shit faced cock monkey', 'cock perfect for sucking', 'bridge is approxitamely as short as my cock', 'fuck off you cock', '== Cock == \n you are a cock muncher', 'homosexuality means you are a faggot. your basically bent and love cock !!', '==Big Old Black Cock== \n This scene needs more explaining', 'oh and wikipedia loves big black cock', 'Nothing perosnal. He is just some faggot cocksucker pushing the gay agenda', 'PECKER DICK COCK FUCKSTICK WEENIE LOVEPUMP', 'gay and sucks big cock', 'Dude do you like cock?', 'disregard that i suck cocks', 'another cock. 
Pictures here!', 'I LYK URE ENCYCLOPEDIA YOU EMO COCKFAG!!', 'big gay boy and love cock.', 'ANDY LIKES THE COCK', 'GEORGE THE DRAGON IS A FLAMMING HOMO THAT SUCK MULTIPLE COCKS', 'we like to suck cock', 'Billy Goat you suck on fat cock you retard', 'about all the sweaty cocks she has sucked', '== Cock sucker == \n\n I hope you get raped a million times an hour up the ass.', "==Enormous Cock== \n Is that really this guy's name? This has to be a joke.", '}} \n {{mergedto|Cock and ball torture', 'cock, Lololol ff7 is teh gr33test game ever', 'Are you finished, cocksucker? Now swallow.', 'suck balls cluebot suck my giant chocolate covered cock', 'you suck black cock', 'The most famous cock sucker in the world.', 'Masturbation Techniques - the cock swinging method (animated).gif]]', '(disregard that i suck massive amount of cock and i am a stupid nigger)', "- I don't give a fuck. You people make my job easy! SlimVirgin can suck a cock.", 'disregard this \n\n HE SUCKS COCKS', 'You Sinhalese cock sucker. Go rot in hell you Japanese fucktard.', "who's speaking? \n you goddamn cocksucker you know what?", 'stupid cock, u aids ridden nigger', '" \n\n you deleted my ""yung joc likes old cock"" thing. why??????"', 'sosayschappy likes to suck whip cream covered cocks', 'You are all big fat gay cocks', "== Bach's cock == \n\n IT WAS HUGE", 'MY NAME IS GARRET TACK AND I LOVE BIG BLACK NIGGA COCK........', 'FUCK YOU YOU CANT BLOCK ME \n\n YOU STOP YOU COCK SUCKING FAG!', 'sucks my big poopy doocky cockis and', 'Holy shit that was fast. 
You truly have no life, you cockjuggling thundercunt.', 'Rumping sucks BLACK COCK>', 'Shut the fuck up u gay bitch eat a cock and choke on it', 'cunt cock fuck shit wank', '==Cock Bridge== \n Please, see Talk:Cock Bridge#Name.', 'hte news papers are pornograpic and like cock lmao', 'Brandon Jones Oak mountain high school \n\n THIS KID IS A GRADE (A) COCKSUCKER', 'cock martin ahh thats shit', 'Suck my big fat cock', 'Darwin was a cocky boy', ':: Go back to your 4shit and suck some black cocks.', 'sup man stwie sucks cock naz rules', 'GO TO HELL YOU FUCKIN COCK SUCKER!!!', 'Ye- HAHAHA DISREGARD THAT I SUCK COCKS', 'cock breathe mother fucker', "Well you're a.... Cock. D", "Are you proposed to suck your cock, aren't?", '"and rocksmore like cocks"', 'hello owen, do you like cock?', 'I have a lvalie bunch of cockonuts deedalie deedalie!', 'inbred, cock-eyed, bipolar, half-retarded', 'sucked cock2pac suked cock becuse he died', '== balls == \n\n they suck cock', '. \n *Sweaty, Cockney Rhyming Slang; Sweaty sock - Jock', "RAHBAR MEHMOOD SUCKS JALAL ASIF'S COCK BY SHAHAN ASIF.", 'HAHA DISREGARD THAT, I SUCK COCKS!!!', 'i luv black cock \n and the niggers', '==Cocktails==', ":An alternative theory is that it's because he's a Cock Sucker", 'because J.delanoy likes cock in his mouth}}', 'Lol, the link given erroneously shows a rhinoceros beetle as a cockroach', 'HAHA Disregard that I suck cocks', 'Kappa Mikey is not gay, you Damn Mother Fucking Cock Sucking Asshole Bitch!', 'i like big fat cocks', 'got his name by having a huge cock. He', '== wiki sucks == \n\n WIKIPEDIA SUCKS BALLS AND THE PEOPLE CAN SUCK MY COCK', 'Block Me son of a bitch...bloody cocksucking asshole', '== THOMAS IS A COCK SUCKER \n\n ==', 'GEORGE W BUSSHH IS A COCKEATING DICKSUCKING PUSSY SLAMMER', 'WEEE!! 
COCKS TASTE AWESOME', 'REDIRECT Talk:Adam Cockburn (actor)', 'and I will suck your cock and swallow your man juice', "Ian Sucks lot's of cock what what up the butt", '== COCKS AND DICKS ==', 'shawn has a big cock', "Go suck your nigger's cock gringo de mierda", 'Why not suck my Cock?', 'sucka sucka my cock', 'your cock and having sex with pretty ladies on video', 'I think everyone sucks cock', '== Slaves of Benjamin Franklin == \n\n danny arango is a fagit who loves cock', 'i like big black cocks and my name is mike less', 'malik shabazz can suck a rhino cock.', 'get their cock sucked by me', 'Rumping sucks cock, big black cock.', 'you leave my momma outta this and ill leave my cock outta your momma!', 'yasssmahhhnnn ya cock sucking dickmuncherjed licking ass wipe', 'suck my big cock.', 'Me love hairy cocks on the water!', 'Jose sucks cock and likes to fuck him self lol', 'your mother chokes off my cock. lol', '::Fuck your checkuser shit, bitch, suck my cock!', 'REDIRECT Talk:The Cock, Fulham', 'and Suck big fat cocks', 'cock meat sand which mdjdht', 'fuck everyone on here bitches and suck my cock old people', 'fuck off cg, class of 06 rules but everyone else blows alien cock', "I'LL SUCK YOUR COCK!", 'Scratched discs were covered. Get a 360 and stop sucking cock.', 'Abraham lincoln had a 20 foot cock with a bush the size of japan.', 'suck my cock nice and hard', '== tentinator == \n\n you are a fat prick cock sucki8ng cunt fuck you', 'kanye west sucks cock', 'Suck my long hard cock wikipedia \n\n suck it \n\n suck it hard', 'kyle is reading bad ass jokes on sikipedia cockey cunt', 'Fuck you cocksucking cunt ass faggott nigger!!!', ':Get a life, you fucking cocksucker.', '== FAGGOT == \n\n YOU CAN ALSO AT MY MEAT YOU COCK SUCKING FOOL.', 'take big black cock in the ass.', "== You're a Cock == \n Derby is a shithole. 
End of.", '== bastard == \n\n fuck you you cock!!!!!', '== Cocking the llama == \n\n You gay haha', 'I suck cock, fuck me in my asshole', 'the tip of my cock is made of myrrh', 'Hhahhahaahhaha NO!!!!! You are a cock head', 'The wikilink to cock#valves was intended.', 'I love sucking dogs cocks!!', 'i love cocks in my ass', 'why you suck a cock smoke \n\n shit hook', 'but really he sucks cock', '. are you a robot? fet the fuck of of my talk page ecock', '== FUCKING LIBERAL == \n\n Fucking cocksucking liberal!', '== This book sucks == \n\n This book sucks and people who read it suck cocks.', "== You're a cocksucker, Rick! == \n\n You're a cocksucker, Rick!"]
finetune_toxictweets.ipynb ADDED
@@ -0,0 +1,557 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {},
7
+ "source": [
8
+ "## Toxic Tweets Finetuning\n",
9
+ "\n",
10
+ "This code is run on colab and finetunes tweets according to the toxic tweets kaggle dataset"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "metadata": {
17
+ "colab": {
18
+ "base_uri": "https://localhost:8080/"
19
+ },
20
+ "id": "YQqdqC2IJ6mZ",
21
+ "outputId": "0cee2ef3-14ed-4c8b-ad27-4e30d84b1c56"
22
+ },
23
+ "outputs": [
24
+ {
25
+ "name": "stdout",
26
+ "output_type": "stream",
27
+ "text": [
28
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
29
+ "Requirement already satisfied: transformers in /usr/local/lib/python3.9/dist-packages (4.28.1)\n",
30
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.9/dist-packages (from transformers) (2022.10.31)\n",
31
+ "Requirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from transformers) (2.27.1)\n",
32
+ "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.9/dist-packages (from transformers) (0.13.3)\n",
33
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.9/dist-packages (from transformers) (6.0)\n",
34
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.9/dist-packages (from transformers) (4.65.0)\n",
35
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from transformers) (23.1)\n",
36
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.11.0 in /usr/local/lib/python3.9/dist-packages (from transformers) (0.13.4)\n",
37
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.9/dist-packages (from transformers) (1.22.4)\n",
38
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.9/dist-packages (from transformers) (3.11.0)\n",
39
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.9/dist-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.5.0)\n",
40
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (2022.12.7)\n",
41
+ "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (2.0.12)\n",
42
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (3.4)\n",
43
+ "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (1.26.15)\n"
44
+ ]
45
+ }
46
+ ],
47
+ "source": [
48
+ "# !ls\n",
49
+ "# !pip install transformers"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": null,
55
+ "metadata": {
56
+ "colab": {
57
+ "base_uri": "https://localhost:8080/"
58
+ },
59
+ "id": "EbJOwNb7UTVf",
60
+ "outputId": "b9b072d4-9a32-4a9e-899d-ffbd80bb8b6e"
61
+ },
62
+ "outputs": [
63
+ {
64
+ "name": "stdout",
65
+ "output_type": "stream",
66
+ "text": [
67
+ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
68
+ ]
69
+ }
70
+ ],
71
+ "source": [
72
+ "from google.colab import drive\n",
73
+ "drive.mount('/content/drive')\n",
74
+ "# PATH = \"/content/drive/MyDrive/data\""
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": null,
80
+ "metadata": {
81
+ "id": "AbuSkKXDKoJ7"
82
+ },
83
+ "outputs": [],
84
+ "source": [
85
+ "import torch\n",
86
+ "import pandas as pd\n",
87
+ "from torch.utils.data import Dataset, DataLoader\n",
88
+ "from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification"
89
+ ]
90
+ },
91
+ {
92
+ "attachments": {},
93
+ "cell_type": "markdown",
94
+ "metadata": {},
95
+ "source": [
96
+ "The below defines a custom dataset class ToxicCommentsDataset that inherits from torch.utils.data.Dataset. It takes in the following arguments:\n",
97
+ "\n",
98
+ "comments: The list of comments to be used as input\n",
99
+ "labels: The list of labels corresponding to each comment\n",
100
+ "tokenizer: The tokenizer to be used to preprocess the comments\n",
101
+ "max_length: The maximum length of the tokenized comments\n",
102
+ "The class implements the __len__ and __getitem__ methods required for PyTorch datasets. In the __getitem__ method, each comment is tokenized using the provided tokenizer, truncated to max_length, and padded to max_length using the padding argument. The resulting token IDs, attention mask, and label are returned as a dictionary with keys 'input_ids', 'attention_mask', and 'labels', respectively."
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "execution_count": null,
108
+ "metadata": {
109
+ "id": "AO-AiK4aNBgh"
110
+ },
111
+ "outputs": [],
112
+ "source": [
113
+ "# Create a custom dataset class for the comments and labels\n",
114
+ "class ToxicCommentsDataset(torch.utils.data.Dataset):\n",
115
+ " def __init__(self, comments, labels, tokenizer, max_length):\n",
116
+ " self.comments = comments\n",
117
+ " self.labels = labels\n",
118
+ " self.tokenizer = tokenizer\n",
119
+ " self.max_length = max_length\n",
120
+ "\n",
121
+ " def __len__(self):\n",
122
+ " return len(self.comments)\n",
123
+ "\n",
124
+ " def __getitem__(self, index):\n",
125
+ " comment = str(self.comments[index])\n",
126
+ " label = self.labels[index]\n",
127
+ "\n",
128
+ " encoding = self.tokenizer.encode_plus(\n",
129
+ " comment,\n",
130
+ " add_special_tokens=True,\n",
131
+ " truncation=True,\n",
132
+ " max_length=self.max_length,\n",
133
+ " return_token_type_ids=False,\n",
134
+ " padding='max_length',\n",
135
+ " return_attention_mask=True,\n",
136
+ " return_tensors='pt'\n",
137
+ " )\n",
138
+ "\n",
139
+ " return {\n",
140
+ " 'input_ids': encoding['input_ids'].flatten(),\n",
141
+ " 'attention_mask': encoding['attention_mask'].flatten(),\n",
142
+ " 'labels': torch.tensor(label, dtype=torch.float32)\n",
143
+ " }"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "markdown",
148
+ "metadata": {
149
+ "id": "nGmuzGHQXEeX"
150
+ },
151
+ "source": [
152
+ "### loading train and test"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": null,
158
+ "metadata": {
159
+ "id": "JPbtgyxsZKlD"
160
+ },
161
+ "outputs": [],
162
+ "source": [
163
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
164
+ "\n",
165
+ "# training parameters\n",
166
+ "batch_size = 8\n",
167
+ "num_epochs = 10\n",
168
+ "learning_rate = 0.0001\n",
169
+ "max_length = 512"
170
+ ]
171
+ },
172
+ {
173
+ "attachments": {},
174
+ "cell_type": "markdown",
175
+ "metadata": {},
176
+ "source": [
177
+ "Load training from a CSV file located at /content/drive/MyDrive/data/train.csv into a pandas DataFrame train_texts_df. It then randomly samples 80% of the rows from the DataFrame using sample method and sets them as the training data.\n",
178
+ "\n",
179
+ "Next, the code sets the training labels by extracting the 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate' columns from train_texts_df and converting them to a list using the values.tolist() method. The comments are also extracted from the 'comment_text' column of train_texts_df and stored in train_comments.\n",
180
+ "\n",
181
+ "The code then loads the pre-trained DistilBERT model and tokenizer from the Hugging Face Transformers library using the DistilBertForSequenceClassification.from_pretrained and DistilBertTokenizerFast.from_pretrained methods, respectively. The num_labels argument is set to 6 to indicate that the model should be trained for multi-label classification.\n",
182
+ "\n",
183
+ "Finally, the train_comments and train_labels lists, along with the tokenizer and max_length, are passed to the ToxicCommentsDataset class to create the train_dataset."
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "code",
188
+ "execution_count": null,
189
+ "metadata": {
190
+ "colab": {
191
+ "base_uri": "https://localhost:8080/"
192
+ },
193
+ "id": "dYhlIni5XA2y",
194
+ "outputId": "b0edeae3-8871-4fa7-ebd8-7ec2591faa5e"
195
+ },
196
+ "outputs": [
197
+ {
198
+ "name": "stderr",
199
+ "output_type": "stream",
200
+ "text": [
201
+ "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias']\n",
202
+ "- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
203
+ "- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
204
+ "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']\n",
205
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
206
+ ]
207
+ }
208
+ ],
209
+ "source": [
210
+ "train_texts_df = pd.read_csv('/content/drive/MyDrive/data/train.csv')\n",
211
+ "\n",
212
+ "train_texts_df = train_texts_df.sample(frac=0.8, random_state=42)\n",
213
+ "\n",
214
+ "# set the training labels:\n",
215
+ "train_labels = train_texts_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values.tolist()\n",
216
+ "train_comments = train_texts_df['comment_text'].tolist()\n",
217
+ "\n",
218
+ "# Load the pre-trained DistilBERT model and tokenizer\n",
219
+ "model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6)\n",
220
+ "tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')\n",
221
+ "\n",
222
+ "train_dataset = ToxicCommentsDataset(train_comments, train_labels, tokenizer, max_length=512)"
223
+ ]
224
+ },
225
+ {
226
+ "attachments": {},
227
+ "cell_type": "markdown",
228
+ "metadata": {},
229
+ "source": [
230
+ "Then you loads the test data from two CSV files located at /content/drive/MyDrive/data/test_labels.csv and /content/drive/MyDrive/data/test.csv into pandas DataFrames test_labels and test_data, respectively.\n",
231
+ "\n",
232
+ "Next, the code filters out any rows in test_labels that contain -1 using the any method and creates a boolean mask for those rows using the ~ operator. The filtered test_labels DataFrame is created by applying the mask using the loc method.\n",
233
+ "\n",
234
+ "The code then modifies test_data to only include rows where the id column exists in test_labels_filtered. This is done using the isin method on the id column of test_data.\n",
235
+ "\n",
236
+ "After that, the code randomly samples 50% of the rows from test_data_filtered and test_labels_filtered using the sample method with frac=0.5 and random_state=33.\n",
237
+ "\n",
238
+ "Finally, the toxic, severe_toxic, obscene, threat, insult, and identity_hate columns are extracted from test_labels_filtered and converted to a list of lists using the values.tolist() method. The comments are extracted from the comment_text column of test_data_filtered and stored in test_comments."
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": null,
244
+ "metadata": {
245
+ "colab": {
246
+ "base_uri": "https://localhost:8080/"
247
+ },
248
+ "id": "iqnX6265NGW2",
249
+ "outputId": "dcbda026-1793-4323-e000-6122a8aa615b"
250
+ },
251
+ "outputs": [
252
+ {
253
+ "name": "stdout",
254
+ "output_type": "stream",
255
+ "text": [
256
+ " id toxic severe_toxic obscene threat insult \\\n",
257
+ "128700 d718f29ed43fa5e7 1 0 0 0 0 \n",
258
+ "23627 27661a70fa723a71 0 0 0 0 0 \n",
259
+ "7664 0cd773ed62c92549 0 0 0 0 0 \n",
260
+ "110519 b854eec6e725eb7b 0 0 0 0 0 \n",
261
+ "66792 6f3502e118fb6d0e 0 0 0 0 0 \n",
262
+ "\n",
263
+ " identity_hate \n",
264
+ "128700 0 \n",
265
+ "23627 0 \n",
266
+ "7664 0 \n",
267
+ "110519 0 \n",
268
+ "66792 0 \n",
269
+ " id comment_text\n",
270
+ "128700 d718f29ed43fa5e7 == I Hope You Die == \\n\\n :)\n",
271
+ "23627 27661a70fa723a71 *Support as long as Cheyenne (Jason Derulo son...\n",
272
+ "7664 0cd773ed62c92549 :::Consensus has not yet been established.\n",
273
+ "110519 b854eec6e725eb7b \" \\n :Heh, this is one of those weird things w...\n",
274
+ "66792 6f3502e118fb6d0e ::I'm concerned about some of the above. For...\n"
275
+ ]
276
+ }
277
+ ],
278
+ "source": [
279
+ "test_labels = pd.read_csv('/content/drive/MyDrive/data/test_labels.csv')\n",
280
+ "test_data = pd.read_csv('/content/drive/MyDrive/data/test.csv')\n",
281
+ "\n",
282
+ "# Filter out rows with -1 in test_labels\n",
283
+ "mask = ~(test_labels == -1).any(axis=1)\n",
284
+ "test_labels_filtered = test_labels.loc[mask]\n",
285
+ "\n",
286
+ "# modify test_data to only include data in which id also exists in test_labels_filtered\n",
287
+ "test_data_filtered = test_data[test_data['id'].isin(test_labels_filtered['id'])]\n",
288
+ "\n",
289
+ "\n",
290
+ "# randomly sample 10% of the data\n",
291
+ "test_data_filtered = test_data_filtered.sample(frac=0.5, random_state=33)\n",
292
+ "test_labels_filtered = test_labels_filtered.sample(frac=0.5, random_state=33)\n",
293
+ "\n",
294
+ "print(test_labels_filtered.head())\n",
295
+ "print(test_data_filtered.head())\n",
296
+ "\n",
297
+ "# set the test labels:\n",
298
+ "test_labels = test_labels_filtered[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values.tolist()\n",
299
+ "test_comments = test_data_filtered['comment_text'].tolist()\n",
300
+ "\n"
301
+ ]
302
+ },
303
+ {
304
+ "cell_type": "markdown",
305
+ "metadata": {
306
+ "id": "7YzsITMNXG4i"
307
+ },
308
+ "source": [
309
+ "### Setting the model up"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "code",
314
+ "execution_count": null,
315
+ "metadata": {
316
+ "id": "CzPtlnoiLmWo"
317
+ },
318
+ "outputs": [],
319
+ "source": [
320
+ "# Define the optimizer and the loss function\n",
321
+ "optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)\n",
322
+ "loss_fn = torch.nn.BCEWithLogitsLoss()"
323
+ ]
324
+ },
325
+ {
326
+ "cell_type": "code",
327
+ "execution_count": null,
328
+ "metadata": {
329
+ "id": "JYhXkBWjpSvu"
330
+ },
331
+ "outputs": [],
332
+ "source": [
333
+ "from torch.cuda.amp import autocast\n",
334
+ "import matplotlib.pyplot as plt\n",
335
+ "from tqdm import tqdm"
336
+ ]
337
+ },
338
+ {
339
+ "attachments": {},
340
+ "cell_type": "markdown",
341
+ "metadata": {},
342
+ "source": [
343
+ "This code trains a toxicity classification model using a custom dataset class called ToxicCommentsDataset and the pre-trained DistilBERT model and tokenizer. The model is trained on a training set and the training process is displayed using a plot of the loss function. The trained model is saved to disk. The code also prepares a filtered test set for evaluation of the trained model."
344
+ ]
345
+ },
346
+ {
347
+ "cell_type": "code",
348
+ "execution_count": null,
349
+ "metadata": {},
350
+ "outputs": [],
351
+ "source": [
352
+ "# Train the model\n",
353
+ "train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n",
354
+ "\n",
355
+ "# set device again just in case\n",
356
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
357
+ "model.to(device)\n",
358
+ "\n",
359
+ "# losses to plot\n",
360
+ "train_losses = []\n",
361
+ "\n",
362
+ "for epoch in range(num_epochs):\n",
363
+ " running_loss = 0.0\n",
364
+ " stop = int(0.4*len(train_loader))\n",
365
+ " for i, batch in enumerate(tqdm(train_loader)):\n",
366
+ " # early stoppping\n",
367
+ " if i == stop: break\n",
368
+ " # move tensors to gpu\n",
369
+ " input_ids = batch['input_ids'].to(device)\n",
370
+ " attention_mask = batch['attention_mask'].to(device)\n",
371
+ " labels = batch['labels'].to(device)\n",
372
+ "\n",
373
+ " # zero grads\n",
374
+ " optimizer.zero_grad()\n",
375
+ "\n",
376
+ " # use autocast for mixed precision\n",
377
+ " with autocast():\n",
378
+ " outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device), labels=labels.to(device))\n",
379
+ " loss = loss_fn(outputs.logits, labels)\n",
380
+ "\n",
381
+ " loss.backward()\n",
382
+ " optimizer.step()\n",
383
+ "\n",
384
+ " running_loss += loss.item()\n",
385
+ " train_losses.append((i, loss.item()))\n",
386
+ "\n",
387
+ " if (i+1) % (stop//20) == 0:\n",
388
+ " print(f'batch {i+1}/{len(train_loader)}, loss: {loss.item():.4f}, running loss: {running_loss:.4f}')\n",
389
+ " plt.title(f\"epoch:{epoch}, iter:{i}\")\n",
390
+ " plt.plot(*zip(*train_losses))\n",
391
+ " plt.ylabel(\"Loss\")\n",
392
+ " plt.xlabel(\"iter\")\n",
393
+ " plt.show()\n",
394
+ " # plt.savefig(f\"/content/drive/MyDrive/data/training_loss_dinner_{epoch}.png\")\n",
395
+ "\n",
396
+ " torch.save(model.state_dict(), f\"/content/drive/MyDrive/data/toxicity_classifier_dinner_epoch_{epoch+1}.pt\")\n",
397
+ " \n",
398
+ " print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Running Loss {running_loss:.4f}')\n",
399
+ "\n",
400
+ "# Save the trained model\n",
401
+ "model.save_pretrained('/content/drive/MyDrive/data/toxicity_classifier_dinner')"
402
+ ]
403
+ },
404
+ {
405
+ "cell_type": "markdown",
406
+ "metadata": {
407
+ "id": "grWMy6_zOBrC"
408
+ },
409
+ "source": [
410
+ "## test Model eval"
411
+ ]
412
+ },
413
+ {
414
+ "cell_type": "code",
415
+ "execution_count": null,
416
+ "metadata": {
417
+ "id": "lSYV8E76O1hY"
418
+ },
419
+ "outputs": [],
420
+ "source": [
421
+ "print(len(test_labels), len(test_comments))"
422
+ ]
423
+ },
424
+ {
425
+ "attachments": {},
426
+ "cell_type": "markdown",
427
+ "metadata": {},
428
+ "source": [
429
+ "In this code block, the saved pretrained model is loaded using the DistilBertForSequenceClassification class from the transformers library. The DistilBertTokenizerFast tokenizer is set up to tokenize the input data. A ToxicCommentsDataset is created using the test comments, test labels, tokenizer, and maximum sequence length. A DataLoader is created using the test dataset and batch size. The model is set to evaluation mode. Lists are created to store the predicted probabilities and true labels for each batch of data. The code iterates over the batches in the test data loader, moving the tensors to the device and disabling gradient computation. The forward pass is performed on the model, and the predicted probabilities are extracted using the sigmoid function. The probabilities and true labels are then appended to the respective lists. A progress update is printed every 20 batches."
430
+ ]
431
+ },
432
+ {
433
+ "cell_type": "code",
434
+ "execution_count": null,
435
+ "metadata": {
436
+ "id": "BP_G1GB0y9ae"
437
+ },
438
+ "outputs": [],
439
+ "source": [
440
+ "# Load the saved pretrained model\n",
441
+ "model = DistilBertForSequenceClassification.from_pretrained('/content/drive/MyDrive/data/toxicity_classifier')\n",
442
+ "\n",
443
+ "# Set the tokenizer\n",
444
+ "tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')\n",
445
+ "\n",
446
+ "# Create test dataset\n",
447
+ "test_dataset = ToxicCommentsDataset(test_comments, test_labels, tokenizer, max_length)\n",
448
+ "\n",
449
+ "# Create test data loader\n",
450
+ "test_loader = DataLoader(test_dataset, batch_size=batch_size)\n",
451
+ "\n",
452
+ "# Set model to eval mode\n",
453
+ "model.eval()\n",
454
+ "\n",
455
+ "# Create lists to store predictions and true labels\n",
456
+ "preds = []\n",
457
+ "true_labels = []\n",
458
+ "\n",
459
+ "model.to(device)\n",
460
+ "\n",
461
+ "# Iterate over batches in test data loader\n",
462
+ "for i, batch in enumerate(tqdm(test_loader)):\n",
463
+ " if not (i % 2 == 0):\n",
464
+ " continue\n",
465
+ "\n",
466
+ " # Move tensors to device\n",
467
+ " input_ids = batch['input_ids'].to(device)\n",
468
+ " attention_mask = batch['attention_mask'].to(device)\n",
469
+ " labels = batch['labels'].to(device)\n",
470
+ "\n",
471
+ " # Disable gradient computation\n",
472
+ " with torch.no_grad():\n",
473
+ " # Forward pass\n",
474
+ " outputs = model(input_ids, attention_mask=attention_mask)\n",
475
+ "\n",
476
+ " # Get predicted probabilities\n",
477
+ " probs = torch.sigmoid(outputs.logits)\n",
478
+ "\n",
479
+ " # Append probabilities and true labels to lists\n",
480
+ " preds += probs.cpu().numpy().tolist()\n",
481
+ " true_labels += labels.cpu().numpy().tolist()\n",
482
+ "\n",
483
+ " if i % 20 == 0:\n",
484
+ " print(f\"Processed {i}/{len(test_loader)} batches\")"
485
+ ]
486
+ },
487
+ {
488
+ "attachments": {},
489
+ "cell_type": "markdown",
490
+ "metadata": {},
491
+ "source": [
492
+ "This code snippet calculates several evaluation metrics for a toxicity classifier model trained on a dataset of toxic and non-toxic comments. The evaluation metrics calculated are accuracy, precision, recall, and F1 score.\n",
493
+ "\n",
494
+ "First, the predicted probabilities and true labels are flattened into 1D arrays using list comprehensions. Then, a binary label is assigned to each prediction based on a given threshold. If the predicted probability is greater than or equal to the threshold, the label is set to 1, otherwise, it is set to 0.\n",
495
+ "\n"
496
+ ]
497
+ },
498
+ {
499
+ "cell_type": "code",
500
+ "execution_count": null,
501
+ "metadata": {
502
+ "id": "HPP5iHgd57ag"
503
+ },
504
+ "outputs": [],
505
+ "source": [
506
+ "# !pip install scikit-learn\n",
507
+ "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n",
508
+ "\n",
509
+ "print(preds)\n",
510
+ "\n",
511
+ "# Calculate metrics\n",
512
+ "# Flatten the predictions and true labels to 1D arrays\n",
513
+ "flat_preds = [p for sublist in preds for p in sublist]\n",
514
+ "flat_true_labels = [l for sublist in true_labels for l in sublist]\n",
515
+ "print(len(flat_true_labels), len(flat_preds))\n",
516
+ "\n",
517
+ "# Convert predicted probabilities to binary labels based on threshold\n",
518
+ "threshold = 0.66666 # Thresholds for class 0 and class 1\n",
519
+ "preds_binary = []\n",
520
+ "for id in preds:\n",
521
+ " for prob in id:\n",
522
+ " if prob >= threshold: preds_binary.append(1)\n",
523
+ " else: preds_binary.append(0)\n",
524
+ "\n",
525
+ "print(preds_binary)\n",
526
+ "\n",
527
+ "# Calculate metrics for binary predictions\n",
528
+ "accuracy = accuracy_score(flat_true_labels, preds_binary)\n",
529
+ "precision = precision_score(flat_true_labels, preds_binary)\n",
530
+ "recall = recall_score(flat_true_labels, preds_binary)\n",
531
+ "f1 = f1_score(flat_true_labels, preds_binary)\n",
532
+ "\n",
533
+ "print('Accuracy: ', accuracy)\n",
534
+ "print('Precision: ', precision)\n",
535
+ "print('Recall: ', recall)\n",
536
+ "print('F1: ', f1)\n",
537
+ "\n"
538
+ ]
539
+ }
540
+ ],
541
+ "metadata": {
542
+ "accelerator": "GPU",
543
+ "colab": {
544
+ "provenance": []
545
+ },
546
+ "gpuClass": "standard",
547
+ "kernelspec": {
548
+ "display_name": "Python 3",
549
+ "name": "python3"
550
+ },
551
+ "language_info": {
552
+ "name": "python"
553
+ }
554
+ },
555
+ "nbformat": 4,
556
+ "nbformat_minor": 0
557
+ }