# -*- coding: utf-8 -*-
"""After model-fitting
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/after-model-fitting-b220d687-d8e5-4eb5-aafd-6a7e94d72073.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20240128/auto/storage/goog4_request%26X-Goog-Date%3D20240128T102031Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D31877cdd720f27bacaa0efcdbe500b0697792af355976ce5280054514cedfe1be4c17db45656212f46a080c0a7f0369fbd3d051fd9be4a1275e0ea4bd55be70f65a681f6868cda1616ea83b3c65a363b81d4f59b864aa1aa82188ce4bbfca0d326422ccfaf462a4a322a86e8d752e875e2c7940fde584e9a1f0e25847bb77ad8e0131724aaec47d49e4ab42a1d2be2199c9053a26a40f3bf2a31489822ec9bb6dd378bec74e97866da9613ee7c54c6ed2ce69eee5fe34ea90293cb546e4cb1f84b3fcc6563aea8318d70e68b71e43b6d85e04a20e01980dd0c94bb837aa81446d9ecfdad1d56cbc1c940670eba9cf9dc647a8972ac13c6af15a28da735db694f
"""
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil
CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'llm-detect-ai-generated-text:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F61542%2F7516023%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240128%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240128T102030Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D038d55997cf8a860737caadb5837a5ebfaaf8477d4523afa1008387fe39c3a0c58c1ddc811284f559dbb78fd8e0f8230fca333e828951b69e5d935955b9163461cbd2f4f8b3f321dd0e73d767e2ef1a8ceb52512ef8f8d99fd19c92abf23c5a856ebd3d9ed4ee28b4c31b83427a7dc10052602e6d604e2c55f51d8e26da1e2dacb2e720476c3b874b22d5a03e8dde81374f227c87a024dea36e5973a7cabcccdcec804ba2fd73b5397d7d334be750de7ea9d4a2c2dcb12b93f4d75c18f063ebf02ff802e8912122dbd5b25695e7658bffc61997b9893958b304068a6e593653b14959b5355f4b8bb09d5d01768dda2839e271941fabfddf3cc5d8cbc5cd06746,argugpt:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3946973%2F6867914%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240128%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240128T102030Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D490ee9c880e3988ac2d0ceedc2936a72525b02e00898ca8feae1456ecdd6a542f952cedb096ce8474098bc29e06744cea2433b38c55accab1c9656f43d1baccccd2b36486e1075525b59c4f61326c5a819dc3f1bed35c76c73ef646f21d71bf8f3e8d7eb94e6c21068392293b9ba1e7fc8ac286eb68a727ac479118880aeff2c08f2e3e013aa0e888c099fb5a54a83920cebbf3ca011d818e66787427bfddf16de31a61552638a21cf583099a16a3cc660817297abdd494a926a3d58196778021bc6ea4b20d0923d7fb588d4857e95dce2979e3b246e6e282ef0b0fcabaecd2dd632c413f7f723e1178d080fc89fb31cd9a4564c84b11062fb9229d61d2dbf4e,daigt-proper-train-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3942644%2F6890527%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240128%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240128T102031Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D352a1df1e329069e50e0d64cb012986e5c75605e915c0b16383182a8618769c5ee4e3dd3f59448b11d64187657833f7f3f3e30c7c21fc343af2c51111074ea60e70e904833ef6a3aa4ad4b4864d89b924a3f063e71c41dbee1bdf1d453dc2cbe62e8819854b6e71040ca0014522e9651b9e8e6640c6caee259e981486a3ee0793ee7f56068c3d7efe66941530d2669bb8d3f989fe7b4056a81f76b0870fa2cf21cce8641b4f3e8c0b90fab4ef495464f2700bd99f20d4d94e86c11bc06301b1fc49a63bee1db180b733a12dc20b3b0f109c15b172c1cf0f91234176030f5c2241e7f646d99238ff63fc36ca1b0419463f38fe3bd477790b060c88c2bc9441ac0'
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'
!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)
try:
os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
pass
try:
os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
pass
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
directory, download_url_encoded = data_source_mapping.split(':')
download_url = unquote(download_url_encoded)
filename = urlparse(download_url).path
destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
try:
with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
total_length = fileres.headers['content-length']
print(f'Downloading {directory}, {total_length} bytes compressed')
dl = 0
data = fileres.read(CHUNK_SIZE)
while len(data) > 0:
dl += len(data)
tfile.write(data)
done = int(50 * dl / int(total_length))
sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
sys.stdout.flush()
data = fileres.read(CHUNK_SIZE)
if filename.endswith('.zip'):
with ZipFile(tfile) as zfile:
zfile.extractall(destination_path)
else:
                with tarfile.open(tfile.name) as tar_file:  # renamed: 'as tarfile' shadowed the tarfile module
                    tar_file.extractall(destination_path)
print(f'\nDownloaded and uncompressed: {directory}')
except HTTPError as e:
print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
continue
except OSError as e:
print(f'Failed to load {download_url} to path {destination_path}')
continue
print('Data source import complete.')
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
!git clone https://huggingface.co/spaces/kaitehtzeng/primary_app
"""## Import Necessary Library"""
import torch.nn.functional as F
from transformers import AutoModel
from transformers import AutoTokenizer
from tokenizers import Tokenizer, trainers, pre_tokenizers, models
from transformers import DebertaTokenizer
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
#import spacy
import re
import gc
# ----------
import os
config = {
'model': '/kaggle/input/transformers-model-downloader-pytorch-tf2-0/microsoft/deberta-v3-base',
'dropout': 0.2,
'max_length': 512,
'batch_size':3,
'epochs': 1,
'lr': 1e-5,
'device': 'cuda' if torch.cuda.is_available() else 'cpu',
'scheduler': 'CosineAnnealingWarmRestarts'
}
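# The 'scheduler' entry above is never consumed in this inference-only script.
# A minimal sketch of how it would be wired up for fine-tuning, assuming an
# AdamW optimizer and the model defined further below (T_0 is illustrative):
# optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])
# scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10)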
"""### Preparation
Comparing two essays. <br>
One predicted written by students, one predicted written by LLM
"""
train_essays = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")
external = pd.read_csv("/kaggle/input/daigt-proper-train-dataset/train_drcat_04.csv")
df = pd.concat([
external[external.source=="persuade_corpus"].sample(10000,random_state=101),
external[external.source!='persuade_corpus']
])
df = df.reset_index()
df['stratify'] = df.label.astype(str)+df.source.astype(str)
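# Stratify on label+source so the 80/20 split preserves both the class balance
# and the per-source mix.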
train_df,val_df = train_test_split(df,test_size=0.2,random_state = 101,stratify=df['stratify'])
train_df, val_df = train_df.reset_index(), val_df.reset_index()
import transformers
print('transformers version:', transformers.__version__)
#train_df,val_df = train_test_split(train_essays,test_size=0.2,random_state = 101)
#train_df, val_df = train_df.reset_index(), val_df.reset_index()
#print('dataframe shapes:',train_df.shape, val_df.shape)
tokenizer = AutoTokenizer.from_pretrained(config['model'])
# NOTE: train_new_from_iterator returns a new tokenizer rather than retraining
# this one in place; the result is discarded here, so the pretrained DeBERTa
# vocabulary stays in effect for everything below.
tokenizer.train_new_from_iterator(train_essays['text'], 52000)
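# If a corpus-specific tokenizer were actually intended, the returned object
# would have to be captured and the embedding resize in mymodel.__init__ kept
# in sync with it; a sketch under that assumption, not what this script does:
# new_tokenizer = tokenizer.train_new_from_iterator(train_essays['text'], 52000)
# model.deberta.resize_token_embeddings(len(new_tokenizer))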
"""### Building Training Dataset and Loader"""
# DataLoader only requires __getitem__ and __len__, so no Dataset subclass is needed.
class EssayDataset:
    def __init__(self, df, config, tokenizer, is_test=False):
        self.df = df
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.config = config
    def token_start(self, idx):
        sample_text = self.df.loc[idx, 'text']
        # Use the dataset's own tokenizer rather than the global one.
        tokenized = self.tokenizer.encode_plus(sample_text,
                                               None,
                                               add_special_tokens=True,
                                               max_length=self.config['max_length'],
                                               truncation=True,
                                               padding="max_length")
        inputs = {
            "input_ids": torch.tensor(tokenized['input_ids'], dtype=torch.long),
            "token_type_ids": torch.tensor(tokenized['token_type_ids'], dtype=torch.long),
            "attention_mask": torch.tensor(tokenized['attention_mask'], dtype=torch.long)
        }
        return inputs
    def __getitem__(self, idx):
        input_text = self.token_start(idx)
        if self.is_test:
            return input_text
        labels = self.df.loc[idx, 'label']
        targets = {'labels': torch.tensor(labels, dtype=torch.float32)}
        return input_text, targets
    def __len__(self):
        return len(self.df)
eval_ds = EssayDataset(val_df,config,tokenizer = tokenizer,is_test=True)
eval_loader = torch.utils.data.DataLoader(eval_ds,
batch_size= config['batch_size'])
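# Only the evaluation loader is needed below; a matching training loader
# (unused in this inference-only script) would be built the same way, with
# shuffling enabled. A sketch:
# train_ds = EssayDataset(train_df, config, tokenizer=tokenizer)
# train_loader = torch.utils.data.DataLoader(train_ds,
#                                            batch_size=config['batch_size'],
#                                            shuffle=True)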
"""Build the Model"""
class mymodel(nn.Module):
    def __init__(self, config):
        super(mymodel, self).__init__()
        self.model_name = config['model']
        self.deberta = AutoModel.from_pretrained(self.model_name)
        # 128001 = len(tokenizer) for the pretrained deberta-v3-base tokenizer
        self.deberta.resize_token_embeddings(128001)
        self.dropout = nn.Dropout(config['dropout'])
        self.fn0 = nn.Linear(self.deberta.config.hidden_size, 256)
        self.fn2 = nn.Linear(256, 1)
        self.pooling = MeanPooling()
    def forward(self, inputs):
        output = self.deberta(**inputs, return_dict=True)
        # Mask-aware mean over token embeddings, then a small classification head.
        output = self.pooling(output['last_hidden_state'], inputs['attention_mask'])
        output = self.dropout(output)
        output = self.fn0(output)
        output = self.dropout(output)
        output = self.fn2(output)
        output = torch.sigmoid(output)
        return output
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
    def forward(self, last_hidden_state, attention_mask):
        # Expand the mask to the hidden size so padded tokens are zeroed out.
        new_weight = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        final = torch.sum(new_weight * last_hidden_state, 1)
        total_weight = new_weight.sum(1)
        total_weight = torch.clamp(total_weight, min=1e-9)  # avoid division by zero
        mean_embedding = final / total_weight
        return mean_embedding
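# Quick sanity check for MeanPooling with toy dimensions: padded positions
# (mask == 0) are excluded from the average, and the output keeps the hidden
# size. Illustrative only; the tensors here are random.
_hidden = torch.randn(2, 4, 8)                      # (batch, seq_len, hidden)
_mask = torch.tensor([[1, 1, 0, 0], [1, 1, 1, 1]])  # row 0 has two padded slots
assert MeanPooling()(_hidden, _mask).shape == (2, 8)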
model = mymodel(config).to(device=config['device'])
# map_location keeps loading robust when the checkpoint was saved on GPU but
# this session only has a CPU.
model.load_state_dict(torch.load('/kaggle/input/fine-tune-model/my_model.pth',
                                 map_location=config['device']))
model.eval()
#preds = []
#for (inputs) in eval_loader:
# inputs = {k:inputs[k].to(device=config['device']) for k in inputs.keys()}
#
# outputs = model(inputs)
# preds.append(outputs.detach().cpu())
#preds = torch.concat(preds)
#val_df['preds'] = preds.numpy()
#val_df['AI'] = val_df['preds']>0.5
#sample_predict_AI = val_df.loc[val_df['AI'] == True].iloc[0]['text']
#sample_predict_student = val_df.loc[val_df['AI'] == False].iloc[0]['text']
#sample_predict_AI
#sample_predict_student
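# A runnable version of the commented-out batch evaluation above, wrapped in
# torch.no_grad() so no autograd state accumulates during inference (a sketch,
# left commented out because the Gradio app below only needs trial()):
# preds = []
# with torch.no_grad():
#     for inputs in eval_loader:
#         inputs = {k: v.to(device=config['device']) for k, v in inputs.items()}
#         preds.append(model(inputs).cpu())
# preds = torch.concat(preds).squeeze(1)
# val_df['preds'] = preds.numpy()
# val_df['AI'] = val_df['preds'] > 0.5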
def trial(text):
    tokenized = tokenizer.encode_plus(text,
                                      None,
                                      add_special_tokens=True,
                                      max_length=config['max_length'],
                                      truncation=True,
                                      padding="max_length")
    inputs = {
        "input_ids": torch.tensor(tokenized['input_ids'], dtype=torch.long),
        "token_type_ids": torch.tensor(tokenized['token_type_ids'], dtype=torch.long),
        "attention_mask": torch.tensor(tokenized['attention_mask'], dtype=torch.long)
    }
    # Add a batch dimension and move everything to the model's device.
    inputs = {k: v.unsqueeze(0).to(device=config['device']) for k, v in inputs.items()}
    # Inference only: no gradients needed.
    with torch.no_grad():
        score = model(inputs).item()
    return "AI" if score >= 0.5 else "Student"
!pip install -q gradio==3.45.0
import gradio as gr
trial('This is a quick smoke test sentence.')  # sanity-check the pipeline end to end
demo = gr.Interface(
fn=trial,
inputs=gr.Textbox(placeholder="..."),
outputs="textbox"
)
demo.launch(share=True)
"""### Model
Fine tuning the deberta-v3-base model with new-added layers
The model is later used to participate the Kaggle Competition:LLM - Detect AI Generated Text.
The Auc of the model is 0.75
"""
!git push