# -*- coding: utf-8 -*-
"""After model-fitting
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/after-model-fitting-b220d687-d8e5-4eb5-aafd-6a7e94d72073.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20240128/auto/storage/goog4_request%26X-Goog-Date%3D20240128T102031Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D31877cdd720f27bacaa0efcdbe500b0697792af355976ce5280054514cedfe1be4c17db45656212f46a080c0a7f0369fbd3d051fd9be4a1275e0ea4bd55be70f65a681f6868cda1616ea83b3c65a363b81d4f59b864aa1aa82188ce4bbfca0d326422ccfaf462a4a322a86e8d752e875e2c7940fde584e9a1f0e25847bb77ad8e0131724aaec47d49e4ab42a1d2be2199c9053a26a40f3bf2a31489822ec9bb6dd378bec74e97866da9613ee7c54c6ed2ce69eee5fe34ea90293cb546e4cb1f84b3fcc6563aea8318d70e68b71e43b6d85e04a20e01980dd0c94bb837aa81446d9ecfdad1d56cbc1c940670eba9cf9dc647a8972ac13c6af15a28da735db694f
"""
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
"""## Import Necessary Library"""
from transformers import AutoModel
from transformers import AutoTokenizer
from tokenizers import Tokenizer, trainers, pre_tokenizers, models
from transformers import DebertaTokenizer
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
#import spacy
import re
import gc
# ----------
config = {
    'model': 'kaitehtzeng/primary_app/microsoft/deberta-v3-base',
    'dropout': 0.2,
    'max_length': 512,
    'batch_size': 3,
    'epochs': 1,
    'lr': 1e-5,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'scheduler': 'CosineAnnealingWarmRestarts'
}
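# Training does not run in this app; the 'lr' and 'scheduler' entries above
# record how the model was fitted. A minimal sketch of how they would be
# wired up (assuming `model` is the network defined further below):
#
# optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])
# scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=config['epochs'])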
"""### Preparation
Comparing two essays. <br>
One predicted written by students, one predicted written by LLM
"""
train_essays = pd.read_csv("kaitehtzeng/primary_app/train_essays.csv")
import transformers
print('transformers version:', transformers.__version__)
#train_df,val_df = train_test_split(train_essays,test_size=0.2,random_state = 101)
#train_df, val_df = train_df.reset_index(), val_df.reset_index()
#print('dataframe shapes:',train_df.shape, val_df.shape)
tokenizer = AutoTokenizer.from_pretrained(config['model'])
tokenizer.train_new_from_iterator(train_essays['text'], 52000)
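# Note: train_new_from_iterator returns a new tokenizer rather than retraining
# this one in place, so the call above leaves `tokenizer` with the pretrained
# DeBERTa vocabulary (length 128001, matching the embedding resize below).
# A sketch of actually capturing the retrained tokenizer (not done here,
# because the saved checkpoint expects the pretrained vocabulary):
#
# new_tokenizer = tokenizer.train_new_from_iterator(train_essays['text'], 52000)
# print(len(new_tokenizer))  # at most the requested vocab size of 52000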
"""Build the Model"""
class mymodel(nn.Module):
    def __init__(self, config):
        super(mymodel, self).__init__()
        self.model_name = config['model']
        self.deberta = AutoModel.from_pretrained(self.model_name)
        # 128001 = len(tokenizer) for the pretrained deberta-v3 vocabulary
        self.deberta.resize_token_embeddings(128001)
        self.dropout = nn.Dropout(config['dropout'])
        self.fn0 = nn.Linear(self.deberta.config.hidden_size, 256)
        self.fn2 = nn.Linear(256, 1)
        # MeanPooling is defined below; the name is resolved when the class is instantiated
        self.pooling = MeanPooling()

    def forward(self, input):
        output = self.deberta(**input, return_dict=True)
        output = self.pooling(output['last_hidden_state'], input['attention_mask'])
        output = self.dropout(output)
        output = self.fn0(output)
        output = self.dropout(output)
        output = self.fn2(output)
        output = torch.sigmoid(output)
        return output
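# Shape trace through the classification head (a sketch, assuming the
# deberta-v3-base hidden size of 768):
#   last_hidden_state (B, L, 768) --MeanPooling--> (B, 768)
#   --fn0--> (B, 256) --fn2--> (B, 1) --sigmoid--> P(essay is AI-written)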
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Expand the attention mask to the hidden dimension so padded
        # positions contribute zero to the sum.
        new_weight = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        final = torch.sum(new_weight * last_hidden_state, 1)
        total_weight = new_weight.sum(1)
        total_weight = torch.clamp(total_weight, min=1e-9)  # avoid division by zero
        mean_embedding = final / total_weight
        return mean_embedding
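# Quick sanity check of the masked mean (illustration only, commented out):
#
# pool = MeanPooling()
# hidden = torch.ones(1, 4, 8)         # batch=1, seq_len=4, hidden_size=8
# mask = torch.tensor([[1, 1, 0, 0]])  # last two positions are padding
# print(pool(hidden, mask))            # a (1, 8) tensor of ones: padding is ignored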
model = mymodel(config).to(device=config['device'])
# map_location keeps the load working on CPU-only machines as well
model.load_state_dict(torch.load('kaitehtzeng/primary_app/my_model.pth', map_location=config['device']))
model.eval()
#preds = []
#for (inputs) in eval_loader:
# inputs = {k:inputs[k].to(device=config['device']) for k in inputs.keys()}
#
# outputs = model(inputs)
# preds.append(outputs.detach().cpu())
#preds = torch.concat(preds)
#val_df['preds'] = preds.numpy()
#val_df['AI'] = val_df['preds']>0.5
#sample_predict_AI = val_df.loc[val_df['AI'] == True].iloc[0]['text']
#sample_predict_student = val_df.loc[val_df['AI'] == False].iloc[0]['text']
#sample_predict_AI
#sample_predict_student
def trial(text):
    tokenized = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=config['max_length'],
        truncation=True,
        padding="max_length"
    )
    inputs = {
        "input_ids": torch.tensor(tokenized['input_ids'], dtype=torch.long),
        "token_type_ids": torch.tensor(tokenized['token_type_ids'], dtype=torch.long),
        "attention_mask": torch.tensor(tokenized['attention_mask'], dtype=torch.long)
    }
    # Add a batch dimension and move everything to the configured device.
    inputs = {k: inputs[k].unsqueeze(0).to(device=config['device']) for k in inputs.keys()}
    with torch.no_grad():  # inference only; no gradients needed
        score = model(inputs).item()
    if score >= 0.5:
        return "AI"
    else:
        return "Student"
import subprocess
# Use subprocess to run the pip install command
subprocess.run(['pip', 'install', '-q', 'gradio==3.45.0'])
import gradio as gr
demo = gr.Interface(
    fn=trial,
    inputs=gr.Textbox(placeholder="..."),
    outputs="textbox"
)
demo.launch(share=True)
"""### Model
Fine tuning the deberta-v3-base model with new-added layers
The model is later used to participate the Kaggle Competition:LLM - Detect AI Generated Text.
The Auc of the model is 0.75
"""