# -*- coding: utf-8 -*-
"""FinetuneHUPD.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/17c2CQZx_kyD3-0fuQqv_pCMJ0Evd7fLN
"""
# Datasets load_dataset function
from datasets import load_dataset
# Transformers AutoTokenizer and the DistilBERT sequence-classification model
from transformers import AutoTokenizer, DistilBertForSequenceClassification
# AdamW optimizer (transformers' own AdamW is deprecated, so use the torch one)
from torch.optim import AdamW
from torch.utils.data import DataLoader
import torch
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-31',
    val_filing_start_date='2016-01-01',
    val_filing_end_date='2016-01-31',
)
print('Loading is done!')
# Label-to-index mapping for the decision status field
decision_to_str = {'REJECTED': 0, 'ACCEPTED': 1, 'PENDING': 2, 'CONT-REJECTED': 3, 'CONT-ACCEPTED': 4, 'CONT-PENDING': 5}
# Helper function: map the decision string to its integer label
def map_decision_to_string(example):
    return {'decision': decision_to_str[example['decision']]}
# Re-labeling/mapping.
train_set = dataset_dict['train'].map(map_decision_to_string)
val_set = dataset_dict['validation'].map(map_decision_to_string)
# Focus on the abstract section and tokenize the text using the tokenizer.
_SECTION_ = 'abstract'
# Training set
train_set = train_set.map(
    lambda e: tokenizer(e[_SECTION_], truncation=True, padding='max_length'),
    batched=True)
# Validation set
val_set = val_set.map(
    lambda e: tokenizer(e[_SECTION_], truncation=True, padding='max_length'),
    batched=True)
# Set the format
train_set.set_format(type='torch',
                     columns=['input_ids', 'attention_mask', 'decision'])
val_set.set_format(type='torch',
                   columns=['input_ids', 'attention_mask', 'decision'])
#print(train_set['decision'])
# Training and validation dataloaders
train_dataloader = DataLoader(train_set, batch_size=16)
val_dataloader = DataLoader(val_set, batch_size=16)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Six decision classes (see decision_to_str above), so set num_labels accordingly
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(decision_to_str))
model.to(device)
print(device)
print("torch cuda is avail: ")
print(torch.cuda.is_available())
model.train()
optim = AdamW(model.parameters(), lr=5e-5)
num_training_epochs = 2
for epoch in range(num_training_epochs):
    for batch in train_dataloader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['decision'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()
        print(f"epoch {epoch}: batch finished, loss = {loss.item():.4f}")
model.eval()
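# The script switches to eval mode but never measures validation performance or
# saves the finetuned weights. Below is a minimal sketch of both steps; the
# output directory name 'finetuned-distilbert-hupd' is an assumption, not part
# of the original script.
correct = 0
total = 0
with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['decision'].to(device)
        # Forward pass without labels; take the argmax over the six decision classes
        logits = model(input_ids, attention_mask=attention_mask).logits
        preds = logits.argmax(dim=-1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print(f"Validation accuracy: {correct / total:.4f}")
# Persist the finetuned model and tokenizer (assumed output path)
model.save_pretrained('finetuned-distilbert-hupd')
tokenizer.save_pretrained('finetuned-distilbert-hupd')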