# This file is meant to be run once, so the pipeline is written as sequential code rather than wrapped in functions.
import pandas as pd
from transformers import AutoTokenizer, set_seed
from transformers import DataCollatorForLanguageModeling
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset
from z_utils import get_dataframe
# Constants
INP_DATASET_CSV = "clean_books_summary.csv"
BASE_CAUSAL_MODEL = "openai-community/gpt2"
# TRAINED_MODEL_OUTPUT_DIR = "gpt2-book-summary-generator"  # also used as the HF Hub repo name
TRAINED_MODEL_OUTPUT_DIR = "content"  # also used as the HF Hub repo name
EPOCHS = 2
LR = 2e-5

set_seed(42)
# Load dataset
books: pd.DataFrame = get_dataframe(INP_DATASET_CSV)
# Create an HF Dataset, which makes preprocessing at scale easier
dataset_books = Dataset.from_pandas(books, split="train")
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_CAUSAL_MODEL)
# Data preprocessing
def preprocess_function(book):
    '''Convert a dataset row into prompt form and tokenize it.'''
    # The f-string is multiline, so its continuation lines are deliberately left
    # unindented; otherwise the leading whitespace would end up inside the prompt.
    text = f'''Genre: {book['categories']}
Book Title: {book['book_name']}
Description: {book['book_name']} {book['summaries']}
'''
    return tokenizer(text)
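# For illustration (hypothetical values), a single row becomes a prompt of the form:
#   Genre: Fiction
#   Book Title: Example Title
#   Description: Example Title <summary text...>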
# Apply preprocessing to every row
tokenized_dataset_books = dataset_books.map(
    preprocess_function,
    # batched=True,  # preprocess_function formats one row at a time, so batched mapping is left off
    num_proc=4,
    remove_columns=dataset_books.column_names,
)
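# After mapping, the original text columns are gone and each row holds the tokenizer
# output (input_ids and attention_mask). Note that no truncation is applied above, so
# prompts longer than GPT-2's 1,024-token context would need truncation=True in
# preprocess_function, depending on the data.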
# Data collator, required for causal LM: GPT-2 has no pad token, so reuse EOS for padding.
# mlm=False makes the collator produce labels for causal (next-token) language modeling.
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
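# Minimal sanity check (illustrative, assumes the dataset has at least two rows): the
# collator pads a small batch and adds "labels" that mirror input_ids, with padded
# positions set to -100 so they are ignored by the loss.
sample_batch = data_collator([tokenized_dataset_books[i] for i in range(2)])
print(sample_batch["input_ids"].shape, sample_batch["labels"].shape)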
# Load the base causal LM
model = AutoModelForCausalLM.from_pretrained(BASE_CAUSAL_MODEL)
training_args = TrainingArguments(
    output_dir=TRAINED_MODEL_OUTPUT_DIR,
    eval_strategy="no",
    learning_rate=LR,
    weight_decay=0.01,
    push_to_hub=True,
    num_train_epochs=EPOCHS,
    report_to="none",
)
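# Arguments not set above keep the Trainer defaults (e.g. per_device_train_batch_size=8),
# and eval_strategy="no" skips evaluation, matching the absence of an eval_dataset below.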
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_books,
    # eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
# Start training
trainer.train()
# Commit model files to HF
trainer.push_to_hub()
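# Minimal usage sketch: generate from the fine-tuned model once training and saving are
# done. The prompt values and generation settings below are illustrative assumptions.
from transformers import pipeline
generator = pipeline("text-generation", model=TRAINED_MODEL_OUTPUT_DIR)
prompt = "Genre: Fiction\nBook Title: Example Title\nDescription:"
print(generator(prompt, max_new_tokens=100)[0]["generated_text"])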