import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from datasets import Dataset as HFDataset
import pandas as pd
import os
# Ensure the model/ output directory exists
model_dir = 'model'
os.makedirs(model_dir, exist_ok=True)
# Load datasets from the Arrow files
train_dataset = HFDataset.from_file('train/data-00000-of-00001.arrow')
val_dataset = HFDataset.from_file('validation/data-00000-of-00001.arrow')
test_dataset = HFDataset.from_file('test/data-00000-of-00001.arrow')
# Convert datasets to pandas DataFrame
train_df = train_dataset.to_pandas()
val_df = val_dataset.to_pandas()
test_df = test_dataset.to_pandas()
# Remove question marks at the end of each query
train_df['content'] = train_df['content'].str.rstrip('?')
val_df['content'] = val_df['content'].str.rstrip('?')
test_df['content'] = test_df['content'].str.rstrip('?')
# Convert labels to integers (0 or 1)
train_df['rating'] = train_df['rating'].apply(lambda x: int(x >= 0.5))
val_df['rating'] = val_df['rating'].apply(lambda x: int(x >= 0.5))
test_df['rating'] = test_df['rating'].apply(lambda x: int(x >= 0.5))
# Initialize ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
# Custom Dataset class for PyTorch
class QueryDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=32):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])  # Ensure label is an integer
        # Call the tokenizer directly (encode_plus is deprecated)
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',  # Pad to a consistent length
            truncation=True,       # Truncate longer sequences
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }
# Prepare datasets
train_dataset = QueryDataset(train_df['content'].values, train_df['rating'].values, tokenizer)
val_dataset = QueryDataset(val_df['content'].values, val_df['rating'].values, tokenizer)
test_dataset = QueryDataset(test_df['content'].values, test_df['rating'].values, tokenizer)
# DataLoaders
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
# Load ALBERT model
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Optimizer and loss function
optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()
# Training loop
epochs = 4
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}')

    # Validation step at the end of each epoch
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            correct_predictions += (preds == labels).sum().item()
            total_predictions += labels.size(0)
    accuracy = correct_predictions / total_predictions
    print(f'Validation Accuracy after Epoch {epoch + 1}: {accuracy:.4f}')
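
# Final evaluation on the held-out test set: a minimal sketch mirroring the
# validation loop above (test_loader is otherwise unused in this script)
model.eval()
test_correct = 0
test_total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        test_correct += (preds == labels).sum().item()
        test_total += labels.size(0)
print(f'Test Accuracy: {test_correct / test_total:.4f}')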
# Save the model and tokenizer to the model/ directory
model.save_pretrained(model_dir, safe_serialization=True)  # Save model weights in safetensors format
tokenizer.save_pretrained(model_dir)
# model.save_pretrained already writes config.json with num_labels=2, so
# re-saving a fresh base AlbertConfig here would only overwrite those
# fine-tuned classifier details with the base model's settings.
print(f"Model and all required files saved to {model_dir}")