|
import torch
|
|
import torch.nn as nn
|
|
from torch.utils.data import DataLoader, Dataset
|
|
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, AlbertConfig
|
|
from datasets import Dataset as HFDataset
|
|
import pandas as pd
|
|
import os
|
|
|
|
|
|
# Directory that receives the fine-tuned model and tokenizer files.
model_dir = 'model'
os.makedirs(model_dir, exist_ok=True)

# Load the three splits from their Arrow files (train, validation, test order).
train_dataset, val_dataset, test_dataset = (
    HFDataset.from_file(arrow_path)
    for arrow_path in (
        'train/data-00000-of-00001.arrow',
        'validation/data-00000-of-00001.arrow',
        'test/data-00000-of-00001.arrow',
    )
)

# Convert each split to a pandas DataFrame for column-wise preprocessing.
train_df, val_df, test_df = (
    split.to_pandas() for split in (train_dataset, val_dataset, test_dataset)
)


def _to_binary_label(score):
    """Return 1 when ``score`` is at least 0.5, otherwise 0."""
    return int(score >= 0.5)


# Drop every trailing '?' from the text column of each split.
for _df in (train_df, val_df, test_df):
    _df['content'] = _df['content'].str.rstrip('?')

# Threshold the rating column at 0.5 to obtain 0/1 class labels.
for _df in (train_df, val_df, test_df):
    _df['rating'] = _df['rating'].apply(_to_binary_label)

# Tokenizer matching the pretrained checkpoint that is fine-tuned below.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
|
|
|
|
|
|
class QueryDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per lookup.

    Args:
        texts: Indexable, sized collection of raw texts; each item is coerced
            with ``str`` before tokenization.
        labels: Indexable collection of class labels aligned with ``texts``;
            each item is coerced with ``int``.
        tokenizer: Callable invoked as ``tokenizer(text, **options)``. It must
            return a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors (presumably a Hugging Face tokenizer - confirm at the
            call site).
        max_length: Length passed to the tokenizer for padding and truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length=32):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the sample at ``idx``.

        Returns:
            dict with flattened ``'input_ids'`` and ``'attention_mask'``
            tensors and a ``torch.long`` scalar tensor under ``'label'``.
        """
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        # Call the tokenizer directly: ``encode_plus`` is deprecated in favour
        # of ``__call__``, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # flatten() collapses the tokenizer output to one dimension so the
            # DataLoader's default collation can stack samples into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }
|
|
|
|
|
|
# Wrap each preprocessed DataFrame in a QueryDataset (text column + 0/1 label).
train_dataset = QueryDataset(train_df['content'].values, train_df['rating'].values, tokenizer)
val_dataset = QueryDataset(val_df['content'].values, val_df['rating'].values, tokenizer)
test_dataset = QueryDataset(test_df['content'].values, test_df['rating'].values, tokenizer)

# Only the training loader shuffles; validation and test keep dataset order.
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Pretrained ALBERT with a two-class classification head, on GPU if available.
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are pinned to the
# defaults of the transformers implementation (1e-6 and 0.0) so the
# optimisation settings stay as they were.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
criterion = nn.CrossEntropyLoss()
|
|
|
|
|
|
epochs = 4
for epoch in range(epochs):
    # ---- Training pass: one optimizer step per batch ----
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Move the three batch tensors to the model's device.
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'label')
        )

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}')

    # ---- Validation pass: accuracy over the whole validation loader ----
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = (
                batch[field].to(device)
                for field in ('input_ids', 'attention_mask', 'label')
            )

            outputs = model(input_ids, attention_mask=attention_mask)
            # Predicted class is the index of the largest logit in each row.
            preds = outputs.logits.argmax(dim=1)
            correct_predictions += (preds == labels).sum().item()
            total_predictions += labels.shape[0]

    accuracy = correct_predictions / total_predictions
    print(f'Validation Accuracy after Epoch {epoch + 1}: {accuracy:.4f}')
|
|
|
|
|
|
# Persist the fine-tuned weights (safetensors format) and the tokenizer files.
# save_pretrained also writes the model's own configuration alongside the
# weights, so no separate config save is required.
model.save_pretrained(model_dir, safe_serialization=True)
tokenizer.save_pretrained(model_dir)

# Keep `config` bound for any later code, but point it at the configuration of
# the fine-tuned model. Previously the base 'albert-base-v2' config was fetched
# again and saved over the config.json written above, replacing the fine-tuned
# model's configuration with the base checkpoint's.
config = model.config

print(f"Model and all required files saved to {model_dir}")
|
|
|