|
|
|
"""MiniLM.ipynb |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1XhhoPPH_g3mfrRD7SEiB0dhLld8GfTcw |
|
""" |
|
|
|
!pip install transformers torch |
|
|
|
import torch |
|
from torch.utils.data import Dataset, DataLoader |
|
from transformers import BertConfig, BertForPreTraining, BertTokenizerFast
|
|
|
|
|
# Load the AO-CHILDES curriculum (age-ordered child-directed speech) as raw text.
with open('ao_childes_curriculum.txt', 'r', encoding='utf-8') as file:
|
text_data = file.read() |
|
|
|
|
|
# Reuse the pretrained bert-base-uncased WordPiece vocabulary rather than training a new one.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=512)
|
|
|
|
|
# Tokenize the corpus into 512-token chunks. Tokenizing the file as a single
# truncated sequence would discard everything past the first 512 tokens and
# leave only one training example.
tokenized_data = tokenizer(text_data, return_tensors='pt', max_length=512,
                           truncation=True, padding='max_length',
                           return_overflowing_tokens=True)
|
|
|
|
|
class CustomDataset(Dataset):
    """Wraps the tokenized corpus so each 512-token chunk is one example."""
|
def __init__(self, tokenized_data): |
|
self.input_ids = tokenized_data['input_ids'] |
|
self.attention_mask = tokenized_data['attention_mask'] |
|
|
|
def __len__(self): |
|
return len(self.input_ids) |
|
|
|
def __getitem__(self, idx): |
|
return { |
|
'input_ids': self.input_ids[idx], |
|
'attention_mask': self.attention_mask[idx], |
|
} |
|
|
|
|
|
dataset = CustomDataset(tokenized_data) |
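
# Quick sanity check (our addition, not in the original notebook): confirm the
# corpus produced more than one 512-token chunk and shapes are as expected.
print(len(dataset), 'examples of shape', dataset[0]['input_ids'].shape)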
|
|
|
|
|
# A deliberately small BERT: 3 layers, 256-dim hidden states, 4 attention heads.
config = BertConfig(
|
vocab_size=tokenizer.vocab_size, |
|
hidden_size=256, |
|
num_hidden_layers=3, |
|
num_attention_heads=4, |
|
) |
|
|
|
|
|
model = BertForPreTraining(config=config) |
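
# Rough size check (our addition): confirm the configuration really yields a
# "mini" model by BERT standards.
n_params = sum(p.numel() for p in model.parameters())
print(f'{n_params / 1e6:.1f}M parameters')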
|
|
|
|
|
dataloader = DataLoader(dataset, batch_size=2, shuffle=True) |
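
# An aside on the objective: the loop below feeds unmasked input_ids back in as
# their own targets, so the model largely learns to copy its input. Genuine MLM
# pretraining masks ~15% of tokens first. A minimal sketch with Hugging Face's
# DataCollatorForLanguageModeling; `mlm_dataloader` is our name and is not used
# by the rest of the script:
from transformers import DataCollatorForLanguageModeling

mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True,
                                               mlm_probability=0.15)
mlm_dataloader = DataLoader(dataset, batch_size=2, shuffle=True,
                            collate_fn=mlm_collator)
# Batches from mlm_dataloader carry 'labels' with -100 at unmasked positions.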
|
|
|
|
|
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4) |
|
criterion = torch.nn.CrossEntropyLoss() |
|
|
|
|
|
num_epochs = 100 |
|
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') |
|
model.to(device) |
|
|
|
for epoch in range(num_epochs): |
|
for batch in dataloader: |
|
input_ids = batch['input_ids'].to(device) |
|
attention_mask = batch['attention_mask'].to(device) |
|
|
|
|
|
        # BertForPreTraining returns MLM logits (prediction_logits) and NSP
        # logits (seq_relationship_logits); only the MLM head is trained here.
        outputs = model(input_ids, attention_mask=attention_mask)
        prediction_logits = outputs.prediction_logits
|
|
|
|
|
        # Replace padding positions with -100, which CrossEntropyLoss ignores
        # by default, so pad tokens do not contribute to the loss.
        labels = input_ids.masked_fill(attention_mask == 0, -100)
        loss = criterion(prediction_logits.view(-1, tokenizer.vocab_size), labels.view(-1))
|
|
|
|
|
optimizer.zero_grad() |
|
loss.backward() |
|
optimizer.step() |
|
|
|
    print(f'Epoch {epoch + 1}, last batch loss: {loss.item():.4f}')
|
|
|
|
|
model.save_pretrained('my_miniLM_model') |
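
# Saving the tokenizer alongside the weights (our addition) makes the
# checkpoint directory self-contained, so it can be reloaded without
# rerunning this script.
tokenizer.save_pretrained('my_miniLM_model')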
|
|
|
from transformers import BertLMHeadModel, pipeline
|
|
|
|
|
# Reload the checkpoint with a plain LM head; the NSP head is dropped, and
# is_decoder=True lets the bidirectionally trained weights run autoregressively.
model = BertLMHeadModel.from_pretrained('my_miniLM_model', is_decoder=True)
|
# BERT was not trained left-to-right, so sampled text will be rough, but the
# text-generation pipeline will still run with the decoder-flagged model.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
samples = generator('Your prompt here', max_length=100, num_return_sequences=1)
|
|
|
|
|
for i, sample in enumerate(samples): |
|
print(f"Sample {i + 1}: {sample['generated_text']}") |
|
|
|
|