# -*- coding: utf-8 -*-
"""MiniLM.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1XhhoPPH_g3mfrRD7SEiB0dhLld8GfTcw
"""

!pip install transformers torch

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertForPreTraining, BertTokenizerFast

# 'ao_childes_curriculum.txt' contains your text data (assumed here to hold one utterance per line)
with open('ao_childes_curriculum.txt', 'r', encoding='utf-8') as file:
    lines = [line.strip() for line in file if line.strip()]

# Initialize the MiniLM-like tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=512)

# Tokenize each line separately; tokenizing the whole file as a single string
# would keep only the first 512 tokens after truncation
tokenized_data = tokenizer(lines, return_tensors='pt', max_length=512, truncation=True, padding=True)

# custom dataset
class CustomDataset(Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
        }

# instance of the custom dataset
dataset = CustomDataset(tokenized_data)
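# Quick sanity check (illustrative, not required by the training pipeline):
# how many tokenized lines the dataset holds and the padded length of one item
print(f'{len(dataset)} examples, sequence length {dataset[0]["input_ids"].shape[0]}')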

# Define the MiniLM-like configuration
config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=256,
    num_hidden_layers=3,
    num_attention_heads=4,
)

# Initialize the MiniLM-like model
model = BertForPreTraining(config=config)
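# Rough size check (illustrative): count the trainable parameters of the
# MiniLM-like configuration defined above
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'{num_params:,} trainable parameters')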

# Set up the DataLoader
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Optimizer and loss function (padding tokens are excluded from the loss)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Training loop
num_epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        prediction_logits = outputs.prediction_logits

        # Compute loss against the (unmasked) inputs; without token masking the
        # model only learns to reconstruct what it already sees (see the
        # masked-LM sketch after this training cell for the standard recipe)
        loss = criterion(prediction_logits.view(-1, tokenizer.vocab_size), input_ids.view(-1))

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f'Epoch {epoch + 1}, Batch loss: {loss.item()}')

# Save the trained model together with the tokenizer so both can be reloaded later
model.save_pretrained('my_miniLM_model')
tokenizer.save_pretrained('my_miniLM_model')
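# --- Optional masked-LM training sketch ---
# The loop above computes the loss against the raw, unmasked inputs, so nothing
# is ever hidden from the model. The usual BERT-style recipe masks ~15% of the
# tokens and predicts only those. A minimal sketch using the dataset, config,
# and device defined above; BertForMaskedLM, the 15% masking rate, and the
# single illustrative pass are assumptions, not part of the original notebook.
from transformers import BertForMaskedLM, DataCollatorForLanguageModeling

mlm_model = BertForMaskedLM(config=config).to(device)
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
mlm_loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collator)
mlm_optimizer = torch.optim.AdamW(mlm_model.parameters(), lr=1e-4)

for batch in mlm_loader:  # one illustrative pass over the data
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = mlm_model(**batch)  # 'labels' from the collator are -100 except at masked positions
    mlm_optimizer.zero_grad()
    outputs.loss.backward()
    mlm_optimizer.step()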

import torch
from transformers import BertTokenizer, BertLMHeadModel, pipeline

# Load the trained MiniLM weights into a BERT LM head; BERT is bidirectional,
# so is_decoder=True is needed for left-to-right generation, and sample
# quality will be limited
model = BertLMHeadModel.from_pretrained('my_miniLM_model', is_decoder=True)

# Load the tokenizer saved alongside the model
tokenizer = BertTokenizer.from_pretrained('my_miniLM_model')

# Generate text samples using the model
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
samples = generator('Your prompt here', max_length=100, num_return_sequences=1)

# Print generated samples
for i, sample in enumerate(samples):
    print(f"Sample {i + 1}: {sample['generated_text']}")