# -*- coding: utf-8 -*-
"""MiniLM.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1XhhoPPH_g3mfrRD7SEiB0dhLld8GfTcw
"""

!pip install transformers torch

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertForPreTraining, BertTokenizerFast

# 'ao_childes_curriculum.txt' contains your text data
with open('ao_childes_curriculum.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

# Initialize the tokenizer (the standard BERT WordPiece tokenizer is reused;
# only the model itself is MiniLM-sized)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=512)

# Tokenize the text data. Note that the whole file is encoded as a single
# sequence and truncated, so only the first 512 tokens are kept and the
# dataset below contains exactly one example.
tokenized_data = tokenizer(text_data, return_tensors='pt', max_length=512, truncation=True)

# Custom dataset wrapping the tokenized tensors
class CustomDataset(Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
        }

# Instance of the custom dataset
dataset = CustomDataset(tokenized_data)

# Define the MiniLM-like configuration (small hidden size, few layers and heads)
config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=256,
    num_hidden_layers=3,
    num_attention_heads=4,
)

# Initialize the MiniLM-like model from scratch
model = BertForPreTraining(config=config)

# Set up the DataLoader
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss()

# Training loop. Note that the loss below asks the model to reproduce its own
# unmasked input, i.e. there is no [MASK] corruption; see the MLM sketch at
# the end of this script for how masking is usually added.
num_epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Forward pass (prediction_logits are the token-level MLM-head logits)
        outputs = model(input_ids, attention_mask=attention_mask)
        prediction_logits = outputs.prediction_logits

        # Compute loss against the (unmasked) input ids
        loss = criterion(prediction_logits.view(-1, tokenizer.vocab_size), input_ids.view(-1))

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch + 1}, last batch loss: {loss.item()}')

# Save the trained model
model.save_pretrained('my_miniLM_model')

from transformers import BertTokenizer, BertLMHeadModel, pipeline

# Load the trained weights into an LM-head model so the text-generation
# pipeline can use them. BERT is a masked (not autoregressive) model, so
# transformers warns that is_decoder=True is expected for generation and the
# output is only a rough sanity check of the training run.
model = BertLMHeadModel.from_pretrained('my_miniLM_model')

# Load the tokenizer (the in-memory tokenizer from above is reused; the line
# below shows how it would be reloaded if it had been saved with the model)
# tokenizer = BertTokenizer.from_pretrained('path/to/your/trained/model')

# Generate text samples using the model
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
samples = generator('Your prompt here', max_length=100, num_return_sequences=1)

# Print generated samples
for i, sample in enumerate(samples):
    print(f"Sample {i + 1}: {sample['generated_text']}")
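
# ---------------------------------------------------------------------------
# Sketch: adding real masked-language-modelling (MLM) to the training loop.
# The loop above computes cross-entropy against the *unmasked* input ids, so
# the model mostly learns to copy its input. Below is a minimal sketch of the
# usual fix, assuming Hugging Face's DataCollatorForLanguageModeling; the
# 128-token chunk length, batch size 8, and 15% mask probability are
# illustrative choices, not part of the original notebook. It is meant to
# replace the "Training loop" section above, while `model` still refers to
# the BertForPreTraining instance (not the BertLMHeadModel loaded later).
from transformers import DataCollatorForLanguageModeling

mlm_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Chunk the whole corpus so the dataset has more than one example
# (the code above keeps only the first 512 tokens of the file).
all_ids = tokenizer(text_data, truncation=False, return_tensors='pt')['input_ids'][0]
chunk_len = 128
chunks = [all_ids[i:i + chunk_len] for i in range(0, len(all_ids) - chunk_len, chunk_len)]

# The collator stacks the chunks, replaces ~15% of tokens with [MASK] (or a
# random / unchanged token), and returns the original ids as `labels` with
# -100 at unmasked positions so they are ignored by the loss.
mlm_loader = DataLoader(chunks, batch_size=8, shuffle=True, collate_fn=mlm_collator)

mlm_criterion = torch.nn.CrossEntropyLoss()  # ignore_index defaults to -100
model.train()
for batch in mlm_loader:
    input_ids = batch['input_ids'].to(device)
    labels = batch['labels'].to(device)

    # Loss is computed only at the masked positions
    outputs = model(input_ids)
    mlm_loss = mlm_criterion(
        outputs.prediction_logits.view(-1, tokenizer.vocab_size),
        labels.view(-1),
    )

    optimizer.zero_grad()
    mlm_loss.backward()
    optimizer.step()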