|
|
|
"""MiniLM.ipynb |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1XhhoPPH_g3mfrRD7SEiB0dhLld8GfTcw |
|
""" |
|
|
|
!pip install transformers torch |
|
|
|
import torch |
|
from torch.utils.data import Dataset, DataLoader |
|
from transformers import BertConfig, BertForPreTraining, BertTokenizerFast
|
|
|
|
|
# Load the AO-CHILDES curriculum (age-ordered child-directed speech) as raw text.
with open('ao_childes_curriculum.txt', 'r', encoding='utf-8') as file:
|
text_data = file.read() |
|
|
|
|
|
# Reuse the pretrained bert-base-uncased WordPiece vocabulary rather than training a new one.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=512)
|
|
|
|
|
# Tokenize the corpus into 512-token chunks. Tokenizing the file as a single
# truncated sequence would discard everything past the first 512 tokens and
# leave only one training example.
tokenized_data = tokenizer(text_data, return_tensors='pt', max_length=512,
                           truncation=True, padding='max_length',
                           return_overflowing_tokens=True)
|
|
|
|
|
class CustomDataset(Dataset):
    """Wraps the tokenized corpus so each 512-token chunk is one example."""
|
def __init__(self, tokenized_data): |
|
self.input_ids = tokenized_data['input_ids'] |
|
self.attention_mask = tokenized_data['attention_mask'] |
|
|
|
def __len__(self): |
|
return len(self.input_ids) |
|
|
|
def __getitem__(self, idx): |
|
return { |
|
'input_ids': self.input_ids[idx], |
|
'attention_mask': self.attention_mask[idx], |
|
} |
|
|
|
|
|
dataset = CustomDataset(tokenized_data) |
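
# Quick sanity check (our addition, not in the original notebook): confirm the
# corpus produced more than one 512-token chunk and shapes are as expected.
print(len(dataset), 'examples of shape', dataset[0]['input_ids'].shape)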
|
|
|
|
|
# A deliberately small BERT: 3 layers, 256-dim hidden states, 4 attention heads.
config = BertConfig(
|
vocab_size=tokenizer.vocab_size, |
|
hidden_size=256, |
|
num_hidden_layers=3, |
|
num_attention_heads=4, |
|
) |
|
|
|
|
|
model = BertForPreTraining(config=config) |
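
# Rough size check (our addition): confirm the configuration really yields a
# "mini" model by BERT standards.
n_params = sum(p.numel() for p in model.parameters())
print(f'{n_params / 1e6:.1f}M parameters')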
|
|
|
|
|
dataloader = DataLoader(dataset, batch_size=2, shuffle=True) |
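
# An aside on the objective: the loop below feeds unmasked input_ids back in as
# their own targets, so the model largely learns to copy its input. Genuine MLM
# pretraining masks ~15% of tokens first. A minimal sketch with Hugging Face's
# DataCollatorForLanguageModeling; `mlm_dataloader` is our name and is not used
# by the rest of the script:
from transformers import DataCollatorForLanguageModeling

mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True,
                                               mlm_probability=0.15)
mlm_dataloader = DataLoader(dataset, batch_size=2, shuffle=True,
                            collate_fn=mlm_collator)
# Batches from mlm_dataloader carry 'labels' with -100 at unmasked positions.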
|
|
|
|
|
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4) |
|
criterion = torch.nn.CrossEntropyLoss() |
|
|
|
|
|
num_epochs = 100 |
|
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') |
|
model.to(device) |
|
|
|
for epoch in range(num_epochs): |
|
for batch in dataloader: |
|
input_ids = batch['input_ids'].to(device) |
|
attention_mask = batch['attention_mask'].to(device) |
|
|
|
|
|
        # BertForPreTraining returns MLM logits (prediction_logits) and NSP
        # logits (seq_relationship_logits); only the MLM head is trained here.
        outputs = model(input_ids, attention_mask=attention_mask)
        prediction_logits = outputs.prediction_logits
|
|
|
|
|
        # Replace padding positions with -100, which CrossEntropyLoss ignores
        # by default, so pad tokens do not contribute to the loss.
        labels = input_ids.masked_fill(attention_mask == 0, -100)
        loss = criterion(prediction_logits.view(-1, tokenizer.vocab_size), labels.view(-1))
|
|
|
|
|
optimizer.zero_grad() |
|
loss.backward() |
|
optimizer.step() |
|
|
|
    print(f'Epoch {epoch + 1}, last batch loss: {loss.item():.4f}')
|
|
|
|
|
model.save_pretrained('my_miniLM_model') |
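
# Saving the tokenizer alongside the weights (our addition) makes the
# checkpoint directory self-contained, so it can be reloaded without
# rerunning this script.
tokenizer.save_pretrained('my_miniLM_model')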
|
|
|
from transformers import BertLMHeadModel, pipeline
|
|
|
|
|
# Reload the checkpoint with a plain LM head; the NSP head is dropped, and
# is_decoder=True lets the bidirectionally trained weights run autoregressively.
model = BertLMHeadModel.from_pretrained('my_miniLM_model', is_decoder=True)
|
# BERT was not trained left-to-right, so sampled text will be rough, but the
# text-generation pipeline will still run with the decoder-flagged model.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
samples = generator('Your prompt here', max_length=100, num_return_sequences=1)
|
|
|
|
|
for i, sample in enumerate(samples): |
|
print(f"Sample {i + 1}: {sample['generated_text']}") |
|
|
|
|