import numpy as np
import torch
import torch.nn as nn

from config import num_blocks, vocab_size, d_model, h, d_head, d_ff, max_seq_length


class Util:

    def sinusoidal(self):
        """Build the sinusoidal positional-encoding table from
        "Attention Is All You Need":

            PE[pos, 2i]     = sin(pos / 10000^(2i / d_model))
            PE[pos, 2i + 1] = cos(pos / 10000^(2i / d_model))
        """
        PE = np.zeros((max_seq_length, d_model))

        for pos in range(max_seq_length):
            for i in range(0, d_model, 2):
                # i already steps by 2, so it plays the role of 2i above.
                div_term = 10000 ** (i / d_model)
                PE[pos, i] = np.sin(pos / div_term)
                if i + 1 < d_model:  # guard for odd d_model
                    PE[pos, i + 1] = np.cos(pos / div_term)

        return PE
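

# A vectorized equivalent of Util.sinusoidal (a sketch, not part of the
# original module; the function name is hypothetical). It builds the same
# table without the Python loops, assuming d_model is even.
def sinusoidal_vectorized():
    pos = np.arange(max_seq_length)[:, None]  # (max_seq_length, 1)
    idx = np.arange(0, d_model, 2)[None, :]   # (1, d_model // 2)
    angles = pos / 10000 ** (idx / d_model)   # broadcasts to (max_seq_length, d_model // 2)
    PE = np.zeros((max_seq_length, d_model))
    PE[:, 0::2] = np.sin(angles)
    PE[:, 1::2] = np.cos(angles)
    return PE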


class Transformer(nn.Module):

    def __init__(self):
        super().__init__()
        self.blocks = nn.ModuleList([TransformerBlock() for _ in range(num_blocks)])
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
        # Precompute the positional-encoding table once and register it as a
        # buffer so it follows the module across .to(device) / dtype moves,
        # instead of re-tensorizing a numpy array on every forward pass.
        util = Util()
        self.register_buffer('positionals', torch.from_numpy(util.sinusoidal()).float())
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, X):
        # X holds token indices, shape (seq_len,) or (batch, seq_len).
        embeddings = self.embeddings(X)
        # Slice by the last axis of X so the sequence length is read correctly
        # for both batched and unbatched input.
        embeddings = embeddings + self.positionals[:X.shape[-1]]

        for block in self.blocks:
            embeddings = block(embeddings)

        # Project back to vocabulary logits.
        return self.linear(embeddings)


class TransformerBlock(nn.Module):

    def __init__(self):
        super().__init__()
        self.attentionblock = AttentionBlock()
        self.layernorm = LayerNorm()
        self.ffn = FFN()
        self.layernorm2 = LayerNorm()

    def forward(self, X):
        # Post-LN residual connections, as in the original Transformer paper:
        # add each sublayer's output to its input, then normalize.
        X = self.layernorm(X + self.attentionblock(X))
        X = self.layernorm2(X + self.ffn(X))
        return X


class AttentionBlock(nn.Module):

    def __init__(self):
        super().__init__()
        self.attentionheads = nn.ModuleList([AttentionHead() for _ in range(h)])
        self.Wo = nn.Linear(d_model, d_model)

    def forward(self, X):
        # Run the h heads independently, concatenate along the feature axis,
        # then mix them with the output projection Wo.
        headoutputs = [head(X) for head in self.attentionheads]
        MHA = torch.cat(headoutputs, dim=-1)
        return self.Wo(MHA)
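

# Sanity check (added here as an assumption, not part of the original module):
# the concatenation in AttentionBlock yields h * d_head features, so Wo's
# d_model input dimension only lines up when the config satisfies this.
assert h * d_head == d_model, "config must satisfy h * d_head == d_model"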


class AttentionHead(nn.Module):

    def __init__(self):
        super().__init__()
        # Q/K/V projections without biases, as in the original paper.
        self.queries = nn.Linear(d_model, d_head, bias=False)
        self.keys = nn.Linear(d_model, d_head, bias=False)
        self.values = nn.Linear(d_model, d_head, bias=False)

    def forward(self, X):
        Q = self.queries(X)
        K = self.keys(X)
        V = self.values(X)

        # Scaled dot-product attention: softmax(Q K^T / sqrt(d_head)) V.
        # transpose(-2, -1) rather than .T keeps this correct for batched input.
        scores = Q @ K.transpose(-2, -1)
        scores = scores / (d_head ** 0.5)
        # Causal mask: positions may not attend to later positions, so future
        # entries are set to -inf before the softmax. The mask is created on
        # X's device so the module also works on GPU.
        seq_len = X.shape[-2]
        mask = torch.tril(torch.ones(seq_len, seq_len, device=X.device))
        scores = scores.masked_fill(mask == 0, float('-inf'))
        attention = torch.softmax(scores, dim=-1)
        return attention @ V


class LayerNorm(nn.Module):

    def __init__(self):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)

    def forward(self, X):
        return self.norm(X)


class FFN(nn.Module):

    def __init__(self):
        super().__init__()
        # Position-wise feed-forward network: expand to d_ff, apply ReLU,
        # project back down to d_model.
        self.net = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )

    def forward(self, X):
        return self.net(X)
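

# Usage sketch (an assumption, not part of the original module): runs a quick
# shape check with random token indices, assuming max_seq_length >= 16. The
# batch and sequence sizes are illustrative.
if __name__ == "__main__":
    model = Transformer()
    tokens = torch.randint(0, vocab_size, (2, 16))  # (batch=2, seq_len=16)
    logits = model(tokens)
    print(logits.shape)  # expected: (2, 16, vocab_size)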