import numpy as np
import torch as T
from tqdm import tqdm
from torch import nn
from torch.nn import functional as F
from components.k_lstm import K_LSTM
from components.attention import Attention
from data_utils import DatasetUtils
from diac_utils import flat2_3head, flat_2_3head
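# Two cascaded diacritization models, summarized from the code below:
# DiacritizerD2 encodes each sentence with a word-level BiLSTM over external
# word embeddings, encodes each word with a character-level BiLSTM, attends
# across words, and classifies every character into one of 15 diacritic
# classes that flat_2_3head later splits into (haraka, tanween, shadda).
# DiacritizerD3 adds an autoregressive LSTM decoder conditioned on the
# previous character's diacritic label.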
class DiacritizerD2(nn.Module):
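    """Two-level diacritizer (D2): a sentence-level BiLSTM over word
    embeddings feeds a character-level BiLSTM plus dot-product attention
    across words; a linear head predicts one of 15 diacritic classes per
    character.
    """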
    def __init__(self, config, device=None):
super(DiacritizerD2, self).__init__()
self.max_word_len = config["train"]["max-word-len"]
self.max_sent_len = config["train"]["max-sent-len"]
self.char_embed_dim = config["train"]["char-embed-dim"]
self.final_dropout_p = config["train"]["final-dropout"]
self.sent_dropout_p = config["train"]["sent-dropout"]
self.diac_dropout_p = config["train"]["diac-dropout"]
self.vertical_dropout = config['train']['vertical-dropout']
self.recurrent_dropout = config['train']['recurrent-dropout']
self.recurrent_dropout_mode = config['train'].get('recurrent-dropout-mode', 'gal_tied')
self.recurrent_activation = config['train'].get('recurrent-activation', 'sigmoid')
self.sent_lstm_units = config["train"]["sent-lstm-units"]
self.word_lstm_units = config["train"]["word-lstm-units"]
self.decoder_units = config["train"]["decoder-units"]
self.sent_lstm_layers = config["train"]["sent-lstm-layers"]
self.word_lstm_layers = config["train"]["word-lstm-layers"]
self.cell = config['train'].get('rnn-cell', 'lstm')
self.num_layers = config["train"].get("num-layers", 2)
self.RNN_Layer = K_LSTM
self.batch_first = config['train'].get('batch-first', True)
        # Honor an explicit device (the __main__ block below builds on CPU);
        # otherwise prefer CUDA when available.
        self.device = device if device is not None else ('cuda' if T.cuda.is_available() else 'cpu')
self.num_classes = 15
def build(self, wembs: T.Tensor, abjad_size: int):
self.closs = F.cross_entropy
self.bloss = F.binary_cross_entropy_with_logits
rnn_kargs = dict(
recurrent_dropout_mode=self.recurrent_dropout_mode,
recurrent_activation=self.recurrent_activation,
)
self.sent_lstm = self.RNN_Layer(
input_size=300,
hidden_size=self.sent_lstm_units,
num_layers=self.sent_lstm_layers,
bidirectional=True,
vertical_dropout=self.vertical_dropout,
recurrent_dropout=self.recurrent_dropout,
batch_first=self.batch_first,
**rnn_kargs,
)
self.word_lstm = self.RNN_Layer(
input_size=self.sent_lstm_units * 2 + self.char_embed_dim,
hidden_size=self.word_lstm_units,
num_layers=self.word_lstm_layers,
bidirectional=True,
vertical_dropout=self.vertical_dropout,
recurrent_dropout=self.recurrent_dropout,
batch_first=self.batch_first,
return_states=True,
**rnn_kargs,
)
self.char_embs = nn.Embedding(
abjad_size,
self.char_embed_dim,
padding_idx=0,
)
self.attention = Attention(
kind="dot",
query_dim=self.word_lstm_units * 2,
input_dim=self.sent_lstm_units * 2,
)
        # Copy the word-embedding matrix; T.as_tensor accepts a numpy array or
        # a tensor without the T.tensor(tensor) copy warning.
        self.word_embs = T.as_tensor(wembs, dtype=T.float32).clone().to(self.device)
self.classifier = nn.Linear(self.attention.Dout + self.word_lstm_units * 2, self.num_classes)
self.dropout = nn.Dropout(self.final_dropout_p)
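    # A minimal construction sketch, hedged: the config keys are the ones read
    # in __init__, the embedding matrix must have 300 columns to match
    # sent_lstm's input_size, and the shapes below are illustrative only.
    #   model = DiacritizerD2(config, device='cpu')
    #   model.build(wembs=np.zeros((10_000, 300), dtype=np.float32), abjad_size=50)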
def forward(self, sents, words, labels=None, subword_lengths=None):
#^ sents : [b ts]
#^ words : [b ts tw]
#^ labels: [b ts tw]
max_words = min(self.max_sent_len, sents.shape[1])
word_mask = words.ne(0.).float()
#^ word_mask: [b ts tw]
if self.training:
q = 1.0 - self.sent_dropout_p
            sdo = T.bernoulli(T.full(sents.shape, q, device=sents.device))
sents_do = sents * sdo.long()
#^ sents_do : [b ts] ; DO(ts)
wembs = self.word_embs[sents_do]
#^ wembs : [b ts dw] ; DO(ts)
else:
wembs = self.word_embs[sents]
#^ wembs : [b ts dw]
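        # Note on the branch above: sentence-level word dropout draws a
        # Bernoulli keep-mask (keep probability 1 - sent_dropout_p) and zeroes
        # whole word IDs to the padding index, so during training the model
        # must sometimes diacritize a word from characters and context alone.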
sent_enc = self.sent_lstm(wembs.to(self.device))
#^ sent_enc : [b ts dwe]
sentword_do = sent_enc.unsqueeze(2)
#^ sentword_do : [b ts _ dwe]
sentword_do = self.dropout(sentword_do * word_mask.unsqueeze(-1))
#^ sentword_do : [b ts tw dwe]
word_index = words.view(-1, self.max_word_len)
#^ word_index: [b*ts tw]?
cembs = self.char_embs(word_index)
#^ cembs : [b*ts tw dc]
sentword_do = sentword_do.view(-1, self.max_word_len, self.sent_lstm_units * 2)
#^ sentword_do : [b*ts tw dwe]
char_embs = T.cat([cembs, sentword_do], dim=-1)
#^ char_embs : [b*ts tw dcw] ; dcw = dc + dwe
char_enc, _ = self.word_lstm(char_embs)
#^ char_enc: [b*ts tw dce]
char_enc_reshaped = char_enc.view(-1, max_words, self.max_word_len, self.word_lstm_units * 2)
        #^ char_enc_reshaped: [b ts tw dce]
        omit_self_mask = (1.0 - T.eye(max_words)).unsqueeze(0).to(self.device)
        attn_enc, attn_map = self.attention(char_enc_reshaped, sent_enc, word_mask.bool(), prejudice_mask=omit_self_mask)
        #^ attn_enc: [b ts tw dae]
        attn_enc = attn_enc.reshape(-1, self.max_word_len, self.attention.Dout)
        #^ attn_enc: [b*ts tw dae]
final_vec = T.cat([attn_enc, char_enc], dim=-1)
diac_out = self.classifier(self.dropout(final_vec))
        #^ diac_out: [b*ts tw 15]
        diac_out = diac_out.view(-1, max_words, self.max_word_len, self.num_classes)
        #^ diac_out: [b ts tw 15]
if not self.batch_first:
diac_out = diac_out.swapaxes(1, 0)
return diac_out
def step(self, xt, yt, mask=None):
xt[1] = xt[1].to(self.device)
xt[2] = xt[2].to(self.device)
yt = yt.to(self.device)
#^ yt: [b ts tw]
        diac = self(*xt)  # D2's forward returns a single tensor: [b ts tw 15]
        loss = self.closs(diac.reshape(-1, self.num_classes), yt.view(-1))
return loss
def predict(self, dataloader):
training = self.training
self.eval()
preds = {'haraka': [], 'shadda': [], 'tanween': []}
print("> Predicting...")
for inputs, _ in tqdm(dataloader, total=len(dataloader)):
inputs[0] = inputs[0].to(self.device)
inputs[1] = inputs[1].to(self.device)
diac = self(*inputs)
output = np.argmax(T.softmax(diac.detach(), dim=-1).cpu().numpy(), axis=-1)
#^ [b ts tw]
haraka, tanween, shadda = flat_2_3head(output)
preds['haraka'].extend(haraka)
preds['tanween'].extend(tanween)
preds['shadda'].extend(shadda)
self.train(training)
return (
np.array(preds['haraka']),
np.array(preds["tanween"]),
np.array(preds["shadda"]),
)
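    # Hedged usage sketch: `eval_loader` is assumed to yield ([sents, words,
    # labels], _) batches shaped as in forward(); the three returned arrays
    # are per-character class indices aligned with the input.
    #   haraka, tanween, shadda = model.predict(eval_loader)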
class DiacritizerD3(nn.Module):
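    """D2's encoder plus an autoregressive LSTM decoder: forward() is
    teacher-forced with the gold label of the previous character (optionally
    masked by diac-dropout), while predict_sample() decodes greedily, feeding
    back its own 8-dim label vector (6 haraka one-hot + tanween + shadda
    bits) character by character.
    """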
def __init__(self, config, device='cuda'):
super(DiacritizerD3, self).__init__()
self.max_word_len = config["train"]["max-word-len"]
self.max_sent_len = config["train"]["max-sent-len"]
self.char_embed_dim = config["train"]["char-embed-dim"]
self.sent_dropout_p = config["train"]["sent-dropout"]
self.diac_dropout_p = config["train"]["diac-dropout"]
self.vertical_dropout = config['train']['vertical-dropout']
self.recurrent_dropout = config['train']['recurrent-dropout']
self.recurrent_dropout_mode = config['train'].get('recurrent-dropout-mode', 'gal_tied')
self.recurrent_activation = config['train'].get('recurrent-activation', 'sigmoid')
self.sent_lstm_units = config["train"]["sent-lstm-units"]
self.word_lstm_units = config["train"]["word-lstm-units"]
self.decoder_units = config["train"]["decoder-units"]
self.sent_lstm_layers = config["train"]["sent-lstm-layers"]
self.word_lstm_layers = config["train"]["word-lstm-layers"]
self.cell = config['train'].get('rnn-cell', 'lstm')
self.num_layers = config["train"].get("num-layers", 2)
self.RNN_Layer = K_LSTM
self.batch_first = config['train'].get('batch-first', True)
self.baseline = config["train"].get("baseline", False)
self.device = device
def build(self, wembs: T.Tensor, abjad_size: int):
self.closs = F.cross_entropy
self.bloss = F.binary_cross_entropy_with_logits
rnn_kargs = dict(
recurrent_dropout_mode=self.recurrent_dropout_mode,
recurrent_activation=self.recurrent_activation,
)
self.sent_lstm = self.RNN_Layer(
input_size=300,
hidden_size=self.sent_lstm_units,
num_layers=self.sent_lstm_layers,
bidirectional=True,
vertical_dropout=self.vertical_dropout,
recurrent_dropout=self.recurrent_dropout,
batch_first=self.batch_first,
**rnn_kargs,
)
self.word_lstm = self.RNN_Layer(
input_size=self.sent_lstm_units * 2 + self.char_embed_dim,
hidden_size=self.word_lstm_units,
num_layers=self.word_lstm_layers,
bidirectional=True,
vertical_dropout=self.vertical_dropout,
recurrent_dropout=self.recurrent_dropout,
batch_first=self.batch_first,
return_states=True,
**rnn_kargs,
)
self.char_embs = nn.Embedding(
abjad_size,
self.char_embed_dim,
padding_idx=0,
)
self.attention = Attention(
kind="dot",
query_dim=self.word_lstm_units * 2,
input_dim=self.sent_lstm_units * 2,
)
self.lstm_decoder = self.RNN_Layer(
input_size=self.word_lstm_units * 2 + self.attention.Dout + 8,
hidden_size=self.word_lstm_units * 2,
num_layers=1,
bidirectional=False,
vertical_dropout=self.vertical_dropout,
recurrent_dropout=self.recurrent_dropout,
batch_first=self.batch_first,
return_states=True,
**rnn_kargs,
)
        # Keep the matrix on CPU here; forward() moves the looked-up rows to
        # self.device. T.as_tensor avoids the T.tensor(tensor) copy warning.
        self.word_embs = T.as_tensor(wembs, dtype=T.float32).clone()
self.classifier = nn.Linear(self.lstm_decoder.hidden_size, 15)
self.dropout = nn.Dropout(0.2)
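    # Decoder input width, as wired above: character encoding
    # (word_lstm_units * 2) + attention context (attention.Dout) + the 8-dim
    # previous-label vector (6 haraka one-hot + tanween + shadda bits).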
def forward(self, sents, words, labels):
#^ sents : [b ts]
#^ words : [b ts tw]
        #^ labels: [b ts tw 8]
word_mask = words.ne(0.).float()
#^ word_mask: [b ts tw]
if self.training:
q = 1.0 - self.sent_dropout_p
            sdo = T.bernoulli(T.full(sents.shape, q, device=sents.device))
sents_do = sents * sdo.long()
#^ sents_do : [b ts] ; DO(ts)
wembs = self.word_embs[sents_do]
#^ wembs : [b ts dw] ; DO(ts)
else:
wembs = self.word_embs[sents]
#^ wembs : [b ts dw]
sent_enc = self.sent_lstm(wembs.to(self.device))
#^ sent_enc : [b ts dwe]
sentword_do = sent_enc.unsqueeze(2)
#^ sentword_do : [b ts _ dwe]
sentword_do = self.dropout(sentword_do * word_mask.unsqueeze(-1))
#^ sentword_do : [b ts tw dwe]
word_index = words.view(-1, self.max_word_len)
#^ word_index: [b*ts tw]?
cembs = self.char_embs(word_index)
#^ cembs : [b*ts tw dc]
sentword_do = sentword_do.view(-1, self.max_word_len, self.sent_lstm_units * 2)
#^ sentword_do : [b*ts tw dwe]
char_embs = T.cat([cembs, sentword_do], dim=-1)
#^ char_embs : [b*ts tw dcw] ; dcw = dc + dwe
char_enc, _ = self.word_lstm(char_embs)
#^ char_enc: [b*ts tw dce]
char_enc_reshaped = char_enc.view(-1, self.max_sent_len, self.max_word_len, self.word_lstm_units * 2)
#^ char_enc: [b ts tw dce]
omit_self_mask = (1.0 - T.eye(self.max_sent_len)).unsqueeze(0).to(self.device)
attn_enc, attn_map = self.attention(char_enc_reshaped, sent_enc, word_mask.bool(), prejudice_mask=omit_self_mask)
#^ attn_enc: [b ts tw dae]
attn_enc = attn_enc.view(-1, self.max_sent_len*self.max_word_len, self.attention.Dout)
        #^ attn_enc: [b ts*tw dae]
if self.training and self.diac_dropout_p > 0:
q = 1.0 - self.diac_dropout_p
            ddo = T.bernoulli(T.full(labels.shape[:-1], q, device=labels.device))
            labels = labels * ddo.unsqueeze(-1).long().to(self.device)
            #^ labels : [b ts tw 8] ; DO(ts)
        labels = labels.view(-1, self.max_sent_len*self.max_word_len, 8).float()
        #^ labels: [b ts*tw 8]
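        # Diac-dropout (the block above) blanks the gold label of random
        # characters during teacher forcing, presumably so the decoder cannot
        # rely on perfect previous labels, narrowing the gap with the greedy
        # feedback loop in predict_sample below.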
char_enc = char_enc.view(-1, self.max_sent_len*self.max_word_len, self.word_lstm_units * 2)
final_vec = T.cat([attn_enc, char_enc, labels], dim=-1)
        #^ final_vec: [b ts*tw dae+dce+8]
dec_out, _ = self.lstm_decoder(final_vec)
        #^ dec_out: [b ts*tw du]
dec_out = dec_out.reshape(-1, self.max_word_len, self.lstm_decoder.hidden_size)
diac_out = self.classifier(self.dropout(dec_out))
        #^ diac_out: [b*ts tw 15]
        diac_out = diac_out.view(-1, self.max_sent_len, self.max_word_len, 15)
        #^ diac_out: [b ts tw 15]
if not self.batch_first:
diac_out = diac_out.swapaxes(1, 0)
return diac_out, attn_map
def predict_sample(self, sents, words, labels):
word_mask = words.ne(0.).float()
        #^ word_mask: [b ts tw]
if self.training:
q = 1.0 - self.sent_dropout_p
            sdo = T.bernoulli(T.full(sents.shape, q, device=sents.device))
sents_do = sents * sdo.long()
#^ sents_do : [b ts] ; DO(ts)
wembs = self.word_embs[sents_do]
#^ wembs : [b ts dw] ; DO(ts)
else:
wembs = self.word_embs[sents]
#^ wembs : [b ts dw]
sent_enc = self.sent_lstm(wembs.to(self.device))
#^ sent_enc : [b ts dwe]
sentword_do = sent_enc.unsqueeze(2)
#^ sentword_do : [b ts _ dwe]
sentword_do = self.dropout(sentword_do * word_mask.unsqueeze(-1))
#^ sentword_do : [b ts tw dwe]
word_index = words.view(-1, self.max_word_len)
#^ word_index: [b*ts tw]?
cembs = self.char_embs(word_index)
#^ cembs : [b*ts tw dc]
sentword_do = sentword_do.view(-1, self.max_word_len, self.sent_lstm_units * 2)
#^ sentword_do : [b*ts tw dwe]
char_embs = T.cat([cembs, sentword_do], dim=-1)
#^ char_embs : [b*ts tw dcw] ; dcw = dc + dwe
char_enc, _ = self.word_lstm(char_embs)
#^ char_enc: [b*ts tw dce]
#^ word_states: ([b*ts dce], [b*ts dce])
char_enc = char_enc.view(-1, self.max_sent_len, self.max_word_len, self.word_lstm_units*2)
#^ char_enc: [b ts tw dce]
omit_self_mask = (1.0 - T.eye(self.max_sent_len)).unsqueeze(0).to(self.device)
attn_enc, _ = self.attention(char_enc, sent_enc, word_mask.bool(), prejudice_mask=omit_self_mask)
#^ attn_enc: [b ts tw dae]
all_out = T.zeros(*char_enc.size()[:-1], 15).to(self.device)
        #^ all_out: [b ts tw 15]
batch_sz = char_enc.size()[0]
#^ batch_sz: b
zeros = T.zeros(1, batch_sz, self.lstm_decoder.hidden_size).to(self.device)
#^ zeros: [1 b du]
bos_tag = T.tensor([0,0,0,0,0,1,0,0]).unsqueeze(0)
#^ bos_tag: [1 8]
prev_label = T.cat([bos_tag]*batch_sz).to(self.device).float()
#^ prev_label: [b 8]
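        # Greedy autoregressive decode: the decoder state is reset for every
        # word, and the 8-dim label predicted for the previous character is
        # fed back as input, starting from the BOS tag above (bit index 5 set).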
for ts in range(self.max_sent_len):
dec_hx = (zeros, zeros)
#^ dec_hx: [1 b du]
for tw in range(self.max_word_len):
final_vec = T.cat([attn_enc[:,ts,tw,:], char_enc[:,ts,tw,:], prev_label], dim=-1).unsqueeze(1)
#^ final_vec: [b 1 dce+8]
dec_out, dec_hx = self.lstm_decoder(final_vec, dec_hx)
#^ dec_out: [b 1 du]
                # Assuming batch-first output from K_LSTM, dec_out is [b 1 du];
                # dropping the time step directly replaces the old
                # squeeze(0)/transpose dance, which broke for batch size 1.
                dec_out = dec_out.squeeze(1)
                logits_raw = self.classifier(self.dropout(dec_out))
                #^ logits_raw: [b 15]
                out_idx = T.argmax(logits_raw, dim=-1)
haraka, tanween, shadda = flat2_3head(out_idx.detach().cpu().numpy())
haraka_onehot = T.eye(6)[haraka].float().to(self.device)
                #^ haraka_onehot: [b 6]
tanween = T.tensor(tanween).float().unsqueeze(-1).to(self.device)
shadda = T.tensor(shadda).float().unsqueeze(-1).to(self.device)
prev_label = T.cat([haraka_onehot, tanween, shadda], dim=-1)
                all_out[:,ts,tw,:] = logits_raw
if not self.batch_first:
all_out = all_out.swapaxes(1, 0)
return all_out
def step(self, xt, yt, mask=None):
xt[1] = xt[1].to(self.device)
xt[2] = xt[2].to(self.device)
#^ yt: [b ts tw]
yt = yt.to(self.device)
if self.training:
diac, _ = self(*xt)
else:
diac = self.predict_sample(*xt)
        #^ diac: [b ts tw 15]
        loss = self.closs(diac.reshape(-1, 15), yt.view(-1))
return loss
def predict(self, dataloader):
training = self.training
self.eval()
preds = {'haraka': [], 'shadda': [], 'tanween': []}
print("> Predicting...")
for inputs, _ in tqdm(dataloader, total=len(dataloader)):
inputs[1] = inputs[1].to(self.device)
inputs[2] = inputs[2].to(self.device)
diac = self.predict_sample(*inputs)
output = np.argmax(T.softmax(diac.detach(), dim=-1).cpu().numpy(), axis=-1)
#^ [b ts tw]
haraka, tanween, shadda = flat_2_3head(output)
preds['haraka'].extend(haraka)
preds['tanween'].extend(tanween)
preds['shadda'].extend(shadda)
self.train(training)
return (
np.array(preds['haraka']),
np.array(preds["tanween"]),
np.array(preds["shadda"]),
)
if __name__ == "__main__":
import yaml
config_path = "configs/dd/config_d2.yaml"
model_path = "models/tashkeela-d2.pt"
with open(config_path, 'r', encoding="utf-8") as file:
config = yaml.load(file, Loader=yaml.FullLoader)
data_utils = DatasetUtils(config)
vocab_size = len(data_utils.letter_list)
word_embeddings = data_utils.embeddings
model = DiacritizerD2(config, device='cpu')
model.build(word_embeddings, vocab_size)
model.load_state_dict(T.load(model_path, map_location=T.device('cpu'))["state_dict"])
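    # Hedged follow-up sketch: switch to eval mode before inference; the
    # dataloader name below is illustrative (DatasetUtils' loader API is not
    # shown in this file), but predict() is defined above.
    #   model.eval()
    #   haraka, tanween, shadda = model.predict(eval_dataloader)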