# skripsi-jtik-pnj / inference_config.py
# Repository page metadata (not code): Bintang Fajar Julio · init · 428a607 · raw · history blame · 5.62 kB
import torch
import emoji
import re
import pickle
import torch.nn as nn
import torch.nn.functional as F
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
class BERT_CNN(nn.Module):
    """Text classifier that runs parallel 2-D convolutions over stacked encoder hidden states.

    The last ``num_bert_states`` hidden states returned by the pretrained
    encoder are stacked along dim 1 and treated as the input channels of one
    ``nn.Conv2d`` per entry of ``window_sizes``.  Each feature map is passed
    through ReLU and max-pooled, the pooled features of all convolutions are
    concatenated, and a linear layer maps them to ``len(labels)`` logits.

    Args:
        labels: Sized collection of class labels; only its length is used here.
        pretrained_bert: Model name or path handed to ``AutoModel.from_pretrained``.
        window_sizes: Kernel heights, one convolution branch per entry.
        in_channels: Input channels of every convolution.  It has to match the
            number of stacked hidden states, which is ``num_bert_states`` in
            the normal case.
        out_channels: Output channels of every convolution.
        dropout: Dropout probability applied to the concatenated features.
        num_bert_states: How many trailing hidden states are stacked in ``forward``.
    """

    def __init__(self, labels, pretrained_bert, window_sizes=(1, 2, 3, 4, 5), in_channels=4,
                 out_channels=32, dropout=0.1, num_bert_states=4):
        super().__init__()
        self.pretrained_bert = AutoModel.from_pretrained(
            pretrained_bert, output_attentions=False, output_hidden_states=True
        )

        # The kernel has to span the full width of the hidden states that are
        # convolved, i.e. the encoder's hidden size.  For BERT this equals the
        # word-embedding width, but it is not the same for every architecture.
        hidden_size = self.pretrained_bert.config.hidden_size

        # Attribute names below are part of the state_dict keys of saved
        # checkpoints, so they must not be renamed.
        self.cnn = nn.ModuleList([
            nn.Conv2d(in_channels, out_channels, (window_size, hidden_size))
            for window_size in window_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.num_bert_states = num_bert_states
        self.output_layer = nn.Linear(len(window_sizes) * out_channels, len(labels))

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class scores for a batch of token ids.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            Tensor of logits with one column per label
            (presumably shaped ``(batch, len(labels))``).
        """
        bert_output = self.pretrained_bert(input_ids=input_ids, attention_mask=attention_mask)

        # Stack the trailing hidden states on a new dim 1 so they act as the
        # convolution's input channels.
        stacked_hidden_states = torch.stack(
            bert_output.hidden_states[-self.num_bert_states:], dim=1
        )

        pooled_per_window = []
        for conv in self.cnn:
            # The kernel covers the whole last dimension, leaving it with
            # size 1; drop it before pooling.
            feature_map = F.relu(conv(stacked_hidden_states).squeeze(3))
            # Global max pooling over the remaining positional dimension.
            pooled = F.max_pool1d(feature_map, feature_map.size(2)).squeeze(2)
            pooled_per_window.append(pooled)

        concatenated = torch.cat(pooled_per_window, dim=1)
        return self.output_layer(self.dropout(concatenated))
class Inference():
    """Load the trained classifier and TF-IDF artefacts and serve predictions.

    The instance offers three steps that callers chain themselves:
    ``text_processing`` (cleaning), ``classification`` (BERT_CNN label
    probabilities) and ``content_based_filtering`` (TF-IDF cosine-similarity
    recommendations).

    Args:
        max_length: Sequence length used for padding/truncation when tokenizing.
        pretrained_bert: Model name or path for both the tokenizer and the encoder.
        classifier_path: Path of the saved ``BERT_CNN`` state dict.
        tfidf_path: Path of the pickled dict holding the ``'vectorizer'``,
            ``'tfidf_matrix'`` and ``'attribut'`` entries.
    """

    def __init__(self, max_length=360, pretrained_bert="indolem/indobert-base-uncased",
                 classifier_path="checkpoint/pretrained_classifier.pt",
                 tfidf_path='checkpoint/pretrained_tfidf.pkl'):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.labels = ['Jaringan & IoT', 'Multimedia & Teknologi: AI Game', 'Rekayasa Perangkat Lunak', 'Sistem Cerdas']
        self.stop_words = StopWordRemoverFactory().get_stop_words()
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_bert, use_fast=False)
        self.stemmer = StemmerFactory().create_stemmer()
        self.max_length = max_length

        self.model = BERT_CNN(labels=self.labels, pretrained_bert=pretrained_bert)
        # NOTE(security): torch.load unpickles the file unless weights-only
        # loading is in effect; only point this at checkpoints you trust.
        checkpoint = torch.load(classifier_path, map_location=self.device)
        self.model.load_state_dict(checkpoint)
        self.model.to(self.device)
        # This object is only used for inference, so disable dropout right away.
        self.model.eval()

        # NOTE(security): pickle.load can execute arbitrary code; the TF-IDF
        # file must come from a trusted source.
        with open(tfidf_path, 'rb') as f:
            tfidf_data = pickle.load(f)
        self.vectorizer = tfidf_data['vectorizer']
        self.tfidf_matrix = tfidf_data['tfidf_matrix']
        self.attribut = tfidf_data['attribut']

    def text_processing(self, abstrak, kata_kunci):
        """Merge keywords and abstract into one cleaned, stemmed string.

        The cleaning steps and their order are kept exactly as they were,
        because the saved classifier and TF-IDF artefacts presumably rely on
        the same preprocessing (TODO confirm against the training code).

        Args:
            abstrak: Abstract text; converted with ``str()``.
            kata_kunci: Keywords text; converted with ``str()``.

        Returns:
            Lower-cased text containing only ``a-z`` letters and spaces, with
            stop words removed and the stemmer applied.
        """
        text = str(kata_kunci) + " - " + str(abstrak)
        text = text.lower()
        text = emoji.replace_emoji(text, replace='')
        text = re.sub(r'\n', ' ', text)
        # URLs have to go before the character filter, which would otherwise
        # leave their letters behind as fake words.
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'[^a-zA-Z ]', '', text)

        # Build the set per call so later changes to self.stop_words are
        # honoured while each membership test stays O(1).
        stop_words = set(self.stop_words)
        text = ' '.join(word for word in text.split() if word not in stop_words)

        text = self.stemmer.stem(text)
        return text.strip()

    def bert_tokenizer(self, text):
        """Tokenize ``text`` into fixed-length tensors.

        Args:
            text: The (already cleaned) input string.

        Returns:
            Tuple ``(input_ids, attention_mask)`` of PyTorch tensors, padded
            or truncated to ``self.max_length``.
        """
        # Calling the tokenizer directly is the supported replacement for the
        # deprecated encode_plus method.
        token = self.tokenizer(
            text=text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_tensors='pt',
            padding="max_length",
            truncation=True
        )
        return token['input_ids'], token['attention_mask']

    def classification(self, data):
        """Classify one text and report the probability of every label.

        Args:
            data: The input string handed to ``bert_tokenizer``.

        Returns:
            Tuple ``(probs, kbk)``: ``probs`` maps each label to its softmax
            probability in percent, rounded to two decimals; ``kbk`` is the
            label with the highest score.
        """
        input_ids, attention_mask = self.bert_tokenizer(data)

        # Kept here as well in case a caller switched the model to train mode.
        self.model.eval()
        with torch.no_grad():
            preds = self.model(
                input_ids=input_ids.to(self.device),
                attention_mask=attention_mask.to(self.device),
            )
            result = torch.softmax(preds, dim=1)[0]

        probs = {
            label: round(prob * 100, 2)
            for label, prob in zip(self.labels, result.tolist())
        }

        # Turn the winning index into a plain int before indexing the list,
        # instead of relying on implicit tensor-to-index conversion.
        highest_index = int(torch.argmax(preds, dim=1)[0].item())
        kbk = self.labels[highest_index]
        return probs, kbk

    def content_based_filtering(self, data, top_k=3):
        """Recommend the stored entries most similar to ``data``.

        Args:
            data: Query string, vectorized with the loaded TF-IDF vectorizer.
            top_k: Maximum number of recommendations to return.

        Returns:
            List of dicts ordered by descending cosine similarity, each with
            the keys ``rank``, ``similarity_score`` (percent, two decimals),
            ``title``, ``abstract``, ``keywords``, ``supervisor`` and ``url``.

        Raises:
            ValueError: If ``top_k`` is negative.
        """
        if top_k < 0:
            raise ValueError("top_k must be non-negative")

        matrix = self.vectorizer.transform([data])
        similarity_scores = cosine_similarity(matrix, self.tfidf_matrix).flatten()
        # argsort is ascending; reverse it to get the best matches first.
        top_indices = similarity_scores.argsort()[::-1][:top_k]

        recommended = []
        for rank, index in enumerate(top_indices, start=1):
            attribut = self.attribut[index]
            recommended.append({
                "rank": rank,
                # Convert the NumPy scalar into a built-in float.
                "similarity_score": float(round(similarity_scores[index] * 100, 2)),
                "title": attribut['judul'],
                "abstract": attribut['abstrak'],
                "keywords": attribut['kata_kunci'],
                "supervisor": attribut['nama_pembimbing'],
                "url": attribut['url']
            })
        return recommended