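"""Inference module for a BERT-CNN research-group (KBK) classifier and a
TF-IDF content-based recommender over Indonesian thesis abstracts."""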
import pickle
import re

import emoji
import torch
import torch.nn as nn
import torch.nn.functional as F
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel


class BERT_CNN(nn.Module):
    """BERT encoder with TextCNN-style convolutions over its last hidden layers."""

    def __init__(self, labels, pretrained_bert, window_sizes=(1, 2, 3, 4, 5), in_channels=4, out_channels=32, dropout=0.1, num_bert_states=4):
        super().__init__()
        self.pretrained_bert = AutoModel.from_pretrained(pretrained_bert, output_attentions=False, output_hidden_states=True)

        # One convolution per window size; each kernel spans the full embedding
        # dimension, so it slides over the token axis only. in_channels must equal
        # num_bert_states, since the stacked hidden states form the channel axis.
        embedding_dim = self.pretrained_bert.embeddings.word_embeddings.weight.size(1)
        self.cnn = nn.ModuleList([
            nn.Conv2d(in_channels, out_channels, (window_size, embedding_dim))
            for window_size in window_sizes
        ])

        self.dropout = nn.Dropout(dropout)
        self.num_bert_states = num_bert_states
        self.output_layer = nn.Linear(len(window_sizes) * out_channels, len(labels))

    def forward(self, input_ids, attention_mask):
        bert_output = self.pretrained_bert(input_ids=input_ids, attention_mask=attention_mask)

        # Stack the last num_bert_states hidden states as channels:
        # (batch, num_bert_states, seq_len, hidden_dim)
        stacked_hidden_states = torch.stack(bert_output.hidden_states[-self.num_bert_states:], dim=1)

        # For each window size: convolve, apply ReLU, then max-pool over the token axis
        max_pooling = []
        for layer in self.cnn:
            relu_output = F.relu(layer(stacked_hidden_states).squeeze(3))  # (batch, out_channels, seq_len - window_size + 1)
            max_pooling.append(F.max_pool1d(relu_output, relu_output.size(2)).squeeze(2))  # (batch, out_channels)

        # Concatenate the pooled n-gram features and project to the label space
        concatenated = torch.cat(max_pooling, dim=1)
        preds = self.output_layer(self.dropout(concatenated))
        return preds


class Inference:
    def __init__(self, max_length=360, pretrained_bert="indolem/indobert-base-uncased"):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.labels = ['Jaringan & IoT', 'Multimedia & Teknologi: AI Game', 'Rekayasa Perangkat Lunak', 'Sistem Cerdas']
        self.stop_words = StopWordRemoverFactory().get_stop_words()
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_bert, use_fast=False)
        self.stemmer = StemmerFactory().create_stemmer()
        self.max_length = max_length

        # Restore the fine-tuned classifier weights
        self.model = BERT_CNN(labels=self.labels, pretrained_bert=pretrained_bert)
        checkpoint = torch.load("checkpoint/pretrained_classifier.pt", map_location=self.device)
        self.model.load_state_dict(checkpoint)
        self.model.to(self.device)

        # Restore the fitted TF-IDF vectorizer, document-term matrix, and thesis metadata
        with open('checkpoint/pretrained_tfidf.pkl', 'rb') as f:
            tfidf_data = pickle.load(f)
        self.vectorizer = tfidf_data['vectorizer']
        self.tfidf_matrix = tfidf_data['tfidf_matrix']
        self.attribut = tfidf_data['attribut']

    def text_processing(self, abstrak, kata_kunci):
        # Combine keywords and abstract, then normalize: lowercase and strip
        # emoji, newlines, URLs, digits, and non-alphabetic characters
        text = str(kata_kunci) + " - " + str(abstrak)
        text = text.lower()
        text = emoji.replace_emoji(text, replace='')
        text = re.sub(r'\n', ' ', text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'[^a-zA-Z ]', '', text)

        # Drop Indonesian stop words and stem with Sastrawi
        text = ' '.join(word for word in text.split() if word not in self.stop_words)
        text = self.stemmer.stem(text)
        return text.strip()

    def bert_tokenizer(self, text):
        token = self.tokenizer.encode_plus(
            text=text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_tensors='pt',
            padding="max_length",
            truncation=True
        )
        return token['input_ids'], token['attention_mask']

    def classification(self, data):
        input_ids, attention_mask = self.bert_tokenizer(data)
        self.model.eval()
        with torch.no_grad():
            preds = self.model(input_ids=input_ids.to(self.device), attention_mask=attention_mask.to(self.device))

        # Per-label probabilities (as percentages) and the top-scoring group
        result = torch.softmax(preds, dim=1)[0]
        probs = {self.labels[index]: round(prob.item() * 100, 2) for index, prob in enumerate(result)}
        kbk = self.labels[torch.argmax(preds, dim=1).item()]
        return probs, kbk

    def content_based_filtering(self, data):
        # Vectorize the query, rank the corpus by cosine similarity, and keep the top 3
        matrix = self.vectorizer.transform([data])
        similarity_scores = cosine_similarity(matrix, self.tfidf_matrix).flatten()
        top_indices = similarity_scores.argsort()[::-1][:3]

        recommended = []
        for rank, index in enumerate(top_indices, start=1):
            attribut = self.attribut[index]
            recommended.append({
                "rank": rank,
                "similarity_score": round(similarity_scores[index] * 100, 2),
                "title": attribut['judul'],
                "abstract": attribut['abstrak'],
                "keywords": attribut['kata_kunci'],
                "supervisor": attribut['nama_pembimbing'],
                "url": attribut['url'],
            })
        return recommended
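

# Example usage: a minimal sketch, assuming the checkpoint files referenced above
# exist locally. The abstract and keyword strings below are hypothetical
# placeholders for caller-supplied input.
if __name__ == "__main__":
    inference = Inference()

    # Clean and normalize the raw input once; both tasks consume the same text
    text = inference.text_processing(
        abstrak="Penelitian ini mengembangkan model klasifikasi...",
        kata_kunci="deep learning, klasifikasi teks",
    )

    # Task 1: probability per research group plus the predicted group
    probs, kbk = inference.classification(text)
    print(probs, kbk)

    # Task 2: the three most similar theses from the TF-IDF corpus
    for item in inference.content_based_filtering(text):
        print(item["rank"], item["title"], item["similarity_score"])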