# SkimLit / MakePredictions.py
# Model Predictions (commit 8758be0, uploaded by Vrk, 4.48 kB)
import numpy as np
from spacy.lang.en import English
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import torch
import torch.nn.functional as F
from Dataset import SkimlitDataset
# nltk.download("stopwords")
# STOPWORDS = stopwords.words("english")
# porter = PorterStemmer()
def download_stopwords():
    """Return the English stopword list and a Porter stemmer.

    The NLTK "stopwords" corpus is fetched only when it cannot be loaded
    locally, so repeated calls do not attempt a download every time.

    Returns:
        tuple: ``(stop_words, porter)`` where ``stop_words`` is the result of
        ``stopwords.words("english")`` and ``porter`` is a new
        ``PorterStemmer`` instance.
    """
    try:
        stop_words = stopwords.words("english")
    except LookupError:
        # NLTK raises LookupError when the corpus is not installed:
        # download it once, then retry the lookup.
        nltk.download("stopwords")
        stop_words = stopwords.words("english")
    return stop_words, PorterStemmer()
def preprocess(text, stopwords):
    """Normalize one line of text for the model.

    Steps, in order: lowercase the text, remove stopwords, remove
    parenthesised spans, replace every run of non-alphanumeric characters
    with a single space, and strip leading/trailing whitespace.

    Args:
        text: Raw text to clean.
        stopwords: Iterable of words to remove. Each word is matched
            literally, as a whole word, against the already lowercased text.
            May be empty, in which case no words are removed.

    Returns:
        str: The cleaned text.
    """
    text = text.lower()

    # Escape every word so regex metacharacters are matched literally, and
    # drop empty entries: an empty alternative would match at each word
    # boundary and let the trailing ``\s*`` swallow the spaces between words.
    escaped = [re.escape(word) for word in stopwords if word]
    if escaped:
        pattern = re.compile(r"\b(" + r"|".join(escaped) + r")\b\s*")
        text = pattern.sub("", text)

    # Remove words in parentheses (together with the parentheses).
    text = re.sub(r"\([^)]*\)", "", text)

    # Collapse each run of non-alphanumeric characters (punctuation and
    # whitespace alike) into one space. This already separates punctuation
    # from words and leaves no repeated spaces, so no extra passes are needed.
    text = re.sub("[^A-Za-z0-9]+", " ", text)
    return text.strip()
def spacy_function(abstract):
    """Split an abstract into sentences using spaCy's ``sentencizer``.

    Args:
        abstract: Text of the abstract to split.

    Returns:
        list[str]: The detected sentences, in document order, as plain
        strings (not spaCy span objects).
    """
    # Blank English pipeline; only sentence splitting is required here.
    nlp = English()
    # add_pipe builds the component from its registered name and attaches it,
    # so no separate create_pipe call is needed.
    nlp.add_pipe("sentencizer")
    doc = nlp(abstract)
    return [str(sent) for sent in doc.sents]
# ---------------------------------------------------------------------------------------------------------------------------
def model_prediction(model, dataloader):
    """Run the model over every batch and return the softmax probabilities.

    Args:
        model: Model object exposing ``eval()``; it is called as
            ``model(batch)`` for each batch.
        dataloader: Iterable yielding batches in the form the model accepts.

    Returns:
        numpy.ndarray: Softmax outputs (taken over ``dim=1``) of all batches,
        stacked row-wise with ``np.vstack``.
        NOTE(review): assumes the model returns ``(batch, num_classes)``
        logits -- confirm against the model definition.

    Raises:
        ValueError: Raised by ``np.vstack`` when the dataloader yields no
            batches.
    """
    # Set model to eval mode
    model.eval()
    y_probs = []
    # Inference only: disable autograd so no computation graph is built.
    with torch.no_grad():
        for inputs in dataloader:
            # Forward pass w/ inputs
            z = model(inputs)
            # Store outputs
            y_prob = F.softmax(z, dim=1).detach().cpu().numpy()
            y_probs.extend(y_prob)
    return np.vstack(y_probs)
# ---------------------------------------------------------------------------------------------------------------------------
def make_skimlit_predictions(text, model, tokenizer, label_encoder, *, batch_size=2):
    """Split an abstract into sentences and predict a label for each one.

    Args:
        text: Abstract text to classify.
        model: Trained model, passed to ``model_prediction`` (which switches
            it to eval mode before running it).
        tokenizer: Object exposing ``texts_to_sequences(texts=...)``, used to
            turn the preprocessed sentences into numerical sequences.
        label_encoder: Object exposing ``decode(indices)``, used to turn the
            predicted class indices into labels.
        batch_size: Batch size handed to ``SkimlitDataset.create_dataloader``.
            Keyword-only; defaults to 2.

    Returns:
        tuple: ``(abstract_lines, pred)`` where ``abstract_lines`` is the list
        of sentence strings found in ``text`` and ``pred`` is the result of
        ``label_encoder.decode`` on the arg-max class of each sentence. When
        no sentence is found, ``([], [])`` is returned and the model is not
        run.
    """
    # getting all lines separated from abstract
    abstract_lines = spacy_function(text)
    if not abstract_lines:
        # Nothing to classify. Without this guard the DataFrame below would
        # have no "text" column and the preprocessing step would fail.
        return abstract_lines, []

    # "total_lines" holds the index of the last line (line count minus one).
    last_line_index = len(abstract_lines) - 1
    sample_lines = [
        {"text": str(line), "line_number": i, "total_lines": last_line_index}
        for i, line in enumerate(abstract_lines)
    ]
    # converting sample line list into pandas DataFrame
    df = pd.DataFrame(sample_lines)

    # Only the stopword list is needed here; the stemmer is not used.
    stop_words, _ = download_stopwords()
    # applying preprocessing function to lines
    df["text"] = df["text"].apply(lambda line: preprocess(line, stop_words))

    # converting texts into numerical sequences
    text_seq = tokenizer.texts_to_sequences(texts=df["text"])
    # creating Dataset and dataloader
    dataset = SkimlitDataset(
        text_seq=text_seq,
        line_num=df["line_number"],
        total_line=df["total_lines"],
    )
    dataloader = dataset.create_dataloader(batch_size=batch_size)

    # getting predictions (model_prediction puts the model in eval mode)
    y_pred = model_prediction(model, dataloader)
    # converting predictions into label class
    pred = label_encoder.decode(y_pred.argmax(axis=1))
    return abstract_lines, pred