|
import numpy as np |
|
from spacy.lang.en import English |
|
import pandas as pd |
|
|
|
import nltk |
|
from nltk.corpus import stopwords |
|
from nltk.stem import PorterStemmer |
|
import re |
|
|
|
import torch |
|
import torch.nn.functional as F |
|
|
|
from Dataset import CustomDataSetManager |
|
|
|
|
|
|
|
|
|
|
|
def download_stopwords():
    """Return the NLTK English stopword list and a Porter stemmer.

    The ``stopwords`` corpus is downloaded only when looking it up fails,
    so repeated calls do not trigger a download attempt once the corpus is
    available locally.

    Returns:
        tuple: ``(STOPWORDS, porter)`` where ``STOPWORDS`` is the result of
        ``stopwords.words("english")`` and ``porter`` is a new
        ``PorterStemmer`` instance.
    """
    try:
        english_stopwords = stopwords.words("english")
    except LookupError:
        # The corpus is not installed yet: fetch it once, then retry.
        nltk.download("stopwords")
        english_stopwords = stopwords.words("english")

    return english_stopwords, PorterStemmer()
|
|
|
def preprocess(text, stopwords):
    """Normalise one piece of text before tokenisation.

    Steps, in order: lowercase the text, remove stopwords (whole words,
    together with the whitespace that follows them), remove parenthesised
    spans, collapse every run of characters outside ``[A-Za-z0-9]`` into a
    single space, and strip leading/trailing whitespace.

    Args:
        text: Raw text to clean.
        stopwords: Iterable of words to remove. They are matched literally
            and case-sensitively against the lowercased text. May be empty.
            Inside this function the name shadows the module-level
            ``nltk.corpus.stopwords`` import.

    Returns:
        str: The cleaned text.
    """
    text = text.lower()

    # Escape every word so regex metacharacters are matched literally, and
    # skip empty entries. An empty alternation would match at every word
    # boundary and the trailing ``\s*`` would then eat the spaces between
    # words, gluing them together.
    escaped_words = [re.escape(word) for word in stopwords if word]
    if escaped_words:
        pattern = re.compile(r"\b(?:" + "|".join(escaped_words) + r")\b\s*")
        text = pattern.sub("", text)

    # Drop parenthesised asides, e.g. "(n = 42)".
    text = re.sub(r"\([^)]*\)", "", text)

    # One pass is enough here: replacing each maximal run of
    # non-alphanumeric characters with a single space removes punctuation
    # and can never leave two consecutive spaces behind.
    text = re.sub(r"[^A-Za-z0-9]+", " ", text)

    return text.strip()
|
|
|
def spacy_function(abstract):
    """Split an abstract into sentences using spaCy's sentencizer.

    Args:
        abstract: The text to split.

    Returns:
        list[str]: One string per sentence found by the pipeline, in
        document order.
    """
    # A blank English pipeline with only the "sentencizer" component added;
    # no other components are needed to populate ``doc.sents``.
    nlp = English()
    nlp.add_pipe("sentencizer")

    doc = nlp(abstract)

    return [str(sent) for sent in doc.sents]
|
|
|
|
|
|
|
def model_prediction(model, dataloader):
    """Run the model over every batch and return softmax probabilities.

    Args:
        model: Module called as ``model(batch)``. Its output is passed to
            ``softmax`` over dimension 1, so it is assumed to return
            logits with the classes on that dimension.
        dataloader: Iterable of batches; each batch is handed to the model
            unchanged.

    Returns:
        numpy.ndarray: The per-batch probabilities stacked along axis 0.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    y_probs = []

    # Inference only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for batch in dataloader:
            z = model(batch)
            y_probs.extend(F.softmax(z, dim=1).cpu().numpy())

    if not y_probs:
        raise ValueError("dataloader yielded no batches; nothing to predict")

    return np.vstack(y_probs)
|
|
|
|
|
|
|
def make_predictions(text, model, tokenizer, label_encoder, *, batch_size=2):
    """Predict a label for every sentence of an abstract.

    The text is split into sentences, each sentence is cleaned with
    ``preprocess`` and converted to a token sequence, and the sequences
    (together with each sentence's position) are fed to the model.

    Args:
        text: The raw abstract.
        model: Model passed to ``model_prediction`` (which switches it to
            eval mode before predicting).
        tokenizer: Object providing ``texts_to_sequences(texts=...)``.
        label_encoder: Object providing ``decode(indices)``.
        batch_size: Batch size handed to the dataloader. Defaults to 2.

    Returns:
        tuple: ``(abstract_lines, pred)`` where ``abstract_lines`` is the
        list of original sentence strings and ``pred`` is whatever
        ``label_encoder.decode`` returns for the arg-max class indices.
        When no sentences are found, returns ``(abstract_lines, [])``.
    """
    abstract_lines = spacy_function(text)

    # Nothing to classify. Without this guard the DataFrame below would
    # have no columns and accessing its "text" column would fail.
    # NOTE(review): an empty list stands in for the decoded labels here;
    # confirm it matches the container type ``label_encoder.decode`` returns.
    if not abstract_lines:
        return abstract_lines, []

    # "total_lines" holds the index of the last line (count - 1), not the
    # number of lines.
    last_line_index = len(abstract_lines) - 1
    df = pd.DataFrame(
        [
            {
                "text": str(line),
                "line_number": line_number,
                "total_lines": last_line_index,
            }
            for line_number, line in enumerate(abstract_lines)
        ]
    )

    # Only the stopword list is needed; the stemmer is not used here.
    STOPWORDS, _ = download_stopwords()
    df["text"] = df["text"].apply(lambda line: preprocess(line, STOPWORDS))

    text_seq = tokenizer.texts_to_sequences(texts=df["text"])

    dataset = CustomDataSetManager(
        text_seq=text_seq,
        line_num=df["line_number"],
        total_line=df["total_lines"],
    )
    dataloader = dataset.create_dataloader(batch_size=batch_size)

    y_pred = model_prediction(model, dataloader)

    # Most probable class per sentence, mapped back to label names.
    pred = label_encoder.decode(y_pred.argmax(axis=1))

    return abstract_lines, pred