import numpy as np
from spacy.lang.en import English
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

import torch
import torch.nn.functional as F

from Dataset import SkimlitDataset

# nltk.download("stopwords")
# STOPWORDS = stopwords.words("english")
# porter = PorterStemmer()

def download_stopwords():
    """Download NLTK stopwords and return them along with a Porter stemmer."""
    nltk.download("stopwords")
    STOPWORDS = stopwords.words("english")
    porter = PorterStemmer()
    return STOPWORDS, porter
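
# Note: make_skimlit_predictions() below calls download_stopwords() on every
# invocation; for repeated predictions it can be called once up front and the
# results reused:
#   STOPWORDS, porter = download_stopwords()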

def preprocess(text, stopwords):
    """Conditional preprocessing on our text unique to our task."""
    # Lower
    text = text.lower()

    # Remove stopwords
    pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
    text = pattern.sub("", text)

    # Remove words in parentheses
    text = re.sub(r"\([^)]*\)", "", text)

    # Spacing and filters
    text = re.sub(r"([-;;.,!?<=>])", r" \1 ", text)
    text = re.sub("[^A-Za-z0-9]+", " ", text) # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()

    return text
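
# Example (hypothetical input), assuming stopwords have already been downloaded:
#   STOPWORDS, porter = download_stopwords()
#   preprocess("The RCT enrolled 120 patients (n=120).", STOPWORDS)
#   -> 'rct enrolled 120 patients'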

def spacy_function(abstract):
  """Split an abstract into individual sentences using spaCy's sentencizer."""
  # setup English sentence parser
  nlp = English()

  # add the rule-based sentence-splitting component to the pipeline
  nlp.add_pipe("sentencizer")

  # create "doc" of parsed sentences from the abstract
  doc = nlp(abstract)

  # return detected sentences from doc as strings (not spaCy token type)
  abstract_lines = [str(sent) for sent in doc.sents]

  return abstract_lines
    
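# Example (hypothetical abstract), illustrating the expected output:
#   spacy_function("We studied X. Results were Y.")
#   -> ['We studied X.', 'Results were Y.']
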
# ---------------------------------------------------------------------------------------------------------------------------

def model_prediction(model, dataloader):
  """Prediction step."""
  # Set model to eval mode
  model.eval()
  y_probs = []
  # Iterate over batches without tracking gradients
  with torch.inference_mode():
    for batch in dataloader:
      # Forward pass w/ inputs
      # batch = [item.to(device) for item in batch]  # optionally move tensors to a device
      inputs = batch
      z = model(inputs)
      # Store softmax probabilities
      y_prob = F.softmax(z, dim=1).cpu().numpy()
      y_probs.extend(y_prob)
  return np.vstack(y_probs)
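
# model_prediction returns an (n_lines, n_classes) array of class
# probabilities; row-wise argmax (as in make_skimlit_predictions below)
# recovers the predicted label index for each line.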

# ---------------------------------------------------------------------------------------------------------------------------

def make_skimlit_predictions(text, model, tokenizer, label_encoder): # embedding path
  # getting all lines separated from the abstract
  abstract_lines = spacy_function(text)
    
  # Get total number of lines
  total_lines_in_sample = len(abstract_lines)

  # Go through each line in abstract and create a list of dictionaries containing features for each line
  sample_lines = []
  for i, line in enumerate(abstract_lines):
    sample_dict = {}
    sample_dict["text"] = str(line)
    sample_dict["line_number"] = i
    sample_dict["total_lines"] = total_lines_in_sample - 1
    sample_lines.append(sample_dict)

  # converting sample line list into a pandas DataFrame
  df = pd.DataFrame(sample_lines)
  
  # getting stopwords
  STOPWORDS, porter = download_stopwords()

  # applying preprocessing function to lines
  df.text = df.text.apply(lambda x: preprocess(x, STOPWORDS))

  # converting texts into numerical sequences
  text_seq = tokenizer.texts_to_sequences(texts=df['text'])

  # creating Dataset
  dataset = SkimlitDataset(text_seq=text_seq, line_num=df['line_number'], total_line=df['total_lines'])

  # creating dataloader
  dataloader = dataset.create_dataloader(batch_size=2)

  # Preparing embeddings
#   embedding_matrix = get_embeddings(embeding_path, tokenizer, 300)

  # creating model
#   model = SkimlitModel(embedding_dim=300, vocab_size=len(tokenizer), hidden_dim=128, n_layers=3, linear_output=128, num_classes=len(label_encoder), pretrained_embeddings=embedding_matrix)

  # loading model weights
#   model.load_state_dict(torch.load('/content/drive/MyDrive/Datasets/SkimLit/skimlit-pytorch-1/skimlit-model-final-1.pt', map_location='cpu'))

  # setting the model to evaluation mode (also done inside model_prediction)
  model.eval()

  # getting predictions 
  y_pred = model_prediction(model, dataloader)

  # converting predictions into label class
  pred = y_pred.argmax(axis=1)
  pred = label_encoder.decode(pred)

  return abstract_lines, pred
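
# ---------------------------------------------------------------------------------------------------------------------------

# Minimal usage sketch (hypothetical objects): `model`, `tokenizer`, and
# `label_encoder` must be built/loaded elsewhere, e.g. via the commented-out
# SkimlitModel code above; the abstract below is a placeholder.
#
# if __name__ == "__main__":
#     abstract = "Background sentence. Methods sentence. Results sentence."
#     lines, labels = make_skimlit_predictions(abstract, model, tokenizer, label_encoder)
#     for line, label in zip(lines, labels):
#         print(f"{label}: {line}")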