Vrk committed on
Commit 8758be0
1 Parent(s): f0c2a78

Model Predictions

Files changed (1)
  1. MakePredictions.py +138 -0
MakePredictions.py ADDED
@@ -0,0 +1,138 @@
+ import numpy as np
+ from spacy.lang.en import English
+ import pandas as pd
+
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.stem import PorterStemmer
+ import re
+
+ import torch
+ import torch.nn.functional as F
+
+ from Dataset import SkimlitDataset
+
+ # nltk.download("stopwords")
+ # STOPWORDS = stopwords.words("english")
+ # porter = PorterStemmer()
+
+ def download_stopwords():
+     """Download the NLTK stopword list and build a Porter stemmer."""
+     nltk.download("stopwords")
+     STOPWORDS = stopwords.words("english")
+     porter = PorterStemmer()
+     return STOPWORDS, porter
+
+ def preprocess(text, stopwords):
+     """Conditional preprocessing on our text unique to our task."""
+     # Lower
+     text = text.lower()
+
+     # Remove stopwords
+     pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
+     text = pattern.sub("", text)
+
+     # Remove words in parentheses
+     text = re.sub(r"\([^)]*\)", "", text)
+
+     # Spacing and filters
+     text = re.sub(r"([-;.,!?<=>])", r" \1 ", text)  # pad punctuation with spaces
+     text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non-alphanumeric chars
+     text = re.sub(" +", " ", text)  # collapse multiple spaces
+     text = text.strip()
+
+     return text
+
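+ # Illustrative only: with NLTK's English stopword list, a line such as
+ #   "RESULTS: The treatment (n=120) significantly reduced pain scores."
+ # should come out roughly as
+ #   "results treatment significantly reduced pain scores"
+ # (stopwords dropped, the parenthetical removed, punctuation stripped).
+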
+ def spacy_function(abstract):
+     """Split an abstract into sentences using spaCy's rule-based sentencizer."""
+     # setup English sentence parser
+     nlp = English()
+
+     # add sentence-splitting pipeline component to the parser
+     # (in spaCy 3.x, add_pipe takes the registered component name directly,
+     # so the older create_pipe step is no longer needed)
+     nlp.add_pipe("sentencizer")
+
+     # create "doc" of parsed sentences
+     doc = nlp(abstract)
+
+     # return detected sentences from doc as strings (not spaCy token spans)
+     abstract_lines = [str(sent) for sent in list(doc.sents)]
+
+     return abstract_lines
+
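+ # e.g. spacy_function("First sentence. Second sentence.") is expected to
+ # return ["First sentence.", "Second sentence."]
+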
+ # ---------------------------------------------------------------------------------------------------------------------------
+
+ def model_prediction(model, dataloader):
+     """Prediction step: collect class probabilities over all batches."""
+     # Set model to eval mode
+     model.eval()
+     y_probs = []
+     # Iterate over batches
+     for i, batch in enumerate(dataloader):
+         # Forward pass w/ inputs
+         # batch = [item.to(device) for item in batch]  # move to device if using a GPU
+         inputs = batch
+         z = model(inputs)
+         # Store outputs
+         y_prob = F.softmax(z, dim=1).detach().cpu().numpy()
+         y_probs.extend(y_prob)
+     return np.vstack(y_probs)
+
+ # ---------------------------------------------------------------------------------------------------------------------------
+
+ def make_skimlit_predictions(text, model, tokenizer, label_encoder):  # embedding path
+     # split the abstract into separate lines (sentences)
+     abstract_lines = spacy_function(text)
+
+     # Get total number of lines
+     total_lines_in_sample = len(abstract_lines)
+
+     # Go through each line in the abstract and create a list of dictionaries containing features for each line
+     sample_lines = []
+     for i, line in enumerate(abstract_lines):
+         sample_dict = {}
+         sample_dict["text"] = str(line)
+         sample_dict["line_number"] = i
+         sample_dict["total_lines"] = total_lines_in_sample - 1
+         sample_lines.append(sample_dict)
+
+     # convert the sample-line list into a pandas DataFrame
+     df = pd.DataFrame(sample_lines)
+
+     # get stopwords
+     STOPWORDS, porter = download_stopwords()
+
+     # apply the preprocessing function to each line
+     df.text = df.text.apply(lambda x: preprocess(x, STOPWORDS))
+
+     # convert texts into numerical sequences
+     text_seq = tokenizer.texts_to_sequences(texts=df['text'])
+
+     # create Dataset
+     dataset = SkimlitDataset(text_seq=text_seq, line_num=df['line_number'], total_line=df['total_lines'])
+
+     # create dataloader
+     dataloader = dataset.create_dataloader(batch_size=2)
+
+     # Prepare embeddings
+     # embedding_matrix = get_embeddings(embeding_path, tokenizer, 300)
+
+     # create model
+     # model = SkimlitModel(embedding_dim=300, vocab_size=len(tokenizer), hidden_dim=128, n_layers=3, linear_output=128, num_classes=len(label_encoder), pretrained_embeddings=embedding_matrix)
+
+     # load model weights
+     # model.load_state_dict(torch.load('/content/drive/MyDrive/Datasets/SkimLit/skimlit-pytorch-1/skimlit-model-final-1.pt', map_location='cpu'))
+
+     # set model to evaluation mode
+     model.eval()
+
+     # get predictions
+     y_pred = model_prediction(model, dataloader)
+
+     # convert predictions into label classes
+     pred = y_pred.argmax(axis=1)
+     pred = label_encoder.decode(pred)
+
+     return abstract_lines, pred
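
A minimal usage sketch, not part of this commit: it assumes the caller has already loaded a trained model, a Keras-style tokenizer exposing texts_to_sequences, and a label encoder exposing decode, which is all the code above requires of them.

# hypothetical driver script; model, tokenizer, and label_encoder below are
# placeholders the caller must construct/load -- this commit ships no loaders
from MakePredictions import make_skimlit_predictions

abstract = (
    "This study examined the effect of exercise on sleep quality. "
    "Forty adults were randomized to exercise or control groups. "
    "Sleep quality improved in the exercise group."
)

abstract_lines, pred = make_skimlit_predictions(abstract, model, tokenizer, label_encoder)

# print each sentence with its predicted section label
for label, line in zip(pred, abstract_lines):
    print(f"{label}: {line}")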