awacke1 committed on
Commit
6c856d3
1 Parent(s): dfcb9be

Create new file

Browse files
Files changed (1) hide show
  1. source/pipeline.py +127 -0
source/pipeline.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ import torch
4
+ from datasets import Dataset
5
+ from torch.utils.data import DataLoader
6
+ from tqdm import tqdm
7
+ from transformers import PerceiverTokenizer
8
+
9
+
10
+ def _map_outputs(predictions):
11
+ """
12
+ Map model outputs to classes.
13
+ :param predictions: model ouptut batch
14
+ :return:
15
+ """
16
+
17
+ labels = [
18
+ "admiration",
19
+ "amusement",
20
+ "anger",
21
+ "annoyance",
22
+ "approval",
23
+ "caring",
24
+ "confusion",
25
+ "curiosity",
26
+ "desire",
27
+ "disappointment",
28
+ "disapproval",
29
+ "disgust",
30
+ "embarrassment",
31
+ "excitement",
32
+ "fear",
33
+ "gratitude",
34
+ "grief",
35
+ "joy",
36
+ "love",
37
+ "nervousness",
38
+ "optimism",
39
+ "pride",
40
+ "realization",
41
+ "relief",
42
+ "remorse",
43
+ "sadness",
44
+ "surprise",
45
+ "neutral"
46
+ ]
47
+ classes = []
48
+ for i, example in enumerate(predictions):
49
+ out_batch = []
50
+ for j, category in enumerate(example):
51
+ out_batch.append(labels[j]) if category > 0.5 else None
52
+ classes.append(out_batch)
53
+ return classes
54
+
55
+
56
class MultiLabelPipeline:
    """
    Multi label classification pipeline.

    Loads a serialized model together with the ``deepmind/language-perceiver``
    tokenizer and, when called, turns a dataset with a ``text`` column into
    one list of class names per example.
    """

    def __init__(self, model_path):
        """
        Init MLC pipeline.
        :param model_path: path of the serialized model, loaded with ``torch.load``
        """

        # Init attributes
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # NOTE(review): torch.load unpickles the file, which can execute
        # arbitrary code -- only pass model files that come from a trusted source.
        # map_location=self.device handles the GPU and the CPU-only case alike,
        # so no comparison between a torch.device and a string is needed.
        self.model = torch.load(model_path, map_location=self.device).eval().to(self.device)
        self.tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')

    def __call__(self, dataset, batch_size: int = 4):
        """
        Processing pipeline.
        :param dataset: dataset exposing a ``text`` column
        :param batch_size: number of examples sent through the model at once
        :return: list holding, for each example, the list of predicted class names
        """

        # Tokenize inputs
        dataset = dataset.map(lambda row: self.tokenizer(row['text'], padding="max_length", truncation=True),
                              batched=True, remove_columns=['text'], desc='Tokenizing')
        dataset.set_format('torch', columns=['input_ids', 'attention_mask'])
        dataloader = DataLoader(dataset, batch_size=batch_size)

        # Define output classes
        classes = []
        mem_logs = []

        # Inference only: no_grad stops autograd from recording the forward
        # pass, so no graph is kept alive across batches.
        with torch.no_grad(), tqdm(dataloader, unit='batches') as progression:
            # The description never changes, so set it once up front.
            progression.set_description('Inference')
            for batch in progression:
                # Forward
                outputs = self.model(inputs=batch['input_ids'].to(self.device),
                                     attention_mask=batch['attention_mask'].to(self.device), )

                # Outputs
                predictions = outputs.logits.detach().cpu().numpy()

                # Map predictions to classes
                classes.extend(_map_outputs(predictions))

                # Retrieve memory usage (bytes reported by torch, scaled by 1e9)
                memory = round(torch.cuda.memory_reserved(self.device) / 1e9, 2)
                mem_logs.append(memory)

                # Update pbar with the running mean over the batches seen so far
                progression.set_postfix(memory=f"{round(sum(mem_logs) / len(mem_logs), 2)}Go")

        return classes
116
+
117
+
118
def inputs_to_dataset(inputs: List[str]) -> "Dataset":
    """
    Convert a list of strings to a dataset object.
    :param inputs: list of strings
    :return: dataset built from a single ``text`` column holding the strings
    """

    # list(...) makes the same shallow copy the original comprehension did,
    # without shadowing the ``input`` builtin or rebinding the parameter.
    return Dataset.from_dict({'text': list(inputs)})