File size: 1,599 Bytes
08ccc8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import pandas as pd
import torch
from torch.utils.data import Dataset


def prepare_input(cfg, text):
    """Tokenize ``text`` and return every tokenizer field as a long tensor.

    Args:
        cfg: Configuration object. ``cfg.tokenizer`` is called on ``text`` and
            ``cfg.input_max_length`` is forwarded as ``max_length``.
        text: The value handed to the tokenizer as its first positional
            argument.

    Returns:
        A dict with one ``torch.long`` tensor per key of the mapping returned
        by the tokenizer (presumably ``input_ids`` and ``attention_mask`` for
        a Hugging Face tokenizer -- confirm against the tokenizer in use).
    """
    # Pad and truncate to the same length so that every sample yields
    # tensors of identical size.
    tokenizer_kwargs = {
        "add_special_tokens": True,
        "max_length": cfg.input_max_length,
        "padding": "max_length",
        "truncation": True,
        "return_attention_mask": True,
    }
    encoded = cfg.tokenizer(text, **tokenizer_kwargs)

    tensors = {}
    for field, values in encoded.items():
        tensors[field] = torch.tensor(values, dtype=torch.long)
    return tensors


class ReactionT5Dataset(Dataset):
    """Dataset that tokenizes the ``"input"`` column of a DataFrame on access.

    Samples are addressed by position in the column's value array, not by
    the DataFrame's index labels.
    """

    def __init__(self, cfg, df):
        """Store the configuration and the raw values of ``df["input"]``.

        Args:
            cfg: Configuration object passed to ``prepare_input`` for each
                sample.
            df: DataFrame providing an ``"input"`` column.
        """
        input_column = df["input"]
        self.cfg = cfg
        self.inputs = input_column.values

    def __len__(self):
        """Return the number of stored inputs."""
        n_samples = len(self.inputs)
        return n_samples

    def __getitem__(self, idx):
        """Return ``prepare_input`` applied to the input stored at ``idx``."""
        raw_text = self.inputs[idx]
        return prepare_input(self.cfg, raw_text)


def decode_output(output, cfg):
    """Decode generated sequences to strings and collect their scores.

    Each entry of ``output["sequences"]`` is decoded with
    ``cfg.tokenizer.decode`` (special tokens skipped); all spaces are then
    removed and any trailing ``"."`` characters are stripped.

    Args:
        output: Mapping with a ``"sequences"`` entry and, when
            ``cfg.num_beams > 1``, a ``"sequences_scores"`` entry that
            supports ``.tolist()``.
        cfg: Configuration object providing ``tokenizer`` and ``num_beams``.

    Returns:
        A ``(sequences, scores)`` tuple. ``scores`` is the list produced by
        ``output["sequences_scores"].tolist()`` when ``cfg.num_beams > 1``
        and ``None`` otherwise.
    """
    decoded = []
    for token_ids in output["sequences"]:
        text = cfg.tokenizer.decode(token_ids, skip_special_tokens=True)
        text = text.replace(" ", "")
        decoded.append(text.rstrip("."))

    # Scores are only read from the output when more than one beam is used.
    scores = output["sequences_scores"].tolist() if cfg.num_beams > 1 else None
    return decoded, scores


def save_multiple_predictions(input_data, sequences, scores, cfg):
    """Arrange grouped predictions into one DataFrame row per input.

    ``sequences`` (and ``scores`` when given) are consumed in consecutive
    groups of ``cfg.num_return_sequences``; the k-th group is paired with the
    k-th row of ``input_data``.

    Args:
        input_data: DataFrame with an ``"input"`` column, in the same row
            order as the predictions.
        sequences: Flat list of predicted strings.
        scores: Flat list of scores aligned with ``sequences``, or ``None``
            (or empty) when no scores are available, in which case the score
            columns are omitted.
        cfg: Configuration object providing ``num_return_sequences``.

    Returns:
        A DataFrame with the columns ``"input"``, ``"{i}th"`` for each
        returned sequence and, when scores are present, ``"{i}th score"``.
    """
    group_size = cfg.num_return_sequences
    # ``scores`` may legitimately be None; slicing it unconditionally would
    # raise TypeError even though the column list already allows for it.
    has_scores = scores is not None and len(scores) > 0
    # Predictions are ordered by row position, so read the inputs by position
    # rather than by index label (labels need not be 0..n-1).
    source_inputs = input_data["input"].values

    rows = []
    for start in range(0, len(sequences), group_size):
        stop = start + group_size
        row = [source_inputs[start // group_size]]
        row.extend(sequences[start:stop])
        if has_scores:
            row.extend(scores[start:stop])
        rows.append(row)

    columns = ["input"] + [f"{i}th" for i in range(group_size)]
    if has_scores:
        columns += [f"{i}th score" for i in range(group_size)]
    return pd.DataFrame(rows, columns=columns)