Spaces:
Running
Running
File size: 1,599 Bytes
08ccc8e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import pandas as pd
import torch
from torch.utils.data import Dataset
def prepare_input(cfg, text):
    """Tokenize ``text`` and return the encoded fields as ``torch.long`` tensors.

    Args:
        cfg: Configuration object exposing a callable ``tokenizer`` and an
            ``input_max_length`` attribute.
        text: The text handed to ``cfg.tokenizer``.

    Returns:
        A dict with one ``torch.long`` tensor per field returned by the
        tokenizer (presumably ``input_ids`` and ``attention_mask`` -- confirm
        against the tokenizer in use).
    """
    # Pad and truncate to exactly ``cfg.input_max_length`` so every encoded
    # example has the same length.
    encoded = cfg.tokenizer(
        text,
        add_special_tokens=True,
        max_length=cfg.input_max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
    )

    tensors = {}
    for field, values in encoded.items():
        tensors[field] = torch.tensor(values, dtype=torch.long)
    return tensors
class ReactionT5Dataset(Dataset):
    """Dataset that tokenizes the ``"input"`` column of a DataFrame on access.

    Each item is the result of ``prepare_input`` applied to one entry of the
    ``"input"`` column, so tokenization happens lazily per lookup.
    """

    def __init__(self, cfg, df):
        """Store the configuration and the raw ``"input"`` column values.

        Args:
            cfg: Configuration object forwarded to ``prepare_input``.
            df: DataFrame providing an ``"input"`` column.
        """
        input_column = df["input"]
        self.cfg = cfg
        # Keep only the underlying values; items are addressed by position.
        self.inputs = input_column.values

    def __len__(self):
        """Return the number of stored inputs."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the tokenized form of the input at position ``idx``."""
        raw_text = self.inputs[idx]
        return prepare_input(self.cfg, raw_text)
def decode_output(output, cfg):
    """Decode generated token sequences into cleaned strings, plus scores.

    Args:
        output: Mapping with a ``"sequences"`` entry and, when
            ``cfg.num_beams > 1``, a ``"sequences_scores"`` entry that
            supports ``.tolist()``.
        cfg: Configuration object exposing ``tokenizer`` (with a ``decode``
            method) and ``num_beams``.

    Returns:
        A ``(sequences, scores)`` tuple. ``sequences`` is a list of decoded
        strings with all spaces removed and trailing ``"."`` characters
        stripped. ``scores`` is a list when ``cfg.num_beams > 1`` and
        ``None`` otherwise.
    """
    decode = cfg.tokenizer.decode

    sequences = []
    for token_ids in output["sequences"]:
        decoded = decode(token_ids, skip_special_tokens=True)
        # Drop the spaces between decoded tokens, then any trailing dots.
        compact = decoded.replace(" ", "")
        sequences.append(compact.rstrip("."))

    # Scores are only read from ``output`` when beam search is in use.
    scores = output["sequences_scores"].tolist() if cfg.num_beams > 1 else None
    return sequences, scores
def save_multiple_predictions(input_data, sequences, scores, cfg):
    """Arrange flat generation results into one DataFrame row per input.

    ``sequences`` (and ``scores``, when given) are flat lists in which each
    consecutive run of ``cfg.num_return_sequences`` entries belongs to one
    input, in the same order as the rows of ``input_data``.

    Args:
        input_data: DataFrame with an ``"input"`` column. Rows are matched to
            prediction groups by position, not by index label.
        sequences: Flat list of decoded predictions.
        scores: Flat sequence of scores parallel to ``sequences``, or
            ``None`` (or empty) when no scores are available.
        cfg: Configuration object exposing ``num_return_sequences``.

    Returns:
        A DataFrame with columns ``"input"``, ``"0th"``, ``"1th"``, ... and,
        only when scores are provided, ``"0th score"``, ``"1th score"``, ...
    """
    group_size = cfg.num_return_sequences
    # Scores are optional: ``decode_output`` yields ``None`` when beam search
    # is off, so rows and columns must both skip them in that case.
    has_scores = scores is not None and len(scores) > 0
    # Positional access keeps rows aligned with the generation order even
    # when ``input_data`` carries a non-default index.
    source_texts = input_data["input"]

    rows = []
    for start in range(0, len(sequences), group_size):
        stop = start + group_size
        row = [source_texts.iloc[start // group_size]]
        row.extend(sequences[start:stop])
        if has_scores:
            row.extend(scores[start:stop])
        rows.append(row)

    columns = ["input"] + [f"{i}th" for i in range(group_size)]
    if has_scores:
        columns += [f"{i}th score" for i in range(group_size)]

    return pd.DataFrame(rows, columns=columns)
|