Spaces:
Running
Running
import pandas as pd | |
import torch | |
from torch.utils.data import Dataset | |
def prepare_input(cfg, text):
    """Tokenize ``text`` and convert every returned field to a long tensor.

    Args:
        cfg: Object exposing ``tokenizer`` (a callable) and
            ``input_max_length``.
        text: The value handed to the tokenizer as its first argument.

    Returns:
        Dict with the same keys the tokenizer returned, each value wrapped
        in a ``torch.long`` tensor.
    """
    tokenize = cfg.tokenizer
    # Pad and truncate to the same length so every encoded item has the
    # same size.
    tokenizer_options = {
        "add_special_tokens": True,
        "max_length": cfg.input_max_length,
        "padding": "max_length",
        "truncation": True,
        "return_attention_mask": True,
    }
    encoded = tokenize(text, **tokenizer_options)

    tensors = {}
    for field_name, field_values in encoded.items():
        tensors[field_name] = torch.tensor(field_values, dtype=torch.long)
    return tensors
class ReactionT5Dataset(Dataset):
    """Dataset serving tokenized entries of a DataFrame's "input" column."""

    def __init__(self, cfg, df):
        """Keep the config and the raw values of ``df["input"]``.

        Args:
            cfg: Configuration object forwarded to ``prepare_input``.
            df: DataFrame that must contain an "input" column.
        """
        self.cfg = cfg
        # Stored via ``.values`` so items are addressed by position, not by
        # the DataFrame's index labels.
        self.inputs = df["input"].values

    def __len__(self):
        """Return the number of stored inputs."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the tokenized form of the input at position ``idx``."""
        raw_text = self.inputs[idx]
        return prepare_input(self.cfg, raw_text)
def decode_output(output, cfg):
    """Decode generated sequences into cleaned strings, with optional scores.

    Each entry of ``output["sequences"]`` is decoded with ``cfg.tokenizer``
    (special tokens skipped); all spaces are then removed and trailing "."
    characters are stripped.

    Args:
        output: Mapping with a "sequences" entry and, when
            ``cfg.num_beams > 1``, a "sequences_scores" entry that has a
            ``tolist()`` method.
        cfg: Object exposing ``tokenizer`` and ``num_beams``.

    Returns:
        Tuple ``(sequences, scores)``; ``scores`` is a list when
        ``cfg.num_beams > 1`` and ``None`` otherwise.
    """
    decoded = []
    for token_ids in output["sequences"]:
        text = cfg.tokenizer.decode(token_ids, skip_special_tokens=True)
        without_spaces = text.replace(" ", "")
        decoded.append(without_spaces.rstrip("."))

    # Sequence scores are only read when more than one beam is used.
    beam_scores = output["sequences_scores"].tolist() if cfg.num_beams > 1 else None
    return decoded, beam_scores
def save_multiple_predictions(input_data, sequences, scores, cfg):
    """Arrange flat prediction lists into one DataFrame row per input.

    ``sequences`` (and ``scores``, when provided) hold
    ``cfg.num_return_sequences`` consecutive entries for each input row.

    Args:
        input_data: DataFrame with an "input" column. Rows are matched to
            prediction groups by position, not by index label.
        sequences: Flat sequence of predictions.
        scores: Flat sequence of scores aligned with ``sequences``, or
            ``None`` / empty when no scores are available.
        cfg: Object exposing ``num_return_sequences``.

    Returns:
        DataFrame with the columns "input", "0th" .. "{n-1}th" and, only
        when scores are given, "0th score" .. "{n-1}th score".
    """
    group_size = cfg.num_return_sequences
    # Scores are optional: keep the rows consistent with the header instead
    # of slicing ``None``.
    has_scores = bool(scores)
    input_column = input_data["input"]

    rows = []
    for group_idx, start in enumerate(range(0, len(sequences), group_size)):
        stop = start + group_size
        # Positional lookup: predictions are ordered like the rows, whatever
        # index labels the DataFrame carries.
        row = [input_column.iloc[group_idx]]
        row.extend(sequences[start:stop])
        if has_scores:
            row.extend(scores[start:stop])
        rows.append(row)

    columns = ["input"] + [f"{i}th" for i in range(group_size)]
    if has_scores:
        columns += [f"{i}th score" for i in range(group_size)]

    return pd.DataFrame(rows, columns=columns)