import pandas as pd
import torch


def prepare_inputs(protein_tokenizer, term_tokenizer, aspect, protein_sequence):
    """Tokenize a protein sequence and build the decoder prompt for one aspect.

    Args:
        protein_tokenizer: tokenizer for the encoder (protein) side.
        term_tokenizer: tokenizer for the decoder (GO-term) side.
        aspect: aspect string used to seed the decoder prompt.
        protein_sequence: raw amino-acid sequence string.

    Returns:
        Tuple of (encoder inputs dict, decoder prompt dict). Decoder keys are
        prefixed with ``decoder_`` so they can be splatted into ``generate``.
    """
    inputs = protein_tokenizer(protein_sequence, return_tensors='pt')
    prompt = term_tokenizer(aspect, return_tensors='pt')
    # Drop the final token (presumably EOS — TODO confirm tokenizer behavior)
    # so generation continues from the prompt instead of stopping immediately.
    prompt = {f'decoder_{k}': v[:, :-1] for k, v in prompt.items()}
    return inputs, prompt


def run_inference(model, inputs, prompt, num_annotations):
    """Run constrained generation and return scored sequences.

    Args:
        model: a seq2seq model exposing ``generate``.
        inputs: encoder inputs from :func:`prepare_inputs`.
        prompt: ``decoder_``-prefixed prompt tensors from :func:`prepare_inputs`.
        num_annotations: currently UNUSED — kept for interface compatibility.
            NOTE(review): likely intended to drive ``num_return_sequences``;
            confirm with callers before wiring it up.

    Returns:
        The ``generate`` output object with ``sequences`` and per-step
        ``scores`` (``return_dict_in_generate=True``, ``output_scores=True``).
    """
    outputs = model.generate(
        **inputs,
        max_new_tokens=20,
        num_return_sequences=1,
        return_dict_in_generate=True,
        output_scores=True,
        renormalize_logits=True,
        **prompt,
    )
    return outputs


def aggregate_predictions(labels, scores):
    """Average scores per label and return them sorted ascending by score.

    Args:
        labels: sequence of label strings (may contain duplicates).
        scores: sequence of float scores aligned with ``labels``.

    Returns:
        DataFrame with columns ``labels`` and ``scores``; one row per unique
        label, ``scores`` holding the mean, rows sorted ascending by score.
    """
    df = pd.DataFrame({'labels': labels, 'scores': scores})
    df = df.groupby('labels', as_index=False).agg('mean')
    # BUG FIX: sort_values is not in-place; the original discarded the result,
    # so the returned frame was unsorted. Assign it back.
    df = df.sort_values(by='scores')
    return df


def post_process_outputs(model, outputs, tokenizer):
    """Decode generated tokens into labels with per-token probabilities.

    Args:
        model: the model used for generation (provides
            ``compute_transition_scores``).
        outputs: the ``generate`` result from :func:`run_inference`.
        tokenizer: decoder-side tokenizer used to decode token ids.

    Returns:
        Aggregated DataFrame from :func:`aggregate_predictions` mapping each
        decoded label to its mean probability.
    """
    transition_scores = model.compute_transition_scores(
        outputs.sequences, outputs.scores, normalize_logits=True
    )
    labels = []
    scores = []
    # Skip the first two tokens (presumably decoder start + prompt token —
    # TODO confirm against the tokenizer) and the trailing token.
    generated_tokens = outputs.sequences[:, 2:-1]
    # NOTE(review): these literals look garbled — three identical empty
    # strings. Originally likely special-token strings (e.g. pad/eos/unk).
    # Membership semantics are preserved here; verify against the tokenizer.
    special_tokens = {'', '', ''}
    num_seq, _ = generated_tokens.shape
    for i in range(num_seq):
        for tok, score in zip(generated_tokens[i], transition_scores[i]):
            tok = tokenizer.decode(tok).upper()
            score = torch.exp(score).item()
            if tok in special_tokens:
                continue
            labels.append(tok)
            scores.append(score)
    return aggregate_predictions(labels, scores)


def join_predictions_with_terms(predictions_df: pd.DataFrame, terms_df: pd.DataFrame):
    """Attach term metadata to predictions and rank best-first.

    Args:
        predictions_df: frame with ``labels`` and ``scores`` columns.
        terms_df: frame with a ``term`` column to join on, plus metadata.

    Returns:
        Merged DataFrame (left join on ``labels`` == ``term``, the redundant
        ``term`` column dropped) sorted descending by ``scores``.
    """
    df = predictions_df.merge(
        terms_df, how='left', left_on='labels', right_on='term'
    ).drop(columns=['term'])
    return df.sort_values(by='scores', ascending=False)