import json

import matplotlib.pyplot as plt
import numpy as np
import torch
from ranx import evaluate
from tqdm.auto import tqdm

from rm_model import humanPreferenceModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create a list of model configurations
model_configs = [
    {
        "name": 'rm_byt5_base',
        "config": "google/byt5-base",
        "path": 'voidful/rm_byt5_base',
    }
]

eval_dataset = "test_rm.jsonl"
# one data example: {"question": "Screenshot Software recommendation - free, Windows XP/7", "answers": ["My favourite: FSCapture 5.3 (last free version)\nPortable, lightweight, free.\n\n", "Use Irfan View, is is faster than XnView and allows to set up a capture hotkey, or start capturing with a delay (possible via hotkey too).\n", "I know you are looking for a free solution; this is more of an FYI, in case you have Microsoft OneNote...\nYou can press Win - S to take a screenshot that is pasted inside the OneNote program...Then right-click the image (while it is selected), and click \"Save As\".  You can then save the image anywhere you like...\n"], "accepted_answer": ["Windows 7 comes with the snipping tool, which can be activated via hotkey with a little tweaking.\nSome nifty third party tools include Cropper:\n\nGreenshot:\n\nand of course, Gadwin.\n"]} 
maxlen = 512
batch_size = 3  # note: unused below; all candidate answers for a question are scored in one batch


def rank_answers(model, question, answers):
    model.eval()
    with torch.inference_mode():
        inputs = model.tokenizer([f"question: {question} answer: {answer}" for answer in answers],
                                 return_tensors="pt", padding=True, truncation=True,
                                 max_length=maxlen).to(device)
        # Build decoder inputs the same way T5 does during training (shift right),
        # so the seq2seq backbone can score each question/answer pair.
        decoder_input_ids = model.transformer_model._shift_right(inputs["input_ids"])
        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"],
                        decoder_input_ids=decoder_input_ids)
        answer_scores = outputs.cpu()
        return list(zip(answers, answer_scores))
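# Illustrative only: for a question with three candidate answers, rank_answers
# returns [(answer_0, score_0), (answer_1, score_1), (answer_2, score_2)],
# where a higher score means the reward model prefers that answer.
# The concrete values depend on the trained checkpoint.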


def create_test_data():
    testing_data = []
    with open(eval_dataset, "r", encoding="utf8") as f:
        for line in f:
            testing_data.append(json.loads(line))

    return testing_data


def create_qrels_and_run(test_data, model):
    qrels = {}
    run = {}
    selected_scores = []
    nonselected_scores = []
    query_id = 0

    for example in tqdm(test_data):
        question = example["question"]
        correct_answer = example["accepted_answer"][0]
        answers = example["answers"] + example["accepted_answer"]
        ranked_answers = rank_answers(model, question, answers)

        # ranx expects string query/document ids and float scores.
        qrels[str(query_id)] = {str(i): int(answer == correct_answer) for i, answer in enumerate(answers)}
        run[str(query_id)] = {str(i): float(score) for i, (_, score) in enumerate(ranked_answers)}

        for answer, score in ranked_answers:
            if answer == correct_answer:
                selected_scores.append(score.cpu().detach().numpy())
            else:
                nonselected_scores.append(score.cpu().detach().numpy())
        query_id += 1

    return qrels, run, selected_scores, nonselected_scores
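# Illustrative shapes of the structures handed to ranx.evaluate (values made up):
#   qrels = {"0": {"0": 0, "1": 0, "2": 1}, ...}            # 1 marks the accepted answer
#   run   = {"0": {"0": -0.41, "1": 0.12, "2": 0.97}, ...}  # reward-model scores
# ranx requires string ids and float scores, which is why the loop above casts
# ids with str() and scores with float().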


# Evaluate one model configuration: compute ranking metrics and plot the score distribution.
def evaluate_model(model_config, model_name, model_path):
    model = humanPreferenceModel(model_config)
    if model_path:
        # model_path should point to a saved state_dict checkpoint
        # (e.g. the weights from the voidful/rm_byt5_base repo saved locally).
        model.load_state_dict(torch.load(model_path, map_location=device))

    model.to(device)
    model.eval()

    test_data = create_test_data()
    qrels, run, selected_scores, nonselected_scores = create_qrels_and_run(test_data, model)

    # Compute mean score for selected and non-selected answers
    mean_selected_score = np.mean(selected_scores) if len(selected_scores) > 0 else 0
    mean_nonselected_score = np.mean(nonselected_scores)
    print(f"Mean score for selected answers: {mean_selected_score:.4f}")
    print(f"Mean score for non-selected answers: {mean_nonselected_score:.4f}")
    print("Selected scores:", len(selected_scores), selected_scores[:5])
    print("Non-selected scores:", len(nonselected_scores), nonselected_scores[:5])

    # Evaluate and print results
    metrics_to_compute = ["hits@5", "hit_rate@5", "precision@5", "recall@5", "f1@5", "r-precision", "bpref", "rbp.95",
                          "mrr@5", "map@5", "ndcg@5", "ndcg_burges@5"]
    results = evaluate(qrels, run, metrics_to_compute)
    print(results)
    results_perc = {metric: result * 100 for metric, result in results.items()}

    selected_scores_flat = [score.item() for score in selected_scores]
    nonselected_scores_flat = [score.item() for score in nonselected_scores]

    statistics = {'mean': np.mean}

    plt.hist(nonselected_scores_flat, bins=100, alpha=0.3, label='Non-selected answers')
    plt.hist(selected_scores_flat, bins=100, alpha=0.3, label='Selected answers')

    colors = {'selected': 'peru', 'non-selected': 'steelblue'}
    linestyles = ['dashed', 'dashed', 'dotted', 'dotted', 'dotted']

    for idx, (stat_name, stat_func) in enumerate(statistics.items()):
        for group_idx, group in enumerate(['non-selected', 'selected']):
            scores = selected_scores_flat if group == 'selected' else nonselected_scores_flat
            stat_value = stat_func(scores)
            plt.axvline(stat_value, color=colors[group], linestyle=linestyles[idx], linewidth=1)
            y_pos = plt.ylim()[1] * (0.9 - (idx * 2 + group_idx) * 0.05)
            x_offset = plt.xlim()[1] * 0.01
            plt.text(stat_value + x_offset, y_pos, f"{stat_name}: {stat_value:.2f}", color=colors[group], ha='left',
                     fontsize=8)

    plt.legend(loc='best', bbox_to_anchor=(1, 1))
    ax = plt.gca()
    legend = ax.get_legend()
    result_str = '\n'.join([f"{metric}: {result:.2f}%" for metric, result in results_perc.items()])
    plt.text(plt.xlim()[1] * 1.05, plt.ylim()[0] + (plt.ylim()[1] - plt.ylim()[0]) * 0.05, result_str, fontsize=8)
    plt.subplots_adjust(right=0.8)
    legend.set_bbox_to_anchor((1, 1))
    plt.title('Score distribution for selected and non-selected answers')
    plt.xlabel('Score')
    plt.ylabel('Frequency')
    plt.savefig(f'score_distribution_answers_{model_name}.png', dpi=300, bbox_inches='tight')
    plt.show()
    plt.close()
    return results, selected_scores, nonselected_scores


# Iterate over model configurations
for config in model_configs:
    results, selected_scores, nonselected_scores = evaluate_model(config['config'], config['name'], config['path'])
    print(f"Results for {config['name']}: {results}")