import json

import numpy as np
import streamlit as st
from datasets import load_dataset
from evaluate import load
from huggingface_hub import ModelCard
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)


def preprocess_function(examples):
    # Tokenize the MRPC sentence pairs. Relies on the module-level `tokenizer`
    # created in the __main__ block; padding is applied dynamically by the Trainer.
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)


def compute_metrics(eval_pred):
    # Convert logits to class predictions and score them with the module-level
    # `metric` (GLUE/MRPC) created in the __main__ block.
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)


def compute_model_card_evaluation_results(tokenizer, model_checkpoint, raw_datasets, metric):
    tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    batch_size = 16
    args = TrainingArguments(
        "test-glue",
        eval_strategy="epoch",
        learning_rate=5e-5,
        seed=42,
        lr_scheduler_type="linear",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=False,
        metric_for_best_model="accuracy",
        report_to="none",
    )
    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    # Evaluation only: no training step is run.
    result = trainer.evaluate()
    return result


if __name__ == "__main__":
    st.title("Hugging Face Model Evaluation Demo")
    with st.form("my_st_form"):
        # Input text boxes, e.g. "nyu-mll/glue" and "sgugger/glue-mrpc".
        dataset_name = st.text_input("Enter dataset identifier", "")
        model_checkpoint = st.text_input("Enter model identifier", "")
        # Every form must have a submit button.
        submitted = st.form_submit_button("Submit")
        if submitted:
            print(dataset_name, model_checkpoint)
            metric = load("glue", "mrpc")
            tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
            raw_datasets = load_dataset(dataset_name, "mrpc")
            output = compute_model_card_evaluation_results(
                tokenizer, model_checkpoint, raw_datasets, metric
            )
            print(json.dumps(output))

            st.header("Self-generated Evaluation Results:")
            st.json(output, expanded=True)

            # Compare against the evaluation results declared in the model card.
            card = ModelCard.load(model_checkpoint)
            st.header("Model Card Evaluation Results:")
            st.json(card.data.eval_results, expanded=True)
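
# Usage sketch (the filename app.py is an assumption, not given in the source):
#
#   streamlit run app.py
#
# Then enter a dataset identifier such as "nyu-mll/glue" and a model identifier
# such as "sgugger/glue-mrpc" (both taken from the hardcoded examples above) and
# press Submit. The app recomputes the MRPC validation metrics for the checkpoint
# and shows them next to the eval_results declared in its model card.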