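"""Streamlit app for browsing the microsoft/orca-agentinstruct-1M-v1 dataset
and evaluating a Hugging Face text-classification model on sampled rows."""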
import streamlit as st
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import pandas as pd

@st.cache_resource
def load_orca_dataset():
    # Cache across reruns; cache_resource keeps the Arrow-backed Dataset
    # in memory instead of trying to pickle it the way cache_data would.
    st.info("Loading dataset... This may take a while.")
    return load_dataset("microsoft/orca-agentinstruct-1M-v1")

@st.cache_resource
def load_model_and_tokenizer(model_name):
    # cache_resource (not cache_data) is the right cache for models: it
    # returns the same objects on every rerun rather than pickled copies.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return tokenizer, model

def evaluate_model(ds, tokenizer, model, max_samples, text_field):
    st.info("Evaluating the model...")
    # Use the first GPU when available; device=-1 runs the pipeline on CPU.
    classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1,
    )

    results = []
    for i, example in enumerate(ds):
        if i >= max_samples:
            break
        input_text = example[text_field]
        # Truncate inputs that exceed the model's maximum sequence length,
        # which would otherwise raise a runtime error on long samples.
        result = classifier(input_text, truncation=True)[0]
        results.append({"input": input_text, "label": result["label"], "score": result["score"]})
    return results

def main():
    st.title("Orca Dataset Browser and Model Evaluator")

    st.sidebar.header("Configuration")
    load_dataset_btn = st.sidebar.button("Load Dataset")

    if load_dataset_btn:
        dataset = load_orca_dataset()
        st.session_state["dataset"] = dataset

    if "dataset" in st.session_state:
        dataset = st.session_state["dataset"]

        # List available splits
        available_splits = list(dataset.keys())
        st.sidebar.subheader("Available Dataset Splits")
        selected_split = st.sidebar.selectbox("Select Split", available_splits)

        st.subheader("Dataset Explorer")
        st.write(f"Displaying information for split: `{selected_split}`")
        st.write(dataset[selected_split].info)

        # Determine available fields
        sample_entry = dataset[selected_split][0]
        st.sidebar.subheader("Available Fields in Dataset")
        available_fields = list(sample_entry.keys())
        st.sidebar.write(available_fields)
        text_field = st.sidebar.selectbox("Select Text Field", available_fields)

        sample_size = st.slider("Number of Samples to Display", min_value=1, max_value=20, value=5)
        # Convert to pandas for display; st.write on a raw Dataset only shows its repr.
        st.dataframe(dataset[selected_split].shuffle(seed=42).select(range(sample_size)).to_pandas())

        st.subheader("Model Evaluator")
        model_name = st.text_input("Enter Hugging Face Model Name", value="distilbert-base-uncased-finetuned-sst-2-english")
        max_samples = st.number_input("Number of Samples to Evaluate", min_value=1, max_value=100, value=10)

        if st.button("Load Model and Evaluate"):
            tokenizer, model = load_model_and_tokenizer(model_name)

            # Evaluate a reproducible shuffled subset, capped at the split size
            # so select() never requests more rows than the split holds.
            n_eval = min(int(max_samples), len(dataset[selected_split]))
            subset = dataset[selected_split].shuffle(seed=42).select(range(n_eval))
            results = evaluate_model(subset, tokenizer, model, n_eval, text_field)

            st.subheader("Evaluation Results")
            st.write(results)

            st.download_button(
                label="Download Results as CSV",
                data=pd.DataFrame(results).to_csv(index=False),
                file_name="evaluation_results.csv",
                mime="text/csv",
            )

if __name__ == "__main__":
    main()
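
# Launch from a terminal with (filename is illustrative):
#   streamlit run orca_browser.py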