from ast import literal_eval

import pandas as pd
import streamlit as st
from datasets import load_dataset

# Pipeline tags grouped by the modality they are reported under.
nlp_tasks = [
    "text-classification",
    "text-generation",
    "text2text-generation",
    "token-classification",
    "fill-mask",
    "question-answering",
    "translation",
    "conversational",
    "sentence-similarity",
    "summarization",
    "multiple-choice",
    "zero-shot-classification",
    "table-question-answering",
]
audio_tasks = [
    "automatic-speech-recognition",
    "audio-classification",
    "text-to-speech",
    "audio-to-audio",
    "voice-activity-detection",
]
cv_tasks = [
    "image-classification",
    "image-segmentation",
    "zero-shot-image-classification",
    "image-to-image",
    "unconditional-image-generation",
    "object-detection",
]
multimodal = [
    "feature-extraction",
    "text-to-image",
    "visual-question-answering",
    "image-to-text",
    "document-question-answering",
]
tabular = ["tabular-classification", "tabular-regression"]

# Modality name -> list of pipeline tags that belong to it.
modalities = {
    "nlp": nlp_tasks,
    "audio": audio_tasks,
    "cv": cv_tasks,
    "multimodal": multimodal,
    "tabular": tabular,
    "rl": ["reinforcement-learning"],
}


def modality(row):
    """Return the modality name for the ``"pipeline"`` tag of *row*.

    Args:
        row: Mapping-like object (for example a DataFrame row) that has a
            ``"pipeline"`` entry.

    Returns:
        The key of ``modalities`` whose task list contains the pipeline tag,
        ``"unk_modality"`` when the tag is a string that no modality lists,
        or ``None`` when the tag is not a string at all.
    """
    pipeline = row["pipeline"]
    for name, tasks in modalities.items():
        if pipeline in tasks:
            return name
    # isinstance() rather than comparing the type object to the text "str":
    # that comparison is never true, so unlisted tags were never labelled.
    if isinstance(pipeline, str):
        return "unk_modality"
    return None


# NOTE(review): the statement below is a bare expression, not a decorator
# (there is no leading "@"), so process_dataset() is NOT cached and the
# dataset is reloaded on every call. It is left unchanged on purpose:
# presumably a cached call would hand the same DataFrame object to every
# caller, which is only safe if callers never re-apply in-place
# transformations to it. Confirm against the callers before adding the "@".
st.cache(allow_output_mutation=True)


def process_dataset(version):
    """Load the model-repo statistics at *version* as an annotated DataFrame.

    Args:
        version: Revision passed to ``load_dataset`` as ``revision``.

    Returns:
        The ``"train"`` split as a pandas DataFrame with two extra columns:
        ``"modality"`` (see ``modality``) and ``"length_bins"`` (the
        ``"text_length"`` column binned with ``pd.cut``).
    """
    # Load dataset at specified revision
    dataset = load_dataset("open-source-metrics/model-repos-stats", revision=version)

    # Convert to pandas dataframe
    data = dataset["train"].to_pandas()

    # Add modality column
    data["modality"] = data.apply(modality, axis=1)

    # Bin the model card length into some bins. pd.cut intervals are
    # right-closed by default, so a length of 0 or above 50000 falls outside
    # every bin and is left as NaN.
    length_edges = [0, 200, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000]
    data["length_bins"] = pd.cut(data["text_length"], length_edges)

    return data


def eval_tags(row):
    """Parse the ``"tags"`` entry of *row* into a list of tags.

    Args:
        row: Mapping-like object with a ``"tags"`` entry. The entry is
            expected to be a string: the text of a list literal, a single
            bare tag, or one of the "no tags" markers.

    Returns:
        * ``[]`` for ``None``, ``""``, ``"none"``, ``"{}"`` or any string that
          parses as a dict literal;
        * the entry itself when it is already a list;
        * the evaluated literal when the string starts with ``"["``;
        * ``[tags]`` for any other string (a single bare tag).

    Raises:
        ValueError, SyntaxError: if a string starting with ``"["`` is not a
            valid Python literal.
    """
    tags = row["tags"]

    # Already parsed: hand it back untouched so the function can be applied
    # twice without nesting the list inside another list.
    if isinstance(tags, list):
        return tags
    if tags is None or tags in ("", "none", "{}"):
        return []

    first = tags[:1]
    if first == "[":
        return literal_eval(tags)

    if first == "{":
        # Dict-shaped tags carry no tag list. This has to be checked before
        # the bare-tag fallback, otherwise the dict text itself would be
        # returned as a single tag.
        try:
            parsed = literal_eval(tags)
        except (ValueError, SyntaxError, TypeError):
            # Not actually a literal; treat it like any other bare tag.
            return [tags]
        if isinstance(parsed, dict):
            return []

    # A single bare tag.
    return [tags]


def change_pct(old, new):
    """Return the change from *old* to *new* as a percentage of *new*.

    Args:
        old: Previous value.
        new: Current value; also used as the denominator.

    Returns:
        ``100 * (new - old) / new`` rounded to 3 decimal places.

    Raises:
        ZeroDivisionError: if *new* is a plain Python number equal to zero.
    """
    # NOTE(review): the denominator is `new`, not `old`, so this is not the
    # conventional "percent change relative to the old value". Kept as is
    # because displayed figures depend on it; confirm it is intended.
    return round(100 * (new - old) / new, 3)


def change_and_delta(old_old, old, new):
    """Return the latest percentage change and how it moved, as strings.

    Args:
        old_old: Value two periods ago.
        old: Value one period ago.
        new: Current value.

    Returns:
        A ``(curr_change, delta)`` tuple of strings ending in ``"%"``:
        ``curr_change`` is ``change_pct(old, new)`` and ``delta`` is that
        value minus ``change_pct(old_old, old)``, rounded to 3 decimals.
    """
    curr_change = change_pct(old, new)
    prev_change = change_pct(old_old, old)
    delta = round(curr_change - prev_change, 3)
    return f"{curr_change}%", f"{delta}%"