# Spaces: Runtime error
import glob

import numpy as np
from datasets import load_dataset, concatenate_datasets
from nomic import atlas
from sklearn.decomposition import PCA
from tqdm import tqdm
# ---------------------------------------------------------------------------
# Load every inference shard and stack them into one dataset.
# ---------------------------------------------------------------------------
files = glob.glob("inference/*.jsonl")
print(files)

df = concatenate_datasets(
    [load_dataset("json", data_files=file, split="train") for file in tqdm(files)]
)
print(len(df))
print(df)

# "inputs" is the text shown/indexed in Atlas: prompt and response joined by
# a single newline.
df = df.map(
    lambda example: {
        "inputs": [
            prompt + "\n" + response
            for prompt, response in zip(example["prompt"], example["response"])
        ]
    },
    batched=True,
    num_proc=64,
)

# Re-encode the "is_train" flag as an integer column named "trained_on",
# then drop the original column.
df = df.map(
    lambda example: {"trained_on": [int(t) for t in example["is_train"]]},
    batched=True,
    num_proc=64,
)
df = df.remove_columns("is_train")

# ---------------------------------------------------------------------------
# Text map: upload the rows without the tensor-like columns.
# ---------------------------------------------------------------------------
# NOTE(review): both Atlas calls below use the placeholder name "CHANGE ME!"
# together with reset_project_if_exists=True. If the name identifies the
# project, the second call would reset the first -- give them distinct names.
text = df.remove_columns(["labels", "input_ids", "embeddings"])
text_df = list(text)  # one dict per row

atlas.map_text(
    text_df,
    indexed_field="inputs",
    name="CHANGE ME!",
    colorable_fields=["source", "loss", "trained_on"],
    reset_project_if_exists=True,
)

# ---------------------------------------------------------------------------
# Embedding map.
# ---------------------------------------------------------------------------
# index is local to train/test split, regenerate
data = df.remove_columns(["labels", "input_ids", "index"])
data = data.add_column("index", list(range(len(data))))

# max embed dim is 2048 for now
MAX_EMBED_DIM = 2048

# note! this is slow in pyarrow/hf datasets
embeddings = np.array(data["embeddings"])
print("embeddings shape:", embeddings.shape)

# Only reduce when the embeddings are wider than the limit. PCA rejects
# n_components larger than min(n_samples, n_features), so the target size is
# also clamped to the number of rows; the original unconditional
# PCA(n_components=2048) raised on small or already-narrow inputs.
n_samples, n_features = embeddings.shape
if n_features > MAX_EMBED_DIM:
    n_components = min(MAX_EMBED_DIM, n_samples)
    embeddings = PCA(n_components=n_components).fit_transform(embeddings)

data = data.remove_columns(["embeddings"])
columns = data.to_pandas().to_dict("records")

atlas.map_embeddings(
    embeddings,
    data=columns,
    id_field="index",
    name="CHANGE ME!",
    colorable_fields=["source", "loss", "trained_on"],
    build_topic_model=True,
    topic_label_field="inputs",
    reset_project_if_exists=True,
)