import streamlit as st
import datasets
import os
import json
from transformers import AutoTokenizer
import ast
import re
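
# Streamlit explorer for the pile-v2 EDA: pick a dataset version and split in
# the sidebar, then browse individual rows with token- and word-count metrics.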
version = st.sidebar.selectbox("Choose a version", ["init", "local_dedup"])
if version == "init":
    CACHE_DIR = "cache_ds/"  # Use this to build the dataset
else:
    CACHE_DIR = "local_dedup/"
contribution_json = "contributors.json"
with open(contribution_json, "r") as f:
    contribution_dict = json.load(f)
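# Splits listed here are hidden from the sidebar; everything else found in
# CACHE_DIR is offered as a browsable demo.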
IGNORE_LIST = ["Bible", "Tanzil", "GNOME"]
splits = [split for split in os.listdir(CACHE_DIR) if split not in IGNORE_LIST]
cached_ds = os.listdir(CACHE_DIR)
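# Splits missing from CACHE_DIR are streamed from the Hub below. A minimal
# sketch (assuming the usual `datasets` save_to_disk API) for pre-building a
# local cache so that load_from_disk succeeds instead:
#   ds = datasets.load_dataset("CarperAI/pile-v2-small-filtered", split="train",
#                              data_files="data/<split>/data.json")
#   ds.save_to_disk(CACHE_DIR + "<split>")
# The GPT-NeoX-20B tokenizer is used below to report per-row token counts.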
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
def load_page(split):
    with st.spinner("Downloading and building dataset..."):
        if split not in cached_ds:
            # No local copy: stream the split's JSON straight from the Hub
            ds = datasets.load_dataset(
                "CarperAI/pile-v2-small-filtered",
                split="train",
                data_files="data/" + split + "/data.json",
            )
        else:
            ds = datasets.load_from_disk(CACHE_DIR + split)
        print("Successfully loaded " + split)
    st.title("Dataset Explorer")
    st.write(f"# {split}")
    if split in contribution_dict:
        st.caption(f"Contributors: {', '.join(contribution_dict[split])}")
    else:
        st.caption("Needs to be updated....")
    with st.form("dataset_form"):
        index = st.slider("Select a row", 0, len(ds) - 1, 0)
        if st.form_submit_button("Load"):
            st.write(f"Row {index}")
            data = ds[index]
            content = data["text"]
            meta = data["meta"]
            with st.expander("Render Content"):
                st.write(content)
            with st.expander("Raw Content"):
                st.text(content)
with st.expander("Metadata and Metrics"):
st.write("### Meta:")
try:
st.write(ast.literal_eval(meta))
except:
st.write(meta)
# Tokenizer-related count
tokenized = tokenizer(content, return_length=True)['length'][0]
token_count_metric = st.metric("Token Count(compared to 2048)",value=tokenized,delta=4096-tokenized)
#Word related count
split_words = re.findall(r'\w+', content)
word_count_metric = st.metric("Word Count",value=len(split_words))
demo_name = st.sidebar.selectbox("Choose a demo", splits)
load_page(demo_name)
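
# Launch the standard way for Streamlit apps: `streamlit run app.py`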