import gzip
import json

import streamlit as st
from datasets import load_dataset
from huggingface_hub import hf_hub_download


# NOTE(review): newer Streamlit releases deprecate `st.cache` in favour of
# `st.cache_data` / `st.cache_resource` -- confirm the pinned Streamlit
# version before switching decorators.
@st.cache(allow_output_mutation=True)
def load_all_usernames():
    """Download and parse the username-to-repository index.

    Fetches ``username_to_repo.json.gz`` from the
    ``bigcode/the-stack-username-to-repo`` dataset repository on the
    Hugging Face Hub, decompresses it and decodes the JSON payload.

    Returns:
        The decoded JSON object. The script below uses it as a mapping
        from a username to an iterable of repository names (presumably a
        ``dict[str, list[str]]`` -- verify against the dataset).
    """
    filepath = hf_hub_download(
        repo_id="bigcode/the-stack-username-to-repo",
        filename="username_to_repo.json.gz",
        repo_type="dataset",
    )
    # Text mode with an explicit encoding lets json.load consume the stream
    # directly instead of reading, decoding and parsing in separate steps.
    with gzip.open(filepath, "rt", encoding="utf-8") as f:
        return json.load(f)


st.title("Am I in The Stack?")
st.markdown("This tool lets you check if a repository under a given username is part of [The Stack dataset](https://huggingface.co/datasets/bigcode/the-stack).")

usernames = load_all_usernames()

# Strip surrounding whitespace so a pasted " name " still matches its entry.
username = st.text_input("Your GitHub Username:").strip()

if st.button("Check!"):
    if username in usernames:
        st.markdown("**Yes**, your data is in The Stack:")
        # Markdown collapses single newlines inside a paragraph, so emit a
        # bullet list to get one repository per rendered line.
        st.markdown("\n".join(f"- `{repo_name}`" for repo_name in usernames[username]))
    else:
        st.markdown("**No**, your data is not in The Stack.")