File size: 2,935 Bytes
f3bcea4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75164fc
f3bcea4
 
 
 
 
 
876dccd
f3bcea4
876dccd
f3bcea4
876dccd
f3bcea4
 
 
 
 
 
e3197a5
f3bcea4
 
 
 
 
876dccd
f3bcea4
 
876dccd
f3bcea4
 
 
 
e3197a5
 
 
 
baef036
 
11a71fe
e3197a5
 
 
baef036
e3197a5
baef036
 
f5d63b3
f3bcea4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from datasets import load_dataset
import streamlit as st
from huggingface_hub import hf_hub_download
import gzip
import json
import time

# Timestamp taken at the start of this script run; the final "Full time"
# print at the bottom of the file reports the elapsed time since this point.
t_0 = time.time()

@st.cache(allow_output_mutation=True)
def load_all_usernames(filename):
    """Fetch the gzipped username index from the Hugging Face Hub and parse it.

    Args:
        filename: Name of the gzip-compressed JSON file inside the
            ``bigcode/the-stack-username-to-repo`` dataset repository.

    Returns:
        The decoded JSON document. The rest of this script uses it as a
        mapping that is indexed by username to obtain that user's repositories.
    """
    local_path = hf_hub_download(
        repo_id="bigcode/the-stack-username-to-repo",
        filename=filename,
        repo_type="dataset",
    )

    # Read the whole archive as bytes, then decode and parse in one step.
    with gzip.open(local_path, "rb") as archive:
        raw_bytes = archive.read()

    return json.loads(raw_bytes.decode("utf-8"))

# --- Page header and dataset load -------------------------------------------
# The Streamlit calls below render in the order they are executed, so their
# sequence is the page layout.

# Banner image is currently disabled.
#st.image("./banner.png", use_column_width=True)

# File requested from the Hub dataset repo by load_all_usernames().
filename = "username_to_repo.json.gz"

# Tagline, page title and the two introductory paragraphs.
st.markdown("**_The Stack is an open governance interface between the AI community and the open source community._**")
st.title("Am I in The Stack?")
st.markdown("As part of the BigCode project, we released and maintain [The Stack](https://huggingface.co/datasets/bigcode/the-stack), a 3.1 TB dataset of permissively licensed source code in 30 programming languages. One of our goals in this project is to give people agency over their source code by letting them decide whether or not it should be used to develop and evaluate machine learning models, as we acknowledge that not all developers may wish to have their data used for that purpose.")

st.markdown("This tool lets you check if a repository under a given username is part of The Stack dataset. Would you like to have your data removed from future versions of The Stack? You can opt-out following the instructions [here](https://www.bigcode-project.org/docs/about/the-stack/#how-can-i-request-that-my-data-be-removed-from-the-stack).")

# Load the username index (the loader is wrapped in st.cache) and log how
# long the call took to stdout.
t_start = time.time()
usernames = load_all_usernames(filename)
print("Time load", time.time()-t_start)

def _format_issue_text(selected_repos):
    """Return the opt-out issue body listing ``selected_repos`` as bullets."""
    header = "I want to remove the following repositories.\n\n"
    return header + "\n".join(f" - {repo}" for repo in selected_repos)


# --- Username lookup ---------------------------------------------------------
# Strip surrounding whitespace (easy to pick up when copy/pasting) so that it
# cannot turn a real match into a false "No".
username = st.text_input("Your GitHub Username:").strip()
check_clicked = st.button("Check!")

repos = []
if check_clicked and not username:
    # Looking up the empty string would report a misleading "No".
    st.warning("Please enter a GitHub username.")
elif username:
    # Streamlit reruns the whole script on every widget interaction and
    # st.button() is only True on the rerun caused by its own click. Driving
    # the lookup from the (persistent) text input keeps the result and the
    # opt-out section below visible when a checkbox is toggled.
    t_start = time.time()
    if username in usernames:
        repos = usernames[username]
        repo_word = "repository" if len(repos) == 1 else "repositories"
        st.markdown(f"**Yes**, there is code from **{len(repos)} {repo_word}** in The Stack:")
        for repo in repos:
            print(repo)
            st.markdown(f"`{repo}`")
    else:
        # st.markdown (not st.text) so the bold markers are rendered rather
        # than shown as literal asterisks.
        st.markdown("**No**, your code is not in The Stack.")
    print("Time to check", time.time() - t_start)

# --- Opt-out helper ----------------------------------------------------------
# Only offered when the lookup above found at least one repository.
if repos:
    with st.expander("I want to remove my data from the Stack!"):
        st.markdown("Select which repositories you would like to have removed:")
        # One checkbox per repository, all ticked by default; the list holds
        # the current tick state in the same order as `repos`.
        exclude_repo = []
        for repo in repos:
            exclude_repo.append(st.checkbox(repo, value=True))

        selected_repos = [repo for repo, exclude in zip(repos, exclude_repo) if exclude]
        if selected_repos:
            st.markdown("Open an issue with the below text in the opt-out repo [here](https://github.com/bigcode-project/opt-out/issues/new):")
            issue_text = _format_issue_text(selected_repos)
            st.code(issue_text)
        else:
            # Avoid emitting issue text that ends in a dangling, empty bullet.
            st.info("Select at least one repository to generate the issue text.")

# Total wall-clock time of this script run, logged to stdout.
print("Full time", time.time() - t_0)