lvwerra HF staff committed on
Commit
48dc2f2
•
1 Parent(s): d1dbd28
Files changed (2) hide show
  1. README.md +2 -2
  2. app.py +42 -23
README.md CHANGED
@@ -3,8 +3,8 @@ title: Am I in The Stack?
3
  emoji: 📑🔍
4
  colorFrom: red
5
  colorTo: indigo
6
- sdk: streamlit
7
- sdk_version: 1.10.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
3
  emoji: 📑🔍
4
  colorFrom: red
5
  colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 3.9.1
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
app.py CHANGED
@@ -1,36 +1,55 @@
1
- from datasets import load_dataset
2
- import streamlit as st
3
  from huggingface_hub import hf_hub_download
4
- import gzip
5
  import json
6
- import time
 
7
 
 
8
 
9
- @st.cache(allow_output_mutation=True)
10
- def load_all_usernames(filename):
11
- filepath = hf_hub_download(repo_id="bigcode/the-stack-username-to-repo", filename=filename, repo_type="dataset")
12
 
13
- with gzip.open(filepath, 'r') as f:
14
- usernames = json.loads(f.read().decode('utf-8'))
15
- return usernames
16
 
17
- st.image("./banner.png", use_column_width=True)
 
 
18
 
19
- st.markdown("**_The Stack is an open governance interface between the AI community and the open source community._**")
20
- st.title("Am I in The Stack?")
21
- st.markdown("As part of the BigCode project, we released and maintain [The Stack](https://huggingface.co/datasets/bigcode/the-stack), a 3.1 TB dataset of permissively licensed source code in 30 programming languages. One of our goals in this project is to give people agency over their source code by letting them decide whether or not it should be used to develop and evaluate machine learning models, as we acknowledge that not all developers may wish to have their data used for that purpose.")
22
 
23
- st.markdown("This tool lets you check if a repository under a given username is part of The Stack dataset. Would you like to have your data removed from future versions of The Stack? You can opt-out following the instructions [here](https://www.bigcode-project.org/docs/about/the-stack/#how-can-i-request-that-my-data-be-removed-from-the-stack).")
 
24
 
25
- usernames = load_all_usernames("username_to_repo.json.gz")
26
- username = st.text_input("Your GitHub Username:")
27
 
28
- if st.button("Check!") or username:
29
- if username in usernames:
30
- repos = usernames[username]
 
31
  repo_word = "repository" if len(repos)==1 else "repositories"
32
- st.markdown(f"**Yes**, there is code from **{len(repos)} {repo_word}** in The Stack:")
33
  for repo in repos:
34
- st.markdown(f"`{repo}`")
35
  else:
36
- st.markdown("**No**, your code is not in The Stack.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
 
2
  from huggingface_hub import hf_hub_download
 
3
  import json
4
+ import gzip
5
+
6
 
7
+ usernames = {}
8
 
9
+ filepath = hf_hub_download(repo_id="bigcode/the-stack-username-to-repo", filename="username_to_repo.json.gz", repo_type="dataset", revision="v1.1")
10
+ with gzip.open(filepath, 'r') as f:
11
+ usernames["v1.1"] = json.loads(f.read().decode('utf-8'))
12
 
13
+ filepath = hf_hub_download(repo_id="bigcode/the-stack-username-to-repo", filename="username_to_repo.json.gz", repo_type="dataset")
14
+ with gzip.open(filepath, 'r') as f:
15
+ usernames["v1.0"] = json.loads(f.read().decode('utf-8'))
16
 
17
+ text = """\
18
+ ![](https://huggingface.co/spaces/bigcode/in-the-stack/resolve/main/banner.png)
19
+ **_The Stack is an open governance interface between the AI community and the open source community._**
20
 
21
+ # Am I in The Stack?
 
 
22
 
23
+ As part of the BigCode project, we released and maintain [The Stack](https://huggingface.co/datasets/bigcode/the-stack), a 3.1 TB dataset of permissively licensed source code in 30 programming languages. One of our goals in this project is to give people agency over their source code by letting them decide whether or not it should be used to develop and evaluate machine learning models, as we acknowledge that not all developers may wish to have their data used for that purpose.
24
+ """ + """\
25
 
26
+ This tool lets you check if a repository under a given username is part of The Stack dataset. Would you like to have your data removed from future versions of The Stack? You can opt-out following the instructions [here](https://www.bigcode-project.org/docs/about/the-stack/#how-can-i-request-that-my-data-be-removed-from-the-stack).
27
+ """
28
 
29
+ def check_username(username, version):
30
+ output_md = ""
31
+ if username in usernames[version] and len(usernames[version][username])>0:
32
+ repos = usernames[version][username]
33
  repo_word = "repository" if len(repos)==1 else "repositories"
34
+ output_md += f"**Yes**, there is code from **{len(repos)} {repo_word}** in The Stack:\n\n"
35
  for repo in repos:
36
+ output_md += f"_{repo}_\n\n"
37
  else:
38
+ output_md += "**No**, your code is not in The Stack."
39
+ return output_md.strip()
40
+
41
+ with gr.Blocks() as demo:
42
+ with gr.Row():
43
+ _, colum_2, _ = gr.Column(scale=1), gr.Column(scale=6), gr.Column(scale=1)
44
+ with colum_2:
45
+ gr.Markdown(text)
46
+ version = gr.Dropdown(["v1.1", "v1.0"], label="The Stack version:", value="v1.1")
47
+ username = gr.Text("", label="Your GitHub username:")
48
+ check_button = gr.Button("Check!")
49
+
50
+ repos = gr.Markdown()
51
+
52
+ check_button.click(check_username, [username, version], repos)
53
+
54
+
55
+ demo.launch()