File size: 3,821 Bytes
7817c3d
 
 
 
1a65b20
7817c3d
 
1a65b20
7817c3d
 
 
 
 
 
312717b
2258a82
7817c3d
 
 
0eb5ff7
7817c3d
 
0eb5ff7
7817c3d
 
 
 
 
48b43b6
7817c3d
 
 
 
 
 
0eb5ff7
7817c3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0eb5ff7
7817c3d
 
 
 
 
0eb5ff7
7817c3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import gradio as gr
from huggingface_hub import hf_hub_download
import json
import urllib
import os

# Fetch the username -> repositories index from the Hugging Face Hub dataset
# repo. The access token is read from the "api_token" environment variable
# (None when the variable is unset).
filepath = hf_hub_download(
    repo_id="bigcode/commits-username-to-repo",
    filename="username_to_repo.json",
    repo_type="dataset",
    token=os.environ.get("api_token"),
)
# JSON is UTF-8 by specification; do not depend on the platform's locale
# default encoding when reading it.
with open(filepath, encoding="utf-8") as f:
    username_to_repo = json.load(f)

# All indexed usernames, kept as a set for constant-time membership checks.
usernames = set(username_to_repo)

# Markdown shown at the top of the page: banner image, tagline, title, and a
# short explanation of the tool and of the opt-out process.
text = """\
![](https://huggingface.co/spaces/bigcode/in-the-commitpack/resolve/main/banner.png)
**_CommitPack is a 4TB dataset of commits scraped from GitHub repositories that are permissively licensed._**

# Am I in The CommitPack?

As part of the BigCode project, we released and maintain [CommitPack](https://huggingface.co/datasets/bigcode/commitpack), a 4 TB dataset of permissively licensed Git commits covering 350 programming languages. One of our goals in this project is to give people agency over their source code by letting them decide whether or not it should be used to develop and evaluate machine learning models, as we acknowledge that not all developers may wish to have their data used for that purpose.

This tool lets you check if a repository under a given username is part of the CommitPack dataset. Would you like to have your data removed from future versions of CommitPack? The CommitPack uses the same opt-out as The Stack, so you can opt-out by following the instructions [here](https://www.bigcode-project.org/docs/about/the-stack/#how-can-i-request-that-my-data-be-removed-from-the-stack).
"""

# Markdown/HTML snippet shown once repositories were found. The {title} and
# {body} placeholders receive URL-quoted text that pre-fills a new GitHub issue.
# Backslash line continuations keep the paragraph on a single rendered line.
opt_out_text_template = """\
### Opt-out

If you want your data to be removed from the CommitPack and model training \
open an issue with <a href="https://github.com/bigcode-project/opt-out-v2/issues/new?title={title}&body={body}" target="_blank">this link</a> \
(if the link doesn't work try a right click and open it in a new tab) or visit [https://github.com/bigcode-project/opt-out-v2/issues/new?&template=opt-out-request.md](https://github.com/bigcode-project/opt-out-v2/issues/new?&template=opt-out-request.md) .\
"""

# Title of the pre-filled opt-out issue; {username} is substituted by the caller.
opt_out_issue_title = "Opt-out request for {username}"

# Body of the pre-filled opt-out issue; {repo_list} is replaced with one
# " - <repo>" bullet per repository, continuing the fixed bullets above it.
opt_out_issue_body = (
    "I request that the following data is removed from the CommitPack:\n"
    "\n"
    " - Commits\n"
    " - GitHub issue\n"
    "{repo_list}\n"
    "\n"
    "_Note_: If you don't want all resources to be included just remove the elements from the list above. "
    'If you would like to exclude all repositories and resources just add a single element "all" to the list.\n'
)


def issue_url(username, repos):
    """Build the opt-out Markdown snippet containing a pre-filled issue link.

    Args:
        username: GitHub username inserted into the issue title.
        repos: Iterable of repository names; each becomes one `` - <repo>``
            bullet in the issue body. An empty iterable adds no bullet.

    Returns:
        ``opt_out_text_template`` rendered with the URL-quoted issue title
        and body.
    """
    # ``import urllib`` alone does not guarantee that the ``parse`` submodule
    # is loaded; import it explicitly so this does not depend on some other
    # package having imported it first.
    from urllib.parse import quote

    repo_list = "\n".join(f" - {repo}" for repo in repos)
    title = quote(opt_out_issue_title.format(username=username))
    body = quote(opt_out_issue_body.format(repo_list=repo_list))
    return opt_out_text_template.format(title=title, body=body)


def check_username(username):
    """Report whether a GitHub username has repositories in the CommitPack.

    Args:
        username: Username as typed by the user. Surrounding whitespace is
            ignored and ``None`` is treated as an empty name.

    Returns:
        A ``(result_markdown, opt_out_markdown)`` tuple. When repositories
        are found, the first element lists them and the second is the
        opt-out snippet from ``issue_url``; otherwise the first element is
        the "No" message and the second is an empty string.
    """
    # Pasted usernames often carry stray whitespace, which would otherwise
    # yield a false "No" for someone who actually is in the dataset.
    username = (username or "").strip()

    # A missing user and a user with an empty repository list are both "No".
    repos = username_to_repo.get(username)
    if not repos:
        return "**No**, your code is not in the CommitPack.", ""

    repo_word = "repository" if len(repos) == 1 else "repositories"
    header = f"**Yes**, there is code from **{len(repos)} {repo_word}** in the CommitPack:"
    # One italic repository name per Markdown paragraph below the header.
    output_md = "\n\n".join([header, *(f"_{repo}_" for repo in repos)])
    return output_md, issue_url(username, repos)


# Page layout: a single row of three columns with scales 1/6/1; all widgets
# live in the wide middle column (the outer two stay empty, presumably acting
# as side margins).
with gr.Blocks() as demo:
    with gr.Row():
        _, colum_2, _ = (gr.Column(scale=weight) for weight in (1, 6, 1))
        with colum_2:
            gr.Markdown(text)
            username = gr.Text(value="", label="Your GitHub username:")
            check_button = gr.Button(value="Check!")
            # Output areas: the lookup result and the opt-out instructions.
            repos = gr.Markdown()
            opt_out = gr.Markdown()

            # Clicking the button runs the lookup and fills both output areas.
            check_button.click(
                fn=check_username,
                inputs=[username],
                outputs=[repos, opt_out],
            )

demo.launch()