File size: 6,951 Bytes
721f65d
c3e3b63
 
 
60cd313
c3e3b63
60cd313
 
6b7f4a6
60cd313
 
 
 
 
 
 
 
89cc465
290651e
9a56e9e
 
290651e
a404e9a
6b7f4a6
a404e9a
6b7f4a6
89cc465
 
60cd313
 
746b6f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73bf59e
 
 
 
 
 
c89b2ab
 
 
 
 
 
73bf59e
 
5343f2a
 
 
 
 
 
 
 
 
 
12d8e9a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6885efd
12d8e9a
 
 
 
 
 
 
 
ea0b679
 
 
 
 
 
 
60cd313
 
746b6f9
c4d69e3
64ca365
64ef80f
c4d69e3
64ca365
 
 
 
 
c4d69e3
64ca365
c4d69e3
60cd313
e8002b4
c4d69e3
9340c4c
1c9f9ba
60cd313
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89cc465
60cd313
 
 
 
 
 
 
 
 
 
89cc465
 
721f65d
 
 
5b7d34c
721f65d
 
 
60cd313
 
89cc465
60cd313
 
 
6b7f4a6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import json
import os
from io import BytesIO

import gradio as gr
from huggingface_hub import upload_file

# Generic screening question (HTML, rendered in a gr.HTML component) used by
# every internship that does not define its own question.
# Both links use the same `<code><a ...>...</a></code>` nesting so the markup
# is well-formed.
default_question = """
We're going to use the <code><a href="https://huggingface.co/datasets/wikitext" target="_blank">wikitext (link)</a></code> dataset with the <code><a href="https://huggingface.co/distilbert-base-cased" target="_blank">distilbert-base-cased (link)</a></code> model checkpoint.

<br/><br/>

Start by loading the <code>wikitext-2-raw-v1</code> version of that dataset, and take the 11th example (index 10) of the <code>train</code> split.<br/>
We'll tokenize this using the appropriate tokenizer, and we'll mask the sixth token (index 5) of the sequence.

<br/><br/>

When using the <code>distilbert-base-cased</code> checkpoint to unmask that (sixth token, index 5) token, what is the most probable predicted token (please provide the decoded token, and not the ID)?

<br/>
<br/>
Tips:
<br/>
- You might find the <a href="https://huggingface.co/docs/transformers/index" target="_blank">transformers docs (link)</a> useful.
<br/>
- You might find the <a href="https://huggingface.co/docs/datasets/index" target="_blank">datasets docs (link)</a> useful.
<br/>
- You might also be interested in the <a href="https://huggingface.co/course" target="_blank">Hugging Face course (link)</a>.
"""

# Screening question shown for the "Skops & Scikit-Learn" internship.
# Assembled line by line; the leading and trailing empty entries reproduce the
# newline that follows the opening quotes and precedes the closing quotes of
# the equivalent triple-quoted literal.
skops_question = "\n".join(
    (
        "",
        "1. Create a python environment[1] and install `scikit-learn` version `1.0` in that environment.",
        "<br/>",
        "2. Using that environment, create a `LogisticRegression` model[2] and fit it on the Iris dataset[3].",
        "<br/>",
        "3. Save the trained model using `pickle`[4] or `joblib`[5].",
        "<br/>",
        "4. Create a second environment, and install `scikit-learn` version `1.1` in it.",
        "<br/>",
        "5. Try loading the model you saved in step 3 in this second environment.",
        "",
        "<br/>",
        "<br/>",
        "Question:",
        "<br/>",
        "Is there a warning or error you receive while trying to load the model? If yes, what exactly is it.",
        "",
        "<br/>",
        "<br/>",
        "References",
        "<br/>",
        "- [1] You can use any tool you want to create the environment. Two of the options are:",
        "<br/>",
        "- `venv`: https://docs.python.org/3/library/venv.html",
        "<br/>",
        "- `mamba`: https://github.com/mamba-org/mamba",
        "<br/>",
        "- [2] `LogisticRegression` API guide: https://scikit-learn.org/dev/modules/generated/sklearn.linear_model.LogisticRegression.html",
        "<br/>",
        "- [3] `load_iris` API guide: https://scikit-learn.org/dev/modules/generated/sklearn.datasets.load_iris.html",
        "<br/>",
        "- [4] `pickle`: https://docs.python.org/3/library/pickle.html",
        "<br/>",
        "- [5] - `joblib`: https://joblib.readthedocs.io/en/latest/",
        "",
    )
)

# Screening question shown for the "ML for Code/Code Generation" internship.
# The wording states the sample count (4) and the number of passing samples
# (1) unambiguously, since both are inputs to the pass@k estimator.
code_question = """
You are probing your code generation model on a program synthesis benchmark and
1 out of the 4 candidate solutions produced by your model passes the unit tests of a coding challenge.
<br/>
<br/>
What’s the pass@2 metric (in percent) as introduced in the
Codex paper (see section 2.1)?
<br/>
<br/>
References
<br/>
- Codex paper: https://arxiv.org/abs/2107.03374
"""

# Screening question for the "Evaluate" internship (its entry in
# `internships` is currently commented out).
# Each reference sits on its own line: a `<br/>` separates the two bullets,
# because a bare newline does not break the line in HTML.
evaluate_question = """
Use the `evaluate` library to compute the BLEU score of the model generation `"Evaluate is a library to evaluate Machine Learning models"` and the reference solution `"Evaluate is a library to evaluate ML models"`. Round the result to two digits after the comma.
<br/>
<br/>
References
<br/>
- `evaluate` library: https://huggingface.co/docs/evaluate/index
<br/>
- BLEU score: https://en.wikipedia.org/wiki/BLEU
"""

# Screening question (HTML) shown for the "Embodied AI" internship.
# Every anchor is properly opened and closed, so the trailing text after the
# last link is not swallowed into the link.
embodied_question = """
We are going to use <a href="https://github.com/huggingface/simulate"> Simulate </a> to create a basic RL environment.

<br/><br/>

Instructions:

<br/>

pip install simulate

<br/>
create a scene with the unity engine
<br/>
add a box to the scene at position [0, 0, 1], add a camera named "cam" at default position
<br/>
show the scene, step the scene once
<br/>
what is the mean pixel value from the frames from "cam".

<br/><br/>

For some resources, you may want to check out:
* <a href="https://huggingface.co/docs/simulate/main/en/quicktour"> Simulate quick start </a> for installation,
* <a href="https://huggingface.co/docs/simulate/main/en/tutorials/running_the_simulation#running-the-simulation"> simulation stepping </a> for running the simulation.
"""

# Screening question shown for the "Fast Distributed Training Framework"
# internship. Built from adjacent string literals, one per rendered line.
fast_distributed_framework_question = (
    "\n"
    "We are going to understand how many operations does a matrix multiplication hold.\n"
    "<br/>\n"
    "<br/>\n"
    "Let A,B two matrices of size MxK and NxK respectively. "
    "When computing the matrix multiplication of A and (B^T), "
    "how many scalar multiplications are done? "
    "How many scalar additions are done?\n"
)

# Maps each internship title offered in the dropdown to the HTML question
# displayed once that title is selected. Insertion order is the order of the
# dropdown choices. Commented-out entries are internships that are currently
# not offered.
internships = {
    "Accelerate": default_question,
    "Skops & Scikit-Learn": skops_question,
    "Diffusion distillation": default_question,
    # "Evaluate": evaluate_question,
    "Speech": default_question,
    "ML for Code/Code Generation": code_question,
    # "Model forgetting": default_question,
    # "Multimodal AI": default_question,
    # "OCR": default_question,
    # "Efficient video pretraining": default_question,
    # "Retrieval augmentation as prompting": default_question,
    "Embodied AI": embodied_question,
    # "Toolkit for detecting distribution shift/Robustness": default_question,
    "Social impact evaluations": default_question,
    "Gradio as an ecosystem": default_question,
    "Benchmarking transformers on various AI hardware accelerators": default_question,
    "AI Art Tooling Residency": default_question,
    "Datasets for Large Language Models": default_question,
    "Fast Distributed Training Framework": fast_distributed_framework_question,
}


# Application UI: an internship selector that reveals the matching question
# together with the answer form, and a submit button that uploads the answer
# to a dataset repository on the Hugging Face Hub.
with gr.Blocks() as demo:
    gr.Markdown(
        """
    # Internship introduction
    Please select the internship you would like to apply to and answer the question asked in the Answer box.
    """
    )

    internship_choice = gr.Dropdown(label='Internship', choices=list(internships))

    # The whole form stays hidden until an internship has been selected.
    with gr.Column(visible=False) as details_col:
        summary = gr.HTML(label='Question')
        details = gr.Textbox(label="Answer")
        username = gr.Textbox(label="Hugging Face Username")
        comment = gr.Textbox(label="Any comment?")
        generate_btn = gr.Button("Submit")
        output = gr.Label()

    def filter_species(species):
        """Build the updates applied when the internship selection changes.

        Args:
            species: The value of the internship dropdown (an internship
                title, or an empty/unknown value when nothing is selected).

        Returns:
            A pair of updates, in the order expected by the `change` handler
            below: one for the `summary` HTML component (the question text)
            and one for `details_col` (form visibility).
        """
        question = internships.get(species)
        if question is None:
            # No valid selection: clear the question and keep the form hidden
            # rather than failing with a KeyError.
            return gr.update(value=""), gr.update(visible=False)
        # `summary` is an HTML component, so use the component-agnostic
        # `gr.update` rather than the Label-specific update helper.
        return gr.update(value=question), gr.update(visible=True)

    internship_choice.change(filter_species, internship_choice, [summary, details_col])

    def on_click(_details, _username, _internship_choice, _comment):
        """Upload a submission as a JSON file and return a status message.

        Args:
            _details: The applicant's answer.
            _username: The applicant's Hugging Face username; used as the
                file name inside the dataset repository.
            _internship_choice: The selected internship title.
            _comment: Optional free-form comment.

        Returns:
            A message for the `output` label: a confirmation on success, or
            a request to correct the username when it cannot be used as a
            file name.

        Raises:
            KeyError: If the `HF_TOKEN` environment variable is not set.
        """
        applicant = (_username or "").strip()
        if not applicant:
            return "Please provide your Hugging Face username before submitting."
        # The username becomes the path inside the repository, so refuse
        # path separators instead of writing outside the top level.
        if "/" in applicant or "\\" in applicant:
            return "The username must not contain '/' or '\\'."

        response = {'response': _details, "internship": _internship_choice, "comment": _comment}
        payload = json.dumps(response).encode('utf-8')
        # NOTE(review): the file is named after the username only, so a later
        # submission by the same user presumably replaces the earlier one,
        # even for a different internship. Confirm this is intended.
        upload_file(
            path_or_fileobj=BytesIO(payload),
            path_in_repo=applicant,
            repo_id='internships/internships-2023',
            repo_type='dataset',
            token=os.environ['HF_TOKEN'],
        )
        return f"Submitted: '{_details}' for user '{applicant}'"

    generate_btn.click(on_click, inputs=[details, username, internship_choice, comment], outputs=[output])


# Launch the `demo` app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    demo.launch()