asoria HF staff committed on
Commit
88d7725
1 Parent(s): a093cd2

Push to Hub

Browse files
Files changed (2) hide show
  1. app.py +85 -73
  2. utils/prompts.py +37 -1
app.py CHANGED
@@ -9,7 +9,11 @@ import json
9
  import re
10
  import pandas as pd
11
  from gradio.data_classes import FileData
12
- from utils.prompts import generate_mapping_prompt, generate_eda_prompt
 
 
 
 
13
 
14
  """
15
  TODOs:
@@ -32,7 +36,6 @@ TODOs:
32
  # Configuration
33
  BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
34
  HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
35
- GENERATED_TEXT = ""
36
 
37
  client = Client(headers=HEADERS)
38
  inference_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
@@ -120,7 +123,57 @@ def content_from_output(output):
120
  return match.group(1)
121
 
122
 
123
- def generate_cells(dataset_id):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  try:
125
  libraries = get_compatible_libraries(dataset_id)
126
  except Exception as err:
@@ -150,7 +203,7 @@ def generate_cells(dataset_id):
150
  logging.info(f"First split file: {first_file}")
151
  features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
152
  sample_data = df.head(5).to_dict(orient="records")
153
- prompt = generate_eda_prompt(features, sample_data, first_code)
154
  messages = [gr.ChatMessage(role="user", content=prompt)]
155
  yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")]
156
 
@@ -159,20 +212,19 @@ def generate_cells(dataset_id):
159
  messages=prompt_messages, stream=True, max_tokens=2500
160
  )
161
 
162
- global GENERATED_TEXT
163
- GENERATED_TEXT = ""
164
  current_line = ""
165
  for chunk in output:
166
  current_line += chunk.choices[0].delta.content
167
  if current_line.endswith("\n"):
168
- GENERATED_TEXT += current_line
169
  messages.append(gr.ChatMessage(role="assistant", content=current_line))
170
  current_line = ""
171
  yield messages
172
  yield messages
173
 
174
  logging.info("---> Formated prompt")
175
- formatted_prompt = generate_mapping_prompt(GENERATED_TEXT)
176
  logging.info(formatted_prompt)
177
  prompt_messages = [{"role": "user", "content": formatted_prompt}]
178
  yield messages + [
@@ -212,32 +264,8 @@ def generate_cells(dataset_id):
212
  yield messages
213
 
214
 
215
- def write_notebook_file(dataset_id, history):
216
- if not GENERATED_TEXT:
217
- raise Exception("No generated notebook")
218
- commands = get_txt_from_output(GENERATED_TEXT)
219
- html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
220
- # Adding dataset viewer on the first part
221
- commands.insert(
222
- 0,
223
- {
224
- "cell_type": "code",
225
- "source": f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
226
- },
227
- )
228
- commands.insert(0, {"cell_type": "markdown", "source": "# Dataset Viewer"})
229
- notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
230
- create_notebook_file(commands, notebook_name=notebook_name)
231
- history.append(
232
- gr.ChatMessage(role="user", content="Here is the generated notebook")
233
- )
234
- history.append(
235
- gr.ChatMessage(
236
- role="user",
237
- content=FileData(path=notebook_name, mime_type="application/x-ipynb+json"),
238
- )
239
- )
240
- return history
241
 
242
 
243
  with gr.Blocks(fill_height=True) as demo:
@@ -267,8 +295,8 @@ with gr.Blocks(fill_height=True) as demo:
267
 
268
  with gr.Row():
269
  generate_eda_btn = gr.Button("Generate EDA notebook")
 
270
  generate_training_btn = gr.Button("Generate Training notebook")
271
- generate_rag_btn = gr.Button("Generate RAG notebook")
272
  with gr.Column():
273
  chatbot = gr.Chatbot(
274
  label="Results",
@@ -278,47 +306,31 @@ with gr.Blocks(fill_height=True) as demo:
278
  None,
279
  ),
280
  )
281
-
 
 
 
282
  generate_eda_btn.click(
283
- generate_cells,
284
  inputs=[dataset_name],
285
- outputs=[chatbot],
286
  )
287
 
288
- # with gr.Row(visible=False) as auth_page:
289
- # with gr.Column():
290
- # gr.Markdown(
291
- # "Want to push to hub? Enter your token ([settings](https://huggingface.co/settings/tokens)):"
292
- # )
293
- # token_box = gr.Textbox(
294
- # "", label="token", placeholder="hf_xxx", type="password"
295
- # )
296
- # auth_error = gr.Markdown("", visible=False)
297
-
298
- # push_btn = gr.Button("Push notebook to hub", visible=False)
299
- # output_lbl = gr.HTML(value="", visible=False)
300
-
301
- # def auth(token):
302
- # if not token:
303
- # return {
304
- # auth_error: gr.Markdown(value="", visible=False),
305
- # push_btn: gr.Button(visible=False),
306
- # }
307
- # return {
308
- # auth_error: gr.Markdown(value="", visible=False),
309
- # push_btn: gr.Button("Push notebook to hub", visible=True),
310
- # }
311
-
312
- # token_box.change(
313
- # auth,
314
- # inputs=token_box,
315
- # outputs=[auth_error, push_btn],
316
- # )
317
-
318
- # push_btn.click(
319
- # push_notebook,
320
- # inputs=[dataset_name, token_box],
321
- # outputs=output_lbl,
322
- # )
323
 
324
  demo.launch()
 
9
  import re
10
  import pandas as pd
11
  from gradio.data_classes import FileData
12
+ from utils.prompts import (
13
+ generate_mapping_prompt,
14
+ generate_eda_prompt,
15
+ generate_embedding_prompt,
16
+ )
17
 
18
  """
19
  TODOs:
 
36
  # Configuration
37
  BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
38
  HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
 
39
 
40
  client = Client(headers=HEADERS)
41
  inference_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
 
123
  return match.group(1)
124
 
125
 
126
def generate_eda_cells(dataset_id):
    """Stream EDA notebook generation as (chat, push button, notebook file) tuples.

    Every chat snapshot produced by ``generate_cells`` with
    ``generate_eda_prompt`` is relayed together with a hidden push-button
    update and ``None`` for the notebook file. After the stream ends, one
    final tuple makes the button visible and supplies the notebook filename
    derived from ``dataset_id`` ("/" replaced by "-", ".ipynb" appended).
    """
    cell_stream = generate_cells(dataset_id, generate_eda_prompt)
    for history in cell_stream:
        # While cells are still streaming the push button stays hidden.
        hide_push_button = gr.update(visible=False)
        yield history, hide_push_button, None

    # Streaming is over: reveal the button and hand over the notebook filename.
    yield history, gr.update(visible=True), dataset_id.replace("/", "-") + ".ipynb"
131
+
132
+
133
def generate_embedding_cells(dataset_id):
    """Stream embeddings notebook generation as (chat, push button, notebook file) tuples.

    Every chat snapshot produced by ``generate_cells`` with
    ``generate_embedding_prompt`` is relayed together with a hidden
    push-button update and ``None`` for the notebook file. After the stream
    ends, one final tuple makes the button visible and supplies the notebook
    filename derived from ``dataset_id`` ("/" replaced by "-", ".ipynb" appended).
    """
    cell_stream = generate_cells(dataset_id, generate_embedding_prompt)
    for history in cell_stream:
        # While cells are still streaming the push button stays hidden.
        hide_push_button = gr.update(visible=False)
        yield history, hide_push_button, None

    # Streaming is over: reveal the button and hand over the notebook filename.
    yield history, gr.update(visible=True), dataset_id.replace("/", "-") + ".ipynb"
138
+
139
+
140
def push_to_hub(
    history,
    dataset_id,
    notebook_file,
    profile: gr.OAuthProfile | None,
    oauth_token: gr.OAuthToken | None,
):
    """Upload the generated notebook to the dataset repo and report the outcome in the chat.

    This is a generator: every yielded value is ``history`` plus one
    assistant message (login prompt, link to the uploaded notebook, or the
    error text).

    Args:
        history: Chat messages currently displayed.
        dataset_id: Repo id of the dataset that receives the notebook.
        notebook_file: Notebook passed to ``upload_file`` as ``path_or_fileobj``.
        profile: OAuth profile of the user; falsy when not logged in.
        oauth_token: OAuth token of the user; falsy when not logged in.
    """
    logging.info(f"Pushing notebook to hub: {dataset_id} on file {notebook_file}")
    if not profile or not oauth_token:
        yield history + [
            gr.ChatMessage(role="assistant", content="⏳ _Login to push to hub..._")
        ]
        # Without a token nothing can be uploaded; falling through would
        # dereference ``oauth_token.token`` on None.
        return

    # Only the profile is logged: the token is a credential and must never
    # end up in the logs.
    logging.info(f"Profile: {profile}")

    notebook_name = "dataset_analysis.ipynb"
    api = HfApi(token=oauth_token.token)
    try:
        logging.info(f"About to push {notebook_file} - {notebook_name} - {dataset_id}")
        api.upload_file(
            path_or_fileobj=notebook_file,
            path_in_repo=notebook_name,
            repo_id=dataset_id,
            repo_type="dataset",
        )
    except Exception as err:
        # Broad catch is deliberate: any upload failure is shown to the user
        # in the chat instead of crashing the event handler.
        logging.exception("Failed to push notebook")
        yield history + [gr.ChatMessage(role="assistant", content=str(err))]
        return

    link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_name}"
    logging.info(f"Notebook pushed to hub: {link}")
    yield history + [
        gr.ChatMessage(
            role="assistant", content=f"[Here is the generated notebook]({link})"
        )
    ]
174
+
175
+
176
+ def generate_cells(dataset_id, prompt_fn):
177
  try:
178
  libraries = get_compatible_libraries(dataset_id)
179
  except Exception as err:
 
203
  logging.info(f"First split file: {first_file}")
204
  features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
205
  sample_data = df.head(5).to_dict(orient="records")
206
+ prompt = prompt_fn(features, sample_data, first_code)
207
  messages = [gr.ChatMessage(role="user", content=prompt)]
208
  yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")]
209
 
 
212
  messages=prompt_messages, stream=True, max_tokens=2500
213
  )
214
 
215
+ generated_text = ""
 
216
  current_line = ""
217
  for chunk in output:
218
  current_line += chunk.choices[0].delta.content
219
  if current_line.endswith("\n"):
220
+ generated_text += current_line
221
  messages.append(gr.ChatMessage(role="assistant", content=current_line))
222
  current_line = ""
223
  yield messages
224
  yield messages
225
 
226
  logging.info("---> Formated prompt")
227
+ formatted_prompt = generate_mapping_prompt(generated_text)
228
  logging.info(formatted_prompt)
229
  prompt_messages = [{"role": "user", "content": formatted_prompt}]
230
  yield messages + [
 
264
  yield messages
265
 
266
 
267
def comming_soon_message():
    """Show an info notification saying the requested feature is not available yet."""
    # The function name keeps its original spelling because the button wiring
    # refers to it; only the user-facing text is corrected.
    gr.Info("Coming soon")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
 
270
 
271
  with gr.Blocks(fill_height=True) as demo:
 
295
 
296
  with gr.Row():
297
  generate_eda_btn = gr.Button("Generate EDA notebook")
298
+ generate_embedding_btn = gr.Button("Generate Embeddings notebook")
299
  generate_training_btn = gr.Button("Generate Training notebook")
 
300
  with gr.Column():
301
  chatbot = gr.Chatbot(
302
  label="Results",
 
306
  None,
307
  ),
308
  )
309
+ with gr.Row():
310
+ login_btn = gr.LoginButton()
311
+ push_btn = gr.Button("Push to hub", visible=False)
312
+ notebook_file = gr.File(visible=False)
313
  generate_eda_btn.click(
314
+ generate_eda_cells,
315
  inputs=[dataset_name],
316
+ outputs=[chatbot, push_btn, notebook_file],
317
  )
318
 
319
+ generate_embedding_btn.click(
320
+ generate_embedding_cells,
321
+ inputs=[dataset_name],
322
+ outputs=[chatbot, push_btn, notebook_file],
323
+ )
324
+
325
+ generate_training_btn.click(comming_soon_message, inputs=[], outputs=[])
326
+ push_btn.click(
327
+ push_to_hub,
328
+ inputs=[
329
+ chatbot,
330
+ dataset_name,
331
+ notebook_file,
332
+ ],
333
+ outputs=[chatbot],
334
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
 
336
  demo.launch()
utils/prompts.py CHANGED
@@ -6,7 +6,7 @@ def generate_mapping_prompt(code):
6
  """Format the following python code to a list of cells to be used in a jupyter notebook:
7
  {{ code }}
8
 
9
- The output should be a markdown code snippet formatted in the
10
  following schema, including the leading and trailing "```json" and "```":
11
 
12
  ```json
@@ -44,4 +44,40 @@ def generate_eda_prompt(columns_info, sample_data, first_code):
44
 
45
  {{ first_code }}
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  """
 
6
  """Format the following python code to a list of cells to be used in a jupyter notebook:
7
  {{ code }}
8
 
9
+ The output should be a list of json objects with the
10
  following schema, including the leading and trailing "```json" and "```":
11
 
12
  ```json
 
44
 
45
  {{ first_code }}
46
 
47
+ The output should be a markdown python code snippet between the leading and trailing "```python" and "```".
48
+
49
+ """
50
+
51
+
52
# Prompt template for the embeddings notebook. The docstring below is the
# template text itself (the {{ ... }} placeholders are named after the
# function parameters), so editing it changes the prompt sent to the model.
@outlines.prompt
def generate_embedding_prompt(columns_info, sample_data, first_code):
    """You are an expert data scientist tasked with generating a Jupyter notebook to generate embeddings from a dataset.
    The data is provided as a pandas DataFrame with the following structure:

    Columns and Data Types:
    {{ columns_info }}

    Sample Data:
    {{ sample_data }}

    Please create a notebook that includes the following:

    1. Load the dataset
    2. Load embedding model using sentence-transformers library
    3. Convert data into embeddings
    4. Store embeddings

    Ensure the notebook is well-organized, with explanations for each step.

    It is mandatory that you use the following code to load the dataset, DO NOT try to load the dataset in any other way:

    {{ first_code }}

    """
77
+
78
+
79
# Placeholder prompt template for the training notebook: the template text
# has not been written yet (the docstring only says TODO).
@outlines.prompt
def generate_training_prompt(columns_info, sample_data, first_code):
    """
    TODO
    """