Spaces:

asoria
/

auto-dataset-analyst-creator

Sleeping

App Files Files Community

asoria HF staff committed on Aug 21

Commit

88d7725

•

1 Parent(s): a093cd2

Push to Hub

Browse files

Files changed (2) hide show

app.py +85 -73
utils/prompts.py +37 -1

app.py CHANGED Viewed

@@ -9,7 +9,11 @@ import json
 import re
 import pandas as pd
 from gradio.data_classes import FileData
-from utils.prompts import generate_mapping_prompt, generate_eda_prompt
 """
 TODOs:
@@ -32,7 +36,6 @@ TODOs:
 # Configuration
 BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
 HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
-GENERATED_TEXT = ""
 client = Client(headers=HEADERS)
 inference_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
@@ -120,7 +123,57 @@ def content_from_output(output):
     return match.group(1)
-def generate_cells(dataset_id):
     try:
         libraries = get_compatible_libraries(dataset_id)
     except Exception as err:
@@ -150,7 +203,7 @@ def generate_cells(dataset_id):
     logging.info(f"First split file: {first_file}")
     features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
     sample_data = df.head(5).to_dict(orient="records")
-    prompt = generate_eda_prompt(features, sample_data, first_code)
     messages = [gr.ChatMessage(role="user", content=prompt)]
     yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")]
@@ -159,20 +212,19 @@ def generate_cells(dataset_id):
         messages=prompt_messages, stream=True, max_tokens=2500
     )
-    global GENERATED_TEXT
-    GENERATED_TEXT = ""
     current_line = ""
     for chunk in output:
         current_line += chunk.choices[0].delta.content
         if current_line.endswith("\n"):
-            GENERATED_TEXT += current_line
             messages.append(gr.ChatMessage(role="assistant", content=current_line))
             current_line = ""
         yield messages
     yield messages
     logging.info("---> Formated prompt")
-    formatted_prompt = generate_mapping_prompt(GENERATED_TEXT)
     logging.info(formatted_prompt)
     prompt_messages = [{"role": "user", "content": formatted_prompt}]
     yield messages + [
@@ -212,32 +264,8 @@ def generate_cells(dataset_id):
     yield messages
-def write_notebook_file(dataset_id, history):
-    if not GENERATED_TEXT:
-        raise Exception("No generated notebook")
-    commands = get_txt_from_output(GENERATED_TEXT)
-    html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
-    # Adding dataset viewer on the first part
-    commands.insert(
-        0,
-        {
-            "cell_type": "code",
-            "source": f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
-        },
-    )
-    commands.insert(0, {"cell_type": "markdown", "source": "# Dataset Viewer"})
-    notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
-    create_notebook_file(commands, notebook_name=notebook_name)
-    history.append(
-        gr.ChatMessage(role="user", content="Here is the generated notebook")
-    )
-    history.append(
-        gr.ChatMessage(
-            role="user",
-            content=FileData(path=notebook_name, mime_type="application/x-ipynb+json"),
-        )
-    )
-    return history
 with gr.Blocks(fill_height=True) as demo:
@@ -267,8 +295,8 @@ with gr.Blocks(fill_height=True) as demo:
             with gr.Row():
                 generate_eda_btn = gr.Button("Generate EDA notebook")
                 generate_training_btn = gr.Button("Generate Training notebook")
-                generate_rag_btn = gr.Button("Generate RAG notebook")
         with gr.Column():
             chatbot = gr.Chatbot(
                 label="Results",
@@ -278,47 +306,31 @@ with gr.Blocks(fill_height=True) as demo:
                     None,
                 ),
             )
     generate_eda_btn.click(
-        generate_cells,
         inputs=[dataset_name],
-        outputs=[chatbot],
     )
-    # with gr.Row(visible=False) as auth_page:
-    #     with gr.Column():
-    #         gr.Markdown(
-    #             "Want to push to hub? Enter your token ([settings](https://huggingface.co/settings/tokens)):"
-    #         )
-    #         token_box = gr.Textbox(
-    #             "", label="token", placeholder="hf_xxx", type="password"
-    #         )
-    #         auth_error = gr.Markdown("", visible=False)
-    # push_btn = gr.Button("Push notebook to hub", visible=False)
-    # output_lbl = gr.HTML(value="", visible=False)
-    # def auth(token):
-    #     if not token:
-    #         return {
-    #             auth_error: gr.Markdown(value="", visible=False),
-    #             push_btn: gr.Button(visible=False),
-    #         }
-    #     return {
-    #         auth_error: gr.Markdown(value="", visible=False),
-    #         push_btn: gr.Button("Push notebook to hub", visible=True),
-    #     }
-    # token_box.change(
-    #     auth,
-    #     inputs=token_box,
-    #     outputs=[auth_error, push_btn],
-    # )
-    # push_btn.click(
-    #     push_notebook,
-    #     inputs=[dataset_name, token_box],
-    #     outputs=output_lbl,
-    # )
 demo.launch()

 import re
 import pandas as pd
 from gradio.data_classes import FileData
+from utils.prompts import (
+    generate_mapping_prompt,
+    generate_eda_prompt,
+    generate_embedding_prompt,
+)
 """
 TODOs:
 # Configuration
 BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
 HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
 client = Client(headers=HEADERS)
 inference_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
     return match.group(1)
+def generate_eda_cells(dataset_id):
+    """Stream EDA notebook generation and reveal the push button when done.
+
+    Yields ``(messages, push_button_update, notebook_path)`` tuples, one per
+    item produced by ``generate_cells(dataset_id, generate_eda_prompt)``.
+    While generation is in progress the button update keeps it hidden and the
+    notebook path is ``None``; the last tuple makes the button visible and
+    carries the notebook file name derived from ``dataset_id``.
+    """
+    produced = False
+    for messages in generate_cells(dataset_id, generate_eda_prompt):
+        produced = True
+        yield messages, gr.update(visible=False), None  # Keep button hidden
+    if not produced:
+        # generate_cells ended without yielding anything, so `messages` was
+        # never bound; the final yield would raise UnboundLocalError and
+        # there is no notebook to offer anyway.
+        return
+    yield messages, gr.update(visible=True), f"{dataset_id.replace('/', '-')}.ipynb"
+def generate_embedding_cells(dataset_id):
+    """Stream embeddings notebook generation and reveal the push button when done.
+
+    Yields ``(messages, push_button_update, notebook_path)`` tuples, one per
+    item produced by ``generate_cells(dataset_id, generate_embedding_prompt)``.
+    While generation is in progress the button update keeps it hidden and the
+    notebook path is ``None``; the last tuple makes the button visible and
+    carries the notebook file name derived from ``dataset_id``.
+    """
+    produced = False
+    for messages in generate_cells(dataset_id, generate_embedding_prompt):
+        produced = True
+        yield messages, gr.update(visible=False), None  # Keep button hidden
+    if not produced:
+        # generate_cells ended without yielding anything, so `messages` was
+        # never bound; the final yield would raise UnboundLocalError and
+        # there is no notebook to offer anyway.
+        return
+    yield messages, gr.update(visible=True), f"{dataset_id.replace('/', '-')}.ipynb"
+def push_to_hub(
+    history,
+    dataset_id,
+    notebook_file,
+    profile: gr.OAuthProfile | None,
+    oauth_token: gr.OAuthToken | None,
+):
+    """Upload the generated notebook to the dataset repo and report it in the chat.
+
+    Args:
+        history: Current chat messages; each yield is ``history`` plus one
+            assistant message.
+        dataset_id: Target dataset repo id on the Hub.
+        notebook_file: Local path (or file object) of the notebook to upload.
+        profile: Logged-in user's profile, or ``None`` when not logged in.
+            NOTE(review): not listed in the click ``inputs``; presumably
+            injected by Gradio from the type hint - confirm.
+        oauth_token: Logged-in user's OAuth token, or ``None`` when not
+            logged in (same injection assumption as ``profile``).
+
+    Yields:
+        The chat history extended with a login prompt, a link to the uploaded
+        notebook, or the upload error text.
+    """
+    logging.info(f"Pushing notebook to hub: {dataset_id} on file {notebook_file}")
+    if not profile or not oauth_token:
+        yield history + [
+            gr.ChatMessage(role="assistant", content="⏳ _Login to push to hub..._")
+        ]
+        # Stop here: without a token the upload cannot run, and reading
+        # oauth_token.token on None would raise AttributeError.
+        return
+    # The token is a credential, so only the profile is logged.
+    logging.info(f"Profile: {profile}")
+    notebook_name = "dataset_analysis.ipynb"
+    api = HfApi(token=oauth_token.token)
+    try:
+        logging.info(f"About to push {notebook_file} - {notebook_name} - {dataset_id}")
+        api.upload_file(
+            path_or_fileobj=notebook_file,
+            path_in_repo=notebook_name,
+            repo_id=dataset_id,
+            repo_type="dataset",
+        )
+        link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_name}"
+        logging.info(f"Notebook pushed to hub: {link}")
+        yield history + [
+            gr.ChatMessage(
+                role="assistant", content=f"[Here is the generated notebook]({link})"
+            )
+        ]
+    except Exception as err:
+        # logging.exception records the traceback; the chat message needs the
+        # error text, not the exception object.
+        logging.exception("Failed to push notebook")
+        yield history + [gr.ChatMessage(role="assistant", content=str(err))]
+def generate_cells(dataset_id, prompt_fn):
     try:
         libraries = get_compatible_libraries(dataset_id)
     except Exception as err:
     logging.info(f"First split file: {first_file}")
     features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
     sample_data = df.head(5).to_dict(orient="records")
+    prompt = prompt_fn(features, sample_data, first_code)
     messages = [gr.ChatMessage(role="user", content=prompt)]
     yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")]
         messages=prompt_messages, stream=True, max_tokens=2500
     )
+    generated_text = ""
     current_line = ""
     for chunk in output:
         current_line += chunk.choices[0].delta.content
         if current_line.endswith("\n"):
+            generated_text += current_line
             messages.append(gr.ChatMessage(role="assistant", content=current_line))
             current_line = ""
         yield messages
     yield messages
     logging.info("---> Formated prompt")
+    formatted_prompt = generate_mapping_prompt(generated_text)
     logging.info(formatted_prompt)
     prompt_messages = [{"role": "user", "content": formatted_prompt}]
     yield messages + [
     yield messages
+def comming_soon_message():
+    """Show an informational Gradio notice that the feature is not available yet."""
+    # The function name keeps its original spelling because the click handler
+    # refers to it; only the user-facing text is corrected.
+    gr.Info("Coming soon")
 with gr.Blocks(fill_height=True) as demo:
             with gr.Row():
                 generate_eda_btn = gr.Button("Generate EDA notebook")
+                generate_embedding_btn = gr.Button("Generate Embeddings notebook")
                 generate_training_btn = gr.Button("Generate Training notebook")
         with gr.Column():
             chatbot = gr.Chatbot(
                 label="Results",
                     None,
                 ),
             )
+            with gr.Row():
+                login_btn = gr.LoginButton()
+                push_btn = gr.Button("Push to hub", visible=False)
+    notebook_file = gr.File(visible=False)
     generate_eda_btn.click(
+        generate_eda_cells,
         inputs=[dataset_name],
+        outputs=[chatbot, push_btn, notebook_file],
     )
+    generate_embedding_btn.click(
+        generate_embedding_cells,
+        inputs=[dataset_name],
+        outputs=[chatbot, push_btn, notebook_file],
+    )
+    generate_training_btn.click(comming_soon_message, inputs=[], outputs=[])
+    push_btn.click(
+        push_to_hub,
+        inputs=[
+            chatbot,
+            dataset_name,
+            notebook_file,
+        ],
+        outputs=[chatbot],
+    )
 demo.launch()

utils/prompts.py CHANGED Viewed

@@ -6,7 +6,7 @@ def generate_mapping_prompt(code):
     """Format the following python code to a list of cells to be used in a jupyter notebook:
     {{ code }}
-    The output should be a markdown code snippet formatted in the
     following schema, including the leading and trailing "```json" and "```":
     ```json
@@ -44,4 +44,40 @@ def generate_eda_prompt(columns_info, sample_data, first_code):
     {{ first_code }}
     """

     """Format the following python code to a list of cells to be used in a jupyter notebook:
     {{ code }}
+    The output should be a list of json objects with the
     following schema, including the leading and trailing "```json" and "```":
     ```json
     {{ first_code }}
+    The output should be a markdown python code snippet between the leading and trailing "```python" and "```".
+    """
+# Prompt asking the model for a notebook that builds embeddings from a dataset.
+# NOTE: the docstring below is the prompt text itself, not documentation.
+# Presumably `outlines.prompt` renders it as a template and fills
+# `{{ columns_info }}`, `{{ sample_data }}` and `{{ first_code }}` from the
+# arguments of the same names - confirm against the outlines docs.
+# Editing the docstring changes what is sent to the model.
+@outlines.prompt
+def generate_embedding_prompt(columns_info, sample_data, first_code):
+    """You are an expert data scientist tasked with generating a Jupyter notebook to generate embeddings from a dataset.
+    The data is provided as a pandas DataFrame with the following structure:
+    Columns and Data Types:
+    {{ columns_info }}
+    Sample Data:
+    {{ sample_data }}
+    Please create a notebook that includes the following:
+    1. Load the dataset
+    2. Load embedding model using sentence-transformers library
+    3. Convert data into embeddings
+    4. Store embeddings
+    Ensure the notebook is well-organized, with explanations for each step.
+    It is mandatory that you use the following code to load the dataset, DO NOT try to load the dataset in any other way:
+    {{ first_code }}
+    """
+# Placeholder for the training-notebook prompt: the template body is still TODO.
+# The signature mirrors generate_embedding_prompt; the docstring is the prompt
+# text, so replace "TODO" with the real template rather than documenting here.
+@outlines.prompt
+def generate_training_prompt(columns_info, sample_data, first_code):
+    """
+    TODO
     """