Spaces:

asoria
/

auto-notebook-creator

Running

App Files Files Community

asoria committed on Aug 30, 2024

Commit

e62a0e5

1 Parent(s): 93c417c

Add code

Browse files

Files changed (4) hide show

README.md +5 -5
requirements.txt +6 -0
utils/__init__.py +0 -0
utils/notebook_utils.py +184 -0

README.md CHANGED Viewed

@@ -1,10 +1,10 @@
 ---
-title: Auto Notebook Creator
-emoji: 💻
-colorFrom: red
-colorTo: red
 sdk: gradio
-sdk_version: 4.42.0
 app_file: app.py
 pinned: false
 ---

 ---
+title: Auto notebook creator
+emoji: 📔
+colorFrom: yellow
+colorTo: yellow
 sdk: gradio
+sdk_version: 4.39.0
 app_file: app.py
 pinned: false
 ---

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio_huggingfacehub_search==0.0.7
+huggingface_hub
+nbformat
+httpx
+outlines
+python-dotenv

utils/__init__.py ADDED Viewed

File without changes

utils/notebook_utils.py ADDED Viewed

	@@ -0,0 +1,184 @@

+def replace_wildcards(templates, wildcards, replacements):
+    """Return copies of the template cells with every wildcard substituted.
+
+    Args:
+        templates: Iterable of cell dicts; each must provide the
+            "cell_type" and "source" keys.
+        wildcards: Sequence of placeholder strings to search for in each
+            cell's source text.
+        replacements: Sequence of replacement strings, paired with
+            ``wildcards`` by position.
+
+    Returns:
+        A new list of dicts holding only "cell_type" and "source"; the
+        input cells themselves are left untouched.
+
+    Raises:
+        ValueError: If ``wildcards`` and ``replacements`` differ in length.
+    """
+    if len(wildcards) != len(replacements):
+        raise ValueError(
+            "The number of wildcards must match the number of replacements."
+        )
+
+    def _render(cell):
+        # Pairs are applied in order, so a later pair also sees any text
+        # inserted by an earlier one.
+        source = cell["source"]
+        for placeholder, value in zip(wildcards, replacements):
+            source = source.replace(placeholder, value)
+        return {"cell_type": cell["cell_type"], "source": source}
+
+    return [_render(cell) for cell in templates]
+# Starter cells for the RAG notebook: a markdown title cell followed by an
+# empty code cell.
+rag_cells = [
+    dict(
+        cell_type="markdown",
+        source="# Retrieval-Augmented Generation (RAG) System Notebook",
+    ),
+    dict(cell_type="code", source=""),
+]
+# Starter cells for the embeddings notebook: a markdown title cell followed
+# by an empty code cell.
+# NOTE(review): the name is a misspelling of "embeddings"; it is kept as-is
+# because other modules may import it under this spelling.
+embeggins_cells = [
+    dict(cell_type="markdown", source="# Embeddings Generation Notebook"),
+    dict(cell_type="code", source=""),
+]
+# Template cells for the exploratory-data-analysis notebook.
+#
+# The brace-delimited tokens {dataset_name}, {html_code} and {first_code}
+# are literal placeholder text inside the templates (presumably filled in
+# through replace_wildcards -- confirm against the caller).
+# Every code template begins and ends with a newline character; the
+# strings are spelled out piecewise so that whitespace is explicit.
+eda_cells = [
+    {
+        "cell_type": "markdown",
+        "source": (
+            "# Exploratory Data Analysis (EDA) Notebook"
+            " for {dataset_name} dataset"
+        ),
+    },
+    {
+        "cell_type": "code",
+        "source": (
+            "\n"
+            "from IPython.display import HTML\n"
+            'display(HTML("{html_code}"))\n'
+        ),
+    },
+    {
+        "cell_type": "code",
+        "source": (
+            "\n"
+            "# 1. Install and import necessary libraries.\n"
+            "!pip install pandas matplotlib seaborn\n"
+        ),
+    },
+    {
+        "cell_type": "code",
+        "source": (
+            "\n"
+            "import pandas as pd\n"
+            "import matplotlib.pyplot as plt\n"
+            "import seaborn as sns\n"
+        ),
+    },
+    {
+        "cell_type": "code",
+        "source": (
+            "\n"
+            "# 2. Load the dataset as a DataFrame using the provided code\n"
+            "{first_code}\n"
+        ),
+    },
+    {
+        "cell_type": "code",
+        "source": (
+            "\n"
+            "# 3. Understand the dataset structure\n"
+            "print(df.head())\n"
+            "print(df.info())\n"
+            "print(df.describe())\n"
+        ),
+    },
+    {
+        "cell_type": "code",
+        "source": (
+            "\n"
+            "# 4. Check for missing values\n"
+            "print(df.isnull().sum())\n"
+        ),
+    },
+    {
+        "cell_type": "code",
+        "source": (
+            "\n"
+            "# 5. Identify data types of each column\n"
+            "print(df.dtypes)\n"
+        ),
+    },
+    {
+        "cell_type": "code",
+        "source": (
+            "\n"
+            "# 6. Detect duplicated rows\n"
+            "print(df.duplicated().sum())\n"
+        ),
+    },
+    {
+        "cell_type": "code",
+        "source": (
+            "\n"
+            "# 7. Generate descriptive statistics\n"
+            "print(df.describe())\n"
+        ),
+    },
+    {
+        "cell_type": "code",
+        "source": (
+            "\n"
+            "# 8. Visualize the distribution of each column.\n"
+            "# TODO: Add code to visualize the distribution of each column.\n"
+            "# 9. Explore relationships between columns.\n"
+            "# TODO: Add code to explore relationships between columns.\n"
+            "# 10. Perform correlation analysis.\n"
+            "# TODO: Add code to perform correlation analysis.\n"
+        ),
+    },
+]
+def generate_embedding_system_prompt():
+    """Return the system prompt for generating an embeddings notebook.
+
+    The prompt text is returned explicitly rather than being kept as the
+    function's docstring, so callers receive a string instead of ``None``.
+
+    Returns:
+        str: The prompt, one instruction per line, without leading
+        indentation and ending in a single newline.
+    """
+    # Local import: no module-level import block is visible in this file.
+    import textwrap
+
+    # The trailing backslash keeps the prompt from starting with a blank
+    # line; dedent strips the indentation that only exists for layout.
+    return textwrap.dedent(
+        """\
+        You are an expert data scientist tasked with creating a Jupyter notebook to generate embeddings for a specific dataset.
+        Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, and 'faiss-cpu' to create the index.
+        The notebook should include:
+        1. Install necessary libraries with !pip install.
+        2. Import libraries.
+        3. Load the dataset as a DataFrame using the provided code.
+        4. Select the column to generate embeddings.
+        5. Remove duplicate data.
+        6. Convert the selected column to a list.
+        7. Load the sentence-transformers model.
+        8. Create a FAISS index.
+        9. Encode a query sample.
+        10. Search for similar documents using the FAISS index.
+        Ensure the notebook is well-organized with explanations for each step.
+        The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
+        The user will provide dataset information in the following format:
+        ## Columns and Data Types
+        ## Sample Data
+        ## Loading Data code
+        Use the provided code to load the dataset; do not use any other method.
+        """
+    )
+def generate_rag_system_prompt():
+    """Return the system prompt for generating a RAG notebook.
+
+    The prompt text is returned explicitly rather than being kept as the
+    function's docstring, so callers receive a string instead of ``None``.
+
+    Returns:
+        str: The prompt, one instruction per line, without leading
+        indentation and ending in a single newline.
+    """
+    # Local import: no module-level import block is visible in this file.
+    import textwrap
+
+    # The trailing backslash keeps the prompt from starting with a blank
+    # line; dedent strips the indentation that only exists for layout.
+    return textwrap.dedent(
+        """\
+        You are an expert machine learning engineer tasked with creating a Jupyter notebook to demonstrate a Retrieval-Augmented Generation (RAG) system using a specific dataset.
+        The dataset is provided as a pandas DataFrame.
+        Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index, and 'transformers' for inference.
+        The RAG notebook should include:
+        1. Install necessary libraries.
+        2. Import libraries.
+        3. Load the dataset as a DataFrame using the provided code.
+        4. Select the column for generating embeddings.
+        5. Remove duplicate data.
+        6. Convert the selected column to a list.
+        7. Load the sentence-transformers model.
+        8. Create a FAISS index.
+        9. Encode a query sample.
+        10. Search for similar documents using the FAISS index.
+        11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline.
+        12. Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query.
+        13. Send the prompt to the pipeline and display the answer.
+        Ensure the notebook is well-organized with explanations for each step.
+        The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
+        The user will provide the dataset information in the following format:
+        ## Columns and Data Types
+        ## Sample Data
+        ## Loading Data code
+        Use the provided code to load the dataset; do not use any other method.
+        """
+    )