# auto-notebook-creator / utils / notebook_utils.py
# Source: Hugging Face Space by asoria (commit e62a0e5, "Add code").
def replace_wildcards(templates, wildcards, replacements):
    """Return copies of *templates* with each wildcard substituted.

    Every occurrence of ``wildcards[i]`` found in a template's ``source``
    text is replaced with ``replacements[i]``. The input templates are not
    mutated; a new list of cell dicts is returned.

    Args:
        templates: iterable of cell dicts with "cell_type" and "source" keys.
        wildcards: placeholder strings to search for.
        replacements: strings substituted for the matching wildcard.

    Returns:
        A new list of ``{"cell_type": ..., "source": ...}`` dicts.

    Raises:
        ValueError: if the two substitution lists differ in length.
    """
    if len(wildcards) != len(replacements):
        raise ValueError(
            "The number of wildcards must match the number of replacements."
        )
    # Pair the substitutions once instead of re-zipping per template.
    substitutions = list(zip(wildcards, replacements))
    rendered = []
    for cell in templates:
        text = cell["source"]
        for placeholder, value in substitutions:
            text = text.replace(placeholder, value)
        rendered.append({"cell_type": cell["cell_type"], "source": text})
    return rendered
# Seed cells for the RAG notebook: a markdown title cell plus an empty
# code cell that downstream generation code is expected to fill in.
rag_cells = [
    {
        "cell_type": "markdown",
        "source": "# Retrieval-Augmented Generation (RAG) System Notebook",
    },
    {"cell_type": "code", "source": ""},
]
# Seed cells for the embeddings notebook: a markdown title cell plus an
# empty code cell to be filled in later.
# NOTE(review): the name "embeggins" is a typo for "embeddings"; it is kept
# as-is because other modules may import this name — fix file-wide if ever
# renamed.
embeggins_cells = [
    {
        "cell_type": "markdown",
        "source": "# Embeddings Generation Notebook",
    },
    {"cell_type": "code", "source": ""},
]
# Template cells for the Exploratory Data Analysis notebook. The cell
# sources contain brace-style placeholders — {dataset_name}, {html_code},
# {first_code} — which are plain text here and are presumably substituted
# by replace_wildcards() before the notebook is written (TODO: confirm
# against the caller).
eda_cells = [
    {
        "cell_type": "markdown",
        "source": "# Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset",
    },
    {
        # Embeds a dataset-viewer HTML snippet ({html_code}) into the notebook.
        "cell_type": "code",
        "source": """
from IPython.display import HTML
display(HTML("{html_code}"))
""",
    },
    {
        "cell_type": "code",
        "source": """
# 1. Install and import necessary libraries.
!pip install pandas matplotlib seaborn
""",
    },
    {
        "cell_type": "code",
        "source": """
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
""",
    },
    {
        # {first_code} is replaced with dataset-specific loading code that
        # must define a DataFrame named `df` (the later cells rely on it).
        "cell_type": "code",
        "source": """
# 2. Load the dataset as a DataFrame using the provided code
{first_code}
""",
    },
    {
        "cell_type": "code",
        "source": """
# 3. Understand the dataset structure
print(df.head())
print(df.info())
print(df.describe())
""",
    },
    {
        "cell_type": "code",
        "source": """
# 4. Check for missing values
print(df.isnull().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# 5. Identify data types of each column
print(df.dtypes)
""",
    },
    {
        "cell_type": "code",
        "source": """
# 6. Detect duplicated rows
print(df.duplicated().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# 7. Generate descriptive statistics
print(df.describe())
""",
    },
    {
        # Placeholder cell: visualization/correlation steps are left as
        # TODOs inside the generated notebook itself.
        "cell_type": "code",
        "source": """
# 8. Visualize the distribution of each column.
# TODO: Add code to visualize the distribution of each column.
# 9. Explore relationships between columns.
# TODO: Add code to explore relationships between columns.
# 10. Perform correlation analysis.
# TODO: Add code to perform correlation analysis.
""",
    },
]
def generate_embedding_system_prompt():
    """You are an expert data scientist tasked with creating a Jupyter notebook to generate embeddings for a specific dataset.
    Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, and 'faiss-cpu' to create the index.
    The notebook should include:
    1. Install necessary libraries with !pip install.
    2. Import libraries.
    3. Load the dataset as a DataFrame using the provided code.
    4. Select the column to generate embeddings.
    5. Remove duplicate data.
    6. Convert the selected column to a list.
    7. Load the sentence-transformers model.
    8. Create a FAISS index.
    9. Encode a query sample.
    10. Search for similar documents using the FAISS index.
    Ensure the notebook is well-organized with explanations for each step.
    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
    The user will provide dataset information in the following format:
    ## Columns and Data Types
    ## Sample Data
    ## Loading Data code
    Use the provided code to load the dataset; do not use any other method.
    """
    # BUG FIX: the original body contained only the docstring, so calling
    # this function returned None. The prompt text lives in the docstring;
    # return it so callers get the prompt from a normal call while any code
    # reading .__doc__ keeps working unchanged.
    return generate_embedding_system_prompt.__doc__
def generate_rag_system_prompt():
    """You are an expert machine learning engineer tasked with creating a Jupyter notebook to demonstrate a Retrieval-Augmented Generation (RAG) system using a specific dataset.
    The dataset is provided as a pandas DataFrame.
    Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index, and 'transformers' for inference.
    The RAG notebook should include:
    1. Install necessary libraries.
    2. Import libraries.
    3. Load the dataset as a DataFrame using the provided code.
    4. Select the column for generating embeddings.
    5. Remove duplicate data.
    6. Convert the selected column to a list.
    7. Load the sentence-transformers model.
    8. Create a FAISS index.
    9. Encode a query sample.
    10. Search for similar documents using the FAISS index.
    11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline.
    12. Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query.
    13. Send the prompt to the pipeline and display the answer.
    Ensure the notebook is well-organized with explanations for each step.
    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
    The user will provide the dataset information in the following format:
    ## Columns and Data Types
    ## Sample Data
    ## Loading Data code
    Use the provided code to load the dataset; do not use any other method.
    """
    # BUG FIX: mirror of generate_embedding_system_prompt — the original
    # body was only a docstring and the call returned None. Return the
    # docstring so the prompt is available from a normal call; .__doc__
    # access remains valid for existing callers.
    return generate_rag_system_prompt.__doc__