# yoga_nistra_config_space/pages/3_🌱 Generate Dataset.py
import streamlit as st
from defaults import ARGILLA_URL
from utils import project_sidebar
st.set_page_config(
    page_title="Domain Data Grower",
    page_icon="🧑‍🌾",
)
project_sidebar()
################################################################################
# HEADER
################################################################################
st.header("🧑‍🌾 Domain Data Grower")
st.divider()
st.subheader("Step 3. Run the pipeline to generate synthetic data")
st.write("Define the distilabel pipeline for generating the dataset.")
################################################################################
# CONFIGURATION
################################################################################
hub_username = st.session_state.get("hub_username")
project_name = st.session_state.get("project_name")
hub_token = st.session_state.get("hub_token")
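
# hub_username, project_name, and hub_token are presumably set in session state
# by the earlier setup steps; if any is missing, .get() returns None and the
# generated command below would contain "None" placeholders.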
st.divider()
st.markdown("#### 🤖 Inference configuration")
st.write(
    "Add the URL of the Hugging Face Inference API or endpoint that your "
    "pipeline should use. You can find compatible models here:"
)
with st.expander("🤗 Recommended Models"):
    st.write("All Inference Endpoint-compatible models can be found via the link below.")
    st.link_button(
        "🤗 Inference-compatible models on the Hub",
        "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
    )
    st.write("🔋 Projects with sufficient resources could take advantage of Llama 3 70B")
    st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B")
    st.write("🪫 Projects with fewer resources could take advantage of Llama 3 8B")
    st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B")
    st.write("🍃 Projects with even fewer resources could take advantage of Phi-2")
    st.code("https://api-inference.huggingface.co/models/microsoft/phi-2")
    st.write("Note: Hugging Face Pro gives access to more compute resources.")
    st.link_button(
        "🤗 Hugging Face Pro",
        "https://huggingface.co/pricing",
    )
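
# The default below points at Zephyr 7B Beta on the hosted Inference API, which
# should work out of the box without a dedicated (paid) Inference Endpoint.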
base_url = st.text_input(
    label="Base URL for the Inference API",
    value="https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta",
)
st.divider()
st.markdown("#### 🔬 Argilla API details to push the generated dataset")
argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
argilla_dataset_name = st.text_input("Argilla Dataset Name", project_name)
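
# "owner.apikey" is Argilla's default owner API key for quickstart/dev
# deployments; replace it when pointing at a production Argilla instance.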
st.divider()
################################################################################
# LOCAL
################################################################################
st.markdown("## Run the pipeline")
st.markdown(
    "Once you've defined the pipeline configuration above, you can run the pipeline from your local machine."
)
if all(
    [
        argilla_api_key,
        argilla_url,
        base_url,
        hub_token,
        hub_username,
        project_name,
        argilla_dataset_name,
    ]
):
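    # All required fields are filled in; render the copy-paste run instructions.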
    st.markdown(
        "To run the pipeline locally, you need to have the `distilabel` library installed. You can install it using the following command:"
    )
    st.code(
        """
# Install the distilabel library
pip install git+https://github.com/argilla-io/distilabel.git
""",
        language="bash",
    )
    st.markdown("Next, you'll need to clone your dataset repo and run the pipeline:")
    st.code(
        f"""
# Clone the project and install the requirements
git clone https://huggingface.co/datasets/{hub_username}/{project_name}
cd {project_name}
pip install -r requirements.txt

# Run the pipeline
python pipeline.py \\
    --argilla-api-key {argilla_api_key} \\
    --argilla-api-url {argilla_url} \\
    --argilla-dataset-name {argilla_dataset_name} \\
    --endpoint-base-url {base_url} \\
    --hub-token {hub_token}
""",
        language="bash",
    )
    st.markdown(
        "👩‍🚀 If you want to customise the pipeline, take a look at `pipeline.py` and the [distilabel docs](https://distilabel.argilla.io/)."
    )
else:
    st.info("Please fill in all the required fields.")