File size: 2,364 Bytes
8773ff3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import streamlit as st
from defaults import (
PROJECT_NAME,
ARGILLA_SPACE_REPO_ID,
DATASET_REPO_ID,
ARGILLA_URL,
PROJECT_SPACE_REPO_ID,
DIBT_PARENT_APP_URL,
)
from utils import project_sidebar
# Page-level setup: set the browser-tab title and icon, then draw the project
# sidebar (``project_sidebar`` comes from ``utils``; its contents are not
# visible from this file).
# The icon is the farmer emoji; it had been mis-decoded into Greek-letter
# residue, which is not an emoji.
st.set_page_config("Domain Data Grower", page_icon="🧑‍🌾")
project_sidebar()

# PROJECT_NAME equal to "DEFAULT_DOMAIN" is treated as "project not configured
# yet" (presumably the placeholder value shipped in ``defaults`` — verify).
if PROJECT_NAME == "DEFAULT_DOMAIN":
    st.warning(
        "Please set up the project configuration in the parent app before proceeding."
    )
    # Halt this script run so nothing below is rendered.
    st.stop()
# Page title followed by the opening section that states what the app is for.
# The farmer and seedling emoji below were restored from mis-decoded text.
st.header("🧑‍🌾 Domain Data Grower")
st.divider()
st.markdown(
    """
## 🌱 Create a dataset seed for aligning models to a specific domain
This app helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
Alignment datasets are used to fine-tune models to a specific domain or task, but as yet, there's a shortage of diverse datasets for this purpose.
"""
)
# "How it works" section: a short description of what a dataset seed consists
# of and what it is used for.
# NOTE(review): the "π" after "##" looks like the remains of a mis-decoded
# emoji. The original character cannot be recovered from this file alone, so
# the text is kept exactly as found — restore it from version control.
_HOW_IT_WORKS_MD = """
## π How it works
You can create a dataset seed by defining the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
The dataset seed is then used to generate synthetic data for training a language model.
"""
st.markdown(_HOW_IT_WORKS_MD)
# Process overview heading plus Step 1. Step 1 is written with Markdown
# strike-through (~~...~~) and its button links out to the parent app
# (presumably because project setup now happens there — confirm).
# The map emoji in the heading was restored from mis-decoded text.
st.markdown(
    """
## 🗺️ The process
### Step 1: ~~Setup the project~~
~~Define the project details, including the project name, domain, and API credentials. Create Dataset Repo on the Hub.~~
"""
)
# NOTE(review): the leading "π" in the label looks like the remains of a
# mis-decoded emoji; it cannot be recovered from this file alone, so the label
# is kept exactly as found.
st.link_button("π ~~Setup Project via the parent app~~", DIBT_PARENT_APP_URL)
# Step 2: explain the domain-description step and link to its page.
st.markdown(
    """
### Step 2: Describe the Domain
Define the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
You can collaborate with domain experts to define the domain expertise and perspectives.
"""
)
# The page path and the icon both carry the woman-scientist emoji, restored
# here from mis-decoded text: the garbled form is not an emoji and would not
# name the page file.
# NOTE(review): the path must match the file name under ``pages/`` exactly —
# confirm against the repository.
st.page_link(
    "pages/2_👩🏼‍🔬 Describe Domain.py",
    label="Describe Domain",
    icon="👩🏼‍🔬",
)
# Step 3: explain the synthetic-data generation step and link to its page.
st.markdown(
    """
### Step 3: Generate Synthetic Data
Use distilabel to generate synthetic data for your domain-specific dataset.
You can run the pipeline locally or in this space to generate synthetic data.
"""
)
# The page path and the icon both carry the seedling emoji, restored here from
# mis-decoded text: the garbled form is not an emoji and would not name the
# page file.
# NOTE(review): the path must match the file name under ``pages/`` exactly —
# confirm against the repository.
st.page_link(
    "pages/3_🌱 Generate Dataset.py",
    label="Generate Dataset",
    icon="🌱",
)
# Step 4: describe the review step, then offer a button that opens ARGILLA_URL.
_REVIEW_STEP_MD = """
### Step 4: Review the Dataset
Use Argilla to review the generated synthetic data and provide feedback on the quality of the data.
"""
st.markdown(_REVIEW_STEP_MD)
# NOTE(review): the leading "π" in the label looks like the remains of a
# mis-decoded emoji; it cannot be recovered from this file alone, so the label
# is kept exactly as found.
st.link_button(label="π Review the dataset in Argilla", url=ARGILLA_URL)
|