File size: 2,364 Bytes
f92d1a9 839621c f92d1a9 839621c f92d1a9 839621c f92d1a9 839621c f92d1a9 8c543d4 f92d1a9 f474299 8c543d4 839621c f92d1a9 839621c f92d1a9 839621c f92d1a9 839621c f92d1a9 839621c f92d1a9 839621c f92d1a9 839621c f92d1a9 839621c f92d1a9 839621c f92d1a9 839621c f92d1a9 839621c f92d1a9 839621c f92d1a9 839621c f92d1a9 839621c f92d1a9 811432c f92d1a9 811432c f92d1a9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import streamlit as st
from defaults import (
PROJECT_NAME,
ARGILLA_SPACE_REPO_ID,
DATASET_REPO_ID,
ARGILLA_URL,
PROJECT_SPACE_REPO_ID,
DIBT_PARENT_APP_URL,
)
from utils import project_sidebar
# Landing page of the "Domain Data Grower" Streamlit app: configures the page,
# draws the project sidebar, and walks the user through the four-step process
# with links to the sub-pages and to Argilla.
#
# NOTE(review): the emoji literals in this script arrived mis-decoded (UTF-8
# bytes read through a single-byte Greek code page, control bytes dropped).
# Glyphs whose trailing bytes survived were reconstructed as:
#   farmer          U+1F9D1 U+200D U+1F33E
#   seedling        U+1F331
#   world map       U+1F5FA U+FE0F
#   woman scientist U+1F469 U+1F3FC U+200D U+1F52C
# Confirm them against the real file names under pages/ before shipping.
# Three glyphs survive only as a bare "π" and could not be recovered; they are
# left untouched and flagged inline below.

# Browser tab title and icon.
st.set_page_config("Domain Data Grower", page_icon="🧑‍🌾")

# Sidebar helper imported from utils (implementation not visible here).
project_sidebar()

# Guard: while the project name is still the placeholder value, show a warning
# and halt the script so none of the content below is rendered.
if PROJECT_NAME == "DEFAULT_DOMAIN":
    st.warning(
        "Please set up the project configuration in the parent app before proceeding."
    )
    st.stop()

st.header("🧑‍🌾 Domain Data Grower")
st.divider()

# --- Introduction -----------------------------------------------------------
st.markdown(
    """
## 🌱 Create a dataset seed for aligning models to a specific domain
This app helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
Alignment datasets are used to fine-tune models to a specific domain or task, but as yet, there's a shortage of diverse datasets for this purpose.
"""
)

# NOTE(review): the heading glyph below was lost in transit (only "π" is left);
# restore the intended emoji.
st.markdown(
    """
## π How it works
You can create a dataset seed by defining the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
The dataset seed is then used to generate synthetic data for training a language model.
"""
)

# --- Step 1: project setup (struck through; handled by the parent app) -------
st.markdown(
    """
## 🗺️ The process
### Step 1: ~~Setup the project~~
~~Define the project details, including the project name, domain, and API credentials. Create Dataset Repo on the Hub.~~
"""
)
# NOTE(review): leading glyph of this label was lost in transit; restore it.
st.link_button("π ~~Setup Project via the parent app~~", DIBT_PARENT_APP_URL)

# --- Step 2: describe the domain ---------------------------------------------
st.markdown(
    """
### Step 2: Describe the Domain
Define the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
You can collaborate with domain experts to define the domain expertise and perspectives.
"""
)
# The path must match the page file on disk exactly, emoji included.
st.page_link(
    "pages/2_👩🏼‍🔬 Describe Domain.py",
    label="Describe Domain",
    icon="👩🏼‍🔬",
)

# --- Step 3: generate synthetic data -----------------------------------------
st.markdown(
    """
### Step 3: Generate Synthetic Data
Use distilabel to generate synthetic data for your domain-specific dataset.
You can run the pipeline locally or in this space to generate synthetic data.
"""
)
# The path must match the page file on disk exactly, emoji included.
st.page_link(
    "pages/3_🌱 Generate Dataset.py",
    label="Generate Dataset",
    icon="🌱",
)

# --- Step 4: review in Argilla -----------------------------------------------
st.markdown(
    """
### Step 4: Review the Dataset
Use Argilla to review the generated synthetic data and provide feedback on the quality of the data.
"""
)
# NOTE(review): leading glyph of this label was lost in transit; restore it.
st.link_button("π Review the dataset in Argilla", ARGILLA_URL)
|