burtenshaw (HF staff) committed on
Commit f92d1a9 • 1 Parent(s): 8c543d4

Upload 12 files

README.md CHANGED
@@ -1,8 +1,8 @@
 ---
-title: Domain Specific Welcome
-emoji: 🐒
-colorFrom: gray
-colorTo: purple
+title: Domain Specific Seed
+emoji: πŸ’»
+colorFrom: purple
+colorTo: red
 sdk: streamlit
 sdk_version: 1.33.0
 app_file: app.py
app.py CHANGED
@@ -1,118 +1,94 @@
-import time
+import streamlit as st

-from hub import (
-    setup_dataset_on_hub,
-    duplicate_space_on_hub,
-    add_project_config_to_space_repo,
+from defaults import (
+    PROJECT_NAME,
+    ARGILLA_SPACE_REPO_ID,
+    DATASET_REPO_ID,
+    ARGILLA_URL,
+    PROJECT_SPACE_REPO_ID,
+    DIBT_PARENT_APP_URL,
 )
+from utils import project_sidebar

-import streamlit as st
-
-
-# Constants
-# Written here to avoid defaults.py
-DEFAULT_DOMAIN = "farming"
-
-st.set_page_config(
-    "Domain Data Grower", page_icon="πŸ§‘β€πŸŒΎ", initial_sidebar_state="collapsed"
-)
+st.set_page_config("Domain Data Grower", page_icon="πŸ§‘β€πŸŒΎ")
+
+project_sidebar()
+
+if PROJECT_NAME == "DEFAULT_DOMAIN":
+    st.warning(
+        "Please set up the project configuration in the parent app before proceeding."
+    )
+    st.stop()

 st.header("πŸ§‘β€πŸŒΎ Domain Data Grower")
 st.divider()

-st.sidebar.link_button(
-    "πŸ€— Get your Hub Token", "https://huggingface.co/settings/tokens"
-)
-
-################################################################################
-# APP MARKDOWN
-################################################################################
-
-st.header("🌱 Create a domain specific dataset")
-
 st.markdown(
-    """This space will set up your domain specific dataset project. It will
-create the resources that you need to build a dataset. Those resources include:
-
-- A dataset repository on the Hub
-- Another space to define expert domain and run generation pipelines
-
-For a complete overview of the project. Check out the README
+    """
+    ## 🌱 Create a dataset seed for aligning models to a specific domain
+
+    This app helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
+    Alignment datasets are used to fine-tune models to a specific domain or task, but as yet, there's a shortage of diverse datasets for this purpose.
     """
 )
-
-st.page_link(
-    "pages/πŸ§‘β€πŸŒΎ Domain Data Grower.py",
-    label="Domain Data Grower",
-    icon="πŸ§‘β€πŸŒΎ",
-)
-
-################################################################################
-# CONFIGURATION
-################################################################################
-
-st.subheader("🌾 Project Configuration")
-
-project_name = st.text_input("Project Name", DEFAULT_DOMAIN)
-hub_username = st.text_input("Hub Username", "argilla")
-hub_token = st.text_input("Hub Token", type="password")
-private_selector = st.checkbox("Private Space", value=False)
-
-if st.button("πŸ€— Setup Project Resources"):
-    repo_id = f"{hub_username}/{project_name}"
-
-    setup_dataset_on_hub(
-        repo_id=repo_id,
-        hub_token=hub_token,
-    )
-
-    st.success(
-        f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name}). Hold on the repo_id: {repo_id}, we will need it in the next steps."
-    )
-
-    space_name = f"{project_name}_config_space"
-
-    duplicate_space_on_hub(
-        source_repo="argilla/domain-specific-datasets-template",
-        target_repo=space_name,
-        hub_token=hub_token,
-        private=private_selector,
-    )
-
-    st.success(
-        f"Configuration Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{space_name})."
-    )
-
-    argilla_name = f"{project_name}_argilla_space"
-
-    duplicate_space_on_hub(
-        source_repo="argilla/argilla-template-space",
-        target_repo=argilla_name,
-        hub_token=hub_token,
-        private=private_selector,
-    )
-
-    st.success(
-        f"Argilla Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{argilla_name})."
-    )
-
-    seconds = 5
-
-    with st.spinner(f"Adding project configuration to spaces in {seconds} seconds"):
-        time.sleep(seconds)
-        add_project_config_to_space_repo(
-            dataset_repo_id=repo_id,
-            hub_token=hub_token,
-            project_name=project_name,
-            argilla_space_repo_id=f"{hub_username}/{argilla_name}",
-            project_space_repo_id=f"{hub_username}/{space_name}",
-        )
-
-st.subheader("πŸ‘’ Next Steps")
-
-st.write("Go to you project specific space!")
-
-st.link_button(
-    "πŸ§‘β€πŸŒΎ Open Configuration Space",
-    f"https://huggingface.co/spaces/{hub_username}/{space_name}",
-)
+st.markdown(
+    """
+    ## 🚜 How it works
+
+    You can create a dataset seed by defining the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
+    The dataset seed is then used to generate synthetic data for training a language model.
+    """
+)
+st.markdown(
+    """
+    ## πŸ—ΊοΈ The process
+
+    ### Step 1: ~~Setup the project~~
+
+    ~~Define the project details, including the project name, domain, and API credentials. Create Dataset Repo on the Hub.~~
+    """
+)
+st.link_button("πŸš€ ~~Setup Project via the parent app~~", DIBT_PARENT_APP_URL)
+
+st.markdown(
+    """
+    ### Step 2: Describe the Domain
+
+    Define the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
+    You can collaborate with domain experts to define the domain expertise and perspectives.
+    """
+)
+
+st.page_link(
+    "pages/2_πŸ‘©πŸΌβ€πŸ”¬ Describe Domain.py",
+    label="Describe Domain",
+    icon="πŸ‘©πŸΌβ€πŸ”¬",
+)
+
+st.markdown(
+    """
+    ### Step 3: Generate Synthetic Data
+
+    Use distilabel to generate synthetic data for your domain-specific dataset.
+    You can run the pipeline locally or in this space to generate synthetic data.
+    """
+)
+
+st.page_link(
+    "pages/3_🌱 Generate Dataset.py",
+    label="Generate Dataset",
+    icon="🌱",
+)
+
+st.markdown(
+    """
+    ### Step 4: Review the Dataset
+
+    Use Argilla to review the generated synthetic data and provide feedback on the quality of the data.
+    """
+)
+st.link_button("πŸ” Review the dataset in Argilla", ARGILLA_URL)
 
defaults.py CHANGED
@@ -1,7 +1,49 @@
+import os
 import json

 SEED_DATA_PATH = "seed_data.json"
+PIPELINE_PATH = "pipeline.yaml"
+REMOTE_CODE_PATHS = ["defaults.py", "domain.py", "pipeline.py", "requirements.txt"]
+DIBT_PARENT_APP_URL = "https://argilla-domain-specific-datasets-welcome.hf.space/"
+N_PERSPECTIVES = 5
+N_TOPICS = 5
+N_EXAMPLES = 5
+# environment variables are strings, so parse the flag explicitly (defaults to enabled)
+CODELESS_DISTILABEL = os.environ.get("CODELESS_DISTILABEL", "true").lower() == "true"
+
+################################################
+# DEFAULTS ON FARMING
+################################################

 with open(SEED_DATA_PATH) as f:
     DEFAULT_DATA = json.load(f)
+
 DEFAULT_DOMAIN = DEFAULT_DATA["domain"]
+DEFAULT_PERSPECTIVES = DEFAULT_DATA["perspectives"]
+DEFAULT_TOPICS = DEFAULT_DATA["topics"]
+DEFAULT_EXAMPLES = DEFAULT_DATA["examples"]
+DEFAULT_SYSTEM_PROMPT = DEFAULT_DATA["domain_expert_prompt"]
+
+################################################
+# PROJECT CONFIG FROM PARENT APP
+################################################
+
+try:
+    with open("project_config.json") as f:
+        PROJECT_CONFIG = json.load(f)
+
+    PROJECT_NAME = PROJECT_CONFIG["project_name"]
+    ARGILLA_SPACE_REPO_ID = PROJECT_CONFIG["argilla_space_repo_id"]
+    DATASET_REPO_ID = PROJECT_CONFIG["dataset_repo_id"]
+    ARGILLA_SPACE_NAME = ARGILLA_SPACE_REPO_ID.replace("/", "-").replace("_", "-")
+    ARGILLA_URL = f"https://{ARGILLA_SPACE_NAME}.hf.space"
+    PROJECT_SPACE_REPO_ID = PROJECT_CONFIG["project_space_repo_id"]
+    DATASET_URL = f"https://huggingface.co/datasets/{DATASET_REPO_ID}"
+    HUB_USERNAME = DATASET_REPO_ID.split("/")[0]
+except FileNotFoundError:
+    PROJECT_NAME = "DEFAULT_DOMAIN"
+    ARGILLA_SPACE_REPO_ID = ""
+    DATASET_REPO_ID = ""
+    ARGILLA_URL = ""
+    PROJECT_SPACE_REPO_ID = ""
+    DATASET_URL = ""
+    HUB_USERNAME = ""
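Note on the derived URL above: a Space `owner/name` is served from a subdomain in which `/` and `_` are normalized to `-`. A minimal sketch, assuming the parent app named the Argilla Space `argilla/farming_argilla_space`:

    argilla_space_repo_id = "argilla/farming_argilla_space"  # assumed example
    space_name = argilla_space_repo_id.replace("/", "-").replace("_", "-")
    print(f"https://{space_name}.hf.space")
    # -> https://argilla-farming-argilla-space.hf.space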
domain.py ADDED
@@ -0,0 +1,89 @@
+import json
+from typing import Any, Dict, List
+
+from distilabel.steps.tasks.typing import ChatType
+from distilabel.steps.tasks.text_generation import TextGeneration
+from distilabel.steps import StepInput, StepOutput, Step
+
+from dotenv import load_dotenv
+
+from defaults import (
+    DEFAULT_DOMAIN,
+    DEFAULT_PERSPECTIVES,
+    DEFAULT_TOPICS,
+    DEFAULT_EXAMPLES,
+    DEFAULT_SYSTEM_PROMPT,
+    N_PERSPECTIVES,
+    N_TOPICS,
+    N_EXAMPLES,
+)
+
+load_dotenv()
+
+# Application description used for SelfInstruct
+APPLICATION_DESCRIPTION = f"""You are an AI assistant that generates queries around the domain of {DEFAULT_DOMAIN}.
+You should expect not basic but profound questions from your users.
+The queries should reflect a diversity of visions and economic and political positions.
+The queries may draw on different methods of {DEFAULT_DOMAIN}.
+The queries can be positioned politically, economically, socially, or practically.
+Also take into account the impact of diverse causes on diverse domains."""
+
+
+TOPICS = DEFAULT_TOPICS[:N_TOPICS]
+PERSPECTIVES = DEFAULT_PERSPECTIVES[:N_PERSPECTIVES]
+EXAMPLES = DEFAULT_EXAMPLES[:N_EXAMPLES]
+
+
+def create_examples_template(examples: List[Dict[str, str]]) -> str:
+    questions = """ Examples of high quality questions:"""
+    answers = """ Examples of high quality answers:"""
+    for example in examples:
+        questions += f"""\n- Question: {example["question"]}\n"""
+        answers += f"""\n- Answer: {example["answer"]}\n"""
+
+    _template: str = (
+        """{instruction}\nThis is the instruction.\n Examples: """
+        + questions
+        + answers
+    )
+    return _template
+
+
+def create_topics(topics: List[str], positions: List[str]) -> List[str]:
+    return [
+        f"{topic} from a {position} perspective"
+        for topic in topics
+        for position in positions
+    ]
+
+
+class DomainExpert(TextGeneration):
+    """A customized task to generate text as a domain expert in the domain of farming and agriculture."""
+
+    _system_prompt: str = DEFAULT_SYSTEM_PROMPT
+    _template: str = """{instruction}\nThis is the instruction.\n Examples: """
+
+    def format_input(self, input: Dict[str, Any]) -> "ChatType":
+        return [
+            {
+                "role": "system",
+                "content": self._system_prompt,
+            },
+            {
+                "role": "user",
+                "content": self._template.format(**input),
+            },
+        ]
+
+
+class CleanNumberedList(Step):
+    """A step to clean the numbered list of questions."""
+
+    def process(self, inputs: StepInput) -> StepOutput:
+        import re
+
+        # strip a leading "1. "-style prefix from each generated question
+        pattern = r"^\d+\.\s"
+
+        for input in inputs:
+            input["question"] = re.sub(pattern, "", input["question"])
+        yield inputs
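For context, `create_topics` takes the cross product of topics and perspectives, so the pipeline gets one seed term per (topic, perspective) pair. A small illustration with assumed values (the real ones come from seed_data.json):

    topics = ["soil health", "crop rotation"]              # assumed
    perspectives = ["commercial farmer", "family farmer"]  # assumed
    print(create_topics(topics, perspectives))
    # ['soil health from a commercial farmer perspective',
    #  'soil health from a family farmer perspective',
    #  'crop rotation from a commercial farmer perspective',
    #  'crop rotation from a family farmer perspective']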
hub.py CHANGED
@@ -1,10 +1,43 @@
 import json
+from tempfile import mktemp

-from huggingface_hub import duplicate_space, HfApi
+import argilla as rg
+from huggingface_hub import HfApi
+
+from defaults import REMOTE_CODE_PATHS, SEED_DATA_PATH


 hf_api = HfApi()

+with open("DATASET_README_BASE.md") as f:
+    DATASET_README_BASE = f.read()
+
+
+def create_readme(domain_seed_data, project_name, domain):
+    # create a readme for the project that shows the domain and project name
+    readme = DATASET_README_BASE
+    readme += f"# {project_name}\n\n## Domain: {domain}"
+    perspectives = domain_seed_data.get("perspectives")
+    topics = domain_seed_data.get("topics")
+    examples = domain_seed_data.get("examples")
+    if perspectives:
+        readme += "\n\n## Perspectives\n\n"
+        for p in perspectives:
+            readme += f"- {p}\n"
+    if topics:
+        readme += "\n\n## Topics\n\n"
+        for t in topics:
+            readme += f"- {t}\n"
+    if examples:
+        readme += "\n\n## Examples\n\n"
+        for example in examples:
+            readme += f"### {example['question']}\n\n{example['answer']}\n\n"
+    temp_file = mktemp()
+
+    with open(temp_file, "w") as f:
+        f.write(readme)
+    return temp_file
+

 def setup_dataset_on_hub(repo_id, hub_token):
     # create an empty dataset repo on the hub
@@ -12,52 +45,85 @@ def setup_dataset_on_hub(repo_id, hub_token):
         repo_id=repo_id,
         token=hub_token,
         repo_type="dataset",
+        exist_ok=True,
     )

-    # upload the seed data
+
+def push_dataset_to_hub(
+    domain_seed_data_path,
+    project_name,
+    domain,
+    pipeline_path,
+    hub_username,
+    hub_token: str,
+):
+    repo_id = f"{hub_username}/{project_name}"
+
+    setup_dataset_on_hub(repo_id=repo_id, hub_token=hub_token)
+
+    # upload the seed data and readme to the hub
     hf_api.upload_file(
-        path_or_fileobj="seed_data.json",
+        path_or_fileobj=domain_seed_data_path,
         path_in_repo="seed_data.json",
+        token=hub_token,
         repo_id=repo_id,
         repo_type="dataset",
-        token=hub_token,
     )

-
-def duplicate_space_on_hub(source_repo, target_repo, hub_token, private=False):
-    duplicate_space(
-        from_id=source_repo,
-        to_id=target_repo,
+    # upload the readme to the hub
+    domain_seed_data = json.load(open(domain_seed_data_path))
+    hf_api.upload_file(
+        path_or_fileobj=create_readme(
+            domain_seed_data=domain_seed_data, project_name=project_name, domain=domain
+        ),
+        path_in_repo="README.md",
         token=hub_token,
-        private=private,
-        exist_ok=True,
+        repo_id=repo_id,
+        repo_type="dataset",
    )


-def add_project_config_to_space_repo(
-    dataset_repo_id,
-    hub_token,
+def push_pipeline_to_hub(
+    pipeline_path,
+    hub_username,
+    hub_token: str,
     project_name,
-    argilla_space_repo_id,
-    project_space_repo_id,
 ):
-    # upload the seed data and readme to the hub
-
-    with open("project_config.json", "w") as f:
-        json.dump(
-            {
-                "project_name": project_name,
-                "argilla_space_repo_id": argilla_space_repo_id,
-                "project_space_repo_id": project_space_repo_id,
-                "dataset_repo_id": dataset_repo_id,
-            },
-            f,
-        )
+    repo_id = f"{hub_username}/{project_name}"

+    # upload the pipeline to the hub
     hf_api.upload_file(
-        path_or_fileobj="project_config.json",
-        path_in_repo="project_config.json",
+        path_or_fileobj=pipeline_path,
+        path_in_repo="pipeline.yaml",
         token=hub_token,
-        repo_id=project_space_repo_id,
-        repo_type="space",
+        repo_id=repo_id,
+        repo_type="dataset",
+    )
+
+    for code_path in REMOTE_CODE_PATHS:
+        hf_api.upload_file(
+            path_or_fileobj=code_path,
+            path_in_repo=code_path,
+            token=hub_token,
+            repo_id=repo_id,
+            repo_type="dataset",
+        )
+
+    print(f"Dataset uploaded to {repo_id}")
+
+
+def pull_seed_data_from_repo(repo_id, hub_token):
+    # pull the seed data file from the hub; hf_hub_download returns the local path
+    seed_data_path = hf_api.hf_hub_download(
+        repo_id=repo_id, token=hub_token, repo_type="dataset", filename=SEED_DATA_PATH
    )
+    return json.load(open(seed_data_path))
+
+
+def push_argilla_dataset_to_hub(
+    name: str, repo_id: str, url: str, api_key: str, workspace: str = "admin"
+):
+    rg.init(api_url=url, api_key=api_key)
+    feedback_dataset = rg.FeedbackDataset.from_argilla(name=name, workspace=workspace)
+    local_dataset = feedback_dataset.pull()
+    local_dataset.push_to_huggingface(repo_id=repo_id)
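Taken together, a push through these helpers leaves the dataset repo holding seed_data.json, a generated README.md, and, after `push_pipeline_to_hub`, pipeline.yaml plus the files in REMOTE_CODE_PATHS. A minimal sketch with assumed values:

    push_dataset_to_hub(
        domain_seed_data_path="seed_data.json",
        project_name="farming",      # assumed
        domain="farming",            # assumed
        pipeline_path="pipeline.yaml",
        hub_username="argilla",      # assumed
        hub_token="hf_...",          # placeholder token
    )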
infer.py ADDED
@@ -0,0 +1,18 @@
+import os
+import requests
+
+API_URL = (
+    "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
+)
+
+
+def query(question, hub_token: str):
+    payload = {
+        "inputs": question,
+    }
+    headers = {"Authorization": f"Bearer {hub_token}"}
+    response = requests.post(API_URL, headers=headers, json=payload)
+    return response.json()[0]["generated_text"]
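A quick usage sketch (the token is a placeholder): the serverless Inference API returns a list of generations for text-generation models, which is why the helper indexes `[0]["generated_text"]`; error responses (e.g. while the model is loading) come back as a dict instead and are not handled here.

    answer = query("What is crop rotation?", hub_token="hf_...")  # placeholder token
    print(answer)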
pages/2_πŸ‘©πŸΌβ€πŸ”¬ Describe Domain.py ADDED
@@ -0,0 +1,281 @@
+import json
+
+import streamlit as st
+
+from hub import push_dataset_to_hub
+from infer import query
+from defaults import (
+    DEFAULT_DOMAIN,
+    DEFAULT_PERSPECTIVES,
+    DEFAULT_TOPICS,
+    DEFAULT_EXAMPLES,
+    DEFAULT_SYSTEM_PROMPT,
+    N_PERSPECTIVES,
+    N_TOPICS,
+    SEED_DATA_PATH,
+    PIPELINE_PATH,
+    DATASET_REPO_ID,
+)
+from utils import project_sidebar
+
+st.set_page_config(
+    page_title="Domain Data Grower",
+    page_icon="πŸ§‘β€πŸŒΎ",
+)
+project_sidebar()
+
+################################################################################
+# HEADER
+################################################################################
+
+st.header("πŸ§‘β€πŸŒΎ Domain Data Grower")
+st.divider()
+st.subheader(
+    "Step 2. Define the specific domain that you want to generate synthetic data for.",
+)
+st.write(
+    "Define the project details, including the project name, domain, and API credentials"
+)
+
+################################################################################
+# Domain Expert Section
+################################################################################
+
+(
+    tab_domain_expert,
+    tab_domain_perspectives,
+    tab_domain_topics,
+    tab_examples,
+    tab_raw_seed,
+) = st.tabs(
+    tabs=[
+        "πŸ‘©πŸΌβ€πŸ”¬ Domain Expert",
+        "πŸ” Domain Perspectives",
+        "πŸ•ΈοΈ Domain Topics",
+        "πŸ“š Examples",
+        "🌱 Raw Seed Data",
+    ]
+)
+
+with tab_domain_expert:
+    st.text("Define the domain expertise that you want to train a language model")
+    st.info(
+        "A domain expert is a person who is an expert in a particular field or area. For example, a domain expert in farming would be someone who has extensive knowledge and experience in farming and agriculture."
+    )
+
+    domain = st.text_input("Domain Name", DEFAULT_DOMAIN)
+
+    domain_expert_prompt = st.text_area(
+        label="Domain Expert Definition",
+        value=DEFAULT_SYSTEM_PROMPT,
+        height=200,
+    )
+
+################################################################################
+# Domain Perspectives
+################################################################################
+
+with tab_domain_perspectives:
+    st.text("Define the different perspectives from which the domain can be viewed")
+    st.info(
+        """
+        Perspectives are different viewpoints or angles from which a domain can be viewed.
+        For example, the domain of farming can be viewed from the perspective of a commercial
+        farmer or an independent family farmer."""
+    )
+
+    perspectives = st.session_state.get(
+        "perspectives",
+        [DEFAULT_PERSPECTIVES[0]],
+    )
+    perspectives_container = st.container()
+
+    perspectives = [
+        perspectives_container.text_input(
+            f"Domain Perspective {i + 1}", value=perspective
+        )
+        for i, perspective in enumerate(perspectives)
+    ]
+
+    if st.button("Add Perspective", key="add_perspective"):
+        n = len(perspectives)
+        value = DEFAULT_PERSPECTIVES[n] if n < N_PERSPECTIVES else ""
+        perspectives.append(
+            perspectives_container.text_input(f"Domain Perspective {n + 1}", value=value)
+        )
+
+    st.session_state["perspectives"] = perspectives
+
+
+################################################################################
+# Domain Topics
+################################################################################
+
+with tab_domain_topics:
+    st.text("Define the main themes or subjects that are relevant to the domain")
+    st.info(
+        """Topics are the main themes or subjects that are relevant to the domain. For example, the domain of farming can have topics like soil health, crop rotation, or livestock management."""
+    )
+    topics = st.session_state.get(
+        "topics",
+        [DEFAULT_TOPICS[0]],
+    )
+    topics_container = st.container()
+    topics = [
+        topics_container.text_input(f"Domain Topic {i + 1}", value=topic)
+        for i, topic in enumerate(topics)
+    ]
+
+    if st.button("Add Topic", key="add_topic"):
+        n = len(topics)
+        value = DEFAULT_TOPICS[n] if n < N_TOPICS else ""
+        topics.append(topics_container.text_input(f"Domain Topic {n + 1}", value=value))
+
+    st.session_state["topics"] = topics
+
+
+################################################################################
+# Examples Section
+################################################################################
+
+with tab_examples:
+    st.text(
+        "Add high-quality questions and answers that can be used to generate synthetic data"
+    )
+    st.info(
+        """
+        Examples are high-quality questions and answers that can be used to generate
+        synthetic data for the domain. These examples will be used to train the language model
+        to generate questions and answers.
+        """
+    )
+
+    examples = st.session_state.get(
+        "examples",
+        [
+            {
+                "question": "",
+                "answer": "",
+            }
+        ],
+    )
+
+    for n, example in enumerate(examples, 1):
+        question = example["question"]
+        answer = example["answer"]
+        examples_container = st.container()
+        question_column, answer_column = examples_container.columns(2)
+
+        if st.button(f"Generate Answer {n}"):
+            if st.session_state["hub_token"] is None:
+                st.error("Please provide a Hub token to generate answers")
+            else:
+                answer = query(question, st.session_state["hub_token"])
+        with question_column:
+            question = st.text_area(f"Question {n}", value=question)
+
+        with answer_column:
+            answer = st.text_area(f"Answer {n}", value=answer)
+        examples[n - 1] = {"question": question, "answer": answer}
+        st.session_state["examples"] = examples
+        st.divider()
+
+    if st.button("Add Example"):
+        examples.append({"question": "", "answer": ""})
+        st.session_state["examples"] = examples
+        st.rerun()
+
+################################################################################
+# Save Domain Data
+################################################################################
+
+perspectives = list(filter(None, perspectives))
+topics = list(filter(None, topics))
+
+domain_data = {
+    "domain": domain,
+    "perspectives": perspectives,
+    "topics": topics,
+    "examples": examples,
+    "domain_expert_prompt": domain_expert_prompt,
+}
+
+with open(SEED_DATA_PATH, "w") as f:
+    json.dump(domain_data, f, indent=2)
+
+with tab_raw_seed:
+    st.code(json.dumps(domain_data, indent=2), language="json", line_numbers=True)
+
+################################################################################
+# Setup Dataset on the Hub
+################################################################################
+
+st.divider()
+
+hub_username = DATASET_REPO_ID.split("/")[0]
+project_name = DATASET_REPO_ID.split("/")[1]
+st.write("Define the dataset repo details on the Hub")
+st.session_state["project_name"] = st.text_input("Project Name", project_name)
+st.session_state["hub_username"] = st.text_input("Hub Username", hub_username)
+st.session_state["hub_token"] = st.text_input("Hub Token", type="password", value=None)
+
+if all(
+    (
+        st.session_state.get("project_name"),
+        st.session_state.get("hub_username"),
+        st.session_state.get("hub_token"),
+    )
+):
+    st.success(f"Using the dataset repo {hub_username}/{project_name} on the Hub")
+
+
+if st.button("πŸ€— Push Dataset Seed") and all(
+    (
+        domain,
+        domain_expert_prompt,
+        perspectives,
+        topics,
+        examples,
+    )
+):
+    if all(
+        (
+            st.session_state.get("project_name"),
+            st.session_state.get("hub_username"),
+            st.session_state.get("hub_token"),
+        )
+    ):
+        project_name = st.session_state["project_name"]
+        hub_username = st.session_state["hub_username"]
+        hub_token = st.session_state["hub_token"]
+    else:
+        st.error(
+            "Please create a dataset repo on the Hub before pushing the dataset seed"
+        )
+        st.stop()
+
+    push_dataset_to_hub(
+        domain_seed_data_path=SEED_DATA_PATH,
+        project_name=project_name,
+        domain=domain,
+        hub_username=hub_username,
+        hub_token=hub_token,
+        pipeline_path=PIPELINE_PATH,
+    )
+
+    st.success(
+        f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name})"
+    )
+
+    st.write("You can now move on to running your distilabel pipeline.")
+
+    st.page_link(
+        page="pages/3_🌱 Generate Dataset.py",
+        label="Generate Dataset",
+        icon="🌱",
+    )
+
+else:
+    st.info(
+        "Please fill in all the required domain fields to push the dataset seed to the Hub"
+    )
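The resulting seed_data.json mirrors the `domain_data` dict above; its shape looks like this (values here are illustrative):

    {
      "domain": "farming",
      "perspectives": ["commercial farmer", "family farmer"],
      "topics": ["soil health", "crop rotation"],
      "examples": [{"question": "What is crop rotation?", "answer": "..."}],
      "domain_expert_prompt": "You are an expert in farming and agriculture..."
    }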
pages/3_🌱 Generate Dataset.py ADDED
@@ -0,0 +1,257 @@
+import streamlit as st
+
+from hub import pull_seed_data_from_repo, push_pipeline_to_hub
+from defaults import (
+    DEFAULT_SYSTEM_PROMPT,
+    PIPELINE_PATH,
+    PROJECT_NAME,
+    ARGILLA_URL,
+    HUB_USERNAME,
+    CODELESS_DISTILABEL,
+)
+from utils import project_sidebar
+
+from pipeline import serialize_pipeline, run_pipeline, create_pipelines_run_command
+
+st.set_page_config(
+    page_title="Domain Data Grower",
+    page_icon="πŸ§‘β€πŸŒΎ",
+)
+
+project_sidebar()
+
+################################################################################
+# HEADER
+################################################################################
+
+st.header("πŸ§‘β€πŸŒΎ Domain Data Grower")
+st.divider()
+st.subheader("Step 3. Run the pipeline to generate synthetic data")
+st.write("Define the project repos and models that the pipeline will use.")
+
+st.divider()
+###############################################################
+# CONFIGURATION
+###############################################################
+
+st.markdown("## Pipeline Configuration")
+
+st.markdown("#### πŸ€— Hub details to pull the seed data")
+hub_username = st.text_input("Hub Username", HUB_USERNAME)
+project_name = st.text_input("Project Name", PROJECT_NAME)
+repo_id = f"{hub_username}/{project_name}"
+hub_token = st.text_input("Hub Token", type="password")
+
+st.divider()
+
+st.markdown("#### πŸ€– Inference configuration")
+
+st.write(
+    "Add the URL of the Hugging Face Inference API or endpoint that your pipeline should use. You can find compatible models here:"
+)
+
+with st.expander("πŸ€— Recommended Models"):
+    st.write("All Inference Endpoint-compatible models can be found via the link below")
+    st.link_button(
+        "πŸ€— Inference-compatible models on the Hub",
+        "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
+    )
+    st.write("πŸ”‹ Projects with sufficient resources could take advantage of Llama 3 70B")
+    st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B")
+
+    st.write("πŸͺ« Projects with fewer resources could take advantage of Llama 3 8B")
+    st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B")
+
+    st.write("πŸƒ Projects with even fewer resources could take advantage of Phi-2")
+    st.code("https://api-inference.huggingface.co/models/microsoft/phi-2")
+
+    st.write("Note: Hugging Face Pro gives access to more compute resources")
+    st.link_button(
+        "πŸ€— Hugging Face Pro",
+        "https://huggingface.co/pricing",
+    )
+
+
+base_url = st.text_input(
+    label="Base URL for the Inference API",
+    value="https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta",
+)
+st.divider()
+st.markdown("#### πŸ”¬ Argilla API details to push the generated dataset")
+argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
+argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
+argilla_dataset_name = st.text_input("Argilla Dataset Name", project_name)
+st.divider()
+
+###############################################################
+# LOCAL
+###############################################################
+
+st.markdown("## Run the pipeline")
+
+st.write(
+    "Once you've defined the pipeline configuration, you can run the pipeline from your local machine."
+)
+
+if CODELESS_DISTILABEL:
+    st.write(
+        """We recommend running the pipeline locally if you're planning on generating a large dataset. \
+        But running the pipeline on this space is a handy way to get started quickly. Your synthetic
+        samples will be pushed to Argilla and available for review.
+        """
+    )
+    st.write(
+        """If you're planning on running the pipeline on the space, be aware that it \
+        will take some time to complete and you will need to maintain a \
+        connection to the space."""
+    )
+
+
+if st.button("πŸ’» Run pipeline locally", key="run_pipeline_local"):
+    if all(
+        [
+            argilla_api_key,
+            argilla_url,
+            base_url,
+            hub_username,
+            project_name,
+            hub_token,
+            argilla_dataset_name,
+        ]
+    ):
+        with st.spinner("Pulling seed data from the Hub..."):
+            try:
+                seed_data = pull_seed_data_from_repo(
+                    repo_id=f"{hub_username}/{project_name}",
+                    hub_token=hub_token,
+                )
+            except Exception:
+                st.error(
+                    "Seed data not found. Please make sure you pushed the data seed in Step 2."
+                )
+                st.stop()
+
+        domain = seed_data["domain"]
+        perspectives = seed_data["perspectives"]
+        topics = seed_data["topics"]
+        examples = seed_data["examples"]
+        domain_expert_prompt = seed_data["domain_expert_prompt"]
+
+        with st.spinner("Serializing the pipeline configuration..."):
+            serialize_pipeline(
+                argilla_api_key=argilla_api_key,
+                argilla_dataset_name=argilla_dataset_name,
+                argilla_api_url=argilla_url,
+                topics=topics,
+                perspectives=perspectives,
+                pipeline_config_path=PIPELINE_PATH,
+                domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
+                hub_token=hub_token,
+                endpoint_base_url=base_url,
+                examples=examples,
+            )
+            push_pipeline_to_hub(
+                pipeline_path=PIPELINE_PATH,
+                hub_token=hub_token,
+                hub_username=hub_username,
+                project_name=project_name,
+            )
+
+        st.success(f"Pipeline configuration saved to {hub_username}/{project_name}")
+
+        st.info(
+            "To run the pipeline locally, you need to have the `distilabel` library installed. You can install it using the following command:"
+        )
+        st.text(
+            "Execute the following command to generate a synthetic dataset from the seed data:"
+        )
+        command_to_run = create_pipelines_run_command(
+            hub_token=hub_token,
+            pipeline_config_path=PIPELINE_PATH,
+            argilla_dataset_name=argilla_dataset_name,
+            argilla_api_key=argilla_api_key,
+            argilla_api_url=argilla_url,
+        )
+        st.code(
+            f"""
+            pip install git+https://github.com/argilla-io/distilabel.git
+            git clone https://huggingface.co/datasets/{hub_username}/{project_name}
+            cd {project_name}
+            pip install -r requirements.txt
+            {' '.join(["python"] + command_to_run[1:])}
+            """,
+            language="bash",
+        )
+        st.subheader(
+            "πŸ‘©β€πŸš€ If you want to access the pipeline code and manipulate it locally, you can do:"
+        )
+        st.code(
+            """
+            git clone https://github.com/huggingface/data-is-better-together
+            cd domain-specific-datasets
+            """
+        )
+    else:
+        st.error("Please fill all the required fields.")
+
+###############################################################
+# SPACE
+###############################################################
+if CODELESS_DISTILABEL:
+    if st.button("πŸ”₯ Run pipeline right here, right now!"):
+        if all(
+            [
+                argilla_api_key,
+                argilla_url,
+                base_url,
+                hub_username,
+                project_name,
+                hub_token,
+                argilla_dataset_name,
+            ]
+        ):
+            with st.spinner("Pulling seed data from the Hub..."):
+                try:
+                    seed_data = pull_seed_data_from_repo(
+                        repo_id=f"{hub_username}/{project_name}",
+                        hub_token=hub_token,
+                    )
+                except Exception:
+                    st.error(
+                        "Seed data not found. Please make sure you pushed the data seed in Step 2."
+                    )
+                    st.stop()
+
+            domain = seed_data["domain"]
+            perspectives = seed_data["perspectives"]
+            topics = seed_data["topics"]
+            examples = seed_data["examples"]
+            domain_expert_prompt = seed_data["domain_expert_prompt"]
+
+            serialize_pipeline(
+                argilla_api_key=argilla_api_key,
+                argilla_dataset_name=argilla_dataset_name,
+                argilla_api_url=argilla_url,
+                topics=topics,
+                perspectives=perspectives,
+                pipeline_config_path=PIPELINE_PATH,
+                domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
+                hub_token=hub_token,
+                endpoint_base_url=base_url,
+                examples=examples,
+            )
+
+            with st.spinner("Starting the pipeline..."):
+                logs = run_pipeline(
+                    pipeline_config_path=PIPELINE_PATH,
+                    argilla_api_key=argilla_api_key,
+                    argilla_api_url=argilla_url,
+                    hub_token=hub_token,
+                    argilla_dataset_name=argilla_dataset_name,
+                )
+
+            st.success("Pipeline started successfully! πŸš€")
+
+            with st.expander(label="View Logs", expanded=True):
+                for out in logs:
+                    st.text(out)
+        else:
+            st.error("Please fill all the required fields.")
pages/4_πŸ” Review Generated Data.py ADDED
@@ -0,0 +1,48 @@
+import streamlit as st
+
+from defaults import PROJECT_NAME, ARGILLA_URL, DATASET_REPO_ID
+from utils import project_sidebar
+from hub import push_argilla_dataset_to_hub
+
+st.set_page_config(
+    page_title="Domain Data Grower",
+    page_icon="πŸ§‘β€πŸŒΎ",
+)
+
+project_sidebar()
+
+################################################################################
+# HEADER
+################################################################################
+
+st.header("πŸ§‘β€πŸŒΎ Domain Data Grower")
+st.divider()
+
+st.write(
+    """Once you have reviewed the synthetic data in Argilla, you can publish the
+    generated dataset to the Hub."""
+)
+
+
+################################################################################
+# Configuration
+################################################################################
+
+st.divider()
+st.write("πŸ”¬ Argilla API details to push the generated dataset")
+argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
+argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
+argilla_dataset_name = st.text_input("Argilla Dataset Name", PROJECT_NAME)
+dataset_repo_id = st.text_input("Dataset Repo ID", DATASET_REPO_ID)
+st.divider()
+
+if st.button("πŸš€ Publish the generated dataset"):
+    with st.spinner("Publishing the generated dataset..."):
+        push_argilla_dataset_to_hub(
+            name=argilla_dataset_name,
+            repo_id=dataset_repo_id,
+            url=argilla_url,
+            api_key=argilla_api_key,
+            workspace="admin",
+        )
+    st.success("The generated dataset has been published to the Hub.")
pipeline.py ADDED
@@ -0,0 +1,208 @@
+import os
+import subprocess
+import sys
+import time
+from typing import List
+
+from distilabel.steps.generators.data import LoadDataFromDicts
+from distilabel.steps.expand import ExpandColumns
+from distilabel.steps.keep import KeepColumns
+from distilabel.steps.tasks.self_instruct import SelfInstruct
+from distilabel.steps.tasks.evol_instruct.base import EvolInstruct
+from distilabel.llms.huggingface import InferenceEndpointsLLM
+from distilabel.pipeline import Pipeline
+from distilabel.steps import TextGenerationToArgilla
+from dotenv import load_dotenv
+
+from domain import (
+    DomainExpert,
+    CleanNumberedList,
+    create_topics,
+    create_examples_template,
+    APPLICATION_DESCRIPTION,
+)
+
+load_dotenv()
+
+
+def define_pipeline(
+    argilla_api_key: str,
+    argilla_api_url: str,
+    argilla_dataset_name: str,
+    topics: List[str],
+    perspectives: List[str],
+    domain_expert_prompt: str,
+    examples: List[dict],
+    hub_token: str,
+    endpoint_base_url: str,
+):
+    """Define the pipeline for the specific domain."""
+
+    terms = create_topics(topics, perspectives)
+    template = create_examples_template(examples)
+    with Pipeline("farming") as pipeline:
+        load_data = LoadDataFromDicts(
+            name="load_data",
+            data=[{"input": term} for term in terms],
+            batch_size=64,
+        )
+        llm = InferenceEndpointsLLM(
+            base_url=endpoint_base_url,
+            api_key=hub_token,
+        )
+        self_instruct = SelfInstruct(
+            name="self-instruct",
+            application_description=APPLICATION_DESCRIPTION,
+            num_instructions=5,
+            input_batch_size=8,
+            llm=llm,
+        )
+
+        evol_instruction_complexity = EvolInstruct(
+            name="evol_instruction_complexity",
+            llm=llm,
+            num_evolutions=2,
+            store_evolutions=True,
+            input_batch_size=8,
+            include_original_instruction=True,
+            input_mappings={"instruction": "question"},
+        )
+
+        expand_instructions = ExpandColumns(
+            name="expand_columns", columns={"instructions": "question"}
+        )
+        cleaner = CleanNumberedList(name="clean_numbered_list")
+        expand_evolutions = ExpandColumns(
+            name="expand_columns_evolved",
+            columns={"evolved_instructions": "evolved_questions"},
+        )
+
+        domain_expert = DomainExpert(
+            name="domain_expert",
+            llm=llm,
+            input_batch_size=8,
+            input_mappings={"instruction": "evolved_questions"},
+            output_mappings={"generation": "domain_expert_answer"},
+        )
+
+        domain_expert._system_prompt = domain_expert_prompt
+        domain_expert._template = template
+
+        keep_columns = KeepColumns(
+            name="keep_columns",
+            columns=["model_name", "evolved_questions", "domain_expert_answer"],
+        )
+
+        to_argilla = TextGenerationToArgilla(
+            name="text_generation_to_argilla",
+            dataset_name=argilla_dataset_name,
+            dataset_workspace="admin",
+            api_url=argilla_api_url,
+            api_key=argilla_api_key,
+            input_mappings={
+                "instruction": "evolved_questions",
+                "generation": "domain_expert_answer",
+            },
+        )
+
+        load_data.connect(self_instruct)
+        self_instruct.connect(expand_instructions)
+        expand_instructions.connect(cleaner)
+        cleaner.connect(evol_instruction_complexity)
+        evol_instruction_complexity.connect(expand_evolutions)
+        expand_evolutions.connect(domain_expert)
+        domain_expert.connect(keep_columns)
+        keep_columns.connect(to_argilla)
+    return pipeline
+
+
+def serialize_pipeline(
+    argilla_api_key: str,
+    argilla_api_url: str,
+    argilla_dataset_name: str,
+    topics: List[str],
+    perspectives: List[str],
+    domain_expert_prompt: str,
+    hub_token: str,
+    endpoint_base_url: str,
+    pipeline_config_path: str = "pipeline.yaml",
+    examples: List[dict] = [],
+):
+    """Serialize the pipeline to a yaml file."""
+    pipeline = define_pipeline(
+        argilla_api_key=argilla_api_key,
+        argilla_api_url=argilla_api_url,
+        argilla_dataset_name=argilla_dataset_name,
+        topics=topics,
+        perspectives=perspectives,
+        domain_expert_prompt=domain_expert_prompt,
+        hub_token=hub_token,
+        endpoint_base_url=endpoint_base_url,
+        examples=examples,
+    )
+    pipeline.save(path=pipeline_config_path, overwrite=True, format="yaml")
+
+
+def create_pipelines_run_command(
+    hub_token: str,
+    argilla_api_key: str,
+    argilla_api_url: str,
+    pipeline_config_path: str = "pipeline.yaml",
+    argilla_dataset_name: str = "domain_specific_datasets",
+):
+    """Create the command to run the pipeline."""
+    command_to_run = [
+        sys.executable,
+        "-m",
+        "distilabel",
+        "pipeline",
+        "run",
+        "--config",
+        pipeline_config_path,
+        "--param",
+        f"text_generation_to_argilla.dataset_name={argilla_dataset_name}",
+        "--param",
+        f"text_generation_to_argilla.api_key={argilla_api_key}",
+        "--param",
+        f"text_generation_to_argilla.api_url={argilla_api_url}",
+        "--param",
+        f"self-instruct.llm.api_key={hub_token}",
+        "--param",
+        f"evol_instruction_complexity.llm.api_key={hub_token}",
+        "--param",
+        f"domain_expert.llm.api_key={hub_token}",
+        "--ignore-cache",
+    ]
+    return command_to_run
+
+
+def run_pipeline(
+    hub_token: str,
+    argilla_api_key: str,
+    argilla_api_url: str,
+    pipeline_config_path: str = "pipeline.yaml",
+    argilla_dataset_name: str = "domain_specific_datasets",
+):
+    """Run the pipeline and yield the output as a generator of logs."""
+
+    command_to_run = create_pipelines_run_command(
+        hub_token=hub_token,
+        pipeline_config_path=pipeline_config_path,
+        argilla_dataset_name=argilla_dataset_name,
+        argilla_api_key=argilla_api_key,
+        argilla_api_url=argilla_api_url,
+    )
+
+    # Run the script file, preserving the parent environment and adding the token
+    process = subprocess.Popen(
+        args=command_to_run,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        env={**os.environ, "HF_TOKEN": hub_token},
+    )
+
+    while process.stdout and process.stdout.readable():
+        time.sleep(0.2)
+        line = process.stdout.readline()
+        if not line:
+            break
+        yield line.decode("utf-8")
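With the default arguments, `create_pipelines_run_command` renders to a command of this shape (keys, token, and the Argilla URL are placeholders):

    python -m distilabel pipeline run \
        --config pipeline.yaml \
        --param text_generation_to_argilla.dataset_name=domain_specific_datasets \
        --param text_generation_to_argilla.api_key=owner.apikey \
        --param text_generation_to_argilla.api_url=https://argilla-farming-argilla-space.hf.space \
        --param self-instruct.llm.api_key=hf_... \
        --param evol_instruction_complexity.llm.api_key=hf_... \
        --param domain_expert.llm.api_key=hf_... \
        --ignore-cache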
requirements.txt CHANGED
@@ -1 +1,8 @@
-huggingface_hub
+datasets
+python_dotenv
+sentence_transformers
+streamlit
+huggingface_hub
+mistralai
+argilla
+git+https://github.com/argilla-io/distilabel.git
utils.py ADDED
@@ -0,0 +1,33 @@
+import streamlit as st
+
+from defaults import (
+    ARGILLA_SPACE_REPO_ID,
+    PROJECT_NAME,
+    ARGILLA_URL,
+    DIBT_PARENT_APP_URL,
+    DATASET_URL,
+    DATASET_REPO_ID,
+)
+
+
+def project_sidebar():
+    if PROJECT_NAME == "DEFAULT_DOMAIN":
+        st.warning(
+            "Please set up the project configuration in the parent app before proceeding."
+        )
+        st.stop()
+
+    st.sidebar.subheader(f"A Data Growing Project in the domain of {PROJECT_NAME}")
+    st.sidebar.markdown(
+        """
+        This space helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
+        """
+    )
+    st.sidebar.link_button("πŸ“š Dataset Repo", DATASET_URL)
+    st.sidebar.link_button("πŸ€– Argilla Space", ARGILLA_URL)
+    st.sidebar.divider()
+    st.sidebar.link_button("πŸ§‘β€πŸŒΎ New Project", DIBT_PARENT_APP_URL)
+    st.sidebar.link_button(
+        "πŸ€— Get your Hub Token", "https://huggingface.co/settings/tokens"
+    )