Ben Burtenshaw committed on
Commit
c23ab82
·
1 Parent(s): 798f8ba

update examples

Browse files
pages/2_👩🏼‍🔬 Describe Domain.py → 2_👩🏼‍🔬 Describe Domain.py RENAMED
@@ -46,12 +46,14 @@ st.write(
46
  tab_domain_perspectives,
47
  tab_domain_topics,
48
  tab_examples,
 
49
  ) = st.tabs(
50
  tabs=[
51
  "👩🏼‍🔬 Domain Expert",
52
  "🔍 Domain Perspectives",
53
  "🕸️ Domain Topics",
54
  "📚 Examples",
 
55
  ]
56
  )
57
 
@@ -101,7 +103,8 @@ with tab_domain_perspectives:
101
  perspectives.append(
102
  perspectives_container.text_input(f"Domain Perspective {n + 1}", value="")
103
  )
104
- st.session_state["perspectives"] = perspectives
 
105
 
106
 
107
  ################################################################################
@@ -127,7 +130,8 @@ with tab_domain_topics:
127
  n = len(topics)
128
  value = DEFAULT_TOPICS[n] if n < N_TOPICS else ""
129
  topics.append(topics_container.text_input(f"Domain Topics {n + 1}", value=""))
130
- st.session_state["topics"] = topics
 
131
 
132
 
133
  ################################################################################
@@ -146,32 +150,61 @@ with tab_examples:
146
  """
147
  )
148
 
149
- questions_answers = st.session_state.get(
150
- "questions_answers",
151
  [
152
- (
153
- st.text_area(
154
- "Question", key="question_0", value=DEFAULT_EXAMPLES[0]["question"]
155
- ),
156
- st.text_area(
157
- "Answer", key="answer_0", value=DEFAULT_EXAMPLES[0]["answer"]
158
- ),
159
- )
160
  ],
161
  )
162
 
163
- if st.button("Add New Example"):
164
- n = len(questions_answers)
165
- default_question, default_answer = DEFAULT_EXAMPLES[n].values()
166
- st.subheader(f"Example {n + 1}")
167
- if st.button("Generate New Answer", key=f"generate_{n}"):
168
- default_answer = query(default_question)
169
- _question = st.text_area(
170
- "Question", key=f"question_{n}", value=default_question
171
- )
172
- _answer = st.text_area("Answer", key=f"answer_{n}", value=default_answer)
173
- questions_answers.append((_question, _answer))
174
- st.session_state["questions_answers"] = questions_answers
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
  ################################################################################
177
  # Setup Dataset on the Hub
@@ -221,21 +254,6 @@ if st.button("🤗 Push Dataset Seed") and all(
221
  )
222
  st.stop()
223
 
224
- perspectives = list(filter(None, perspectives))
225
- topics = list(filter(None, topics))
226
- examples = [{"question": q, "answer": a} for q, a in questions_answers]
227
-
228
- domain_data = {
229
- "domain": domain,
230
- "perspectives": perspectives,
231
- "topics": topics,
232
- "examples": examples,
233
- "domain_expert_prompt": domain_expert_prompt,
234
- }
235
-
236
- with open(SEED_DATA_PATH, "w") as f:
237
- json.dump(domain_data, f, indent=2)
238
-
239
  push_dataset_to_hub(
240
  domain_seed_data_path=SEED_DATA_PATH,
241
  project_name=project_name,
 
46
  tab_domain_perspectives,
47
  tab_domain_topics,
48
  tab_examples,
49
+ tab_raw_seed,
50
  ) = st.tabs(
51
  tabs=[
52
  "👩🏼‍🔬 Domain Expert",
53
  "🔍 Domain Perspectives",
54
  "🕸️ Domain Topics",
55
  "📚 Examples",
56
+ "🌱 Raw Seed Data",
57
  ]
58
  )
59
 
 
103
  perspectives.append(
104
  perspectives_container.text_input(f"Domain Perspective {n + 1}", value="")
105
  )
106
+
107
+ st.session_state["perspectives"] = perspectives
108
 
109
 
110
  ################################################################################
 
130
  n = len(topics)
131
  value = DEFAULT_TOPICS[n] if n < N_TOPICS else ""
132
  topics.append(topics_container.text_input(f"Domain Topics {n + 1}", value=""))
133
+
134
+ st.session_state["topics"] = topics
135
 
136
 
137
  ################################################################################
 
150
  """
151
  )
152
 
153
+ examples = st.session_state.get(
154
+ "examples",
155
  [
156
+ {
157
+ "question": "",
158
+ "answer": "",
159
+ }
 
 
 
 
160
  ],
161
  )
162
 
163
+ for n, example in enumerate(examples, 1):
164
+ question = example["question"]
165
+ answer = example["answer"]
166
+ examples_container = st.container()
167
+ question_column, answer_column = examples_container.columns(2)
168
+
169
+ if st.button(f"Generate Answer {n}"):
170
+ if st.session_state["hub_token"] is None:
171
+ st.error("Please provide a Hub token to generate answers")
172
+ else:
173
+ answer = query(question, st.session_state["hub_token"])
174
+ with question_column:
175
+ question = st.text_area(f"Question {n}", value=question)
176
+
177
+ with answer_column:
178
+ answer = st.text_area(f"Answer {n}", value=answer)
179
+ examples[n - 1] = {"question": question, "answer": answer}
180
+ st.session_state["examples"] = examples
181
+ st.divider()
182
+
183
+ if st.button("Add Example"):
184
+ examples.append({"question": "", "answer": ""})
185
+ st.session_state["examples"] = examples
186
+ st.rerun()
187
+
188
+ ################################################################################
189
+ # Save Domain Data
190
+ ################################################################################
191
+
192
+ perspectives = list(filter(None, perspectives))
193
+ topics = list(filter(None, topics))
194
+
195
+ domain_data = {
196
+ "domain": domain,
197
+ "perspectives": perspectives,
198
+ "topics": topics,
199
+ "examples": examples,
200
+ "domain_expert_prompt": domain_expert_prompt,
201
+ }
202
+
203
+ with open(SEED_DATA_PATH, "w") as f:
204
+ json.dump(domain_data, f, indent=2)
205
+
206
+ with tab_raw_seed:
207
+ st.code(json.dumps(domain_data, indent=2), language="json", line_numbers=True)
208
 
209
  ################################################################################
210
  # Setup Dataset on the Hub
 
254
  )
255
  st.stop()
256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  push_dataset_to_hub(
258
  domain_seed_data_path=SEED_DATA_PATH,
259
  project_name=project_name,
pages/3_🌱 Generate Dataset.py → 3_🌱 Generate Dataset.py RENAMED
@@ -181,6 +181,15 @@ if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
181
  """,
182
  language="bash",
183
  )
 
 
 
 
 
 
 
 
 
184
  else:
185
  st.error("Please fill all the required fields.")
186
 
 
181
  """,
182
  language="bash",
183
  )
184
+ st.subheader(
185
+ "👩‍🚀 If you want to access the pipeline and manipulate the locally, you can do:"
186
+ )
187
+ st.code(
188
+ """
189
+ git clone https://github.com/huggingface/data-is-better-together
190
+ cd domain-specific-datasets
191
+ """
192
+ )
193
  else:
194
  st.error("Please fill all the required fields.")
195
 
pages/4_🔍 Review Generated Data.py → 4_🔍 Review Generated Data.py RENAMED
File without changes
infer.py CHANGED
@@ -1,16 +1,18 @@
1
  import os
2
  import requests
3
 
4
- HF_API_KEY = os.getenv("HF_API_KEY")
5
  API_URL = (
6
  "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
7
  )
8
- headers = {"Authorization": f"Bearer {HF_API_KEY}"}
9
 
10
 
11
- def query(question):
 
 
 
12
  payload = {
13
  "inputs": question,
14
  }
 
15
  response = requests.post(API_URL, headers=headers, json=payload)
16
  return response.json()[0]["generated_text"]
 
1
  import os
2
  import requests
3
 
 
4
  API_URL = (
5
  "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
6
  )
 
7
 
8
 
9
+
10
+
11
+
12
+ def query(question, hub_token: str):
13
  payload = {
14
  "inputs": question,
15
  }
16
+ headers = {"Authorization": f"Bearer {hub_token}"}
17
  response = requests.post(API_URL, headers=headers, json=payload)
18
  return response.json()[0]["generated_text"]
pages/DATASET_README_BASE.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Domain Dataset Grower
2
+
3
+ This dataset was generated by [distilabel](https://distilabel.argilla.io/latest/) as a domain specific dataset for the domain of farming. The dataset used this seed data to generate the samples. The seed data was defined by a domain expert and the generated data can be reviewed in this [Argilla](https://argilla.io/) space here: [Argilla](https://huggingface.co/spaces/argilla/farming)
4
+
5
+ If you want to define a domain specific seed dataset for your own domain, you can use the distilabel tool to generate the dataset, and seed your dataset [here](https://huggingface.co/spaces/argilla/domain-specific-seed)
6
+
pages/README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Domain Specific Seed
3
+ emoji: 💻
4
+ colorFrom: purple
5
+ colorTo: red
6
+ sdk: streamlit
7
+ sdk_version: 1.33.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
pages/app.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ from defaults import (
4
+ PROJECT_NAME,
5
+ ARGILLA_SPACE_REPO_ID,
6
+ DATASET_REPO_ID,
7
+ ARGILLA_URL,
8
+ PROJECT_SPACE_REPO_ID,
9
+ DIBT_PARENT_APP_URL,
10
+ )
11
+ from utils import project_sidebar
12
+
13
+ st.set_page_config("Domain Data Grower", page_icon="🧑‍🌾")
14
+
15
+ project_sidebar()
16
+
17
+ if PROJECT_NAME == "DEFAULT_DOMAIN":
18
+ st.warning(
19
+ "Please set up the project configuration in the parent app before proceeding."
20
+ )
21
+ st.stop()
22
+
23
+
24
+ st.header("🧑‍🌾 Domain Data Grower")
25
+ st.divider()
26
+
27
+ st.markdown(
28
+ """
29
+ ## 🌱 Create a dataset seed for aligning models to a specific domain
30
+
31
+ This app helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
32
+ Alignment datasets are used to fine-tune models to a specific domain or task, but as yet, there's a shortage of diverse datasets for this purpose.
33
+ """
34
+ )
35
+ st.markdown(
36
+ """
37
+ ## 🚜 How it works
38
+
39
+ You can create a dataset seed by defining the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
40
+ The dataset seed is then used to generate synthetic data for training a language model.
41
+
42
+ """
43
+ )
44
+ st.markdown(
45
+ """
46
+ ## 🗺️ The process
47
+
48
+ ### Step 1: ~~Setup the project~~
49
+
50
+ ~~Define the project details, including the project name, domain, and API credentials. Create Dataset Repo on the Hub.~~
51
+ """
52
+ )
53
+ st.link_button("🚀 ~~Setup Project via the parent app~~", DIBT_PARENT_APP_URL)
54
+
55
+ st.markdown(
56
+ """
57
+ ### Step 2: Describe the Domain
58
+
59
+ Define the domain expertise, perspectives, topics, and examples for your domain-specific dataset.
60
+ You can collaborate with domain experts to define the domain expertise and perspectives.
61
+ """
62
+ )
63
+
64
+ st.page_link(
65
+ "pages/2_👩🏼‍🔬 Describe Domain.py",
66
+ label="Describe Domain",
67
+ icon="👩🏼‍🔬",
68
+ )
69
+
70
+ st.markdown(
71
+ """
72
+ ### Step 3: Generate Synthetic Data
73
+
74
+ Use distilabel to generate synthetic data for your domain-specific dataset.
75
+ You can run the pipeline locally or in this space to generate synthetic data.
76
+ """
77
+ )
78
+
79
+ st.page_link(
80
+ "pages/3_🌱 Generate Dataset.py",
81
+ label="Generate Dataset",
82
+ icon="🌱",
83
+ )
84
+
85
+ st.markdown(
86
+ """
87
+ ### Step 4: Review the Dataset
88
+
89
+ Use Argilla to review the generated synthetic data and provide feedback on the quality of the data.
90
+
91
+
92
+ """
93
+ )
94
+ st.link_button("🔍 Review the dataset in Argilla", ARGILLA_URL)
pages/defaults.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Shared constants and default values for the Domain Data Grower pages.

Importing this module reads ``seed_data.json`` (required) and
``project_config.json`` (optional) from the current working directory.
"""

import os
import json

SEED_DATA_PATH = "seed_data.json"
PIPELINE_PATH = "pipeline.yaml"
REMOTE_CODE_PATHS = ["defaults.py", "domain.py", "pipeline.py", "requirements.txt"]
DIBT_PARENT_APP_URL = "https://argilla-domain-specific-datasets-welcome.hf.space/"
N_PERSPECTIVES = 5
N_TOPICS = 5
N_EXAMPLES = 5

# Environment variables are always strings, so "false"/"0" would be truthy if
# used directly. Parse the flag explicitly; it stays enabled when unset.
_FALSE_STRINGS = {"", "0", "false", "no", "off"}
CODELESS_DISTILABEL = (
    os.environ.get("CODELESS_DISTILABEL", "true").strip().lower()
    not in _FALSE_STRINGS
)

################################################
# DEFAULTS ON FARMING
################################################

with open(SEED_DATA_PATH, encoding="utf-8") as f:
    DEFAULT_DATA = json.load(f)

DEFAULT_DOMAIN = DEFAULT_DATA["domain"]
DEFAULT_PERSPECTIVES = DEFAULT_DATA["perspectives"]
DEFAULT_TOPICS = DEFAULT_DATA["topics"]
DEFAULT_EXAMPLES = DEFAULT_DATA["examples"]
DEFAULT_SYSTEM_PROMPT = DEFAULT_DATA["domain_expert_prompt"]

################################################
# PROJECT CONFIG FROM PARENT APP
################################################

try:
    with open("project_config.json", encoding="utf-8") as f:
        PROJECT_CONFIG = json.load(f)

    PROJECT_NAME = PROJECT_CONFIG["project_name"]
    ARGILLA_SPACE_REPO_ID = PROJECT_CONFIG["argilla_space_repo_id"]
    DATASET_REPO_ID = PROJECT_CONFIG["dataset_repo_id"]
    # Space hostnames are derived from the repo id by replacing "/" and "_" with "-".
    ARGILLA_SPACE_NAME = ARGILLA_SPACE_REPO_ID.replace("/", "-").replace("_", "-")
    ARGILLA_URL = f"https://{ARGILLA_SPACE_NAME}.hf.space"
    PROJECT_SPACE_REPO_ID = PROJECT_CONFIG["project_space_repo_id"]
    DATASET_URL = f"https://huggingface.co/datasets/{DATASET_REPO_ID}"
    HUB_USERNAME = DATASET_REPO_ID.split("/")[0]
except FileNotFoundError:
    # No project configured yet: expose placeholder values so that importing
    # pages can detect the "DEFAULT_DOMAIN" sentinel and stop gracefully.
    PROJECT_NAME = "DEFAULT_DOMAIN"
    ARGILLA_SPACE_REPO_ID = ""
    DATASET_REPO_ID = ""
    ARGILLA_SPACE_NAME = ""
    ARGILLA_URL = ""
    PROJECT_SPACE_REPO_ID = ""
    DATASET_URL = ""
    HUB_USERNAME = ""
pages/domain.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from typing import Any, Dict, List
3
+
4
+ from distilabel.steps.tasks.typing import ChatType
5
+ from distilabel.steps.tasks.text_generation import TextGeneration
6
+ from distilabel.steps import StepInput, StepOutput, Step
7
+
8
+ from dotenv import load_dotenv
9
+
10
+ from defaults import (
11
+ DEFAULT_DOMAIN,
12
+ DEFAULT_PERSPECTIVES,
13
+ DEFAULT_TOPICS,
14
+ DEFAULT_EXAMPLES,
15
+ DEFAULT_SYSTEM_PROMPT,
16
+ N_PERSPECTIVES,
17
+ N_TOPICS,
18
+ N_EXAMPLES,
19
+ )
20
+
21
+ load_dotenv()
22
+
23
+ # Application description used for SelfInstruct
24
+ APPLICATION_DESCRIPTION = f"""You are an AI assistant than generates queries around the domain of {DEFAULT_DOMAIN}.
25
+ Your should not expect basic but profound questions from your users.
26
+ The queries should reflect a diversity of vision and economic positions and political positions.
27
+ The queries may know about different methods of {DEFAULT_DOMAIN}.
28
+ The queries can be positioned politically, economically, socially, or practically.
29
+ Also take into account the impact of diverse causes on diverse domains."""
30
+
31
+
32
+ TOPICS = DEFAULT_TOPICS[:N_TOPICS]
33
+ PERSPECTIVES = DEFAULT_PERSPECTIVES[:N_PERSPECTIVES]
34
+ EXAMPLES = DEFAULT_EXAMPLES[:N_EXAMPLES]
35
+
36
+
37
def create_examples_template(examples: List[Dict[str, str]]) -> str:
    """Build a prompt template that embeds example questions and answers.

    Args:
        examples: Dicts that each provide a ``"question"`` and an ``"answer"``
            string.

    Returns:
        A single template string. It begins with an ``{instruction}``
        placeholder, followed by every example question and then every
        example answer.

    Raises:
        KeyError: If an example lacks the ``"question"`` or ``"answer"`` key.
    """
    # NOTE(review): the result contains a literal "{instruction}" placeholder,
    # so it is presumably passed to ``str.format`` later; example text that
    # contains braces would then need escaping - confirm against the caller.
    questions = """ Examples of high quality questions:""" + "".join(
        f"""\n- Question: {example["question"]}\n""" for example in examples
    )
    answers = """ Examples of high quality answers:""" + "".join(
        f"""\n- Answer: {example["answer"]}\n""" for example in examples
    )

    _template: str = (
        """{instruction}\nThis is the the instruction.\n Examples: """
        + questions
        + answers
    )
    return _template
50
+
51
+
52
def create_topics(topics: List[str], positions: List[str]) -> List[str]:
    """Pair every topic with every position.

    Args:
        topics: Topic names.
        positions: Perspective names.

    Returns:
        One ``"<topic> from a <position> perspective"`` string per
        combination, ordered by topic first and position second.
    """
    combined: List[str] = []
    for subject in topics:
        for viewpoint in positions:
            combined.append(f"{subject} from a {viewpoint} perspective")
    return combined
58
+
59
+
60
class DomainExpert(TextGeneration):
    """Text generation task that answers as a domain expert.

    The system message is ``DEFAULT_SYSTEM_PROMPT`` and the user message is
    ``_template`` filled in with the input row.
    """

    _system_prompt: str = DEFAULT_SYSTEM_PROMPT
    _template: str = """{instruction}\nThis is the the instruction.\n Examples: """

    def format_input(self, input: Dict[str, Any]) -> "ChatType":
        """Turn one input row into a two-message chat (system, then user).

        The row's keys are passed to ``str.format`` as keyword arguments, so
        the row must provide every placeholder used by ``_template``.
        """
        system_message = {
            "role": "system",
            "content": self._system_prompt,
        }
        user_message = {
            "role": "user",
            "content": self._template.format(**input),
        }
        return [system_message, user_message]
77
+
78
+
79
class CleanNumberedList(Step):
    """Step that strips a leading list number from each question."""

    def process(self, inputs: StepInput) -> StepOutput:
        """Remove a leading "<digits>. " prefix from every row's ``question``.

        Rows are modified in place and the same ``inputs`` object is yielded
        once as a single batch.
        """
        import re

        # Matches digits, a dot and one whitespace character at the very start.
        leading_number = re.compile(r"^\d+\.\s")

        for row in inputs:
            row["question"] = leading_number.sub("", row["question"])
        yield inputs
pages/hub.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from tempfile import mktemp
3
+
4
+ import argilla as rg
5
+ from huggingface_hub import HfApi
6
+
7
+ from defaults import REMOTE_CODE_PATHS, SEED_DATA_PATH
8
+
9
+
10
+ hf_api = HfApi()
11
+
12
+ with open("DATASET_README_BASE.md") as f:
13
+ DATASET_README_BASE = f.read()
14
+
15
+
16
def create_readme(domain_seed_data, project_name, domain):
    """Render the dataset README and write it to a temporary file.

    The README is ``DATASET_README_BASE`` followed by the project name, the
    domain, and a section for each of ``perspectives``, ``topics`` and
    ``examples`` that is present and non-empty in ``domain_seed_data``.

    Args:
        domain_seed_data: Mapping that may contain ``"perspectives"`` and
            ``"topics"`` (iterables of strings) and ``"examples"`` (iterable
            of dicts with ``"question"`` and ``"answer"`` keys).
        project_name: Name shown in the README heading.
        domain: Domain name shown in the README.

    Returns:
        Path of the temporary file holding the README. The file is not
        deleted here; the caller owns it.
    """
    # Imported locally so this function does not depend on the file-level
    # ``mktemp`` import. ``mktemp`` only returns a name, leaving a window in
    # which another process can create that path first.
    from tempfile import NamedTemporaryFile

    parts = [DATASET_README_BASE, f"# {project_name}\n\n## Domain: {domain}"]

    perspectives = domain_seed_data.get("perspectives")
    if perspectives:
        parts.append("\n\n## Perspectives\n\n")
        parts.extend(f"- {p}\n" for p in perspectives)

    topics = domain_seed_data.get("topics")
    if topics:
        parts.append("\n\n## Topics\n\n")
        parts.extend(f"- {t}\n" for t in topics)

    examples = domain_seed_data.get("examples")
    if examples:
        parts.append("\n\n## Examples\n\n")
        parts.extend(
            f"### {example['question']}\n\n{example['answer']}\n\n"
            for example in examples
        )

    # delete=False keeps the file on disk after the handle closes so that the
    # returned path can still be uploaded by the caller.
    with NamedTemporaryFile(
        "w", suffix=".md", delete=False, encoding="utf-8"
    ) as readme_file:
        readme_file.write("".join(parts))
    return readme_file.name
40
+
41
+
42
def setup_dataset_on_hub(repo_id, hub_token):
    """Create the dataset repository ``repo_id`` on the Hub.

    ``exist_ok=True`` is passed, so an already existing repository is not
    treated as an error.
    """
    repo_options = {
        "repo_id": repo_id,
        "token": hub_token,
        "repo_type": "dataset",
        "exist_ok": True,
    }
    hf_api.create_repo(**repo_options)
50
+
51
+
52
def push_dataset_to_hub(
    domain_seed_data_path,
    project_name,
    domain,
    pipeline_path,
    hub_username,
    hub_token: str,
):
    """Upload the seed data and a generated README to the dataset repo.

    The repository ``{hub_username}/{project_name}`` is created first if
    needed. The seed file is uploaded as ``seed_data.json`` and the README,
    rendered from the same seed data, as ``README.md``.

    Args:
        domain_seed_data_path: Local path of the seed data JSON file.
        project_name: Repository name on the Hub.
        domain: Domain name written into the README.
        pipeline_path: Accepted for interface compatibility; not used here.
        hub_username: Owner of the repository on the Hub.
        hub_token: Hub token used for every request.
    """
    import os

    repo_id = f"{hub_username}/{project_name}"

    setup_dataset_on_hub(repo_id=repo_id, hub_token=hub_token)

    # upload the seed data to the hub
    hf_api.upload_file(
        path_or_fileobj=domain_seed_data_path,
        path_in_repo="seed_data.json",
        token=hub_token,
        repo_id=repo_id,
        repo_type="dataset",
    )

    # Read the seed data with a context manager so the handle is closed.
    with open(domain_seed_data_path) as seed_file:
        domain_seed_data = json.load(seed_file)

    # upload the readme to the hub
    readme_path = create_readme(
        domain_seed_data=domain_seed_data, project_name=project_name, domain=domain
    )
    try:
        hf_api.upload_file(
            path_or_fileobj=readme_path,
            path_in_repo="README.md",
            token=hub_token,
            repo_id=repo_id,
            repo_type="dataset",
        )
    finally:
        # The README only exists to be uploaded; remove the temporary file
        # whether or not the upload succeeded.
        if os.path.exists(readme_path):
            os.remove(readme_path)
84
+
85
+
86
def push_pipeline_to_hub(
    pipeline_path,
    hub_username,
    hub_token: str,
    project_name,
):
    """Upload the serialized pipeline and its supporting code files.

    ``pipeline_path`` is stored as ``pipeline.yaml`` in the dataset repo
    ``{hub_username}/{project_name}``. Every path in ``REMOTE_CODE_PATHS`` is
    then uploaded under its own name.
    """
    repo_id = f"{hub_username}/{project_name}"

    # (local path, path in repo) pairs, pipeline first and then the code files.
    uploads = [(pipeline_path, "pipeline.yaml")]
    uploads += [(code_path, code_path) for code_path in REMOTE_CODE_PATHS]

    for local_path, remote_path in uploads:
        hf_api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=remote_path,
            token=hub_token,
            repo_id=repo_id,
            repo_type="dataset",
        )

    print(f"Dataset uploaded to {repo_id}")
113
+
114
+
115
def pull_seed_data_from_repo(repo_id, hub_token):
    """Download the seed data file from a dataset repo and return it parsed.

    Args:
        repo_id: Dataset repository id on the Hub.
        hub_token: Hub token used for the download.

    Returns:
        The decoded JSON content of the repository's ``SEED_DATA_PATH`` file.
    """
    # hf_hub_download returns the local path of the downloaded file. Read that
    # path; opening SEED_DATA_PATH in the working directory would return the
    # local seed file, not the one just pulled from the repo.
    downloaded_path = hf_api.hf_hub_download(
        repo_id=repo_id, token=hub_token, repo_type="dataset", filename=SEED_DATA_PATH
    )
    with open(downloaded_path, encoding="utf-8") as seed_file:
        return json.load(seed_file)
121
+
122
+
123
def push_argilla_dataset_to_hub(
    name: str, repo_id: str, url: str, api_key: str, workspace: str = "admin"
):
    """Copy a feedback dataset from an Argilla instance to the Hub.

    Connects to the Argilla server at ``url``, loads the dataset ``name`` from
    ``workspace``, pulls it locally and pushes it to ``repo_id``.
    """
    rg.init(api_url=url, api_key=api_key)
    remote_dataset = rg.FeedbackDataset.from_argilla(name=name, workspace=workspace)
    remote_dataset.pull().push_to_huggingface(repo_id=repo_id)
pages/infer.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+
4
+ API_URL = (
5
+ "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
6
+ )
7
+
8
+
9
+
10
+
11
+
12
def query(question, hub_token: str, timeout: float = 60.0):
    """Send ``question`` to the inference endpoint and return the generated text.

    Args:
        question: Prompt text sent as the ``inputs`` field of the payload.
        hub_token: Token sent as a ``Bearer`` authorization header.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``generated_text`` field of the first item in the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    payload = {
        "inputs": question,
    }
    headers = {"Authorization": f"Bearer {hub_token}"}
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    # Fail with the HTTP error itself rather than a confusing KeyError when
    # the body is an error object instead of a list of generations.
    response.raise_for_status()
    return response.json()[0]["generated_text"]
pages/pages/2_👩🏼‍🔬 Describe Domain.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import streamlit as st
4
+
5
+ from hub import push_dataset_to_hub
6
+ from infer import query
7
+ from defaults import (
8
+ DEFAULT_DOMAIN,
9
+ DEFAULT_PERSPECTIVES,
10
+ DEFAULT_TOPICS,
11
+ DEFAULT_EXAMPLES,
12
+ DEFAULT_SYSTEM_PROMPT,
13
+ N_PERSPECTIVES,
14
+ N_TOPICS,
15
+ SEED_DATA_PATH,
16
+ PIPELINE_PATH,
17
+ DATASET_REPO_ID,
18
+ )
19
+ from utils import project_sidebar
20
+
21
+ st.set_page_config(
22
+ page_title="Domain Data Grower",
23
+ page_icon="🧑‍🌾",
24
+ )
25
+ project_sidebar()
26
+
27
+ ################################################################################
28
+ # HEADER
29
+ ################################################################################
30
+
31
+ st.header("🧑‍🌾 Domain Data Grower")
32
+ st.divider()
33
+ st.subheader(
34
+ "Step 2. Define the specific domain that you want to generate synthetic data for.",
35
+ )
36
+ st.write(
37
+ "Define the project details, including the project name, domain, and API credentials"
38
+ )
39
+
40
+ ################################################################################
41
+ # Domain Expert Section
42
+ ################################################################################
43
+
44
+ (
45
+ tab_domain_expert,
46
+ tab_domain_perspectives,
47
+ tab_domain_topics,
48
+ tab_examples,
49
+ tab_raw_seed,
50
+ ) = st.tabs(
51
+ tabs=[
52
+ "👩🏼‍🔬 Domain Expert",
53
+ "🔍 Domain Perspectives",
54
+ "🕸️ Domain Topics",
55
+ "📚 Examples",
56
+ "🌱 Raw Seed Data",
57
+ ]
58
+ )
59
+
60
+ with tab_domain_expert:
61
+ st.text("Define the domain expertise that you want to train a language model")
62
+ st.info(
63
+ "A domain expert is a person who is an expert in a particular field or area. For example, a domain expert in farming would be someone who has extensive knowledge and experience in farming and agriculture."
64
+ )
65
+
66
+ domain = st.text_input("Domain Name", DEFAULT_DOMAIN)
67
+
68
+ domain_expert_prompt = st.text_area(
69
+ label="Domain Expert Definition",
70
+ value=DEFAULT_SYSTEM_PROMPT,
71
+ height=200,
72
+ )
73
+
74
+ ################################################################################
75
+ # Domain Perspectives
76
+ ################################################################################
77
+
78
+ with tab_domain_perspectives:
79
+ st.text("Define the different perspectives from which the domain can be viewed")
80
+ st.info(
81
+ """
82
+ Perspectives are different viewpoints or angles from which a domain can be viewed.
83
+ For example, the domain of farming can be viewed from the perspective of a commercial
84
+ farmer or an independent family farmer."""
85
+ )
86
+
87
+ perspectives = st.session_state.get(
88
+ "perspectives",
89
+ [DEFAULT_PERSPECTIVES[0]],
90
+ )
91
+ perspectives_container = st.container()
92
+
93
+ perspectives = [
94
+ perspectives_container.text_input(
95
+ f"Domain Perspective {i + 1}", value=perspective
96
+ )
97
+ for i, perspective in enumerate(perspectives)
98
+ ]
99
+
100
+ if st.button("Add Perspective", key="add_perspective"):
101
+ n = len(perspectives)
102
+ value = DEFAULT_PERSPECTIVES[n] if n < N_PERSPECTIVES else ""
103
+ perspectives.append(
104
+ perspectives_container.text_input(f"Domain Perspective {n + 1}", value="")
105
+ )
106
+
107
+ st.session_state["perspectives"] = perspectives
108
+
109
+
110
+ ################################################################################
111
+ # Domain Topics
112
+ ################################################################################
113
+
114
+ with tab_domain_topics:
115
+ st.text("Define the main themes or subjects that are relevant to the domain")
116
+ st.info(
117
+ """Topics are the main themes or subjects that are relevant to the domain. For example, the domain of farming can have topics like soil health, crop rotation, or livestock management."""
118
+ )
119
+ topics = st.session_state.get(
120
+ "topics",
121
+ [DEFAULT_TOPICS[0]],
122
+ )
123
+ topics_container = st.container()
124
+ topics = [
125
+ topics_container.text_input(f"Domain Topic {i + 1}", value=topic)
126
+ for i, topic in enumerate(topics)
127
+ ]
128
+
129
+ if st.button("Add Topic", key="add_topic"):
130
+ n = len(topics)
131
+ value = DEFAULT_TOPICS[n] if n < N_TOPICS else ""
132
+ topics.append(topics_container.text_input(f"Domain Topics {n + 1}", value=""))
133
+
134
+ st.session_state["topics"] = topics
135
+
136
+
137
+ ################################################################################
138
+ # Examples Section
139
+ ################################################################################
140
+
141
+ with tab_examples:
142
+ st.text(
143
+ "Add high-quality questions and answers that can be used to generate synthetic data"
144
+ )
145
+ st.info(
146
+ """
147
+ Examples are high-quality questions and answers that can be used to generate
148
+ synthetic data for the domain. These examples will be used to train the language model
149
+ to generate questions and answers.
150
+ """
151
+ )
152
+
153
+ examples = st.session_state.get(
154
+ "examples",
155
+ [
156
+ {
157
+ "question": "",
158
+ "answer": "",
159
+ }
160
+ ],
161
+ )
162
+
163
+ for n, example in enumerate(examples, 1):
164
+ question = example["question"]
165
+ answer = example["answer"]
166
+ examples_container = st.container()
167
+ question_column, answer_column = examples_container.columns(2)
168
+
169
+ if st.button(f"Generate Answer {n}"):
170
+ if st.session_state["hub_token"] is None:
171
+ st.error("Please provide a Hub token to generate answers")
172
+ else:
173
+ answer = query(question, st.session_state["hub_token"])
174
+ with question_column:
175
+ question = st.text_area(f"Question {n}", value=question)
176
+
177
+ with answer_column:
178
+ answer = st.text_area(f"Answer {n}", value=answer)
179
+ examples[n - 1] = {"question": question, "answer": answer}
180
+ st.session_state["examples"] = examples
181
+ st.divider()
182
+
183
+ if st.button("Add Example"):
184
+ examples.append({"question": "", "answer": ""})
185
+ st.session_state["examples"] = examples
186
+ st.rerun()
187
+
188
+ ################################################################################
189
+ # Save Domain Data
190
+ ################################################################################
191
+
192
+ perspectives = list(filter(None, perspectives))
193
+ topics = list(filter(None, topics))
194
+
195
+ domain_data = {
196
+ "domain": domain,
197
+ "perspectives": perspectives,
198
+ "topics": topics,
199
+ "examples": examples,
200
+ "domain_expert_prompt": domain_expert_prompt,
201
+ }
202
+
203
+ with open(SEED_DATA_PATH, "w") as f:
204
+ json.dump(domain_data, f, indent=2)
205
+
206
+ with tab_raw_seed:
207
+ st.code(json.dumps(domain_data, indent=2), language="json", line_numbers=True)
208
+
209
+ ################################################################################
210
+ # Setup Dataset on the Hub
211
+ ################################################################################
212
+
213
+ st.divider()
214
+
215
+ hub_username = DATASET_REPO_ID.split("/")[0]
216
+ project_name = DATASET_REPO_ID.split("/")[1]
217
+ st.write("Define the dataset repo details on the Hub")
218
+ st.session_state["project_name"] = st.text_input("Project Name", project_name)
219
+ st.session_state["hub_username"] = st.text_input("Hub Username", hub_username)
220
+ st.session_state["hub_token"] = st.text_input("Hub Token", type="password", value=None)
221
+
222
+ if all(
223
+ (
224
+ st.session_state.get("project_name"),
225
+ st.session_state.get("hub_username"),
226
+ st.session_state.get("hub_token"),
227
+ )
228
+ ):
229
+ st.success(f"Using the dataset repo {hub_username}/{project_name} on the Hub")
230
+
231
+
232
# Push the seed data to the Hub once every domain field has a value.
# ``examples`` is the list built in the Examples tab; the previous name
# ``questions_answers`` no longer exists and raised NameError on click.
if st.button("🤗 Push Dataset Seed") and all(
    (
        domain,
        domain_expert_prompt,
        perspectives,
        topics,
        examples,
    )
):
    # The repo details must all be filled in before anything is uploaded.
    if all(
        (
            st.session_state.get("project_name"),
            st.session_state.get("hub_username"),
            st.session_state.get("hub_token"),
        )
    ):
        project_name = st.session_state["project_name"]
        hub_username = st.session_state["hub_username"]
        hub_token = st.session_state["hub_token"]
    else:
        st.error(
            "Please create a dataset repo on the Hub before pushing the dataset seed"
        )
        st.stop()

    push_dataset_to_hub(
        domain_seed_data_path=SEED_DATA_PATH,
        project_name=project_name,
        domain=domain,
        hub_username=hub_username,
        hub_token=hub_token,
        pipeline_path=PIPELINE_PATH,
    )

    st.success(
        f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name})"
    )

    st.write("You can now move on to runnning your distilabel pipeline.")

    st.page_link(
        page="pages/3_🌱 Generate Dataset.py",
        label="Generate Dataset",
        icon="🌱",
    )

else:
    st.info(
        "Please fill in all the required domain fields to push the dataset seed to the Hub"
    )
pages/pages/3_🌱 Generate Dataset.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ from hub import pull_seed_data_from_repo, push_pipeline_to_hub
4
+ from defaults import (
5
+ DEFAULT_SYSTEM_PROMPT,
6
+ PIPELINE_PATH,
7
+ PROJECT_NAME,
8
+ ARGILLA_URL,
9
+ HUB_USERNAME,
10
+ CODELESS_DISTILABEL,
11
+ )
12
+ from utils import project_sidebar
13
+
14
+ from pipeline import serialize_pipeline, run_pipeline, create_pipelines_run_command
15
+
16
# Page-level Streamlit setup for step 3 ("Generate Dataset"): renders the
# header and collects every setting the run handlers further down read
# (Hub repo, inference endpoint, Argilla target).
st.set_page_config(
    page_title="Domain Data Grower",
    page_icon="🧑‍🌾",
)

project_sidebar()

################################################################################
# HEADER
################################################################################

st.header("🧑‍🌾 Domain Data Grower")
st.divider()
st.subheader("Step 3. Run the pipeline to generate synthetic data")
st.write("Define the project repos and models that the pipeline will use.")

st.divider()
###############################################################
# CONFIGURATION
###############################################################

st.markdown("## Pipeline Configuration")

st.markdown("#### 🤗 Hub details to pull the seed data")
hub_username = st.text_input("Hub Username", HUB_USERNAME)
project_name = st.text_input("Project Name", PROJECT_NAME)
repo_id = f"{hub_username}/{project_name}"
# The token is entered as a password field so it is masked in the UI.
hub_token = st.text_input("Hub Token", type="password")

st.divider()

st.markdown("#### 🤖 Inference configuration")

st.write(
    "Add the url of the Huggingface inference API or endpoint that your pipeline should use. You can find compatible models here:"
)

with st.expander("🤗 Recommended Models"):
    st.write("All inference endpoint compatible models can be found via the link below")
    st.link_button(
        "🤗 Inference compatible models on the hub",
        "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
    )
    st.write("🔋Projects with sufficient resources could take advantage of LLama3 70b")
    st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B")

    st.write("🪫Projects with less resources could take advantage of LLama 3 8b")
    st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B")

    st.write("🍃Projects with even less resources could take advantage of Phi-2")
    st.code("https://api-inference.huggingface.co/models/microsoft/phi-2")

    st.write("Note Huggingface Pro gives access to more compute resources")
    st.link_button(
        "🤗 Huggingface Pro",
        "https://huggingface.co/pricing",
    )


base_url = st.text_input(
    label="Base URL for the Inference API",
    value="https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta",
)
st.divider()
st.markdown("#### 🔬 Argilla API details to push the generated dataset")
argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
# The Argilla dataset name defaults to the project name entered above.
argilla_dataset_name = st.text_input("Argilla Dataset Name", project_name)
st.divider()

###############################################################
# LOCAL
###############################################################

st.markdown("## Run the pipeline")

st.write(
    "Once you've defined the pipeline configuration, you can run the pipeline from your local machine."
)

# The in-space run option is only advertised when CODELESS_DISTILABEL is set.
if CODELESS_DISTILABEL:
    st.write(
        """We recommend running the pipeline locally if you're planning on generating a large dataset. \
    But running the pipeline on this space is a handy way to get started quickly. Your synthetic
    samples will be pushed to Argilla and available for review.
    """
    )
    st.write(
        """If you're planning on running the pipeline on the space, be aware that it \
    will take some time to complete and you will need to maintain a \
    connection to the space."""
    )
108
+
109
+
110
# "Run locally" handler: pulls the seed data from the Hub, serializes the
# pipeline config, pushes it to the project repo and prints the shell commands
# the user has to execute on their own machine.
if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
    if all(
        [
            argilla_api_key,
            argilla_url,
            base_url,
            hub_username,
            project_name,
            hub_token,
            argilla_dataset_name,
        ]
    ):
        with st.spinner("Pulling seed data from the Hub..."):
            try:
                seed_data = pull_seed_data_from_repo(
                    repo_id=f"{hub_username}/{project_name}",
                    hub_token=hub_token,
                )
            except Exception:
                st.error(
                    "Seed data not found. Please make sure you pushed the data seed in Step 2."
                )
                # Nothing below can work without seed data. Halt this script
                # run here instead of falling through to a NameError on the
                # unbound `seed_data`.
                st.stop()

        domain = seed_data["domain"]
        perspectives = seed_data["perspectives"]
        topics = seed_data["topics"]
        examples = seed_data["examples"]
        domain_expert_prompt = seed_data["domain_expert_prompt"]

        with st.spinner("Serializing the pipeline configuration..."):
            serialize_pipeline(
                argilla_api_key=argilla_api_key,
                argilla_dataset_name=argilla_dataset_name,
                argilla_api_url=argilla_url,
                topics=topics,
                perspectives=perspectives,
                pipeline_config_path=PIPELINE_PATH,
                # Fall back to the default system prompt when the seed has none.
                domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
                hub_token=hub_token,
                endpoint_base_url=base_url,
                examples=examples,
            )
            push_pipeline_to_hub(
                pipeline_path=PIPELINE_PATH,
                hub_token=hub_token,
                hub_username=hub_username,
                project_name=project_name,
            )

        st.success(f"Pipeline configuration saved to {hub_username}/{project_name}")

        st.info(
            "To run the pipeline locally, you need to have the `distilabel` library installed. You can install it using the following command:"
        )
        st.text(
            "Execute the following command to generate a synthetic dataset from the seed data:"
        )
        command_to_run = create_pipelines_run_command(
            hub_token=hub_token,
            pipeline_config_path=PIPELINE_PATH,
            argilla_dataset_name=argilla_dataset_name,
            argilla_api_key=argilla_api_key,
            argilla_api_url=argilla_url,
        )
        # The first element of the generated command is this server's
        # interpreter path; show a plain `python` instead so the snippet is
        # usable on the user's machine.
        st.code(
            f"""
        pip install git+https://github.com/argilla-io/distilabel.git
        git clone https://huggingface.co/datasets/{hub_username}/{project_name}
        cd {project_name}
        pip install -r requirements.txt
        {' '.join(["python"] + command_to_run[1:])}
        """,
            language="bash",
        )
        st.subheader(
            "👩‍🚀 If you want to access the pipeline and manipulate the locally, you can do:"
        )
        st.code(
            """
        git clone https://github.com/huggingface/data-is-better-together
        cd domain-specific-datasets
        """
        )
    else:
        st.error("Please fill all the required fields.")
195
+
196
+ ###############################################################
197
+ # SPACE
198
+ ###############################################################
199
# "Run on this space" handler, only offered when CODELESS_DISTILABEL is set:
# pulls the seed data, serializes the pipeline config and streams the
# pipeline's log lines into the page.
if CODELESS_DISTILABEL:
    if st.button("🔥 Run pipeline right here, right now!"):
        if all(
            [
                argilla_api_key,
                argilla_url,
                base_url,
                hub_username,
                project_name,
                hub_token,
                argilla_dataset_name,
            ]
        ):
            with st.spinner("Pulling seed data from the Hub..."):
                try:
                    seed_data = pull_seed_data_from_repo(
                        repo_id=f"{hub_username}/{project_name}",
                        hub_token=hub_token,
                    )
                except Exception:
                    st.error(
                        "Seed data not found. Please make sure you pushed the data seed in Step 2."
                    )
                    # Nothing below can work without seed data. Halt this
                    # script run here instead of falling through to a
                    # NameError on the unbound `seed_data`.
                    st.stop()

            domain = seed_data["domain"]
            perspectives = seed_data["perspectives"]
            topics = seed_data["topics"]
            examples = seed_data["examples"]
            domain_expert_prompt = seed_data["domain_expert_prompt"]

            serialize_pipeline(
                argilla_api_key=argilla_api_key,
                argilla_dataset_name=argilla_dataset_name,
                argilla_api_url=argilla_url,
                topics=topics,
                perspectives=perspectives,
                pipeline_config_path=PIPELINE_PATH,
                # Fall back to the default system prompt when the seed has none.
                domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
                hub_token=hub_token,
                endpoint_base_url=base_url,
                examples=examples,
            )

            with st.spinner("Starting the pipeline..."):
                logs = run_pipeline(
                    pipeline_config_path=PIPELINE_PATH,
                    argilla_api_key=argilla_api_key,
                    argilla_api_url=argilla_url,
                    hub_token=hub_token,
                    argilla_dataset_name=argilla_dataset_name,
                )

            st.success("Pipeline started successfully! 🚀")

            # `logs` is iterated here, so log lines are rendered one by one
            # as the loop receives them.
            with st.expander(label="View Logs", expanded=True):
                for out in logs:
                    st.text(out)
        else:
            st.error("Please fill all the required fields.")
pages/pages/4_🔍 Review Generated Data.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ from defaults import PROJECT_NAME, ARGILLA_URL, DATASET_REPO_ID
4
+ from utils import project_sidebar
5
+ from hub import push_argilla_dataset_to_hub
6
+
7
# Step 4 page: collects the Argilla connection details and, on request,
# publishes the reviewed Argilla dataset to a Hub dataset repo.
st.set_page_config(
    page_title="Domain Data Grower",
    page_icon="🧑‍🌾",
)

project_sidebar()

################################################################################
# HEADER
################################################################################

st.header("🧑‍🌾 Domain Data Grower")
st.divider()

st.write(
    """Once you have reviewed the synthetic data in Argilla, you can publish the
    generated dataset to the Hub."""
)


################################################################################
# Configuration
################################################################################

st.divider()
st.write("🔬 Argilla API details to push the generated dataset")
argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
argilla_dataset_name = st.text_input("Argilla Dataset Name", PROJECT_NAME)
dataset_repo_id = st.text_input("Dataset Repo ID", DATASET_REPO_ID)
st.divider()

if st.button("🚀 Publish the generated dataset"):
    # Refuse to call the Hub/Argilla helper with blank inputs; same guard and
    # message as the run buttons on the Generate Dataset page.
    if all(
        [
            argilla_url,
            argilla_api_key,
            argilla_dataset_name,
            dataset_repo_id,
        ]
    ):
        with st.spinner("Publishing the generated dataset..."):
            push_argilla_dataset_to_hub(
                name=argilla_dataset_name,
                repo_id=dataset_repo_id,
                url=argilla_url,
                api_key=argilla_api_key,
                workspace="admin",
            )
        st.success("The generated dataset has been published to the Hub.")
    else:
        st.error("Please fill all the required fields.")
pages/pipeline.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import subprocess
import sys
import time
from typing import List, Optional

from distilabel.steps.generators.data import LoadDataFromDicts
from distilabel.steps.expand import ExpandColumns
from distilabel.steps.keep import KeepColumns
from distilabel.steps.tasks.self_instruct import SelfInstruct
from distilabel.steps.tasks.evol_instruct.base import EvolInstruct
from distilabel.llms.huggingface import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import TextGenerationToArgilla
from dotenv import load_dotenv

from domain import (
    DomainExpert,
    CleanNumberedList,
    create_topics,
    create_examples_template,
    APPLICATION_DESCRIPTION,
)
23
+
24
+ load_dotenv()
25
+
26
+
27
def define_pipeline(
    argilla_api_key: str,
    argilla_api_url: str,
    argilla_dataset_name: str,
    topics: List[str],
    perspectives: List[str],
    domain_expert_prompt: str,
    examples: List[dict],
    hub_token: str,
    endpoint_base_url: str,
):
    """Build the distilabel pipeline for one domain.

    The steps form a single linear chain: load seed terms, self-instruct,
    expand instructions, clean the numbered list, evolve the instructions,
    expand the evolutions, answer as the domain expert, keep the final
    columns, and push to Argilla.

    Args:
        argilla_api_key: API key for the Argilla step.
        argilla_api_url: API URL for the Argilla step.
        argilla_dataset_name: Name of the Argilla dataset to write to.
        topics: Domain topics, combined with ``perspectives`` into seed terms.
        perspectives: Domain perspectives, combined with ``topics``.
        domain_expert_prompt: System prompt assigned to the domain expert step.
        examples: Examples used to build the domain expert template.
        hub_token: Token passed as the API key of the inference endpoint LLM.
        endpoint_base_url: Base URL of the inference endpoint.

    Returns:
        The assembled ``Pipeline``.
    """
    seed_terms = create_topics(topics, perspectives)
    examples_template = create_examples_template(examples)

    with Pipeline("farming") as pipeline:
        # Steps are instantiated in the same order as they are chained.
        load_data = LoadDataFromDicts(
            name="load_data",
            data=[{"input": seed_term} for seed_term in seed_terms],
            batch_size=64,
        )

        # One LLM client is shared by every generation step.
        llm = InferenceEndpointsLLM(
            base_url=endpoint_base_url,
            api_key=hub_token,
        )

        self_instruct = SelfInstruct(
            name="self-instruct",
            application_description=APPLICATION_DESCRIPTION,
            num_instructions=5,
            input_batch_size=8,
            llm=llm,
        )

        evol_instruction_complexity = EvolInstruct(
            name="evol_instruction_complexity",
            llm=llm,
            num_evolutions=2,
            store_evolutions=True,
            input_batch_size=8,
            include_original_instruction=True,
            input_mappings={"instruction": "question"},
        )

        expand_instructions = ExpandColumns(
            name="expand_columns",
            columns={"instructions": "question"},
        )
        cleaner = CleanNumberedList(name="clean_numbered_list")
        expand_evolutions = ExpandColumns(
            name="expand_columns_evolved",
            columns={"evolved_instructions": "evolved_questions"},
        )

        domain_expert = DomainExpert(
            name="domain_expert",
            llm=llm,
            input_batch_size=8,
            input_mappings={"instruction": "evolved_questions"},
            output_mappings={"generation": "domain_expert_answer"},
        )
        # The prompt and template are injected after construction.
        domain_expert._system_prompt = domain_expert_prompt
        domain_expert._template = examples_template

        keep_columns = KeepColumns(
            name="keep_columns",
            columns=["model_name", "evolved_questions", "domain_expert_answer"],
        )

        to_argilla = TextGenerationToArgilla(
            name="text_generation_to_argilla",
            dataset_name=argilla_dataset_name,
            dataset_workspace="admin",
            api_url=argilla_api_url,
            api_key=argilla_api_key,
            input_mappings={
                "instruction": "evolved_questions",
                "generation": "domain_expert_answer",
            },
        )

        # Connect each step to its successor, in execution order.
        chain = [
            load_data,
            self_instruct,
            expand_instructions,
            cleaner,
            evol_instruction_complexity,
            expand_evolutions,
            domain_expert,
            keep_columns,
            to_argilla,
        ]
        for upstream, downstream in zip(chain, chain[1:]):
            upstream.connect(downstream)

    return pipeline
116
+
117
+
118
def serialize_pipeline(
    argilla_api_key: str,
    argilla_api_url: str,
    argilla_dataset_name: str,
    topics: List[str],
    perspectives: List[str],
    domain_expert_prompt: str,
    hub_token: str,
    endpoint_base_url: str,
    pipeline_config_path: str = "pipeline.yaml",
    examples: Optional[List[dict]] = None,
):
    """Serialize the pipeline to a yaml file.

    Builds the pipeline with ``define_pipeline`` and saves it to
    ``pipeline_config_path``, overwriting any existing file.

    Args:
        argilla_api_key: API key for the Argilla step.
        argilla_api_url: API URL for the Argilla step.
        argilla_dataset_name: Name of the Argilla dataset to write to.
        topics: Domain topics.
        perspectives: Domain perspectives.
        domain_expert_prompt: System prompt for the domain expert step.
        hub_token: Token used as the inference endpoint API key.
        endpoint_base_url: Base URL of the inference endpoint.
        pipeline_config_path: Destination path of the yaml file.
        examples: Examples for the domain expert template. ``None`` (the
            default) is treated as an empty list.
    """
    # A fresh list per call avoids sharing one mutable default object across
    # invocations.
    examples = [] if examples is None else examples

    pipeline = define_pipeline(
        argilla_api_key=argilla_api_key,
        argilla_api_url=argilla_api_url,
        argilla_dataset_name=argilla_dataset_name,
        topics=topics,
        perspectives=perspectives,
        domain_expert_prompt=domain_expert_prompt,
        hub_token=hub_token,
        endpoint_base_url=endpoint_base_url,
        examples=examples,
    )
    pipeline.save(path=pipeline_config_path, overwrite=True, format="yaml")
143
+
144
+
145
def create_pipelines_run_command(
    hub_token: str,
    argilla_api_key: str,
    argilla_api_url: str,
    pipeline_config_path: str = "pipeline.yaml",
    argilla_dataset_name: str = "domain_specific_datasets",
):
    """Create the command to run the pipeline.

    Returns the argument list for ``<python> -m distilabel pipeline run``
    against ``pipeline_config_path``. The Argilla settings and the LLM API
    key of each generation step are passed as ``--param`` overrides, and the
    command ends with ``--ignore-cache``.
    """
    # One "step.attribute=value" override per runtime parameter.
    overrides = (
        f"text_generation_to_argilla.dataset_name={argilla_dataset_name}",
        f"text_generation_to_argilla.api_key={argilla_api_key}",
        f"text_generation_to_argilla.api_url={argilla_api_url}",
        f"self-instruct.llm.api_key={hub_token}",
        f"evol_instruction_complexity.llm.api_key={hub_token}",
        f"domain_expert.llm.api_key={hub_token}",
    )

    argv = [
        sys.executable,
        "-m",
        "distilabel",
        "pipeline",
        "run",
        "--config",
        pipeline_config_path,
    ]
    for override in overrides:
        argv.extend(("--param", override))
    argv.append("--ignore-cache")
    return argv
176
+
177
+
178
def run_pipeline(
    hub_token: str,
    argilla_api_key: str,
    argilla_api_url: str,
    pipeline_config_path: str = "pipeline.yaml",
    argilla_dataset_name: str = "domain_specific_datasets",
):
    """Run the pipeline and yield the output as a generator of logs.

    The pipeline is launched as a subprocess using the command from
    ``create_pipelines_run_command``. Because this is a generator, the
    subprocess is only started once the caller begins iterating.

    Args:
        hub_token: Token exported to the child as ``HF_TOKEN`` and passed as
            the LLM API key.
        argilla_api_key: API key for the Argilla step.
        argilla_api_url: API URL for the Argilla step.
        pipeline_config_path: Path of the serialized pipeline config.
        argilla_dataset_name: Name of the Argilla dataset to write to.

    Yields:
        Each line written by the child process (stdout and stderr combined),
        decoded as UTF-8.
    """

    command_to_run = create_pipelines_run_command(
        hub_token=hub_token,
        pipeline_config_path=pipeline_config_path,
        argilla_dataset_name=argilla_dataset_name,
        argilla_api_key=argilla_api_key,
        argilla_api_url=argilla_api_url,
    )

    # Run the script file
    process = subprocess.Popen(
        args=command_to_run,
        stdout=subprocess.PIPE,
        # Merge stderr into stdout: a separate stderr pipe that nobody reads
        # can fill up and block the child, and its error output would never
        # reach the caller.
        stderr=subprocess.STDOUT,
        # Extend the parent's environment instead of replacing it, so the
        # child keeps PATH, HOME, proxy settings, etc.
        env={**os.environ, "HF_TOKEN": hub_token},
    )

    # Iterating the pipe blocks until a line is available and stops at EOF,
    # so no polling delay is needed between reads.
    for raw_line in process.stdout:
        # Undecodable bytes are replaced rather than aborting the log stream.
        yield raw_line.decode("utf-8", errors="replace")

    # stdout reached EOF: reap the child so it does not linger as a zombie.
    process.stdout.close()
    process.wait()
pages/pipeline.yaml ADDED
@@ -0,0 +1,546 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ distilabel:
2
+ version: 1.0.0
3
+ pipeline:
4
+ name: farming
5
+ description: null
6
+ steps:
7
+ - step:
8
+ name: load_data
9
+ input_mappings: {}
10
+ output_mappings: {}
11
+ batch_size: 64
12
+ data:
13
+ - input: animal welfare from a Family Farming perspective
14
+ - input: animal welfare from a Agribusiness perspective
15
+ - input: animal welfare from a Permaculture perspective
16
+ - input: animal welfare from a Agroforestery perspective
17
+ - input: animal welfare from a Conventional Farming perspective
18
+ - input: economic growth from a Family Farming perspective
19
+ - input: economic growth from a Agribusiness perspective
20
+ - input: economic growth from a Permaculture perspective
21
+ - input: economic growth from a Agroforestery perspective
22
+ - input: economic growth from a Conventional Farming perspective
23
+ - input: land from a Family Farming perspective
24
+ - input: land from a Agribusiness perspective
25
+ - input: land from a Permaculture perspective
26
+ - input: land from a Agroforestery perspective
27
+ - input: land from a Conventional Farming perspective
28
+ - input: resources from a Family Farming perspective
29
+ - input: resources from a Agribusiness perspective
30
+ - input: resources from a Permaculture perspective
31
+ - input: resources from a Agroforestery perspective
32
+ - input: resources from a Conventional Farming perspective
33
+ - input: efficiency from a Family Farming perspective
34
+ - input: efficiency from a Agribusiness perspective
35
+ - input: efficiency from a Permaculture perspective
36
+ - input: efficiency from a Agroforestery perspective
37
+ - input: efficiency from a Conventional Farming perspective
38
+ runtime_parameters_info:
39
+ - name: batch_size
40
+ optional: true
41
+ description: The number of rows that will contain the batches generated by
42
+ the step.
43
+ type_info:
44
+ module: distilabel.steps.generators.data
45
+ name: LoadDataFromDicts
46
+ name: load_data
47
+ - step:
48
+ name: self-instruct
49
+ input_mappings: {}
50
+ output_mappings: {}
51
+ input_batch_size: 8
52
+ llm:
53
+ generation_kwargs: {}
54
+ model_id: null
55
+ endpoint_name: null
56
+ endpoint_namespace: null
57
+ base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud
58
+ tokenizer_id: null
59
+ model_display_name: null
60
+ use_openai_client: false
61
+ type_info:
62
+ module: distilabel.llms.huggingface.inference_endpoints
63
+ name: InferenceEndpointsLLM
64
+ group_generations: false
65
+ num_generations: 1
66
+ num_instructions: 5
67
+ criteria_for_query_generation: 'Incorporate a diverse range of verbs, avoiding
68
+ repetition.
69
+
70
+ Ensure queries are compatible with AI model''s text generation functions and
71
+ are limited to 1-2 sentences.
72
+
73
+ Design queries to be self-contained and standalone.
74
+
75
+ Blend interrogative (e.g., "What is the significance of x?") and imperative
76
+ (e.g., "Detail the process of x.") styles.'
77
+ application_description: 'You are an AI assistant than generates queries around
78
+ the domain of farming.
79
+
80
+ Your should not expect basic but profound questions from your users.
81
+
82
+ The queries should reflect a diversity of vision and economic positions and
83
+ political positions.
84
+
85
+ The queries may know about different methods of farming.
86
+
87
+ The queries can be positioned politically, economically, socially, or practically.
88
+
89
+ Also take into account the impact of diverse causes on diverse domains.'
90
+ runtime_parameters_info:
91
+ - name: input_batch_size
92
+ optional: true
93
+ description: The number of rows that will contain the batches processed by
94
+ the step.
95
+ - name: llm
96
+ runtime_parameters_info:
97
+ - name: generation_kwargs
98
+ description: The kwargs to be propagated to either `generate` or `agenerate`
99
+ methods within each `LLM`.
100
+ keys:
101
+ - name: max_new_tokens
102
+ optional: true
103
+ description: the maximum number of new tokens that the model will generate. Defaults
104
+ to `128`.
105
+ - name: frequency_penalty
106
+ optional: true
107
+ description: the repetition penalty to use for the generation. Defaults to
108
+ `0.0`. Only applies if `use_openai_client=True`.
109
+ - name: presence_penalty
110
+ optional: true
111
+ description: the presence penalty to use for the generation. Defaults
112
+ to `0.0`. Only applies if `use_openai_client=True`.
113
+ - name: repetition_penalty
114
+ optional: true
115
+ description: the repetition penalty to use for the generation. Defaults to
116
+ `None`. Only applies if `use_openai_client=False`.
117
+ - name: temperature
118
+ optional: true
119
+ description: the temperature to use for the generation. Defaults to `1.0`.
120
+ - name: do_sample
121
+ optional: true
122
+ description: whether to use sampling for the generation. Defaults to `False`. Only
123
+ applies if `use_openai_client=False`.
124
+ - name: top_k
125
+ optional: true
126
+ description: the top-k value to use for the generation. Defaults to `0.8`,
127
+ since neither `0.0` nor `1.0` are valid values in TGI.
128
+ - name: top_p
129
+ optional: true
130
+ description: the top-p value to use for the generation. Defaults to `1.0`.
131
+ - name: typical_p
132
+ optional: true
133
+ description: the typical-p value to use for the generation. Defaults to
134
+ `0.5`.
135
+ - name: endpoint_name
136
+ optional: true
137
+ description: The name of the Inference Endpoint to use for the LLM.
138
+ - name: endpoint_namespace
139
+ optional: true
140
+ description: The namespace of the Inference Endpoint to use for the LLM.
141
+ - name: base_url
142
+ optional: true
143
+ description: The base URL to use for the Inference Endpoints API requests.
144
+ - name: api_key
145
+ optional: true
146
+ description: The API key to authenticate the requests to the Inference Endpoints
147
+ API.
148
+ - name: num_generations
149
+ optional: true
150
+ description: The number of generations to be produced per input.
151
+ type_info:
152
+ module: distilabel.steps.tasks.self_instruct
153
+ name: SelfInstruct
154
+ name: self-instruct
155
+ - step:
156
+ name: evol_instruction_complexity
157
+ input_mappings:
158
+ instruction: question
159
+ output_mappings: {}
160
+ input_batch_size: 8
161
+ llm:
162
+ generation_kwargs: {}
163
+ model_id: null
164
+ endpoint_name: null
165
+ endpoint_namespace: null
166
+ base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud
167
+ tokenizer_id: null
168
+ model_display_name: null
169
+ use_openai_client: false
170
+ type_info:
171
+ module: distilabel.llms.huggingface.inference_endpoints
172
+ name: InferenceEndpointsLLM
173
+ group_generations: false
174
+ num_generations: 1
175
+ num_evolutions: 2
176
+ store_evolutions: true
177
+ generate_answers: false
178
+ include_original_instruction: true
179
+ mutation_templates:
180
+ CONSTRAINTS: "I want you act as a Prompt Rewriter.\n\nYour objective is to\
181
+ \ rewrite a given prompt into a more complex version to make those famous\
182
+ \ AI systems (e.g., chatgpt and GPT4) a bit harder to handle.\n\nBut the\
183
+ \ rewritten prompt must be reasonable and must be understood and responded\
184
+ \ by humans.\n\nYour rewriting cannot omit the non-text parts such as the\
185
+ \ table and code in #The Given Prompt#:. Also, please do not omit the input\
186
+ \ in #The Given Prompt#.\n\nYou SHOULD complicate the given prompt using\
187
+ \ the following method: \nPlease add one more constraints/requirements into\
188
+ \ '#The Given Prompt#'\n\nYou should try your best not to make the #Rewritten\
189
+ \ Prompt# become verbose, #Rewritten Prompt# can only add 10 to 20 words\
190
+ \ into #The Given Prompt#.\n\n'#The Given Prompt#', '#Rewritten Prompt#',\
191
+ \ 'given prompt' and 'rewritten prompt' are not allowed to appear in #Rewritten\
192
+ \ Prompt#\n\n#The Given Prompt#:\n<PROMPT>\n#Rewritten Prompt#:\n\n"
193
+ DEEPENING: "I want you act as a Prompt Rewriter.\n\nYour objective is to rewrite\
194
+ \ a given prompt into a more complex version to make those famous AI systems\
195
+ \ (e.g., chatgpt and GPT4) a bit harder to handle.\n\nBut the rewritten\
196
+ \ prompt must be reasonable and must be understood and responded by humans.\n\
197
+ \nYour rewriting cannot omit the non-text parts such as the table and code\
198
+ \ in #The Given Prompt#:. Also, please do not omit the input in #The Given\
199
+ \ Prompt#.\n\nYou SHOULD complicate the given prompt using the following\
200
+ \ method: \nIf #The Given Prompt# contains inquiries about certain issues,\
201
+ \ the depth and breadth of the inquiry can be increased.\n\nYou should try\
202
+ \ your best not to make the #Rewritten Prompt# become verbose, #Rewritten\
203
+ \ Prompt# can only add 10 to 20 words into #The Given Prompt#.\n\n'#The\
204
+ \ Given Prompt#', '#Rewritten Prompt#', 'given prompt' and 'rewritten prompt'\
205
+ \ are not allowed to appear in #Rewritten Prompt#\n\n#The Given Prompt#:\n\
206
+ <PROMPT>\n#Rewritten Prompt#:\n\n"
207
+ CONCRETIZING: "I want you act as a Prompt Rewriter.\n\nYour objective is to\
208
+ \ rewrite a given prompt into a more complex version to make those famous\
209
+ \ AI systems (e.g., chatgpt and GPT4) a bit harder to handle.\n\nBut the\
210
+ \ rewritten prompt must be reasonable and must be understood and responded\
211
+ \ by humans.\n\nYour rewriting cannot omit the non-text parts such as the\
212
+ \ table and code in #The Given Prompt#:. Also, please do not omit the input\
213
+ \ in #The Given Prompt#.\n\nYou SHOULD complicate the given prompt using\
214
+ \ the following method: \nPlease replace general concepts with more specific\
215
+ \ concepts.\n\nYou should try your best not to make the #Rewritten Prompt#\
216
+ \ become verbose, #Rewritten Prompt# can only add 10 to 20 words into #The\
217
+ \ Given Prompt#.\n\n'#The Given Prompt#', '#Rewritten Prompt#', 'given prompt'\
218
+ \ and 'rewritten prompt' are not allowed to appear in #Rewritten Prompt#\n\
219
+ \n#The Given Prompt#:\n<PROMPT>\n#Rewritten Prompt#:\n\n"
220
+ INCREASED_REASONING_STEPS: "I want you act as a Prompt Rewriter.\n\nYour objective\
221
+ \ is to rewrite a given prompt into a more complex version to make those\
222
+ \ famous AI systems (e.g., chatgpt and GPT4) a bit harder to handle.\n\n\
223
+ But the rewritten prompt must be reasonable and must be understood and responded\
224
+ \ by humans.\n\nYour rewriting cannot omit the non-text parts such as the\
225
+ \ table and code in #The Given Prompt#:. Also, please do not omit the input\
226
+ \ in #The Given Prompt#.\n\nYou SHOULD complicate the given prompt using\
227
+ \ the following method: \nIf #The Given Prompt# can be solved with just\
228
+ \ a few simple thinking processes, you can rewrite it to explicitly request\
229
+ \ multiple-step reasoning.\n\nYou should try your best not to make the #Rewritten\
230
+ \ Prompt# become verbose, #Rewritten Prompt# can only add 10 to 20 words\
231
+ \ into #The Given Prompt#.\n\n'#The Given Prompt#', '#Rewritten Prompt#',\
232
+ \ 'given prompt' and 'rewritten prompt' are not allowed to appear in #Rewritten\
233
+ \ Prompt#\n\n#The Given Prompt#:\n<PROMPT>\n#Rewritten Prompt#:\n\n"
234
+ BREADTH: 'I want you act as a Prompt Creator.
235
+
236
+
237
+ Your goal is to draw inspiration from the #Given Prompt# to create a brand
238
+ new prompt.
239
+
240
+
241
+ This new prompt should belong to the same domain as the #Given Prompt# but
242
+ be even more rare.
243
+
244
+
245
+ The LENGTH and complexity of the #Created Prompt# should be similar to that
246
+ of the #Given Prompt#.
247
+
248
+
249
+ The #Created Prompt# must be reasonable and must be understood and responded
250
+ by humans.
251
+
252
+
253
+ ''#Given Prompt#'', ''#Created Prompt#'', ''given prompt'' and ''created
254
+ prompt'' are not allowed to appear in #Created Prompt#
255
+
256
+
257
+ #Given Prompt#:
258
+
259
+ <PROMPT>
260
+
261
+ #Created Prompt#:
262
+
263
+
264
+ '
265
+ seed: 42
266
+ runtime_parameters_info:
267
+ - name: input_batch_size
268
+ optional: true
269
+ description: The number of rows that will contain the batches processed by
270
+ the step.
271
+ - name: llm
272
+ runtime_parameters_info:
273
+ - name: generation_kwargs
274
+ description: The kwargs to be propagated to either `generate` or `agenerate`
275
+ methods within each `LLM`.
276
+ keys:
277
+ - name: max_new_tokens
278
+ optional: true
279
+ description: the maximum number of new tokens that the model will generate. Defaults
280
+ to `128`.
281
+ - name: frequency_penalty
282
+ optional: true
283
+ description: the repetition penalty to use for the generation. Defaults to
284
+ `0.0`. Only applies if `use_openai_client=True`.
285
+ - name: presence_penalty
286
+ optional: true
287
+ description: the presence penalty to use for the generation. Defaults
288
+ to `0.0`. Only applies if `use_openai_client=True`.
289
+ - name: repetition_penalty
290
+ optional: true
291
+ description: the repetition penalty to use for the generation. Defaults to
292
+ `None`. Only applies if `use_openai_client=False`.
293
+ - name: temperature
294
+ optional: true
295
+ description: the temperature to use for the generation. Defaults to `1.0`.
296
+ - name: do_sample
297
+ optional: true
298
+ description: whether to use sampling for the generation. Defaults to `False`. Only
299
+ applies if `use_openai_client=False`.
300
+ - name: top_k
301
+ optional: true
302
+ description: the top-k value to use for the generation. Defaults to `0.8`,
303
+ since neither `0.0` nor `1.0` are valid values in TGI.
304
+ - name: top_p
305
+ optional: true
306
+ description: the top-p value to use for the generation. Defaults to `1.0`.
307
+ - name: typical_p
308
+ optional: true
309
+ description: the typical-p value to use for the generation. Defaults to
310
+ `0.5`.
311
+ - name: endpoint_name
312
+ optional: true
313
+ description: The name of the Inference Endpoint to use for the LLM.
314
+ - name: endpoint_namespace
315
+ optional: true
316
+ description: The namespace of the Inference Endpoint to use for the LLM.
317
+ - name: base_url
318
+ optional: true
319
+ description: The base URL to use for the Inference Endpoints API requests.
320
+ - name: api_key
321
+ optional: true
322
+ description: The API key to authenticate the requests to the Inference Endpoints
323
+ API.
324
+ - name: num_generations
325
+ optional: true
326
+ description: The number of generations to be produced per input.
327
+ - name: seed
328
+ optional: true
329
+ description: As `numpy` is being used in order to randomly pick a mutation
330
+ method, it is nice to set a random seed.
331
+ type_info:
332
+ module: distilabel.steps.tasks.evol_instruct.base
333
+ name: EvolInstruct
334
+ name: evol_instruction_complexity
335
+ - step:
336
+ name: expand_columns
337
+ input_mappings: {}
338
+ output_mappings: {}
339
+ input_batch_size: 50
340
+ columns:
341
+ instructions: question
342
+ runtime_parameters_info:
343
+ - name: input_batch_size
344
+ optional: true
345
+ description: The number of rows that will contain the batches processed by
346
+ the step.
347
+ type_info:
348
+ module: distilabel.steps.expand
349
+ name: ExpandColumns
350
+ name: expand_columns
351
+ - step:
352
+ name: clean_numbered_list
353
+ input_mappings: {}
354
+ output_mappings: {}
355
+ input_batch_size: 50
356
+ runtime_parameters_info:
357
+ - name: input_batch_size
358
+ optional: true
359
+ description: The number of rows that will contain the batches processed by
360
+ the step.
361
+ type_info:
362
+ module: domain
363
+ name: CleanNumberedList
364
+ name: clean_numbered_list
365
+ - step:
366
+ name: expand_columns_evolved
367
+ input_mappings: {}
368
+ output_mappings: {}
369
+ input_batch_size: 50
370
+ columns:
371
+ evolved_instructions: evolved_questions
372
+ runtime_parameters_info:
373
+ - name: input_batch_size
374
+ optional: true
375
+ description: The number of rows that will contain the batches processed by
376
+ the step.
377
+ type_info:
378
+ module: distilabel.steps.expand
379
+ name: ExpandColumns
380
+ name: expand_columns_evolved
381
+ - step:
382
+ name: domain_expert
383
+ input_mappings:
384
+ instruction: evolved_questions
385
+ output_mappings:
386
+ generation: domain_expert_answer
387
+ input_batch_size: 8
388
+ llm:
389
+ generation_kwargs: {}
390
+ model_id: null
391
+ endpoint_name: null
392
+ endpoint_namespace: null
393
+ base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud
394
+ tokenizer_id: null
395
+ model_display_name: null
396
+ use_openai_client: false
397
+ type_info:
398
+ module: distilabel.llms.huggingface.inference_endpoints
399
+ name: InferenceEndpointsLLM
400
+ group_generations: false
401
+ num_generations: 1
402
+ runtime_parameters_info:
403
+ - name: input_batch_size
404
+ optional: true
405
+ description: The number of rows that will contain the batches processed by
406
+ the step.
407
+ - name: llm
408
+ runtime_parameters_info:
409
+ - name: generation_kwargs
410
+ description: The kwargs to be propagated to either `generate` or `agenerate`
411
+ methods within each `LLM`.
412
+ keys:
413
+ - name: max_new_tokens
414
+ optional: true
415
+ description: the maximum number of new tokens that the model will generate. Defaults
416
+ to `128`.
417
+ - name: frequency_penalty
418
+ optional: true
419
+ description: the frequency penalty to use for the generation. Defaults to
420
+ `0.0`. Only applies if `use_openai_client=True`.
421
+ - name: presence_penalty
422
+ optional: true
423
+ description: the presence penalty to use for the generation. Defaults
424
+ to `0.0`. Only applies if `use_openai_client=True`.
425
+ - name: repetition_penalty
426
+ optional: true
427
+ description: the repetition penalty to use for the generation. Defaults to
428
+ `None`. Only applies if `use_openai_client=False`.
429
+ - name: temperature
430
+ optional: true
431
+ description: the temperature to use for the generation. Defaults to `1.0`.
432
+ - name: do_sample
433
+ optional: true
434
+ description: whether to use sampling for the generation. Defaults to `False`. Only
435
+ applies if `use_openai_client=False`.
436
+ - name: top_k
437
+ optional: true
438
+ description: the top-k value to use for the generation. Defaults to `0.8`,
439
+ since neither `0.0` nor `1.0` are valid values in TGI.
440
+ - name: top_p
441
+ optional: true
442
+ description: the top-p value to use for the generation. Defaults to `1.0`.
443
+ - name: typical_p
444
+ optional: true
445
+ description: the typical-p value to use for the generation. Defaults to
446
+ `0.5`.
447
+ - name: endpoint_name
448
+ optional: true
449
+ description: The name of the Inference Endpoint to use for the LLM.
450
+ - name: endpoint_namespace
451
+ optional: true
452
+ description: The namespace of the Inference Endpoint to use for the LLM.
453
+ - name: base_url
454
+ optional: true
455
+ description: The base URL to use for the Inference Endpoints API requests.
456
+ - name: api_key
457
+ optional: true
458
+ description: The API key to authenticate the requests to the Inference Endpoints
459
+ API.
460
+ - name: num_generations
461
+ optional: true
462
+ description: The number of generations to be produced per input.
463
+ type_info:
464
+ module: domain
465
+ name: DomainExpert
466
+ name: domain_expert
467
+ - step:
468
+ name: keep_columns
469
+ input_mappings: {}
470
+ output_mappings: {}
471
+ input_batch_size: 50
472
+ columns:
473
+ - model_name
474
+ - evolved_questions
475
+ - domain_expert_answer
476
+ runtime_parameters_info:
477
+ - name: input_batch_size
478
+ optional: true
479
+ description: The number of rows that will contain the batches processed by
480
+ the step.
481
+ type_info:
482
+ module: distilabel.steps.keep
483
+ name: KeepColumns
484
+ name: keep_columns
485
+ - step:
486
+ name: text_generation_to_argilla
487
+ input_mappings:
488
+ instruction: evolved_questions
489
+ generation: domain_expert_answer
490
+ output_mappings: {}
491
+ input_batch_size: 50
492
+ dataset_name: farming
493
+ dataset_workspace: admin
494
+ api_url: https://argilla-farming.hf.space
495
+ runtime_parameters_info:
496
+ - name: input_batch_size
497
+ optional: true
498
+ description: The number of rows that will contain the batches processed by
499
+ the step.
500
+ - name: dataset_name
501
+ optional: false
502
+ description: The name of the dataset in Argilla.
503
+ - name: dataset_workspace
504
+ optional: true
505
+ description: The workspace where the dataset will be created in Argilla. Defaults to
506
+ `None` which means it will be created in the default workspace.
507
+ - name: api_url
508
+ optional: true
509
+ description: The base URL to use for the Argilla API requests.
510
+ - name: api_key
511
+ optional: true
512
+ description: The API key to authenticate the requests to the Argilla API.
513
+ type_info:
514
+ module: distilabel.steps.argilla.text_generation
515
+ name: TextGenerationToArgilla
516
+ name: text_generation_to_argilla
517
+ connections:
518
+ - from: load_data
519
+ to:
520
+ - self-instruct
521
+ - from: self-instruct
522
+ to:
523
+ - expand_columns
524
+ - from: evol_instruction_complexity
525
+ to:
526
+ - expand_columns_evolved
527
+ - from: expand_columns
528
+ to:
529
+ - clean_numbered_list
530
+ - from: clean_numbered_list
531
+ to:
532
+ - evol_instruction_complexity
533
+ - from: expand_columns_evolved
534
+ to:
535
+ - domain_expert
536
+ - from: domain_expert
537
+ to:
538
+ - keep_columns
539
+ - from: keep_columns
540
+ to:
541
+ - text_generation_to_argilla
542
+ - from: text_generation_to_argilla
543
+ to: []
544
+ type_info:
545
+ module: distilabel.pipeline.local
546
+ name: Pipeline
pages/project_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"project_name": "DEFAULT_DOMAIN", "argilla_space_repo_id": "burtenshaw/domain_test_4_argilla_space", "project_space_repo_id": "burtenshaw/domain_test_4_config_space", "dataset_repo_id": "burtenshaw/domain_test_4"}
pages/requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ datasets
2
+ python_dotenv
3
+ sentence_transformers
4
+ streamlit
5
+ huggingface_hub
6
+ mistralai
7
+ argilla
8
+ git+https://github.com/argilla-io/distilabel.git
pages/seed_data.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "domain": "farming",
3
+ "perspectives": [
4
+ "Family Farming",
5
+ "Agribusiness",
6
+ "Permaculture",
7
+ "Agroforestry",
8
+ "Conventional Farming"
9
+ ],
10
+ "topics": [
11
+ "animal welfare",
12
+ "economic growth",
13
+ "land",
14
+ "resources",
15
+ "efficiency"
16
+ ],
17
+ "examples": [
18
+ {
19
+ "question": "Compare and contrast the environmental footprint of industrial and small-scale farming.",
20
+ "answer": "Regenerative agriculture practices aim to restore soil health through methods that increase soil organic matter, enhance microbial activity, and improve soil structure. These practices include no-till farming, cover cropping, diverse crop rotations, and integrated livestock management. According to LaCanne and Lundgren (2018), soil health improves due to increased biodiversity and organic matter, enhancing its water retention and nutrient efficiency. Moreover, Jones (2012) in \"Soil carbon & organic farming\" reports that these practices significantly elevate biodiversity, both above and below the soil surface, promoting resilient ecosystems and agroecological balances."
21
+ },
22
+ {
23
+ "question": "Compare the environmental footprint of small-scale, local farming versus large-scale, industrial agriculture.",
24
+ "answer": "Industrial agriculture typically emphasizes high-output, monoculture farming reliant on synthetic fertilizers and pesticides, which, as Horrigan, Lawrence, and Walker (2002) argue, leads to greater greenhouse gas emissions, higher energy use, and more water consumption compared to small-scale farming. In contrast, small-scale farms often employ diverse cropping systems and lower chemical inputs, resulting in a smaller environmental footprint. Pimentel et al. (2005) note that small-scale farms tend to have higher yields per unit area when environmental and sustainability factors are integrated into farming practices."
25
+ },
26
+ {
27
+ "question": "Analyze the economic implications of transitioning from conventional to organic farming.",
28
+ "answer": "Transitioning from conventional to organic farming involves significant changes in farm management, input use, and market engagement. Crowder and Reganold (2015) present evidence that organic farms often yield smaller outputs initially but achieve higher profitability due to premium prices, lower input costs, and improved soil health over time. However, this transition requires upfront investments in knowledge and infrastructure, which can be economically challenging for some farmers, as noted by Seufert and Ramankutty (2017)."
29
+ },
30
+ {
31
+ "question": "Analyze the social, economic and environmental impacts of land consolidation vs small-scale farmers.",
32
+ "answer": "Land consolidation has been associated with increased agricultural productivity but also with negative social and environmental impacts. Larger land holdings typically lead to monocultures, which reduce biodiversity and increase vulnerability to pests and diseases, as highlighted by Li et al. (2017). Economically, while consolidation can lead to economies of scale and potential gains in gross margins, it often displaces rural populations, exacerbating poverty and reducing local food diversity (Sutherland et al., 2015)."
33
+ },
34
+ {
35
+ "question": "Investigate the relationship between land ownership patterns, agricultural productivity and environmental sustainability.",
36
+ "answer": "Land ownership patterns critically influence agricultural productivity and sustainability. Secure land tenure supports investments in long-term improvements such as soil conservation and water management, which are pivotal for sustainable outcomes. Studies by Barrett et al. (2010) demonstrate that fragmented land ownership often results in inefficient resource use and higher transaction costs, which can detract from sustainability goals."
37
+ }
38
+ ],
39
+ "domain_expert_prompt": "You will be asked about family farming and agribusiness related topics, from different perspectives.\n Your answer should be logical and supported by facts, don't fabricate arguments. \n Try to gather a diverse point of view taking into account current theories in agronomy, biology, economics, anthropology and ecology."
40
+ }
pages/utils.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ from defaults import (
4
+ ARGILLA_SPACE_REPO_ID,
5
+ PROJECT_NAME,
6
+ ARGILLA_URL,
7
+ DIBT_PARENT_APP_URL,
8
+ DATASET_URL,
9
+ DATASET_REPO_ID,
10
+ ARGILLA_SPACE_REPO_ID,
11
+ )
12
+
13
+
14
+ def project_sidebar():
15
+ if PROJECT_NAME == "DEFAULT_DOMAIN":
16
+ st.warning(
17
+ "Please set up the project configuration in the parent app before proceeding."
18
+ )
19
+ st.stop()
20
+
21
+ st.sidebar.subheader(f"A Data Growing Project in the domain of {PROJECT_NAME}")
22
+ st.sidebar.markdown(
23
+ """
24
+ This space helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
25
+ """
26
+ )
27
+ st.sidebar.link_button(f"📚 Dataset Repo", DATASET_URL)
28
+ st.sidebar.link_button(f"🤖 Argilla Space", ARGILLA_URL)
29
+ st.sidebar.divider()
30
+ st.sidebar.link_button("🧑‍🌾 New Project", DIBT_PARENT_APP_URL)
31
+ st.sidebar.link_button(
32
+ "🤗 Get your Hub Token", "https://huggingface.co/settings/tokens"
33
+ )
seed_data.json CHANGED
@@ -1,39 +1,23 @@
1
  {
2
  "domain": "farming",
3
  "perspectives": [
4
- "Family Farming",
5
- "Agribusiness",
6
- "Permaculture",
7
- "Agroforestery",
8
- "Conventional Farming"
9
  ],
10
  "topics": [
11
- "animal welfare",
12
- "economic growth",
13
- "land",
14
- "resources",
15
- "efficiency"
16
  ],
17
  "examples": [
18
  {
19
- "question": "Compare and contrast the environmental footprint of industrial and small-scale farming.",
20
- "answer": "Regenerative agriculture practices aim to restore soil health through methods that increase soil organic matter, enhance microbial activity, and improve soil structure. These practices include no-till farming, cover cropping, diverse crop rotations, and integrated livestock management. According to LaCanne and Lundgren (2018), soil health improves due to increased biodiversity and organic matter, enhancing its water retention and nutrient efficiency. Moreover, Jones (2012) in \"Soil carbon & organic farming\" reports that these practices significantly elevate biodiversity, both above and below the soil surface, promoting resilient ecosystems and agroecological balances."
21
  },
22
  {
23
- "question": "Compare the environmental footprint of small-scale, local farming versus large-scale, industrial agriculture.",
24
- "answer": "Industrial agriculture typically emphasizes high-output, monoculture farming reliant on synthetic fertilizers and pesticides, which, as Horrigan, Lawrence, and Walker (2002) argue, leads to greater greenhouse gas emissions, higher energy use, and more water consumption compared to small-scale farming. In contrast, small-scale farms often employ diverse cropping systems and lower chemical inputs, resulting in a smaller environmental footprint. Pimentel et al. (2005) note that small-scale farms tend to have higher yields per unit area when environmental and sustainability factors are integrated into farming practices."
25
  },
26
  {
27
- "question": "Analyze the economic implications of transitioning from conventional to organic farming.",
28
- "answer": "Transitioning from conventional to organic farming involves significant changes in farm management, input use, and market engagement. Crowder and Reganold (2015) present evidence that organic farms often yield smaller outputs initially but achieve higher profitability due to premium prices, lower input costs, and improved soil health over time. However, this transition requires upfront investments in knowledge and infrastructure, which can be economically challenging for some farmers, as noted by Seufert and Ramankutty (2017)."
29
- },
30
- {
31
- "question": "Analyze the social, economic and environnmental impacts of land consolidation vs small-scale farmers.",
32
- "answer": "Land consolidation has been associated with increased agricultural productivity but also with negative social and environmental impacts. Larger land holdings typically lead to monocultures, which reduce biodiversity and increase vulnerability to pests and diseases, as highlighted by Li et al. (2017). Economically, while consolidation can lead to economies of scale and potential gains in gross margins, it often displaces rural populations, exacerbating poverty and reducing local food diversity (Sutherland et al., 2015)."
33
- },
34
- {
35
- "question": "Investigate the relationship between land ownership patterns, agricultural productivity and environment sustainability. ",
36
- "answer": "Land ownership patterns critically influence agricultural productivity and sustainability. Secure land tenure supports investments in long-term improvements such as soil conservation and water management, which are pivotal for sustainable outcomes. Studies by Barrett et al. (2010) demonstrate that fragmented land ownership often results in inefficient resource use and higher transaction costs, which can detract from sustainability goals."
37
  }
38
  ],
39
  "domain_expert_prompt": "You will be asked about family farming and agribusiness related topics, from different perspectives.\n Your answer should be logical and supported by facts, don't fabricate arguments. \n Try to gather a diverse point of view taking into account current theories in agronomy, biology, economics, anthropology and ecology."
 
1
  {
2
  "domain": "farming",
3
  "perspectives": [
4
+ "Family Farming"
 
 
 
 
5
  ],
6
  "topics": [
7
+ "animal welfare"
 
 
 
 
8
  ],
9
  "examples": [
10
  {
11
+ "question": "",
12
+ "answer": ""
13
  },
14
  {
15
+ "question": "",
16
+ "answer": ""
17
  },
18
  {
19
+ "question": "",
20
+ "answer": ""
 
 
 
 
 
 
 
 
21
  }
22
  ],
23
  "domain_expert_prompt": "You will be asked about family farming and agribusiness related topics, from different perspectives.\n Your answer should be logical and supported by facts, don't fabricate arguments. \n Try to gather a diverse point of view taking into account current theories in agronomy, biology, economics, anthropology and ecology."