Ben Burtenshaw committed on
Commit
0ac0929
1 Parent(s): 01af24e

fix expose pages on parent app

Browse files
domain.py DELETED
@@ -1,89 +0,0 @@
1
- import json
2
- from typing import Any, Dict, List
3
-
4
- from distilabel.steps.tasks.typing import ChatType
5
- from distilabel.steps.tasks.text_generation import TextGeneration
6
- from distilabel.steps import StepInput, StepOutput, Step
7
-
8
- from dotenv import load_dotenv
9
-
10
- from defaults import (
11
- DEFAULT_DOMAIN,
12
- DEFAULT_PERSPECTIVES,
13
- DEFAULT_TOPICS,
14
- DEFAULT_EXAMPLES,
15
- DEFAULT_SYSTEM_PROMPT,
16
- N_PERSPECTIVES,
17
- N_TOPICS,
18
- N_EXAMPLES,
19
- )
20
-
21
- load_dotenv()
22
-
23
- # Application description used for SelfInstruct
24
- APPLICATION_DESCRIPTION = f"""You are an AI assistant than generates queries around the domain of {DEFAULT_DOMAIN}.
25
- Your should not expect basic but profound questions from your users.
26
- The queries should reflect a diversity of vision and economic positions and political positions.
27
- The queries may know about different methods of {DEFAULT_DOMAIN}.
28
- The queries can be positioned politically, economically, socially, or practically.
29
- Also take into account the impact of diverse causes on diverse domains."""
30
-
31
-
32
- TOPICS = DEFAULT_TOPICS[:N_TOPICS]
33
- PERSPECTIVES = DEFAULT_PERSPECTIVES[:N_PERSPECTIVES]
34
- EXAMPLES = DEFAULT_EXAMPLES[:N_EXAMPLES]
35
-
36
-
37
def create_examples_template(examples: List[Dict[str, str]]) -> str:
    """Build the user-prompt template that embeds the example questions and answers.

    The returned string keeps a single ``{instruction}`` placeholder and is meant
    to be rendered with ``str.format`` (see ``DomainExpert.format_input``).

    Args:
        examples: Dictionaries with a ``"question"`` and an ``"answer"`` key.

    Returns:
        The template string: the instruction preamble followed by all example
        questions and then all example answers.
    """

    def _escape_braces(text: str) -> str:
        # Example text is user supplied; literal braces would otherwise be
        # interpreted as replacement fields when the template is formatted.
        return text.replace("{", "{{").replace("}", "}}")

    questions = " Examples of high quality questions:" + "".join(
        f"\n- Question: {_escape_braces(example['question'])}\n"
        for example in examples
    )
    answers = " Examples of high quality answers:" + "".join(
        f"\n- Answer: {_escape_braces(example['answer'])}\n" for example in examples
    )

    return (
        """{instruction}\nThis is the the instruction.\n Examples: """
        + questions
        + answers
    )
50
-
51
-
52
def create_topics(topics: List[str], positions: List[str]) -> List[str]:
    """Pair every topic with every position.

    Args:
        topics: Topic names.
        positions: Perspectives to combine with each topic.

    Returns:
        One "<topic> from a <position> perspective" string per pair, ordered
        topic-major (all positions of the first topic come first).
    """
    combined: List[str] = []
    for topic in topics:
        for position in positions:
            combined.append(f"{topic} from a {position} perspective")
    return combined
58
-
59
-
60
class DomainExpert(TextGeneration):
    """Text-generation task that answers as a domain expert.

    The chat sent to the model has two messages: a system message holding
    ``_system_prompt`` and a user message produced by rendering ``_template``
    with the input row.
    """

    # System prompt used unless it is overwritten on the instance.
    _system_prompt: (str) = DEFAULT_SYSTEM_PROMPT
    # User-message template; rendered with ``str.format(**input)``.
    _template: str = """{instruction}\nThis is the the instruction.\n Examples: """

    def format_input(self, input: Dict[str, Any]) -> "ChatType":
        """Return the system + user chat messages for one input row."""
        system_message = {
            "role": "system",
            "content": self._system_prompt,
        }
        user_message = {
            "role": "user",
            "content": self._template.format(**input),
        }
        return [system_message, user_message]
77
-
78
-
79
class CleanNumberedList(Step):
    """Step that strips a leading "N. " list marker from each row's question."""

    def process(self, inputs: StepInput) -> StepOutput:
        """Clean the ``"question"`` field of every row in place and yield the batch."""
        import re

        # One or more digits, a dot and a single whitespace at the very start.
        leading_number = re.compile(r"^\d+\.\s")

        for row in inputs:
            row["question"] = leading_number.sub("", row["question"])
        yield inputs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer.py DELETED
@@ -1,18 +0,0 @@
1
- import os
2
- import requests
3
-
4
- API_URL = (
5
- "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
6
- )
7
-
8
-
9
-
10
-
11
-
12
def query(question, hub_token: str, timeout: float = 60.0):
    """Send ``question`` to the hosted inference endpoint and return the generated text.

    Args:
        question: Prompt text posted as the ``"inputs"`` field.
        hub_token: Token sent as a ``Bearer`` authorization header.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``"generated_text"`` value of the first item in the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        ValueError: If the response body does not have the expected shape.
    """
    payload = {
        "inputs": question,
    }
    headers = {"Authorization": f"Bearer {hub_token}"}
    # Without a timeout a stalled endpoint would block the caller indefinitely.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()

    result = response.json()
    # Error payloads are not a list of generations; fail with a clear message
    # rather than an opaque KeyError/IndexError.
    if (
        not isinstance(result, list)
        or not result
        or not isinstance(result[0], dict)
        or "generated_text" not in result[0]
    ):
        raise ValueError(f"Unexpected response from the inference endpoint: {result!r}")
    return result[0]["generated_text"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/2_👩🏼‍🔬 Describe Domain.py DELETED
@@ -1,281 +0,0 @@
1
- import json
2
-
3
- import streamlit as st
4
-
5
- from hub import push_dataset_to_hub
6
- from infer import query
7
- from defaults import (
8
- DEFAULT_DOMAIN,
9
- DEFAULT_PERSPECTIVES,
10
- DEFAULT_TOPICS,
11
- DEFAULT_EXAMPLES,
12
- DEFAULT_SYSTEM_PROMPT,
13
- N_PERSPECTIVES,
14
- N_TOPICS,
15
- SEED_DATA_PATH,
16
- PIPELINE_PATH,
17
- DATASET_REPO_ID,
18
- )
19
- from utils import project_sidebar
20
-
21
- st.set_page_config(
22
- page_title="Domain Data Grower",
23
- page_icon="🧑‍🌾",
24
- )
25
- project_sidebar()
26
-
27
- ################################################################################
28
- # HEADER
29
- ################################################################################
30
-
31
- st.header("🧑‍🌾 Domain Data Grower")
32
- st.divider()
33
- st.subheader(
34
- "Step 2. Define the specific domain that you want to generate synthetic data for.",
35
- )
36
- st.write(
37
- "Define the project details, including the project name, domain, and API credentials"
38
- )
39
-
40
- ################################################################################
41
- # Domain Expert Section
42
- ################################################################################
43
-
44
- (
45
- tab_domain_expert,
46
- tab_domain_perspectives,
47
- tab_domain_topics,
48
- tab_examples,
49
- tab_raw_seed,
50
- ) = st.tabs(
51
- tabs=[
52
- "👩🏼‍🔬 Domain Expert",
53
- "🔍 Domain Perspectives",
54
- "🕸️ Domain Topics",
55
- "📚 Examples",
56
- "🌱 Raw Seed Data",
57
- ]
58
- )
59
-
60
- with tab_domain_expert:
61
- st.text("Define the domain expertise that you want to train a language model")
62
- st.info(
63
- "A domain expert is a person who is an expert in a particular field or area. For example, a domain expert in farming would be someone who has extensive knowledge and experience in farming and agriculture."
64
- )
65
-
66
- domain = st.text_input("Domain Name", DEFAULT_DOMAIN)
67
-
68
- domain_expert_prompt = st.text_area(
69
- label="Domain Expert Definition",
70
- value=DEFAULT_SYSTEM_PROMPT,
71
- height=200,
72
- )
73
-
74
- ################################################################################
75
- # Domain Perspectives
76
- ################################################################################
77
-
78
with tab_domain_perspectives:
    st.text("Define the different perspectives from which the domain can be viewed")
    st.info(
        """
Perspectives are different viewpoints or angles from which a domain can be viewed.
For example, the domain of farming can be viewed from the perspective of a commercial
farmer or an independent family farmer."""
    )

    # Start from the perspectives stored on a previous rerun, or the first default.
    perspectives = st.session_state.get(
        "perspectives",
        [DEFAULT_PERSPECTIVES[0]],
    )
    perspectives_container = st.container()

    # Re-render one input per known perspective and read back the current text.
    perspectives = [
        perspectives_container.text_input(
            f"Domain Perspective {i + 1}", value=perspective
        )
        for i, perspective in enumerate(perspectives)
    ]

    if st.button("Add Perspective", key="add_perspective"):
        n = len(perspectives)
        # Pre-fill the new input with the next default when one exists; the
        # bound also guards against indexing past the end of the defaults.
        has_default = n < min(N_PERSPECTIVES, len(DEFAULT_PERSPECTIVES))
        value = DEFAULT_PERSPECTIVES[n] if has_default else ""
        perspectives.append(
            perspectives_container.text_input(
                f"Domain Perspective {n + 1}", value=value
            )
        )

    st.session_state["perspectives"] = perspectives
108
-
109
-
110
- ################################################################################
111
- # Domain Topics
112
- ################################################################################
113
-
114
with tab_domain_topics:
    st.text("Define the main themes or subjects that are relevant to the domain")
    st.info(
        """Topics are the main themes or subjects that are relevant to the domain. For example, the domain of farming can have topics like soil health, crop rotation, or livestock management."""
    )
    # Start from the topics stored on a previous rerun, or the first default.
    topics = st.session_state.get(
        "topics",
        [DEFAULT_TOPICS[0]],
    )
    topics_container = st.container()
    # Re-render one input per known topic and read back the current text.
    topics = [
        topics_container.text_input(f"Domain Topic {i + 1}", value=topic)
        for i, topic in enumerate(topics)
    ]

    if st.button("Add Topic", key="add_topic"):
        n = len(topics)
        # Pre-fill the new input with the next default when one exists; the
        # bound also guards against indexing past the end of the defaults.
        has_default = n < min(N_TOPICS, len(DEFAULT_TOPICS))
        value = DEFAULT_TOPICS[n] if has_default else ""
        # The label must match the one used in the list above: on the next
        # rerun the input is rebuilt there, and a different label would make
        # it a different widget and lose what the user typed.
        topics.append(topics_container.text_input(f"Domain Topic {n + 1}", value=value))

    st.session_state["topics"] = topics
135
-
136
-
137
- ################################################################################
138
- # Examples Section
139
- ################################################################################
140
-
141
with tab_examples:
    st.text(
        "Add high-quality questions and answers that can be used to generate synthetic data"
    )
    st.info(
        """
Examples are high-quality questions and answers that can be used to generate
synthetic data for the domain. These examples will be used to train the language model
to generate questions and answers.
"""
    )

    # Start from the examples stored on a previous rerun, or one empty pair.
    examples = st.session_state.get(
        "examples",
        [
            {
                "question": "",
                "answer": "",
            }
        ],
    )

    for n, example in enumerate(examples, 1):
        question = example["question"]
        answer = example["answer"]
        examples_container = st.container()
        question_column, answer_column = examples_container.columns(2)

        if st.button(f"Generate Answer {n}"):
            # The token input is rendered further down the page, so the key
            # may be missing or empty here; never index the state directly.
            inference_token = st.session_state.get("hub_token")
            if not inference_token:
                st.error("Please provide a Hub token to generate answers")
            else:
                try:
                    answer = query(question, inference_token)
                except Exception as error:
                    # Show inference failures in the UI and keep the previous
                    # answer rather than crashing the whole page.
                    st.error(f"Could not generate an answer: {error}")
        with question_column:
            question = st.text_area(f"Question {n}", value=question)

        with answer_column:
            answer = st.text_area(f"Answer {n}", value=answer)
        examples[n - 1] = {"question": question, "answer": answer}
        st.session_state["examples"] = examples
        st.divider()

    if st.button("Add Example"):
        examples.append({"question": "", "answer": ""})
        st.session_state["examples"] = examples
        # Rerun so the new empty pair is rendered by the loop above.
        st.rerun()
187
-
188
- ################################################################################
189
- # Save Domain Data
190
- ################################################################################
191
-
192
- perspectives = list(filter(None, perspectives))
193
- topics = list(filter(None, topics))
194
-
195
- domain_data = {
196
- "domain": domain,
197
- "perspectives": perspectives,
198
- "topics": topics,
199
- "examples": examples,
200
- "domain_expert_prompt": domain_expert_prompt,
201
- }
202
-
203
- with open(SEED_DATA_PATH, "w") as f:
204
- json.dump(domain_data, f, indent=2)
205
-
206
- with tab_raw_seed:
207
- st.code(json.dumps(domain_data, indent=2), language="json", line_numbers=True)
208
-
209
- ################################################################################
210
- # Setup Dataset on the Hub
211
- ################################################################################
212
-
213
- st.divider()
214
-
215
# Defaults for the repo inputs come from the "<username>/<project>" repo id.
hub_username = DATASET_REPO_ID.split("/")[0]
project_name = DATASET_REPO_ID.split("/")[1]
st.write("Define the dataset repo details on the Hub")
st.session_state["project_name"] = st.text_input("Project Name", project_name)
st.session_state["hub_username"] = st.text_input("Hub Username", hub_username)
st.session_state["hub_token"] = st.text_input("Hub Token", type="password", value=None)

if all(
    (
        st.session_state.get("project_name"),
        st.session_state.get("hub_username"),
        st.session_state.get("hub_token"),
    )
):
    # Report the values the user entered, not the defaults they may have replaced.
    st.success(
        "Using the dataset repo "
        f"{st.session_state['hub_username']}/{st.session_state['project_name']}"
        " on the Hub"
    )
230
-
231
-
232
# Push the seed only when the button was clicked and every domain field is set.
if st.button("🤗 Push Dataset Seed") and all(
    (
        domain,
        domain_expert_prompt,
        perspectives,
        topics,
        # The examples collected in the Examples tab; the previous name
        # ``questions_answers`` was never defined and raised NameError.
        examples,
    )
):
    if all(
        (
            st.session_state.get("project_name"),
            st.session_state.get("hub_username"),
            st.session_state.get("hub_token"),
        )
    ):
        project_name = st.session_state["project_name"]
        hub_username = st.session_state["hub_username"]
        hub_token = st.session_state["hub_token"]
    else:
        st.error(
            "Please create a dataset repo on the Hub before pushing the dataset seed"
        )
        # Halt this script run; nothing below can work without repo details.
        st.stop()

    push_dataset_to_hub(
        domain_seed_data_path=SEED_DATA_PATH,
        project_name=project_name,
        domain=domain,
        hub_username=hub_username,
        hub_token=hub_token,
        pipeline_path=PIPELINE_PATH,
    )

    st.success(
        f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name})"
    )

    st.write("You can now move on to runnning your distilabel pipeline.")

    st.page_link(
        page="pages/3_🌱 Generate Dataset.py",
        label="Generate Dataset",
        icon="🌱",
    )

else:
    st.info(
        "Please fill in all the required domain fields to push the dataset seed to the Hub"
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/3_🌱 Generate Dataset.py DELETED
@@ -1,257 +0,0 @@
1
- import streamlit as st
2
-
3
- from hub import pull_seed_data_from_repo, push_pipeline_to_hub
4
- from defaults import (
5
- DEFAULT_SYSTEM_PROMPT,
6
- PIPELINE_PATH,
7
- PROJECT_NAME,
8
- ARGILLA_URL,
9
- HUB_USERNAME,
10
- CODELESS_DISTILABEL,
11
- )
12
- from utils import project_sidebar
13
-
14
- from pipeline import serialize_pipeline, run_pipeline, create_pipelines_run_command
15
-
16
- st.set_page_config(
17
- page_title="Domain Data Grower",
18
- page_icon="🧑‍🌾",
19
- )
20
-
21
- project_sidebar()
22
-
23
- ################################################################################
24
- # HEADER
25
- ################################################################################
26
-
27
- st.header("🧑‍🌾 Domain Data Grower")
28
- st.divider()
29
- st.subheader("Step 3. Run the pipeline to generate synthetic data")
30
- st.write("Define the project repos and models that the pipeline will use.")
31
-
32
- st.divider()
33
- ###############################################################
34
- # CONFIGURATION
35
- ###############################################################
36
-
37
- st.markdown("## Pipeline Configuration")
38
-
39
- st.markdown("#### 🤗 Hub details to pull the seed data")
40
- hub_username = st.text_input("Hub Username", HUB_USERNAME)
41
- project_name = st.text_input("Project Name", PROJECT_NAME)
42
- repo_id = f"{hub_username}/{project_name}"
43
- hub_token = st.text_input("Hub Token", type="password")
44
-
45
- st.divider()
46
-
47
- st.markdown("#### 🤖 Inference configuration")
48
-
49
- st.write(
50
- "Add the url of the Huggingface inference API or endpoint that your pipeline should use. You can find compatible models here:"
51
- )
52
-
53
- with st.expander("🤗 Recommended Models"):
54
- st.write("All inference endpoint compatible models can be found via the link below")
55
- st.link_button(
56
- "🤗 Inference compaptible models on the hub",
57
- "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
58
- )
59
- st.write("🔋Projects with sufficient resources could take advantage of LLama3 70b")
60
- st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B")
61
-
62
- st.write("🪫Projects with less resources could take advantage of LLama 3 8b")
63
- st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B")
64
-
65
- st.write("🍃Projects with even less resources could take advantage of Phi-2")
66
- st.code("https://api-inference.huggingface.co/models/microsoft/phi-2")
67
-
68
- st.write("Note Hugggingface Pro gives access to more compute resources")
69
- st.link_button(
70
- "🤗 Huggingface Pro",
71
- "https://huggingface.co/pricing",
72
- )
73
-
74
-
75
- base_url = st.text_input(
76
- label="Base URL for the Inference API",
77
- value="https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta",
78
- )
79
- st.divider()
80
- st.markdown("#### 🔬 Argilla API details to push the generated dataset")
81
- argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
82
- argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
83
- argilla_dataset_name = st.text_input("Argilla Dataset Name", project_name)
84
- st.divider()
85
-
86
- ###############################################################
87
- # LOCAL
88
- ###############################################################
89
-
90
- st.markdown("## Run the pipeline")
91
-
92
- st.write(
93
- "Once you've defined the pipeline configuration, you can run the pipeline from your local machine."
94
- )
95
-
96
- if CODELESS_DISTILABEL:
97
- st.write(
98
- """We recommend running the pipeline locally if you're planning on generating a large dataset. \
99
- But running the pipeline on this space is a handy way to get started quickly. Your synthetic
100
- samples will be pushed to Argilla and available for review.
101
- """
102
- )
103
- st.write(
104
- """If you're planning on running the pipeline on the space, be aware that it \
105
- will take some time to complete and you will need to maintain a \
106
- connection to the space."""
107
- )
108
-
109
-
110
# Serialize the pipeline, push it to the Hub and show the commands to run it locally.
if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
    if all(
        [
            argilla_api_key,
            argilla_url,
            base_url,
            hub_username,
            project_name,
            hub_token,
            argilla_dataset_name,
        ]
    ):
        with st.spinner("Pulling seed data from the Hub..."):
            try:
                seed_data = pull_seed_data_from_repo(
                    repo_id=f"{hub_username}/{project_name}",
                    hub_token=hub_token,
                )
            except Exception:
                st.error(
                    "Seed data not found. Please make sure you pushed the data seed in Step 2."
                )
                # Without seed data the code below would fail with a NameError
                # on ``seed_data``; end this script run here instead.
                st.stop()

        domain = seed_data["domain"]
        perspectives = seed_data["perspectives"]
        topics = seed_data["topics"]
        examples = seed_data["examples"]
        domain_expert_prompt = seed_data["domain_expert_prompt"]

        with st.spinner("Serializing the pipeline configuration..."):
            serialize_pipeline(
                argilla_api_key=argilla_api_key,
                argilla_dataset_name=argilla_dataset_name,
                argilla_api_url=argilla_url,
                topics=topics,
                perspectives=perspectives,
                pipeline_config_path=PIPELINE_PATH,
                # Fall back to the default prompt when the seed has none.
                domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
                hub_token=hub_token,
                endpoint_base_url=base_url,
                examples=examples,
            )
            push_pipeline_to_hub(
                pipeline_path=PIPELINE_PATH,
                hub_token=hub_token,
                hub_username=hub_username,
                project_name=project_name,
            )

        st.success(f"Pipeline configuration saved to {hub_username}/{project_name}")

        st.info(
            "To run the pipeline locally, you need to have the `distilabel` library installed. You can install it using the following command:"
        )
        st.text(
            "Execute the following command to generate a synthetic dataset from the seed data:"
        )
        command_to_run = create_pipelines_run_command(
            hub_token=hub_token,
            pipeline_config_path=PIPELINE_PATH,
            argilla_dataset_name=argilla_dataset_name,
            argilla_api_key=argilla_api_key,
            argilla_api_url=argilla_url,
        )
        # The first command element is this server's interpreter path; show a
        # plain "python" to the user instead.
        st.code(
            f"""
pip install git+https://github.com/argilla-io/distilabel.git
git clone https://huggingface.co/datasets/{hub_username}/{project_name}
cd {project_name}
pip install -r requirements.txt
{' '.join(["python"] + command_to_run[1:])}
""",
            language="bash",
        )
        st.subheader(
            "👩‍🚀 If you want to access the pipeline and manipulate the locally, you can do:"
        )
        st.code(
            """
git clone https://github.com/huggingface/data-is-better-together
cd domain-specific-datasets
"""
        )
    else:
        st.error("Please fill all the required fields.")
195
-
196
- ###############################################################
197
- # SPACE
198
- ###############################################################
199
# Optional in-app execution, offered only when CODELESS_DISTILABEL is enabled.
if CODELESS_DISTILABEL:
    if st.button("🔥 Run pipeline right here, right now!"):
        if all(
            [
                argilla_api_key,
                argilla_url,
                base_url,
                hub_username,
                project_name,
                hub_token,
                argilla_dataset_name,
            ]
        ):
            with st.spinner("Pulling seed data from the Hub..."):
                try:
                    seed_data = pull_seed_data_from_repo(
                        repo_id=f"{hub_username}/{project_name}",
                        hub_token=hub_token,
                    )
                except Exception:
                    st.error(
                        "Seed data not found. Please make sure you pushed the data seed in Step 2."
                    )
                    # Without seed data the code below would fail with a
                    # NameError on ``seed_data``; end this script run here.
                    st.stop()

            domain = seed_data["domain"]
            perspectives = seed_data["perspectives"]
            topics = seed_data["topics"]
            examples = seed_data["examples"]
            domain_expert_prompt = seed_data["domain_expert_prompt"]

            serialize_pipeline(
                argilla_api_key=argilla_api_key,
                argilla_dataset_name=argilla_dataset_name,
                argilla_api_url=argilla_url,
                topics=topics,
                perspectives=perspectives,
                pipeline_config_path=PIPELINE_PATH,
                # Fall back to the default prompt when the seed has none.
                domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
                hub_token=hub_token,
                endpoint_base_url=base_url,
                examples=examples,
            )

            with st.spinner("Starting the pipeline..."):
                logs = run_pipeline(
                    pipeline_config_path=PIPELINE_PATH,
                    argilla_api_key=argilla_api_key,
                    argilla_api_url=argilla_url,
                    hub_token=hub_token,
                    argilla_dataset_name=argilla_dataset_name,
                )

            st.success("Pipeline started successfully! 🚀")

            # ``logs`` is iterated here, so output lines appear as they are produced.
            with st.expander(label="View Logs", expanded=True):
                for out in logs:
                    st.text(out)
        else:
            st.error("Please fill all the required fields.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/4_🔍 Review Generated Data.py DELETED
@@ -1,48 +0,0 @@
1
- import streamlit as st
2
-
3
- from defaults import PROJECT_NAME, ARGILLA_URL, DATASET_REPO_ID
4
- from utils import project_sidebar
5
- from hub import push_argilla_dataset_to_hub
6
-
7
- st.set_page_config(
8
- page_title="Domain Data Grower",
9
- page_icon="🧑‍🌾",
10
- )
11
-
12
- project_sidebar()
13
-
14
- ################################################################################
15
- # HEADER
16
- ################################################################################
17
-
18
- st.header("🧑‍🌾 Domain Data Grower")
19
- st.divider()
20
-
21
- st.write(
22
- """Once you have reviewed the synthetic data in Argilla, you can publish the
23
- generated dataset to the Hub."""
24
- )
25
-
26
-
27
- ################################################################################
28
- # Configuration
29
- ################################################################################
30
-
31
- st.divider()
32
- st.write("🔬 Argilla API details to push the generated dataset")
33
- argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
34
- argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
35
- argilla_dataset_name = st.text_input("Argilla Dataset Name", PROJECT_NAME)
36
- dataset_repo_id = st.text_input("Dataset Repo ID", DATASET_REPO_ID)
37
- st.divider()
38
-
39
- if st.button("🚀 Publish the generated dataset"):
40
- with st.spinner("Publishing the generated dataset..."):
41
- push_argilla_dataset_to_hub(
42
- name=argilla_dataset_name,
43
- repo_id=dataset_repo_id,
44
- url=argilla_url,
45
- api_key=argilla_api_key,
46
- workspace="admin",
47
- )
48
- st.success("The generated dataset has been published to the Hub.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pipeline.py DELETED
@@ -1,208 +0,0 @@
1
- import subprocess
2
- import sys
3
- import time
4
- from typing import List
5
-
6
- from distilabel.steps.generators.data import LoadDataFromDicts
7
- from distilabel.steps.expand import ExpandColumns
8
- from distilabel.steps.keep import KeepColumns
9
- from distilabel.steps.tasks.self_instruct import SelfInstruct
10
- from distilabel.steps.tasks.evol_instruct.base import EvolInstruct
11
- from distilabel.llms.huggingface import InferenceEndpointsLLM
12
- from distilabel.pipeline import Pipeline
13
- from distilabel.steps import TextGenerationToArgilla
14
- from dotenv import load_dotenv
15
-
16
- from domain import (
17
- DomainExpert,
18
- CleanNumberedList,
19
- create_topics,
20
- create_examples_template,
21
- APPLICATION_DESCRIPTION,
22
- )
23
-
24
- load_dotenv()
25
-
26
-
27
def define_pipeline(
    argilla_api_key: str,
    argilla_api_url: str,
    argilla_dataset_name: str,
    topics: List[str],
    perspectives: List[str],
    domain_expert_prompt: str,
    examples: List[dict],
    hub_token: str,
    endpoint_base_url: str,
):
    """Build the question/answer generation pipeline for one domain.

    Every topic/perspective combination becomes one seed row. The steps are
    chained linearly: load data -> self-instruct -> expand instructions ->
    clean numbered list -> evol-instruct -> expand evolutions -> domain expert
    -> keep columns -> Argilla.

    Returns:
        The assembled ``Pipeline`` object.
    """

    seed_rows = [{"input": term} for term in create_topics(topics, perspectives)]
    template = create_examples_template(examples)

    with Pipeline("farming") as pipeline:
        load_data = LoadDataFromDicts(
            name="load_data",
            data=seed_rows,
            batch_size=64,
        )
        # One LLM client is shared by all generation steps.
        llm = InferenceEndpointsLLM(
            base_url=endpoint_base_url,
            api_key=hub_token,
        )
        self_instruct = SelfInstruct(
            name="self-instruct",
            application_description=APPLICATION_DESCRIPTION,
            num_instructions=5,
            input_batch_size=8,
            llm=llm,
        )

        evol_instruction_complexity = EvolInstruct(
            name="evol_instruction_complexity",
            llm=llm,
            num_evolutions=2,
            store_evolutions=True,
            input_batch_size=8,
            include_original_instruction=True,
            input_mappings={"instruction": "question"},
        )

        expand_instructions = ExpandColumns(
            name="expand_columns", columns={"instructions": "question"}
        )
        cleaner = CleanNumberedList(name="clean_numbered_list")
        expand_evolutions = ExpandColumns(
            name="expand_columns_evolved",
            columns={"evolved_instructions": "evolved_questions"},
        )

        domain_expert = DomainExpert(
            name="domain_expert",
            llm=llm,
            input_batch_size=8,
            input_mappings={"instruction": "evolved_questions"},
            output_mappings={"generation": "domain_expert_answer"},
        )

        # Override the class-level prompt and template with the domain's own.
        domain_expert._system_prompt = domain_expert_prompt
        domain_expert._template = template

        keep_columns = KeepColumns(
            name="keep_columns",
            columns=["model_name", "evolved_questions", "domain_expert_answer"],
        )

        to_argilla = TextGenerationToArgilla(
            name="text_generation_to_argilla",
            dataset_name=argilla_dataset_name,
            dataset_workspace="admin",
            api_url=argilla_api_url,
            api_key=argilla_api_key,
            input_mappings={
                "instruction": "evolved_questions",
                "generation": "domain_expert_answer",
            },
        )

        # Connect each step to the next one, in execution order.
        stages = [
            load_data,
            self_instruct,
            expand_instructions,
            cleaner,
            evol_instruction_complexity,
            expand_evolutions,
            domain_expert,
            keep_columns,
            to_argilla,
        ]
        for upstream, downstream in zip(stages, stages[1:]):
            upstream.connect(downstream)
    return pipeline
116
-
117
-
118
def serialize_pipeline(
    argilla_api_key: str,
    argilla_api_url: str,
    argilla_dataset_name: str,
    topics: List[str],
    perspectives: List[str],
    domain_expert_prompt: str,
    hub_token: str,
    endpoint_base_url: str,
    pipeline_config_path: str = "pipeline.yaml",
    examples: "List[dict] | None" = None,
):
    """Build the pipeline and save it as a YAML file.

    Args:
        argilla_api_key: API key passed to the Argilla step.
        argilla_api_url: URL passed to the Argilla step.
        argilla_dataset_name: Dataset name passed to the Argilla step.
        topics: Domain topics combined with ``perspectives`` into seed rows.
        perspectives: Domain perspectives.
        domain_expert_prompt: System prompt for the domain-expert step.
        hub_token: Token used as the LLM API key.
        endpoint_base_url: Base URL of the inference endpoint.
        pipeline_config_path: Destination file; overwritten if it exists.
        examples: Example question/answer dictionaries; none when omitted.
    """
    # A fresh list per call; a mutable default would be shared across calls.
    if examples is None:
        examples = []

    pipeline = define_pipeline(
        argilla_api_key=argilla_api_key,
        argilla_api_url=argilla_api_url,
        argilla_dataset_name=argilla_dataset_name,
        topics=topics,
        perspectives=perspectives,
        domain_expert_prompt=domain_expert_prompt,
        hub_token=hub_token,
        endpoint_base_url=endpoint_base_url,
        examples=examples,
    )
    pipeline.save(path=pipeline_config_path, overwrite=True, format="yaml")
143
-
144
-
145
def create_pipelines_run_command(
    hub_token: str,
    argilla_api_key: str,
    argilla_api_url: str,
    pipeline_config_path: str = "pipeline.yaml",
    argilla_dataset_name: str = "domain_specific_datasets",
):
    """Return the argv list that runs the serialized pipeline with distilabel.

    The command invokes ``<current interpreter> -m distilabel pipeline run`` on
    ``pipeline_config_path``, passes the Argilla settings and the LLM API keys
    as ``--param`` overrides, and ends with ``--ignore-cache``.
    """
    # (step parameter, value) pairs, in the order they appear on the command line.
    overrides = (
        ("text_generation_to_argilla.dataset_name", argilla_dataset_name),
        ("text_generation_to_argilla.api_key", argilla_api_key),
        ("text_generation_to_argilla.api_url", argilla_api_url),
        ("self-instruct.llm.api_key", hub_token),
        ("evol_instruction_complexity.llm.api_key", hub_token),
        ("domain_expert.llm.api_key", hub_token),
    )

    command = [
        sys.executable,
        "-m",
        "distilabel",
        "pipeline",
        "run",
        "--config",
        pipeline_config_path,
    ]
    for parameter, value in overrides:
        command.extend(("--param", f"{parameter}={value}"))
    command.append("--ignore-cache")
    return command
176
-
177
-
178
def run_pipeline(
    hub_token: str,
    argilla_api_key: str,
    argilla_api_url: str,
    pipeline_config_path: str = "pipeline.yaml",
    argilla_dataset_name: str = "domain_specific_datasets",
):
    """Run the pipeline in a subprocess and yield its output line by line.

    Args:
        hub_token: Token passed as the LLM API key and exported as ``HF_TOKEN``.
        argilla_api_key: API key for the Argilla step.
        argilla_api_url: URL for the Argilla step.
        pipeline_config_path: Serialized pipeline to run.
        argilla_dataset_name: Dataset name for the Argilla step.

    Yields:
        Decoded output lines of the child process (stdout and stderr merged).
    """
    import os

    command_to_run = create_pipelines_run_command(
        hub_token=hub_token,
        pipeline_config_path=pipeline_config_path,
        argilla_dataset_name=argilla_dataset_name,
        argilla_api_key=argilla_api_key,
        argilla_api_url=argilla_api_url,
    )

    # Run the script file
    process = subprocess.Popen(
        args=command_to_run,
        stdout=subprocess.PIPE,
        # Merge stderr into stdout: a separate stderr pipe that is never read
        # can fill up and block the child process.
        stderr=subprocess.STDOUT,
        # Extend the current environment instead of replacing it, so the child
        # keeps PATH, HOME and the other variables it needs.
        env={**os.environ, "HF_TOKEN": hub_token},
    )

    # Reading blocks until a line is available, so no polling sleep is needed;
    # the loop ends when the child closes its output.
    for line in process.stdout:
        yield line.decode("utf-8", errors="replace")

    process.stdout.close()
    # Reap the child so it does not linger as a zombie process.
    process.wait()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,8 +1,4 @@
1
  datasets
2
  python_dotenv
3
- sentence_transformers
4
  streamlit
5
  huggingface_hub
6
- mistralai
7
- argilla
8
- git+https://github.com/argilla-io/distilabel.git
 
1
  datasets
2
  python_dotenv
 
3
  streamlit
4
  huggingface_hub