Ben Burtenshaw committed on
Commit
32014a1
•
1 Parent(s): 7055b44

lose codeless version

Browse files
pages/2_👩🏼‍🔬 Describe Domain.py CHANGED
@@ -2,14 +2,9 @@ import json
2
 
3
  import streamlit as st
4
 
5
- from hub import push_dataset_to_hub
6
  from infer import query
7
  from defaults import (
8
- DEFAULT_DOMAIN,
9
- DEFAULT_PERSPECTIVES,
10
- DEFAULT_TOPICS,
11
- DEFAULT_EXAMPLES,
12
- DEFAULT_SYSTEM_PROMPT,
13
  N_PERSPECTIVES,
14
  N_TOPICS,
15
  SEED_DATA_PATH,
@@ -18,12 +13,14 @@ from defaults import (
18
  )
19
  from utils import project_sidebar
20
 
 
21
  st.set_page_config(
22
  page_title="Domain Data Grower",
23
  page_icon="🧑‍🌾",
24
  )
25
  project_sidebar()
26
 
 
27
  ################################################################################
28
  # HEADER
29
  ################################################################################
@@ -37,6 +34,23 @@ st.write(
37
  "Define the project details, including the project name, domain, and API credentials"
38
  )
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  ################################################################################
41
  # Domain Expert Section
42
  ################################################################################
@@ -212,22 +226,6 @@ with tab_raw_seed:
212
 
213
  st.divider()
214
 
215
- hub_username = DATASET_REPO_ID.split("/")[0]
216
- project_name = DATASET_REPO_ID.split("/")[1]
217
- st.write("Define the dataset repo details on the Hub")
218
- st.session_state["project_name"] = st.text_input("Project Name", project_name)
219
- st.session_state["hub_username"] = st.text_input("Hub Username", hub_username)
220
- st.session_state["hub_token"] = st.text_input("Hub Token", type="password", value=None)
221
-
222
- if all(
223
- (
224
- st.session_state.get("project_name"),
225
- st.session_state.get("hub_username"),
226
- st.session_state.get("hub_token"),
227
- )
228
- ):
229
- st.success(f"Using the dataset repo {hub_username}/{project_name} on the Hub")
230
-
231
 
232
  if st.button("🤗 Push Dataset Seed") and all(
233
  (
 
2
 
3
  import streamlit as st
4
 
5
+ from hub import push_dataset_to_hub, pull_seed_data_from_repo
6
  from infer import query
7
  from defaults import (
 
 
 
 
 
8
  N_PERSPECTIVES,
9
  N_TOPICS,
10
  SEED_DATA_PATH,
 
13
  )
14
  from utils import project_sidebar
15
 
16
+
17
  st.set_page_config(
18
  page_title="Domain Data Grower",
19
  page_icon="🧑‍🌾",
20
  )
21
  project_sidebar()
22
 
23
+
24
  ################################################################################
25
  # HEADER
26
  ################################################################################
 
34
  "Define the project details, including the project name, domain, and API credentials"
35
  )
36
 
37
+
38
+ ################################################################################
39
+ # LOAD EXISTING DOMAIN DATA
40
+ ################################################################################
41
+
42
+ DATASET_REPO_ID = (
43
+ f"{st.session_state['hub_username']}/{st.session_state['project_name']}"
44
+ )
45
+ SEED_DATA = pull_seed_data_from_repo(
46
+ DATASET_REPO_ID, hub_token=st.session_state["hub_token"]
47
+ )
48
+ DEFAULT_DOMAIN = SEED_DATA.get("domain", "")
49
+ DEFAULT_PERSPECTIVES = SEED_DATA.get("perspectives", [""])
50
+ DEFAULT_TOPICS = SEED_DATA.get("topics", [""])
51
+ DEFAULT_EXAMPLES = SEED_DATA.get("examples", [{"question": "", "answer": ""}])
52
+ DEFAULT_SYSTEM_PROMPT = SEED_DATA.get("domain_expert_prompt", "")
53
+
54
  ################################################################################
55
  # Domain Expert Section
56
  ################################################################################
 
226
 
227
  st.divider()
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
  if st.button("🤗 Push Dataset Seed") and all(
231
  (
pages/3_🌱 Generate Dataset.py CHANGED
@@ -1,18 +1,8 @@
1
  import streamlit as st
2
 
3
- from hub import pull_seed_data_from_repo, push_pipeline_to_hub
4
- from defaults import (
5
- DEFAULT_SYSTEM_PROMPT,
6
- PIPELINE_PATH,
7
- PROJECT_NAME,
8
- ARGILLA_URL,
9
- HUB_USERNAME,
10
- CODELESS_DISTILABEL,
11
- )
12
  from utils import project_sidebar
13
 
14
- from pipeline import serialize_pipeline, run_pipeline, create_pipelines_run_command
15
-
16
  st.set_page_config(
17
  page_title="Domain Data Grower",
18
  page_icon="🧑‍🌾",
@@ -27,20 +17,15 @@ project_sidebar()
27
  st.header("🧑‍🌾 Domain Data Grower")
28
  st.divider()
29
  st.subheader("Step 3. Run the pipeline to generate synthetic data")
30
- st.write("Define the project repos and models that the pipeline will use.")
31
 
32
- st.divider()
33
  ###############################################################
34
  # CONFIGURATION
35
  ###############################################################
36
 
37
- st.markdown("## Pipeline Configuration")
38
-
39
- st.markdown("#### 🤗 Hub details to pull the seed data")
40
- hub_username = st.text_input("Hub Username", HUB_USERNAME)
41
- project_name = st.text_input("Project Name", PROJECT_NAME)
42
- repo_id = f"{hub_username}/{project_name}"
43
- hub_token = st.text_input("Hub Token", type="password")
44
 
45
  st.divider()
46
 
@@ -89,169 +74,56 @@ st.divider()
89
 
90
  st.markdown("## Run the pipeline")
91
 
92
- st.write(
93
- "Once you've defined the pipeline configuration, you can run the pipeline from your local machine."
94
  )
95
 
96
- if CODELESS_DISTILABEL:
97
- st.write(
98
- """We recommend running the pipeline locally if you're planning on generating a large dataset. \
99
- But running the pipeline on this space is a handy way to get started quickly. Your synthetic
100
- samples will be pushed to Argilla and available for review.
101
- """
102
- )
103
- st.write(
104
- """If you're planning on running the pipeline on the space, be aware that it \
105
- will take some time to complete and you will need to maintain a \
106
- connection to the space."""
107
- )
108
-
109
-
110
- if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
111
- if all(
112
- [
113
- argilla_api_key,
114
- argilla_url,
115
- base_url,
116
- hub_username,
117
- project_name,
118
- hub_token,
119
- argilla_dataset_name,
120
- ]
121
- ):
122
- with st.spinner("Pulling seed data from the Hub..."):
123
- try:
124
- seed_data = pull_seed_data_from_repo(
125
- repo_id=f"{hub_username}/{project_name}",
126
- hub_token=hub_token,
127
- )
128
- except Exception:
129
- st.error(
130
- "Seed data not found. Please make sure you pushed the data seed in Step 2."
131
- )
132
 
133
- domain = seed_data["domain"]
134
- perspectives = seed_data["perspectives"]
135
- topics = seed_data["topics"]
136
- examples = seed_data["examples"]
137
- domain_expert_prompt = seed_data["domain_expert_prompt"]
138
-
139
- with st.spinner("Serializing the pipeline configuration..."):
140
- serialize_pipeline(
141
- argilla_api_key=argilla_api_key,
142
- argilla_dataset_name=argilla_dataset_name,
143
- argilla_api_url=argilla_url,
144
- topics=topics,
145
- perspectives=perspectives,
146
- pipeline_config_path=PIPELINE_PATH,
147
- domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
148
- hub_token=hub_token,
149
- endpoint_base_url=base_url,
150
- examples=examples,
151
- )
152
- push_pipeline_to_hub(
153
- pipeline_path=PIPELINE_PATH,
154
- hub_token=hub_token,
155
- hub_username=hub_username,
156
- project_name=project_name,
157
- )
158
 
159
- st.success(f"Pipeline configuration saved to {hub_username}/{project_name}")
 
 
 
 
 
 
160
 
161
- st.info(
162
- "To run the pipeline locally, you need to have the `distilabel` library installed. You can install it using the following command:"
163
- )
164
- st.text(
165
- "Execute the following command to generate a synthetic dataset from the seed data:"
166
- )
167
- command_to_run = create_pipelines_run_command(
168
- hub_token=hub_token,
169
- pipeline_config_path=PIPELINE_PATH,
170
- argilla_dataset_name=argilla_dataset_name,
171
- argilla_api_key=argilla_api_key,
172
- argilla_api_url=argilla_url,
173
- )
174
- st.code(
175
- f"""
176
- pip install git+https://github.com/argilla-io/distilabel.git
177
- git clone https://huggingface.co/datasets/{hub_username}/{project_name}
178
- cd {project_name}
179
- pip install -r requirements.txt
180
- {' '.join(["python"] + command_to_run[1:])}
181
  """,
182
- language="bash",
183
- )
184
- st.subheader(
185
- "👩‍🚀 If you want to access the pipeline and manipulate the locally, you can do:"
186
- )
187
- st.code(
188
- """
189
- git clone https://github.com/huggingface/data-is-better-together
190
- cd domain-specific-datasets
191
- """
192
- )
193
- else:
194
- st.error("Please fill all the required fields.")
195
-
196
- ###############################################################
197
- # SPACE
198
- ###############################################################
199
- if CODELESS_DISTILABEL:
200
- if st.button("🔥 Run pipeline right here, right now!"):
201
- if all(
202
- [
203
- argilla_api_key,
204
- argilla_url,
205
- base_url,
206
- hub_username,
207
- project_name,
208
- hub_token,
209
- argilla_dataset_name,
210
- ]
211
- ):
212
- with st.spinner("Pulling seed data from the Hub..."):
213
- try:
214
- seed_data = pull_seed_data_from_repo(
215
- repo_id=f"{hub_username}/{project_name}",
216
- hub_token=hub_token,
217
- )
218
- except Exception as e:
219
- st.error(
220
- "Seed data not found. Please make sure you pushed the data seed in Step 2."
221
- )
222
-
223
- domain = seed_data["domain"]
224
- perspectives = seed_data["perspectives"]
225
- topics = seed_data["topics"]
226
- examples = seed_data["examples"]
227
- domain_expert_prompt = seed_data["domain_expert_prompt"]
228
-
229
- serialize_pipeline(
230
- argilla_api_key=argilla_api_key,
231
- argilla_dataset_name=argilla_dataset_name,
232
- argilla_api_url=argilla_url,
233
- topics=topics,
234
- perspectives=perspectives,
235
- pipeline_config_path=PIPELINE_PATH,
236
- domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
237
- hub_token=hub_token,
238
- endpoint_base_url=base_url,
239
- examples=examples,
240
- )
241
-
242
- with st.spinner("Starting the pipeline..."):
243
- logs = run_pipeline(
244
- pipeline_config_path=PIPELINE_PATH,
245
- argilla_api_key=argilla_api_key,
246
- argilla_api_url=argilla_url,
247
- hub_token=hub_token,
248
- argilla_dataset_name=argilla_dataset_name,
249
- )
250
-
251
- st.success(f"Pipeline started successfully! 🚀")
252
 
253
- with st.expander(label="View Logs", expanded=True):
254
- for out in logs:
255
- st.text(out)
256
- else:
257
- st.error("Please fill all the required fields.")
 
1
  import streamlit as st
2
 
3
+ from defaults import ARGILLA_URL
 
 
 
 
 
 
 
 
4
  from utils import project_sidebar
5
 
 
 
6
  st.set_page_config(
7
  page_title="Domain Data Grower",
8
  page_icon="🧑‍🌾",
 
17
  st.header("🧑‍🌾 Domain Data Grower")
18
  st.divider()
19
  st.subheader("Step 3. Run the pipeline to generate synthetic data")
20
+ st.write("Define the distilabel pipeline for generating the dataset.")
21
 
 
22
  ###############################################################
23
  # CONFIGURATION
24
  ###############################################################
25
 
26
+ hub_username = st.session_state.get("hub_username")
27
+ project_name = st.session_state.get("project_name")
28
+ hub_token = st.session_state.get("hub_token")
 
 
 
 
29
 
30
  st.divider()
31
 
 
74
 
75
  st.markdown("## Run the pipeline")
76
 
77
+ st.markdown(
78
+ "Once you've defined the pipeline configuration above, you can run the pipeline from your local machine."
79
  )
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
+ if all(
83
+ [
84
+ argilla_api_key,
85
+ argilla_url,
86
+ base_url,
87
+ hub_token,
88
+ project_name,
89
+ hub_token,
90
+ argilla_dataset_name,
91
+ ]
92
+ ):
93
+ st.markdown(
94
+ "To run the pipeline locally, you need to have the `distilabel` library installed. You can install it using the following command:"
95
+ )
 
 
 
 
 
 
 
 
 
 
 
96
 
97
+ st.code(
98
+ f"""
99
+
100
+ # Install the distilabel library
101
+ pip install git+https://github.com/argilla-io/distilabel.git
102
+ """
103
+ )
104
 
105
+ st.markdown("Next, you'll need to clone your dataset repo and run the pipeline:")
106
+
107
+ st.code(
108
+ f"""
109
+ # Clone the project and install the requirements
110
+ git clone https://huggingface.co/datasets/{hub_username}/{project_name}
111
+ cd {project_name}
112
+ pip install -r requirements.txt
113
+
114
+ # Run the pipeline
115
+ python pipeline.py
116
+ --argilla-api-key {argilla_api_key}
117
+ --argilla-api-url {argilla_url}
118
+ --argilla-dataset-name {argilla_dataset_name}
119
+ --endpoint-base-url {base_url}
120
+ --hub-token {st.session_state["hub_token"]}
 
 
 
 
121
  """,
122
+ language="bash",
123
+ )
124
+ st.markdown(
125
+ "👩‍🚀 If you want to customise the pipeline take a look in `pipeline.py` and teh [distilabel docs](https://distilabel.argilla.io/)"
126
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
+ else:
129
+ st.info("Please fill all the required fields.")
 
 
 
utils.py CHANGED
@@ -26,8 +26,30 @@ def project_sidebar():
26
  )
27
  st.sidebar.link_button(f"📚 Dataset Repo", DATASET_URL)
28
  st.sidebar.link_button(f"🤖 Argilla Space", ARGILLA_URL)
29
- st.sidebar.divider()
30
- st.sidebar.link_button("🧑‍🌾 New Project", DIBT_PARENT_APP_URL)
 
 
 
 
 
31
  st.sidebar.link_button(
32
  "🤗 Get your Hub Token", "https://huggingface.co/settings/tokens"
33
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  )
27
  st.sidebar.link_button(f"📚 Dataset Repo", DATASET_URL)
28
  st.sidebar.link_button(f"🤖 Argilla Space", ARGILLA_URL)
29
+ hub_username = DATASET_REPO_ID.split("/")[0]
30
+ project_name = DATASET_REPO_ID.split("/")[1]
31
+ st.session_state["project_name"] = project_name
32
+ st.session_state["hub_username"] = hub_username
33
+ st.session_state["hub_token"] = st.sidebar.text_input(
34
+ "Hub Token", type="password", value=None
35
+ )
36
  st.sidebar.link_button(
37
  "🤗 Get your Hub Token", "https://huggingface.co/settings/tokens"
38
  )
39
+ if all(
40
+ (
41
+ st.session_state.get("project_name"),
42
+ st.session_state.get("hub_username"),
43
+ st.session_state.get("hub_token"),
44
+ )
45
+ ):
46
+ st.success(f"Using the dataset repo {hub_username}/{project_name} on the Hub")
47
+
48
+ st.sidebar.divider()
49
+
50
+ st.sidebar.link_button("🧑‍🌾 New Project", DIBT_PARENT_APP_URL)
51
+
52
+ if st.session_state["hub_token"] is None:
53
+ st.error("Please provide a Hub token to generate answers")
54
+ st.stop()
55
+