Ben Burtenshaw committed on
Commit
fc828f1
1 Parent(s): dfd3683

run pipeline locally

Browse files
__pycache__/defaults.cpython-311.pyc ADDED
Binary file (2.32 kB). View file
 
__pycache__/domain.cpython-311.pyc ADDED
Binary file (4.53 kB). View file
 
__pycache__/hub.cpython-311.pyc ADDED
Binary file (5.78 kB). View file
 
__pycache__/infer.cpython-311.pyc ADDED
Binary file (837 Bytes). View file
 
__pycache__/pipeline.cpython-311.pyc ADDED
Binary file (8.2 kB). View file
 
__pycache__/utils.cpython-311.pyc ADDED
Binary file (4.93 kB). View file
 
pages/2_👩🏼‍🔬 Describe Domain.py CHANGED
@@ -11,7 +11,7 @@ from defaults import (
11
  PIPELINE_PATH,
12
  DATASET_REPO_ID,
13
  )
14
- from utils import project_sidebar
15
 
16
 
17
  st.set_page_config(
@@ -212,6 +212,8 @@ domain_data = {
212
  "topics": topics,
213
  "examples": examples,
214
  "domain_expert_prompt": domain_expert_prompt,
 
 
215
  }
216
 
217
  with open(SEED_DATA_PATH, "w") as f:
 
11
  PIPELINE_PATH,
12
  DATASET_REPO_ID,
13
  )
14
+ from utils import project_sidebar, create_seed_terms, create_application_instruction
15
 
16
 
17
  st.set_page_config(
 
212
  "topics": topics,
213
  "examples": examples,
214
  "domain_expert_prompt": domain_expert_prompt,
215
+ "application_instruction": create_application_instruction(domain, examples),
216
+ "seed_terms": create_seed_terms(topics, perspectives),
217
  }
218
 
219
  with open(SEED_DATA_PATH, "w") as f:
pages/3_🌱 Generate Dataset.py CHANGED
@@ -1,7 +1,7 @@
1
  import streamlit as st
2
 
3
  from defaults import ARGILLA_URL
4
- from hub import push_pipeline_params, push_pipeline_to_hub
5
  from utils import project_sidebar
6
 
7
  st.set_page_config(
@@ -20,16 +20,27 @@ st.divider()
20
  st.subheader("Step 3. Run the pipeline to generate synthetic data")
21
  st.write("Define the distilabel pipeline for generating the dataset.")
22
 
23
- ###############################################################
24
- # CONFIGURATION
25
- ###############################################################
26
-
27
  hub_username = st.session_state.get("hub_username")
28
  project_name = st.session_state.get("project_name")
29
  hub_token = st.session_state.get("hub_token")
30
 
 
 
 
 
31
  st.divider()
32
 
 
 
 
 
 
 
 
 
 
 
 
33
  st.markdown("#### 🤖 Inference configuration")
34
 
35
  st.write(
@@ -43,13 +54,19 @@ with st.expander("🤗 Recommended Models"):
43
  "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
44
  )
45
  st.write("🔋Projects with sufficient resources could take advantage of LLama3 70b")
46
- st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B")
 
 
47
 
48
  st.write("🪫Projects with less resources could take advantage of LLama 3 8b")
49
- st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B")
 
 
50
 
51
- st.write("🍃Projects with even less resources could take advantage of Phi-2")
52
- st.code("https://api-inference.huggingface.co/models/microsoft/phi-2")
 
 
53
 
54
  st.write("Note Hugggingface Pro gives access to more compute resources")
55
  st.link_button(
@@ -58,10 +75,27 @@ with st.expander("🤗 Recommended Models"):
58
  )
59
 
60
 
61
- base_url = st.text_input(
62
- label="Base URL for the Inference API",
63
- value="https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta",
 
 
 
 
 
 
 
 
 
 
 
64
  )
 
 
 
 
 
 
65
  st.divider()
66
  st.markdown("#### 🔬 Argilla API details to push the generated dataset")
67
  argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
@@ -84,30 +118,38 @@ if all(
84
  [
85
  argilla_api_key,
86
  argilla_url,
87
- base_url,
88
- hub_token,
 
 
 
 
 
89
  project_name,
90
  hub_token,
91
  argilla_dataset_name,
92
  ]
93
- ):
94
- push_pipeline_params(
95
- pipeline_params={
96
- "argilla_api_key": argilla_api_key,
97
- "argilla_api_url": argilla_url,
98
- "argilla_dataset_name": argilla_dataset_name,
99
- "endpoint_base_url": base_url,
100
- },
101
- hub_username=hub_username,
102
- hub_token=hub_token,
103
- project_name=project_name,
104
- )
105
-
106
- push_pipeline_to_hub(
107
- pipeline_path="pipeline.py",
108
- hub_username=hub_username,
109
- hub_token=hub_token,
110
- project_name=project_name,
 
 
 
111
  )
112
 
113
  st.markdown(
@@ -118,7 +160,7 @@ if all(
118
  f"""
119
 
120
  # Install the distilabel library
121
- pip install git+https://github.com/argilla-io/distilabel.git
122
  """
123
  )
124
 
@@ -126,8 +168,8 @@ if all(
126
 
127
  st.code(
128
  f"""
129
- git clone https://huggingface.co/datasets/{hub_username}/{project_name}
130
- cd {project_name}
131
  pip install -r requirements.txt
132
  """
133
  )
@@ -135,9 +177,9 @@ if all(
135
  st.markdown("Finally, you can run the pipeline using the following command:")
136
 
137
  st.code(
138
- """
139
  huggingface-cli login
140
- python pipeline.py""",
141
  language="bash",
142
  )
143
  st.markdown(
 
1
  import streamlit as st
2
 
3
  from defaults import ARGILLA_URL
4
+ from hub import push_pipeline_params
5
  from utils import project_sidebar
6
 
7
  st.set_page_config(
 
20
  st.subheader("Step 3. Run the pipeline to generate synthetic data")
21
  st.write("Define the distilabel pipeline for generating the dataset.")
22
 
 
 
 
 
23
  hub_username = st.session_state.get("hub_username")
24
  project_name = st.session_state.get("project_name")
25
  hub_token = st.session_state.get("hub_token")
26
 
27
+ ###############################################################
28
+ # CONFIGURATION
29
+ ###############################################################
30
+
31
  st.divider()
32
 
33
+ st.markdown("## 🧰 Pipeline Configuration")
34
+
35
+ st.write(
36
+ "Now we need to define the configuration for the pipeline that will generate the synthetic data."
37
+ )
38
+ st.write(
39
+ "⚠️ Model and parameter choice significantly affect the quality of the generated data. \
40
+ We reccomend that you start with a few samples and review the data. The scale up from there."
41
+ )
42
+
43
+
44
  st.markdown("#### 🤖 Inference configuration")
45
 
46
  st.write(
 
54
  "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
55
  )
56
  st.write("🔋Projects with sufficient resources could take advantage of LLama3 70b")
57
+ st.code(
58
+ "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
59
+ )
60
 
61
  st.write("🪫Projects with less resources could take advantage of LLama 3 8b")
62
+ st.code(
63
+ "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
64
+ )
65
 
66
+ st.write("🍃Projects with even less resources could use Phi-3-mini-4k-instruct")
67
+ st.code(
68
+ "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct"
69
+ )
70
 
71
  st.write("Note Hugggingface Pro gives access to more compute resources")
72
  st.link_button(
 
75
  )
76
 
77
 
78
+ self_instruct_base_url = st.text_input(
79
+ label="Model base URL for instruction generation",
80
+ value="https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
81
+ )
82
+ domain_expert_base_url = st.text_input(
83
+ label="Model base URL for domain expert response",
84
+ value="https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
85
+ )
86
+
87
+ st.divider()
88
+ st.markdown("#### 🧮 Parameters configuration")
89
+
90
+ self_intruct_num_generations = st.slider(
91
+ "Number of generations for self-instruction", 1, 10, 2
92
  )
93
+ domain_expert_num_generations = st.slider(
94
+ "Number of generations for domain expert", 1, 10, 2
95
+ )
96
+ self_instruct_temperature = st.slider("Temperature for self-instruction", 0.1, 1.0, 0.9)
97
+ domain_expert_temperature = st.slider("Temperature for domain expert", 0.1, 1.0, 0.9)
98
+
99
  st.divider()
100
  st.markdown("#### 🔬 Argilla API details to push the generated dataset")
101
  argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
 
118
  [
119
  argilla_api_key,
120
  argilla_url,
121
+ self_instruct_base_url,
122
+ domain_expert_base_url,
123
+ self_intruct_num_generations,
124
+ domain_expert_num_generations,
125
+ self_instruct_temperature,
126
+ domain_expert_temperature,
127
+ hub_username,
128
  project_name,
129
  hub_token,
130
  argilla_dataset_name,
131
  ]
132
+ ) and st.button("💾 Save Pipeline Config"):
133
+ with st.spinner("Pushing pipeline to the Hub..."):
134
+ push_pipeline_params(
135
+ pipeline_params={
136
+ "argilla_api_key": argilla_api_key,
137
+ "argilla_api_url": argilla_url,
138
+ "argilla_dataset_name": argilla_dataset_name,
139
+ "self_instruct_base_url": self_instruct_base_url,
140
+ "domain_expert_base_url": domain_expert_base_url,
141
+ "self_instruct_temperature": self_instruct_temperature,
142
+ "domain_expert_temperature": domain_expert_temperature,
143
+ "self_intruct_num_generations": self_intruct_num_generations,
144
+ "domain_expert_num_generations": domain_expert_num_generations,
145
+ },
146
+ hub_username=hub_username,
147
+ hub_token=hub_token,
148
+ project_name=project_name,
149
+ )
150
+
151
+ st.success(
152
+ f"Pipeline configuration pushed to the dataset repo {hub_username}/{project_name} on the Hub."
153
  )
154
 
155
  st.markdown(
 
160
  f"""
161
 
162
  # Install the distilabel library
163
+ pip install distilabel
164
  """
165
  )
166
 
 
168
 
169
  st.code(
170
  f"""
171
+ git clone https://github.com/huggingface/data-is-better-together
172
+ cd data-is-better-together/domain-specific-datasets/pipelines
173
  pip install -r requirements.txt
174
  """
175
  )
 
177
  st.markdown("Finally, you can run the pipeline using the following command:")
178
 
179
  st.code(
180
+ f"""
181
  huggingface-cli login
182
+ python domain_expert_pipeline.py {hub_username}/{project_name}""",
183
  language="bash",
184
  )
185
  st.markdown(
pipeline.yaml CHANGED
@@ -1,5 +1,5 @@
1
  distilabel:
2
- version: 1.0.0
3
  pipeline:
4
  name: farming
5
  description: null
@@ -10,31 +10,7 @@ pipeline:
10
  output_mappings: {}
11
  batch_size: 64
12
  data:
13
- - input: animal welfare from a Family Farming perspective
14
- - input: animal welfare from a Agribusiness perspective
15
- - input: animal welfare from a Permaculture perspective
16
- - input: animal welfare from a Agroforestery perspective
17
- - input: animal welfare from a Conventional Farming perspective
18
- - input: economic growth from a Family Farming perspective
19
- - input: economic growth from a Agribusiness perspective
20
- - input: economic growth from a Permaculture perspective
21
- - input: economic growth from a Agroforestery perspective
22
- - input: economic growth from a Conventional Farming perspective
23
- - input: land from a Family Farming perspective
24
- - input: land from a Agribusiness perspective
25
- - input: land from a Permaculture perspective
26
- - input: land from a Agroforestery perspective
27
- - input: land from a Conventional Farming perspective
28
- - input: resources from a Family Farming perspective
29
- - input: resources from a Agribusiness perspective
30
- - input: resources from a Permaculture perspective
31
- - input: resources from a Agroforestery perspective
32
- - input: resources from a Conventional Farming perspective
33
- - input: efficiency from a Family Farming perspective
34
- - input: efficiency from a Agribusiness perspective
35
- - input: efficiency from a Permaculture perspective
36
- - input: efficiency from a Agroforestery perspective
37
- - input: efficiency from a Conventional Farming perspective
38
  runtime_parameters_info:
39
  - name: batch_size
40
  optional: true
@@ -54,7 +30,7 @@ pipeline:
54
  model_id: null
55
  endpoint_name: null
56
  endpoint_namespace: null
57
- base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud
58
  tokenizer_id: null
59
  model_display_name: null
60
  use_openai_client: false
@@ -75,14 +51,14 @@ pipeline:
75
  Blend interrogative (e.g., "What is the significance of x?") and imperative
76
  (e.g., "Detail the process of x.") styles.'
77
  application_description: 'You are an AI assistant than generates queries around
78
- the domain of farming.
79
 
80
  Your should not expect basic but profound questions from your users.
81
 
82
  The queries should reflect a diversity of vision and economic positions and
83
  political positions.
84
 
85
- The queries may know about different methods of farming.
86
 
87
  The queries can be positioned politically, economically, socially, or practically.
88
 
@@ -163,7 +139,7 @@ pipeline:
163
  model_id: null
164
  endpoint_name: null
165
  endpoint_namespace: null
166
- base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud
167
  tokenizer_id: null
168
  model_display_name: null
169
  use_openai_client: false
@@ -390,7 +366,7 @@ pipeline:
390
  model_id: null
391
  endpoint_name: null
392
  endpoint_namespace: null
393
- base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud
394
  tokenizer_id: null
395
  model_display_name: null
396
  use_openai_client: false
@@ -489,9 +465,9 @@ pipeline:
489
  generation: domain_expert_answer
490
  output_mappings: {}
491
  input_batch_size: 50
492
- dataset_name: farming
493
  dataset_workspace: admin
494
- api_url: https://argilla-farming.hf.space
495
  runtime_parameters_info:
496
  - name: input_batch_size
497
  optional: true
 
1
  distilabel:
2
+ version: 1.0.1
3
  pipeline:
4
  name: farming
5
  description: null
 
10
  output_mappings: {}
11
  batch_size: 64
12
  data:
13
+ - input: punctures from a Retro bikes perspective
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  runtime_parameters_info:
15
  - name: batch_size
16
  optional: true
 
30
  model_id: null
31
  endpoint_name: null
32
  endpoint_namespace: null
33
+ base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta
34
  tokenizer_id: null
35
  model_display_name: null
36
  use_openai_client: false
 
51
  Blend interrogative (e.g., "What is the significance of x?") and imperative
52
  (e.g., "Detail the process of x.") styles.'
53
  application_description: 'You are an AI assistant than generates queries around
54
+ the domain of Bicycle maintenance.
55
 
56
  Your should not expect basic but profound questions from your users.
57
 
58
  The queries should reflect a diversity of vision and economic positions and
59
  political positions.
60
 
61
+ The queries may know about different methods of Bicycle maintenance.
62
 
63
  The queries can be positioned politically, economically, socially, or practically.
64
 
 
139
  model_id: null
140
  endpoint_name: null
141
  endpoint_namespace: null
142
+ base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta
143
  tokenizer_id: null
144
  model_display_name: null
145
  use_openai_client: false
 
366
  model_id: null
367
  endpoint_name: null
368
  endpoint_namespace: null
369
+ base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta
370
  tokenizer_id: null
371
  model_display_name: null
372
  use_openai_client: false
 
465
  generation: domain_expert_answer
466
  output_mappings: {}
467
  input_batch_size: 50
468
+ dataset_name: bicycle_maintenance
469
  dataset_workspace: admin
470
+ api_url: https://burtenshaw-bicycle-maintenance-argilla-space.hf.space
471
  runtime_parameters_info:
472
  - name: input_batch_size
473
  optional: true
pipeline_params.json ADDED
File without changes
utils.py CHANGED
@@ -1,13 +1,13 @@
 
 
1
  import streamlit as st
2
 
3
  from defaults import (
4
- ARGILLA_SPACE_REPO_ID,
5
  PROJECT_NAME,
6
  ARGILLA_URL,
7
  DIBT_PARENT_APP_URL,
8
  DATASET_URL,
9
  DATASET_REPO_ID,
10
- ARGILLA_SPACE_REPO_ID,
11
  )
12
 
13
 
@@ -48,8 +48,35 @@ def project_sidebar():
48
  st.sidebar.divider()
49
 
50
  st.sidebar.link_button("🧑‍🌾 New Project", DIBT_PARENT_APP_URL)
51
-
52
  if st.session_state["hub_token"] is None:
53
  st.error("Please provide a Hub token to generate answers")
54
  st.stop()
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from textwrap import dedent
2
+
3
  import streamlit as st
4
 
5
  from defaults import (
 
6
  PROJECT_NAME,
7
  ARGILLA_URL,
8
  DIBT_PARENT_APP_URL,
9
  DATASET_URL,
10
  DATASET_REPO_ID,
 
11
  )
12
 
13
 
 
48
  st.sidebar.divider()
49
 
50
  st.sidebar.link_button("🧑‍🌾 New Project", DIBT_PARENT_APP_URL)
51
+
52
  if st.session_state["hub_token"] is None:
53
  st.error("Please provide a Hub token to generate answers")
54
  st.stop()
55
 
56
+
57
+ def create_seed_terms(topics: list[str], perspectives: list[str]) -> list[str]:
58
+ """Create seed terms for self intruct to start from."""
59
+
60
+ return [
61
+ f"{topic} from a {perspective} perspective"
62
+ for topic in topics
63
+ for perspective in perspectives
64
+ ]
65
+
66
+
67
+ def create_application_instruction(domain: str, examples: list[dict[str, str]]) -> str:
68
+ """Create the instruction for Self-Instruct task."""
69
+ system_prompt = dedent(
70
+ f"""You are an AI assistant than generates queries around the domain of {domain}.
71
+ Your should not expect basic but profound questions from your users.
72
+ The queries should reflect a diversxamity of vision and economic positions and political positions.
73
+ The queries may know about different methods of {domain}.
74
+ The queries can be positioned politically, economically, socially, or practically.
75
+ Also take into account the impact of diverse causes on diverse domains."""
76
+ )
77
+ for example in examples:
78
+ question = example["question"]
79
+ answer = example["answer"]
80
+ system_prompt += f"""\n- Question: {question}\n- Answer: {answer}\n"""
81
+
82
+ return system_prompt