File size: 6,466 Bytes
8773ff3
 
32014a1
fc828f1
8773ff3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32014a1
8773ff3
32014a1
 
 
8773ff3
fc828f1
 
 
 
798f8ba
 
fc828f1
 
 
 
 
 
 
 
 
 
 
798f8ba
8773ff3
 
 
 
 
798f8ba
 
 
 
 
 
 
fc828f1
 
 
798f8ba
 
fc828f1
 
 
798f8ba
fc828f1
 
 
 
798f8ba
 
 
 
 
 
 
8773ff3
fc828f1
 
 
 
 
 
 
 
 
 
 
 
 
 
798f8ba
fc828f1
 
 
 
 
 
798f8ba
 
8773ff3
 
 
 
 
 
 
 
 
798f8ba
8773ff3
32014a1
 
8773ff3
 
 
32014a1
 
 
 
fc828f1
 
 
 
 
 
 
32014a1
 
 
 
fc828f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dfd3683
 
32014a1
 
 
8773ff3
32014a1
 
 
 
fc828f1
32014a1
 
8773ff3
32014a1
 
 
 
fc828f1
 
32014a1
dfd3683
 
 
 
 
 
fc828f1
dfd3683
fc828f1
32014a1
 
 
 
 
8773ff3
32014a1
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import streamlit as st

from defaults import ARGILLA_URL
from hub import push_pipeline_params
from utils import project_sidebar

# Page-level setup: configure the browser tab, then render the project sidebar
# provided by the local `utils` module.
st.set_page_config(page_title="Domain Data Grower", page_icon="🧑‍🌾")
project_sidebar()

################################################################################
# HEADER
################################################################################

st.header("🧑‍🌾 Domain Data Grower")
st.divider()
st.subheader("Step 3. Run the pipeline to generate synthetic data")
st.write("Define the distilabel pipeline for generating the dataset.")

# Hub identity values read from the Streamlit session state; `.get` yields
# None for any key that has not been set (presumably by an earlier page of
# the app -- verify against the other pages).
hub_username, project_name, hub_token = (
    st.session_state.get(state_key)
    for state_key in ("hub_username", "project_name", "hub_token")
)

###############################################################
# CONFIGURATION
###############################################################

st.divider()

st.markdown("## 🧰 Pipeline Configuration")

st.write(
    "Now we need to define the configuration for the pipeline that will generate the synthetic data."
)
# Adjacent string literals are used instead of a backslash continuation inside
# the literal, so the source indentation no longer ends up in the message.
st.write(
    "⚠️ Model and parameter choice significantly affect the quality of the generated data. "
    "We recommend that you start with a few samples and review the data. Then scale up from there."
)


st.markdown("#### 🤖 Inference configuration")

st.write(
    "Add the url of the Huggingface inference API or endpoint that your pipeline should use. You can find compatible models here:"
)

# (description, inference API URL) pairs shown inside the expander, listed from
# the most to the least resource-demanding recommendation.
_RECOMMENDED_MODELS = (
    (
        "🔋Projects with sufficient resources could take advantage of LLama3 70b",
        # Previously this entry showed the 8B URL, contradicting its own label.
        "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B-Instruct",
    ),
    (
        "🪫Projects with less resources could take advantage of LLama 3 8b",
        "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct",
    ),
    (
        "🍃Projects with even less resources could use Phi-3-mini-4k-instruct",
        "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
    ),
)

with st.expander("🤗 Recommended Models"):
    st.write("All inference endpoint compatible models can be found via the link below")
    st.link_button(
        "🤗 Inference compatible models on the hub",
        "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
    )

    # One caption plus a copyable URL per recommended model.
    for _model_note, _model_url in _RECOMMENDED_MODELS:
        st.write(_model_note)
        st.code(_model_url)

    st.write("Note Huggingface Pro gives access to more compute resources")
    st.link_button(
        "🤗 Huggingface Pro",
        "https://huggingface.co/pricing",
    )


# Both URL inputs are pre-filled with the same Phi-3-mini inference API URL.
_default_model_base_url = (
    "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct"
)

# Endpoint used for the instruction-generation step.
self_instruct_base_url = st.text_input(
    "Model base URL for instruction generation",
    _default_model_base_url,
)
# Endpoint used for the domain-expert response step.
domain_expert_base_url = st.text_input(
    "Model base URL for domain expert response",
    _default_model_base_url,
)

st.divider()
st.markdown("#### 🧮 Parameters configuration")

# Generation counts: integer sliders from 1 to 10, starting at 2.
# NOTE: the misspelt name `self_intruct_num_generations` is kept because the
# rest of the script refers to it under that name.
self_intruct_num_generations = st.slider(
    "Number of generations for self-instruction",
    min_value=1,
    max_value=10,
    value=2,
)
domain_expert_num_generations = st.slider(
    "Number of generations for domain expert",
    min_value=1,
    max_value=10,
    value=2,
)

# Sampling temperatures: float sliders from 0.1 to 1.0, starting at 0.9.
self_instruct_temperature = st.slider(
    "Temperature for self-instruction",
    min_value=0.1,
    max_value=1.0,
    value=0.9,
)
domain_expert_temperature = st.slider(
    "Temperature for domain expert",
    min_value=0.1,
    max_value=1.0,
    value=0.9,
)

st.divider()
st.markdown("#### 🔬 Argilla API details to push the generated dataset")

# Argilla connection settings; the URL default comes from the local `defaults`
# module and the dataset name defaults to the current project name.
argilla_url = st.text_input(label="Argilla API URL", value=ARGILLA_URL)
argilla_api_key = st.text_input(label="Argilla API Key", value="owner.apikey")
argilla_dataset_name = st.text_input(
    label="Argilla Dataset Name",
    value=project_name,
)
st.divider()

###############################################################
# LOCAL
###############################################################

st.markdown("## Run the pipeline")

st.markdown(
    "Once you've defined the pipeline configuration above, you can run the pipeline from your local machine."
)

# Every value here must be truthy before the configuration can be saved.
_required_fields = (
    argilla_api_key,
    argilla_url,
    self_instruct_base_url,
    domain_expert_base_url,
    self_intruct_num_generations,
    domain_expert_num_generations,
    self_instruct_temperature,
    domain_expert_temperature,
    hub_username,
    project_name,
    hub_token,
    argilla_dataset_name,
)

# The "missing fields" hint is shown only when something is actually missing.
# Previously it also appeared whenever the save button had simply not been
# clicked yet, even with every field filled in. The button is still rendered
# only once all fields are present.
if not all(_required_fields):
    st.info("Please fill all the required fields.")
elif st.button("💾 Save Pipeline Config"):
    with st.spinner("Pushing pipeline to the Hub..."):
        push_pipeline_params(
            pipeline_params={
                "argilla_api_key": argilla_api_key,
                "argilla_api_url": argilla_url,
                "argilla_dataset_name": argilla_dataset_name,
                "self_instruct_base_url": self_instruct_base_url,
                "domain_expert_base_url": domain_expert_base_url,
                "self_instruct_temperature": self_instruct_temperature,
                "domain_expert_temperature": domain_expert_temperature,
                # The misspelt key is kept on purpose: whatever reads these
                # params is outside this file and may look it up by this name.
                "self_intruct_num_generations": self_intruct_num_generations,
                "domain_expert_num_generations": domain_expert_num_generations,
            },
            hub_username=hub_username,
            hub_token=hub_token,
            project_name=project_name,
        )

    st.success(
        f"Pipeline configuration pushed to the dataset repo {hub_username}/{project_name} on the Hub."
    )

    st.markdown(
        "To run the pipeline locally, you need to have the `distilabel` library installed. You can install it using the following command:"
    )

    # Shell snippets are plain, unindented strings rendered with bash
    # highlighting, so the displayed commands carry no stray blank line or
    # source indentation.
    st.code(
        "# Install the distilabel library\n"
        "pip install distilabel",
        language="bash",
    )

    st.markdown("Next, you'll need to clone your dataset repo and run the pipeline:")

    st.code(
        "git clone https://github.com/huggingface/data-is-better-together\n"
        "cd data-is-better-together/domain-specific-datasets/pipelines\n"
        "pip install -r requirements.txt",
        language="bash",
    )

    st.markdown("Finally, you can run the pipeline using the following command:")

    st.code(
        "huggingface-cli login\n"
        f"python domain_expert_pipeline.py {hub_username}/{project_name}",
        language="bash",
    )
    st.markdown(
        "👩‍🚀 If you want to customise the pipeline take a look in `pipeline.py` and the [distilabel docs](https://distilabel.argilla.io/)"
    )