Ben Burtenshaw
committed on
Commit
•
5776d7d
1
Parent(s):
7a96aaa
update for local pipeline
Browse files
parent_app/__pycache__/defaults.cpython-311.pyc
DELETED
Binary file (561 Bytes)
|
|
parent_app/__pycache__/hub.cpython-311.pyc
DELETED
Binary file (1.7 kB)
|
|
parent_app/app.py
DELETED
@@ -1,118 +0,0 @@
|
|
1 |
-
import time
|
2 |
-
|
3 |
-
from hub import (
|
4 |
-
setup_dataset_on_hub,
|
5 |
-
duplicate_space_on_hub,
|
6 |
-
add_project_config_to_space_repo,
|
7 |
-
)
|
8 |
-
|
9 |
-
import streamlit as st
|
10 |
-
|
11 |
-
|
12 |
-
# Constants
|
13 |
-
# Written here to avoid defaults.py
|
14 |
-
DEFAULT_DOMAIN = "farming"
|
15 |
-
|
16 |
-
st.set_page_config(
|
17 |
-
"Domain Data Grower", page_icon="π§βπΎ", initial_sidebar_state="collapsed"
|
18 |
-
)
|
19 |
-
|
20 |
-
st.header("π§βπΎ Domain Data Grower")
|
21 |
-
st.divider()
|
22 |
-
|
23 |
-
st.sidebar.link_button(
|
24 |
-
"π€ Get your Hub Token", "https://huggingface.co/settings/tokens"
|
25 |
-
)
|
26 |
-
|
27 |
-
################################################################################
|
28 |
-
# APP MARKDOWN
|
29 |
-
################################################################################
|
30 |
-
|
31 |
-
st.header("π± Create a domain specific dataset")
|
32 |
-
|
33 |
-
st.markdown(
|
34 |
-
"""This space will set up your domain specific dataset project. It will
|
35 |
-
create the resources that you need to build a dataset. Those resources include:
|
36 |
-
|
37 |
-
- A dataset repository on the Hub
|
38 |
-
- Another space to define expert domain and run generation pipelines
|
39 |
-
|
40 |
-
For a complete overview of the project. Check out the README
|
41 |
-
"""
|
42 |
-
)
|
43 |
-
|
44 |
-
st.page_link(
|
45 |
-
"pages/π§βπΎ Domain Data Grower.py",
|
46 |
-
label="Domain Data Grower",
|
47 |
-
icon="π§βπΎ",
|
48 |
-
)
|
49 |
-
|
50 |
-
################################################################################
|
51 |
-
# CONFIGURATION
|
52 |
-
################################################################################
|
53 |
-
|
54 |
-
st.subheader("πΎ Project Configuration")
|
55 |
-
|
56 |
-
project_name = st.text_input("Project Name", DEFAULT_DOMAIN)
|
57 |
-
hub_username = st.text_input("Hub Username", "argilla")
|
58 |
-
hub_token = st.text_input("Hub Token", type="password")
|
59 |
-
private_selector = st.checkbox("Private Space", value=False)
|
60 |
-
|
61 |
-
if st.button("π€ Setup Project Resources"):
    # All three values are interpolated into repo ids and passed to the Hub
    # API; stop early with a readable message instead of an opaque Hub error.
    if not (project_name and hub_username and hub_token):
        st.error("Project Name, Hub Username and Hub Token are all required.")
        st.stop()

    repo_id = f"{hub_username}/{project_name}"
    space_name = f"{project_name}_config_space"
    argilla_name = f"{project_name}_argilla_space"

    # Fully-qualified Space ids. The links shown below and the project config
    # upload both address "{hub_username}/<space>", so the Spaces must be
    # created under that same namespace rather than whatever account the
    # token happens to belong to.
    space_repo_id = f"{hub_username}/{space_name}"
    argilla_repo_id = f"{hub_username}/{argilla_name}"

    # 1. Dataset repository seeded with seed_data.json.
    setup_dataset_on_hub(
        repo_id=repo_id,
        hub_token=hub_token,
    )

    st.success(
        f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name}). Hold on the repo_id: {repo_id}, we will need it in the next steps."
    )

    # 2. Configuration Space, duplicated from the project template.
    duplicate_space_on_hub(
        source_repo="argilla/domain-specific-datasets-template",
        target_repo=space_repo_id,
        hub_token=hub_token,
        private=private_selector,
    )

    st.success(
        f"Configuration Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{space_name})."
    )

    # 3. Argilla Space, duplicated from the Argilla template.
    duplicate_space_on_hub(
        source_repo="argilla/argilla-template-space",
        target_repo=argilla_repo_id,
        hub_token=hub_token,
        private=private_selector,
    )

    st.success(
        f"Argilla Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{argilla_name})."
    )

    seconds = 5

    # Pause before uploading the config into the freshly duplicated Space.
    with st.spinner(f"Adding project configuration to spaces in {seconds} seconds"):
        time.sleep(seconds)
        add_project_config_to_space_repo(
            dataset_repo_id=repo_id,
            hub_token=hub_token,
            project_name=project_name,
            argilla_space_repo_id=argilla_repo_id,
            project_space_repo_id=space_repo_id,
        )

    st.subheader("π’ Next Steps")

    st.write("Go to you project specific space!")

    st.link_button(
        "π§βπΎ Open Configuration Space",
        f"https://huggingface.co/spaces/{hub_username}/{space_name}",
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
parent_app/hub.py
DELETED
@@ -1,76 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
from tempfile import mktemp
|
3 |
-
|
4 |
-
|
5 |
-
from huggingface_hub import duplicate_space, HfApi
|
6 |
-
|
7 |
-
|
8 |
-
hf_api = HfApi()
|
9 |
-
|
10 |
-
|
11 |
-
def setup_dataset_on_hub(repo_id, hub_token):
    """Create a dataset repository on the Hub and push the seed file to it.

    Args:
        repo_id: Id of the dataset repository to create.
        hub_token: Hub token forwarded to both API calls.
    """
    seed_file = "seed_data.json"
    # Both calls address the same repository with the same credentials.
    target = {"repo_id": repo_id, "repo_type": "dataset", "token": hub_token}

    # First create the (empty) dataset repository ...
    hf_api.create_repo(**target)

    # ... then upload the local seed file under the same name.
    hf_api.upload_file(
        path_or_fileobj=seed_file,
        path_in_repo=seed_file,
        **target,
    )
|
27 |
-
|
28 |
-
|
29 |
-
def duplicate_space_on_hub(source_repo, target_repo, hub_token, private=False):
    """Duplicate the Space ``source_repo`` on the Hub as ``target_repo``.

    Args:
        source_repo: Id of the Space to copy from.
        target_repo: Id of the Space to create.
        hub_token: Hub token forwarded to ``duplicate_space``.
        private: Forwarded as the ``private`` flag of the new Space.
    """
    # ``exist_ok`` is always enabled so the call can be repeated for a target
    # that was already created by an earlier run.
    options = {"token": hub_token, "private": private, "exist_ok": True}
    duplicate_space(from_id=source_repo, to_id=target_repo, **options)
|
37 |
-
|
38 |
-
|
39 |
-
def add_project_config_to_space_repo(
    dataset_repo_id,
    hub_token,
    project_name,
    argilla_space_repo_id,
    project_space_repo_id,
):
    """Upload a ``project_config.json`` describing the project to its Space.

    The config records the project name and the ids of the dataset
    repository, the Argilla Space and the project Space, and is stored at the
    root of ``project_space_repo_id``.

    Args:
        dataset_repo_id: Id of the project's dataset repository.
        hub_token: Hub token used for the upload.
        project_name: Name of the project.
        argilla_space_repo_id: Id of the project's Argilla Space.
        project_space_repo_id: Id of the Space that receives the config file.
    """
    project_config = {
        "project_name": project_name,
        "argilla_space_repo_id": argilla_space_repo_id,
        "project_space_repo_id": project_space_repo_id,
        "dataset_repo_id": dataset_repo_id,
    }

    # Serialize in memory and upload the bytes directly. Writing a fixed
    # "project_config.json" into the working directory left a stale file
    # behind after every run and let concurrent sessions overwrite each
    # other's config between the write and the upload.
    payload = json.dumps(project_config).encode("utf-8")

    hf_api.upload_file(
        path_or_fileobj=payload,
        path_in_repo="project_config.json",
        token=hub_token,
        repo_id=project_space_repo_id,
        repo_type="space",
    )
|
66 |
-
|
67 |
-
|
68 |
-
def pull_seed_data_from_repo(repo_id, hub_token, filename="seed_data.json"):
    """Download the seed data file from a dataset repository and parse it.

    Args:
        repo_id: Id of the dataset repository to read from.
        hub_token: Hub token used for the download.
        filename: Path of the JSON file inside the repository. Defaults to
            ``"seed_data.json"``, the name ``setup_dataset_on_hub`` uploads.

    Returns:
        The decoded JSON content of the file.
    """
    # ``filename`` is the path *inside the repo*, and the call returns the
    # local path of the downloaded copy. The previous code passed a fresh
    # temp path as ``filename`` and then opened that never-written path.
    local_path = hf_api.hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        repo_type="dataset",
        token=hub_token,
    )
    # Context manager so the file handle is closed deterministically.
    with open(local_path, encoding="utf-8") as seed_file:
        return json.load(seed_file)
|
75 |
-
|
76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
parent_app/pages/π§βπΎ Domain Data Grower.py
DELETED
@@ -1,15 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
import requests
|
3 |
-
|
4 |
-
|
5 |
-
readme_location = "https://raw.githubusercontent.com/huggingface/data-is-better-together/51f29e67165d8277d9f9d1e4be60869f4b705a08/domain-specific-datasets/README.md"
|
6 |
-
|
7 |
-
|
8 |
-
def open_markdown_file(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address of the markdown document to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled request cannot hang the page indefinitely.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status;
            otherwise the error page itself would be rendered as the README.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text
|
11 |
-
|
12 |
-
|
13 |
-
readme = open_markdown_file(readme_location)
|
14 |
-
|
15 |
-
st.markdown(readme)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
parent_app/project_config.json
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
{"project_name": "farming", "argilla_space_repo_id": "ignacioct/farming_argilla_space", "project_space_repo_id": "ignacioct/farming_config_space", "dataset_repo_id": "ignacioct/farming"}
|
|
|
|
parent_app/seed_data.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"domain": "farming",
|
3 |
-
"perspectives": [
|
4 |
-
"Family Farming"
|
5 |
-
],
|
6 |
-
"topics": [
|
7 |
-
"animal welfare"
|
8 |
-
],
|
9 |
-
"examples": [
|
10 |
-
{
|
11 |
-
"question": "Compare and contrast the environmental footprint of industrial and small-scale farming.",
|
12 |
-
"answer": "Regenerative agriculture practices aim to restore soil health through methods that increase soil organic matter, enhance microbial activity, and improve soil structure. These practices include no-till farming, cover cropping, diverse crop rotations, and integrated livestock management. According to LaCanne and Lundgren (2018), soil health improves due to increased biodiversity and organic matter, enhancing its water retention and nutrient efficiency. Moreover, Jones (2012) in \"Soil carbon & organic farming\" reports that these practices significantly elevate biodiversity, both above and below the soil surface, promoting resilient ecosystems and agroecological balances."
|
13 |
-
}
|
14 |
-
],
|
15 |
-
"domain_expert_prompt": "You will be asked about family farming and agribusiness related topics, from different perspectives.\n Your answer should be logical and supported by facts, don't fabricate arguments. \n Try to gather a diverse point of view taking into account current theories in agronomy, biology, economics, anthropology and ecology."
|
16 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|