yourusername committed on
Commit
1dfaf00
•
1 Parent(s): 50f02db

:sparkles: add ability to upload datasets

Browse files
Files changed (2) hide show
  1. .gitignore +140 -0
  2. app.py +137 -14
.gitignore ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98
+ __pypackages__/
99
+
100
+ # Celery stuff
101
+ celerybeat-schedule
102
+ celerybeat.pid
103
+
104
+ # SageMath parsed files
105
+ *.sage.py
106
+
107
+ # Environments
108
+ .env
109
+ .venv
110
+ env/
111
+ venv/
112
+ ENV/
113
+ env.bak/
114
+ venv.bak/
115
+
116
+ # Spyder project settings
117
+ .spyderproject
118
+ .spyproject
119
+
120
+ # Rope project settings
121
+ .ropeproject
122
+
123
+ # mkdocs documentation
124
+ /site
125
+
126
+ # mypy
127
+ .mypy_cache/
128
+ .dmypy.json
129
+ dmypy.json
130
+
131
+ # Pyre type checker
132
+ .pyre/
133
+
134
+ # pytype static type analyzer
135
+ .pytype/
136
+
137
+ # Cython debug symbols
138
+ cython_debug/
139
+
140
+ images/
app.py CHANGED
@@ -1,5 +1,18 @@
 
 
 
 
 
 
 
 
1
  import streamlit as st
 
2
  from huggingpics.data import get_image_urls_by_term
 
 
 
 
3
 
4
 
5
  def show_images_of_term(search_term, num_cols=5, num_rows=3):
@@ -15,31 +28,141 @@ def show_images_of_term(search_term, num_cols=5, num_rows=3):
15
  cols[col_id].image(urls[row_id * num_cols + col_id], use_column_width=True)
16
 
17
 
18
- def explore():
19
- with st.sidebar:
20
- term_1 = st.sidebar.text_input('Search Term 1', value='shiba inu')
21
- term_2 = st.sidebar.text_input('Search Term 2', value='husky')
22
- term_3 = st.sidebar.text_input('Search Term 3', value='')
23
- term_4 = st.sidebar.text_input('Search Term 4', value='')
24
- term_5 = st.sidebar.text_input('Search Term 5', value='')
25
 
26
- terms = [t for t in [term_1, term_2, term_3, term_4, term_5] if t]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  for term in terms:
29
  show_images_of_term(term)
30
 
31
-
32
- def create_dataset():
33
- st.markdown("# Coming soon...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
 
36
  def main():
37
 
38
  with st.sidebar:
39
- mode = st.sidebar.selectbox("Mode", ["Explore", "Create Dataset"])
40
- st.sidebar.markdown("---")
 
 
 
 
 
 
 
 
41
 
42
- _ = explore() if mode == "Explore" else create_dataset()
43
 
44
 
45
  if __name__ == '__main__':
1
+ import logging
2
+ import shutil
3
+ import time
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ from pathlib import Path
6
+ from tempfile import TemporaryDirectory
7
+
8
+ import requests
9
  import streamlit as st
10
+ from huggingface_hub import Repository, create_repo, login, whoami
11
  from huggingpics.data import get_image_urls_by_term
12
+ from requests.exceptions import HTTPError
13
+ from tqdm.auto import tqdm
14
+
15
+ logger = logging.getLogger(__name__)
16
 
17
 
18
  def show_images_of_term(search_term, num_cols=5, num_rows=3):
28
  cols[col_id].image(urls[row_id * num_cols + col_id], use_column_width=True)
29
 
30
 
31
def download_image(img_url, filename, timeout=10):
    """Download ``img_url`` and write the raw response body to ``filename``.

    Args:
        img_url: URL of the image to fetch.
        filename: Destination path; opened in binary write mode, so an
            existing file at that path is overwritten.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would pin a worker
            thread on a single stalled host.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection errors and timeouts.
        OSError: If ``filename`` cannot be opened or written.
    """
    response = requests.get(img_url, timeout=timeout)
    # Fail before touching the filesystem so error pages are never saved as images.
    response.raise_for_status()
    with open(filename, 'wb') as img_file:
        img_file.write(response.content)
 
37
 
38
+
39
def make_huggingpics_imagefolder(data_dir, search_terms, count=150, overwrite=False, transform=None, resume=False):
    """Build an image-folder layout (``data_dir/<search term>/<i>.jpg``) from image search results.

    Args:
        data_dir: Root directory of the image folder; created if missing.
        search_terms: Sized collection of search terms; each becomes one
            sub-directory of ``data_dir``.
        count: Number of image URLs requested per search term.
        overwrite: If ``data_dir`` already exists, delete it and start over.
        transform: Accepted for interface compatibility; not used here.
        resume: If ``data_dir`` already exists and ``overwrite`` is false,
            continue and fill in terms whose directory is still empty
            instead of returning immediately.

    Returns:
        None. Images are written to disk and a Streamlit progress bar is
        shown while the terms are processed.
    """
    data_dir = Path(data_dir)

    if data_dir.exists():
        if overwrite:
            print(f"Deleting existing HuggingPics data directory to create new one: {data_dir}")
            shutil.rmtree(data_dir)
        else:
            print(f"Using existing HuggingPics data directory: '{data_dir}'")
            if not resume:
                return

    pbar = st.progress(0)
    num_terms = len(search_terms)

    for search_term_idx, search_term in enumerate(search_terms):
        search_term_dir = data_dir / search_term
        search_term_dir.mkdir(exist_ok=True, parents=True)

        if any(search_term_dir.iterdir()):
            print(f"Skipping search term '{search_term}' because it already has images in it.")
        else:
            urls = get_image_urls_by_term(search_term, count)
            logger.info("Saving images of %s to %s...", search_term, search_term_dir)

            with ThreadPoolExecutor() as executor:
                pending = [
                    (url, executor.submit(download_image, url, search_term_dir / f'{i}.jpg'))
                    for i, url in enumerate(urls)
                ]
                # Collect every result: a future that is never inspected hides
                # its exception, so failed downloads used to vanish silently.
                for url, future in tqdm(pending):
                    try:
                        future.result()
                    except (requests.RequestException, OSError) as err:
                        # Best effort: one bad URL must not abort the whole dataset.
                        logger.warning("Failed to download %s: %s", url, err)

        # Advance for skipped terms too, so the bar always reaches 100%.
        pbar.progress((search_term_idx + 1) / num_terms)

    pbar.empty()
73
+
74
+
75
def create_dataset(terms):
    """Preview images for ``terms`` and, on request, push them to the Hub as a dataset.

    Renders the image grid for every term, then shows a "Push to Hub" form in
    the sidebar. When the form is submitted and the user is logged in, a
    dataset repository is created (or reused), cloned into a temporary
    directory, filled with downloaded images and committed.

    Args:
        terms: Search terms to preview and to use as sub-directory names of
            the uploaded ``images`` folder.

    Returns:
        None.
    """
    msg_placeholder = st.empty()

    for term in terms:
        show_images_of_term(term)

    with st.sidebar:
        with st.form('Push to Hub'):
            dataset_name = st.text_input('Dataset Name', value='huggingpics-data')
            do_push = st.form_submit_button("Push to 🤗 Hub")

    if not do_push:
        return

    msg_placeholder.empty()
    if not st.session_state.get('is_logged_in'):
        msg_placeholder.error("You must login to push to the hub.")
        return

    token = st.session_state.token

    with st.sidebar:
        create_repo(dataset_name, token, exist_ok=True, repo_type='dataset')
        hf_username = whoami(token)['name']
        repo_namespace = f"{hf_username}/{dataset_name}"
        # Dataset repos live under the ``datasets/`` prefix; the bare
        # ``huggingface.co/<user>/<name>`` form addresses a *model* repo, so
        # both the clone and the link shown to the user would miss the
        # repository that was just created.
        repo_url = f'https://huggingface.co/datasets/{repo_namespace}'

        with TemporaryDirectory() as tmp_dir:
            repo = Repository(
                tmp_dir,
                clone_from=repo_url,
                use_auth_token=token,
                git_user=hf_username,
                git_email=f'{hf_username}@users.noreply.huggingface.co',
            )

            with st.spinner(f"Uploading files to [{repo_namespace}]({repo_url})..."):
                with repo.commit("Uploaded from HuggingPics Explorer"):
                    # Absolute path inside the clone, so the images land in the
                    # repository regardless of the current working directory.
                    make_huggingpics_imagefolder(Path(tmp_dir) / 'images', terms, count=150)

        st.success(f"View your dataset here 👉 [{repo_namespace}]({repo_url})")
116
+
117
+
118
def huggingface_auth_form():
    """Render the Hugging Face login form, or the logged-in banner with a logout button.

    Login state is kept in ``st.session_state`` under ``is_logged_in`` and
    ``token``. After a successful login or a logout the placeholder is cleared
    and the form is rendered again so the UI reflects the new state within the
    same script run.

    Returns:
        None.
    """
    placeholder = st.empty()

    if st.session_state.get('is_logged_in', False):
        with placeholder.container():
            # Use the session's own token; calling ``whoami()`` without one
            # relies on a token cached on disk, which ``login`` here never writes.
            st.markdown(f"✅ Logged in as {whoami(st.session_state.token)['name']}")
            do_logout = st.button("Logout")

        if do_logout:
            st.session_state.token = None
            st.session_state.is_logged_in = False
            placeholder.empty()
            huggingface_auth_form()
        return

    with placeholder.container():
        username = st.text_input('Username', value=st.session_state.get('username', ''))
        password = st.text_input('Password', value="", type="password")
        submit = st.button('Login')

    if not submit:
        return

    try:
        # Keep the try body to the one call whose HTTPError means "bad credentials".
        token = login(username, password)
    except HTTPError:
        st.session_state.token = None
        st.session_state.is_logged_in = False
        st.error("Invalid username or password.")
        # NOTE(review): presumably keeps the error visible for a moment before
        # the rest of the script continues — confirm this delay is still wanted.
        time.sleep(2)
    else:
        st.session_state.token = token
        st.session_state.is_logged_in = True
        placeholder.empty()
        huggingface_auth_form()
149
 
150
 
151
  def main():
152
 
153
  with st.sidebar:
154
+ term_1 = st.sidebar.text_input('Search Term 1', value='shiba inu')
155
+ term_2 = st.sidebar.text_input('Search Term 2', value='husky')
156
+ term_3 = st.sidebar.text_input('Search Term 3', value='')
157
+ term_4 = st.sidebar.text_input('Search Term 4', value='')
158
+ term_5 = st.sidebar.text_input('Search Term 5', value='')
159
+ terms = [t for t in [term_1, term_2, term_3, term_4, term_5] if t]
160
+
161
+ st.markdown('---')
162
+ huggingface_auth_form()
163
+ st.markdown('---')
164
 
165
+ _ = create_dataset(terms)
166
 
167
 
168
  if __name__ == '__main__':