vancauwe committed on
Commit
951051a
·
1 Parent(s): a464cd8

feat: hf dataset connection

Browse files
.github/workflows/sync_dataset_hf.yml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
name: Sync Hugging Face Dataset

on:
  schedule:
    - cron: '0 * * * *' # Runs every hour

jobs:
  sync_dataset:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        # v2 runs on a deprecated Node runtime; v4 is the supported release.
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Sync Datasets
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # BUG FIX: the script is added at app/sync_dataset_hf.py in this commit,
        # so invoking it from the repo root needs the app/ prefix.
        run: python app/sync_dataset_hf.py
app/display.py CHANGED
@@ -12,7 +12,6 @@ HEADERS = ["Identifier", "Location", "Wounded", "Dead"]
12
 
13
 
14
  def save_display_individual(gallery, df, error_box, data):
15
- #print(data)
16
  individual, error_box, data = validate_save_individual(data, error_box)
17
  if individual:
18
  all_animals = get_json_all_individuals()
 
12
 
13
 
14
  def save_display_individual(gallery, df, error_box, data):
 
15
  individual, error_box, data = validate_save_individual(data, error_box)
16
  if individual:
17
  all_animals = get_json_all_individuals()
app/main.py CHANGED
@@ -20,6 +20,12 @@ from styling.theme import css
20
 
21
  from geolocalisation.js_geolocation import js_geocode, display_location
22
 
 
 
 
 
 
 
23
  # with gr.Blocks(theme=theme, css=css) as demo:
24
  with gr.Blocks(theme='shivi/calm_seafoam') as demo:
25
  individual = gr.State({})
@@ -396,6 +402,7 @@ with gr.Blocks(theme='shivi/calm_seafoam') as demo:
396
  show_modal.click(lambda: Modal(visible=True), None, modal)
397
  show_modal.click(create_json_one_individual)
398
  show_modal.click(create_tmp)
 
399
  #submit_button.click(save_and_rest_df, inputs=[df], outputs=[df])
400
 
401
 
 
20
 
21
  from geolocalisation.js_geolocation import js_geocode, display_location
22
 
23
+ from datasets import disable_caching
24
+ disable_caching()
25
+
26
+ dataset_id = "SDSC/digiwild-dataset"
27
+ data_files = "data/train-00000-of-00001.parquet"
28
+
29
  # with gr.Blocks(theme=theme, css=css) as demo:
30
  with gr.Blocks(theme='shivi/calm_seafoam') as demo:
31
  individual = gr.State({})
 
402
  show_modal.click(lambda: Modal(visible=True), None, modal)
403
  show_modal.click(create_json_one_individual)
404
  show_modal.click(create_tmp)
405
+
406
  #submit_button.click(save_and_rest_df, inputs=[df], outputs=[df])
407
 
408
 
app/sync_dataset_hf.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Merge newly uploaded per-record JSON files into the compiled HF dataset.

Runs as a scheduled job (see .github/workflows/sync_dataset_hf.yml): downloads
every ``*.json`` record in the dataset repo, appends the ones whose
``image_md5`` is not already in the parquet split, and pushes the result back.
"""
import json
import os

from datasets import load_dataset
from huggingface_hub import HfApi, hf_hub_download

dataset_id = "SDSC/digiwild-dataset"
# BUG FIX: the workflow exports the secret as HF_TOKEN (and app code reads
# HF_TOKEN too); the original read HUGGINGFACE_TOKEN, which is never set.
token = os.getenv("HF_TOKEN")

# Initialize API client
api = HfApi(token=token)

# Collect every per-record metadata JSON file stored in the dataset repo.
files = api.list_repo_files(dataset_id, repo_type="dataset")
json_files = [file for file in files if file.endswith(".json")]

try:
    # Load the existing metadata compilation (single parquet shard).
    data_files = "data/train-00000-of-00001.parquet"
    metadata = load_dataset(dataset_id, data_files=data_files)
    # Hoist the md5 column into a set once: re-reading the whole column per
    # file (as the original did) is quadratic in dataset size.
    existing_md5s = set(metadata["train"]["image_md5"])
    # Add new json entries to the dataset, de-duplicating on image_md5.
    for file in json_files:
        local_path = hf_hub_download(repo_id=dataset_id, filename=file, repo_type="dataset")
        with open(local_path, "r") as f:
            new = json.load(f)
        if new["image_md5"] not in existing_md5s:
            metadata["train"] = metadata["train"].add_item(new)
            existing_md5s.add(new["image_md5"])
except Exception as err:
    # Boundary fallback kept from the original best-effort behavior: if the
    # parquet compilation cannot be loaded (e.g. it does not exist yet),
    # bootstrap the dataset from the JSON records alone. The original used a
    # bare ``except:``; we at least surface what went wrong.
    print(f"Falling back to JSON-only load: {err!r}")
    metadata = load_dataset(dataset_id, data_files=json_files)

metadata.push_to_hub(dataset_id, token=token)
app/validation_submission/create_json.py CHANGED
@@ -1,6 +1,19 @@
1
  import json
2
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  def create_json_one_individual(one_individual={}):
 
4
  one_individual = json.dumps(one_individual)
5
  with open("data/one_individual.json", "w") as outfile:
6
  outfile.write(one_individual)
 
1
  import json
2
 
3
+ import random
4
+ import string
5
+
6
+ import hashlib
7
+
8
def generate_random_md5():
    """Return the MD5 hex digest of a random 16-character alphanumeric string.

    Used to mint a pseudo-unique record identifier; not cryptographically
    strong (uses the ``random`` module).
    """
    alphabet = string.ascii_letters + string.digits
    seed_text = ''.join(random.choices(alphabet, k=16))
    return hashlib.md5(seed_text.encode()).hexdigest()
14
+
15
def create_json_one_individual(one_individual=None):
    """Tag the record with a random image_md5 and write it to data/one_individual.json.

    one_individual: dict of fields describing one individual; defaults to an
    empty record. If the caller passes a dict, it is mutated in place (the
    ``image_md5`` key is added), matching the original behavior.
    """
    # BUG FIX: the original default was a shared mutable ``{}`` that the
    # assignment below mutated, leaking the md5 key across calls.
    if one_individual is None:
        one_individual = {}
    one_individual["image_md5"] = generate_random_md5()
    with open("data/one_individual.json", "w") as outfile:
        outfile.write(json.dumps(one_individual))
app/validation_submission/get_json.py CHANGED
@@ -5,9 +5,13 @@ def get_json_one_individual():
5
  one_individual = json.load(openfile)
6
  return one_individual
7
 
 
 
8
  def get_json_all_individuals():
9
- with open("data/all_individuals.json", "r") as openfile:
10
- all_individuals = json.load(openfile)
 
 
11
  return all_individuals
12
 
13
  def get_json_tmp(tmp_name):
 
5
  one_individual = json.load(openfile)
6
  return one_individual
7
 
8
import os
def get_json_all_individuals():
    """Return the names of the entries in the local ``data`` directory.

    BUG FIX: the original called ``os.getfiles``, which does not exist in the
    standard library and raised AttributeError; ``os.listdir`` is the intended
    call. NOTE(review): this returns file *names*, not parsed JSON contents —
    confirm that is what callers (e.g. save_display_individual) expect.
    """
    all_individuals = []
    for animal in os.listdir("data"):
        all_individuals.append(animal)
    return all_individuals
16
 
17
  def get_json_tmp(tmp_name):
app/validation_submission/submission.py CHANGED
@@ -14,4 +14,20 @@ def save_to_all_individuals(one_individual):
14
  all_individuals_for_json = json.dumps(all_individuals)
15
  with open("data/all_individuals.json", "w") as outfile:
16
  outfile.write(all_individuals_for_json)
17
- return all_individuals
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  all_individuals_for_json = json.dumps(all_individuals)
15
  with open("data/all_individuals.json", "w") as outfile:
16
  outfile.write(all_individuals_for_json)
17
+ return all_individuals
18
+
19
from huggingface_hub import HfApi
import os

# Save the aggregated individuals file to the HF dataset repo.
# TODO(review): the original comment said "save all individuals one by one
# ... with md5 hash as json name", but the code uploads the single aggregate
# file — confirm the intended granularity.
def push_to_dataset_hf():
    """Upload data/all_individuals.json to the SDSC/digiwild-dataset repo."""
    token = os.environ.get("HF_TOKEN", None)
    api = HfApi(token=token)
    local_path = "data/all_individuals.json"
    # Parse once to fail fast on malformed JSON before uploading; the original
    # bound the result to ``all`` (shadowing the builtin) and never used it.
    with open(local_path, "r") as f:
        json.load(f)
    # BUG FIX: the original passed an undefined name ``path_in_repo``
    # (NameError at runtime); mirror the local path inside the repo.
    api.upload_file(
        path_or_fileobj=local_path,
        path_in_repo=local_path,
        repo_id="SDSC/digiwild-dataset",
        repo_type="dataset",
    )
requirements.txt CHANGED
@@ -3,4 +3,7 @@ gradio_modal
3
  geopy
4
  geopandas
5
  pillow
6
- python-dotenv
 
 
 
 
3
  geopy
4
  geopandas
5
  pillow
6
+ python-dotenv
7
+ datasets
8
+ huggingface_hub
9
+ # NOTE: hashlib is part of the Python standard library — do not pip install it