sycod committed
Commit a03ee84 · 1 Parent(s): 6ebb6d1

data ok and EDA begun

Files changed (7)
  1. EDA.ipynb +0 -0
  2. README.md +24 -1
  3. app.py +5 -4
  4. config.yaml +10 -1
  5. src/clear_hf_cache.sh +1 -0
  6. src/eda.py +36 -0
  7. src/load_data.py +156 -51
EDA.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
README.md CHANGED
@@ -2,4 +2,27 @@
 license: mit
 datasets:
 - frugal-ai-challenge/public-leaderboard-image
----
+---
+
+# Notices
+
+- **Stratification is not used for the data split** (splits are predefined in the project)
+
+# Installation
+
+✅ Checklist:
+- [ ] Check that the **absolute path to the HF cache folder** is up to date in "clear_hf_cache.sh"
+- [ ] Check that **"clear_hf_cache.sh" is executable** 👉 `chmod +x ./src/clear_hf_cache.sh`
+
+# 🚧 TODO
+
+- review the distribution of partners, cameras, timestamps and annotations
+- use **binary classification**
+- metrics: full **confusion matrix**
+- describe **error types and their consequences**
+- test several pretrained models
+- models:
+  - [ ] EfficientNet
+  - [ ] EfficientDet
+- test whether equalization improves results
+- review the spatial distribution of annotations across the image
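The TODO list calls for binary classification evaluated with a full confusion matrix. A minimal sketch of that metric, assuming scikit-learn and the smoke / no_smoke labels that src/load_data.py derives below; the sample vectors are purely illustrative:

```python
# Hypothetical scoring sketch for the binary smoke / no_smoke task
from sklearn.metrics import confusion_matrix, classification_report

y_true = ["smoke", "no_smoke", "smoke", "smoke", "no_smoke"]  # ground truth (illustrative)
y_pred = ["smoke", "no_smoke", "no_smoke", "smoke", "smoke"]  # model output (illustrative)

# Rows are true labels, columns are predictions, in the order given by `labels`
cm = confusion_matrix(y_true, y_pred, labels=["smoke", "no_smoke"])
print(cm)  # [[TP, FN], [FP, TN]] with "smoke" as the positive class
print(classification_report(y_true, y_pred))  # per-class precision / recall / F1
```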
app.py CHANGED
@@ -7,7 +7,7 @@ load_dotenv()
 
 app = FastAPI(
     title="Frugal AI Challenge API",
-    description="API for the Frugal AI Challenge evaluation endpoints"
+    description="API for the Frugal AI Challenge evaluation endpoints",
 )
 
 # Include all routers
@@ -15,6 +15,7 @@ app.include_router(text.router)
 app.include_router(image.router)
 app.include_router(audio.router)
 
+
 @app.get("/")
 async def root():
     return {
@@ -22,6 +23,6 @@ async def root():
         "endpoints": {
             "text": "/text - Text classification task",
             "image": "/image - Image classification task (coming soon)",
-            "audio": "/audio - Audio classification task (coming soon)"
-        }
-    }
+            "audio": "/audio - Audio classification task (coming soon)",
+        },
+    }
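These are formatting-only changes (trailing commas and spacing). A quick way to confirm nothing behavioral changed, assuming FastAPI's TestClient and its httpx dependency are installed; this sketch is not part of the commit:

```python
# Hypothetical smoke test for the root endpoint of app.py
from fastapi.testclient import TestClient

from app import app

client = TestClient(app)
response = client.get("/")
assert response.status_code == 200
assert set(response.json()["endpoints"]) == {"text", "image", "audio"}
```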
config.yaml CHANGED
@@ -1,4 +1,13 @@
-data_dir: "data"
+# From evaluation.py
+repo_id: "pyronear/pyro-sdis"
+split_size: 0.2
+rdm_seed: 42
+
+# Data
+data_root_dir: "data"
+raw_data_dir: "raw"
+clr_hf_cache_script_abs_path: './src/clear_hf_cache.sh'
+data_format: "keras" # "ultralytics" or "keras"
 db_info_uri: "data_info.csv"
 
 # log:
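The new data_format key selects between the two on-disk layouts. A minimal sketch of a consumer dispatching on it, reusing the yaml loading pattern from src/load_data.py (the branch bodies are placeholders):

```python
# Hypothetical dispatch on the config's data_format key
import yaml

with open("config.yaml", "r") as f:
    cfg = yaml.safe_load(f)

if cfg["data_format"] == "keras":
    print("format as data/keras/<split>/<label>/<name>.jpg")
elif cfg["data_format"] == "ultralytics":
    print("keep raw images/ + labels/ folders per split")
else:
    raise ValueError(f"unknown data_format: {cfg['data_format']}")
```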
src/clear_hf_cache.sh ADDED
@@ -0,0 +1 @@
+rm -rvf ~/.cache/huggingface
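The README checklist asks to verify the cache path hard-coded in this one-liner. A hedged pure-Python equivalent that avoids the path assumption altogether; a sketch, not what the commit ships:

```python
# Hypothetical Python equivalent of clear_hf_cache.sh
import shutil
from pathlib import Path

cache_dir = Path.home() / ".cache" / "huggingface"  # default HF cache location
shutil.rmtree(cache_dir, ignore_errors=True)
```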
src/eda.py ADDED
@@ -0,0 +1,36 @@
+"""EDA utilities: config loading and plotting helpers"""
+
+import logging
+import os
+import pandas as pd
+import yaml
+
+
+# Logging configuration (show INFO-level messages and above)
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+
+# local config
+with open("config.yaml", "r") as f:
+    cfg = yaml.safe_load(f)
+
+
+def make_autopct(values):
+    """
+    ==> Obtained from StackOverflow <==
+    Upgrades plt.pie(autopct=""), displaying percentages and values.
+
+    Input: list of numeric values or pandas.Series
+    Output: formatting function mapping a slice percentage to "pct% (value)"
+    """
+
+    def my_autopct(pct):
+        total = sum(values)
+        val = int(round(pct * total / 100.0))
+        return "{p:.2f}% ({v:d})".format(p=pct, v=val)
+
+    return my_autopct
+
+
+if __name__ == "__main__":
+    help()
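make_autopct is designed to be passed as the autopct callable of matplotlib's pie chart, so each wedge shows both its percentage and its raw count. A short usage sketch; the matplotlib import and the counts are illustrative:

```python
# Hypothetical usage of make_autopct with a pie chart
import matplotlib.pyplot as plt

values = [300, 100]  # e.g. images per class; numbers are illustrative
plt.pie(values, labels=["smoke", "no_smoke"], autopct=make_autopct(values))
plt.title("Class distribution")
plt.show()
```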
src/load_data.py CHANGED
@@ -1,9 +1,29 @@
-"""Load dataset and save locally in Ultralytics format"""
+"""Load dataset and save locally in selected format"""
 
 from datasets import load_dataset
 import logging
 import os
 import pandas as pd
+import shutil
+import subprocess
+import yaml
+
+
+# Logging configuration (show INFO-level messages and above)
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+
+# local config
+with open("config.yaml", "r") as f:
+    cfg = yaml.safe_load(f)
+REPO_ID = cfg["repo_id"]
+SPLIT_SIZE = cfg["split_size"]
+RDM_SEED = cfg["rdm_seed"]
+OUTPUT_DIR = cfg["data_root_dir"]
+RAW_DATA_DIR = os.path.join(OUTPUT_DIR, cfg["raw_data_dir"])
+CLR_CACHE_SCRIPT = cfg["clr_hf_cache_script_abs_path"]
+DATA_FORMAT = cfg["data_format"]
+DB_INFO_URI = os.path.join(OUTPUT_DIR, cfg["db_info_uri"])
 
 
 # Save in Ultralytics format
@@ -11,41 +31,44 @@ def save_ultralytics_format(dataset_split, split, IMAGE_DIR, LABEL_DIR):
     """Save a dataset split into the Ultralytics format.
 
     Args:
-        dataset_split: The dataset split (e.g., dataset["train"])
-        split: "train" or "val"
+        dataset_split: The dataset split (e.g. dataset["train"])
+        split: "train", "test" or "val"
     """
+
     image_split_dir = os.path.join(IMAGE_DIR, split)
     label_split_dir = os.path.join(LABEL_DIR, split)
-    if len(os.listdir(image_split_dir)) > 0 or len(os.listdir(label_split_dir)) > 0:
-        logging.info(f"{image_split_dir} or {label_split_dir} not empty: passing")
-    else:
-        for example in dataset_split:
-            # Save image to appropriate folder
-            image = example["image"]  # PIL.Image.Image
-            image_name = example["image_name"]  # Original file name
-            output_image_path = os.path.join(image_split_dir, image_name)
-            # Save image object to disk
-            image.save(output_image_path)
-
-            # Save label
-            annotations = example["annotations"]
-            label_name = image_name.replace(".jpg", ".txt").replace(".png", ".txt")
-            output_label_path = os.path.join(label_split_dir, label_name)
-            # Save label file
-            with open(output_label_path, "w") as label_file:
-                label_file.write(annotations)
-
-    logging.info(f"Dataset {split} split exported to Ultralytics format")
-
-
-def create_df(ds, split_name, OUTPUT_DIR):
+
+    for example in dataset_split:
+        # Save image to appropriate folder
+        image = example["image"]  # PIL.Image.Image
+        image_name = example["image_name"]  # Original file name
+        output_image_path = os.path.join(image_split_dir, image_name)
+        # Save image object to disk
+        image.save(output_image_path)
+
+        # Save label
+        annotations = example["annotations"]
+        label_name = image_name.replace(".jpg", ".txt").replace(".png", ".txt")
+        output_label_path = os.path.join(label_split_dir, label_name)
+        # Save label file
+        with open(output_label_path, "w") as label_file:
+            label_file.write(annotations)
+
+    logging.info(f"Dataset {split} split exported to Ultralytics format")
+
+
+def create_df(ds, split_name, output_dir):
     """Create dataframe from dataset"""
+
     df = pd.DataFrame(
         [[i.size[0], i.size[1], i.format, i.mode] for i in ds["image"]],
-        columns=["width", "height", "format", "mode"]
+        columns=["width", "height", "format", "mode"],
     )
     df["name"] = ds["image_name"]
-    df["uri"] = df['name'].apply(lambda x: os.path.join(OUTPUT_DIR, "images", split_name, x))
+    df["split"] = split_name
+    df["uri"] = df["name"].apply(
+        lambda x: os.path.join(output_dir, "images", split_name, x)
+    )
     df["annotations"] = ds["annotations"]
     df["partner"] = ds["partner"]
     df["camera"] = ds["camera"]
@@ -54,42 +77,124 @@ def create_df(ds, split_name, OUTPUT_DIR):
     return df
 
 
-def load_data(OUTPUT_DIR, REPO_ID, DB_INFO_URI):
-    """Load data and save to local directory in Ultralytics format
-    """
+def load_raw_data():
+    """Main function for downloading, splitting and formatting data"""
 
     # Check if data information already exists before loading the dataset
-    db_info_path = os.path.join(OUTPUT_DIR, DB_INFO_URI)
-    if os.path.exists(db_info_path):
-        df = pd.read_csv(db_info_path, index_col=0)
+    if os.path.exists(DB_INFO_URI):
+        df = pd.read_csv(DB_INFO_URI, index_col=0)
         return df
 
-    # Create the directory structure
-    IMAGE_DIR = os.path.join(OUTPUT_DIR, "images")
-    LABEL_DIR = os.path.join(OUTPUT_DIR, "labels")
-    for split in ["train", "val"]:
+    # Load data
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    os.makedirs(RAW_DATA_DIR, exist_ok=True)
+    logging.info("⚙️ Dataset loading...")
+    dataset = load_dataset(REPO_ID)
+    train_test = dataset["train"].train_test_split(test_size=SPLIT_SIZE, seed=RDM_SEED)
+    ds_train = train_test["train"]
+    ds_val = dataset["val"]
+    ds_test = train_test["test"]
+    logging.info("✅ Dataset loaded in cache folder")
+
+    # Create directory structure
+    IMAGE_DIR = os.path.join(RAW_DATA_DIR, "images")
+    LABEL_DIR = os.path.join(RAW_DATA_DIR, "labels")
+    for split in ["train", "val", "test"]:
         os.makedirs(os.path.join(IMAGE_DIR, split), exist_ok=True)
         os.makedirs(os.path.join(LABEL_DIR, split), exist_ok=True)
 
-    # Load the dataset from the Hugging Face Hub
-    dataset = load_dataset(REPO_ID)
-    logging.info("Dataset loaded in cache folder")
-
-    # Save train and validation splits
-    save_ultralytics_format(dataset["train"], "train", IMAGE_DIR, LABEL_DIR)
-    save_ultralytics_format(dataset["val"], "val", IMAGE_DIR, LABEL_DIR)
+    # Save dataset splits
+    save_ultralytics_format(ds_train, "train", IMAGE_DIR, LABEL_DIR)
+    save_ultralytics_format(ds_val, "val", IMAGE_DIR, LABEL_DIR)
+    save_ultralytics_format(ds_test, "test", IMAGE_DIR, LABEL_DIR)
 
     # Create global dataframe from splits
-    df_val = create_df(dataset["val"], "val", OUTPUT_DIR)
     # Separate train to save memory
-    df_train_1 = create_df(dataset["train"][:10000], "train", OUTPUT_DIR)
-    df_train_2 = create_df(dataset["train"][10000:20000], "train", OUTPUT_DIR)
-    df_train_3 = create_df(dataset["train"][20000:], "train", OUTPUT_DIR)
+    df_train_1 = create_df(ds_train[:6000], "train", RAW_DATA_DIR)
+    df_train_2 = create_df(ds_train[6000:12000], "train", RAW_DATA_DIR)
+    df_train_3 = create_df(ds_train[12000:18000], "train", RAW_DATA_DIR)
+    df_train_4 = create_df(ds_train[18000:], "train", RAW_DATA_DIR)
+    df_val = create_df(ds_val, "val", RAW_DATA_DIR)
+    df_test = create_df(ds_test, "test", RAW_DATA_DIR)
     # Save as one CSV
-    df = pd.concat([df_val, df_train_1, df_train_2, df_train_3], axis=0, ignore_index=True)
-    with open(db_info_path, "wb") as f:
+    df = pd.concat(
+        [df_train_1, df_train_2, df_train_3, df_train_4, df_val, df_test],
+        axis=0,
+        ignore_index=True,
+    )
+    # Create label column for classification
+    df["label"] = "smoke"
+    df.loc[df["annotations"].isna() | (df["annotations"] == ""), "label"] = "no_smoke"
+    # Reorder columns
+    df = df.loc[
+        :,
+        [
+            "name",
+            "label",
+            "split",
+            "format",
+            "mode",
+            "width",
+            "height",
+            "camera",
+            "partner",
+            "timestamp",
+            "annotations",
+            "uri",
+        ],
+    ]
+    # Save as CSV
+    with open(DB_INFO_URI, "wb") as f:
         df.to_csv(f)
 
+    # Clear HF default cache folder when done (~6 GB)
+    # 💡 Check first that the path is up to date in "clear_hf_cache.sh"
+    logging.info("🧹 Removing HF default cache folder...")
+    result = subprocess.run(["bash", CLR_CACHE_SCRIPT], capture_output=True, text=True)
+    # logging.info(result.stdout)
+    logging.info("✅ HF cache folder removed")
+
+    return df
+
+
+def clean_df(df):
+    """Filter data to keep only the necessary columns"""
+    # Filter columns
+    df = df[["name", "label", "split", "uri"]]
+    # Remove the ".jpg" extension from the name
+    df.loc[:, "name"] = df.name.apply(lambda x: x[:-4])
+
+    return df
+
+
+def format_data_keras(df):
+    """Format data for Keras models"""
+    if not os.path.exists(OUTPUT_DIR):
+        logging.warning(f"{OUTPUT_DIR} doesn't exist: (re)load data first")
+        return df
+
+    # Create Keras parent folder
+    keras_dir = os.path.join(OUTPUT_DIR, "keras")
+    os.makedirs(keras_dir, exist_ok=True)
+    # Create split folders
+    for split in df.split.unique():
+        split_dir = os.path.join(keras_dir, split)
+        os.makedirs(split_dir, exist_ok=True)
+        # Create label folders
+        for label in df.label.unique():
+            label_dir = os.path.join(split_dir, label)
+            os.makedirs(label_dir, exist_ok=True)
+
+    # Copy images to their new URI and update it in the dataframe
+    df.loc[:, "uri_dest"] = df.apply(
+        lambda x: os.path.join(OUTPUT_DIR, "keras", x["split"], x["label"], x["name"])
+        + ".jpg",
+        axis=1,
+    )
+    df.apply(lambda x: shutil.copy2(x["uri"], x["uri_dest"]), axis=1)
+    df.drop(columns="uri", inplace=True)
+    df.rename(columns={"uri_dest": "uri"}, inplace=True)
+
     return df
 
 
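Taken together, the refactor turns load_data.py into a small pipeline. A minimal sketch of the intended call order, assuming src is importable from the project root; the Keras loading line is an assumption suggested by data_format: "keras", not part of this commit:

```python
# Hypothetical end-to-end usage of the refactored loader
from src.load_data import load_raw_data, clean_df, format_data_keras

df = load_raw_data()        # download, split, export raw data, build data_info.csv
df = clean_df(df)           # keep only name / label / split / uri
df = format_data_keras(df)  # copy images into data/keras/<split>/<label>/*.jpg

# The resulting tree matches what Keras' directory loader expects, e.g.:
# import tensorflow as tf
# train_ds = tf.keras.utils.image_dataset_from_directory("data/keras/train")
```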