latterworks committed on
Commit
fad02fc
verified
1 Parent(s): 13d168d

Update app.py

Browse files
Files changed (1)
  1. app.py +338 -243
app.py CHANGED
@@ -1,9 +1,3 @@
1
- """
2
- EXIF Extraction Pipeline - HuggingFace Space Implementation
3
- Provides a full-stack solution for extracting EXIF metadata from images and
4
- pushing directly to a linked HuggingFace dataset repository.
5
- """
6
-
7
  import os
8
  import io
9
  import json
@@ -15,35 +9,57 @@ from datetime import datetime
15
  import threading
16
  import queue
17
18
  import gradio as gr
19
- from PIL import Image, ExifTags, UnidentifiedImageError
20
  import pandas as pd
21
- from huggingface_hub import HfApi, upload_file, create_repo, Repository, hf_hub_download
22
- from datasets import Dataset, load_dataset, concatenate_datasets
23
 
24
- # Configuration variables
25
  HF_USERNAME = os.environ.get("HF_USERNAME", "latterworks")
26
- HF_TOKEN = os.environ.get("HF_TOKEN", None) # Will use Spaces runtime token if not provided
27
  DATASET_NAME = os.environ.get("DATASET_NAME", "geo-metadata")
28
  DATASET_REPO = f"{HF_USERNAME}/{DATASET_NAME}"
29
- SPACE_ID = os.environ.get("SPACE_ID", f"{HF_USERNAME}/exif-extractor")
30
- REPO_MOUNTED = os.environ.get("REPO_MOUNTED", "true").lower() in ("true", "1", "t")
31
- LOCAL_STORAGE_PATH = Path("data")
 
32
  METADATA_FILE = LOCAL_STORAGE_PATH / "metadata.jsonl"
33
  MAX_BATCH_SIZE = 25
34
  SUPPORTED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.heic', '.tiff', '.tif', '.bmp', '.webp']
35
 
36
- # Initialize storage
37
- LOCAL_STORAGE_PATH.mkdir(exist_ok=True, parents=True)
38
-
39
- # Processing queue for background tasks
40
  process_queue = queue.Queue()
41
  upload_queue = queue.Queue()
42
 
43
- # ========== EXIF Extraction Core ==========
44
-
45
  def convert_to_degrees(value):
46
- """Convert GPS coordinates to decimal degrees"""
47
  try:
48
  d, m, s = value
49
  return d + (m / 60.0) + (s / 3600.0)
@@ -51,7 +67,7 @@ def convert_to_degrees(value):
51
  return value
52
 
53
  def extract_gps_info(gps_info):
54
- """Extract and format GPS metadata from EXIF"""
55
  if not gps_info or not isinstance(gps_info, dict):
56
  return None
57
 
@@ -60,12 +76,9 @@ def extract_gps_info(gps_info):
60
  tag_name = ExifTags.GPSTAGS.get(key, key)
61
  gps_data[tag_name] = val
62
 
63
- # Convert GPS coordinates to decimal format
64
  if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
65
  lat = convert_to_degrees(gps_data['GPSLatitude'])
66
  lon = convert_to_degrees(gps_data['GPSLongitude'])
67
-
68
- # Apply reference direction
69
  if gps_data.get('GPSLatitudeRef') == 'S':
70
  lat = -lat
71
  if gps_data.get('GPSLongitudeRef') == 'W':
@@ -73,58 +86,41 @@ def extract_gps_info(gps_info):
73
 
74
  gps_data['Latitude'] = lat
75
  gps_data['Longitude'] = lon
76
-
77
  return gps_data
78
 
79
  def make_serializable(value):
80
- """Convert non-serializable objects to JSON-serializable types"""
81
- # Handle PIL IFDRational objects
82
  if hasattr(value, 'numerator') and hasattr(value, 'denominator'):
83
  try:
84
  return float(value.numerator) / float(value.denominator)
85
- except (TypeError, ValueError, ZeroDivisionError):
86
  return str(value)
87
-
88
- # Handle rational numbers as tuples
89
  elif isinstance(value, tuple) and len(value) == 2:
90
  try:
91
  return float(value[0]) / float(value[1])
92
- except (TypeError, ValueError, ZeroDivisionError):
93
  return str(value)
94
-
95
- # Handle compound types recursively
96
  elif isinstance(value, (list, tuple)):
97
- return [make_serializable(item) for item in value]
98
  elif isinstance(value, dict):
99
  return {k: make_serializable(v) for k, v in value.items()}
100
-
101
- # Handle binary data
102
  elif isinstance(value, bytes):
103
  try:
104
  return value.decode('utf-8')
105
  except UnicodeDecodeError:
106
  return str(value)
107
-
108
- # Check JSON serializability
109
  try:
110
  json.dumps(value)
111
  return value
112
- except (TypeError, OverflowError):
113
  return str(value)
114
 
115
  def extract_metadata(image_path_or_obj, original_filename=None):
116
  """
117
- Extract EXIF and metadata from an image file or PIL Image object
118
-
119
- Args:
120
- image_path_or_obj: Path object, string path, or PIL Image object
121
- original_filename: Original filename if image_path_or_obj is a PIL Image
122
-
123
- Returns:
124
- Dict containing image metadata
125
  """
126
  try:
127
- # Handle different input types
128
  if isinstance(image_path_or_obj, Image.Image):
129
  image = image_path_or_obj
130
  file_name = original_filename or "unknown.jpg"
@@ -137,21 +133,17 @@ def extract_metadata(image_path_or_obj, original_filename=None):
137
  file_size = image_path.stat().st_size
138
  file_extension = image_path.suffix.lower()
139
 
140
- # Basic image metadata
141
  metadata = {
142
  "file_name": file_name,
143
  "format": image.format,
144
  "size": list(image.size),
145
  "mode": image.mode,
146
  "extraction_timestamp": datetime.now().isoformat(),
 
147
  }
148
-
149
  if file_size:
150
  metadata["file_size"] = file_size
151
-
152
- metadata["file_extension"] = file_extension
153
 
154
- # Extract EXIF data with error handling
155
  try:
156
  exif_data = image._getexif()
157
  except Exception as e:
@@ -162,8 +154,6 @@ def extract_metadata(image_path_or_obj, original_filename=None):
162
  for tag_id, value in exif_data.items():
163
  try:
164
  tag_name = ExifTags.TAGS.get(tag_id, f"tag_{tag_id}")
165
-
166
- # Extract GPS info
167
  if tag_name == "GPSInfo":
168
  gps_info = extract_gps_info(value)
169
  if gps_info:
@@ -175,16 +165,20 @@ def extract_metadata(image_path_or_obj, original_filename=None):
175
  else:
176
  metadata["exif"] = "No EXIF data available"
177
 
178
- # Validate serialization before returning
179
  try:
180
  json.dumps(metadata)
181
- except (TypeError, OverflowError) as e:
182
- # Filter out problematic entries as last resort
183
- basic_metadata = {k: v for k, v in metadata.items()
184
- if k in ["file_name", "format", "size", "mode", "file_size", "file_extension"]}
185
- basic_metadata["serialization_error"] = "Some metadata fields were removed due to JSON issues"
186
  return basic_metadata
187
-
188
  return metadata
189
 
190
  except Exception as e:
@@ -194,13 +188,10 @@ def extract_metadata(image_path_or_obj, original_filename=None):
194
  "extraction_timestamp": datetime.now().isoformat()
195
  }
196
 
197
- # ========== HuggingFace Integration ==========
198
-
199
  def save_metadata_to_jsonl(metadata_list, append=True):
200
- """Save metadata to JSONL file with error handling"""
201
  mode = 'a' if append and METADATA_FILE.exists() else 'w'
202
  success_count = 0
203
-
204
  with open(METADATA_FILE, mode) as f:
205
  for entry in metadata_list:
206
  try:
@@ -209,18 +200,16 @@ def save_metadata_to_jsonl(metadata_list, append=True):
209
  success_count += 1
210
  except Exception as e:
211
  print(f"Failed to serialize entry: {e}")
212
- # Write simplified version as fallback
213
- simplified = {"file_name": entry.get("file_name", "unknown"),
214
- "error": "Serialization failed"}
 
215
  f.write(json.dumps(simplified) + '\n')
216
-
217
  return success_count, len(metadata_list)
218
 
219
  def read_metadata_jsonl():
220
- """Read metadata from JSONL file"""
221
  if not METADATA_FILE.exists():
222
  return []
223
-
224
  metadata_list = []
225
  with open(METADATA_FILE, 'r') as f:
226
  for line in f:
@@ -230,75 +219,56 @@ def read_metadata_jsonl():
230
  continue
231
  return metadata_list
232
 
 
233
  def push_to_hub(metadata_list=None, create_if_not_exists=True):
234
- """Push metadata to HuggingFace Hub as a dataset"""
235
  api = HfApi(token=HF_TOKEN)
236
  try:
237
  if metadata_list is None:
238
  metadata_list = read_metadata_jsonl()
239
-
240
  if not metadata_list:
241
  return "No metadata to push", "warning"
242
 
243
- # Check if repository exists and create if needed
244
  repo_exists = True
245
  try:
246
  api.repo_info(repo_id=DATASET_REPO, repo_type="dataset")
247
  except Exception:
248
  repo_exists = False
249
  if create_if_not_exists:
250
- create_repo(
251
- repo_id=DATASET_REPO,
252
- repo_type="dataset",
253
- token=HF_TOKEN,
254
- private=False
255
- )
256
  else:
257
- return f"Dataset repository {DATASET_REPO} doesn't exist", "error"
258
 
259
- # Check if we need to merge with existing data
260
  existing_metadata = []
261
  if repo_exists:
262
  try:
263
- # Attempt to download existing metadata
264
- try:
265
- existing_file = hf_hub_download(
266
- repo_id=DATASET_REPO,
267
- filename="metadata.jsonl",
268
- repo_type="dataset",
269
- token=HF_TOKEN
270
- )
271
-
272
- # Parse existing metadata
273
- with open(existing_file, 'r') as f:
274
- for line in f:
275
- try:
276
- existing_metadata.append(json.loads(line))
277
- except json.JSONDecodeError:
278
- continue
279
- except Exception as e:
280
- print(f"No existing metadata found: {e}")
281
  except Exception as e:
282
- print(f"Error fetching existing metadata: {e}")
283
 
284
- # Merge new metadata with existing (avoiding duplicates by filename)
285
  if existing_metadata:
286
  existing_filenames = {item.get("file_name") for item in existing_metadata}
287
- unique_new_items = [item for item in metadata_list
288
- if item.get("file_name") not in existing_filenames]
289
-
290
- combined_metadata = existing_metadata + unique_new_items
291
- print(f"Combining {len(existing_metadata)} existing entries with {len(unique_new_items)} new entries")
292
  else:
293
  combined_metadata = metadata_list
294
-
295
- # Save temporary JSONL for upload
296
  temp_file = Path(tempfile.mktemp(suffix=".jsonl"))
297
  with open(temp_file, 'w') as f:
298
  for entry in combined_metadata:
299
  f.write(json.dumps(entry) + '\n')
300
 
301
- # Push to Hub with explicit API version compatibility
302
  api.upload_file(
303
  path_or_fileobj=str(temp_file),
304
  path_in_repo="metadata.jsonl",
@@ -307,32 +277,27 @@ def push_to_hub(metadata_list=None, create_if_not_exists=True):
307
  token=HF_TOKEN
308
  )
309
 
310
- # Create dataset card if needed
311
  readme_path = LOCAL_STORAGE_PATH / "README.md"
312
  if not readme_path.exists():
313
  with open(readme_path, 'w') as f:
314
- f.write(f"# EXIF Metadata Dataset\n\n"
315
- f"This dataset contains EXIF metadata extracted from images.\n\n"
316
- f"Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
317
- f"Total entries: {len(combined_metadata)}")
318
-
319
- # Update timestamp in README
320
  try:
321
  with open(readme_path, 'r') as f:
322
  readme_content = f.read()
323
-
324
- # Handle both cases: update existing timestamp or add one
325
- if "Last updated:" in readme_content:
326
- updated_readme = readme_content.replace(
327
- "Last updated:",
328
- f"Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\nTotal entries: {len(combined_metadata)}"
329
- )
330
- else:
331
- updated_readme = readme_content + f"\n\nLast updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\nTotal entries: {len(combined_metadata)}"
332
-
333
  with open(readme_path, 'w') as f:
334
  f.write(updated_readme)
335
-
336
  api.upload_file(
337
  path_or_fileobj=str(readme_path),
338
  path_in_repo="README.md",
@@ -343,216 +308,346 @@ def push_to_hub(metadata_list=None, create_if_not_exists=True):
343
  except Exception as e:
344
  print(f"Error updating README: {e}")
345
 
346
- return f"Successfully pushed {len(metadata_list)} metadata entries to {DATASET_REPO}", "success"
347
-
348
  except Exception as e:
349
- return f"Error pushing to Hub: {str(e)}", "error"
350
-
351
- # ========== Background Processing ==========
352
 
 
353
  def process_worker():
354
- """Background worker to process images in the queue"""
355
  while True:
356
  try:
357
  task = process_queue.get()
358
- if task is None: # Sentinel to stop the thread
359
  break
360
-
361
  file_path, original_filename = task
362
  metadata = extract_metadata(file_path, original_filename)
363
 
364
- # Save to JSONL
365
  success, total = save_metadata_to_jsonl([metadata])
366
-
367
- # Add to upload queue
368
  if success:
369
  upload_queue.put(metadata)
370
-
371
  process_queue.task_done()
372
  except Exception as e:
373
  print(f"Error in process worker: {e}")
374
  process_queue.task_done()
375
 
376
  def upload_worker():
377
- """Background worker to batch upload metadata to Hub"""
378
  batch = []
379
  last_upload_time = time.time()
380
-
381
  while True:
382
  try:
383
- # Wait for item with timeout
384
  try:
385
- metadata = upload_queue.get(timeout=60) # 1 minute timeout
386
  except queue.Empty:
387
- # If timeout and we have items, upload them
388
- if batch and (time.time() - last_upload_time) > 300: # 5 minutes passed
389
  push_to_hub(batch)
390
  batch = []
391
  last_upload_time = time.time()
392
  continue
393
-
394
- if metadata is None: # Sentinel to stop the thread
395
  break
396
-
397
  batch.append(metadata)
398
  upload_queue.task_done()
399
-
400
- # If batch size reached, upload
401
  if len(batch) >= MAX_BATCH_SIZE:
402
  push_to_hub(batch)
403
  batch = []
404
  last_upload_time = time.time()
405
-
406
  except Exception as e:
407
  print(f"Error in upload worker: {e}")
408
  if metadata:
409
  upload_queue.task_done()
410
 
411
- # Start worker threads
412
  process_thread = threading.Thread(target=process_worker, daemon=True)
413
  process_thread.start()
414
 
415
  upload_thread = threading.Thread(target=upload_worker, daemon=True)
416
  upload_thread.start()
417
 
418
- # ========== Gradio Interface ==========
419
-
420
  def process_uploaded_files(files):
421
- """Process uploaded files and extract metadata"""
422
  if not files:
423
  return "No files uploaded", "warning"
424
-
425
  processed = 0
426
  metadata_list = []
427
-
428
  for file in files:
429
  try:
430
- # Handle both Gradio v3.x and v4.x file objects
431
  if hasattr(file, 'name'):
432
- # Gradio v3.x
433
  file_path = Path(file.name)
434
  file_name = file_path.name
435
  else:
436
- # Gradio v4.x returns a tuple (path, orig_name)
437
  file_path = Path(file)
438
  file_name = file_path.name
439
-
440
  if file_path.suffix.lower() not in SUPPORTED_EXTENSIONS:
441
  continue
442
-
443
  metadata = extract_metadata(file_path, file_name)
444
  metadata_list.append(metadata)
445
  processed += 1
446
-
447
- # Queue for background processing if needed
448
  process_queue.put((file_path, file_name))
449
  except Exception as e:
450
- print(f"Error processing {getattr(file, 'name', str(file))}: {e}")
451
-
452
  if metadata_list:
453
  success, total = save_metadata_to_jsonl(metadata_list)
454
- return f"Processed {processed} files. {success}/{total} metadata entries saved successfully.", "success"
 
455
  else:
456
- return f"No valid image files found among {len(files)} uploaded files", "warning"
457
 
458
  def view_metadata():
459
- """Display current metadata as a DataFrame"""
460
  metadata_list = read_metadata_jsonl()
461
-
462
  if not metadata_list:
463
  return "No metadata available", pd.DataFrame()
464
 
465
- # Create a flattened version for display
466
  display_data = []
467
  for entry in metadata_list:
468
- display_row = {
469
  "filename": entry.get("file_name", "unknown"),
470
- "width": entry.get("size", [0, 0])[0] if isinstance(entry.get("size"), list) else None,
471
- "height": entry.get("size", [0, 0])[1] if isinstance(entry.get("size"), list) else None,
472
  "format": entry.get("format"),
473
  "has_gps": "Yes" if entry.get("gps_info") else "No"
474
  }
475
-
476
- # Extract GPS coordinates if available
 
477
  if entry.get("gps_info"):
478
  gps = entry["gps_info"]
479
- display_row["latitude"] = gps.get("Latitude")
480
- display_row["longitude"] = gps.get("Longitude")
481
-
482
- display_data.append(display_row)
483
-
484
  df = pd.DataFrame(display_data)
485
- return f"Found {len(metadata_list)} metadata entries", df
486
 
487
  def manual_push_to_hub():
488
- """Manually trigger push to Hub"""
489
  return push_to_hub()
490
 
491
  with gr.Blocks(title="EXIF Extraction Pipeline") as app:
492
- gr.Markdown("""
493
  # EXIF Metadata Extraction Pipeline
494
 
495
- Upload images to extract EXIF metadata including GPS coordinates and publish to HuggingFace Hub.
 
 
 
496
 
497
- **Current configuration:**
498
- * Dataset repo: {repo}
499
- * Local storage: {storage}
500
- * Supported formats: {formats}
501
- """.format(
502
- repo=DATASET_REPO,
503
- storage=LOCAL_STORAGE_PATH,
504
- formats=", ".join(SUPPORTED_EXTENSIONS)
505
- ))
506
 
507
  with gr.Tabs():
508
  with gr.TabItem("Upload Images"):
509
- with gr.Row():
510
- file_input = gr.File(file_count="multiple", label="Upload Images")
511
-
512
- with gr.Row():
513
- submit_btn = gr.Button("Process Images")
514
- output_status = gr.Textbox(label="Status")
515
-
516
- submit_btn.click(
517
- fn=process_uploaded_files,
518
- inputs=[file_input],
519
- outputs=[output_status]
520
- )
521
-
522
  with gr.TabItem("View Metadata"):
523
- with gr.Row():
524
- refresh_btn = gr.Button("Refresh Metadata")
525
-
526
- with gr.Row():
527
- view_status = gr.Textbox(label="Status")
528
-
529
- with gr.Row():
530
- results_df = gr.DataFrame(label="Metadata Overview")
531
-
532
- refresh_btn.click(
533
- fn=view_metadata,
534
- inputs=[],
535
- outputs=[view_status, results_df]
536
- )
537
-
538
- # Auto-load metadata on tab selection
539
- app.load(
540
- fn=view_metadata,
541
- inputs=[],
542
- outputs=[view_status, results_df]
543
- )
544
-
545
  with gr.TabItem("Hub Management"):
546
- with gr.Row():
547
- push_btn = gr.Button("Push to HuggingFace Hub")
548
- push_status = gr.Textbox(label="Status")
549
-
550
- push_btn.click(
551
- fn=manual_push_to_hub,
552
- inputs=[],
553
- outputs=[push_status]
554
- )
 
555
 
556
- # Initialize application
557
  if __name__ == "__main__":
558
- app.launch()
1
  import os
2
  import io
3
  import json
 
9
  import threading
10
  import queue
11
 
12
+ # ====================== Additional Imports ======================
13
+ import torch
14
+ from torch import nn
15
+ import torch.nn.functional as F
16
+ from torch.utils.data import Dataset, DataLoader
17
+ from torchvision import transforms
18
+ from PIL import Image, ExifTags
19
+
20
+ from tqdm import tqdm
21
  import gradio as gr
 
22
  import pandas as pd
 
 
23
 
24
+ # Hugging Face Hub
25
+ from huggingface_hub import (
26
+ hf_hub_download,
27
+ login,
28
+ whoami,
29
+ create_repo,
30
+ HfApi,
31
+ InferenceClient,
32
+ )
33
+
34
+ # ====================== Configuration & Paths ======================
35
  HF_USERNAME = os.environ.get("HF_USERNAME", "latterworks")
36
+ HF_TOKEN = os.environ.get("HF_TOKEN", None) # If not provided, use default Spaces token
37
  DATASET_NAME = os.environ.get("DATASET_NAME", "geo-metadata")
38
  DATASET_REPO = f"{HF_USERNAME}/{DATASET_NAME}"
39
+
40
+ # Relative local paths
41
+ LOCAL_STORAGE_PATH = Path("./data")
42
+ LOCAL_STORAGE_PATH.mkdir(exist_ok=True, parents=True)
43
  METADATA_FILE = LOCAL_STORAGE_PATH / "metadata.jsonl"
44
+
45
+ IMAGES_DIR = Path("./images") # place your images here
46
+ IMAGES_DIR.mkdir(exist_ok=True, parents=True)
47
+
48
+ # We’ll store checkpoints here:
49
+ CHECKPOINTS_DIR = Path("./checkpoints")
50
+ CHECKPOINTS_DIR.mkdir(exist_ok=True, parents=True)
51
+ CHECKPOINT_PATH = CHECKPOINTS_DIR / "last_checkpoint.pth"
52
+
53
  MAX_BATCH_SIZE = 25
54
  SUPPORTED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.heic', '.tiff', '.tif', '.bmp', '.webp']
55
 
56
+ # ====================== Queues and Threads ======================
 
 
 
57
  process_queue = queue.Queue()
58
  upload_queue = queue.Queue()
59
 
60
+ # ====================== EXIF Extraction Core ======================
 
61
  def convert_to_degrees(value):
62
+ """Convert GPS coords to decimal degrees."""
63
  try:
64
  d, m, s = value
65
  return d + (m / 60.0) + (s / 3600.0)
 
67
  return value
68
 
69
  def extract_gps_info(gps_info):
70
+ """Extract and format GPS metadata from EXIF."""
71
  if not gps_info or not isinstance(gps_info, dict):
72
  return None
73
 
 
76
  tag_name = ExifTags.GPSTAGS.get(key, key)
77
  gps_data[tag_name] = val
78
 
 
79
  if 'GPSLatitude' in gps_data and 'GPSLongitude' in gps_data:
80
  lat = convert_to_degrees(gps_data['GPSLatitude'])
81
  lon = convert_to_degrees(gps_data['GPSLongitude'])
 
 
82
  if gps_data.get('GPSLatitudeRef') == 'S':
83
  lat = -lat
84
  if gps_data.get('GPSLongitudeRef') == 'W':
 
86
 
87
  gps_data['Latitude'] = lat
88
  gps_data['Longitude'] = lon
 
89
  return gps_data
90
 
91
  def make_serializable(value):
92
+ """Convert objects to JSON-serializable."""
 
93
  if hasattr(value, 'numerator') and hasattr(value, 'denominator'):
94
  try:
95
  return float(value.numerator) / float(value.denominator)
96
+ except (TypeError, ValueError, ZeroDivisionError):
97
  return str(value)
 
 
98
  elif isinstance(value, tuple) and len(value) == 2:
99
  try:
100
  return float(value[0]) / float(value[1])
101
+ except (TypeError, ValueError, ZeroDivisionError):
102
  return str(value)
 
 
103
  elif isinstance(value, (list, tuple)):
104
+ return [make_serializable(v) for v in value]
105
  elif isinstance(value, dict):
106
  return {k: make_serializable(v) for k, v in value.items()}
 
 
107
  elif isinstance(value, bytes):
108
  try:
109
  return value.decode('utf-8')
110
  except UnicodeDecodeError:
111
  return str(value)
112
+ # final fallback
 
113
  try:
114
  json.dumps(value)
115
  return value
116
+ except (TypeError, OverflowError):
117
  return str(value)
118
 
119
  def extract_metadata(image_path_or_obj, original_filename=None):
120
  """
121
+ Extract EXIF & metadata from a file or PIL Image.
122
  """
123
  try:
 
124
  if isinstance(image_path_or_obj, Image.Image):
125
  image = image_path_or_obj
126
  file_name = original_filename or "unknown.jpg"
 
133
  file_size = image_path.stat().st_size
134
  file_extension = image_path.suffix.lower()
135
 
 
136
  metadata = {
137
  "file_name": file_name,
138
  "format": image.format,
139
  "size": list(image.size),
140
  "mode": image.mode,
141
  "extraction_timestamp": datetime.now().isoformat(),
142
+ "file_extension": file_extension
143
  }
 
144
  if file_size:
145
  metadata["file_size"] = file_size
 
 
146
 
 
147
  try:
148
  exif_data = image._getexif()
149
  except Exception as e:
 
154
  for tag_id, value in exif_data.items():
155
  try:
156
  tag_name = ExifTags.TAGS.get(tag_id, f"tag_{tag_id}")
 
 
157
  if tag_name == "GPSInfo":
158
  gps_info = extract_gps_info(value)
159
  if gps_info:
 
165
  else:
166
  metadata["exif"] = "No EXIF data available"
167
 
168
+ # Validate serializability
169
  try:
170
  json.dumps(metadata)
171
+ except (TypeError, OverflowError):
172
+ # fallback
173
+ basic_metadata = {
174
+ "file_name": metadata.get("file_name", "unknown"),
175
+ "format": metadata.get("format", None),
176
+ "size": metadata.get("size", None),
177
+ "mode": metadata.get("mode", None),
178
+ "file_extension": metadata.get("file_extension", None),
179
+ }
180
+ basic_metadata["serialization_error"] = "Some metadata were removed."
181
  return basic_metadata
 
182
  return metadata
183
 
184
  except Exception as e:
 
188
  "extraction_timestamp": datetime.now().isoformat()
189
  }
190
 
191
+ # ====================== Save/Load JSONL ======================
 
192
  def save_metadata_to_jsonl(metadata_list, append=True):
 
193
  mode = 'a' if append and METADATA_FILE.exists() else 'w'
194
  success_count = 0
 
195
  with open(METADATA_FILE, mode) as f:
196
  for entry in metadata_list:
197
  try:
 
200
  success_count += 1
201
  except Exception as e:
202
  print(f"Failed to serialize entry: {e}")
203
+ simplified = {
204
+ "file_name": entry.get("file_name", "unknown"),
205
+ "error": "Serialization failed"
206
+ }
207
  f.write(json.dumps(simplified) + '\n')
 
208
  return success_count, len(metadata_list)
209
 
210
  def read_metadata_jsonl():
 
211
  if not METADATA_FILE.exists():
212
  return []
 
213
  metadata_list = []
214
  with open(METADATA_FILE, 'r') as f:
215
  for line in f:
 
219
  continue
220
  return metadata_list
221
 
222
+ # ====================== Pushing to HuggingFace Hub ======================
223
  def push_to_hub(metadata_list=None, create_if_not_exists=True):
 
224
  api = HfApi(token=HF_TOKEN)
225
  try:
226
  if metadata_list is None:
227
  metadata_list = read_metadata_jsonl()
 
228
  if not metadata_list:
229
  return "No metadata to push", "warning"
230
 
 
231
  repo_exists = True
232
  try:
233
  api.repo_info(repo_id=DATASET_REPO, repo_type="dataset")
234
  except Exception:
235
  repo_exists = False
236
  if create_if_not_exists:
237
+ create_repo(repo_id=DATASET_REPO, repo_type="dataset", token=HF_TOKEN, private=False)
238
  else:
239
+ return f"Dataset repo {DATASET_REPO} doesn't exist.", "error"
240
 
 
241
  existing_metadata = []
242
  if repo_exists:
243
  try:
244
+ existing_file = hf_hub_download(
245
+ repo_id=DATASET_REPO,
246
+ filename="metadata.jsonl",
247
+ repo_type="dataset",
248
+ token=HF_TOKEN
249
+ )
250
+ with open(existing_file, 'r') as f:
251
+ for line in f:
252
+ try:
253
+ existing_metadata.append(json.loads(line))
254
+ except json.JSONDecodeError:
255
+ pass
256
  except Exception as e:
257
+ print(f"No existing metadata found or error reading: {e}")
258
 
 
259
  if existing_metadata:
260
  existing_filenames = {item.get("file_name") for item in existing_metadata}
261
+ unique_new = [item for item in metadata_list
262
+ if item.get("file_name") not in existing_filenames]
263
+ combined_metadata = existing_metadata + unique_new
 
 
264
  else:
265
  combined_metadata = metadata_list
266
+
 
267
  temp_file = Path(tempfile.mktemp(suffix=".jsonl"))
268
  with open(temp_file, 'w') as f:
269
  for entry in combined_metadata:
270
  f.write(json.dumps(entry) + '\n')
271
 
 
272
  api.upload_file(
273
  path_or_fileobj=str(temp_file),
274
  path_in_repo="metadata.jsonl",
 
277
  token=HF_TOKEN
278
  )
279
 
 
280
  readme_path = LOCAL_STORAGE_PATH / "README.md"
281
  if not readme_path.exists():
282
  with open(readme_path, 'w') as f:
283
+ f.write(
284
+ f"# EXIF Metadata Dataset\n\n"
285
+ f"This dataset contains EXIF metadata.\n\n"
286
+ f"Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
287
+ f"Total entries: {len(combined_metadata)}"
288
+ )
289
  try:
290
  with open(readme_path, 'r') as f:
291
  readme_content = f.read()
292
+ updated_readme = (
293
+ f"# EXIF Metadata Dataset\n\n"
294
+ f"This dataset contains EXIF metadata.\n\n"
295
+ f"Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
296
+ f"Total entries: {len(combined_metadata)}"
297
+ )
298
  with open(readme_path, 'w') as f:
299
  f.write(updated_readme)
300
+
301
  api.upload_file(
302
  path_or_fileobj=str(readme_path),
303
  path_in_repo="README.md",
 
308
  except Exception as e:
309
  print(f"Error updating README: {e}")
310
 
311
+ return f"Successfully pushed {len(metadata_list)} entries to {DATASET_REPO}", "success"
 
312
  except Exception as e:
313
+ return f"Error pushing to Hub: {e}", "error"
 
 
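Note that the merge above deduplicates purely by file_name, so re-extracting metadata for an already-pushed image never updates its entry. A minimal sketch of an alternative merge that keeps the newest record per filename (an assumption, relying on the extraction_timestamp field that extract_metadata writes):

    def merge_by_timestamp(existing_metadata, new_metadata):
        # keep exactly one entry per file_name, preferring the newest extraction_timestamp
        merged = {}
        for item in existing_metadata + new_metadata:
            key = item.get("file_name", "unknown")
            prev = merged.get(key)
            if prev is None or item.get("extraction_timestamp", "") > prev.get("extraction_timestamp", ""):
                merged[key] = item
        return list(merged.values())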
314
 
315
+ # ====================== Background Processing Threads ======================
316
  def process_worker():
 
317
  while True:
318
  try:
319
  task = process_queue.get()
320
+ if task is None:
321
  break
 
322
  file_path, original_filename = task
323
  metadata = extract_metadata(file_path, original_filename)
324
 
 
325
  success, total = save_metadata_to_jsonl([metadata])
 
 
326
  if success:
327
  upload_queue.put(metadata)
 
328
  process_queue.task_done()
329
  except Exception as e:
330
  print(f"Error in process worker: {e}")
331
  process_queue.task_done()
332
 
333
  def upload_worker():
 
334
  batch = []
335
  last_upload_time = time.time()
 
336
  while True:
337
  try:
 
338
  try:
339
+ metadata = upload_queue.get(timeout=60)
340
  except queue.Empty:
341
+ if batch and (time.time() - last_upload_time) > 300:
 
342
  push_to_hub(batch)
343
  batch = []
344
  last_upload_time = time.time()
345
  continue
346
+ if metadata is None:
 
347
  break
 
348
  batch.append(metadata)
349
  upload_queue.task_done()
 
 
350
  if len(batch) >= MAX_BATCH_SIZE:
351
  push_to_hub(batch)
352
  batch = []
353
  last_upload_time = time.time()
 
354
  except Exception as e:
355
  print(f"Error in upload worker: {e}")
356
  if metadata:
357
  upload_queue.task_done()
358
 
 
359
  process_thread = threading.Thread(target=process_worker, daemon=True)
360
  process_thread.start()
361
 
362
  upload_thread = threading.Thread(target=upload_worker, daemon=True)
363
  upload_thread.start()
364
 
365
+ # ====================== Gradio App ======================
 
366
  def process_uploaded_files(files):
 
367
  if not files:
368
  return "No files uploaded", "warning"
 
369
  processed = 0
370
  metadata_list = []
 
371
  for file in files:
372
  try:
373
+ # If using Gradio 3.x
374
  if hasattr(file, 'name'):
 
375
  file_path = Path(file.name)
376
  file_name = file_path.name
377
  else:
378
+ # If using Gradio 4.x => (path, orig_name)
379
  file_path = Path(file)
380
  file_name = file_path.name
381
+
382
  if file_path.suffix.lower() not in SUPPORTED_EXTENSIONS:
383
  continue
384
+
385
  metadata = extract_metadata(file_path, file_name)
386
  metadata_list.append(metadata)
387
  processed += 1
 
 
388
  process_queue.put((file_path, file_name))
389
  except Exception as e:
390
+ print(f"Error processing {file_path}: {e}")
 
391
  if metadata_list:
392
  success, total = save_metadata_to_jsonl(metadata_list)
393
+ return (f"Processed {processed} files. "
394
+ f"{success}/{total} metadata entries saved."), "success"
395
  else:
396
+ return f"No valid image files among the {len(files)} uploaded.", "warning"
397
 
398
  def view_metadata():
 
399
  metadata_list = read_metadata_jsonl()
 
400
  if not metadata_list:
401
  return "No metadata available", pd.DataFrame()
402
 
 
403
  display_data = []
404
  for entry in metadata_list:
405
+ row = {
406
  "filename": entry.get("file_name", "unknown"),
407
+ "width": None,
408
+ "height": None,
409
  "format": entry.get("format"),
410
  "has_gps": "Yes" if entry.get("gps_info") else "No"
411
  }
412
+ size = entry.get("size")
413
+ if isinstance(size, list) and len(size) == 2:
414
+ row["width"], row["height"] = size
415
  if entry.get("gps_info"):
416
  gps = entry["gps_info"]
417
+ row["latitude"] = gps.get("Latitude")
418
+ row["longitude"] = gps.get("Longitude")
419
+ display_data.append(row)
 
 
420
  df = pd.DataFrame(display_data)
421
+ return f"Found {len(metadata_list)} entries", df
422
 
423
  def manual_push_to_hub():
 
424
  return push_to_hub()
425
 
426
  with gr.Blocks(title="EXIF Extraction Pipeline") as app:
427
+ gr.Markdown(f"""
428
  # EXIF Metadata Extraction Pipeline
429
 
430
+ **Local storage**: `./data`
431
+ **Images directory**: `./images`
432
+ **Checkpoints**: `./checkpoints`
433
+ **Supported formats**: {", ".join(SUPPORTED_EXTENSIONS)}
434
 
435
+ Upload images to extract EXIF metadata (including GPS) and push to HuggingFace Hub.
436
+ """)
437
 
438
  with gr.Tabs():
439
  with gr.TabItem("Upload Images"):
440
+ file_input = gr.File(file_count="multiple", label="Upload Images")
441
+ submit_btn = gr.Button("Process Images")
442
+ output_status = gr.Textbox(label="Status")
443
+ submit_btn.click(fn=process_uploaded_files, inputs=[file_input], outputs=[output_status])
444
+
445
  with gr.TabItem("View Metadata"):
446
+ refresh_btn = gr.Button("Refresh Metadata")
447
+ view_status = gr.Textbox(label="Status")
448
+ results_df = gr.DataFrame(label="Metadata Overview")
449
+ refresh_btn.click(fn=view_metadata, inputs=[], outputs=[view_status, results_df])
450
+ app.load(fn=view_metadata, inputs=[], outputs=[view_status, results_df])
451
+
452
  with gr.TabItem("Hub Management"):
453
+ push_btn = gr.Button("Push to HuggingFace Hub")
454
+ push_status = gr.Textbox(label="Status")
455
+ push_btn.click(fn=manual_push_to_hub, inputs=[], outputs=[push_status])
456
+
457
+ # ====================== PyTorch: Using GPS Data ======================
458
+ def load_exif_gps_metadata(metadata_file=METADATA_FILE):
459
+ gps_map = {}
460
+ if not os.path.exists(metadata_file):
461
+ return gps_map
462
+ with open(metadata_file, "r") as f:
463
+ for line in f:
464
+ try:
465
+ entry = json.loads(line)
466
+ gps_info = entry.get("gps_info")
467
+ if gps_info and "Latitude" in gps_info and "Longitude" in gps_info:
468
+ lat = gps_info["Latitude"]
469
+ lon = gps_info["Longitude"]
470
+ gps_map[entry["file_name"]] = (lat, lon)
471
+ except (json.JSONDecodeError, KeyError):
472
+ pass
473
+ return gps_map
474
+
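For reference, a hypothetical (abridged) entry in ./data/metadata.jsonl as written by extract_metadata(); load_exif_gps_metadata() only looks at file_name plus the Latitude/Longitude keys inside gps_info:

    example_entry = {
        "file_name": "IMG_0001.jpg",   # illustrative filename, not from the dataset
        "format": "JPEG",
        "size": [4032, 3024],
        "mode": "RGB",
        "gps_info": {"Latitude": 40.7128, "Longitude": -74.006},
    }
    # load_exif_gps_metadata would then yield: {"IMG_0001.jpg": (40.7128, -74.006)}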
475
+ class GPSImageDataset(Dataset):
476
+ def __init__(self, images_dir, gps_map, transform=None):
477
+ self.images_dir = Path(images_dir)
478
+ self.transform = transform
479
+ self.gps_map = gps_map
480
+
481
+ # Filter to only files that have GPS data
482
+ self.file_names = []
483
+ for fn in os.listdir(self.images_dir):
484
+ if fn in gps_map: # ensure we have matching metadata
485
+ self.file_names.append(fn)
486
+
487
+ def __len__(self):
488
+ return len(self.file_names)
489
+
490
+ def __getitem__(self, idx):
491
+ file_name = self.file_names[idx]
492
+ img_path = self.images_dir / file_name
493
+ image = Image.open(img_path).convert("RGB")
494
+ if self.transform:
495
+ image = self.transform(image)
496
+
497
+ lat, lon = self.gps_map[file_name]
498
+ gps_tensor = torch.tensor([lat, lon], dtype=torch.float)
499
+ return image, gps_tensor
500
+
501
+ def train_one_epoch(
502
+ train_dataloader, model, optimizer, epoch, batch_size, device,
503
+ scheduler=None, criterion=nn.CrossEntropyLoss()
504
+ ):
505
+ print(f"\nStarting Epoch {epoch} ...")
506
+ bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
507
+
508
+ # Create some placeholder targets (for demonstration only).
509
+ targets_img_gps = torch.arange(0, batch_size).long().to(device)
510
+
511
+ for i, (imgs, gps) in bar:
512
+ imgs, gps = imgs.to(device), gps.to(device)
513
+ gps_queue = model.get_gps_queue() # Hypothetical in your model
514
+
515
+ optimizer.zero_grad()
516
+ gps_all = torch.cat([gps, gps_queue], dim=0)
517
+ model.dequeue_and_enqueue(gps)
518
+
519
+ logits_img_gps = model(imgs, gps_all)
520
+ loss = criterion(logits_img_gps, targets_img_gps)
521
+
522
+ loss.backward()
523
+ optimizer.step()
524
+
525
+ bar.set_description(f"Epoch {epoch} loss: {loss.item():.5f}")
526
+
527
+ if scheduler:
528
+ scheduler.step()
529
+
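One caveat with the placeholder targets above: torch.arange(0, batch_size) is built once outside the loop, but the DataLoader created in __main__ below uses shuffle=True without drop_last, so the final batch can be smaller and nn.CrossEntropyLoss will see mismatched shapes. A hedged fix is to size the targets to the actual batch inside the loop:

    # inside the `for i, (imgs, gps) in bar:` loop, after moving tensors to the device
    targets_img_gps = torch.arange(imgs.size(0), dtype=torch.long, device=device)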
530
+ # ====================== Checkpoint Helpers ======================
531
+ def save_checkpoint(model, optimizer, epoch, path=CHECKPOINT_PATH):
532
+ """
533
+ Saves model + optimizer state_dict along with current epoch
534
+ to `path`.
535
+ """
536
+ ckpt = {
537
+ "epoch": epoch,
538
+ "model_state": model.state_dict(),
539
+ "optimizer_state": optimizer.state_dict(),
540
+ }
541
+ torch.save(ckpt, path)
542
+ print(f"[Checkpoint] Saved at epoch={epoch} -> {path}")
543
+
544
+ def load_checkpoint(model, optimizer, path=CHECKPOINT_PATH, device="cpu"):
545
+ """
546
+ Loads checkpoint into model + optimizer, returns the last epoch.
547
+ """
548
+ if not os.path.exists(path):
549
+ print(f"No checkpoint found at {path}. Starting fresh.")
550
+ return 0
551
+ ckpt = torch.load(path, map_location=device)
552
+ model.load_state_dict(ckpt["model_state"])
553
+ optimizer.load_state_dict(ckpt["optimizer_state"])
554
+ print(f"[Checkpoint] Loaded from {path} (epoch={ckpt['epoch']})")
555
+ return ckpt["epoch"]
556
+
557
+ # ====================== Continuous Trainer ======================
558
+ def continuous_train(
559
+ train_dataloader,
560
+ model,
561
+ optimizer,
562
+ device,
563
+ start_epoch=1,
564
+ max_epochs=5,
565
+ scheduler=None
566
+ ):
567
+ """
568
+ Loads checkpoint if available, then trains up to `max_epochs`.
569
+ Saves new checkpoint at the end of each epoch.
570
+ """
571
+ # Attempt to load from existing checkpoint
572
+ loaded_epoch = load_checkpoint(model, optimizer, path=CHECKPOINT_PATH, device=device)
573
+ # If loaded_epoch=3 and user says max_epochs=5, we continue from epoch 4, 5
574
+ current_epoch = loaded_epoch + 1
575
+ final_epoch = max(loaded_epoch + 1, max_epochs) # ensure we do something
576
+
577
+ # Example: train from current_epoch -> max_epochs
578
+ while current_epoch <= max_epochs:
579
+ train_one_epoch(
580
+ train_dataloader=train_dataloader,
581
+ model=model,
582
+ optimizer=optimizer,
583
+ epoch=current_epoch,
584
+ batch_size=train_dataloader.batch_size,
585
+ device=device,
586
+ scheduler=scheduler
587
+ )
588
+ # Save checkpoint each epoch
589
+ save_checkpoint(model, optimizer, current_epoch, CHECKPOINT_PATH)
590
+ current_epoch += 1
591
+
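Since continuous_train loads the checkpoint itself, resuming after a restart only requires calling it again with a higher max_epochs; a hypothetical second run might look like:

    # picks up from ./checkpoints/last_checkpoint.pth (e.g. epoch 5) and trains epochs 6-8
    continuous_train(
        train_dataloader=train_dataloader,
        model=model,
        optimizer=optimizer,
        device=device,
        max_epochs=8,
    )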
592
+ class ExampleGPSModel(nn.Module):
593
+ def __init__(self, gps_queue_len=10):
594
+ super().__init__()
595
+ self.conv = nn.Conv2d(3, 16, kernel_size=3, padding=1)
596
+ self.flatten = nn.Flatten()
597
+ self.fc_img = nn.Linear(16 * 224 * 224, 32)
598
+ self.fc_gps = nn.Linear(2, 32)
599
+ self.fc_out = nn.Linear(64, 10)
600
+ self.gps_queue_len = gps_queue_len
601
+ self._gps_queue = torch.zeros((gps_queue_len, 2), dtype=torch.float)
602
+
603
+ def forward(self, imgs, gps_all):
604
+ x = self.conv(imgs)
605
+ x = F.relu(x)
606
+ x = self.flatten(x)
607
+ x = self.fc_img(x)
608
+
609
+ g = self.fc_gps(gps_all)
610
+ # Average all GPS embeddings
611
+ if g.dim() == 2:
612
+ g = g.mean(dim=0, keepdim=True)
613
+ combined = torch.cat([x, g.repeat(x.size(0), 1)], dim=1)
614
+ out = self.fc_out(combined)
615
+ return out
616
+
617
+ def get_gps_queue(self):
618
+ return self._gps_queue
619
+
620
+ def dequeue_and_enqueue(self, new_gps):
621
+ B = new_gps.shape[0]
622
+ self._gps_queue = torch.roll(self._gps_queue, shifts=-B, dims=0)
623
+ self._gps_queue[-B:] = new_gps
624
 
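For intuition, dequeue_and_enqueue treats _gps_queue as a fixed-length FIFO: torch.roll shifts the oldest rows to the tail, and the new batch overwrites them. A small standalone illustration (hypothetical values, queue length 4, batch of 2):

    import torch
    q = torch.zeros((4, 2))                      # rows: [g0, g1, g2, g3]
    new = torch.tensor([[1.0, 1.0], [2.0, 2.0]])
    q = torch.roll(q, shifts=-2, dims=0)         # rows become [g2, g3, g0, g1]
    q[-2:] = new                                 # oldest rows replaced -> [g2, g3, new0, new1]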
 
625
  if __name__ == "__main__":
626
+ # ========== Example usage: build dataset/dataloader ==========
627
+ gps_map = load_exif_gps_metadata(METADATA_FILE) # from ./data/metadata.jsonl
628
+ transform = transforms.Compose([
629
+ transforms.Resize((224, 224)),
630
+ transforms.ToTensor(),
631
+ ])
632
+ train_dataset = GPSImageDataset(IMAGES_DIR, gps_map, transform=transform)
633
+ train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
634
+
635
+ # ========== Create model & optimizer ==========
636
+ device = "cuda" if torch.cuda.is_available() else "cpu"
637
+ model = ExampleGPSModel().to(device)
638
+ optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
639
+
640
+ # ========== Continuous training example (5 epochs) ==========
641
+ continuous_train(
642
+ train_dataloader=train_dataloader,
643
+ model=model,
644
+ optimizer=optimizer,
645
+ device=device,
646
+ start_epoch=1,  # unused here; the resume epoch comes from the checkpoint
647
+ max_epochs=5
648
+ )
649
+
650
+ print("Done training. Launching Gradio app...")
651
+
652
+ # ========== Launch Gradio ==========
653
+ app.launch(server_name="0.0.0.0", server_port=7860)