par-meta committed on
Commit
bc42ceb
·
unverified ·
1 Parent(s): 392117b

Update file check script to check sizes (#32)

Browse files
Files changed (1) hide show
  1. bytelatent/data/file_util.py +48 -3
bytelatent/data/file_util.py CHANGED
@@ -65,7 +65,10 @@ def print_local_to_delete(
65
 
66
  @app.command()
67
  def compare_local_to_blob(
68
- source_dirs: list[str], dst_dir: str, s3_profile: str = "blt"
 
 
 
69
  ):
70
  for s in source_dirs:
71
  assert s.endswith("/"), "Dirs must end with /"
@@ -75,6 +78,7 @@ def compare_local_to_blob(
75
  local_fs = fsspec.filesystem("file")
76
  dst_fs = fsspec.filesystem("s3", profile=s3_profile)
77
  source_to_files = {}
 
78
  all_local_files = set()
79
  for s in source_dirs:
80
  skipped = []
@@ -97,14 +101,28 @@ def compare_local_to_blob(
97
  skipped.append(f)
98
  continue
99
 
 
 
 
 
 
 
 
 
100
  source_to_files[s].append(f)
101
- all_local_files.add(f[len(s) :])
102
  print(s, len(source_to_files[s]), "skipped", len(skipped), skipped[:10])
103
 
104
  dst_files = dst_fs.find(dst_dir)
105
  print(dst_dir, len(dst_files))
106
 
107
- dst_file_set = {f[len(dst_dir) - len(S3_PREFIX) :] for f in dst_files}
 
 
 
 
 
 
108
  diff = all_local_files.symmetric_difference(dst_file_set)
109
  print("Local files", len(all_local_files))
110
  print("DST Files", len(dst_file_set))
@@ -112,6 +130,33 @@ def compare_local_to_blob(
112
  dst_only_files = dst_file_set - all_local_files
113
  print("DST only", len(dst_only_files), list(dst_only_files)[:10])
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  if __name__ == "__main__":
117
  app()
 
65
 
66
  @app.command()
67
  def compare_local_to_blob(
68
+ source_dirs: list[str],
69
+ dst_dir: str,
70
+ s3_profile: str = "blt",
71
+ print_sizes: bool = False,
72
  ):
73
  for s in source_dirs:
74
  assert s.endswith("/"), "Dirs must end with /"
 
78
  local_fs = fsspec.filesystem("file")
79
  dst_fs = fsspec.filesystem("s3", profile=s3_profile)
80
  source_to_files = {}
81
+ source_file_to_size = {}
82
  all_local_files = set()
83
  for s in source_dirs:
84
  skipped = []
 
101
  skipped.append(f)
102
  continue
103
 
104
+ file_without_prefix = f[len(s) :]
105
+ if file_without_prefix not in source_file_to_size:
106
+ source_file_to_size[file_without_prefix] = os.path.getsize(f)
107
+ else:
108
+ source_file_to_size[file_without_prefix] = max(
109
+ source_file_to_size[file_without_prefix], os.path.getsize(f)
110
+ )
111
+
112
  source_to_files[s].append(f)
113
+ all_local_files.add(file_without_prefix)
114
  print(s, len(source_to_files[s]), "skipped", len(skipped), skipped[:10])
115
 
116
  dst_files = dst_fs.find(dst_dir)
117
  print(dst_dir, len(dst_files))
118
 
119
+ dst_file_to_size = {}
120
+ dst_file_set = set()
121
+ for f in dst_files:
122
+ dst_file_without_prefix = f[len(dst_dir) - len(S3_PREFIX) :]
123
+ dst_file_set.add(dst_file_without_prefix)
124
+ dst_file_to_size[dst_file_without_prefix] = dst_fs.size(f)
125
+
126
  diff = all_local_files.symmetric_difference(dst_file_set)
127
  print("Local files", len(all_local_files))
128
  print("DST Files", len(dst_file_set))
 
130
  dst_only_files = dst_file_set - all_local_files
131
  print("DST only", len(dst_only_files), list(dst_only_files)[:10])
132
 
133
+ all_files = dst_file_set | all_local_files
134
+ print("Check that files match")
135
+ size_success = True
136
+ for f in sorted(all_files):
137
+ if f in source_file_to_size and f in dst_file_to_size:
138
+ if source_file_to_size[f] != dst_file_to_size[f]:
139
+ size_success = False
140
+ print(
141
+ f"Mismatch file size for {f}, Local: {source_file_to_size[f]} Blob: {dst_file_to_size[f]}"
142
+ )
143
+ else:
144
+ if print_sizes:
145
+ print(f"Matching file size: {dst_file_to_size[f]} for {f}")
146
+ elif f not in source_file_to_size:
147
+ size_success = False
148
+ print(f"Missing file in source: {f}")
149
+ elif f not in dst_file_to_size:
150
+ size_success = False
151
+ print(f"missing file in dst: {f}")
152
+ else:
153
+ raise ValueError("Unexpected to be missing file in src and dst")
154
+
155
+ if size_success:
156
+ print("All files pass size check")
157
+ else:
158
+ raise ValueError("At least one file failed size comparison check")
159
+
160
 
161
  if __name__ == "__main__":
162
  app()