Spaces: Running on Zero
Update file check script to check sizes (#32)
Browse files
- bytelatent/data/file_util.py +48 -3
bytelatent/data/file_util.py
CHANGED
@@ -65,7 +65,10 @@ def print_local_to_delete(
|
|
65 |
|
66 |
@app.command()
|
67 |
def compare_local_to_blob(
|
68 |
-
source_dirs: list[str],
|
|
|
|
|
|
|
69 |
):
|
70 |
for s in source_dirs:
|
71 |
assert s.endswith("/"), "Dirs must end with /"
|
@@ -75,6 +78,7 @@ def compare_local_to_blob(
|
|
75 |
local_fs = fsspec.filesystem("file")
|
76 |
dst_fs = fsspec.filesystem("s3", profile=s3_profile)
|
77 |
source_to_files = {}
|
|
|
78 |
all_local_files = set()
|
79 |
for s in source_dirs:
|
80 |
skipped = []
|
@@ -97,14 +101,28 @@ def compare_local_to_blob(
|
|
97 |
skipped.append(f)
|
98 |
continue
|
99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
source_to_files[s].append(f)
|
101 |
-
all_local_files.add(
|
102 |
print(s, len(source_to_files[s]), "skipped", len(skipped), skipped[:10])
|
103 |
|
104 |
dst_files = dst_fs.find(dst_dir)
|
105 |
print(dst_dir, len(dst_files))
|
106 |
|
107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
diff = all_local_files.symmetric_difference(dst_file_set)
|
109 |
print("Local files", len(all_local_files))
|
110 |
print("DST Files", len(dst_file_set))
|
@@ -112,6 +130,33 @@ def compare_local_to_blob(
|
|
112 |
dst_only_files = dst_file_set - all_local_files
|
113 |
print("DST only", len(dst_only_files), list(dst_only_files)[:10])
|
114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
|
116 |
if __name__ == "__main__":
|
117 |
app()
|
|
|
65 |
|
66 |
@app.command()
|
67 |
def compare_local_to_blob(
|
68 |
+
source_dirs: list[str],
|
69 |
+
dst_dir: str,
|
70 |
+
s3_profile: str = "blt",
|
71 |
+
print_sizes: bool = False,
|
72 |
):
|
73 |
for s in source_dirs:
|
74 |
assert s.endswith("/"), "Dirs must end with /"
|
|
|
78 |
local_fs = fsspec.filesystem("file")
|
79 |
dst_fs = fsspec.filesystem("s3", profile=s3_profile)
|
80 |
source_to_files = {}
|
81 |
+
source_file_to_size = {}
|
82 |
all_local_files = set()
|
83 |
for s in source_dirs:
|
84 |
skipped = []
|
|
|
101 |
skipped.append(f)
|
102 |
continue
|
103 |
|
104 |
+
file_without_prefix = f[len(s) :]
|
105 |
+
if file_without_prefix not in source_file_to_size:
|
106 |
+
source_file_to_size[file_without_prefix] = os.path.getsize(f)
|
107 |
+
else:
|
108 |
+
source_file_to_size[file_without_prefix] = max(
|
109 |
+
source_file_to_size[file_without_prefix], os.path.getsize(f)
|
110 |
+
)
|
111 |
+
|
112 |
source_to_files[s].append(f)
|
113 |
+
all_local_files.add(file_without_prefix)
|
114 |
print(s, len(source_to_files[s]), "skipped", len(skipped), skipped[:10])
|
115 |
|
116 |
dst_files = dst_fs.find(dst_dir)
|
117 |
print(dst_dir, len(dst_files))
|
118 |
|
119 |
+
dst_file_to_size = {}
|
120 |
+
dst_file_set = set()
|
121 |
+
for f in dst_files:
|
122 |
+
dst_file_without_prefix = f[len(dst_dir) - len(S3_PREFIX) :]
|
123 |
+
dst_file_set.add(dst_file_without_prefix)
|
124 |
+
dst_file_to_size[dst_file_without_prefix] = dst_fs.size(f)
|
125 |
+
|
126 |
diff = all_local_files.symmetric_difference(dst_file_set)
|
127 |
print("Local files", len(all_local_files))
|
128 |
print("DST Files", len(dst_file_set))
|
|
|
130 |
dst_only_files = dst_file_set - all_local_files
|
131 |
print("DST only", len(dst_only_files), list(dst_only_files)[:10])
|
132 |
|
133 |
+
all_files = dst_file_set | all_local_files
|
134 |
+
print("Check that files match")
|
135 |
+
size_success = True
|
136 |
+
for f in sorted(all_files):
|
137 |
+
if f in source_file_to_size and f in dst_file_to_size:
|
138 |
+
if source_file_to_size[f] != dst_file_to_size[f]:
|
139 |
+
size_success = False
|
140 |
+
print(
|
141 |
+
f"Mismatch file size for {f}, Local: {source_file_to_size[f]} Blob: {dst_file_to_size[f]}"
|
142 |
+
)
|
143 |
+
else:
|
144 |
+
if print_sizes:
|
145 |
+
print(f"Matching file size: {dst_file_to_size[f]} for {f}")
|
146 |
+
elif f not in source_file_to_size:
|
147 |
+
size_success = False
|
148 |
+
print(f"Missing file in source: {f}")
|
149 |
+
elif f not in dst_file_to_size:
|
150 |
+
size_success = False
|
151 |
+
print(f"missing file in dst: {f}")
|
152 |
+
else:
|
153 |
+
raise ValueError("Unexpected to be missing file in src and dst")
|
154 |
+
|
155 |
+
if size_success:
|
156 |
+
print("All files pass size check")
|
157 |
+
else:
|
158 |
+
raise ValueError("At least one file failed size comparison check")
|
159 |
+
|
160 |
|
161 |
if __name__ == "__main__":
|
162 |
app()
|