k4d3 committed on
Commit
ce752fd
·
2 Parent(s): 81c0b65 645293b

Merge branch 'main' of hf.co:/k4d3/toolkit

Browse files
Files changed (4) hide show
  1. .gitmodules +1 -1
  2. dataset-tools +1 -1
  3. utils/crc32.py +18 -0
  4. utils/remove_grandfathered.py +18 -34
.gitmodules CHANGED
@@ -1,3 +1,3 @@
1
  [submodule "dataset-tools"]
2
  path = dataset-tools
3
- url = https://github.com/ka-de/dataset-tools
 
1
  [submodule "dataset-tools"]
2
  path = dataset-tools
3
+ url = git@github.com:ka-de/dataset-tools
dataset-tools CHANGED
@@ -1 +1 @@
1
- Subproject commit 0af984140656aa40d830d3317fc6c00ac0fc9ba3
 
1
+ Subproject commit a8c787494826e9ae2d1e386be8ce304eafb3a0e6
utils/crc32.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import zlib
2
+ import sys
3
+
4
+ def calculate_crc32(file_path):
5
+ with open(file_path, 'rb') as file:
6
+ data = file.read()
7
+ crc32_checksum = zlib.crc32(data)
8
+ return crc32_checksum
9
+
10
+ if __name__ == "__main__":
11
+ if len(sys.argv) != 2:
12
+ print("Usage: python script.py <file_path>")
13
+ sys.exit(1)
14
+
15
+ file_path = sys.argv[1]
16
+ checksum = calculate_crc32(file_path)
17
+ print(f"CRC32 checksum of {file_path}: {checksum:#010x}")
18
+
utils/remove_grandfathered.py CHANGED
@@ -1,42 +1,26 @@
1
  #!/usr/bin/env python
2
  # -*- coding: utf-8 -*-
3
- #
4
- import os
5
- import sys
6
- # Add the parent directory to the Python path
7
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
8
-
9
- from utils.file_processor import FileProcessor, ProcessorOptions
10
- import re
11
- from pathlib import Path
12
 
13
- class GrandfatheredRemovalProcessor(FileProcessor):
14
- def __init__(self, options: ProcessorOptions, pattern: str):
15
- super().__init__(options)
16
- self.pattern = pattern
17
-
18
- def process_content(self, content: str) -> str:
19
- # Remove occurrences of the grandfathered content pattern
20
- content = re.sub(self.pattern, '', content)
21
- # Normalize whitespace and commas
22
- content = re.sub(r'\s+,', ',', content)
23
- content = re.sub(r',\s+', ',', content)
24
- return re.sub(r'\s+', ' ', content).strip()
25
 
26
- def main():
27
- target_dir = sys.argv[1] if len(sys.argv) > 1 else '.'
 
28
 
29
- options = ProcessorOptions(
30
- recursive=True,
31
- dry_run=False,
32
- file_extensions={'.txt', '.tags'}
33
- )
34
 
35
- # Define the pattern for grandfathered content (modify as needed)
36
- grandfathered_pattern = r'grandfathered_content_pattern'
37
-
38
- processor = GrandfatheredRemovalProcessor(options, grandfathered_pattern)
39
- processor.process_directory(Path(target_dir))
 
 
 
 
40
 
41
  if __name__ == "__main__":
42
- main()
 
 
 
 
1
  #!/usr/bin/env python
2
  # -*- coding: utf-8 -*-
 
 
 
 
 
 
 
 
 
3
 
4
+ import os
 
 
 
 
 
 
 
 
 
 
 
5
 
6
+ def remove_text_from_file(file_path, text_to_remove):
7
+ with open(file_path, 'r') as file:
8
+ content = file.read()
9
 
10
+ content = content.replace(text_to_remove, "")
 
 
 
 
11
 
12
+ with open(file_path, 'w') as file:
13
+ file.write(content)
14
+
15
+ def remove_text_from_files(root_dir, text_to_remove):
16
+ for subdir, _, files in os.walk(root_dir):
17
+ for file in files:
18
+ if file.endswith('.txt') or file.endswith('.tags'):
19
+ file_path = os.path.join(subdir, file)
20
+ remove_text_from_file(file_path, text_to_remove)
21
 
22
  if __name__ == "__main__":
23
+ root_directory = '.' # Change this to your root directory
24
+ text_to_remove = "grandfathered content, "
25
+ remove_text_from_files(root_directory, text_to_remove)
26
+