Hopsakee's picture
Upload folder using huggingface_hub
2197ab7 verified
from pathlib import Path
from shutil import copy2, rmtree
from fastcore.utils import L
import re
from src.fabrics_processor.config import config
def sentence2snake(name: str) -> str:
    """Lower-case `name` and turn it into a snake_case identifier.

    Every run of characters that are not word characters (anything outside
    ``\\w``), together with any underscores adjacent to them, collapses into
    one single underscore. Leading and trailing underscores are kept.
    """
    lowered = name.lower()
    # One pass: a run made of non-word characters and/or underscores -> "_".
    return re.sub(r'[\W_]+', '_', lowered)
def round_timestamp(ts: float) -> int:
    """Coarsen a timestamp to absorb small filesystem differences.

    The fractional part is discarded and the last four digits of the whole
    seconds are dropped, i.e. the value is truncated toward zero to a count
    of 10,000-second buckets.

    Args:
        ts: Timestamp in seconds (e.g. ``st_mtime``).

    Returns:
        The whole-second part of `ts` divided by 10,000, truncated toward
        zero. Values whose magnitude is below 10,000 yield 0.

    Raises:
        ValueError: If `ts` is NaN.
        OverflowError: If `ts` is infinite.
    """
    # Work on integers instead of slicing str(ts): the string form breaks for
    # short values (empty slice) and for floats printed in scientific notation.
    whole_seconds = int(ts)
    buckets = abs(whole_seconds) // 10000
    return -buckets if whole_seconds < 0 else buckets
def get_md_files_obsidian(path: Path) -> dict:
    """Collect every markdown file below an Obsidian vault directory.

    Args:
        path: Root directory to search recursively for ``*.md`` files.

    Returns:
        Dict mapping ``"<snake_case stem>-<lower-cased parent folder name>"``
        to a tuple ``(file path, st_mtime, st_size)``. The parent-folder
        suffix is the identifier that distinguishes own prompts from others.
        If two files produce the same key, the one visited last wins.
    """
    md_files = {}
    for md_path in Path(path).glob('**/*.md'):
        # Stat once so that mtime and size describe the same file state.
        info = md_path.stat()
        key = sentence2snake(md_path.stem) + "-" + md_path.parent.name.lower()
        md_files[key] = (md_path, info.st_mtime, info.st_size)
    return md_files
def get_md_files_fabricsfolder(path: Path) -> dict:
    """Collect the ``system.md`` file of every sub-directory of `path`.

    Args:
        path: Directory whose immediate sub-directories are inspected.

    Returns:
        Dict mapping the sub-directory name to a tuple
        ``(path to system.md, st_mtime, st_size)``. Sub-directories without a
        ``system.md`` and plain files directly inside `path` are skipped.
    """
    found = {}
    for subdir in Path(path).iterdir():
        system_md = subdir / 'system.md'
        if not (subdir.is_dir() and system_md.exists()):
            continue
        # Stat once so that mtime and size describe the same file state.
        info = system_md.stat()
        # Use the full directory name: `.stem` would cut everything after the
        # last dot (e.g. "prompt-v1.0" -> "prompt-v1"), so the key would no
        # longer match the directory that actually exists on disk.
        found[subdir.name] = (system_md, info.st_mtime, info.st_size)
    return found
def get_modified_files(source_files: dict, target_files: dict) -> list:
    """Find source files that also exist in the target but differ in size.

    Both arguments map a filename key to a tuple ``(path, timestamp, size)``.

    Returns a list with one single-entry dict per changed file, of the form
    ``{filename: (path, timestamp, size)}``, holding the source-side values.
    """
    # Only the size (tuple index 2) is compared. Timestamps are deliberately
    # ignored: file system differences or daylight savings would produce
    # false positives, while rounding the timestamps would hide edits made
    # shortly after one another.
    changed = []
    for name, entry in source_files.items():
        if name not in target_files:
            continue
        if entry[2] != target_files[name][2]:
            changed.append({name: entry})
    return L(changed)
def get_new_files(source_files: dict, target_files: dict) -> list:
    """Find source files whose key is absent from the target.

    Returns a list with one single-entry dict per new file, of the form
    ``{filename: (path, timestamp, size)}``.
    """
    missing = [
        {name: entry}
        for name, entry in source_files.items()
        if name not in target_files
    ]
    return L(missing)
def process_file(source: dict, target_dir: Path) -> None:
    """
    Copy one source file into its own sub-directory of the target.

    The sub-directory is named after the dict key; the file is copied into it
    as ``system.md`` and a ``user.md`` is touched next to it.

    Args:
        source: Single-entry dict ``{filename: (path, timestamp, size)}``;
            only the first entry is used.
        target_dir: Base target directory (e.g. 'md_target'); it must
            already exist because parents are not created.
    """
    name, details = next(iter(source.items()))
    pattern_dir = target_dir / name
    pattern_dir.mkdir(mode=0o755, exist_ok=True)
    # details[0] is the path of the source markdown file.
    copy2(details[0], pattern_dir / 'system.md')
    (pattern_dir / 'user.md').touch()
def sync_folders(source_dir: Path, target_dir: Path) -> None:
    """
    Main function to synchronize folders.

    New and size-changed markdown files from the source are copied into the
    target, then target directories that no longer have a matching source
    file are removed.

    Args:
        source_dir: Path to source directory (obsidian vault)
        target_dir: Path to target directory (fabrics folder)
    """
    # Normalise once so that every later `/` join works even when the caller
    # passes plain strings instead of Path objects.
    source_dir = Path(source_dir)
    target_dir = Path(target_dir)

    source_files = get_md_files_obsidian(source_dir)
    target_files = get_md_files_fabricsfolder(target_dir)

    # Everything that has to be (re)copied: files missing in the target plus
    # files present on both sides but reported as modified.
    files_to_process = L(get_new_files(source_files, target_files) +
                         get_modified_files(source_files, target_files))
    for entry in files_to_process:
        process_file(entry, target_dir)

    # Only keys containing "-" are candidates for deletion: the source keys
    # are built as "<name>-<parent folder>", so target directories without a
    # "-" are left alone.
    stale_names = [name for name in target_files
                   if name not in source_files and "-" in name]
    for name in stale_names:
        # Removes the directory together with everything inside it.
        rmtree(target_dir / name)