saliacoel
/

tmp

Model card Files Files and versions

xet

Community

saliacoel committed 21 days ago

Commit

2a6a218

verified ·

1 Parent(s): 078fcdb

Upload TMP_Salia_Unsorted_Chars_Zip_To_Numbered_Desc_Zip.py

Browse files

Files changed (1) hide show

TMP_Salia_Unsorted_Chars_Zip_To_Numbered_Desc_Zip.py +500 -0

TMP_Salia_Unsorted_Chars_Zip_To_Numbered_Desc_Zip.py ADDED Viewed

	@@ -0,0 +1,500 @@

+import os
+import re
+import shutil
+import tempfile
+import time
+import unicodedata
+import zipfile
+from collections import defaultdict
+from urllib.parse import quote, unquote
+from urllib.request import Request, urlopen
+try:
+    from huggingface_hub import HfApi
+except Exception:
+    HfApi = None
+SOURCE_ZIP_URL = "https://huggingface.co/saliacoel/tmp/resolve/main/chars_unsorted.zip"
+TMP_REPO_ID = "saliacoel/tmp"
+OUTPUT_ZIP_NAME = "chars_numbered_desc.zip"
+# Finds entry markers like:
+# 1. Adali, tags...
+# 0001. Adali, tags...
+# 2000. Name, tags...
+#
+# A marker only has to follow whitespace or the start of the text, so matching does not depend on line breaks.
+ID_MARKER_RE = re.compile(r"(?<!\S)(0*\d{1,8})\.\s*")
def _resolve_hf_token(hf_token: str) -> str:
    """Return the Hugging Face token to use.

    An explicitly supplied ``hf_token`` wins when it is non-blank after
    stripping. Otherwise the environment variables ``HF_TOKEN``,
    ``HUGGINGFACE_TOKEN`` and ``HUGGING_FACE_HUB_TOKEN`` are consulted in
    that order and the first non-blank value is returned (stripped).

    Raises:
        ValueError: if neither the argument nor any variable holds a token.
    """
    explicit = (hf_token or "").strip()
    if explicit:
        return explicit

    env_names = ("HF_TOKEN", "HUGGINGFACE_TOKEN", "HUGGING_FACE_HUB_TOKEN")
    # Lazy generator: later variables are only read if earlier ones are blank.
    stripped_values = (os.getenv(env_name, "").strip() for env_name in env_names)
    from_env = next((value for value in stripped_values if value), None)
    if from_env is None:
        raise ValueError("No Hugging Face token provided.")
    return from_env
def _download_file(url: str, suffix: str = ".zip") -> str:
    """Fetch ``url`` into a fresh temporary file and return that file's path.

    The whole resource is fetched with a single request and streamed to disk
    in 1 MiB chunks. The caller owns the returned file and must delete it.

    If anything goes wrong while downloading, the partially written temporary
    file is removed (best effort) and the original exception is re-raised.
    """
    request = Request(
        url,
        headers={"User-Agent": "Mozilla/5.0", "Accept": "*/*"},
    )

    # Reserve a unique path first; the file is reopened for writing below.
    handle, tmp_path = tempfile.mkstemp(suffix=suffix)
    os.close(handle)

    try:
        with urlopen(request, timeout=300) as response, open(tmp_path, "wb") as sink:
            shutil.copyfileobj(response, sink, length=1024 * 1024)
    except Exception:
        # Do not leave a half-written file behind; cleanup errors are ignored
        # so that the download error is the one the caller sees.
        try:
            os.remove(tmp_path)
        except Exception:
            pass
        raise

    return tmp_path
def _clean_zip_path(path: str) -> str:
    """Normalize a zip member path to forward slashes without a leading root.

    Backslashes become ``/``, all leading ``/`` characters are dropped, and
    then any number of leading ``./`` prefixes are removed.
    """
    normalized = path.replace("\\", "/").lstrip("/")
    while normalized[:2] == "./":
        normalized = normalized[2:]
    return normalized
def _split_zip_path(path: str):
    """Split a zip member path into its meaningful components.

    Backslashes are treated as separators. Empty components (from leading,
    trailing or doubled slashes) and ``.`` components are discarded, so
    ``"./a\\b/./c.txt"`` yields ``["a", "b", "c.txt"]``.
    """
    components = []
    for piece in path.replace("\\", "/").split("/"):
        if piece in ("", "."):
            continue
        components.append(piece)
    return components
def _name_key(name: str) -> str:
    """Build the comparison key used to match folder names against BAM names.

    The name is percent-decoded, NFC-normalized, has its whitespace runs
    collapsed to single spaces (outer whitespace removed) and is finally
    case-folded. For example ``" Adali "`` and ``"Adali"`` share a key, as
    do ``"Adali%20Name"`` and ``"Adali Name"``.
    """
    decoded = unquote(name)
    composed = unicodedata.normalize("NFC", decoded)
    words = composed.split()
    return " ".join(words).casefold()
def _extract_name_from_bam_segment(segment: str) -> str:
    """Pull the character name out of the text that follows an ID marker.

    The name is everything before the first comma (or the whole segment when
    there is no comma), NFC-normalized with whitespace runs collapsed.
    Returns ``""`` for a blank segment.
    """
    trimmed = segment.strip()
    if not trimmed:
        return ""

    # partition() yields the whole string as the head when no comma exists.
    head, _comma, _tags = trimmed.partition(",")
    composed = unicodedata.normalize("NFC", head.strip())
    return " ".join(composed.split())
def _parse_bam_name_to_id(bam_text: str):
    """Parse the BAM text into name/ID lookup tables.

    Every match of ``ID_MARKER_RE`` starts an entry; the entry's text runs up
    to the next marker (or the end of the text) and its name is taken by
    ``_extract_name_from_bam_segment``.

    Returns:
      name_to_id: normalized character name (see ``_name_key``) -> integer ID
      id_to_name: integer ID -> name as parsed from the BAM text
      duplicate_names: names whose normalized key had already been seen

    Raises:
      ValueError: if the text contains no ID marker at all.
    """
    markers = list(ID_MARKER_RE.finditer(bam_text))
    if not markers:
        raise ValueError("No ID markers found in bam.txt.")

    # Segment i ends where marker i+1 begins; the last one ends with the text.
    segment_ends = [later.start() for later in markers[1:]]
    segment_ends.append(len(bam_text))

    name_to_id = {}
    id_to_name = {}
    duplicate_names = []

    for marker, segment_end in zip(markers, segment_ends):
        character_id = int(marker.group(1))
        name = _extract_name_from_bam_segment(bam_text[marker.end():segment_end])
        if not name:
            continue

        key = _name_key(name)
        if key in name_to_id:
            # Ambiguous duplicate: the first mapping stays in effect.
            duplicate_names.append(name)
            continue

        name_to_id[key] = character_id
        id_to_name[character_id] = name

    return name_to_id, id_to_name, duplicate_names
def _count_immediate_folders_under_prefix(zip_infos, prefix_parts):
    """Count distinct folders that sit directly below ``prefix_parts``.

    A folder is counted when at least one non-directory member lies inside
    it, i.e. the member path is ``prefix / folder / ... / file``. This is how
    a wrapper layout such as

      chars_unsorted/bam.txt
      chars_unsorted/Adali/character_description.txt

    is told apart from a layout without a wrapper:

      bam.txt
      Adali/character_description.txt
    """
    depth = len(prefix_parts)
    child_folders = set()

    for entry in zip_infos:
        if entry.is_dir():
            continue
        parts = _split_zip_path(entry.filename)
        # Needs the prefix, the folder itself and at least one more component.
        if len(parts) >= depth + 2 and parts[:depth] == prefix_parts:
            child_folders.add(parts[depth])

    return len(child_folders)
def _find_bam_member_and_root_prefix(zf: zipfile.ZipFile):
    """Locate the main bam.txt in the zip and the folder that contains it.

    Both of these layouts are supported:

      bam.txt
      Adali/...

      chars_unsorted/bam.txt
      chars_unsorted/Adali/...

    Every member named ``bam.txt`` (case-insensitive) is a candidate. The
    candidate with the most sibling folders wins; ties go to the one with the
    shortest path, and remaining ties to the one listed first in the zip.

    Returns:
      (bam_info, root_prefix): the chosen ``ZipInfo`` and the list of path
      components of its containing folder (empty for the zip root).

    Raises:
      ValueError: if there is no bam.txt, or the best candidate has no
      folders next to it.
    """
    members = zf.infolist()
    ranked = []

    for member in members:
        if member.is_dir():
            continue
        parts = _split_zip_path(member.filename)
        if parts and parts[-1].casefold() == "bam.txt":
            containing_folder = parts[:-1]
            sibling_folders = _count_immediate_folders_under_prefix(
                members, containing_folder
            )
            ranked.append((sibling_folders, member, containing_folder))

    if not ranked:
        raise ValueError("Could not find bam.txt inside the zip.")

    # min() returns the first best candidate, like a stable sort would.
    best_count, bam_info, root_prefix = min(
        ranked, key=lambda cand: (-cand[0], len(cand[2]))
    )
    if best_count == 0:
        raise ValueError(
            "Found bam.txt, but could not find character folders next to it."
        )
    return bam_info, root_prefix
def _collect_character_folders(zf: zipfile.ZipFile, root_prefix):
    """Group the zip's files by the character folder they belong to.

    Only non-directory members located below ``root_prefix`` and inside some
    folder are considered; files lying directly in the root (such as the
    root-level bam.txt) are ignored.

    Returns:
      folder_files = {
        "Adali": [(ZipInfo, ["character_description.txt"]), ...],
        ...
      }
      where the list in each tuple is the member's path relative to its
      character folder.
    """
    depth = len(root_prefix)
    grouped = defaultdict(list)

    for member in zf.infolist():
        if member.is_dir():
            continue
        parts = _split_zip_path(member.filename)
        # Root-level files have exactly depth + 1 components and are skipped.
        if len(parts) < depth + 2 or parts[:depth] != root_prefix:
            continue
        grouped[parts[depth]].append((member, parts[depth + 1:]))

    return grouped
def _description_candidate_score(relative_parts):
    """Rank a file as a possible character description; lower is better.

    ``relative_parts`` is the file's path inside its character folder. The
    file name is compared case-insensitively:

      0    one of the well-known names (e.g. "character_description.txt")
      1    any other .txt whose name contains "description"
      2    any other .txt whose name contains "desc"
      3    any other .txt except bam.txt
      999  everything else (not a candidate)

    Files that are not directly inside the character folder get 10 added.
    """
    well_known = (
        "character description.txt",
        "character_description.txt",
        "character-description.txt",
        "characterdescription.txt",
        "character desc.txt",
        "character_desc.txt",
        "description.txt",
    )
    filename = relative_parts[-1].casefold()
    penalty = 10 if len(relative_parts) != 1 else 0

    if filename in well_known:
        return penalty
    if not filename.endswith(".txt") or filename == "bam.txt":
        return 999 + penalty
    if "description" in filename:
        return 1 + penalty
    if "desc" in filename:
        return 2 + penalty
    return 3 + penalty
def _select_description_file(files_for_folder):
    """Pick the best description file of one character folder.

    ``files_for_folder`` holds ``(ZipInfo, relative_parts)`` pairs. Files are
    ranked by ``_description_candidate_score``, then by path depth, then by
    their joined relative path; entries scoring 999 or more are not eligible.

    Returns the winning ``ZipInfo``, or ``None`` when nothing is eligible.
    """
    ranked = [
        (_description_candidate_score(parts), len(parts), "/".join(parts), info)
        for info, parts in files_for_folder
        if parts
    ]
    eligible = [entry for entry in ranked if entry[0] < 999]
    if not eligible:
        return None

    # Compare on (score, depth, path) only; min() keeps the first of equals.
    best = min(eligible, key=lambda entry: entry[:3])
    return best[3]
def _safe_status_list(items, limit=20):
    """Render ``items`` as a bracketed, comma-separated string for status text.

    At most ``limit`` items are shown; when more exist, a trailing
    ``... +N more`` note reports how many were left out. Empty or missing
    input renders as ``"[]"``.
    """
    if not items:
        return "[]"

    visible = ", ".join(map(str, items[:limit]))
    hidden_count = len(items) - limit
    overflow_note = f", ... +{hidden_count} more" if hidden_count > 0 else ""
    return f"[{visible}{overflow_note}]"
class TMP_Salia_Unsorted_Chars_Zip_To_Numbered_Desc_Zip:
    """Action node that converts chars_unsorted.zip into a numbered zip.

    ``run`` downloads ``SOURCE_ZIP_URL`` once, reads the bam.txt inside it to
    learn which ID belongs to which character name, picks one description
    file per character folder, writes them as ``NNNN_desc.txt`` into a new
    zip and uploads that zip as ``OUTPUT_ZIP_NAME`` to ``TMP_REPO_ID``.
    """

    RETURN_TYPES = ("STRING",)
    RETURN_NAMES = ("status",)
    FUNCTION = "run"
    CATEGORY = "Salia"

    @classmethod
    def INPUT_TYPES(cls):
        """Declare the single required input: the Hugging Face token string."""
        return {
            "required": {
                "hf_token": ("STRING", {"default": "", "multiline": False}),
            }
        }

    @classmethod
    def IS_CHANGED(cls, hf_token):
        """Return a value that differs on every call."""
        # This is an action node. Force ComfyUI to run it again when queued.
        return time.time()

    @staticmethod
    def _decode_bam_bytes(raw: bytes) -> str:
        """Decode the raw bam.txt bytes as UTF-8, dropping a leading BOM.

        A BOM would survive plain "utf-8" decoding as U+FEFF. That character
        is not whitespace, so the ``(?<!\\S)`` lookbehind of ``ID_MARKER_RE``
        would reject a marker at the very start of the file and the first
        entry would be lost. Undecodable bytes are replaced, not fatal.
        """
        return raw.decode("utf-8-sig", errors="replace")

    def run(self, hf_token):
        """Build and upload the numbered description zip.

        Args:
            hf_token: token for the upload; when blank, the environment is
                consulted by ``_resolve_hf_token``.

        Returns:
            A 1-tuple holding a ``" | "``-separated status string (upload
            URL, counts, warnings, failures and the written entries).

        Raises:
            ValueError: if huggingface_hub is missing, no token is available,
                the zip has no usable bam.txt / character folders, or no
                folder could be matched to a BAM ID.
        """
        if HfApi is None:
            raise ValueError(
                "huggingface_hub is not installed. Install it with:\n"
                "pip install huggingface_hub"
            )
        token = _resolve_hf_token(hf_token)

        source_zip_path = None
        output_zip_path = None
        successes = []
        failures = []
        warning_notes = []

        try:
            # 1. Download chars_unsorted.zip once.
            source_zip_path = _download_file(SOURCE_ZIP_URL, suffix=".zip")

            # 2. Open zip, find root bam.txt, parse name -> ID mapping.
            with zipfile.ZipFile(source_zip_path, "r") as zf:
                bam_info, root_prefix = _find_bam_member_and_root_prefix(zf)
                bam_text = self._decode_bam_bytes(zf.read(bam_info))
                name_to_id, id_to_name, duplicate_names = _parse_bam_name_to_id(bam_text)
                if duplicate_names:
                    warning_notes.append(
                        "duplicate_names_in_bam="
                        + _safe_status_list(duplicate_names, limit=10)
                    )

                folder_files = _collect_character_folders(zf, root_prefix)
                if not folder_files:
                    raise ValueError("No character folders found inside zip.")

                numbered_items = []
                used_ids = set()

                # 3. Assign folder name -> BAM ID.
                for folder_name, files_for_folder in folder_files.items():
                    character_id = name_to_id.get(_name_key(folder_name))
                    if character_id is None:
                        failures.append(f"folder_not_found_in_bam:{folder_name}")
                        continue
                    if character_id in used_ids:
                        # A second folder resolved to an ID that is already taken.
                        failures.append(
                            f"duplicate_id_assignment:{character_id:04d}:{folder_name}"
                        )
                        continue
                    desc_info = _select_description_file(files_for_folder)
                    if desc_info is None:
                        failures.append(f"no_description_file:{folder_name}")
                        continue
                    used_ids.add(character_id)
                    numbered_items.append((character_id, folder_name, desc_info))

                if not numbered_items:
                    raise ValueError(
                        "No folders could be assigned to BAM IDs. "
                        f"Failures: {_safe_status_list(failures, limit=20)}"
                    )
                numbered_items.sort(key=lambda item: item[0])

                # 4. Create numbered output zip (source zip must still be open).
                output_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".zip")
                output_zip_path = output_tmp.name
                output_tmp.close()
                with zipfile.ZipFile(
                    output_zip_path,
                    "w",
                    compression=zipfile.ZIP_DEFLATED,
                    compresslevel=6,
                ) as out_zf:
                    for character_id, folder_name, desc_info in numbered_items:
                        output_name = f"{character_id:04d}_desc.txt"
                        # Description bytes are copied verbatim, never re-encoded.
                        out_zf.writestr(output_name, zf.read(desc_info))
                        successes.append(f"{character_id:04d}:{folder_name}")

            # 5. Upload final zip to saliacoel/tmp.
            api = HfApi(token=token)
            api.upload_file(
                path_or_fileobj=output_zip_path,
                path_in_repo=OUTPUT_ZIP_NAME,
                repo_id=TMP_REPO_ID,
                repo_type="model",
                revision="main",
                commit_message=f"Upload {OUTPUT_ZIP_NAME} from chars_unsorted.zip",
            )
            uploaded_url = (
                f"https://huggingface.co/{TMP_REPO_ID}/resolve/main/"
                f"{quote(OUTPUT_ZIP_NAME, safe='')}"
            )

            status_parts = [
                f"uploaded={uploaded_url}",
                f"output_zip={OUTPUT_ZIP_NAME}",
                f"descriptions_written={len(successes)}",
                f"folders_failed={len(failures)}",
                f"bam_entries={len(id_to_name)}",
                f"root_prefix={'/'.join(root_prefix) if root_prefix else '<zip_root>'}",
            ]
            if warning_notes:
                status_parts.append("warnings=" + " ; ".join(warning_notes))
            if failures:
                status_parts.append("failures=" + _safe_status_list(failures, limit=20))
            status_parts.append("ok=" + _safe_status_list(successes, limit=20))
            return (" | ".join(status_parts),)
        finally:
            # Best-effort removal of both temporary zips, on success or failure.
            for path in (source_zip_path, output_zip_path):
                if not path:
                    continue
                try:
                    os.remove(path)
                except OSError:
                    pass
# Registration tables: node identifier -> implementing class, and node
# identifier -> label. Presumably read by ComfyUI's custom-node loader
# (verify against the host application).
NODE_CLASS_MAPPINGS = {
    "TMP_Salia_Unsorted_Chars_Zip_To_Numbered_Desc_Zip": (
        TMP_Salia_Unsorted_Chars_Zip_To_Numbered_Desc_Zip
    ),
}

# The display label is the node identifier itself.
NODE_DISPLAY_NAME_MAPPINGS = {
    node_id: node_id for node_id in NODE_CLASS_MAPPINGS
}