AngelBottomless committed
Commit ca9b012 · verified · 1 Parent(s): acee4df

Nicer verbosity by o1-pro

Files changed (1)
  1. infer.py +98 -80
infer.py CHANGED
@@ -1,80 +1,98 @@
- import onnxruntime as ort
- import numpy as np
- import json
- from PIL import Image
-
- # 1) Load ONNX model
- session = ort.InferenceSession("camie_tagger_initial.onnx", providers=["CPUExecutionProvider"])
-
- # 2) Preprocess your image (512x512, etc.)
- def preprocess_image(img_path):
-     """
-     Loads and resizes an image to 512x512, converts it to float32 [0..1],
-     and returns a (1,3,512,512) NumPy array (NCHW format).
-     """
-     img = Image.open(img_path).convert("RGB").resize((512, 512))
-     x = np.array(img).astype(np.float32) / 255.0
-     x = np.transpose(x, (2, 0, 1))  # HWC -> CHW
-     x = np.expand_dims(x, 0)  # add batch dimension -> (1,3,512,512)
-     return x
-
- # Example input
- def inference(input_path):
-     input_tensor = preprocess_image(input_path)
-
-     # 3) Run inference
-     input_name = session.get_inputs()[0].name
-     outputs = session.run(None, {input_name: input_tensor})
-     initial_logits, refined_logits = outputs  # shape: (1, 70527) each
-
-     # 4) Convert logits to probabilities via sigmoid
-     refined_probs = 1 / (1 + np.exp(-refined_logits))  # shape: (1, 70527)
-
-     # 5) Load metadata & retrieve threshold info
-     with open("metadata.json", "r", encoding="utf-8") as f:
-         metadata = json.load(f)
-
-     # Dictionary of idx->tag_name, e.g. { "0": "brown_hair", "1": "blue_eyes", ... }
-     idx_to_tag = metadata["idx_to_tag"]
-
-     # Dictionary of tag->category, e.g. { "brown_hair": "character", "landscape": "general", ... }
-     tag_to_category = metadata.get("tag_to_category", {})
-
-     # Dictionary of category->threshold, e.g. { "character": 0.30, "general": 0.325, ... }
-     # If not present or incomplete, we'll use a default threshold of 0.325
-     category_thresholds = metadata.get("category_thresholds", {})
-     default_threshold = 0.325
-
-     # 6) Collect predictions by category
-     # We'll loop through all tags and check if the probability is above the category-specific threshold
-     results_by_category = {}
-
-     num_tags = refined_probs.shape[1]  # 70527
-     for i in range(num_tags):
-         prob = float(refined_probs[0, i])  # get probability for this tag
-         tag_name = idx_to_tag[str(i)]  # convert index -> tag name (keys in idx_to_tag are strings)
-
-         # Find category; if not in 'tag_to_category', label it "unknown"
-         category = tag_to_category.get(tag_name, "unknown")
-
-         # Find threshold for this category; fallback to default
-         cat_threshold = category_thresholds.get(category, default_threshold)
-
-         # Check if prob meets or exceeds the threshold
-         if prob >= cat_threshold:
-             if category not in results_by_category:
-                 results_by_category[category] = []
-             # Store the tag name + its probability
-             results_by_category[category].append((tag_name, prob))
-
-     # 7) Print out the predicted tags category-wise
-     print("Predicted Tags by Category:\n")
-
-     for cat, tags_list in results_by_category.items():
-         print(f"Category: {cat} | Predicted {len(tags_list)} tags")
-         for tname, tprob in sorted(tags_list, key=lambda x: x[1], reverse=True):
-             print(f"  Tag: {tname:30s} Prob: {tprob:.4f}")
-         print()
-
- if __name__ == "__main__":
-     inference("example_image.jpg")
+ import onnxruntime as ort
+ import numpy as np
+ import json
+ from PIL import Image
+
+ # 1) Load ONNX model
+ session = ort.InferenceSession("camie_tagger_initial.onnx", providers=["CPUExecutionProvider"])
+
+ # 2) Preprocess your image (512x512, etc.)
+ def preprocess_image(img_path):
+     """
+     Loads and resizes an image to 512x512, converts it to float32 [0..1],
+     and returns a (1,3,512,512) NumPy array (NCHW format).
+     """
+     img = Image.open(img_path).convert("RGB").resize((512, 512))
+     x = np.array(img).astype(np.float32) / 255.0
+     x = np.transpose(x, (2, 0, 1))  # HWC -> CHW
+     x = np.expand_dims(x, 0)  # add batch dimension -> (1,3,512,512)
+     return x
+
+ # Example input
+
+ def inference(input_path, output_format="verbose"):
+     """
+     Returns either:
+       - A verbose category breakdown, or
+       - A comma-separated string of predicted tags (underscores replaced with spaces).
+     """
+     # 1) Preprocess
+     input_tensor = preprocess_image(input_path)
+
+     # 2) Run inference
+     input_name = session.get_inputs()[0].name
+     outputs = session.run(None, {input_name: input_tensor})
+     initial_logits, refined_logits = outputs  # shape: (1, 70527) each
+
+     # 3) Convert logits to probabilities
+     refined_probs = 1 / (1 + np.exp(-refined_logits))  # shape: (1, 70527)
+
+     # 4) Load metadata & retrieve threshold info
+     with open("metadata.json", "r", encoding="utf-8") as f:
+         metadata = json.load(f)
+
+     idx_to_tag = metadata["idx_to_tag"]  # e.g. { "0": "brown_hair", "1": "blue_eyes", ... }
+     tag_to_category = metadata.get("tag_to_category", {})
+     category_thresholds = metadata.get(
+         "category_thresholds",
+         {"artist": 0.1, "character": 0.2, "meta": 0.3, "style": 0.1}
+     )
+     default_threshold = 0.325
+
+     # 5) Collect predictions by category
+     results_by_category = {}
+     num_tags = refined_probs.shape[1]
+
+     for i in range(num_tags):
+         prob = float(refined_probs[0, i])
+         tag_name = idx_to_tag[str(i)]  # str(i) because metadata uses string keys
+         category = tag_to_category.get(tag_name, "unknown")
+         cat_threshold = category_thresholds.get(category, default_threshold)
+
+         if prob >= cat_threshold:
+             if category not in results_by_category:
+                 results_by_category[category] = []
+             results_by_category[category].append((tag_name, prob))
+
+     # 6) Depending on output_format, produce different return strings
+     if output_format == "as_prompt":
+         # Flatten all predicted tags across categories
+         all_predicted_tags = []
+         for cat, tags_list in results_by_category.items():
+             # We only need the tag name in as_prompt format
+             for tname, tprob in tags_list:
+                 # convert underscores to spaces
+                 tag_name_spaces = tname.replace("_", " ")
+                 all_predicted_tags.append(tag_name_spaces)
+
+         # Create a comma-separated string
+         prompt_string = ", ".join(all_predicted_tags)
+         return prompt_string
+
+     else:  # "verbose"
+         # We'll build a multiline string describing the predictions
+         lines = []
+         lines.append("Predicted Tags by Category:\n")
+         for cat, tags_list in results_by_category.items():
+             lines.append(f"Category: {cat} | Predicted {len(tags_list)} tags")
+             # Sort descending by probability
+             for tname, tprob in sorted(tags_list, key=lambda x: x[1], reverse=True):
+                 lines.append(f"  Tag: {tname:30s} Prob: {tprob:.4f}")
+             lines.append("")  # blank line after each category
+         # Join lines with newlines
+         verbose_output = "\n".join(lines)
+         return verbose_output
+
+ if __name__ == "__main__":
+     result = inference("path/to/image", output_format="as_prompt")
+     print(result)
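
For reference, a minimal usage sketch of the new inference() entry point. It assumes the updated script is saved as infer.py, that camie_tagger_initial.onnx and metadata.json sit in the working directory, and that your_image.png is a placeholder path:

# Usage sketch (assumptions: script saved as infer.py; model and metadata.json
# are in the working directory; "your_image.png" is a placeholder path).
from infer import inference  # importing also creates the module-level ONNX session

# Default "verbose" mode: returns a per-category breakdown with probabilities.
print(inference("your_image.png"))

# "as_prompt" mode: returns a flat, comma-separated tag list with underscores
# replaced by spaces, ready to paste as a prompt.
print(inference("your_image.png", output_format="as_prompt"))

Keeping "verbose" as the default mirrors the old behavior (the category breakdown is now returned as a string rather than printed), while "as_prompt" produces the comma-separated tag string introduced by this commit.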