City committed
Commit bb0a0a7
Parent: dbfacdc

Sync with github

Files changed (5)
  1. README.md +5 -6
  2. demo_class_gradio.py +62 -0
  3. inference.py +236 -0
  4. model.py +45 -0
  5. requirements.txt +4 -0
README.md CHANGED
@@ -1,13 +1,12 @@
 ---
 title: AnimeClassifiers Demo
-emoji: 📚
-colorFrom: gray
-colorTo: indigo
+emoji: 🧱
+colorFrom: blue
+colorTo: yellow
 sdk: gradio
 sdk_version: 4.7.1
-app_file: app.py
+app_file: demo_class_gradio.py
+models: [city96/AnimeClassifiers]
 pinned: false
 license: apache-2.0
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

demo_class_gradio.py ADDED
@@ -0,0 +1,62 @@
import os
import torch
import gradio as gr

from inference import CityClassifierMultiModelPipeline, get_model_path

TOKEN = os.environ.get("HFS_TOKEN")
HFREPO = "City96/AnimeClassifiers"
MODELS = [
    "CCAnime-ChromaticAberration-v1.16",
]
article = """\
These are classifiers meant to work with anime images.

For more information, you can check out the [Huggingface Hub](https://huggingface.co/city96/AnimeClassifiers) or [GitHub page](https://github.com/city96/CityClassifiers).
"""
info_default = """\
Include the default class (unknown/negative) in the output results.
"""
info_tiling = """\
Divide the image into parts and run the classifier on each part separately.
Greatly improves accuracy but slows down inference.
"""
info_tiling_combine = """\
How to combine the confidence scores of the different tiles.
Mean averages the confidence over all tiles. Median takes the value in the middle.
Max/min take the score from the tile with the highest/lowest confidence respectively, but can result in multiple labels having very high/very low confidence scores.
"""

pipeline_args = {}
if torch.cuda.is_available():
    pipeline_args.update({
        "device" : "cuda",
        "clip_dtype" : torch.float16,
    })

pipeline = CityClassifierMultiModelPipeline(
    model_paths = [get_model_path(x, HFREPO, TOKEN) for x in MODELS],
    config_paths = [get_model_path(x, HFREPO, TOKEN, extension="config.json") for x in MODELS],
    **pipeline_args,
)
gr.Interface(
    fn = pipeline,
    title = "CityClassifiers demo",
    article = article,
    inputs = [
        gr.Image(label="Input image", type="pil"),
        gr.Checkbox(label="Include default", value=True, info=info_default),
        gr.Checkbox(label="Tiling", value=True, info=info_tiling),
        gr.Dropdown(
            label = "Tiling combine strategy",
            choices = ["mean", "median", "max", "min"],
            value = "mean",
            type = "value",
            info = info_tiling_combine,
        ),
    ],
    outputs = [gr.Label(label=x) for x in MODELS],
    examples = "./examples" if os.path.isdir("./examples") else None,
    allow_flagging = "never",
    analytics_enabled = False,
).launch()
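To illustrate the tile combine strategies described in info_tiling_combine above, here is a small hypothetical example (confidence values invented for illustration) of how the per-tile scores for a single label collapse into one value, mirroring the logic of format_pred in inference.py below:

import torch

# Hypothetical per-tile confidences for one label (e.g. the 5 tiles produced by five_crop).
tile_scores = torch.tensor([0.10, 0.85, 0.90, 0.95, 0.20])

print(float(torch.mean(tile_scores)))   # ~0.60, averages over all tiles
print(float(torch.median(tile_scores))) # ~0.85, the middle value
print(float(torch.max(tile_scores)))    # ~0.95, the most confident tile
print(float(torch.min(tile_scores)))    # ~0.10, the least confident tile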
inference.py ADDED
@@ -0,0 +1,236 @@
import os
import json
import torch
import torchvision.transforms as TF
from safetensors.torch import load_file
from huggingface_hub import hf_hub_download
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

from model import PredictorModel

class CityAestheticsPipeline:
    """
    Demo model pipeline for [image=>score] prediction.
    Accepts a single model path on initialization.
    The resulting object can be called directly with a PIL image as the input.
    Returns a single float value with the predicted score [0.0;1.0].
    """
    clip_ver = "openai/clip-vit-large-patch14"
    def __init__(self, model_path, device="cpu", clip_dtype=torch.float32):
        self.device = device
        self.clip_dtype = clip_dtype
        self._init_clip()
        self.model = self._load_model(model_path)
        print("CityAesthetics: Pipeline init ok") # debug

    def __call__(self, raw):
        emb = self.get_clip_emb(raw)
        return self.get_model_pred(self.model, emb)

    def get_model_pred(self, model, emb):
        with torch.no_grad():
            pred = model(emb)
        return float(pred.detach().cpu().squeeze(0))

    def get_clip_emb(self, raw):
        img = self.proc(
            images = raw,
            return_tensors = "pt"
        )["pixel_values"].to(self.clip_dtype).to(self.device)
        with torch.no_grad():
            emb = self.clip(pixel_values=img)
        return emb["image_embeds"].detach().to(torch.float32)

    def _init_clip(self):
        self.proc = CLIPImageProcessor.from_pretrained(self.clip_ver)
        self.clip = CLIPVisionModelWithProjection.from_pretrained(
            self.clip_ver,
            device_map = self.device,
            torch_dtype = self.clip_dtype,
        )

    def _load_model(self, path):
        sd = load_file(path)
        assert tuple(sd["up.0.weight"].shape) == (1024, 768) # only allow CLIP ver
        model = PredictorModel(outputs=1)
        model.eval()
        model.load_state_dict(sd)
        model.to(self.device)
        return model

class CityAestheticsMultiModelPipeline(CityAestheticsPipeline):
    """
    Demo multi-model pipeline for [image=>score] prediction.
    Accepts a list of model paths on initialization.
    The resulting object can be called directly with a PIL image as the input.
    Returns a dict with the model name as key and the score [0.0;1.0] as the value.
    """
    def __init__(self, model_paths, device="cpu", clip_dtype=torch.float32):
        self.device = device
        self.clip_dtype = clip_dtype
        self._init_clip()
        self.models = {}
        for path in model_paths:
            name = os.path.splitext(os.path.basename(path))[0]
            self.models[name] = self._load_model(path)
        print("CityAesthetics: Pipeline init ok") # debug

    def __call__(self, raw):
        emb = self.get_clip_emb(raw)
        out = {}
        for name, model in self.models.items():
            out[name] = self.get_model_pred(model, emb)
        return out

class CityClassifierPipeline:
    """
    Demo model pipeline for [image=>label] prediction.
    Accepts a single model path and (optionally) a JSON config file on initialization.
    The resulting object can be called directly with a PIL image as the input.
    Returns a dict with the label names as keys and the predicted scores [0.0;1.0] as values.
    """
    clip_ver = "openai/clip-vit-large-patch14"
    def __init__(self, model_path, config_path=None, device="cpu", clip_dtype=torch.float32):
        self.device = device
        self.clip_dtype = clip_dtype
        self._init_clip()

        self.labels, model_args = self._load_config(config_path)
        self.model = self._load_model(model_path, model_args)

        print("CityClassifier: Pipeline init ok") # debug

    def __call__(self, raw, default=True, tiling=True, tile_strat="mean"):
        emb = self.get_clip_emb(raw, tiling=tiling)
        pred = self.get_model_pred(self.model, emb)
        return self.format_pred(
            pred,
            labels = self.labels,
            drop = [] if default else [0],
            ts = tile_strat if tiling else "raw",
        )

    def format_pred(self, pred, labels, drop=[], ts="mean"):
        # tile recombine strategy
        if   ts == "mean"  : vp = lambda x: float(torch.mean(x))
        elif ts == "median": vp = lambda x: float(torch.median(x))
        elif ts == "max"   : vp = lambda x: float(torch.max(x))
        elif ts == "min"   : vp = lambda x: float(torch.min(x))
        elif ts == "raw"   : vp = lambda x: float(x)
        else: raise NotImplementedError(f"CityClassifier: Invalid combine strategy '{ts}'!")
        # combine pred w/ labels
        out = {}
        for k in range(len(pred)):
            if k in drop: continue
            key = labels.get(str(k), str(k))
            out[key] = vp(pred[k])
        return out

    def get_model_pred(self, model, emb):
        with torch.no_grad():
            pred = model(emb)
        pred = pred.detach().cpu()
        return [pred[:, x] for x in range(pred.shape[1])] # split per class

    def get_clip_emb(self, raw, tiling=False):
        if tiling and min(raw.size) > 512:
            if max(raw.size) > 1536:
                raw = TF.functional.resize(raw, 1536)
            raw = TF.functional.five_crop(raw, 512)
        img = self.proc(
            images = raw,
            return_tensors = "pt"
        )["pixel_values"].to(self.clip_dtype).to(self.device)
        with torch.no_grad():
            emb = self.clip(pixel_values=img)
        return emb["image_embeds"].detach().to(torch.float32)

    def _init_clip(self):
        self.proc = CLIPImageProcessor.from_pretrained(self.clip_ver)
        self.clip = CLIPVisionModelWithProjection.from_pretrained(
            self.clip_ver,
            device_map = self.device,
            torch_dtype = self.clip_dtype,
        )

    def _load_model(self, path, args=None):
        sd = load_file(path)
        assert tuple(sd["up.0.weight"].shape) == (1024, 768) # only allow CLIP ver
        args = args or { # infer from model
            "outputs" : int(sd["down.5.bias"].shape[0])
        }
        model = PredictorModel(**args)
        model.eval()
        model.load_state_dict(sd)
        model.to(self.device)
        return model

    def _load_config(self, path):
        if not path or not os.path.isfile(path):
            return ({}, None)

        with open(path) as f:
            data = json.loads(f.read())
        return (
            data.get("labels", {}),
            data.get("model_params", {}),
        )

class CityClassifierMultiModelPipeline(CityClassifierPipeline):
    """
    Demo multi-model pipeline for [image=>label] prediction.
    Accepts a list of model paths on initialization.
    A matching list of JSON config files can also be passed in the same order.
    The resulting object can be called directly with a PIL image as the input.
    Returns a list of dicts (one per model) with the label names as keys and the scores [0.0;1.0] as values.
    """
    def __init__(self, model_paths, config_paths=[], device="cpu", clip_dtype=torch.float32):
        self.device = device
        self.clip_dtype = clip_dtype
        self._init_clip()
        self.models = {}
        self.labels = {}
        assert len(model_paths) == len(config_paths) or not config_paths, "CityClassifier: Model and config paths must match!"
        for k in range(len(model_paths)):
            name = os.path.splitext(os.path.basename(model_paths[k]))[0] # TODO: read from config
            self.labels[name], model_args = self._load_config(config_paths[k] if config_paths else None)
            self.models[name] = self._load_model(model_paths[k], model_args)

        print("CityClassifier: Pipeline init ok") # debug

    def __call__(self, raw, default=True, tiling=True, tile_strat="mean"):
        emb = self.get_clip_emb(raw, tiling=tiling)
        out = {}
        for name, model in self.models.items():
            pred = self.get_model_pred(model, emb)
            out[name] = self.format_pred(
                pred,
                labels = self.labels[name],
                drop = [] if default else [0],
                ts = tile_strat if tiling else "raw",
            )
        if len(out.values()) == 1: return list(out.values())[0] # GRADIO HOTFIX
        return list(out.values())

def get_model_path(name, repo, token=True, extension="safetensors", local=False):
    """
    Returns the local model path or falls back to the HF Hub if required.
    """
    fname = f"{name}.{extension}"

    # local path: [models/AesPred-Anime-v1.8.safetensors]
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "models")
    if os.path.isfile(os.path.join(path, fname)):
        print(f"Using local model for '{fname}'")
        return os.path.join(path, fname)

    if local: raise OSError(f"Can't find local model '{fname}'!")

    # huggingface hub fallback
    print(f"Using HF Hub model for '{fname}'")
    return str(hf_hub_download(
        token = token,
        repo_id = repo,
        filename = fname,
    ))
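For reference, a minimal usage sketch of the single-model classifier pipeline outside of Gradio. This is an assumption-laden example rather than part of the commit: the image path is a placeholder, and the repo/model names are taken from demo_class_gradio.py above.

from PIL import Image
from inference import CityClassifierPipeline, get_model_path

name = "CCAnime-ChromaticAberration-v1.16"
repo = "City96/AnimeClassifiers"

pipe = CityClassifierPipeline(
    model_path = get_model_path(name, repo),
    config_path = get_model_path(name, repo, extension="config.json"),
)
# "example.png" is a placeholder; any PIL-compatible image works.
scores = pipe(Image.open("example.png"), default=True, tiling=True, tile_strat="mean")
print(scores) # dict mapping label names (from config.json) to confidence scores in [0.0;1.0]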
model.py ADDED
@@ -0,0 +1,45 @@
import torch
import torch.nn as nn

class ResBlock(nn.Module):
    """Linear block with residuals"""
    def __init__(self, ch):
        super().__init__()
        self.join = nn.ReLU()
        self.long = nn.Sequential(
            nn.Linear(ch, ch),
            nn.LeakyReLU(0.1),
            nn.Linear(ch, ch),
            nn.LeakyReLU(0.1),
            nn.Linear(ch, ch),
        )
    def forward(self, x):
        return self.join(self.long(x) + x)

class PredictorModel(nn.Module):
    """Main predictor class"""
    def __init__(self, features=768, outputs=1, hidden=1024):
        super().__init__()
        self.features = features
        self.outputs = outputs
        self.hidden = hidden
        self.up = nn.Sequential(
            nn.Linear(self.features, self.hidden),
            ResBlock(ch=self.hidden),
        )
        self.down = nn.Sequential(
            nn.Linear(self.hidden, 128),
            nn.Linear(128, 64),
            nn.Dropout(0.1),
            nn.LeakyReLU(),
            nn.Linear(64, 32),
            nn.Linear(32, self.outputs),
        )
        self.out = nn.Softmax(dim=1) if self.outputs > 1 else nn.Tanh()
    def forward(self, x):
        y = self.up(x)
        z = self.down(y)
        if self.outputs > 1:
            return self.out(z)
        else:
            # map the Tanh output [-1;1] to the [0;1] score range
            return (self.out(z) + 1.0) / 2.0
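As a quick sanity check of the architecture above, a minimal sketch of a dummy forward pass (random inputs, shapes only, no trained weights):

import torch
from model import PredictorModel

# A batch of 5 CLIP ViT-L/14 image embeddings (e.g. the 5 tiles from five_crop).
model = PredictorModel(features=768, outputs=2) # 2 output classes => Softmax head
model.eval()
with torch.no_grad():
    emb = torch.randn(5, 768)
    out = model(emb)
print(out.shape) # torch.Size([5, 2]); each row sums to 1.0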
requirements.txt ADDED
@@ -0,0 +1,4 @@
torch==2.1.0
accelerate==0.24.1
safetensors==0.4.0
transformers==4.35.0