tolgacangoz
/

anytext

Text-to-Image

Diffusers

Safetensors

Model card | Files and versions | Community

tolgacangoz committed 7 days ago

Commit

c747cba

verified ·

1 Parent(s): 04fc5b3

Upload anytext.py

Browse files

Files changed (1) hide show

text_embedding_module/anytext.py +25 -15

text_embedding_module/anytext.py CHANGED Viewed

@@ -69,6 +69,7 @@ from diffusers.utils import (
 from diffusers.utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
 from diffusers.configuration_utils import register_to_config, ConfigMixin
 from diffusers.models.modeling_utils import ModelMixin
 checker = BasicTokenizer()
@@ -152,7 +153,7 @@ class EmbeddingManager(nn.Module):
         self.token_dim = token_dim
         self.proj = nn.Linear(40 * 64, token_dim)
-        self.proj.load_state_dict(load_file("proj.safetensors", device=str(embedder.device)))
         if use_fp16:
             self.proj = self.proj.to(dtype=torch.float16)
@@ -269,9 +270,20 @@ def crop_image(src_img, mask):
 def create_predictor(model_dir=None, model_lang="ch", device="cpu", use_fp16=False):
-    model_file_path = model_dir
-    if model_file_path is not None and not os.path.exists(model_file_path):
-        raise ValueError("not find model file path {}".format(model_file_path))
     if model_lang == "ch":
         n_class = 6625
@@ -287,8 +299,8 @@ def create_predictor(model_dir=None, model_lang="ch", device="cpu", use_fp16=Fal
     )
     rec_model = RecModel(rec_config)
-    if model_file_path is not None:
-        rec_model.load_state_dict(torch.load(model_file_path, map_location=device))
     return rec_model
@@ -450,22 +462,20 @@ class TextRecognizer(object):
         return loss
-class TextEmbeddingModule(ModelMixin, ConfigMixin):
-    @register_to_config
     def __init__(self, font_path, use_fp16=False, device="cpu"):
         super().__init__()
-        self.use_fp16 = use_fp16
-        self.device = device
         # TODO: Learn if the recommended font file is free to use
         self.font = ImageFont.truetype(font_path, 60)
-        self.frozen_CLIP_embedder_t3 = FrozenCLIPEmbedderT3(device=self.device, use_fp16=self.use_fp16)
-        self.embedding_manager = EmbeddingManager(self.frozen_CLIP_embedder_t3, use_fp16=self.use_fp16)
-        rec_model_dir = "OCR/ppv3_rec.pth"
-        self.text_predictor = create_predictor(rec_model_dir, device=self.device, use_fp16=self.use_fp16).eval()
         args = {}
         args["rec_image_shape"] = "3, 48, 320"
         args["rec_batch_num"] = 6
-        args["rec_char_dict_path"] = "OCR/ppocr_keys_v1.txt"
         args["use_fp16"] = self.use_fp16
         self.embedding_manager.recog = TextRecognizer(args, self.text_predictor)

 from diffusers.utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
 from diffusers.configuration_utils import register_to_config, ConfigMixin
 from diffusers.models.modeling_utils import ModelMixin
+from huggingface_hub import hf_hub_download
 checker = BasicTokenizer()
         self.token_dim = token_dim
         self.proj = nn.Linear(40 * 64, token_dim)
+        # self.proj.load_state_dict(load_file("proj.safetensors", device=str(embedder.device)))
         if use_fp16:
             self.proj = self.proj.to(dtype=torch.float16)
 def create_predictor(model_dir=None, model_lang="ch", device="cpu", use_fp16=False):
+    if model_dir is None or not os.path.exists(model_dir):
+        try:
+            # Use the repo id from which the pipeline was loaded
+            model_dir = hf_hub_download(
+                repo_id="tolgacangoz/anytext",
+                filename="text_embedding_module/OCR/ppv3_rec.pth",
+                local_dir=".cache/diffusers",
+                local_dir_use_symlinks=True
+            )
+        except Exception as e:
+            raise ValueError(f"Could not download the model file: {e}")
+    if model_dir is not None and not os.path.exists(model_dir):
+        raise ValueError("not find model file path {}".format(model_dir))
     if model_lang == "ch":
         n_class = 6625
     )
     rec_model = RecModel(rec_config)
+    state_dict = torch.load(model_dir, map_location=device)
+    rec_model.load_state_dict(state_dict)
     return rec_model
         return loss
+class TextEmbeddingModule(nn.Module):
+    # @register_to_config
     def __init__(self, font_path, use_fp16=False, device="cpu"):
         super().__init__()
         # TODO: Learn if the recommended font file is free to use
         self.font = ImageFont.truetype(font_path, 60)
+        self.frozen_CLIP_embedder_t3 = FrozenCLIPEmbedderT3(device=device, use_fp16=use_fp16)
+        self.embedding_manager = EmbeddingManager(self.frozen_CLIP_embedder_t3, use_fp16=use_fp16)
+        rec_model_dir = "./text_embedding_module/OCR/ppv3_rec.pth"
+        self.text_predictor = create_predictor(rec_model_dir, device=device, use_fp16=use_fp16).eval()
         args = {}
         args["rec_image_shape"] = "3, 48, 320"
         args["rec_batch_num"] = 6
+        args["rec_char_dict_path"] = "./text_embedding_module/OCR/ppocr_keys_v1.txt"
         args["use_fp16"] = self.use_fp16
         self.embedding_manager.recog = TextRecognizer(args, self.text_predictor)