Upload 2 files
Files changed:
- ocr_service.py (+20 -8)
- requirements.txt (+9 -7)
ocr_service.py
CHANGED

@@ -455,11 +455,13 @@ async def get_ocr_model():
     print(" - Tokenizer loaded successfully")

     # Load model with compatibility settings
-    #
+    # Official DeepSeek-OCR usage: https://huggingface.co/deepseek-ai/DeepSeek-OCR
+    # GPU version uses: _attn_implementation='flash_attention_2', use_safetensors=True
+    # CPU/Spaces version uses: _attn_implementation='sdpa', use_safetensors=True
     load_kwargs = {
         "trust_remote_code": True,
-        "use_safetensors":
-        "_attn_implementation": "sdpa",  #
+        "use_safetensors": True,  # Official usage recommends True
+        "_attn_implementation": "sdpa",  # Use SDPA for CPU/Spaces (GPU would use 'flash_attention_2')
     }

     if IS_APPLE_SILICON:
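The hunk above hard-codes "sdpa" for the CPU/Spaces deployment while noting that the GPU path would use "flash_attention_2". A minimal sketch of picking the backend at runtime instead, assuming the model name from the linked card and a flash-attn import probe (both illustrative, not from this repo):

import importlib.util

import torch
from transformers import AutoModel

MODEL_NAME = "deepseek-ai/DeepSeek-OCR"  # assumed from the URL in the comment above

def pick_attn_implementation() -> str:
    # flash_attention_2 needs a CUDA device plus the flash-attn package;
    # SDPA is PyTorch's built-in scaled-dot-product attention and works on CPU/MPS.
    if torch.cuda.is_available() and importlib.util.find_spec("flash_attn") is not None:
        return "flash_attention_2"
    return "sdpa"

load_kwargs = {
    "trust_remote_code": True,
    "use_safetensors": True,
    "_attn_implementation": pick_attn_implementation(),
}
model = AutoModel.from_pretrained(MODEL_NAME, **load_kwargs)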
@@ -485,16 +487,20 @@ async def get_ocr_model():
         _ocr_model = AutoModel.from_pretrained(MODEL_NAME, **load_kwargs)
     else:
         raise
+    # Set to eval mode (as per official usage)
     _ocr_model = _ocr_model.eval()

-    # Handle device placement
+    # Handle device placement (per official DeepSeek-OCR usage)
    if USE_MPS and torch.backends.mps.is_available():
+        # Apple Silicon: MPS (Metal Performance Shaders)
         _ocr_model = _ocr_model.to("mps")
         print(" - DeepSeek-OCR loaded on Apple Silicon GPU (MPS/M4)")
     elif USE_GPU and torch.cuda.is_available():
+        # NVIDIA GPU: CUDA with bfloat16 (per official usage)
         _ocr_model = _ocr_model.cuda().to(torch.bfloat16)
-        print(" - DeepSeek-OCR loaded on NVIDIA GPU")
+        print(" - DeepSeek-OCR loaded on NVIDIA GPU (CUDA + bfloat16)")
     else:
+        # CPU: No device placement needed
         print(" - DeepSeek-OCR loaded on CPU")
     return _ocr_model, _ocr_tokenizer

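The placement logic above keys off USE_MPS and USE_GPU flags defined elsewhere in the file. A hedged sketch of the same pattern with the flags replaced by direct runtime probes (the helper name is illustrative):

import torch

def place_model(model: torch.nn.Module) -> torch.nn.Module:
    model = model.eval()  # inference only, as in the diff
    if torch.backends.mps.is_available():
        return model.to("mps")  # Apple Silicon GPU
    if torch.cuda.is_available():
        return model.cuda().to(torch.bfloat16)  # NVIDIA GPU, bf16 as above
    return model  # CPU: keep the device and dtype from loading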
@@ -516,15 +522,21 @@ async def run_deepseek_ocr(

     try:
         # Maximum quality inference - best OCR quality settings
+        # Official DeepSeek-OCR quality presets (from https://huggingface.co/deepseek-ai/DeepSeek-OCR):
+        # - Tiny: base_size=512, image_size=512, crop_mode=False
+        # - Small: base_size=640, image_size=640, crop_mode=False
+        # - Base: base_size=1024, image_size=1024, crop_mode=False
+        # - Large: base_size=1280, image_size=1280, crop_mode=False ← We use this for max quality!
+        # - Gundam: base_size=1024, image_size=640, crop_mode=True
         result = model.infer(
             tokenizer,
             prompt=prompt,
             image_file=image_path,
             output_path=output_path,
-            base_size=BASE_SIZE,  # 1280 =
-            image_size=IMAGE_SIZE,  # 1280 =
+            base_size=BASE_SIZE,  # 1280 = Large quality (maximum quality!)
+            image_size=IMAGE_SIZE,  # 1280 = Large quality (maximum quality!)
             crop_mode=CROP_MODE,  # True = best accuracy for complex documents
-            save_results=False,
+            save_results=False,  # Don't save intermediate files
             test_compress=False,  # False = maximum quality, no compression
         )

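The preset table in the new comments maps cleanly onto data. A sketch only; the PRESETS dict and its key names are illustrative, not part of this repo:

# Quality presets from the comment block above, keyed by name.
PRESETS = {
    "tiny":   {"base_size": 512,  "image_size": 512,  "crop_mode": False},
    "small":  {"base_size": 640,  "image_size": 640,  "crop_mode": False},
    "base":   {"base_size": 1024, "image_size": 1024, "crop_mode": False},
    "large":  {"base_size": 1280, "image_size": 1280, "crop_mode": False},
    "gundam": {"base_size": 1024, "image_size": 640,  "crop_mode": True},
}

# e.g. result = model.infer(tokenizer, prompt=prompt, image_file=image_path,
#                           output_path=output_path, save_results=False,
#                           test_compress=False, **PRESETS["large"])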
requirements.txt
CHANGED

@@ -7,15 +7,17 @@ python-multipart>=0.0.6
 pillow>=10.0.0
 numpy>=1.24.0

-# DeepSeek-OCR dependencies -
-
-
-
-
+# DeepSeek-OCR dependencies - PINNED VERSIONS (tested working set)
+# IMPORTANT: Pin to exact versions to avoid LlamaFlashAttention2 import errors
+torch==2.6.0
+torchvision==0.21.0  # Match torch version
+transformers==4.46.3  # Critical: Must be 4.46.3 (4.47+ removed LlamaFlashAttention2)
+tokenizers==0.20.3  # Exact version from model card
 einops>=0.7.0
 addict>=2.4.0
 easydict>=1.9
 matplotlib>=3.8.0
-
-# Flash attention
+tqdm
+# Note: Flash attention NOT needed for CPU/Spaces deployment
+# GPU only: flash-attn==2.7.3 --no-build-isolation

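Since the whole point of the pin set is the transformers/tokenizers pairing, a quick sanity check (a sketch, not part of this commit) can catch a drifted environment before the model's remote code fails on the missing LlamaFlashAttention2 import:

import tokenizers
import torch
import transformers

assert transformers.__version__ == "4.46.3", transformers.__version__
assert tokenizers.__version__ == "0.20.3", tokenizers.__version__
assert torch.__version__.startswith("2.6."), torch.__version__  # e.g. "2.6.0+cpu"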