Upload ocr_service.py
ocr_service.py  (+97 -75, CHANGED)
@@ -76,6 +76,11 @@ MAX_JSON_LIST_ITEMS = 100
 
 # DeepSeek-OCR Model Configuration - Maximum Quality Settings for M4 Mac (Apple Silicon)
 MODEL_NAME = "deepseek-ai/DeepSeek-OCR"
+# PIN MODEL REVISION to prevent auto-updates that break compatibility
+# Use a commit hash from https://huggingface.co/deepseek-ai/DeepSeek-OCR/tree/main
+# This prevents "A new version of ... was downloaded" warnings and keeps code stable
+MODEL_REVISION = os.getenv("DEEPSEEK_MODEL_REVISION", "2c968b433af61a059311cbf8997765023806a24d")  # Latest stable commit
+
 # Detect Apple Silicon (M1/M2/M3/M4) - use MPS if available, otherwise CPU
 IS_APPLE_SILICON = platform.machine() == "arm64"
 USE_GPU = os.getenv("USE_GPU", "true").lower() == "true" and not IS_APPLE_SILICON  # M4 uses MPS, not CUDA
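Note on the pinned revision above: the commit hash flows into every `from_pretrained` call later in the diff. A minimal sketch of that usage, assuming the same MODEL_NAME and DEEPSEEK_MODEL_REVISION environment variable (the helper function name is illustrative, not part of the service):

import os

MODEL_NAME = "deepseek-ai/DeepSeek-OCR"
MODEL_REVISION = os.getenv("DEEPSEEK_MODEL_REVISION", "2c968b433af61a059311cbf8997765023806a24d")

def load_pinned_tokenizer():
    # Deferred import, mirroring the service's lazy-import style.
    from transformers import AutoTokenizer
    return AutoTokenizer.from_pretrained(
        MODEL_NAME,
        revision=MODEL_REVISION,   # resolve this exact commit instead of "main"
        trust_remote_code=True,    # DeepSeek-OCR ships custom modeling code
    )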
@@ -143,53 +148,69 @@ def _patch_deepseek_model_for_compatibility():
         Path(os.getenv("TRANSFORMERS_CACHE", "")) / "hub" if os.getenv("TRANSFORMERS_CACHE") else None,
     ]
 
+    # Patch ALL files that might import LlamaFlashAttention2
     model_files = []
     for cache_dir in possible_cache_dirs:
         if cache_dir and cache_dir.exists():
             try:
-
-
+                # Find all Python files in the DeepSeek-OCR model directory
+                found = list(cache_dir.glob(f"**/models--deepseek-ai--DeepSeek-OCR/**/*.py"))
+                # Filter for the files that might import LlamaFlashAttention2
+                relevant_files = [
+                    f for f in found
+                    if any(pattern in f.name for pattern in [
+                        'modeling_deepseekv2.py',
+                        'modeling_deepseekocr.py',
+                        'modeling_llama.py'  # In case it's in a nested location
+                    ])
+                ]
+                model_files.extend(relevant_files)
             except Exception:
                 continue
 
     if not model_files:
-        print("⚠️ Model
+        print("⚠️ Model files not found yet - will patch on first model load")
         return  # Model not downloaded yet, will patch on first load
 
-
-    print(f"🔍 Found model file
+    # Patch all found files
+    print(f"🔍 Found {len(model_files)} model file(s) to patch")
+    for model_file in model_files:
+        print(f" - {model_file.name}")
 
-    #
-    try:
-        with open(model_file, 'r', encoding='utf-8') as f:
-            content = f.read()
-        if "LlamaFlashAttention2 = LlamaAttention" in content:
-            print("✅ Model already patched")
-            return  # Already patched
-        original_content = content  # Save original for comparison
-    except Exception as e:
-        print(f"⚠️ Could not read model file to check patch status: {e}")
-        return
-
-    # More flexible approach: find and replace any import containing LlamaFlashAttention2
+    # Patch each file
     import re
 
-
-
-
-    multiline_pattern = r'from transformers\.models\.llama\.modeling_llama import\s*\([^)]*LlamaFlashAttention2[^)]*\)'
+    for model_file in model_files:
+        print(f"\n🔧 Patching: {model_file.name}")
+        patched_this_file = False
 
-    #
-
-
-
-
-
-
+        # Check if already patched
+        try:
+            with open(model_file, 'r', encoding='utf-8') as f:
+                content = f.read()
+            if "LlamaFlashAttention2 = LlamaAttention" in content:
+                print(f" ✅ Already patched, skipping")
+                continue  # Already patched, move to next file
+            original_content = content  # Save original for comparison
+        except Exception as e:
+            print(f" ⚠️ Could not read model file to check patch status: {e}")
+            continue  # Skip this file, try next
 
-    #
-
-    #
+        # More flexible approach: find and replace any import containing LlamaFlashAttention2
+        try:
+            # Pattern 1: Multi-line import with parentheses
+            # Matches: from transformers.models.llama.modeling_llama import (\n ...\n LlamaFlashAttention2\n )
+            multiline_pattern = r'from transformers\.models\.llama\.modeling_llama import\s*\([^)]*LlamaFlashAttention2[^)]*\)'
+
+            # Pattern 2: Single-line import with parentheses
+            singleline_parentheses_pattern = r'from transformers\.models\.llama\.modeling_llama import\s*\([^)]*LlamaFlashAttention2[^)]*\)'
+
+            # Pattern 3: Direct import without parentheses
+            direct_import_pattern = r'from transformers\.models\.llama\.modeling_llama import[^;]*LlamaFlashAttention2[^;\n]*'
+
+            # Try multiline replacement first (most common)
+            if re.search(multiline_pattern, content, re.MULTILINE | re.DOTALL):
+                # Create backup
                 backup_file = model_file.with_suffix('.py.backup')
                 try:
                     import shutil
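Note on the cache glob above: the Hugging Face hub cache stores each repository under models--{org}--{name}/snapshots/{commit}/, which is why a recursive glob rooted at the cache directory picks up the downloaded remote-code files. A standalone sketch of the same lookup, with the default cache path as an assumption (HF_HOME or TRANSFORMERS_CACHE override it in practice):

from pathlib import Path

def candidate_remote_code_files(cache_root: Path) -> list[Path]:
    # Same repo-scoped glob as the hunk above, filtered to the files the patch targets.
    repo_glob = "**/models--deepseek-ai--DeepSeek-OCR/**/*.py"
    wanted = ("modeling_deepseekv2.py", "modeling_deepseekocr.py", "modeling_llama.py")
    return [p for p in cache_root.glob(repo_glob) if p.name in wanted]

if __name__ == "__main__":
    default_cache = Path.home() / ".cache" / "huggingface" / "hub"
    for path in candidate_remote_code_files(default_cache):
        print(path)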
@@ -231,8 +252,8 @@ except ImportError:
     LlamaFlashAttention2 = LlamaAttention"""
 
                 content = re.sub(multiline_pattern, replacement, content, flags=re.MULTILINE | re.DOTALL)
-
-                print("🔧 Applied multiline import patch")
+                patched_this_file = True
+                print(" 🔧 Applied multiline import patch")
 
             # Try single-line parentheses pattern
             elif re.search(singleline_parentheses_pattern, content):
@@ -266,8 +287,8 @@ except ImportError:
     LlamaFlashAttention2 = LlamaAttention"""
 
                 content = re.sub(singleline_parentheses_pattern, replacement, content)
-
-                print("🔧 Applied single-line parentheses import patch")
+                patched_this_file = True
+                print(" 🔧 Applied single-line parentheses import patch")
 
             # Try direct import pattern (no parentheses)
             elif re.search(direct_import_pattern, content):
@@ -288,23 +309,22 @@ except ImportError:
     LlamaFlashAttention2 = LlamaAttention"""
 
                 content = re.sub(direct_import_pattern, replacement, content)
-
-                print("🔧 Applied direct import patch")
+                patched_this_file = True
+                print(" 🔧 Applied direct import patch")
 
-
-
+            # Last resort: find any line containing LlamaFlashAttention2 import and replace
+            if not patched_this_file:
                 lines = content.split('\n')
                 for i, line in enumerate(lines):
                     if 'LlamaFlashAttention2' in line and 'from transformers.models.llama.modeling_llama' in line:
                         # Create backup on first match
-
-
-
-
-
-
-
-                        print(f"⚠️ Could not create backup: {backup_err}")
+                        backup_file = model_file.with_suffix('.py.backup')
+                        try:
+                            import shutil
+                            shutil.copy2(model_file, backup_file)
+                            print(f" 📋 Created backup: {backup_file.name}")
+                        except Exception as backup_err:
+                            print(f" ⚠️ Could not create backup: {backup_err}")
 
                         # Replace the import line(s)
                         # Handle multiline imports
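The three regex patterns above all splice in the same fallback shim, which is visible verbatim inside the replacement string in the next hunk: wrap the import and alias the missing class to LlamaAttention when the installed transformers no longer exports LlamaFlashAttention2. The shim on its own:

# The shim the patches insert into the cached DeepSeek-OCR modeling code.
try:
    from transformers.models.llama.modeling_llama import LlamaFlashAttention2
except ImportError:
    from transformers.models.llama.modeling_llama import LlamaAttention
    LlamaFlashAttention2 = LlamaAttention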
@@ -329,19 +349,19 @@ except ImportError:
                             # Replace the block
                             lines[i:j+1] = replacement_lines
                             content = '\n'.join(lines)
-
-                            print(f"🔧 Applied line-by-line patch (lines {i}-{j})")
+                            patched_this_file = True
+                            print(f" 🔧 Applied line-by-line patch (lines {i}-{j})")
                             break
                         else:
                             # Single line import
                             lines[i] = "# Patch: LlamaFlashAttention2 import with fallback\ntry:\n    from transformers.models.llama.modeling_llama import LlamaFlashAttention2\nexcept ImportError:\n    from transformers.models.llama.modeling_llama import LlamaAttention\n    LlamaFlashAttention2 = LlamaAttention"
                             content = '\n'.join(lines)
-
-                            print(f"🔧 Applied single-line patch (line {i})")
+                            patched_this_file = True
+                            print(f" 🔧 Applied single-line patch (line {i})")
                             break
-
-
-
+
+            # Last resort: find any line containing LlamaFlashAttention2 import and add fallback
+            if not patched_this_file:
                 lines_for_fallback = content.split('\n')
                 for i, line in enumerate(lines_for_fallback):
                     if 'LlamaFlashAttention2' in line and 'from transformers.models.llama.modeling_llama' in line:
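The line-by-line branch above handles imports that span several lines inside parentheses: find the line that mentions LlamaFlashAttention2, walk forward to the closing parenthesis, and replace the whole block with the shim. A simplified sketch of that splice, assuming plain string lists (the real code in the diff also creates a backup and prints progress):

SHIM = [
    "try:",
    "    from transformers.models.llama.modeling_llama import LlamaFlashAttention2",
    "except ImportError:",
    "    from transformers.models.llama.modeling_llama import LlamaAttention",
    "    LlamaFlashAttention2 = LlamaAttention",
]

def splice_shim(lines: list[str]) -> list[str]:
    for i, line in enumerate(lines):
        if "LlamaFlashAttention2" in line and "modeling_llama" in line:
            j = i
            if "(" in line and ")" not in line:      # multi-line parenthesised import
                while j < len(lines) - 1 and ")" not in lines[j]:
                    j += 1
            return lines[:i] + SHIM + lines[j + 1:]
    return lines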
@@ -372,18 +392,18 @@ except ImportError:
                         ])
                         new_lines.extend(lines_for_fallback[i+1:])
                         content = '\n'.join(new_lines)
-                        print(f"✅ Added fallback import block after line {i}")
-
+                        print(f" ✅ Added fallback import block after line {i}")
+                        patched_this_file = True
                         break
 
-
+            if patched_this_file:
                 # Write file if content was modified
                 # (fallback already writes immediately, regex patterns modify content then write here)
                 with open(model_file, 'w', encoding='utf-8') as f:
                     f.write(content)
-                print(f"✅ Successfully patched
+                print(f" ✅ Successfully patched: {model_file.name}")
             else:
-                print(f"⚠️ Could not find LlamaFlashAttention2 import to patch in {model_file}")
+                print(f" ⚠️ Could not find LlamaFlashAttention2 import to patch in {model_file.name}")
                 # Show a snippet around potential import lines for debugging
                 lines = content.split('\n')
                 for i, line in enumerate(lines):
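Before writing the patched content back (the `if patched_this_file:` branch above), the code keeps a .py.backup copy of the original file. A small sketch of that backup-then-write step; the rollback on a failed write is my addition, not something the diff does:

import shutil
from pathlib import Path

def write_patched(model_file: Path, new_content: str) -> None:
    backup = model_file.with_suffix(".py.backup")   # same convention as the diff
    shutil.copy2(model_file, backup)                # preserve the original remote code
    try:
        model_file.write_text(new_content, encoding="utf-8")
    except Exception:
        shutil.copy2(backup, model_file)            # roll back if the write failed
        raise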
@@ -414,11 +434,8 @@ async def get_ocr_model():
     if _ocr_model is None or _ocr_tokenizer is None:
         async with _model_lock:
             if _ocr_model is None or _ocr_tokenizer is None:
-                # Patch DeepSeek model code for compatibility BEFORE loading
-                # Works on HuggingFace Spaces (CPU) and M4 Macs (Apple Silicon)
-                _patch_deepseek_model_for_compatibility()
-
                 # Lazy import dependencies
+                # Note: Patching no longer needed - we pin transformers==4.46.3 and model revision
                 AutoModel, AutoTokenizer = _get_transformers()
                 torch = _get_torch()
 
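The context above shows the loading pattern get_ocr_model() relies on: check the module-level singletons, take the asyncio lock, check again, then load. A stripped-down sketch of that double-checked locking (names and the thread offload are illustrative, not the service's actual code):

import asyncio

_model = None
_lock = asyncio.Lock()

async def get_model():
    global _model
    if _model is None:              # fast path, no lock needed
        async with _lock:
            if _model is None:      # re-check: another task may have loaded it already
                _model = await asyncio.to_thread(_load_model_blocking)
    return _model

def _load_model_blocking():
    ...  # the real service calls AutoModel.from_pretrained(...) here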
@@ -428,14 +445,14 @@ async def get_ocr_model():
                 print(f" - Crop mode: {CROP_MODE} (best accuracy)")
 
                 # Load tokenizer first (this triggers model download if needed)
+                # PIN REVISION to prevent auto-updates that break compatibility
+                print(" - Loading tokenizer (pinned to revision for stability)...")
                 _ocr_tokenizer = AutoTokenizer.from_pretrained(
-                    MODEL_NAME,
+                    MODEL_NAME,
+                    trust_remote_code=True,
+                    revision=MODEL_REVISION  # Pin revision to prevent code changes
                 )
-
-                # Patch AFTER tokenizer loads (model files are now downloaded)
-                # This ensures the model files exist before we try to patch
-                print(" - Patching model code for compatibility...")
-                _patch_deepseek_model_for_compatibility()
+                print(" - Tokenizer loaded successfully")
 
                 # Load model with compatibility settings
                 # Use SDPA attention to avoid LlamaFlashAttention2 import errors
@@ -451,15 +468,20 @@ async def get_ocr_model():
                 print(" - Using SDPA attention (HuggingFace Spaces/CPU optimized)")
 
                 try:
-
+                    # PIN REVISION to prevent auto-updates that break compatibility
+                    _ocr_model = AutoModel.from_pretrained(
+                        MODEL_NAME,
+                        revision=MODEL_REVISION,  # Pin revision to prevent code changes
+                        **load_kwargs
+                    )
                 except Exception as e:
                     error_msg = str(e)
                     print(f"⚠️ Model load error: {error_msg}")
-                    # If still fails
+                    # If still fails, try without revision pin (fallback)
                     if "LlamaFlashAttention2" in error_msg or "flash" in error_msg.lower():
-                        print(" - LlamaFlashAttention2 error detected
-
-                        print(" - Retrying
+                        print(" - LlamaFlashAttention2 error detected")
+                        print(" - This should not happen with transformers==4.46.3")
+                        print(" - Retrying without revision pin as fallback...")
                         _ocr_model = AutoModel.from_pretrained(MODEL_NAME, **load_kwargs)
                     else:
                         raise
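The final hunk pins the revision on the model load as well and keeps an unpinned retry only for the flash-attention failure case. A compact sketch of that guarded load; load_kwargs here is an assumption based on the surrounding context (trust_remote_code plus SDPA attention), not the literal dict from the file:

from transformers import AutoModel

MODEL_NAME = "deepseek-ai/DeepSeek-OCR"
MODEL_REVISION = "2c968b433af61a059311cbf8997765023806a24d"
load_kwargs = {"trust_remote_code": True, "attn_implementation": "sdpa"}  # assumed contents

def load_ocr_model():
    try:
        return AutoModel.from_pretrained(MODEL_NAME, revision=MODEL_REVISION, **load_kwargs)
    except Exception as exc:
        if "LlamaFlashAttention2" in str(exc) or "flash" in str(exc).lower():
            # Fall back to an unpinned load, as the diff does.
            return AutoModel.from_pretrained(MODEL_NAME, **load_kwargs)
        raise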