make789 committed on
Commit
3c00a51
·
verified ·
1 Parent(s): 14305f4

Upload 2 files

Browse files
Files changed (2) hide show
  1. ocr_service.py +20 -8
  2. requirements.txt +9 -7
ocr_service.py CHANGED
@@ -455,11 +455,13 @@ async def get_ocr_model():
455
  print(" - Tokenizer loaded successfully")
456
 
457
  # Load model with compatibility settings
458
- # Use SDPA attention to avoid LlamaFlashAttention2 import errors
 
 
459
  load_kwargs = {
460
  "trust_remote_code": True,
461
- "use_safetensors": False, # Avoid safetensors issues
462
- "_attn_implementation": "sdpa", # Force SDPA (works on HuggingFace Spaces and Apple Silicon)
463
  }
464
 
465
  if IS_APPLE_SILICON:
@@ -485,16 +487,20 @@ async def get_ocr_model():
485
  _ocr_model = AutoModel.from_pretrained(MODEL_NAME, **load_kwargs)
486
  else:
487
  raise
 
488
  _ocr_model = _ocr_model.eval()
489
 
490
- # Handle device placement for M4 Mac (Apple Silicon)
491
  if USE_MPS and torch.backends.mps.is_available():
 
492
  _ocr_model = _ocr_model.to("mps")
493
  print(" - DeepSeek-OCR loaded on Apple Silicon GPU (MPS/M4)")
494
  elif USE_GPU and torch.cuda.is_available():
 
495
  _ocr_model = _ocr_model.cuda().to(torch.bfloat16)
496
- print(" - DeepSeek-OCR loaded on NVIDIA GPU")
497
  else:
 
498
  print(" - DeepSeek-OCR loaded on CPU")
499
  return _ocr_model, _ocr_tokenizer
500
 
@@ -516,15 +522,21 @@ async def run_deepseek_ocr(
516
 
517
  try:
518
  # Maximum quality inference - best OCR quality settings
 
 
 
 
 
 
519
  result = model.infer(
520
  tokenizer,
521
  prompt=prompt,
522
  image_file=image_path,
523
  output_path=output_path,
524
- base_size=BASE_SIZE, # 1280 = maximum quality (not light version!)
525
- image_size=IMAGE_SIZE, # 1280 = maximum quality (not light version!)
526
  crop_mode=CROP_MODE, # True = best accuracy for complex documents
527
- save_results=False,
528
  test_compress=False, # False = maximum quality, no compression
529
  )
530
 
 
455
  print(" - Tokenizer loaded successfully")
456
 
457
  # Load model with compatibility settings
458
+ # Official DeepSeek-OCR usage: https://huggingface.co/deepseek-ai/DeepSeek-OCR
459
+ # GPU version uses: _attn_implementation='flash_attention_2', use_safetensors=True
460
+ # CPU/Spaces version uses: _attn_implementation='sdpa', use_safetensors=True
461
  load_kwargs = {
462
  "trust_remote_code": True,
463
+ "use_safetensors": True, # Official usage recommends True
464
+ "_attn_implementation": "sdpa", # Use SDPA for CPU/Spaces (GPU would use 'flash_attention_2')
465
  }
466
 
467
  if IS_APPLE_SILICON:
 
487
  _ocr_model = AutoModel.from_pretrained(MODEL_NAME, **load_kwargs)
488
  else:
489
  raise
490
+ # Set to eval mode (as per official usage)
491
  _ocr_model = _ocr_model.eval()
492
 
493
+ # Handle device placement (per official DeepSeek-OCR usage)
494
  if USE_MPS and torch.backends.mps.is_available():
495
+ # Apple Silicon: MPS (Metal Performance Shaders)
496
  _ocr_model = _ocr_model.to("mps")
497
  print(" - DeepSeek-OCR loaded on Apple Silicon GPU (MPS/M4)")
498
  elif USE_GPU and torch.cuda.is_available():
499
+ # NVIDIA GPU: CUDA with bfloat16 (per official usage)
500
  _ocr_model = _ocr_model.cuda().to(torch.bfloat16)
501
+ print(" - DeepSeek-OCR loaded on NVIDIA GPU (CUDA + bfloat16)")
502
  else:
503
+ # CPU: No device placement needed
504
  print(" - DeepSeek-OCR loaded on CPU")
505
  return _ocr_model, _ocr_tokenizer
506
 
 
522
 
523
  try:
524
  # Maximum quality inference - best OCR quality settings
525
+ # Official DeepSeek-OCR quality presets (from https://huggingface.co/deepseek-ai/DeepSeek-OCR):
526
+ # - Tiny: base_size=512, image_size=512, crop_mode=False
527
+ # - Small: base_size=640, image_size=640, crop_mode=False
528
+ # - Base: base_size=1024, image_size=1024, crop_mode=False
529
+ # - Large: base_size=1280, image_size=1280, crop_mode=False ← our BASE_SIZE/IMAGE_SIZE follow this preset (crop_mode is set separately via CROP_MODE)
530
+ # - Gundam: base_size=1024, image_size=640, crop_mode=True
531
  result = model.infer(
532
  tokenizer,
533
  prompt=prompt,
534
  image_file=image_path,
535
  output_path=output_path,
536
+ base_size=BASE_SIZE, # 1280 = Large quality (maximum quality!)
537
+ image_size=IMAGE_SIZE, # 1280 = Large quality (maximum quality!)
538
  crop_mode=CROP_MODE, # True = best accuracy for complex documents
539
+ save_results=False, # Don't save intermediate files
540
  test_compress=False, # False = maximum quality, no compression
541
  )
542
 
requirements.txt CHANGED
@@ -7,15 +7,17 @@ python-multipart>=0.0.6
7
  pillow>=10.0.0
8
  numpy>=1.24.0
9
 
10
- # DeepSeek-OCR dependencies - MAXIMUM QUALITY (not light versions!)
11
- torch>=2.6.0
12
- torchvision>=0.19.0
13
- transformers>=4.46.3,<5.0.0 # Compatible version avoiding LlamaFlashAttention2 issues
14
- tokenizers>=0.20.3
 
15
  einops>=0.7.0
16
  addict>=2.4.0
17
  easydict>=1.9
18
  matplotlib>=3.8.0
19
- # Note: Using default attention implementation to avoid compatibility issues
20
- # Flash attention for GPU acceleration (install separately if needed: pip install flash-attn==2.7.3 --no-build-isolation)
 
21
 
 
7
  pillow>=10.0.0
8
  numpy>=1.24.0
9
 
10
+ # DeepSeek-OCR dependencies - PINNED VERSIONS (tested working set)
11
+ # IMPORTANT: Pin to exact versions to avoid LlamaFlashAttention2 import errors
12
+ torch==2.6.0
13
+ torchvision==0.21.0 # Match torch version
14
+ transformers==4.46.3 # Critical: Must be 4.46.3 (the 4.48+ attention refactor removed LlamaFlashAttention2)
15
+ tokenizers==0.20.3 # Exact version from model card
16
  einops>=0.7.0
17
  addict>=2.4.0
18
  easydict>=1.9
19
  matplotlib>=3.8.0
20
+ tqdm
21
+ # Note: Flash attention NOT needed for CPU/Spaces deployment
22
+ # GPU only (install separately): pip install flash-attn==2.7.3 --no-build-isolation
23