Upload 2 files
Files changed:
- ocr_service.py (+20 -8)
- requirements.txt (+9 -7)
ocr_service.py
CHANGED

@@ -455,11 +455,13 @@ async def get_ocr_model():
     print(" - Tokenizer loaded successfully")

     # Load model with compatibility settings
-    #
+    # Official DeepSeek-OCR usage: https://huggingface.co/deepseek-ai/DeepSeek-OCR
+    # GPU version uses: _attn_implementation='flash_attention_2', use_safetensors=True
+    # CPU/Spaces version uses: _attn_implementation='sdpa', use_safetensors=True
     load_kwargs = {
         "trust_remote_code": True,
-        "use_safetensors":
-        "_attn_implementation": "sdpa",  #
+        "use_safetensors": True,  # Official usage recommends True
+        "_attn_implementation": "sdpa",  # Use SDPA for CPU/Spaces (GPU would use 'flash_attention_2')
     }

     if IS_APPLE_SILICON:
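The hunk above hard-codes "sdpa" for the CPU/Spaces deployment while noting that the GPU path would use "flash_attention_2". A minimal sketch of picking the backend at runtime instead, assuming the model name from the linked card and a flash-attn import probe (both illustrative, not from this repo):

import importlib.util

import torch
from transformers import AutoModel

MODEL_NAME = "deepseek-ai/DeepSeek-OCR"  # assumed from the URL in the comment above

def pick_attn_implementation() -> str:
    # flash_attention_2 needs a CUDA device plus the flash-attn package;
    # SDPA is PyTorch's built-in scaled-dot-product attention and works on CPU/MPS.
    if torch.cuda.is_available() and importlib.util.find_spec("flash_attn") is not None:
        return "flash_attention_2"
    return "sdpa"

load_kwargs = {
    "trust_remote_code": True,
    "use_safetensors": True,
    "_attn_implementation": pick_attn_implementation(),
}
model = AutoModel.from_pretrained(MODEL_NAME, **load_kwargs)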
@@ -485,16 +487,20 @@ async def get_ocr_model():
         _ocr_model = AutoModel.from_pretrained(MODEL_NAME, **load_kwargs)
     else:
         raise
+    # Set to eval mode (as per official usage)
     _ocr_model = _ocr_model.eval()

-    # Handle device placement
+    # Handle device placement (per official DeepSeek-OCR usage)
    if USE_MPS and torch.backends.mps.is_available():
+        # Apple Silicon: MPS (Metal Performance Shaders)
         _ocr_model = _ocr_model.to("mps")
         print(" - DeepSeek-OCR loaded on Apple Silicon GPU (MPS/M4)")
     elif USE_GPU and torch.cuda.is_available():
+        # NVIDIA GPU: CUDA with bfloat16 (per official usage)
         _ocr_model = _ocr_model.cuda().to(torch.bfloat16)
-        print(" - DeepSeek-OCR loaded on NVIDIA GPU")
+        print(" - DeepSeek-OCR loaded on NVIDIA GPU (CUDA + bfloat16)")
     else:
+        # CPU: No device placement needed
         print(" - DeepSeek-OCR loaded on CPU")
     return _ocr_model, _ocr_tokenizer

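The placement logic above keys off USE_MPS and USE_GPU flags defined elsewhere in the file. A hedged sketch of the same pattern with the flags replaced by direct runtime probes (the helper name is illustrative):

import torch

def place_model(model: torch.nn.Module) -> torch.nn.Module:
    model = model.eval()  # inference only, as in the diff
    if torch.backends.mps.is_available():
        return model.to("mps")  # Apple Silicon GPU
    if torch.cuda.is_available():
        return model.cuda().to(torch.bfloat16)  # NVIDIA GPU, bf16 as above
    return model  # CPU: keep the device and dtype from loading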
@@ -516,15 +522,21 @@ async def run_deepseek_ocr(

     try:
         # Maximum quality inference - best OCR quality settings
+        # Official DeepSeek-OCR quality presets (from https://huggingface.co/deepseek-ai/DeepSeek-OCR):
+        # - Tiny: base_size=512, image_size=512, crop_mode=False
+        # - Small: base_size=640, image_size=640, crop_mode=False
+        # - Base: base_size=1024, image_size=1024, crop_mode=False
+        # - Large: base_size=1280, image_size=1280, crop_mode=False ← We use this for max quality!
+        # - Gundam: base_size=1024, image_size=640, crop_mode=True
         result = model.infer(
             tokenizer,
             prompt=prompt,
             image_file=image_path,
             output_path=output_path,
-            base_size=BASE_SIZE,  # 1280 =
-            image_size=IMAGE_SIZE,  # 1280 =
+            base_size=BASE_SIZE,  # 1280 = Large quality (maximum quality!)
+            image_size=IMAGE_SIZE,  # 1280 = Large quality (maximum quality!)
             crop_mode=CROP_MODE,  # True = best accuracy for complex documents
-            save_results=False,
+            save_results=False,  # Don't save intermediate files
             test_compress=False,  # False = maximum quality, no compression
         )

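The preset table in the new comments maps cleanly onto data. A sketch only; the PRESETS dict and its key names are illustrative, not part of this repo:

# Quality presets from the comment block above, keyed by name.
PRESETS = {
    "tiny":   {"base_size": 512,  "image_size": 512,  "crop_mode": False},
    "small":  {"base_size": 640,  "image_size": 640,  "crop_mode": False},
    "base":   {"base_size": 1024, "image_size": 1024, "crop_mode": False},
    "large":  {"base_size": 1280, "image_size": 1280, "crop_mode": False},
    "gundam": {"base_size": 1024, "image_size": 640,  "crop_mode": True},
}

# e.g. result = model.infer(tokenizer, prompt=prompt, image_file=image_path,
#                           output_path=output_path, save_results=False,
#                           test_compress=False, **PRESETS["large"])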
requirements.txt
CHANGED

@@ -7,15 +7,17 @@ python-multipart>=0.0.6
 pillow>=10.0.0
 numpy>=1.24.0

-# DeepSeek-OCR dependencies -
-
-
-
-
+# DeepSeek-OCR dependencies - PINNED VERSIONS (tested working set)
+# IMPORTANT: Pin to exact versions to avoid LlamaFlashAttention2 import errors
+torch==2.6.0
+torchvision==0.21.0  # Match torch version
+transformers==4.46.3  # Critical: Must be 4.46.3 (4.47+ removed LlamaFlashAttention2)
+tokenizers==0.20.3  # Exact version from model card
 einops>=0.7.0
 addict>=2.4.0
 easydict>=1.9
 matplotlib>=3.8.0
-
-# Flash attention
+tqdm
+# Note: Flash attention NOT needed for CPU/Spaces deployment
+# GPU only: flash-attn==2.7.3 --no-build-isolation

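Since the whole point of the pin set is the transformers/tokenizers pairing, a quick sanity check (a sketch, not part of this commit) can catch a drifted environment before the model's remote code fails on the missing LlamaFlashAttention2 import:

import tokenizers
import torch
import transformers

assert transformers.__version__ == "4.46.3", transformers.__version__
assert tokenizers.__version__ == "0.20.3", tokenizers.__version__
assert torch.__version__.startswith("2.6."), torch.__version__  # e.g. "2.6.0+cpu"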