Changed OCR process
Browse files- app/services/ocr_service.py +52 -43
app/services/ocr_service.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import pytesseract
|
| 2 |
import re
|
| 3 |
-
from PIL import Image
|
|
|
|
| 4 |
import sys
|
| 5 |
import os
|
| 6 |
|
|
@@ -8,37 +9,39 @@ import os
|
|
| 8 |
class OCRService:
|
| 9 |
|
| 10 |
def __init__(self):
|
| 11 |
-
# Auto-detect tesseract path
|
| 12 |
if sys.platform.startswith("win"):
|
| 13 |
pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_PATH", "C:/Program Files/Tesseract-OCR/tesseract.exe")
|
| 14 |
else:
|
| 15 |
-
# Linux / Hugging Face Spaces
|
| 16 |
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
def extract(self, image: Image.Image) -> dict:
|
| 20 |
w, h = image.size
|
| 21 |
|
| 22 |
-
#
|
| 23 |
-
|
| 24 |
-
# --------------------------------
|
| 25 |
-
|
| 26 |
-
# Card name — top left area
|
| 27 |
-
name_region = image.crop((0.15 * w, 0.02 * h, 0.75 * w, 0.10 * h))
|
| 28 |
|
| 29 |
-
# HP — top right
|
| 30 |
-
hp_region = image.crop((0.
|
| 31 |
|
| 32 |
-
# Moves —
|
| 33 |
-
moves_region = image.crop((0.
|
| 34 |
|
| 35 |
# Full image for type detection
|
| 36 |
full_text = pytesseract.image_to_string(image)
|
| 37 |
|
| 38 |
-
# --------------------------------
|
| 39 |
-
# ----- EXTRACT FIELDS -----------
|
| 40 |
-
# --------------------------------
|
| 41 |
-
|
| 42 |
return {
|
| 43 |
"name": self._extract_name(name_region),
|
| 44 |
"hp": self._extract_hp(hp_region),
|
|
@@ -46,29 +49,27 @@ class OCRService:
|
|
| 46 |
"moves": self._extract_moves(moves_region),
|
| 47 |
}
|
| 48 |
|
| 49 |
-
# --------------------------------
|
| 50 |
-
# ----- EXTRACTORS ---------------
|
| 51 |
-
# --------------------------------
|
| 52 |
-
|
| 53 |
def _extract_name(self, region: Image.Image) -> str | None:
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
| 60 |
return text if text else None
|
| 61 |
|
| 62 |
def _extract_hp(self, region: Image.Image) -> str | None:
|
| 63 |
-
region =
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
)
|
| 67 |
-
text = pytesseract.image_to_string(region, config="--psm 7")
|
| 68 |
match = re.search(r'(\d+)\s*HP|HP\s*(\d+)', text, re.IGNORECASE)
|
| 69 |
if match:
|
| 70 |
return match.group(1) or match.group(2)
|
| 71 |
-
|
|
|
|
|
|
|
| 72 |
|
| 73 |
def _extract_types(self, text: str) -> list[str] | None:
|
| 74 |
types = [
|
|
@@ -80,25 +81,33 @@ class OCRService:
|
|
| 80 |
return found if found else None
|
| 81 |
|
| 82 |
def _extract_moves(self, region: Image.Image) -> list[dict] | None:
|
| 83 |
-
region =
|
| 84 |
-
|
| 85 |
-
Image.LANCZOS
|
| 86 |
-
)
|
| 87 |
-
text = pytesseract.image_to_string(region)
|
| 88 |
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
| 89 |
|
| 90 |
moves = []
|
| 91 |
i = 0
|
| 92 |
while i < len(lines):
|
| 93 |
-
# Match
|
| 94 |
-
match = re.match(r'^([A-Z][a-zA-Z\s]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
if match:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
moves.append({
|
| 97 |
"name": match.group(1).strip(),
|
| 98 |
"damage": match.group(2).strip(),
|
| 99 |
-
"text":
|
| 100 |
})
|
| 101 |
-
i
|
| 102 |
else:
|
| 103 |
i += 1
|
| 104 |
|
|
|
|
| 1 |
import pytesseract
|
| 2 |
import re
|
| 3 |
+
from PIL import Image, ImageFilter, ImageEnhance
|
| 4 |
+
import numpy as np
|
| 5 |
import sys
|
| 6 |
import os
|
| 7 |
|
|
|
|
| 9 |
class OCRService:
|
| 10 |
|
| 11 |
def __init__(self):
    """Point pytesseract at the Tesseract executable.

    The ``TESSERACT_PATH`` environment variable, when set to a non-empty
    value, is used on every platform. Otherwise a per-platform default is
    used: the standard Windows install location on Windows, and
    ``/usr/bin/tesseract`` everywhere else.
    """
    if sys.platform.startswith("win"):
        default_cmd = "C:/Program Files/Tesseract-OCR/tesseract.exe"
    else:
        default_cmd = "/usr/bin/tesseract"

    # `or` (rather than a getenv default) so an empty TESSERACT_PATH falls
    # back to the platform default instead of producing an empty command.
    pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_PATH") or default_cmd
|
| 16 |
+
|
| 17 |
+
def _preprocess(self, region: Image.Image, scale: int = 3) -> Image.Image:
    """Prepare a cropped region for OCR.

    Steps, in order: enlarge by ``scale`` with Lanczos resampling, convert
    to grayscale, double the contrast, then binarize at a fixed cut-off.

    Args:
        region: The image region to prepare.
        scale: Integer factor applied to both width and height.

    Returns:
        A mode ``"L"`` image whose pixels are either 0 or 255.
    """
    target_size = (region.width * scale, region.height * scale)
    enlarged = region.resize(target_size, Image.LANCZOS)

    gray = enlarged.convert("L")
    boosted = ImageEnhance.Contrast(gray).enhance(2.0)

    # Pixels darker than 140 become black, everything else white; the
    # 1-bit result is converted back to 8-bit grayscale for the caller.
    binary = boosted.point(lambda x: 0 if x < 140 else 255, "1")
    return binary.convert("L")
|
| 29 |
|
| 30 |
def extract(self, image: Image.Image) -> dict:
|
| 31 |
w, h = image.size
|
| 32 |
|
| 33 |
+
# Name — skip "Basic Pokemon" line at very top, just grab name row
|
| 34 |
+
name_region = image.crop((0.05 * w, 0.06 * h, 0.72 * w, 0.13 * h))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
+
# HP — top right, large number + "HP" text
|
| 37 |
+
hp_region = image.crop((0.55 * w, 0.04 * h, 0.97 * w, 0.13 * h))
|
| 38 |
|
| 39 |
+
# Moves — middle to lower section
|
| 40 |
+
moves_region = image.crop((0.02 * w, 0.52 * h, 0.98 * w, 0.88 * h))
|
| 41 |
|
| 42 |
# Full image for type detection
|
| 43 |
full_text = pytesseract.image_to_string(image)
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
return {
|
| 46 |
"name": self._extract_name(name_region),
|
| 47 |
"hp": self._extract_hp(hp_region),
|
|
|
|
| 49 |
"moves": self._extract_moves(moves_region),
|
| 50 |
}
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
def _extract_name(self, region: Image.Image) -> str | None:
    """OCR the name region and return the most name-like line.

    The region is preprocessed at 3x scale and read with the Tesseract
    config ``--psm 7 --oem 3``. The first line that is shorter than 30
    characters and contains an uppercase letter followed by at least one
    lowercase letter is returned.

    Returns:
        The first matching line; otherwise the whole stripped OCR text, or
        ``None`` when the OCR text is empty.
    """
    prepared = self._preprocess(region, scale=3)
    text = pytesseract.image_to_string(prepared, config="--psm 7 --oem 3").strip()

    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if not candidate:
            continue
        # A capitalized word on a short line is taken to be the name.
        if len(candidate) < 30 and re.search(r'[A-Z][a-z]+', candidate):
            return candidate

    # No line looked like a name: fall back to the raw text, if any.
    return text or None
|
| 62 |
|
| 63 |
def _extract_hp(self, region: Image.Image) -> str | None:
    """OCR the HP region and return the HP value as a digit string.

    The region is preprocessed at 3x scale and read with the Tesseract
    config ``--psm 6 --oem 3``. A number directly before or after "HP"
    (case-insensitive) is preferred; failing that, the first standalone
    two- or three-digit number in the text is used.

    Returns:
        The digits as a string, or ``None`` when no number is found.
    """
    prepared = self._preprocess(region, scale=3)
    ocr_text = pytesseract.image_to_string(prepared, config="--psm 6 --oem 3")

    labelled = re.search(r'(\d+)\s*HP|HP\s*(\d+)', ocr_text, re.IGNORECASE)
    if labelled:
        # Only one of the two alternatives captured a group.
        return labelled.group(1) or labelled.group(2)

    # Fallback: the "HP" label was not recognized, so take a bare number.
    bare_number = re.search(r'\b(\d{2,3})\b', ocr_text)
    if bare_number is None:
        return None
    return bare_number.group(1)
|
| 73 |
|
| 74 |
def _extract_types(self, text: str) -> list[str] | None:
|
| 75 |
types = [
|
|
|
|
| 81 |
return found if found else None
|
| 82 |
|
| 83 |
def _extract_moves(self, region: Image.Image) -> list[dict] | None:
|
| 84 |
+
region = self._preprocess(region, scale=2)
|
| 85 |
+
text = pytesseract.image_to_string(region, config="--psm 6 --oem 3")
|
|
|
|
|
|
|
|
|
|
| 86 |
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
| 87 |
|
| 88 |
moves = []
|
| 89 |
i = 0
|
| 90 |
while i < len(lines):
|
| 91 |
+
# Match "Move Name  10" or "Move Name  10+" (name, two or more spaces, damage); lines without a trailing damage number do not match
|
| 92 |
+
match = re.match(r'^([A-Z][a-zA-Z\s]{2,25}?)\s{2,}(\d+\+?)$', lines[i])
|
| 93 |
+
if not match:
|
| 94 |
+
# Try looser match for lines like "Psychic 10+"
|
| 95 |
+
match = re.match(r'^([A-Z][a-zA-Z]+)\s+(\d+\+?)$', lines[i])
|
| 96 |
+
|
| 97 |
if match:
|
| 98 |
+
# Collect any following lines as move description until next move or end
|
| 99 |
+
desc_lines = []
|
| 100 |
+
j = i + 1
|
| 101 |
+
while j < len(lines) and not re.match(r'^[A-Z][a-zA-Z\s]+\s+\d+', lines[j]):
|
| 102 |
+
desc_lines.append(lines[j])
|
| 103 |
+
j += 1
|
| 104 |
+
|
| 105 |
moves.append({
|
| 106 |
"name": match.group(1).strip(),
|
| 107 |
"damage": match.group(2).strip(),
|
| 108 |
+
"text": " ".join(desc_lines) if desc_lines else None
|
| 109 |
})
|
| 110 |
+
i = j
|
| 111 |
else:
|
| 112 |
i += 1
|
| 113 |
|