# Report-Generator / processing.py
import base64
import io
import re
import json
import requests
import cv2
import numpy as np
from PIL import Image
from api_key_manager import get_api_key_manager
# --- NVIDIA NIM Configuration ---
NIM_API_URL = "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1"
def resize_image_if_needed(image_path: str) -> bytes:
    """Resizes an image to fit within 500x500 pixels and returns JPEG bytes."""
    with Image.open(image_path) as image:
        MAX_SIZE = 500
        width, height = image.size
        # Scale down (never up) so the longer side is at most MAX_SIZE,
        # preserving the aspect ratio.
        scale = min(MAX_SIZE / width, MAX_SIZE / height, 1.0)
        new_width = max(1, int(width * scale))
        new_height = max(1, int(height * scale))
        resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
        # JPEG cannot store alpha or palette data, so normalize to RGB.
        if resized_image.mode != 'RGB':
            resized_image = resized_image.convert('RGB')
        img_byte_arr = io.BytesIO()
        resized_image.save(img_byte_arr, format='JPEG', quality=85, optimize=True)
        image_bytes = img_byte_arr.getvalue()
        base64_size = len(base64.b64encode(image_bytes))
        if base64_size > 180000:
            # Re-encode at lower quality so the base64 payload stays under
            # the inline-image limit enforced in call_nim_ocr_api.
            quality = max(50, int(85 * (180000 / base64_size)))
            img_byte_arr = io.BytesIO()
            resized_image.save(img_byte_arr, format='JPEG', quality=quality, optimize=True)
            image_bytes = img_byte_arr.getvalue()
        return image_bytes
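
# Usage sketch (not part of the original module): re-encode a local scan
# before sending it to OCR. "sample_page.jpg" is a hypothetical path.
#
#   jpeg_bytes = resize_image_if_needed("sample_page.jpg")
#   print(f"{len(jpeg_bytes)} bytes after resize/re-encode")
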
def call_nim_ocr_api(image_bytes: bytes):
"""Calls the NVIDIA NIM API to perform OCR on an image."""
# Get API key from the manager
manager = get_api_key_manager()
api_key, key_index = manager.get_key('nvidia')
    if not api_key:
        raise Exception("No available NVIDIA API keys. Please set the NVIDIA_API_KEY environment variable.")
NIM_HEADERS = {
"Authorization": f"Bearer {api_key}",
"Accept": "application/json",
"Content-Type": "application/json",
}
base64_encoded_data = base64.b64encode(image_bytes)
base64_string = base64_encoded_data.decode('utf-8')
if len(base64_string) > 180000:
raise Exception("Image too large. To upload larger images, use the assets API.")
    # resize_image_if_needed returns JPEG bytes, so label the data URL as JPEG.
    image_url = f"data:image/jpeg;base64,{base64_string}"
payload = {
"input": [
{
"type": "image_url",
"url": image_url
}
]
}
try:
response = requests.post(NIM_API_URL, headers=NIM_HEADERS, json=payload, timeout=300)
response.raise_for_status()
result = response.json()
manager.mark_success('nvidia', key_index)
return result
    except requests.exceptions.RequestException as e:
        manager.mark_failure('nvidia', key_index)
        error_detail = str(e)
        if e.response is not None:
            try:
                error_detail = e.response.json().get("error", e.response.text)
            except ValueError:  # covers json/requests JSONDecodeError variants
                error_detail = e.response.text
        raise Exception(f"NIM API Error: {error_detail}") from e
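
# Usage sketch: OCR one image end-to-end, assuming a valid NVIDIA key is
# registered with api_key_manager.
#
#   image_bytes = resize_image_if_needed("sample_page.jpg")
#   result = call_nim_ocr_api(image_bytes)
#   print(json.dumps(result, indent=2))
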
def extract_question_number_from_ocr_result(ocr_result: dict) -> str:
"""Extracts the question number from the OCR result."""
try:
if "data" in ocr_result and len(ocr_result["data"]) > 0:
text_detections = ocr_result["data"][0].get("text_detections", [])
content = " ".join([detection["text_prediction"]["text"] for detection in text_detections])
else:
content = str(ocr_result)
        # A bare number at the start of the text (this also covers
        # "1." / "1)" forms, so no separate pattern is needed for those).
        match = re.search(r'^\s*(\d+)', content)
        if match:
            return match.group(1)
        # Otherwise look for "Q1", "Q.1", "Q: 1", or "QUESTION 1".
        match = re.search(r'(?:^|\s)(?:Q[\.:]?\s*|QUESTION\s+)(\d+)', content, re.IGNORECASE)
        if match:
            return match.group(1)
        return ""
except (KeyError, IndexError, TypeError):
return ""
def crop_image_perspective(image_path, points):
    """Perspective-crops to the quadrilateral `points` (normalized 0-1
    coordinates, ordered top-left, top-right, bottom-right, bottom-left)."""
    img = cv2.imread(image_path)
    if img is None: raise ValueError("Could not read the image file.")
    if len(points) < 4: return img
height, width = img.shape[:2]
def clamp(val): return max(0.0, min(1.0, val))
src_points = np.array([[clamp(p.get('x', 0.0)) * width, clamp(p.get('y', 0.0)) * height] for p in points[:4]], dtype=np.float32)
(tl, tr, br, bl) = src_points
width_top, width_bottom = np.linalg.norm(tr - tl), np.linalg.norm(br - bl)
max_width = int(max(width_top, width_bottom))
height_right, height_left = np.linalg.norm(tr - br), np.linalg.norm(tl - bl)
max_height = int(max(height_right, height_left))
if max_width == 0 or max_height == 0: return img
dst_points = np.array([[0, 0], [max_width - 1, 0], [max_width - 1, max_height - 1], [0, max_height - 1]], dtype=np.float32)
matrix = cv2.getPerspectiveTransform(src_points, dst_points)
return cv2.warpPerspective(img, matrix, (max_width, max_height))
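
# Usage sketch: crop a marked answer region. The dicts use the normalized
# {'x', 'y'} shape the function expects; the file names are hypothetical.
#
#   quad = [{'x': 0.10, 'y': 0.10}, {'x': 0.90, 'y': 0.12},
#           {'x': 0.88, 'y': 0.90}, {'x': 0.12, 'y': 0.88}]
#   cv2.imwrite("cropped.png", crop_image_perspective("scan.png", quad))
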
def create_pdf_from_full_images(image_paths, output_filename, resolution=300.0):
"""
Creates a PDF from a list of full-page images, preserving image quality
by creating pages of the same size as the images.
"""
if not image_paths:
return False
try:
pdf_pages = []
for image_path in image_paths:
try:
                with Image.open(image_path) as img:
                    # Composite onto a white page of the same size. A bare
                    # convert('RGB') would drop any alpha channel onto
                    # black, so transparent images are pasted with their
                    # alpha as the mask instead.
                    page = Image.new('RGB', img.size, 'white')
                    if img.mode in ('RGBA', 'LA') or (img.mode == 'P' and 'transparency' in img.info):
                        rgba = img.convert('RGBA')
                        page.paste(rgba, (0, 0), rgba)
                    else:
                        page.paste(img.convert('RGB'), (0, 0))
                    pdf_pages.append(page)
except Exception as e:
print(f"Error opening or processing image {image_path}: {e}")
if not pdf_pages:
return False
# Save the first page and append the rest
pdf_pages[0].save(
output_filename,
"PDF",
save_all=True,
append_images=pdf_pages[1:],
resolution=resolution
)
return True
except Exception as e:
print(f"Error saving final PDF: {e}")
return False
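
# Usage sketch: bundle two page scans (hypothetical file names) into one PDF.
#
#   ok = create_pdf_from_full_images(["page1.jpg", "page2.jpg"],
#                                    "report.pdf", resolution=300.0)
#   if not ok:
#       print("PDF generation failed")
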
def remove_color_from_image(image_path, target_colors, threshold, bg_mode, region_box=None):
    """
    Isolates the given target colors using CIELAB Delta E distance:
    pixels within the threshold of any target color (plus any pixels
    outside `region_box`, when one is given) are kept; everything else
    is replaced with the chosen background.
    Uses a manual RGB->Lab conversion to strictly match the frontend JS
    logic (standard CIELAB, D65 white point).
    """
    # Read image (OpenCV loads as BGR); IMREAD_UNCHANGED keeps any alpha.
    img = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")
    # Normalize to BGRA: grayscale images have no channel axis at all,
    # and 3-channel images need alpha for transparent output.
    if img.ndim == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGRA)
    elif img.shape[2] == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2BGRA)
# 1. PREPARE IMAGE (BGR -> RGB -> Normalized Float)
# We work on a copy for calculation
img_bgr = img[:, :, :3]
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    # Normalize to 0-1; the frontend JS divides each channel by 255
    # before applying the same conversion.
    rgb_norm = img_rgb.astype(np.float32) / 255.0
# 2. RGB to XYZ (Vectorized)
# Formula matches JS: r = (r > 0.04045) ? ...
mask_linear = rgb_norm > 0.04045
rgb_linear = np.where(mask_linear, np.power((rgb_norm + 0.055) / 1.055, 2.4), rgb_norm / 12.92)
R, G, B = rgb_linear[:,:,0], rgb_linear[:,:,1], rgb_linear[:,:,2]
X = R * 0.4124 + G * 0.3576 + B * 0.1805
Y = R * 0.2126 + G * 0.7152 + B * 0.0722
Z = R * 0.0193 + G * 0.1192 + B * 0.9505
# Scale XYZ
X /= 0.95047
Y /= 1.00000
Z /= 1.08883
# 3. XYZ to Lab
# Formula: x = (x > 0.008856) ? ...
xyz_stack = np.stack([X, Y, Z], axis=-1)
mask_xyz = xyz_stack > 0.008856
f_xyz = np.where(mask_xyz, np.power(xyz_stack, 1/3), (7.787 * xyz_stack) + 16/116)
fx, fy, fz = f_xyz[:,:,0], f_xyz[:,:,1], f_xyz[:,:,2]
L_chn = (116.0 * fy) - 16.0
a_chn = 500.0 * (fx - fy)
b_chn = 200.0 * (fy - fz)
# 4. CALCULATE DISTANCE
# Threshold mapping matches frontend
max_delta_e = 110.0 - (float(threshold) * 100.0)
max_dist_sq = max_delta_e ** 2
final_keep_mask = np.zeros(L_chn.shape, dtype=bool)
    if target_colors:
        # Convert each target color (RGB -> Lab) with the same math as
        # above; there are only a few targets, so a Python loop is fine.
        for c in target_colors:
            # Normalize
            r, g, b = c['r']/255.0, c['g']/255.0, c['b']/255.0
            # Linearize (sRGB gamma)
            r = ((r + 0.055) / 1.055) ** 2.4 if r > 0.04045 else r / 12.92
            g = ((g + 0.055) / 1.055) ** 2.4 if g > 0.04045 else g / 12.92
            b = ((b + 0.055) / 1.055) ** 2.4 if b > 0.04045 else b / 12.92
            # XYZ (D65 white point)
            x = (r * 0.4124 + g * 0.3576 + b * 0.1805) / 0.95047
            y = (r * 0.2126 + g * 0.7152 + b * 0.0722) / 1.00000
            z = (r * 0.0193 + g * 0.1192 + b * 0.9505) / 1.08883
            # Lab (scalars prefixed with t to avoid shadowing the fx/fy/fz
            # arrays computed above)
            tfx = x ** (1/3) if x > 0.008856 else (7.787 * x) + 16/116
            tfy = y ** (1/3) if y > 0.008856 else (7.787 * y) + 16/116
            tfz = z ** (1/3) if z > 0.008856 else (7.787 * z) + 16/116
            tL = (116.0 * tfy) - 16.0
            ta = 500.0 * (tfx - tfy)
            tb = 200.0 * (tfy - tfz)
            # Keep pixels whose squared Delta E to this target is within
            # the threshold.
            dist_sq = (L_chn - tL)**2 + (a_chn - ta)**2 + (b_chn - tb)**2
            final_keep_mask |= (dist_sq <= max_dist_sq)
# Handle Region Box
if region_box:
h, w = img.shape[:2]
rx = int(region_box['x'] * w)
ry = int(region_box['y'] * h)
rw = int(region_box['w'] * w)
rh = int(region_box['h'] * h)
# Mask is TRUE everywhere EXCEPT the region (Keep outside)
region_protection_mask = np.ones(L_chn.shape, dtype=bool)
# Ensure coords are within bounds
ry = max(0, ry); rx = max(0, rx)
if rw > 0 and rh > 0:
region_protection_mask[ry:ry+rh, rx:rx+rw] = False
final_keep_mask |= region_protection_mask
    # Apply the mask: replaced pixels get bg_color (BGRA channel order).
result = img.copy()
if bg_mode == 'black':
bg_color = [0, 0, 0, 255]
elif bg_mode == 'white':
bg_color = [255, 255, 255, 255]
else: # transparent
bg_color = [0, 0, 0, 0]
remove_mask = ~final_keep_mask
result[remove_mask] = bg_color
return result
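
# Usage sketch: keep only blue-ish ink inside the top half of a scan
# (pixels outside region_box are always kept) and paint everything else
# white. The color/region dict shapes match what the function reads; the
# file names are hypothetical.
#
#   blue = {'r': 30, 'g': 60, 'b': 180}
#   out = remove_color_from_image(
#       "scan.png", [blue], threshold=0.5, bg_mode='white',
#       region_box={'x': 0.0, 'y': 0.0, 'w': 1.0, 'h': 0.5})
#   cv2.imwrite("ink_only.png", out)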