# Spaces:
# Running
# Running
## PDF processing up to text extraction
import os | |
import shutil | |
import fitz | |
from PIL import Image | |
import numpy as np | |
import cv2 | |
import pytesseract | |
from pytesseract import Output | |
import zipfile | |
from pdf2image import convert_from_path | |
import json | |
def convert_to_rgb(image_path):
    """Open an image and return a copy of it converted to RGB mode.

    Args:
        image_path: Whatever ``PIL.Image.open`` accepts as its source.

    Returns:
        The image produced by ``convert("RGB")``.
    """
    # Use a context manager so the source file is closed as soon as the
    # conversion is done instead of waiting for garbage collection.
    with Image.open(image_path) as img:
        return img.convert("RGB")
def preprocess_image(image):
    """Binarise, denoise and enlarge an image.

    The steps, in order: BGR-to-grayscale conversion, Otsu thresholding to a
    0/255 binary image, non-local-means denoising, and a 2x cubic upscale.

    Args:
        image: Input accepted by ``cv2.cvtColor`` with ``COLOR_BGR2GRAY``.

    Returns:
        The processed image, twice as large along both axes.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # With THRESH_OTSU the threshold is chosen automatically, so the explicit
    # threshold argument (0) is only a placeholder; the first returned value
    # (the chosen threshold) is not needed here.
    otsu_mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    binarised = cv2.threshold(grayscale, 0, 255, otsu_mode)[1]

    # Positional arguments: no preallocated output, filter strength 30,
    # template window 7, search window 21.
    cleaned = cv2.fastNlMeansDenoising(binarised, None, 30, 7, 21)

    return cv2.resize(cleaned, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
def extract_vertical_blocks(image):
    """Run OCR on an image and group the recognised words into vertical blocks.

    The image is passed to Tesseract with the ``fra`` language model. Words are
    visited in the order Tesseract reports them; entries whose confidence is
    not positive are ignored. A new block starts whenever a word's top edge is
    more than one line height below the bottom edge of the previously kept
    word. The line height is 1.2 times the height of the first kept word with
    a non-zero height.

    Args:
        image: Anything ``np.array`` can turn into an image array.

    Returns:
        A list of dicts, one per block, each with:
            ``"text"``: the block's words joined by single spaces, stripped.
            ``"coords"``: ``[left, top, right, bottom]`` bounding box.
    """
    ocr = pytesseract.image_to_data(
        np.array(image), lang='fra', output_type=Output.DICT
    )

    blocks = []
    words = []
    bounds = [float('inf'), float('inf'), 0, 0]
    last_bottom = -1
    line_height = 0

    columns = zip(
        ocr['conf'], ocr['text'],
        ocr['left'], ocr['top'], ocr['width'], ocr['height'],
    )
    for conf, text, x, y, w, h in columns:
        # Confidences may arrive as ints, floats or fractional strings such as
        # '96.5' depending on the pytesseract/Tesseract versions in use; going
        # through float() first accepts all of them, and the outer int() keeps
        # the result identical for values a bare int() already handled.
        if int(float(conf)) <= 0:
            continue

        if line_height == 0:
            line_height = h * 1.2

        # A vertical gap larger than one line closes the current block.
        if words and y > last_bottom + line_height:
            blocks.append({
                "text": " ".join(words).strip(),
                "coords": bounds,
            })
            words = []
            bounds = [float('inf'), float('inf'), 0, 0]

        words.append(text)
        bounds[0] = min(bounds[0], x)
        bounds[1] = min(bounds[1], y)
        bounds[2] = max(bounds[2], x + w)
        bounds[3] = max(bounds[3], y + h)
        last_bottom = y + h

    # Flush the block that was still being built when the words ran out.
    if words:
        blocks.append({
            "text": " ".join(words).strip(),
            "coords": bounds,
        })
    return blocks
def draw_blocks_on_image(image_path, blocks, output_path):
    """Draw each block's bounding box on an image and write the result to disk.

    Args:
        image_path: Path of the image to read with ``cv2.imread``.
        blocks: Iterable of dicts whose ``'coords'`` entry is indexable as
            ``[left, top, right, bottom]``.
        output_path: Where the annotated image is written.

    Returns:
        ``output_path``, unchanged.
    """
    # NOTE(review): the result of cv2.imread is used without being checked.
    canvas = cv2.imread(image_path)
    outline_colour = (0, 0, 255)
    outline_thickness = 2

    for entry in blocks:
        box = entry['coords']
        top_left = (box[0], box[1])
        bottom_right = (box[2], box[3])
        cv2.rectangle(canvas, top_left, bottom_right, outline_colour, outline_thickness)

    cv2.imwrite(output_path, canvas)
    return output_path
def process_image(image, output_folder, page_number):
    """OCR one page image and save both a plain and an annotated PNG of it.

    The page is converted to RGB, its text blocks are extracted, the RGB image
    is saved as ``page_<page_number + 1>.png`` in ``output_folder``, and a copy
    with the block outlines drawn on it is saved next to it with an
    ``annotated_`` prefix.

    Args:
        image: Source handed to ``convert_to_rgb``.
        output_folder: Directory that receives both PNG files.
        page_number: Page index; 1 is added to it when naming the files.

    Returns:
        A ``(blocks, annotated_image_path)`` tuple.
    """
    rgb_page = convert_to_rgb(image)
    page_blocks = extract_vertical_blocks(rgb_page)

    page_file = f'page_{page_number + 1}.png'
    page_path = os.path.join(output_folder, page_file)
    rgb_page.save(page_path)

    # The annotated copy is drawn from the PNG that was just written.
    annotated_target = os.path.join(output_folder, f'annotated_{page_file}')
    annotated_path = draw_blocks_on_image(page_path, page_blocks, annotated_target)
    return page_blocks, annotated_path
def save_extracted_text(blocks, page_number, output_folder):
    """Append one page's block texts to ``extracted_text.txt``.

    The page is written as a ``[PAGE <page_number>]`` header line, one line per
    block, and a ``[FIN DE PAGE]`` marker followed by a blank line. The file is
    opened in append mode, so repeated calls accumulate pages.

    Args:
        blocks: Iterable of dicts with a ``'text'`` string.
        page_number: Value interpolated into the page header as-is.
        output_folder: Directory containing (or receiving) the text file.

    Returns:
        The path of the text file.
    """
    destination = os.path.join(output_folder, 'extracted_text.txt')
    with open(destination, 'a', encoding='utf-8') as out:
        out.write(f"[PAGE {page_number}]\n")
        out.writelines(entry['text'] + "\n" for entry in blocks)
        out.write("[FIN DE PAGE]\n\n")
    return destination