Spaces:

pierreguillou
/

arquiteturia

Running

App Files Files Community

pierreguillou committed 22 days ago

Commit

c1c25ed

•

1 Parent(s): ac16355

Create text_extraction.py

Browse files

Files changed (1) hide show

helpers/text_extraction.py +95 -0

helpers/text_extraction.py ADDED Viewed

	@@ -0,0 +1,95 @@

+## PDF processing up to text extraction
+import os
+import shutil
+import fitz
+from PIL import Image
+import numpy as np
+import cv2
+import pytesseract
+from pytesseract import Output
+import zipfile
+from pdf2image import convert_from_path
+import json
+def convert_to_rgb(image_path):
+    img = Image.open(image_path)
+    rgb_img = img.convert("RGB")
+    return rgb_img
+def preprocess_image(image):
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+    denoised = cv2.fastNlMeansDenoising(binary, None, 30, 7, 21)
+    resized = cv2.resize(denoised, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
+    return resized
+def extract_vertical_blocks(image):
+    image_np = np.array(image)
+    data = pytesseract.image_to_data(image_np, lang='fra', output_type=Output.DICT)
+    blocks = []
+    current_block = ""
+    current_block_coords = [float('inf'), float('inf'), 0, 0]
+    last_bottom = -1
+    line_height = 0
+    for i in range(len(data['text'])):
+        if int(data['conf'][i]) > 0:
+            text = data['text'][i]
+            x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
+            if line_height == 0:
+                line_height = h * 1.2
+            if y > last_bottom + line_height:
+                if current_block:
+                    blocks.append({
+                        "text": current_block.strip(),
+                        "coords": current_block_coords
+                    })
+                    current_block = ""
+                    current_block_coords = [float('inf'), float('inf'), 0, 0]
+            current_block += text + " "
+            current_block_coords[0] = min(current_block_coords[0], x)
+            current_block_coords[1] = min(current_block_coords[1], y)
+            current_block_coords[2] = max(current_block_coords[2], x + w)
+            current_block_coords[3] = max(current_block_coords[3], y + h)
+            last_bottom = y + h
+    if current_block:
+        blocks.append({
+            "text": current_block.strip(),
+            "coords": current_block_coords
+        })
+    return blocks
+def draw_blocks_on_image(image_path, blocks, output_path):
+    image = cv2.imread(image_path)
+    for block in blocks:
+        coords = block['coords']
+        cv2.rectangle(image, (coords[0], coords[1]), (coords[2], coords[3]), (0, 0, 255), 2)
+    cv2.imwrite(output_path, image)
+    return output_path
+def process_image(image, output_folder, page_number):
+    image = convert_to_rgb(image)
+    blocks = extract_vertical_blocks(image)
+    base_name = f'page_{page_number + 1}.png'
+    image_path = os.path.join(output_folder, base_name)
+    image.save(image_path)
+    annotated_image_path = os.path.join(output_folder, f'annotated_{base_name}')
+    annotated_image_path = draw_blocks_on_image(image_path, blocks, annotated_image_path)
+    return blocks, annotated_image_path
+def save_extracted_text(blocks, page_number, output_folder):
+    text_file_path = os.path.join(output_folder, 'extracted_text.txt')
+    with open(text_file_path, 'a', encoding='utf-8') as f:
+        f.write(f"[PAGE {page_number}]\n")
+        for block in blocks:
+            f.write(block['text'] + "\n")
+        f.write("[FIN DE PAGE]\n\n")
+    return text_file_path