Spaces:
Running
Running
pierreguillou
committed on
Commit
•
c1c25ed
1
Parent(s):
ac16355
Create text_extraction.py
Browse files- helpers/text_extraction.py +95 -0
helpers/text_extraction.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## PDF processing up to text extraction
|
2 |
+
|
3 |
+
import os
|
4 |
+
import shutil
|
5 |
+
import fitz
|
6 |
+
from PIL import Image
|
7 |
+
import numpy as np
|
8 |
+
import cv2
|
9 |
+
import pytesseract
|
10 |
+
from pytesseract import Output
|
11 |
+
import zipfile
|
12 |
+
from pdf2image import convert_from_path
|
13 |
+
import json
|
14 |
+
|
15 |
+
def convert_to_rgb(image_path):
    """Open an image and return a copy of it in RGB mode.

    Args:
        image_path: Value passed straight to ``PIL.Image.open`` (the caller
            in this file supplies the location of a page image).

    Returns:
        A new ``PIL.Image.Image`` in ``"RGB"`` mode.
    """
    # Image.open is lazy and keeps the underlying file open. convert() loads
    # the pixel data and returns an independent image, so the source can be
    # closed as soon as the conversion is done instead of waiting for GC.
    with Image.open(image_path) as img:
        return img.convert("RGB")
|
19 |
+
|
20 |
+
def preprocess_image(image):
    """Binarize, denoise and upscale an image before OCR.

    Args:
        image: Image as a NumPy array. Multi-channel input is converted with
            ``cv2.COLOR_BGR2GRAY`` (i.e. it is treated as BGR); a 2-D array
            is taken as already grayscale and used as is.

    Returns:
        The processed single-channel array: Otsu-thresholded, denoised and
        scaled by a factor of 2 along both axes with bicubic interpolation.
    """
    # cvtColor(BGR2GRAY) rejects single-channel input, so 2-D arrays are
    # passed through untouched instead of raising.
    if image.ndim == 2:
        gray = image
    else:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # With THRESH_OTSU the threshold argument (0) is ignored; the threshold
    # is chosen automatically from the image histogram.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Positional arguments after dst: h=30, templateWindowSize=7,
    # searchWindowSize=21.
    denoised = cv2.fastNlMeansDenoising(binary, None, 30, 7, 21)

    return cv2.resize(denoised, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
|
26 |
+
|
27 |
+
def extract_vertical_blocks(image, lang='fra'):
    """Run OCR on an image and group the words into vertically separated blocks.

    Words are visited in the order Tesseract reports them. A new block is
    started whenever a word's top edge lies more than one "line height"
    below the bottom edge of the previously accepted word. The line height
    is derived once, as 1.2 times the height of the first accepted word.

    Args:
        image: Image to analyse; it is converted with ``np.array`` before
            being handed to ``pytesseract.image_to_data``.
        lang: Tesseract language code used for recognition. Defaults to
            ``'fra'``, the value that was previously hard-coded.

    Returns:
        A list of dicts, one per block, each with:
            ``"text"``: the block's words joined by single spaces, stripped.
            ``"coords"``: ``[left, top, right, bottom]`` bounding box that
            encloses every word of the block.
    """
    image_np = np.array(image)
    data = pytesseract.image_to_data(image_np, lang=lang, output_type=Output.DICT)

    blocks = []
    words = []
    coords = [float('inf'), float('inf'), 0, 0]
    last_bottom = -1
    line_height = 0

    rows = zip(
        data['text'], data['conf'], data['left'],
        data['top'], data['width'], data['height'],
    )
    for text, conf, x, y, w, h in rows:
        # Entries with a confidence <= 0 are skipped (Tesseract's TSV output
        # uses -1 for rows that are not recognized words). Going through
        # float() first tolerates confidences delivered as fractional
        # strings such as '96.06', on which a bare int() raises ValueError.
        if int(float(conf)) <= 0:
            continue

        if line_height == 0:
            line_height = h * 1.2

        # A vertical gap larger than one line height closes the current block.
        if y > last_bottom + line_height:
            if words:
                blocks.append({
                    "text": " ".join(words).strip(),
                    "coords": coords,
                })
            words = []
            coords = [float('inf'), float('inf'), 0, 0]

        words.append(text)
        coords[0] = min(coords[0], x)
        coords[1] = min(coords[1], y)
        coords[2] = max(coords[2], x + w)
        coords[3] = max(coords[3], y + h)

        last_bottom = y + h

    # Flush the block that was still open when the words ran out.
    if words:
        blocks.append({
            "text": " ".join(words).strip(),
            "coords": coords,
        })

    return blocks
|
69 |
+
|
70 |
+
def draw_blocks_on_image(image_path, blocks, output_path):
    """Draw a rectangle around every block and save the annotated image.

    Args:
        image_path: Path of the image to annotate, read with ``cv2.imread``.
        blocks: Iterable of dicts whose ``'coords'`` entry holds
            ``[left, top, right, bottom]``.
        output_path: Destination path for the annotated image.

    Returns:
        ``output_path``, unchanged.

    Raises:
        OSError: If the source image cannot be read or the annotated image
            cannot be written.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising.
    if image is None:
        raise OSError(f"Could not read image: {image_path}")

    for block in blocks:
        # cv2.rectangle needs integer pixel coordinates.
        left, top, right, bottom = (int(c) for c in block['coords'][:4])
        # (0, 0, 255) is red in the BGR channel order used by cv2.imread.
        cv2.rectangle(image, (left, top), (right, bottom), (0, 0, 255), 2)

    # cv2.imwrite reports failure through its boolean return value.
    if not cv2.imwrite(output_path, image):
        raise OSError(f"Could not write image: {output_path}")
    return output_path
|
77 |
+
|
78 |
+
def process_image(image, output_folder, page_number):
    """OCR one page image and write both a plain and an annotated PNG copy.

    Args:
        image: Source image, forwarded to ``convert_to_rgb`` (which opens it
            with ``PIL.Image.open``).
        output_folder: Directory that receives ``page_<n>.png`` and
            ``annotated_page_<n>.png``; it is created if it does not exist.
        page_number: Page index; the file names use ``page_number + 1``.

    Returns:
        A tuple ``(blocks, annotated_image_path)``: the text blocks returned
        by ``extract_vertical_blocks`` and the path of the annotated image.
    """
    rgb_image = convert_to_rgb(image)
    blocks = extract_vertical_blocks(rgb_image)

    # Saving fails if the target directory is missing. An empty folder name
    # means "current directory" for os.path.join, and os.makedirs('') would
    # raise, so only create the folder when a name is given.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    base_name = f'page_{page_number + 1}.png'
    image_path = os.path.join(output_folder, base_name)
    rgb_image.save(image_path)

    annotated_image_path = os.path.join(output_folder, f'annotated_{base_name}')
    annotated_image_path = draw_blocks_on_image(image_path, blocks, annotated_image_path)
    return blocks, annotated_image_path
|
87 |
+
|
88 |
+
def save_extracted_text(blocks, page_number, output_folder):
    """Append one page of extracted text to ``extracted_text.txt``.

    The page is written as a ``[PAGE <page_number>]`` header line, one line
    per block, and a ``[FIN DE PAGE]`` footer followed by a blank line.

    Args:
        blocks: Iterable of dicts with a ``'text'`` entry.
        page_number: Value inserted into the page header as is.
        output_folder: Directory containing (or receiving) the text file.

    Returns:
        The path of the text file.

    Raises:
        KeyError: If a block has no ``'text'`` entry; nothing is written
            in that case.
    """
    text_file_path = os.path.join(output_folder, 'extracted_text.txt')

    # Build the whole page before touching the file, so that a malformed
    # block cannot leave a header without its footer in the append-mode file.
    lines = [f"[PAGE {page_number}]\n"]
    lines.extend(block['text'] + "\n" for block in blocks)
    lines.append("[FIN DE PAGE]\n\n")

    with open(text_file_path, 'a', encoding='utf-8') as f:
        f.writelines(lines)
    return text_file_path
|