pierreguillou committed on
Commit
c1c25ed
1 Parent(s): ac16355

Create text_extraction.py

Browse files
Files changed (1) hide show
  1. helpers/text_extraction.py +95 -0
helpers/text_extraction.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## PDF processing up to text extraction
2
+
3
+ import os
4
+ import shutil
5
+ import fitz
6
+ from PIL import Image
7
+ import numpy as np
8
+ import cv2
9
+ import pytesseract
10
+ from pytesseract import Output
11
+ import zipfile
12
+ from pdf2image import convert_from_path
13
+ import json
14
+
15
def convert_to_rgb(image_path):
    """Return an RGB copy of an image.

    Args:
        image_path: Path (or file object) of the image to open with
            ``PIL.Image.open``. An already-loaded ``PIL.Image.Image`` is
            accepted as well and is converted directly.

    Returns:
        A new ``PIL.Image.Image`` in mode ``"RGB"``.
    """
    # Already-decoded images have no file to open; convert them in place.
    if isinstance(image_path, Image.Image):
        return image_path.convert("RGB")

    # ``Image.open`` is lazy and keeps the file handle open; the context
    # manager guarantees it is released once the pixels have been converted
    # (``convert`` returns an independent image, so closing is safe).
    with Image.open(image_path) as img:
        return img.convert("RGB")
19
+
20
def preprocess_image(image):
    """Binarise, denoise and upscale an image.

    The input is converted with ``cv2.COLOR_BGR2GRAY``, thresholded with
    Otsu's method, denoised with non-local means and finally enlarged by a
    factor of two on both axes using bicubic interpolation.

    Args:
        image: Image array accepted by ``cv2.cvtColor`` with
            ``cv2.COLOR_BGR2GRAY``.

    Returns:
        The processed single-channel image array.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Only the thresholded image is needed; the threshold value that
    # cv2.threshold returns first is discarded.
    thresholded = cv2.threshold(
        grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]

    cleaned = cv2.fastNlMeansDenoising(thresholded, None, 30, 7, 21)

    return cv2.resize(
        cleaned, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC
    )
26
+
27
def extract_vertical_blocks(image):
    """Run OCR on an image and group the recognised words into blocks.

    Words are visited in the order pytesseract reports them. A new block is
    started whenever a word's top edge lies more than one line height below
    the bottom edge of the previously kept word. The line height is
    estimated once, as 1.2 times the height of the first kept word with a
    non-zero height.

    Args:
        image: Object that ``np.array`` can turn into an image array for
            ``pytesseract.image_to_data`` (OCR is run with ``lang='fra'``).

    Returns:
        A list of dicts, one per block, each holding:
          * ``"text"``: the block's words separated by spaces, stripped.
          * ``"coords"``: ``[left, top, right, bottom]`` bounding box
            enclosing every word of the block.
    """
    image_np = np.array(image)
    data = pytesseract.image_to_data(image_np, lang='fra', output_type=Output.DICT)

    blocks = []
    words = []
    coords = [float('inf'), float('inf'), 0, 0]
    last_bottom = -1
    line_height = 0

    rows = zip(
        data['conf'],
        data['text'],
        data['left'],
        data['top'],
        data['width'],
        data['height'],
    )
    for conf, text, x, y, w, h in rows:
        # Depending on the pytesseract/Tesseract versions, confidences may
        # arrive as ints, floats or strings such as '96.06'; a bare int()
        # raises ValueError on the latter, so parse through float first.
        if int(float(conf)) <= 0:
            continue

        if line_height == 0:
            line_height = h * 1.2

        # A vertical gap larger than one line height closes the current block.
        if y > last_bottom + line_height and words:
            blocks.append({
                "text": " ".join(words).strip(),
                "coords": coords,
            })
            words = []
            coords = [float('inf'), float('inf'), 0, 0]

        words.append(text)
        coords[0] = min(coords[0], x)
        coords[1] = min(coords[1], y)
        coords[2] = max(coords[2], x + w)
        coords[3] = max(coords[3], y + h)

        last_bottom = y + h

    # Flush the block that was still being built when the words ran out.
    if words:
        blocks.append({
            "text": " ".join(words).strip(),
            "coords": coords,
        })

    return blocks
69
+
70
def draw_blocks_on_image(image_path, blocks, output_path):
    """Outline each block on an image and write the annotated copy.

    Args:
        image_path: Path of the image to read with ``cv2.imread``.
        blocks: Iterable of dicts whose ``'coords'`` entry holds
            ``[left, top, right, bottom]``.
        output_path: Path the annotated image is written to.

    Returns:
        ``output_path``, unchanged.
    """
    outline_colour = (0, 0, 255)
    outline_thickness = 2

    canvas = cv2.imread(image_path)
    for corners in (block['coords'] for block in blocks):
        top_left = (corners[0], corners[1])
        bottom_right = (corners[2], corners[3])
        cv2.rectangle(canvas, top_left, bottom_right, outline_colour, outline_thickness)

    cv2.imwrite(output_path, canvas)
    return output_path
77
+
78
def process_image(image, output_folder, page_number):
    """OCR one page image and save both the page and an annotated copy.

    The image is converted to RGB, its text blocks are extracted, the RGB
    page is saved as ``page_<page_number + 1>.png`` in ``output_folder`` and
    a copy with the block outlines drawn on it is saved next to it as
    ``annotated_page_<page_number + 1>.png``.

    Args:
        image: Image source forwarded to ``convert_to_rgb``.
        output_folder: Directory receiving the two PNG files; it is created
            if it does not exist yet.
        page_number: Page index; 1 is added to it to build the file names.

    Returns:
        A ``(blocks, annotated_image_path)`` tuple.
    """
    image = convert_to_rgb(image)
    blocks = extract_vertical_blocks(image)

    # Saving into a missing directory raises FileNotFoundError, so make sure
    # it exists. An empty folder means "current directory" for os.path.join
    # and must not be passed to makedirs.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    base_name = f'page_{page_number + 1}.png'
    image_path = os.path.join(output_folder, base_name)
    image.save(image_path)

    annotated_image_path = os.path.join(output_folder, f'annotated_{base_name}')
    annotated_image_path = draw_blocks_on_image(image_path, blocks, annotated_image_path)
    return blocks, annotated_image_path
87
+
88
def save_extracted_text(blocks, page_number, output_folder):
    """Append one page's block texts to ``extracted_text.txt``.

    The page is written as a ``[PAGE <page_number>]`` header line, one line
    per block, then a ``[FIN DE PAGE]`` line followed by a blank line. The
    file is opened in append mode, so successive calls accumulate pages.

    Args:
        blocks: Iterable of dicts with a ``'text'`` entry.
        page_number: Value written verbatim in the page header.
        output_folder: Directory holding ``extracted_text.txt``; it is
            created if it does not exist yet.

    Returns:
        The path of the text file.
    """
    # Appending into a missing directory raises FileNotFoundError, so make
    # sure it exists. An empty folder means "current directory" for
    # os.path.join and must not be passed to makedirs.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    text_file_path = os.path.join(output_folder, 'extracted_text.txt')

    lines = [f"[PAGE {page_number}]\n"]
    lines.extend(block['text'] + "\n" for block in blocks)
    lines.append("[FIN DE PAGE]\n\n")

    with open(text_file_path, 'a', encoding='utf-8') as f:
        f.writelines(lines)
    return text_file_path