Spaces:
Sleeping
Sleeping
pierreguillou
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -26,167 +26,6 @@ from helpers.text_extraction import *
|
|
26 |
|
27 |
def authenticate(username, password):
    """Check the supplied credentials against the HF_USERNAME / HF_PASSWORD env vars."""
    expected = (os.getenv("HF_USERNAME"), os.getenv("HF_PASSWORD"))
    return (username, password) == expected
|
29 |
-
|
30 |
-
# Helper Functions
|
31 |
-
def convert_to_rgb(image_path):
    """Open the image at *image_path* and return it converted to RGB mode.

    Uses a context manager so the underlying file handle is released
    promptly instead of lingering until garbage collection (the original
    opened the file and never closed it). ``convert()`` forces the pixel
    data to load and returns a new image, so the result remains valid
    after the source file is closed.
    """
    with Image.open(image_path) as img:
        return img.convert("RGB")
|
35 |
-
|
36 |
-
def preprocess_image(image):
    """Prepare a BGR image for OCR.

    Pipeline: grayscale -> Otsu binarization -> non-local-means denoising
    -> 2x cubic upscale. Returns the upscaled single-channel image.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binarized = cv2.threshold(grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    cleaned = cv2.fastNlMeansDenoising(binarized, None, 30, 7, 21)
    return cv2.resize(cleaned, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
|
42 |
-
|
43 |
-
def extract_vertical_blocks(image):
    """Run French OCR on *image* and group recognized words into vertical blocks.

    Words are accumulated into one block until a vertical gap larger than the
    estimated line height is seen, at which point the block is flushed.
    Returns a list of dicts: {"text": str, "coords": [x_min, y_min, x_max, y_max]}.
    """
    image_np = np.array(image)
    # Word-level OCR results (text, confidence, bounding boxes) from Tesseract.
    data = pytesseract.image_to_data(image_np, lang='fra', output_type=Output.DICT)

    blocks = []
    current_block = ""
    # [x_min, y_min, x_max, y_max]; inf sentinels so min()/max() adopt the first real coords.
    current_block_coords = [float('inf'), float('inf'), 0, 0]
    last_bottom = -1
    line_height = 0

    for i in range(len(data['text'])):
        # Skip entries Tesseract marked as non-text (confidence <= 0).
        if int(data['conf'][i]) > 0:
            text = data['text'][i]
            x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]

            # Estimate the line height once, from the first confident word
            # (1.2x its height; never re-estimated afterwards).
            if y > last_bottom + line_height if False else line_height == 0:
                line_height = h * 1.2

            # A vertical gap wider than one line starts a new block.
            if y > last_bottom + line_height:
                if current_block:
                    blocks.append({
                        "text": current_block.strip(),
                        "coords": current_block_coords
                    })
                    current_block = ""
                    current_block_coords = [float('inf'), float('inf'), 0, 0]

            # Grow the current block with this word and widen its bounding box.
            current_block += text + " "
            current_block_coords[0] = min(current_block_coords[0], x)
            current_block_coords[1] = min(current_block_coords[1], y)
            current_block_coords[2] = max(current_block_coords[2], x + w)
            current_block_coords[3] = max(current_block_coords[3], y + h)

            last_bottom = y + h

    # Flush the trailing block, if any.
    if current_block:
        blocks.append({
            "text": current_block.strip(),
            "coords": current_block_coords
        })

    return blocks
|
85 |
-
|
86 |
-
def draw_blocks_on_image(image_path, blocks, output_path):
    """Outline every block with a red rectangle and save the annotated image.

    Reads the image from *image_path*, draws one 2px red box per block's
    [x_min, y_min, x_max, y_max] coords, writes to *output_path*, and
    returns *output_path*.
    """
    canvas = cv2.imread(image_path)
    for entry in blocks:
        x_min, y_min, x_max, y_max = entry['coords']
        cv2.rectangle(canvas, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)
    cv2.imwrite(output_path, canvas)
    return output_path
|
93 |
-
|
94 |
-
def process_image(image, output_folder, page_number):
    """Process one PDF page image: save it as PNG, extract its text blocks,
    and write an annotated copy with the blocks outlined.

    *page_number* is zero-based; the saved file is named page_{n+1}.png.
    Returns (blocks, annotated_image_path).
    """
    rgb_page = convert_to_rgb(image)
    blocks = extract_vertical_blocks(rgb_page)

    file_name = f'page_{page_number + 1}.png'
    saved_path = os.path.join(output_folder, file_name)
    rgb_page.save(saved_path)

    annotated_path = draw_blocks_on_image(
        saved_path, blocks, os.path.join(output_folder, f'annotated_{file_name}')
    )
    return blocks, annotated_path
|
103 |
-
|
104 |
-
def save_extracted_text(blocks, page_number, output_folder):
    """Append one page's OCR text to extracted_text.txt in *output_folder*.

    The page is framed by "[PAGE n]" and "[FIN DE PAGE]" markers, one block
    per line. Returns the path of the text file.
    """
    out_path = os.path.join(output_folder, 'extracted_text.txt')
    lines = [f"[PAGE {page_number}]\n"]
    lines.extend(entry['text'] + "\n" for entry in blocks)
    lines.append("[FIN DE PAGE]\n\n")
    with open(out_path, 'a', encoding='utf-8') as fh:
        fh.writelines(lines)
    return out_path
|
112 |
-
|
113 |
-
# Gemini Functions
|
114 |
-
def initialize_gemini():
    """Configure the Gemini client from GEMINI_API_KEY and return a ready
    gemini-1.5-pro model; any failure is surfaced as a Gradio error."""
    try:
        genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
        return genai.GenerativeModel(
            model_name="gemini-1.5-pro",
            generation_config={
                "temperature": 1,
                "top_p": 0.95,
                "top_k": 40,
                "max_output_tokens": 8192,
                "response_mime_type": "text/plain",
            },
        )
    except Exception as e:
        raise gr.Error(f"Error initializing Gemini: {str(e)}")
|
131 |
-
|
132 |
-
def create_prompt(extracted_text: str, path_to_data_to_extract: str) -> str:
    """Build the French legal-extraction prompt for Gemini.

    Loads the JSON description of the fields to extract from
    *path_to_data_to_extract* and embeds it, together with the stripped
    document text, into the instruction template.
    """
    with open(path_to_data_to_extract, 'r', encoding='utf-8') as fh:
        data_to_extract = json.load(fh)

    return f"""Tu es un assistant juridique expert en analyse de documents judiciaires français.
Je vais te fournir le contenu d'un document judiciaire extrait d'un PDF.
Ta tâche est d'analyser ce texte et d'en extraire les informations suivantes de manière précise :

{json.dumps(data_to_extract, indent=2, ensure_ascii=False)}

Voici quelques règles à suivre :
- Si une information n'est pas présente dans le texte, indique "Non spécifié" pour cette catégorie.
- Pour les noms des parties (demandeurs et défendeurs, et leurs avocats), liste tous ceux que tu trouves
- Assure-toi de différencier correctement les demandeurs des défendeurs.
- Si tu n'es pas sûr d'une information, indique-le clairement.

Présente tes résultats sous forme de JSON, en utilisant les catégories mentionnées ci-dessus.

Voici le contenu du document :

{extracted_text.strip()}

Analyse ce texte et fournis-moi les informations demandées au format JSON uniquement.""".strip()
|
159 |
-
|
160 |
-
def extract_data_with_gemini(text_file_path: str, path_to_data_to_extract: str) -> dict:
    """Send the extracted document text to Gemini and parse its JSON answer.

    Reads the OCR text from *text_file_path*, builds the prompt with the
    field description at *path_to_data_to_extract*, and returns the parsed
    JSON as a dict. If the model's answer cannot be parsed, returns
    {"error": ..., "raw_response": ...} instead of raising.

    Raises:
        gr.Error: for any other failure (model init, I/O, API call).
    """
    try:
        model = initialize_gemini()

        # Read the extracted text.
        with open(text_file_path, 'r', encoding='utf-8') as f:
            extracted_text = f.read()

        # Create the prompt and query the model.
        prompt = create_prompt(extracted_text, path_to_data_to_extract)
        response = model.generate_content(prompt)

        # The model usually wraps its answer in a ```json fenced block;
        # strip the fences before parsing.
        try:
            json_str = response.text
            if "json" in json_str.lower():
                # NOTE(review): the membership check is case-insensitive but the
                # split is case-sensitive, so an upper-case "JSON" marker raises
                # IndexError and falls through to the error payload below
                # (pre-existing behavior, kept intact).
                json_str = json_str.split("json")[1].split("```")[0]
            elif "```" in json_str:
                json_str = json_str.split("```")[1]
            result = json.loads(json_str)
        except (ValueError, IndexError):
            # ValueError covers json.JSONDecodeError; IndexError covers the
            # fence-splitting above. The original bare `except:` swallowed
            # even KeyboardInterrupt/SystemExit — narrowed here while keeping
            # the same best-effort fallback.
            result = {"error": "Failed to parse JSON response", "raw_response": response.text}

        return result
    except Exception as e:
        raise gr.Error(f"Error in Gemini processing: {str(e)}")
|
188 |
-
|
189 |
-
|
190 |
|
191 |
# Main Processing Function
|
192 |
def process_pdf(pdf_file):
|
|
|
26 |
|
27 |
def authenticate(username, password):
|
28 |
return username == os.getenv("HF_USERNAME") and password == os.getenv("HF_PASSWORD")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
# Main Processing Function
|
31 |
def process_pdf(pdf_file):
|