pierreguillou committed on
Commit
3b47eef
·
verified ·
1 Parent(s): 96918b2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -161
app.py CHANGED
@@ -26,167 +26,6 @@ from helpers.text_extraction import *
26
 
27
  def authenticate(username, password):
28
  return username == os.getenv("HF_USERNAME") and password == os.getenv("HF_PASSWORD")
29
-
30
- # Helper Functions
31
- def convert_to_rgb(image_path):
32
- img = Image.open(image_path)
33
- rgb_img = img.convert("RGB")
34
- return rgb_img
35
-
36
- def preprocess_image(image):
37
- gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
38
- _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
39
- denoised = cv2.fastNlMeansDenoising(binary, None, 30, 7, 21)
40
- resized = cv2.resize(denoised, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
41
- return resized
42
-
43
- def extract_vertical_blocks(image):
44
- image_np = np.array(image)
45
- data = pytesseract.image_to_data(image_np, lang='fra', output_type=Output.DICT)
46
-
47
- blocks = []
48
- current_block = ""
49
- current_block_coords = [float('inf'), float('inf'), 0, 0]
50
- last_bottom = -1
51
- line_height = 0
52
-
53
- for i in range(len(data['text'])):
54
- if int(data['conf'][i]) > 0:
55
- text = data['text'][i]
56
- x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
57
-
58
- if line_height == 0:
59
- line_height = h * 1.2
60
-
61
- if y > last_bottom + line_height:
62
- if current_block:
63
- blocks.append({
64
- "text": current_block.strip(),
65
- "coords": current_block_coords
66
- })
67
- current_block = ""
68
- current_block_coords = [float('inf'), float('inf'), 0, 0]
69
-
70
- current_block += text + " "
71
- current_block_coords[0] = min(current_block_coords[0], x)
72
- current_block_coords[1] = min(current_block_coords[1], y)
73
- current_block_coords[2] = max(current_block_coords[2], x + w)
74
- current_block_coords[3] = max(current_block_coords[3], y + h)
75
-
76
- last_bottom = y + h
77
-
78
- if current_block:
79
- blocks.append({
80
- "text": current_block.strip(),
81
- "coords": current_block_coords
82
- })
83
-
84
- return blocks
85
-
86
- def draw_blocks_on_image(image_path, blocks, output_path):
87
- image = cv2.imread(image_path)
88
- for block in blocks:
89
- coords = block['coords']
90
- cv2.rectangle(image, (coords[0], coords[1]), (coords[2], coords[3]), (0, 0, 255), 2)
91
- cv2.imwrite(output_path, image)
92
- return output_path
93
-
94
- def process_image(image, output_folder, page_number):
95
- image = convert_to_rgb(image)
96
- blocks = extract_vertical_blocks(image)
97
- base_name = f'page_{page_number + 1}.png'
98
- image_path = os.path.join(output_folder, base_name)
99
- image.save(image_path)
100
- annotated_image_path = os.path.join(output_folder, f'annotated_{base_name}')
101
- annotated_image_path = draw_blocks_on_image(image_path, blocks, annotated_image_path)
102
- return blocks, annotated_image_path
103
-
104
- def save_extracted_text(blocks, page_number, output_folder):
105
- text_file_path = os.path.join(output_folder, 'extracted_text.txt')
106
- with open(text_file_path, 'a', encoding='utf-8') as f:
107
- f.write(f"[PAGE {page_number}]\n")
108
- for block in blocks:
109
- f.write(block['text'] + "\n")
110
- f.write("[FIN DE PAGE]\n\n")
111
- return text_file_path
112
-
113
- # Gemini Functions
114
- def initialize_gemini():
115
- try:
116
- genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
117
- generation_config = {
118
- "temperature": 1,
119
- "top_p": 0.95,
120
- "top_k": 40,
121
- "max_output_tokens": 8192,
122
- "response_mime_type": "text/plain",
123
- }
124
- model = genai.GenerativeModel(
125
- model_name="gemini-1.5-pro",
126
- generation_config=generation_config,
127
- )
128
- return model
129
- except Exception as e:
130
- raise gr.Error(f"Error initializing Gemini: {str(e)}")
131
-
132
- def create_prompt(extracted_text: str, path_to_data_to_extract: str) -> str:
133
-
134
- # load data to extract
135
- with open(path_to_data_to_extract, 'r', encoding='utf-8') as file:
136
- data_to_extract = json.load(file)
137
-
138
- prompt = f"""Tu es un assistant juridique expert en analyse de documents judiciaires français.
139
- Je vais te fournir le contenu d'un document judiciaire extrait d'un PDF.
140
- Ta tâche est d'analyser ce texte et d'en extraire les informations suivantes de manière précise :
141
-
142
- {json.dumps(data_to_extract, indent=2, ensure_ascii=False)}
143
-
144
- Voici quelques règles à suivre :
145
- - Si une information n'est pas présente dans le texte, indique "Non spécifié" pour cette catégorie.
146
- - Pour les noms des parties (demandeurs et défendeurs, et leurs avocats), liste tous ceux que tu trouves
147
- - Assure-toi de différencier correctement les demandeurs des défendeurs.
148
- - Si tu n'es pas sûr d'une information, indique-le clairement.
149
-
150
- Présente tes résultats sous forme de JSON, en utilisant les catégories mentionnées ci-dessus.
151
-
152
- Voici le contenu du document :
153
-
154
- {extracted_text.strip()}
155
-
156
- Analyse ce texte et fournis-moi les informations demandées au format JSON uniquement.""".strip()
157
-
158
- return prompt
159
-
160
- def extract_data_with_gemini(text_file_path: str, path_to_data_to_extract: str) -> dict:
161
- try:
162
- # Initialize Gemini
163
- model = initialize_gemini()
164
-
165
- # Read the extracted text
166
- with open(text_file_path, 'r', encoding='utf-8') as f:
167
- extracted_text = f.read()
168
-
169
- # Create prompt and get response
170
- prompt = create_prompt(extracted_text, path_to_data_to_extract)
171
- response = model.generate_content(prompt)
172
-
173
- # Parse the JSON response
174
- try:
175
- # Extract JSON from the response text
176
- json_str = response.text
177
- if "json" in json_str.lower():
178
- json_str = json_str.split("json")[1].split("```")[0]
179
- elif "```" in json_str:
180
- json_str = json_str.split("```")[1]
181
- result = json.loads(json_str)
182
- except:
183
- result = {"error": "Failed to parse JSON response", "raw_response": response.text}
184
-
185
- return result
186
- except Exception as e:
187
- raise gr.Error(f"Error in Gemini processing: {str(e)}")
188
-
189
-
190
 
191
  # Main Processing Function
192
  def process_pdf(pdf_file):
 
26
 
27
  def authenticate(username, password):
28
  return username == os.getenv("HF_USERNAME") and password == os.getenv("HF_PASSWORD")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  # Main Processing Function
31
  def process_pdf(pdf_file):