Ronan committed on
Commit
dd6a24d
1 Parent(s): ec6dd69

feat: add new filters

Browse files
__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ
 
country_by_country/utils/__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/country_by_country/utils/__pycache__/utils.cpython-310.pyc and b/country_by_country/utils/__pycache__/utils.cpython-310.pyc differ
 
extract_config.yaml CHANGED
@@ -4,7 +4,8 @@ pagefilter:
4
  modelfile: random_forest_model_low_false_positive.joblib
5
 
6
  table_extraction:
 
7
  - type: Unstructured
8
  params:
9
  hi_res_model_name: "yolox"
10
- pdf_image_dpi: 300
 
4
  modelfile: random_forest_model_low_false_positive.joblib
5
 
6
  table_extraction:
7
+ - type: LlamaParse
8
  - type: Unstructured
9
  params:
10
  hi_res_model_name: "yolox"
11
+ pdf_image_dpi: 300
pages/0_Import_File.py CHANGED
@@ -7,7 +7,7 @@ import yaml
7
  import copy
8
  from menu import display_pages_menu, display_config
9
  from pypdf import PdfReader
10
- from utils import get_pdf_iframe, set_state
11
 
12
  from country_by_country.processor import ReportProcessor
13
 
@@ -18,6 +18,48 @@ def set_page_filter(value: dict):
18
  set_state(["config", "pagefilter"], value)
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  st.set_page_config(layout="wide", page_title="Accueil - upload de PDF")
22
  st.title("Country by Country Tax Reporting analysis")
23
  st.subheader(
@@ -27,6 +69,23 @@ display_pages_menu()
27
 
28
  mytmpfile = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  with st.sidebar:
31
 
32
  st.markdown("# PDF Upload")
@@ -34,13 +93,10 @@ with st.sidebar:
34
  st.markdown("## PDF Report to process")
35
  original_pdf = st.file_uploader(
36
  "Upload a pdf document containing financial table : ",
 
 
37
  )
38
 
39
- if original_pdf is not None:
40
- mytmpfile.write(original_pdf.read())
41
- st.session_state["working_file_pdf"] = mytmpfile
42
- st.session_state["original_pdf_name"] = original_pdf.name
43
-
44
  if "original_pdf_name" in st.session_state:
45
  st.markdown(
46
  "Already loaded file : " + st.session_state["original_pdf_name"],
@@ -50,7 +106,10 @@ with st.sidebar:
50
  # Upload personalized config if required
51
  loaded_config = st.file_uploader(
52
  "Upload a config if the default config doesn't suit you :",
 
 
53
  )
 
54
  if loaded_config is not None:
55
  if not loaded_config.name.endswith(".yaml"):
56
  st.error("Please upload a yaml file")
@@ -69,26 +128,28 @@ with st.sidebar:
69
  loaded_config = None
70
 
71
  # Extract config
72
- with open("extract_config.yaml", "r") as f:
73
- default_config = f.read()
74
-
75
- if not st.session_state.get("config_is_set", False):
76
- st.session_state["initial_config"] = yaml.safe_load(default_config)
77
- st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])
78
- st.session_state["config_is_set"] = True
79
 
80
  if bool(loaded_config):
81
  st.session_state["initial_config"] = loaded_config_dict
82
  st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])
83
- st.session_state["config_is_set"] = True
84
 
85
  # Set page filter
86
- page_filter_radio_dict = {
87
  pagefilter["type"]: pagefilter
88
  for pagefilter in st.session_state["initial_config"]["pagefilter"]
89
  }
90
- selected_page_filter = st.radio("Page filter", page_filter_radio_dict.keys())
91
- set_page_filter(page_filter_radio_dict[selected_page_filter])
 
 
 
 
 
 
 
 
 
 
92
 
93
  display_config()
94
 
@@ -103,32 +164,3 @@ if "working_file_pdf" in st.session_state:
103
  get_pdf_iframe(st.session_state["working_file_pdf"].name),
104
  unsafe_allow_html=True,
105
  )
106
-
107
- if "first_time" not in st.session_state:
108
- st.session_state["first_time"] = False
109
- logging.info("Loading config and pdf")
110
- st.session_state["proc"] = ReportProcessor(st.session_state["config"])
111
-
112
- logging.info("Config and pdf loaded")
113
-
114
- assets = {
115
- "pagefilter": {},
116
- "table_extractors": [],
117
- }
118
-
119
- # Filtering the pages
120
- st.session_state["proc"].page_filter(
121
- st.session_state["working_file_pdf"].name,
122
- assets,
123
- )
124
-
125
- logging.info(f"Assets : {assets}")
126
-
127
- if len(assets["pagefilter"]["selected_pages"]) == 0:
128
- # No page has been automatically selected by the page filter
129
- # Hence, we display the full pdf, letting the user select the pages
130
- pdfreader = PdfReader(st.session_state["working_file_pdf"])
131
- number_pages = len(PdfReader(st.session_state["working_file_pdf"]).pages)
132
- assets["pagefilter"]["selected_pages"] = list(range(number_pages))
133
- st.session_state["assets"] = assets
134
- st.switch_page("pages/1_Selected_Pages.py")
 
7
  import copy
8
  from menu import display_pages_menu, display_config
9
  from pypdf import PdfReader
10
+ from utils import get_pdf_iframe, set_state, generate_assets
11
 
12
  from country_by_country.processor import ReportProcessor
13
 
 
18
  set_state(["config", "pagefilter"], value)
19
 
20
 
21
+ def initiate_configuration() -> None:
22
+ st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])
23
+ if isinstance(st.session_state["config"]["pagefilter"], list):
24
+ st.session_state["config"]["pagefilter"] = st.session_state["initial_config"][
25
+ "pagefilter"
26
+ ][0]
27
+ st.session_state["selected_page_filter_name"] = st.session_state["config"][
28
+ "pagefilter"
29
+ ]["type"]
30
+
31
+
32
+ def on_pdf_file_upload() -> None:
33
+ # Change states related to the pdf file upload
34
+ mytmpfile.write(st.session_state.original_pdf.read())
35
+ st.session_state["working_file_pdf"] = mytmpfile
36
+ st.session_state["original_pdf_name"] = st.session_state.original_pdf.name
37
+
38
+ # Generate assets
39
+ generate_assets()
40
+
41
+ st.session_state["page_redirection"] = "pages/1_Selected_Pages.py"
42
+
43
+
44
+ def on_config_file_upload() -> None:
45
+ st.session_state["initial_config"] = st.session_state["initial_uploaded_config"]
46
+ initiate_configuration()
47
+
48
+
49
+ def on_change_page_filter(name_to_filter_dict: dict) -> None:
50
+ st.session_state["selected_page_filter_name"] = st.session_state[
51
+ "radio_button_filter_selection"
52
+ ] # this 'buffer' is needed because selectors wipe their key on reload
53
+ set_page_filter(name_to_filter_dict[st.session_state["selected_page_filter_name"]])
54
+
55
+
56
+ # Check if a redirection was requested
57
+ # Workaround because st.switch_page is not allowed in a callback function
58
+ if st.session_state.get("page_redirection", False):
59
+ page_to_redirect_to = st.session_state["page_redirection"]
60
+ st.session_state["page_redirection"] = False
61
+ st.switch_page(page_to_redirect_to)
62
+
63
  st.set_page_config(layout="wide", page_title="Accueil - upload de PDF")
64
  st.title("Country by Country Tax Reporting analysis")
65
  st.subheader(
 
69
 
70
  mytmpfile = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
71
 
72
+ # State initialization
73
+ if "first_time" not in st.session_state:
74
+ logging.info("State initialization...")
75
+ st.session_state["first_time"] = False
76
+
77
+ logging.info("... loading default extract config")
78
+ with open("extract_config.yaml", "r") as f:
79
+ st.session_state["initial_config"] = yaml.safe_load(f.read())
80
+ initiate_configuration()
81
+
82
+ logging.info("... initializing processor and assets")
83
+ st.session_state["proc"] = ReportProcessor(st.session_state["config"])
84
+ st.session_state["assets"] = {
85
+ "pagefilter": {},
86
+ "table_extractors": [],
87
+ }
88
+
89
  with st.sidebar:
90
 
91
  st.markdown("# PDF Upload")
 
93
  st.markdown("## PDF Report to process")
94
  original_pdf = st.file_uploader(
95
  "Upload a pdf document containing financial table : ",
96
+ key="original_pdf",
97
+ on_change=on_pdf_file_upload,
98
  )
99
 
 
 
 
 
 
100
  if "original_pdf_name" in st.session_state:
101
  st.markdown(
102
  "Already loaded file : " + st.session_state["original_pdf_name"],
 
106
  # Upload personalized config if required
107
  loaded_config = st.file_uploader(
108
  "Upload a config if the default config doesn't suit you :",
109
+ key="initial_uploaded_config",
110
+ on_change=initiate_configuration,
111
  )
112
+
113
  if loaded_config is not None:
114
  if not loaded_config.name.endswith(".yaml"):
115
  st.error("Please upload a yaml file")
 
128
  loaded_config = None
129
 
130
  # Extract config
 
 
 
 
 
 
 
131
 
132
  if bool(loaded_config):
133
  st.session_state["initial_config"] = loaded_config_dict
134
  st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])
 
135
 
136
  # Set page filter
137
+ page_filter_name_to_config_mapping = {
138
  pagefilter["type"]: pagefilter
139
  for pagefilter in st.session_state["initial_config"]["pagefilter"]
140
  }
141
+ page_filter_list = list(page_filter_name_to_config_mapping.keys())
142
+ current_selected_page_filter_index = page_filter_list.index(
143
+ st.session_state["selected_page_filter_name"]
144
+ )
145
+ selected_page_filter_name = st.radio(
146
+ "Page filter",
147
+ page_filter_list,
148
+ index=current_selected_page_filter_index,
149
+ on_change=on_change_page_filter,
150
+ key="radio_button_filter_selection",
151
+ args=(page_filter_name_to_config_mapping,),
152
+ )
153
 
154
  display_config()
155
 
 
164
  get_pdf_iframe(st.session_state["working_file_pdf"].name),
165
  unsafe_allow_html=True,
166
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/1_Selected_Pages.py CHANGED
@@ -1,6 +1,6 @@
1
  import streamlit as st
2
  from country_by_country.processor import ReportProcessor
3
- from utils import get_pdf_iframe, set_state
4
  from country_by_country.utils.utils import keep_pages
5
  from pypdf import PdfReader
6
  from menu import display_pages_menu, display_config
@@ -29,6 +29,7 @@ def set_extractors() -> None:
29
  ]
30
  set_state(["config", "table_extraction"], selected_extractors_dict)
31
  st.session_state["proc"] = ReportProcessor(st.session_state["config"])
 
32
 
33
 
34
  st.set_page_config(layout="wide", page_title="Pages selection") # page_icon="📈"
 
1
  import streamlit as st
2
  from country_by_country.processor import ReportProcessor
3
+ from utils import get_pdf_iframe, set_state, generate_assets
4
  from country_by_country.utils.utils import keep_pages
5
  from pypdf import PdfReader
6
  from menu import display_pages_menu, display_config
 
29
  ]
30
  set_state(["config", "table_extraction"], selected_extractors_dict)
31
  st.session_state["proc"] = ReportProcessor(st.session_state["config"])
32
+ generate_assets()
33
 
34
 
35
  st.set_page_config(layout="wide", page_title="Pages selection") # page_icon="📈"
pages/2_Metadata.py CHANGED
@@ -41,6 +41,7 @@ if "pdf_after_page_validation" in st.session_state:
41
  currency = st.session_state["metadata"]["currency"]
42
  unit = st.session_state["metadata"]["unit"]
43
  headquarter = st.session_state["metadata"]["headquarter"]
 
44
  else:
45
  company_name = None
46
  sector = None
@@ -48,6 +49,15 @@ if "pdf_after_page_validation" in st.session_state:
48
  currency = None
49
  unit = None
50
  headquarter = ""
 
 
 
 
 
 
 
 
 
51
  companies = list(COMPANIES.keys())
52
  company_name = st.selectbox(
53
  "Company name",
@@ -73,7 +83,9 @@ if "pdf_after_page_validation" in st.session_state:
73
  currency = st.selectbox(
74
  "Currency",
75
  currencies,
76
- index=currencies.index(currency) if currency else currencies.index("EUR - Euro"),
 
 
77
  )
78
 
79
  units = [
@@ -98,6 +110,7 @@ if "pdf_after_page_validation" in st.session_state:
98
  )
99
  if submitted:
100
  st.session_state["metadata"] = {
 
101
  "company_name": company_name,
102
  "sector": sector,
103
  "year": year,
 
41
  currency = st.session_state["metadata"]["currency"]
42
  unit = st.session_state["metadata"]["unit"]
43
  headquarter = st.session_state["metadata"]["headquarter"]
44
+ decimal_separator = st.session_state["metadata"]["separator"]
45
  else:
46
  company_name = None
47
  sector = None
 
49
  currency = None
50
  unit = None
51
  headquarter = ""
52
+ decimal_separator = ","
53
+
54
+ separator_list = [",", "."]
55
+ decimal_separator = st.selectbox(
56
+ "Decimal separator",
57
+ separator_list,
58
+ index=separator_list.index(decimal_separator),
59
+ )
60
+
61
  companies = list(COMPANIES.keys())
62
  company_name = st.selectbox(
63
  "Company name",
 
83
  currency = st.selectbox(
84
  "Currency",
85
  currencies,
86
+ index=currencies.index(currency)
87
+ if currency
88
+ else currencies.index("EUR - Euro"),
89
  )
90
 
91
  units = [
 
110
  )
111
  if submitted:
112
  st.session_state["metadata"] = {
113
+ "separator": decimal_separator,
114
  "company_name": company_name,
115
  "sector": sector,
116
  "year": year,
pages/5_Clean_Tables.py CHANGED
@@ -50,12 +50,12 @@ def convert_dataframe(dataframe: pd.DataFrame) -> pd.DataFrame:
50
  for column_name in dataframe.columns:
51
  try:
52
  dataframe[column_name] = dataframe[column_name].astype(float)
53
- except Exception:
54
  pass
55
  return dataframe
56
 
57
 
58
- special_characters = "#&()[]"
59
 
60
 
61
  def style_symbol(v, props=""):
@@ -181,25 +181,52 @@ if (
181
  height=900,
182
  )
183
 
 
 
 
 
184
  col7, col8, col9 = st.columns([1, 1, 1])
185
  with col7:
186
  total = st.checkbox(
187
  "Calculate the Total of each columns, excluding the last row", value=True
188
  )
189
  country = st.checkbox("Activate the country filter", value=True)
 
190
 
191
  with col8:
192
  negativ = st.checkbox(
193
  "Show the negative numbers, for each columns detected as a numerical type"
194
  )
 
 
 
 
 
 
 
 
 
 
 
 
195
  with col9:
196
- symbol = st.checkbox(
197
- "Show the cells that contain a special symbol : " + special_characters,
198
- value=True,
199
- )
200
- remove_symbols = st.checkbox("Remove the special symbols")
 
 
 
 
 
 
 
 
 
201
 
202
  dataframe = st.session_state.tables[st.session_state["algorithm_name"]].copy()
 
203
 
204
  if country:
205
  dataframe.iloc[:-2, 0] = dataframe.iloc[:-2, 0].apply(
@@ -207,13 +234,74 @@ if (
207
  )
208
 
209
  if remove_symbols:
210
- pattern = "\(.*?\)" + "|[" + re.escape(special_characters) + "]"
211
- for column in dataframe.columns:
212
- dataframe[column] = dataframe[column].apply(
213
- lambda x: re.sub(pattern, "", str(x))
214
- )
 
215
  dataframe = convert_dataframe(dataframe)
216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  if total:
218
  dataframe = convert_dataframe(dataframe)
219
  new_row = dataframe.apply(column_sum, axis=0)
 
50
  for column_name in dataframe.columns:
51
  try:
52
  dataframe[column_name] = dataframe[column_name].astype(float)
53
+ except Exception as e:
54
  pass
55
  return dataframe
56
 
57
 
58
+ special_characters = "#&()[]@©€$'R¹³²"
59
 
60
 
61
  def style_symbol(v, props=""):
 
181
  height=900,
182
  )
183
 
184
+ st.subheader(
185
+ "Filters : ",
186
+ )
187
+
188
  col7, col8, col9 = st.columns([1, 1, 1])
189
  with col7:
190
  total = st.checkbox(
191
  "Calculate the Total of each columns, excluding the last row", value=True
192
  )
193
  country = st.checkbox("Activate the country filter", value=True)
194
+ decimal_cleanup = st.checkbox("Apply decimal cleanup")
195
 
196
  with col8:
197
  negativ = st.checkbox(
198
  "Show the negative numbers, for each columns detected as a numerical type"
199
  )
200
+
201
+ with st.container(border=True):
202
+ cleanup_rules = st.checkbox(
203
+ "Apply clean up rules : (number) mean a negative number, o-> 0, homogenization NA, ect ect "
204
+ )
205
+ if cleanup_rules:
206
+ cleanup_excluded = st.multiselect(
207
+ "exclude from filtering",
208
+ st.session_state.tables[st.session_state["algorithm_name"]].columns,
209
+ key="cleanup",
210
+ )
211
+
212
  with col9:
213
+ with st.container(border=True):
214
+ symbol = st.checkbox(
215
+ "Show the cells that contain a special symbol : " + special_characters,
216
+ value=True,
217
+ )
218
+ remove_symbols = st.checkbox(
219
+ "Remove the special symbols on numeric columns"
220
+ )
221
+ if remove_symbols:
222
+ rm_symbol_excluded = st.multiselect(
223
+ "exclude from filtering",
224
+ st.session_state.tables[st.session_state["algorithm_name"]].columns,
225
+ key="rm_symbol",
226
+ )
227
 
228
  dataframe = st.session_state.tables[st.session_state["algorithm_name"]].copy()
229
+ dataframe = convert_dataframe(dataframe)
230
 
231
  if country:
232
  dataframe.iloc[:-2, 0] = dataframe.iloc[:-2, 0].apply(
 
234
  )
235
 
236
  if remove_symbols:
237
+ pattern = "[" + re.escape(special_characters) + "]"
238
+ for column, dtype in dataframe.dtypes.items():
239
+ if column not in rm_symbol_excluded:
240
+ dataframe[column] = dataframe[column].apply(
241
+ lambda x: re.sub(pattern, "", str(x))
242
+ )
243
  dataframe = convert_dataframe(dataframe)
244
 
245
+ if cleanup_rules:
246
+ for column, dtype in dataframe.dtypes.items():
247
+ if column not in cleanup_excluded:
248
+ # this is a code translated by chatgpt from Kane's R code
249
+ dataframe[column] = dataframe[column].replace(
250
+ {"^-$|^$|^ $|^N/I$|^- -$|^N/A$|^n\\.a\\.$": None}, regex=True
251
+ )
252
+ dataframe[column] = dataframe[column].replace(
253
+ {"^o$|^O$|^\\(o\\)$|^\\(O\\)$|^\\(0\\)$": "0"}, regex=True
254
+ )
255
+
256
+ if dtype == object:
257
+ dataframe[column] = dataframe[column].str.replace(
258
+ "(\\(.*\\))[:alnum:]+", "\\1", regex=True
259
+ )
260
+ dataframe[column] = dataframe[column].str.replace(
261
+ "\\([:alnum:]+$|\\)[:alnum:]+$", "", regex=True
262
+ )
263
+ dataframe[column] = dataframe[column].str.replace(
264
+ "\\([:alpha:]+\\)", "", regex=True
265
+ )
266
+ dataframe[column] = dataframe[column].str.replace(
267
+ "(.+)\\(.+\\)$", "\\1", regex=True
268
+ )
269
+ dataframe[column] = dataframe[column].str.replace(
270
+ "^\\(-(.*)\\)", "-\\1", regex=True
271
+ )
272
+ dataframe[column] = dataframe[column].str.replace(
273
+ "^\\((.*)\\)", "-\\1", regex=True
274
+ )
275
+ dataframe[column] = dataframe[column].str.replace(
276
+ "\\(.*\\)| |\\*|^-$|\\[.*\\]|^-€$", "", regex=True
277
+ )
278
+ dataframe = convert_dataframe(dataframe)
279
+ if decimal_cleanup:
280
+ decimal_separator = (
281
+ st.session_state["metadata"]["separator"]
282
+ if st.session_state["metadata"]["separator"]
283
+ else ","
284
+ )
285
+ for column, dtype in dataframe.dtypes.items():
286
+ if dtype == object:
287
+ if decimal_separator == ",":
288
+ dataframe[column] = dataframe[column].str.replace(
289
+ "\\.", "", regex=False
290
+ )
291
+ dataframe[column] = dataframe[column].str.replace(
292
+ ",", ".", regex=False
293
+ )
294
+ else:
295
+ dataframe[column] = dataframe[column].str.replace(
296
+ ",(.{1,2})$", ".\\1", regex=True
297
+ )
298
+ dataframe[column] = dataframe[column].str.replace(
299
+ "\\.([0-9]{3})", ",\\1", regex=True
300
+ )
301
+ dataframe[column] = dataframe[column].str.replace(
302
+ ",", "", regex=False
303
+ )
304
+
305
  if total:
306
  dataframe = convert_dataframe(dataframe)
307
  new_row = dataframe.apply(column_sum, axis=0)
utils.py CHANGED
@@ -1,9 +1,11 @@
1
  import base64
 
2
  from pathlib import Path
3
  from typing import Any
4
 
5
  import pandas as pd
6
  import streamlit as st
 
7
 
8
 
9
  def get_pdf_iframe(pdf_to_process: str) -> str:
@@ -61,3 +63,25 @@ def set_state(key: Any, value: Any) -> None:
61
  nested_value[key_list[-1]] = value
62
  else:
63
  st.session_state[key] = value
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import base64
2
+ import logging
3
  from pathlib import Path
4
  from typing import Any
5
 
6
  import pandas as pd
7
  import streamlit as st
8
+ from pypdf import PdfReader
9
 
10
 
11
  def get_pdf_iframe(pdf_to_process: str) -> str:
 
63
  nested_value[key_list[-1]] = value
64
  else:
65
  st.session_state[key] = value
66
+
67
+
68
+ def generate_assets() -> None:
69
+ assets = {
70
+ "pagefilter": {},
71
+ "table_extractors": [],
72
+ }
73
+
74
+ # Filtering the pages
75
+ st.session_state["proc"].page_filter(
76
+ st.session_state["working_file_pdf"].name,
77
+ assets,
78
+ )
79
+
80
+ logging.info(f"Assets : {assets}")
81
+
82
+ if len(assets["pagefilter"]["selected_pages"]) == 0:
83
+ # No page has been automatically selected by the page filter
84
+ # Hence, we display the full pdf, letting the user select the pages
85
+ number_pages = len(PdfReader(st.session_state["working_file_pdf"]).pages)
86
+ assets["pagefilter"]["selected_pages"] = list(range(number_pages))
87
+ st.session_state["assets"] = assets