import ast
import os
from random import randint
from time import sleep

import pandas as pd
from rapidfuzz import process, fuzz

from uploader import save_logs


def fuzzy_match(row, choices, scorer, cutoff):
    """Return the best-scoring supplier name for a transaction label,
    or a placeholder when nothing clears the cutoff."""
    match = process.extractOne(row["Libellé d'opération"], choices,
                               scorer=scorer, score_cutoff=cutoff)
    if match:
        return match[0]
    return "missing receipt"


def reconciliate_and_upload(data_path,
                            name_of_csv,
                            folder_to_save_processed_reciepts,
                            folder_to_save_reconciled_data,
                            name_of_raw_transcripts="transcript_raw.txt",
                            name_of_output="[AI Generated] Output.xlsx"):

    # The raw transcript file holds a Python-literal list of dicts; parse it
    # with ast.literal_eval rather than eval so no arbitrary code can execute.
    with open(f"{data_path}/{name_of_raw_transcripts}", encoding="utf-8") as file:
        transcripts = ast.literal_eval(file.read())

    imgs = []
    path_to_pdfs = data_path

    # Collect every PNG receipt found under the data folder.
    for root, dirs, files in os.walk(path_to_pdfs):
        for file in files:
            if file.endswith('.png'):
                print(os.path.join(root, file))
                imgs.append({"path": os.path.join(root, file)})

    list_transcripts_evaled = []
    objects = []
    for i, t in enumerate(transcripts):
        # Transcript content is a JSON-like literal; map JSON's null to -1,
        # then parse it safely.
        content = ast.literal_eval(t["content"].replace('null', '-1'))
        try:
            obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)
        except Exception:
            # Uploads occasionally fail (e.g. rate limiting): back off, then retry once.
            print("upload failed, backing off before retrying")
            sleep(randint(30, 40))
            obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)
        print("uploaded image!")

        try:
            record = {"path": imgs[i]["path"],
                      "name_of_supplier": content["name_of_supplier"],
                      "amount": content["amount"],
                      "currency": content["currency"],
                      "date": content["date"]}
        except KeyError as exc:
            # Skip transcripts missing an expected field, and drop the matching
            # upload so `objects` stays aligned with the records.
            print(f"skipping {imgs[i]['path']}: missing field {exc}")
            continue
        objects.append(obk)
        list_transcripts_evaled.append(record)

    urls = ["https://drive.google.com/file/d/" + ob["id"] for ob in objects]

    df_app = pd.DataFrame(list_transcripts_evaled)

    float_regex = r'[-+]?\d*\.\d+|\d+'

    def to_float(series):
        # Normalise French-formatted numbers ("1 234,56" -> "1234.56") before
        # extracting the numeric part; unparseable values become NaN.
        return (
            series.astype(str)
            .str.replace(" ", "", regex=False)
            .str.replace(",", ".", regex=False)
            .str.extract(f'({float_regex})', expand=False)
            .astype(float)
        )

    df_app['amount'] = to_float(df_app['amount'])
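
    # Sanity check (hypothetical values): to_float(pd.Series(["1 234,56", "12.5", "N/A"]))
    # yields [1234.56, 12.5, NaN], so missing amounts surface as NaN downstream.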

    df_app["date"] = pd.to_datetime(df_app['date'], format="%d/%m/%Y", errors='coerce')
    df_app["url"] = urls
    df_app = df_app.drop_duplicates(["name_of_supplier", "amount", "date"]).reset_index(drop=True)

    print(data_path)
    print("NAME OF CSV:", name_of_csv)
    # Skip the bank export's three leading metadata rows.
    df_opp_app = pd.read_csv(f"{data_path}/{name_of_csv}", skiprows=3)

    df_opp_app['Débit'] = to_float(df_opp_app['Débit'])
    df_opp_app['Crédit'] = to_float(df_opp_app['Crédit'])
    df_opp_app["Date"] = pd.to_datetime(df_opp_app['Date'], format="%d/%m/%Y", errors='coerce')

    merged_df_app = pd.merge(df_opp_app, df_app, left_on='Débit', right_on='amount',
                             how='left', suffixes=('_ops', '_df'))

    # Fuzzy-match each transaction label against the upper-cased supplier
    # names extracted from the receipts.
    choices = [r.upper() for r in df_app['name_of_supplier'].tolist()]
    merged_df_app['fuzzy_matched_supplier'] = merged_df_app.apply(
        lambda row: fuzzy_match(row, choices, fuzz.WRatio, 80), axis=1)
    merged_df_app = merged_df_app.drop_duplicates(
        subset=["Date", "Valeur", "Libellé d'opération", "Débit"])

    # Receipts whose supplier never matched any bank transaction; copy() so
    # the later drop() does not warn about operating on a view.
    df_residuals_app = df_app[~df_app['name_of_supplier'].isin(merged_df_app['name_of_supplier'])].copy()

    # Keep the fuzzy-matched name as the supplier column of record, then drop
    # the intermediate columns.
    merged_df_app['name_of_supplier'] = merged_df_app['fuzzy_matched_supplier']
    merged_df_app.drop(columns=["currency", "date", "path", "fuzzy_matched_supplier"], inplace=True)
    df_residuals_app.drop(columns=["path"], inplace=True)

    # Render receipt URLs as clickable Excel hyperlinks.
    merged_df_app['url'] = merged_df_app['url'].apply(
        lambda x: f'=HYPERLINK("{x}", "Voir Facture")' if pd.notna(x) else '')
    df_residuals_app['url'] = df_residuals_app['url'].apply(
        lambda x: f'=HYPERLINK("{x}", "Voir Facture")' if pd.notna(x) else '')
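
    # With a hypothetical file id "abc123", the cell content becomes
    # =HYPERLINK("https://drive.google.com/file/d/abc123", "Voir Facture"),
    # which Excel renders as a clickable "Voir Facture" link.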

    # Write the reconciled rows and the unmatched residuals to separate sheets.
    with pd.ExcelWriter(name_of_output) as writer:
        merged_df_app.to_excel(writer, sheet_name='Données réconciliées', index=False)
        df_residuals_app.to_excel(writer, sheet_name='Résidus et transactions introuvables', index=False)

    # Upload the workbook and return its Drive id.
    id_output = save_logs(name_of_output, name_of_output, folder_to_save_reconciled_data)

    return id_output
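

# A minimal invocation sketch, assuming a local data folder that contains the
# bank CSV, transcript_raw.txt and the receipt PNGs, plus two Drive folder ids
# for save_logs; every value below is a hypothetical placeholder.
if __name__ == "__main__":
    output_id = reconciliate_and_upload(
        data_path="./data/2024-05",                # hypothetical folder
        name_of_csv="releve_bancaire.csv",         # hypothetical export name
        folder_to_save_processed_reciepts="<drive-folder-id-1>",  # hypothetical
        folder_to_save_reconciled_data="<drive-folder-id-2>",     # hypothetical
    )
    print("Reconciled workbook uploaded, id:", output_id)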