"""Helpers to read category configuration and PDF sources for the Clara index.

Importing this module has side effects: it writes the content of the
``CRED_JSON`` environment variable to ``./cred.json``, creates a Google Cloud
Storage client and builds the embeddings function.
"""

import os

import pdfplumber
from google.cloud import storage
from langchain_community.document_loaders import TextLoader
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import CharacterTextSplitter

from climateqa.engine.embeddings import get_embeddings_function

# The credentials file is written before the storage client is created.
with open("./cred.json", "w") as fj:
    fj.write(os.environ["CRED_JSON"])

storage_client = storage.Client()
bucket_name = "docs-axio-clara"

embeddings_function = get_embeddings_function()

index_name = "clara-index"
namespace = "my-namespace"

# Translation table deleting the control whitespace left over by CSV parsing.
_CONTROL_WS = str.maketrans("", "", "\n\r\t")


def _data_lines(text):
    """Return the lines of *text* without the header line and blank lines."""
    return [line for line in text.split("\n")[1:] if line.strip()]


def _line_categories(line):
    """Return the cleaned category keys found in the last ';' field of *line*."""
    return [
        token.translate(_CONTROL_WS)
        for token in line.split(";")[-1].split(" ")
    ]


def get_categories_files():
    """Build the mapping between category labels and file names.

    Reads ``config_categorie/categories.csv`` (first field: file name, last
    field: space-separated category keys) and ``config_categorie/libelle.csv``
    (first field: category key, last field: label) from the bucket. Both
    files are ';'-separated and their first line is skipped as a header.
    Blank lines are ignored.

    Returns:
        A dict mapping each category label to the list of files associated
        with it, plus the key ``"AllCat"`` holding the list of all labels in
        order of first appearance. A category key without a label is used as
        its own label.
    """
    CAT_DIR = "config_categorie/"

    bucket = storage_client.get_bucket(bucket_name)
    lines = _data_lines(
        bucket.blob(CAT_DIR + "categories.csv").download_as_text()
    )
    lines_label = _data_lines(
        bucket.blob(CAT_DIR + "libelle.csv").download_as_text()
    )

    # Collect the labels.
    labels = {}
    for line in lines_label:
        fields = line.split(";")
        lab = fields[-1].translate(_CONTROL_WS)
        labels[fields[0]] = lab
        print("label :" + lab)

    # First pass: collect the existing categories.
    listCat = []
    for line in lines:
        for categ in _line_categories(line):
            # A category without a label falls back to its technical key.
            label = labels.setdefault(categ, categ)
            # Register the label the first time it is encountered.
            if label not in listCat:
                print(" - [" + categ + "] > " + label)
                listCat.append(label)

    # Initialise the final structure.
    finale = {cat: [] for cat in listCat}
    finale["AllCat"] = listCat

    # Second pass: associate each file with its categories.
    for line in lines:
        fichier = line.split(";")[0]
        for categ in _line_categories(line):
            print(fichier + " dans " + labels[categ] + "(" + categ + ")")
            finale[labels[categ]].append(fichier)

    return finale


def get_PDF_Names_from_GCP():
    """Return the names of the blobs stored under ``sources/`` in the bucket."""
    blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
    return [blob.name for blob in blobs]


def get_PDF_from_GCP(folder_path, pdf_folder="./PDF"):
    """Extract the text of every PDF in *pdf_folder*, one output file per page.

    NOTE: downloading from GCP storage is currently disabled; only the files
    already present in *pdf_folder* are processed.

    For each non-hidden file, the text of page ``n`` (1-based) is written to
    ``<folder_path>/<pdf_file>..:page:..<n>``. The PDF is removed from
    *pdf_folder* once all its pages have been extracted.

    Args:
        folder_path: Directory receiving the per-page text files.
        pdf_folder: Directory containing the PDF files to process.
    """
    # Pages processed per open of the PDF; re-opening the file every N_page
    # pages seems to work around a memory leak on large files.
    N_page = 300

    print(" >>> Extraction PDF")
    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.startswith("."):
            continue
        pdf_path = pdf_folder + "/" + pdf_file
        print(" > " + pdf_path)

        with pdfplumber.open(pdf_path) as pdf:
            pdf_total_pages = len(pdf.pages)

        page_number = 0
        while page_number < pdf_total_pages:
            print(" -- ouverture du fichier pour " + str(N_page) + " pages --")
            with pdfplumber.open(pdf_path) as pdf:
                chunk_end = min(page_number + N_page, pdf_total_pages)
                while page_number < chunk_end:
                    print(" >>> " + str(page_number + 1))
                    page_path = (
                        folder_path + "/" + pdf_file
                        + "..:page:.." + str(page_number + 1)
                    )
                    # The context manager closes the file even if
                    # extracting or writing the page fails.
                    with open(page_path, "w") as f:
                        f.write("".join(
                            char_pdf["text"]
                            for char_pdf in pdf.pages[page_number].chars
                        ))
                    page_number += 1

        # The original referenced an undefined `blob` here (its download loop
        # is disabled); the file to remove is the one just processed.
        print(" X removing: " + pdf_file)
        os.remove(pdf_path)


def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path="./vectors"):
    """Return a Pinecone vector store bound to the module's index.

    The three parameters are currently unused; they are kept so existing
    callers keep working.
    """
    vectorstore = PineconeVectorStore(
        index_name=index_name,
        embedding=embeddings_function,
        # namespace=namespace
    )
    print(" Vectorisation ...")
    return vectorstore