# climateqa/engine/vectorstore.py
from google.cloud import storage
import os

# Write the GCP service-account credentials from the environment to disk
# so that the storage client can authenticate.
with open("./cred.json", "w") as fj:
    fj.write(os.environ["CRED_JSON"])

storage_client = storage.Client()
bucket_name = "docs-axio-clara"
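# Note (assumption, not in the original): storage.Client() resolves
# credentials through google.auth, typically via the
# GOOGLE_APPLICATION_CREDENTIALS environment variable; if the deployment
# does not already set it, it would have to point at the file written above:
#   os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "./cred.json")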
from langchain_pinecone import PineconeVectorStore
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
import pdfplumber

from climateqa.engine.embeddings import get_embeddings_function

embeddings_function = get_embeddings_function()

index_name = "clara-index"
namespace = "my-namespace"
def get_categories_files():
    finale = {}
    listCat = []
    CAT_DIR = "config_categorie/"
    FOLDER_PATH = "."
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(CAT_DIR + "categories.csv")
    lines = blob.download_as_text().split("\n")
    blob_label = bucket.blob(CAT_DIR + "libelle.csv")
    lines_label = blob_label.download_as_text().split("\n")

    labels = {}
    # Collect the labels (skipping the header line)
    first = True
    for line in lines_label:
        if first:
            first = False
            continue
        lab = line.split(";")[-1].replace("\n", "").replace("\r", "").replace("\t", "")
        labels[line.split(";")[0]] = lab
        print("label: " + lab)

    # First pass: collect the existing categories (skipping the header line)
    first = True
    for line in lines:
        if first:
            first = False
            continue
        categories = line.split(";")[-1].split(" ")
        for cat in categories:
            categ = cat.replace(" ", "").replace("\n", "").replace("\r", "").replace("\t", "")
            # If the category has no label, fall back to the technical code
            if categ not in labels:
                labels[categ] = categ
            # Add the category (its label) to the list if not seen yet
            if labels[categ] not in listCat:
                print(" - [" + categ + "] > " + labels[categ])
                listCat.append(labels[categ])

    # Initialise the final structure
    for cat in listCat:
        finale[cat] = []
    finale["AllCat"] = listCat

    # Second pass: map each file to its categories (skipping the header line)
    first = True
    for line in lines:
        if first:
            first = False
            continue
        fichier = line.split(";")[0]
        categories = line.split(";")[-1].split(" ")
        # Place the file in every category it belongs to
        for cat in categories:
            categ = cat.replace(" ", "").replace("\n", "").replace("\r", "").replace("\t", "")
            print(fichier + " in " + labels[categ] + " (" + categ + ")")
            finale[labels[categ]].append(fichier)

    return finale
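# Example (hypothetical file contents, for illustration only): with a
# categories.csv such as
#   fichier;categories
#   rapport.pdf;CLIM ENER
# and a libelle.csv such as
#   code;libelle
#   CLIM;Climat
# the call get_categories_files() would return
#   {"Climat": ["rapport.pdf"], "ENER": ["rapport.pdf"],
#    "AllCat": ["Climat", "ENER"]}
# (ENER has no label in libelle.csv, so its technical code is used as label).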
def get_PDF_Names_from_GCP():
    listName = []
    # List the files stored under sources/ in the GCP bucket
    blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
    for blob in blobs:
        listName.append(blob.name)
    return listName
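# Example (hypothetical): blob names keep the bucket prefix, so this prints
# entries such as 'sources/rapport.pdf':
#   for name in get_PDF_Names_from_GCP():
#       print(name)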
def get_PDF_from_GCP(folder_path, pdf_folder="./PDF"):
    # Download the files from GCP storage
    #blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
    #for blob in blobs:
    #    print("\n" + blob.name + ":")
    #    print("  <- downloading from GCP")
    #    blob.download_to_filename(pdf_folder + "/" + blob.name)

    # Extract the text from the PDF files
    print(" >>> PDF extraction")
    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.startswith("."):
            continue
        print(" > " + pdf_folder + "/" + pdf_file)
        with pdfplumber.open(pdf_folder + "/" + pdf_file) as pdf:
            pdf_total_pages = len(pdf.pages)

        # pdfplumber leaks memory on large files;
        # reopening the file every N pages seems to fix the problem
        N_page = 300
        page_number = 0
        while page_number < pdf_total_pages:
            print(" -- reopening the file for " + str(N_page) + " pages --")
            with pdfplumber.open(pdf_folder + "/" + pdf_file) as pdf:
                npage = 0
                while npage < N_page and page_number < pdf_total_pages:
                    print(" >>> " + str(page_number + 1))
                    # Write one text file per page, named <pdf>..:page:..<n>
                    with open(folder_path + "/" + pdf_file + "..:page:.." + str(page_number + 1), "w") as f:
                        for char_pdf in pdf.pages[page_number].chars:
                            f.write(char_pdf["text"])
                    npage += 1
                    page_number += 1
        print(" X removing: " + pdf_file)
        os.remove(pdf_folder + "/" + pdf_file)
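# Example (hypothetical directory layout): extract every PDF in ./PDF into
# one text file per page under ./vectors (both directories must exist), and
# delete each local PDF once it has been processed:
#   get_PDF_from_GCP("./vectors")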
def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path="./vectors"):
    # Connect to the existing Pinecone index; the documents are assumed to
    # be vectorised there already, so nothing is (re)embedded here.
    vectorstore = PineconeVectorStore(
        index_name=index_name,
        embedding=embeddings_function,
        #namespace=namespace
    )
    print(" Vectorisation ...")
    return vectorstore
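# Hypothetical smoke test (not part of the original module): requires
# CRED_JSON and PINECONE_API_KEY in the environment. Connects to the
# Pinecone index and prints the category mapping read from the bucket.
if __name__ == "__main__":
    vectorstore = build_vectores_stores("./vectors")
    print(vectorstore)
    print(get_categories_files())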