from google.cloud import storage
import os
import pdfplumber

# Write the service-account credentials from the environment to a local file
# so the GCP storage client can authenticate with them.
with open("./cred.json", "w") as fj:
    fj.write(os.environ["CRED_JSON"])

storage_client = storage.Client()
bucket_name = "docs-axio-clara"

from langchain_pinecone import PineconeVectorStore
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from climateqa.engine.embeddings import get_embeddings_function

embeddings_function = get_embeddings_function()

index_name = "clara-index"
namespace = "my-namespace"

def get_categories_files():
    finale = {}
    listCat = []
    CAT_DIR = "config_categorie/"

    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(CAT_DIR + "categories.csv")
    lines = blob.download_as_text().split("\n")
    blob_label = bucket.blob(CAT_DIR + "libelle.csv")
    lines_label = blob_label.download_as_text().split("\n")

    labels = {}
    # Collect the labels
    first = True
    for line in lines_label:
        # Skip the header line
        if first:
            first = False
            continue
        lab = line.split(";")[-1].replace("\n", "").replace("\r", "").replace("\t", "")
        labels[line.split(";")[0]] = lab
        print("label: " + lab)

    # First pass: collect the existing categories
    first = True
    for line in lines:
        # Skip the header line
        if first:
            first = False
            continue
        categories = line.split(";")[-1].split(" ")
        for cat in categories:
            categ = cat.replace(" ", "").replace("\n", "").replace("\r", "").replace("\t", "")
            # If the category has no label, fall back to the technical field
            if categ not in labels:
                labels[categ] = categ
            # Add the category (its label) to the list if not already seen
            if labels[categ] not in listCat:
                print(" - [" + categ + "] > " + labels[categ])
                listCat.append(labels[categ])

    # Initialise the final structure
    for cat in listCat:
        finale[cat] = []
    finale["AllCat"] = listCat

    # Second pass: associate each file with its categories
    first = True
    for line in lines:
        # Skip the header line
        if first:
            first = False
            continue
        fichier = line.split(";")[0]
        categories = line.split(";")[-1].split(" ")
        # Put the file into each of its associated categories
        for cat in categories:
            categ = cat.replace(" ", "").replace("\n", "").replace("\r", "").replace("\t", "")
            print(fichier + " in " + labels[categ] + " (" + categ + ")")
            finale[labels[categ]].append(fichier)
    return finale
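
# Illustrative note (not from the original file): the parsing above implies
# semicolon-separated CSVs of roughly this shape, where the first field is a
# key and the last field of categories.csv is a space-separated category
# list. The exact layout is an assumption inferred from the split(";") calls:
#
#   config_categorie/categories.csv     config_categorie/libelle.csv
#   fichier;categories                  categorie;libelle
#   doc1.pdf;cat1 cat2                  cat1;Label One
#   doc2.pdf;cat2                       cat2;Label Two
#
# With that input, get_categories_files() would return something like:
#   {"Label One": ["doc1.pdf"],
#    "Label Two": ["doc1.pdf", "doc2.pdf"],
#    "AllCat": ["Label One", "Label Two"]}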

def get_PDF_Names_from_GCP():
    listName = []
    # List the files stored under sources/ in GCP storage
    blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
    for blob in blobs:
        listName.append(blob.name)
    return listName

def get_PDF_from_GCP(folder_path, pdf_folder="./PDF"):
    # Download the files from GCP storage
    #blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
    #for blob in blobs:
    #    print("\n" + blob.name + ":")
    #    print(" <- downloading from GCP")
    #    blob.download_to_filename(pdf_folder + "/" + blob.name)

    # Extract the text from the PDF files
    print(" >>> Extraction PDF")
    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.startswith("."):
            continue
        print(" > " + pdf_folder + "/" + pdf_file)
        pdf_total_pages = 0
        with pdfplumber.open(pdf_folder + "/" + pdf_file) as pdf:
            pdf_total_pages = len(pdf.pages)

        # pdfplumber leaks memory on large files; re-opening the file
        # every N pages seems to work around the problem.
        N_page = 300
        page_number = 0
        while page_number < pdf_total_pages:
            print(" -- opening the file for " + str(N_page) + " pages --")
            with pdfplumber.open(pdf_folder + "/" + pdf_file) as pdf:
                npage = 0
                while npage < N_page and page_number < pdf_total_pages:
                    print(" >>> " + str(page_number + 1))
                    # One text file per page: "<pdf name>..:page:..<page number>"
                    f = open(folder_path + "/" + pdf_file + "..:page:.." + str(page_number + 1), "w")
                    for char_pdf in pdf.pages[page_number].chars:
                        f.write(char_pdf["text"])
                    f.close()
                    npage = npage + 1
                    page_number = page_number + 1

        # Remove the processed PDF
        print(" X removing: " + pdf_file)
        os.remove(pdf_folder + "/" + pdf_file)
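
# The extraction step above writes one plain-text file per page, named
# "<pdf name>..:page:..<page number>". A minimal sketch of reading one back;
# read_extracted_page is a hypothetical helper, not part of the pipeline:
def read_extracted_page(folder_path, pdf_name, page_number):
    with open(folder_path + "/" + pdf_name + "..:page:.." + str(page_number)) as f:
        return f.read()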

def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path="./vectors"):
    # Connect to the existing Pinecone index, using the ClimateQA
    # embeddings function configured above.
    vectorstore = PineconeVectorStore(
        index_name=index_name,
        embedding=embeddings_function,
        #namespace=namespace
    )
    print(" Vectorisation ...")
    return vectorstore
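
# A minimal usage sketch, assuming the PDFs are already present in ./PDF.
# The output folder name "./txt" is an assumption, not taken from the
# original file; get_PDF_from_GCP deletes each PDF after extracting it.
if __name__ == "__main__":
    print(get_PDF_Names_from_GCP())        # list blobs under sources/
    categories = get_categories_files()    # map category labels -> files
    os.makedirs("./txt", exist_ok=True)
    get_PDF_from_GCP("./txt")              # write per-page text files
    vectorstore = build_vectores_stores("./txt")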