|
|
|
from google.cloud import storage |
|
import os |
|
|
|
# Materialise the service-account key passed in through the CRED_JSON
# environment variable as ./cred.json.
# NOTE(review): presumably GOOGLE_APPLICATION_CREDENTIALS points at this file
# so that storage.Client() below can authenticate -- confirm in deployment.
#
# The variable is read BEFORE the file is opened: a missing CRED_JSON then
# raises KeyError without creating/truncating cred.json to an empty file.
_cred_json = os.environ["CRED_JSON"]

# The file holds a secret, so create it readable/writable by the owner only.
# The 0o600 mode only applies when the file is created; an already existing
# cred.json keeps its current permissions.
_cred_fd = os.open("./cred.json", os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(_cred_fd, "w") as fj:
    fj.write(_cred_json)

# Shared Google Cloud Storage client used by the functions of this module.
storage_client = storage.Client()

# Bucket holding the category configuration CSVs and the source documents.
bucket_name = "docs-axio-clara"
|
|
|
from langchain_pinecone import PineconeVectorStore |
|
|
|
from langchain_community.document_loaders import TextLoader |
|
from langchain_text_splitters import CharacterTextSplitter |
|
from climateqa.engine.embeddings import get_embeddings_function |
|
# Embedding callable built once at import time; build_vectores_stores() hands
# it to the Pinecone vector store.
embeddings_function = get_embeddings_function()

# Name of the Pinecone index the vector store is bound to.
index_name = "clara-index"

# NOTE(review): `namespace` is not referenced anywhere in the visible part of
# this file (the vector store is created without it) -- confirm whether it is
# still needed.
namespace = "my-namespace"
|
|
|
|
|
import os |
|
import pdfplumber |
|
|
|
|
|
def _strip_csv_whitespace(text):
    """Return *text* with spaces, newlines, carriage returns and tabs removed."""
    return text.replace(" ", "").replace("\n", "").replace("\r", "").replace("\t", "")


def _parse_category_labels(lines_label):
    """Map each category code (first field) to its label (last field).

    The first line is treated as the CSV header and skipped, as are blank
    lines (typically the empty string left after the final newline).
    """
    labels = {}
    for line in lines_label[1:]:
        if not line.strip():
            continue
        fields = line.split(";")
        # Labels may legitimately contain spaces, so only line-ending and
        # tab characters are removed here.
        lab = fields[-1].replace("\n", "").replace("\r", "").replace("\t", "")
        labels[fields[0]] = lab
        print("label :" + lab)
    return labels


def _parse_category_rows(lines):
    """Return ``(file name, [category codes])`` for every data row.

    The first line is treated as the CSV header and skipped.  Blank lines
    and empty category codes (produced by consecutive spaces or an empty
    last field) are dropped so they cannot become a bogus "" category.
    """
    rows = []
    for line in lines[1:]:
        if not line.strip():
            continue
        fields = line.split(";")
        codes = [_strip_csv_whitespace(code) for code in fields[-1].split(" ")]
        rows.append((fields[0], [code for code in codes if code]))
    return rows


def get_categories_files():
    """Group the files listed in the bucket's category CSV by category label.

    Downloads ``config_categorie/categories.csv`` and
    ``config_categorie/libelle.csv`` from the module-level bucket.  Both are
    read as ``;``-separated text with one header line.  In ``categories.csv``
    the first field is the file name and the last field is a space-separated
    list of category codes; in ``libelle.csv`` the first field is a category
    code and the last field its label.  A code without a label uses the code
    itself as its label.

    Returns:
        dict: each category label mapped to the list of file names in that
        category, plus the key ``"AllCat"`` mapped to the list of all labels
        in order of first appearance.
    """
    CAT_DIR = "config_categorie/"

    bucket = storage_client.get_bucket(bucket_name)
    lines = bucket.blob(CAT_DIR + "categories.csv").download_as_text().split("\n")
    lines_label = bucket.blob(CAT_DIR + "libelle.csv").download_as_text().split("\n")

    labels = _parse_category_labels(lines_label)
    rows = _parse_category_rows(lines)

    # First pass: collect the distinct labels in order of first appearance,
    # falling back to the raw code when libelle.csv has no entry for it.
    listCat = []
    for _, codes in rows:
        for categ in codes:
            label = labels.setdefault(categ, categ)
            if label not in listCat:
                print(" - [" + categ + "] > " + label)
                listCat.append(label)

    finale = {cat: [] for cat in listCat}
    finale["AllCat"] = listCat

    # Second pass: attach every file to each of its categories.
    for fichier, codes in rows:
        for categ in codes:
            print(fichier + " dans " + labels[categ] + "(" + categ + ")")
            finale[labels[categ]].append(fichier)

    return finale
|
|
|
def get_PDF_Names_from_GCP():
    """Return the names of all blobs under the ``sources/`` prefix of the bucket."""
    source_blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
    return [entry.name for entry in source_blobs]
|
|
|
def get_PDF_from_GCP(folder_path, pdf_folder="./PDF", *, remove_after_extraction=False):
    """Write the text of every page of every PDF in *pdf_folder* to its own file.

    Despite its name, this function only reads PDFs already present locally
    in *pdf_folder*; it does not download anything.  For each page, the
    characters reported by pdfplumber are concatenated and written to
    ``<folder_path>/<pdf file name>..:page:..<1-based page number>``.
    Entries whose name starts with ``.`` are skipped.

    Args:
        folder_path: Existing directory receiving the per-page text files.
        pdf_folder: Directory containing the PDF files to extract.
        remove_after_extraction: When true, delete each PDF from
            *pdf_folder* once all of its pages have been written.  Off by
            default so that local source documents are never deleted
            implicitly.

    Returns:
        None.
    """
    # Number of pages handled per pdfplumber.open() call; the document is
    # reopened between batches.
    # NOTE(review): presumably this bounds the memory pdfplumber keeps per
    # open document -- confirm.
    N_page = 300

    print(" >>> Extraction PDF")
    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.startswith("."):
            continue

        pdf_path = os.path.join(pdf_folder, pdf_file)
        print(" > " + pdf_folder + "/" + pdf_file)

        with pdfplumber.open(pdf_path) as pdf:
            pdf_total_pages = len(pdf.pages)

        page_number = 0
        while page_number < pdf_total_pages:
            print(" -- ouverture du fichier pour " + str(N_page) + " pages --")
            batch_end = min(page_number + N_page, pdf_total_pages)

            with pdfplumber.open(pdf_path) as pdf:
                for index in range(page_number, batch_end):
                    print(" >>> " + str(index + 1))
                    page_text = "".join(
                        char_pdf["text"] for char_pdf in pdf.pages[index].chars
                    )
                    out_path = os.path.join(
                        folder_path, pdf_file + "..:page:.." + str(index + 1)
                    )
                    # Context manager so the file is closed even if the
                    # write fails.
                    with open(out_path, "w") as f:
                        f.write(page_text)

            page_number = batch_end

        # The cleanup used to reference a `blob` variable that is not defined
        # in this function (NameError); it now targets the file that was just
        # processed and only runs when explicitly requested.
        if remove_after_extraction:
            print(" X removing: " + pdf_file)
            os.remove(pdf_path)
|
|
|
|
|
def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path="./vectors"):
    """Return a Pinecone vector store bound to the module-level index.

    The store is created from the module-level ``index_name`` and
    ``embeddings_function``; no documents are read or indexed here.

    Args:
        folder_path: Unused; kept so existing callers keep working.
        pdf_folder: Unused; kept so existing callers keep working.
        vectors_path: Unused; kept so existing callers keep working.

    Returns:
        PineconeVectorStore: the store for ``index_name``.
    """
    vectorstore = PineconeVectorStore(
        index_name=index_name,
        embedding=embeddings_function,
    )
    print(" Vectorisation ...")
    return vectorstore