from google.cloud import storage |
import os |
# Write the JSON held in the CRED_JSON environment variable to ./cred.json.
# The file is opened (and therefore created/truncated) before the variable is
# looked up, so a missing CRED_JSON raises KeyError and leaves an empty file.
# NOTE(review): presumably GOOGLE_APPLICATION_CREDENTIALS points at this file
# so that storage.Client() below picks it up -- confirm in the deployment.
with open("./cred.json", "w") as cred_file:
    cred_file.write(os.environ["CRED_JSON"])

# Shared Google Cloud Storage client and the bucket name used by the helpers
# in this module.
storage_client = storage.Client()
bucket_name = "docs-axio-clara"
from langchain_pinecone import PineconeVectorStore |
from langchain_community.document_loaders import TextLoader |
from langchain_text_splitters import CharacterTextSplitter |
from climateqa.engine.embeddings import get_embeddings_function |
# Embedding callable handed to the Pinecone vector store built in this module.
embeddings_function = get_embeddings_function()

# Name of the Pinecone index the vector store is bound to.
index_name = "clara-index"

# Pinecone namespace; not referenced by the code visible in this section.
namespace = "my-namespace"
import os |
import pdfplumber |
# Characters stripped from every CSV field before it is used as a code/label.
_CSV_NOISE = str.maketrans("", "", "\n\r\t")


def _data_lines(text):
    """Return the non-blank lines of *text* that follow its header line."""
    return [line for line in text.split("\n")[1:] if line.strip()]


def get_categories_files():
    """Build the mapping ``label -> [file names]`` from the bucket's CSV files.

    Two ``;``-separated text files are downloaded from the bucket named by
    ``bucket_name`` through ``storage_client``; the first line of each is
    treated as a header and ignored, and blank lines are skipped:

    * ``config_categorie/categories.csv`` -- first field is a file name, last
      field is a space-separated list of category codes;
    * ``config_categorie/libelle.csv`` -- first field is a category code, last
      field is its label.

    A category code that has no entry in ``libelle.csv`` is used as its own
    label.

    Returns:
        dict: one key per label, mapping to the list of file names listed
        under that category (in file order), plus the key ``"AllCat"`` mapping
        to the list of all labels in order of first appearance.
    """
    cat_dir = "config_categorie/"
    bucket = storage_client.get_bucket(bucket_name)
    category_text = bucket.blob(cat_dir + "categories.csv").download_as_text()
    label_text = bucket.blob(cat_dir + "libelle.csv").download_as_text()

    # Category code -> human-readable label.
    labels = {}
    for line in _data_lines(label_text):
        fields = line.split(";")
        label = fields[-1].translate(_CSV_NOISE)
        labels[fields[0]] = label
        print("label :" + label)

    # Parse every category row once: (file name, [category codes]).
    file_codes = []
    for line in _data_lines(category_text):
        fields = line.split(";")
        codes = [token.translate(_CSV_NOISE) for token in fields[-1].split(" ")]
        file_codes.append((fields[0], codes))

    # First pass: collect the distinct labels in order of first appearance,
    # falling back to the code itself when no label is known for it.
    all_labels = []
    for _, codes in file_codes:
        for code in codes:
            label = labels.setdefault(code, code)
            if label not in all_labels:
                print(" - [" + code + "] > " + label)
                all_labels.append(label)

    finale = {label: [] for label in all_labels}
    finale["AllCat"] = all_labels

    # Second pass: file each document under every category it lists.
    for fichier, codes in file_codes:
        for code in codes:
            print(fichier + " dans " + labels[code] + "(" + code + ")")
            finale[labels[code]].append(fichier)
    return finale
def get_PDF_Names_from_GCP():
    """Return the names of all blobs under the ``sources/`` prefix of the bucket.

    The listing is obtained from ``storage_client.list_blobs`` for the bucket
    named by ``bucket_name``; names are returned in listing order.
    """
    source_blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
    return [source_blob.name for source_blob in source_blobs]
def get_PDF_from_GCP(folder_path, pdf_folder="./PDF"):
    """Extract the text of every PDF in *pdf_folder*, one file per page.

    For each entry of *pdf_folder* whose name does not start with ``.``, the
    characters of every page are written to
    ``<folder_path>/<pdf name>..:page:..<page number>`` (page numbers start at
    1). Once all its pages are written, the PDF itself is deleted from
    *pdf_folder*.

    Args:
        folder_path: Existing directory receiving the per-page text files.
        pdf_folder: Directory holding the PDFs to process.

    Returns:
        None.
    """
    print(" >>> Extraction PDF")
    # Number of pages processed before the PDF is closed and reopened.
    # NOTE(review): presumably this bounds memory on very large documents.
    N_page = 300
    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.startswith("."):
            continue
        pdf_path = pdf_folder + "/" + pdf_file
        print(" > " + pdf_path)
        with pdfplumber.open(pdf_path) as pdf:
            pdf_total_pages = len(pdf.pages)

        page_number = 0
        while page_number < pdf_total_pages:
            print(" -- ouverture du fichier pour " + str(N_page) + " pages --")
            with pdfplumber.open(pdf_path) as pdf:
                batch_end = min(page_number + N_page, pdf_total_pages)
                while page_number < batch_end:
                    print(" >>> " + str(page_number + 1))
                    page_path = (
                        folder_path + "/" + pdf_file
                        + "..:page:.." + str(page_number + 1)
                    )
                    page_text = "".join(
                        char_pdf["text"]
                        for char_pdf in pdf.pages[page_number].chars
                    )
                    # Context manager so the handle is closed even on error.
                    with open(page_path, "w") as page_out:
                        page_out.write(page_text)
                    page_number += 1

        # Remove the PDF that was just processed. The previous code used an
        # undefined name (`blob`) here and raised NameError.
        print(" X removing: " + pdf_file)
        os.remove(pdf_path)
def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path="./vectors"):
    """Return a ``PineconeVectorStore`` bound to the module's index.

    The store is created with the module-level ``index_name`` and
    ``embeddings_function``.

    Args:
        folder_path: Accepted for interface compatibility; not used here.
        pdf_folder: Accepted for interface compatibility; not used here.
        vectors_path: Accepted for interface compatibility; not used here.

    Returns:
        The ``PineconeVectorStore`` instance.
    """
    vectorstore = PineconeVectorStore(
        index_name=index_name,
        embedding=embeddings_function,
    )
    print(" Vectorisation ...")
    # The trailing `exit(0)` that followed the return was dropped: it was
    # either unreachable or, at module scope, would end any importing process.
    return vectorstore