|
import os
|
|
import time
|
|
from urllib.request import Request, urlopen
|
|
from urllib.error import HTTPError, URLError
|
|
from pymongo import MongoClient
|
|
from langchain_community.document_loaders import PyPDFDirectoryLoader
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
from langchain_community.embeddings import HuggingFaceEmbeddings
|
|
|
|
from config import MONGODB_URI, DB_NAME, SAVE_FOLDER
|
|
|
|
# URLs of the PDF files to fetch into SAVE_FOLDER before indexing.
# Empty for now: only PDFs already present in the folder get indexed.
PDF_URLS = []

# MongoDB collection that receives one {"text", "embedding"} document per chunk.
COLLECTION_NAME = "connaissances"
|
|
|
|
def download_pdf(url, save_path, retries=2, delay=3, timeout=30):
    """Download the file at ``url`` to ``save_path``, retrying on network errors.

    The payload is streamed to a temporary ``<save_path>.part`` file and only
    moved to ``save_path`` once the transfer completed, so an interrupted
    download never leaves a truncated file behind under the final name.

    Args:
        url: Address of the file to download.
        save_path: Destination path of the downloaded file.
        retries: Maximum number of attempts.
        delay: Seconds to wait between two attempts.
        timeout: Seconds before a blocking network operation is abandoned.

    Returns:
        None. Success or failure is reported on standard output.
    """
    chunk_size = 64 * 1024
    tmp_path = f"{save_path}.part"

    for attempt in range(retries):
        try:
            req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            with urlopen(req, timeout=timeout) as response, open(tmp_path, 'wb') as f:
                # Stream in fixed-size chunks instead of loading the whole file in memory.
                for chunk in iter(lambda: response.read(chunk_size), b""):
                    f.write(chunk)
            # Same-directory rename: the final file appears only when complete.
            os.replace(tmp_path, save_path)
            print(f"Téléchargé : {save_path}")
            return
        except (URLError, TimeoutError) as e:
            # HTTPError is a subclass of URLError, so HTTP status errors land here too.
            print(f"Erreur ({e}) pour {url}, tentative {attempt+1}/{retries}")
            # Waiting is only useful when another attempt follows.
            if attempt + 1 < retries:
                time.sleep(delay)
        finally:
            # Drop any partial payload, whatever the reason the attempt stopped.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    print(f"Échec du téléchargement : {url}")
|
|
'''
|
|
def init_documents():
    """Rebuild the MongoDB collection from the PDFs found in ``SAVE_FOLDER``.

    Steps:
        1. Download every URL of ``PDF_URLS`` whose file is not on disk yet.
        2. Load all PDFs of the folder and split them into overlapping chunks.
        3. Ask the user to confirm, because existing documents get deleted.
        4. Embed every chunk, then replace the collection content.

    Embeddings are computed before the collection is emptied, so a failure
    while embedding leaves the previously stored documents untouched.

    Returns:
        None. Progress is reported on standard output.
    """
    # Local import: only this function needs it.
    from urllib.parse import urlparse

    os.makedirs(SAVE_FOLDER, exist_ok=True)

    for url in PDF_URLS:
        # Keep only the URL path so a query string or fragment never ends up
        # in the file name.
        file_name = os.path.basename(urlparse(url).path)
        if not file_name:
            print(f"Nom de fichier introuvable dans l'URL : {url}")
            continue
        file_path = os.path.join(SAVE_FOLDER, file_name)
        if not os.path.exists(file_path):
            download_pdf(url, file_path)

    print("Chargement des PDFs...")
    docs = PyPDFDirectoryLoader(SAVE_FOLDER).load()

    print("Découpage des documents...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    chunks = splitter.split_documents(docs)
    print(f"{len(chunks)} morceaux extraits.")

    print("Initialisation du modèle d'embeddings...")
    embedding_model = HuggingFaceEmbeddings(model_name="shtilev/medical_embedded_v2")

    print("Connexion à MongoDB...")
    # The context manager closes the client on every exit path (cancel, error, success).
    with MongoClient(MONGODB_URI) as client:
        collection = client[DB_NAME][COLLECTION_NAME]

        confirm = input("Cette opération supprimera toutes les données existantes. Continuer? (o/n): ")
        if confirm.strip().lower() != 'o':
            print("Opération annulée.")
            return

        print("Génération des embeddings et insertion dans la base de données...")
        total = len(chunks)
        records = []
        for position, chunk in enumerate(chunks, start=1):
            text = chunk.page_content
            print(f"Traitement du morceau {position}/{total}")
            records.append({
                "text": text,
                "embedding": embedding_model.embed_query(text),
            })

        # Destructive step last: by now every embedding has been computed.
        print("Suppression des documents existants...")
        collection.delete_many({})
        # insert_many rejects an empty list, so skip it when nothing was extracted.
        if records:
            collection.insert_many(records)

    print("Tous les embeddings ont été insérés dans la base MongoDB.")
|
|
|
|
# Script entry point: rebuild the collection only when the file is executed
# directly, never as a side effect of importing it.
if __name__ == "__main__":
    init_documents()
|
|
''' |