import os
import json
import traceback
import re
import time
import random
from pathlib import Path
import tiktoken
import numpy as np
from PIL import Image # Pillow for image handling
import io # To handle image bytes
# Gemma imports
# Set the JAX/GPU memory fraction before JAX initializes its backend;
# setting it after the first JAX computation has no effect.
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "1.00"
import jax.numpy as jnp
from gemma import gm
# Sentence-Transformers for text embedding
from sentence_transformers import SentenceTransformer
# --- Configuration ---
# Set the desired Gemma model
GEMMA_MULTIMODAL_MODEL = "gemma-3-4b-it" # Other Gemma variants can be substituted if available and suitable
# Set the desired Sentence-Transformers model for text embeddings
# This is a good free, multilingual model.
SENTENCE_TRANSFORMER_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# The dimension of embeddings for this model
EMBEDDING_DIMENSION = 384 # MiniLM-L12-v2 produces 384-dimensional embeddings
MAX_TOKENS_NORMAL = 500
ENCODING_NAME = "cl100k_base" # Tokenizer used for token-based chunking
# Path configuration
BASE_DIR = Path("/content/") # Default for Colab environment
PDF_DIRECTORY = BASE_DIR / "docs"
OUTPUT_DIR = BASE_DIR / "output"
EMBEDDINGS_FILE_PATH = OUTPUT_DIR / "embeddings_statistiques_multimodal_gemma_st.json"
# Directory to save extracted images and tables HTML (within output)
IMAGE_SAVE_SUBDIR = "extracted_graphs"
TABLE_SAVE_SUBDIR = "extracted_tables"
IMAGE_SAVE_DIR = OUTPUT_DIR / IMAGE_SAVE_SUBDIR
TABLE_SAVE_DIR = OUTPUT_DIR / TABLE_SAVE_SUBDIR
# Global models
gemma_sampler = None
text_embedding_model = None
def initialize_models():
"""Initializes Gemma and Sentence-Transformers models."""
global gemma_sampler, text_embedding_model
print("✓ Initializing Gemma Multimodal Model...")
try:
model = gm.nn.Gemma3_4B() # Initialize Gemma model
# Load Gemma parameters
params = gm.ckpts.load_params(gm.ckpts.CheckpointPath.GEMMA3_4B_IT)
gemma_sampler = gm.text.ChatSampler(model=model, params=params)
print(f"✓ Gemma Multimodal Model '{GEMMA_MULTIMODAL_MODEL}' loaded successfully.")
except Exception as e:
print(f"❌ ERREUR: Échec du chargement du modèle multimodal Gemma : {str(e)}")
print("⚠️ La génération de descriptions multimodales échouera.")
gemma_sampler = None
print(f"✓ Initializing Sentence-Transformers Model '{SENTENCE_TRANSFORMER_MODEL}'...")
try:
text_embedding_model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
print(f"✓ Modèle d'embedding textuel Sentence-Transformers '{SENTENCE_TRANSFORMER_MODEL}' chargé avec succès.")
except Exception as e:
print(f"❌ ERREUR: Échec du chargement du modèle d'embedding textuel Sentence-Transformers : {str(e)}")
print("⚠️ La génération d'embeddings textuels échouera.")
text_embedding_model = None
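# Note (assumption): EMBEDDING_DIMENSION is hard-coded to match MiniLM-L12-v2.
# It can be verified at runtime with
# text_embedding_model.get_sentence_embedding_dimension(); a mismatch would
# silently drop every chunk in generate_text_embedding below.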
def clean_text(text):
"""Normalize whitespace and clean text while preserving paragraph breaks"""
if not text:
return ""
text = text.replace('\t', ' ')
text = re.sub(r' +', ' ', text)
text = re.sub(r'\n{3,}', '\n\n', text)
return text.strip()
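# Example: clean_text("a\t b\n\n\n\nc") returns "a b\n\nc" -- tabs become spaces,
# runs of spaces collapse, and 3+ consecutive newlines shrink to one paragraph break.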
# --- PDF Processing Functions (Mostly unchanged from previous version, but updated to use global paths) ---
import fitz # PyMuPDF
import camelot # For table extraction
import pandas as pd
from bs4 import BeautifulSoup
IMAGE_MIN_WIDTH = 100
IMAGE_MIN_HEIGHT = 100
def extract_page_data_pymupdf(pdf_path):
"""Extract text, tables and save images from each page using PyMuPDF and Camelot."""
page_data_list = []
try:
doc = fitz.open(pdf_path)
metadata = doc.metadata or {}
pdf_data = {
'pdf_title': metadata.get('title', pdf_path.name),
'pdf_subject': metadata.get('subject', 'Statistiques'),
'pdf_keywords': metadata.get('keywords', '')
}
for page_num in range(len(doc)):
page = doc.load_page(page_num)
page_index = page_num + 1 # 1-based index
print(f" Extraction des données de la page {page_index}...")
# Extract tables first
table_data = extract_tables_and_images_from_page(pdf_path, page, page_index)
# Track table regions to avoid double-processing text
table_regions = []
for item in table_data:
if 'rect' in item and item['rect'] and len(item['rect']) == 4:
table_regions.append(fitz.Rect(item['rect']))
else:
print(f" Warning: Invalid rect for table on page {page_index}")
# Extract text excluding table regions
page_text = ""
if table_regions:
blocks = page.get_text("blocks")
for block in blocks:
block_rect = fitz.Rect(block[:4])
is_in_table = False
for table_rect in table_regions:
if block_rect.intersects(table_rect):
is_in_table = True
break
if not is_in_table:
page_text += block[4] + "\n"
else:
page_text = page.get_text("text")
page_text = clean_text(page_text)
# Extract and save images (excluding those identified as tables)
image_data = extract_images_from_page(pdf_path, page, page_index, excluded_rects=table_regions)
page_data_list.append({
'pdf_file': pdf_path.name,
'page_number': page_index,
'text': page_text,
'images': image_data,
'tables': [item for item in table_data if item['content_type'] == 'table'],
'pdf_title': pdf_data.get('pdf_title'),
'pdf_subject': pdf_data.get('pdf_subject'),
'pdf_keywords': pdf_data.get('pdf_keywords')
})
doc.close()
except Exception as e:
print(f"Erreur lors du traitement du PDF {pdf_path.name} avec PyMuPDF : {str(e)}")
traceback.print_exc()
return page_data_list
def extract_tables_and_images_from_page(pdf_path, page, page_num):
"""Extract tables using Camelot and capture images of table areas."""
table_and_image_data = []
try:
tables = camelot.read_pdf(
str(pdf_path),
pages=str(page_num),
flavor='lattice',
)
if len(tables) == 0:
tables = camelot.read_pdf(
str(pdf_path),
pages=str(page_num),
flavor='stream'
)
for i, table in enumerate(tables):
if table.accuracy < 70:
print(f" Skipping low accuracy table ({table.accuracy:.2f}%) on page {page_num}")
continue
            # Camelot exposes the table bounding box via `_bbox`, in PDF
            # coordinates (origin bottom-left); `parsing_report` has no bbox
            # key. Convert to PyMuPDF's top-left origin before clipping.
            table_bbox = getattr(table, '_bbox', None)
            if not table_bbox or len(table_bbox) != 4:
                print(f"    Warning: Invalid bounding box for table {i} on page {page_num}. Skipping image capture.")
                table_rect = None
            else:
                x1, y1, x2, y2 = table_bbox
                page_height = page.rect.height
                table_rect = fitz.Rect(x1, page_height - max(y1, y2),
                                       x2, page_height - min(y1, y2))
safe_pdf_name = "".join(c if c.isalnum() else "_" for c in pdf_path.stem)
table_html_filename = f"{safe_pdf_name}_p{page_num}_table{i}.html"
table_html_save_path = TABLE_SAVE_DIR / table_html_filename
relative_html_url_path = f"/static/{TABLE_SAVE_SUBDIR}/{table_html_filename}"
table_image_filename = f"{safe_pdf_name}_p{page_num}_table{i}.png"
table_image_save_path = IMAGE_SAVE_DIR / table_image_filename
relative_image_url_path = f"/static/{IMAGE_SAVE_SUBDIR}/{table_image_filename}"
df = table.df
html = f"<caption>Table extrait de {pdf_path.name}, page {page_num}</caption>\n" + df.to_html(index=False)
soup = BeautifulSoup(html, 'html.parser')
table_tag = soup.find('table')
if table_tag:
table_tag['class'] = 'table table-bordered table-striped'
table_tag['style'] = 'width:100%; border-collapse:collapse;'
style_tag = soup.new_tag('style')
style_tag.string = """
.table { border-collapse: collapse; width: 100%; margin-bottom: 1rem;}
.table caption { caption-side: top; padding: 0.5rem; text-align: left; font-weight: bold; }
.table th, .table td { border: 1px solid #ddd; padding: 8px; text-align: left; }
.table th { background-color: #f2f2f2; font-weight: bold; }
.table-striped tbody tr:nth-of-type(odd) { background-color: rgba(0,0,0,.05); }
.table-responsive { overflow-x: auto; margin-bottom: 1rem; }
"""
soup.insert(0, style_tag)
div = soup.new_tag('div')
div['class'] = 'table-responsive'
table_tag.wrap(div)
with open(table_html_save_path, 'w', encoding='utf-8') as f:
f.write(str(soup))
else:
print(f" Warning: Could not find table tag in HTML for table on page {page_num}. Skipping HTML save.")
continue
table_image_bytes = None
if table_rect:
try:
pix = page.get_pixmap(clip=table_rect)
                    table_image_bytes = pix.tobytes("png")  # Pixmap.tobytes takes the format as `output`, not `format`
with open(table_image_save_path, "wb") as img_file:
img_file.write(table_image_bytes)
except Exception as img_capture_e:
print(f" Erreur lors de la capture d'image du tableau {i} page {page_num} : {img_capture_e}")
traceback.print_exc()
table_image_bytes = None
table_and_image_data.append({
'content_type': 'table',
'table_html_url': relative_html_url_path,
'table_text_representation': df.to_string(index=False),
'rect': [table_rect.x0, table_rect.y0, table_rect.x1, table_rect.y1] if table_rect else None,
'accuracy': table.accuracy,
'image_bytes': table_image_bytes,
'image_url': relative_image_url_path if table_image_bytes else None
})
return table_and_image_data
except Exception as e:
print(f" Erreur lors de l'extraction des tableaux de la page {page_num} : {str(e)}")
traceback.print_exc()
return []
def extract_images_from_page(pdf_path, page, page_num, excluded_rects=None):
    """Extract and save images from a page, excluding specified regions (like tables)."""
    excluded_rects = excluded_rects or []  # Avoid a mutable default argument
    image_data = []
image_list = page.get_images(full=True)
for img_index, img_info in enumerate(image_list):
xref = img_info[0]
try:
base_image = page.parent.extract_image(xref)
image_bytes = base_image["image"]
image_ext = base_image["ext"]
width = base_image["width"]
height = base_image["height"]
if width < IMAGE_MIN_WIDTH or height < IMAGE_MIN_HEIGHT:
continue
img_rect = None
img_rects = page.get_image_rects(xref)
if img_rects:
img_rect = img_rects[0]
if img_rect is None:
print(f" Warning: Could not find rectangle for image {img_index} on page {page_num}. Skipping.")
continue
is_excluded = False
for excluded_rect in excluded_rects:
if img_rect.intersects(excluded_rect):
is_excluded = True
break
if is_excluded:
print(f" Image {img_index} on page {page_num} is within an excluded region (e.g., table). Skipping.")
continue
safe_pdf_name = "".join(c if c.isalnum() else "_" for c in pdf_path.stem)
image_filename = f"{safe_pdf_name}_p{page_num}_img{img_index}.{image_ext}"
image_save_path = IMAGE_SAVE_DIR / image_filename
relative_url_path = f"/static/{IMAGE_SAVE_SUBDIR}/{image_filename}"
with open(image_save_path, "wb") as img_file:
img_file.write(image_bytes)
image_data.append({
'content_type': 'image',
'image_url': relative_url_path,
'rect': [img_rect.x0, img_rect.y0, img_rect.x1, img_rect.y1],
'image_bytes': image_bytes
})
except Exception as img_save_e:
print(f" Erreur lors du traitement de l'image {img_index} de la page {page_num} : {img_save_e}")
traceback.print_exc()
return image_data
# --- Embedding and Description Generation Functions (Modified for Gemma and Sentence-Transformers) ---
def token_chunking(text, max_tokens, encoding):
"""Chunk text based on token count with smarter boundaries (sentences, paragraphs)"""
if not text:
return []
tokens = encoding.encode(text)
chunks = []
start_token_idx = 0
while start_token_idx < len(tokens):
end_token_idx = min(start_token_idx + max_tokens, len(tokens))
        if end_token_idx < len(tokens):
            # Look for a natural boundary inside the current window, preferring
            # a paragraph break, then the last sentence-ending punctuation.
            # (The original mixed token offsets with character offsets.)
            segment = encoding.decode(tokens[start_token_idx:end_token_idx])
            paragraph_break = segment.rfind('\n\n')
            if paragraph_break > 0:
                end_token_idx = start_token_idx + len(encoding.encode(segment[:paragraph_break]))
            else:
                sentence_ends = list(re.finditer(r'[.!?]\s', segment))
                if sentence_ends:
                    end_token_idx = start_token_idx + len(encoding.encode(segment[:sentence_ends[-1].end()]))
current_chunk_tokens = tokens[start_token_idx:end_token_idx]
chunk_text = encoding.decode(current_chunk_tokens).strip()
if chunk_text:
chunks.append(chunk_text)
if start_token_idx == end_token_idx:
start_token_idx += 1
else:
start_token_idx = end_token_idx
return chunks
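# A minimal, optional sanity check (not called by the pipeline) showing how
# token_chunking behaves; the sample text is illustrative only.
def _demo_token_chunking():
    enc = tiktoken.get_encoding(ENCODING_NAME)
    sample = "Première phrase. Deuxième phrase.\n\n" * 200
    chunks = token_chunking(sample, MAX_TOKENS_NORMAL, enc)
    sizes = [len(enc.encode(c)) for c in chunks]
    print(f"{len(chunks)} chunks, largest is {max(sizes)} tokens (budget: {MAX_TOKENS_NORMAL}).")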
def generate_multimodal_description(image_bytes, prompt_text, max_retries=5, delay=10):
"""
Generate a text description for an image using the Gemma multimodal model.
Returns description text or None if all retries fail or model is not initialized.
"""
global gemma_sampler
if gemma_sampler is None:
print(" Skipping multimodal description generation: Gemma sampler is not initialized.")
return None
    # Convert image bytes to an RGB array for Gemma
    try:
        pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        # The gemma ChatSampler accepts a single (H, W, C) uint8 image array
        gemma_image_input = jnp.asarray(np.asarray(pil_image))
    except Exception as e:
        print(f"    Error converting the image for Gemma: {e}")
        return None
for attempt in range(max_retries):
try:
            # Local inference: no pre-call rate-limit delay is needed; the
            # backoff below only applies between failed attempts.
            # The prompt must contain the <start_of_image> special token where
            # the image is to be inserted.
            full_prompt = f"{prompt_text} <start_of_image>"
            out = gemma_sampler.chat(
                full_prompt,
                images=gemma_image_input,
                max_tokens=500  # Limit response length
            )
description = out.strip()
if description:
return description
else:
print(f" Tentative {attempt+1}/{max_retries}: Réponse vide ou inattendue du modèle multimodal Gemma.")
if attempt < max_retries - 1:
retry_delay = delay * (2 ** attempt) + random.uniform(1, 5)
print(f" Réessai dans {retry_delay:.2f}s...")
time.sleep(retry_delay)
continue
except Exception as e:
error_msg = str(e)
print(f" Tentative {attempt+1}/{max_retries} échouée pour la description (Gemma) : {error_msg}")
# Gemma is local, so no API errors like 429. Focus on general errors.
if attempt < max_retries - 1:
retry_delay = delay * (2 ** attempt) + random.uniform(1, 5)
print(f" Réessai dans {retry_delay:.2f}s...")
time.sleep(retry_delay)
continue
else:
print(f" Toutes les {max_retries} tentatives ont échoué pour la description Gemma.")
return None
print(f" Toutes les {max_retries} tentatives ont échoué pour la description (fin de boucle).")
return None
def generate_text_embedding(text_content, max_retries=5, delay=5):
"""
Generate text embedding using the Sentence-Transformers model.
Returns embedding vector (list) or None if all retries fail or model is not initialized.
"""
global text_embedding_model
if text_embedding_model is None:
print(" Skipping text embedding generation: Sentence-Transformers model is not initialized.")
return None
if not text_content or not text_content.strip():
return None # Cannot embed empty text
for attempt in range(max_retries):
try:
            # The model runs locally, so no pre-call delay is needed; the
            # backoff below only applies between failed attempts.
            embedding = text_embedding_model.encode(text_content, convert_to_numpy=True)
if embedding is not None and len(embedding) == EMBEDDING_DIMENSION:
return embedding.tolist() # Convert numpy array to list for JSON serialization
else:
print(f" Tentative {attempt+1}/{max_retries}: Format d'embedding Sentence-Transformers inattendu. Réponse : {embedding}")
return None
except Exception as e:
error_msg = str(e)
print(f" Tentative {attempt+1}/{max_retries} échouée pour l'embedding (Sentence-Transformers) : {error_msg}")
if attempt < max_retries - 1:
retry_delay = delay * (2 ** attempt) + random.uniform(0.5, 2)
print(f" Réessai dans {retry_delay:.2f}s...")
time.sleep(retry_delay)
continue
else:
print(f" Toutes les {max_retries} tentatives ont échoué pour l'embedding (Sentence-Transformers).")
return None
print(f" Toutes les {max_retries} tentatives ont échoué pour l'embedding (fin de boucle).")
return None
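# Minimal sketch of how the stored vectors can be compared downstream (e.g. for
# retrieval); not used by this script, and the epsilon guard is an assumption.
def _cosine_similarity(a, b):
    a = np.asarray(a, dtype=np.float32)
    b = np.asarray(b, dtype=np.float32)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))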
# --- Main Processing Function ---
def process_pdfs_in_directory(directory):
"""Main processing pipeline for all PDFs in a directory."""
all_embeddings_data = []
processed_files = 0
pdf_files = list(directory.glob("*.pdf"))
total_files = len(pdf_files)
if total_files == 0:
print(f"Aucun fichier PDF trouvé dans le répertoire : {directory}")
return []
for pdf_file_path in pdf_files:
processed_files += 1
print(f"\nTraitement de {pdf_file_path.name} ({processed_files}/{total_files})...")
page_data_list = extract_page_data_pymupdf(pdf_file_path)
if not page_data_list:
print(f" Aucune donnée extraite de {pdf_file_path.name}.")
continue
for page_data in page_data_list:
pdf_file = page_data['pdf_file']
page_num = page_data['page_number']
page_text = page_data['text']
images = page_data['images']
tables = page_data['tables']
pdf_title = page_data.get('pdf_title')
pdf_subject = page_data.get('pdf_subject')
pdf_keywords = page_data.get('pdf_keywords')
print(f" Génération des descriptions et embeddings pour la page {page_num}...")
# Process tables: Generate description and then embedding
for table_idx, table in enumerate(tables):
table_image_bytes = table.get('image_bytes')
table_text_repr = table.get('table_text_representation', '')
table_html_url = table.get('table_html_url')
description = None
if table_image_bytes:
prompt = "Décrivez en français le contenu et la structure de ce tableau. Mettez l'accent sur les données principales et les tendances si visibles."
print(f" Page {page_num}: Génération de la description multimodale pour le tableau {table_idx}...")
description = generate_multimodal_description(table_image_bytes, prompt)
elif table_text_repr:
# Fallback for text-only table description, using Gemma's text capabilities
if gemma_sampler:
prompt = f"Décrivez en français le contenu et la structure de ce tableau basé sur sa représentation textuelle:\n{table_text_repr[:1000]}..."
print(f" Page {page_num}: Génération de la description textuelle pour le tableau {table_idx} (fallback via Gemma)...")
try:
# Gemma text-only generation
out = gemma_sampler.chat(prompt, max_tokens=500)
description = out.strip()
except Exception as e:
print(f" Erreur lors de la génération de description textuelle pour le tableau {table_idx} via Gemma: {e}")
description = None
else:
print(" Skipping text description generation for table: Gemma sampler not initialized.")
description = None
if description:
print(f" Page {page_num}: Description générée pour le tableau {table_idx}.")
embedding_vector = generate_text_embedding(description)
if embedding_vector is not None:
chunk_data = {
"pdf_file": pdf_file,
"page_number": page_num,
"chunk_id": f"table_{table_idx}",
"content_type": "table",
"text_content": description,
"embedding": embedding_vector,
"table_html_url": table_html_url,
"image_url": table.get('image_url'),
"pdf_title": pdf_title,
"pdf_subject": pdf_subject,
"pdf_keywords": pdf_keywords
}
all_embeddings_data.append(chunk_data)
print(f" Page {page_num}: Embedding généré pour la description du tableau {table_idx}.")
else:
print(f" Page {page_num}: Échec de la génération de l'embedding pour la description du tableau {table_idx}. Chunk ignoré.")
else:
print(f" Page {page_num}: Aucune description générée pour le tableau {table_idx}. Chunk ignoré.")
# Process images (non-table): Generate description and then embedding
for img_idx, image in enumerate(images):
image_bytes = image.get('image_bytes')
image_url = image.get('image_url')
if image_bytes:
prompt = "Décrivez en français le contenu de cette image. S'il s'agit d'un graphique, décrivez le type de graphique (histogramme, courbe, etc.), les axes, les légendes et les principales informations ou tendances visibles."
print(f" Page {page_num}: Génération de la description multimodale pour l'image {img_idx}...")
description = generate_multimodal_description(image_bytes, prompt)
if description:
print(f" Page {page_num}: Description générée pour l'image {img_idx}.")
embedding_vector = generate_text_embedding(description)
if embedding_vector is not None:
chunk_data = {
"pdf_file": pdf_file,
"page_number": page_num,
"chunk_id": f"image_{img_idx}",
"content_type": "image",
"text_content": description,
"embedding": embedding_vector,
"image_url": image_url,
"pdf_title": pdf_title,
"pdf_subject": pdf_subject,
"pdf_keywords": pdf_keywords
}
all_embeddings_data.append(chunk_data)
print(f" Page {page_num}: Embedding généré pour la description de l'image {img_idx}.")
else:
print(f" Page {page_num}: Échec de la génération de l'embedding pour la description de l'image {img_idx}. Chunk ignoré.")
else:
print(f" Page {page_num}: Aucune description générée pour l'image {img_idx}. Chunk ignoré.")
# Process regular text: Chunk and then generate embeddings
            text_chunks = []  # Initialized here so the page summary below works even when the page has no text
            if page_text:
                try:
                    encoding = tiktoken.get_encoding(ENCODING_NAME)
                    text_chunks = token_chunking(page_text, MAX_TOKENS_NORMAL, encoding)
                except Exception as e:
                    print(f"    Error chunking the text of page {page_num}: {e}. Falling back to a single chunk.")
                    text_chunks = [page_text]
for chunk_idx, chunk_content in enumerate(text_chunks):
print(f" Page {page_num}: Génération de l'embedding pour le chunk de texte {chunk_idx}...")
embedding_vector = generate_text_embedding(chunk_content)
if embedding_vector is not None:
chunk_data = {
"pdf_file": pdf_file,
"page_number": page_num,
"chunk_id": f"text_{chunk_idx}",
"content_type": "text",
"text_content": chunk_content,
"embedding": embedding_vector,
"pdf_title": pdf_title,
"pdf_subject": pdf_subject,
"pdf_keywords": pdf_keywords
}
all_embeddings_data.append(chunk_data)
print(f" Page {page_num}: Chunk de texte {chunk_idx} traité avec succès.")
else:
print(f" Page {page_num}: Échec de la génération de l'embedding pour le chunk de texte {chunk_idx}. Chunk ignoré.")
print(f" Page {page_num} terminée. Éléments traités : {len(tables)} tableaux, {len(images)} images, {len(text_chunks)} chunks de texte.")
return all_embeddings_data
# --- Main Execution ---
if __name__ == "__main__":
print("Démarrage du traitement PDF multimodal avec génération de descriptions (Gemma) et embeddings textuels multilingues (Sentence-Transformers)...")
# Validate and create directories
if not PDF_DIRECTORY.is_dir():
print(f"❌ ERREUR: Répertoire PDF non trouvé ou n'est pas un répertoire : {PDF_DIRECTORY}. Veuillez créer un répertoire 'docs' et y placer vos PDFs.")
# Create it if it doesn't exist, for example PDF_DIRECTORY.mkdir(parents=True, exist_ok=True)
# But for Colab, it's often better to instruct user to upload.
exit(1)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
IMAGE_SAVE_DIR.mkdir(parents=True, exist_ok=True)
TABLE_SAVE_DIR.mkdir(parents=True, exist_ok=True)
print(f"Répertoire de sortie : {OUTPUT_DIR}")
print(f"Répertoire de sauvegarde des images : {IMAGE_SAVE_DIR}")
print(f"Répertoire de sauvegarde des tableaux (HTML) : {TABLE_SAVE_DIR}")
# Initialize Gemma and Sentence-Transformers models
initialize_models()
# If models failed to initialize, exit
if gemma_sampler is None or text_embedding_model is None:
print("Impossible de continuer car un ou plusieurs modèles n'ont pas pu être initialisés.")
exit(1)
final_embeddings = process_pdfs_in_directory(PDF_DIRECTORY)
if final_embeddings:
print(f"\nTotal d'embeddings générés : {len(final_embeddings)}.")
try:
with EMBEDDINGS_FILE_PATH.open('w', encoding='utf-8') as f:
json.dump(final_embeddings, f, indent=2, ensure_ascii=False)
print(f"Embeddings sauvegardés avec succès dans : {EMBEDDINGS_FILE_PATH}")
except Exception as e:
print(f"\nErreur lors de la sauvegarde du fichier JSON d'embeddings : {e}")
traceback.print_exc()
else:
print("\nAucun embedding n'a été généré.")