Anonymous1223334444

Update

2721ce7 3 months ago

30.8 kB

	import os
	import json
	import traceback
	import re
	import time
	import random
	from pathlib import Path
	import tiktoken
	import numpy as np
	from PIL import Image # Pillow for image handling
	import io # To handle image bytes

	# Gemma imports
	import jax.numpy as jnp
	# JAX/XLA GPU memory setup. The variable is set before `gemma` is imported on
	# the next line.
	# NOTE(review): per the JAX GPU memory allocation docs,
	# XLA_PYTHON_CLIENT_MEM_FRACTION is the share of GPU memory JAX preallocates;
	# "1.00" asks for all of it — confirm nothing else needs the same GPU.
	os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"]="1.00"
	from gemma import gm

	# Sentence-Transformers for text embedding
	from sentence_transformers import SentenceTransformer


	# --- Configuration ---
	# Display label for the Gemma model. In this file it is only interpolated into
	# a log message by initialize_models(); the model class and checkpoint that are
	# actually loaded are hard-coded there (Gemma3_4B / GEMMA3_4B_IT), so editing
	# this string alone does not switch models.
	# NOTE(review): the label reads "gemma-3.4b-it"; presumably "gemma-3-4b-it"
	# was meant — confirm before using it for anything other than logging.
	GEMMA_MULTIMODAL_MODEL = "gemma-3.4b-it"

	# Sentence-Transformers model used to embed text chunks and generated
	# descriptions.
	SENTENCE_TRANSFORMER_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
	# Expected embedding length; generate_text_embedding() rejects any vector
	# whose length differs from this value.
	EMBEDDING_DIMENSION = 384


	# Upper bound, in tokens, passed to token_chunking() for page text.
	MAX_TOKENS_NORMAL = 500
	# Name of the tiktoken encoding used to count tokens while chunking.
	ENCODING_NAME = "cl100k_base"

	# Path configuration. The "/content/" base is the default for a Colab
	# environment (per the original comment).
	BASE_DIR = Path("/content/")
	PDF_DIRECTORY = BASE_DIR / "docs"  # input PDFs are globbed from here
	OUTPUT_DIR = BASE_DIR / "output"
	EMBEDDINGS_FILE_PATH = OUTPUT_DIR / "embeddings_statistiques_multimodal_gemma_st.json"

	# Sub-directories of OUTPUT_DIR for extracted images and table HTML. The
	# sub-directory names are also embedded in the "/static/<subdir>/<file>" URLs
	# stored in the output records.
	IMAGE_SAVE_SUBDIR = "extracted_graphs"
	TABLE_SAVE_SUBDIR = "extracted_tables"
	IMAGE_SAVE_DIR = OUTPUT_DIR / IMAGE_SAVE_SUBDIR
	TABLE_SAVE_DIR = OUTPUT_DIR / TABLE_SAVE_SUBDIR


	# Global model handles. They stay None until initialize_models() succeeds;
	# the generation functions check for None and skip their work in that case.
	gemma_sampler = None
	text_embedding_model = None

	def initialize_models():
	    """Load the Gemma chat sampler and the Sentence-Transformers encoder.

	    The loaded objects are stored in the module-level ``gemma_sampler`` and
	    ``text_embedding_model`` globals. The two loads are independent: a
	    failure is reported on stdout and leaves the corresponding global set
	    to ``None`` instead of raising.
	    """
	    global gemma_sampler, text_embedding_model

	    # --- Gemma (multimodal description model) ---
	    print("✓ Initializing Gemma Multimodal Model...")
	    try:
	        # Keyword arguments are evaluated left to right: the model is built
	        # first, then the checkpoint parameters are loaded.
	        gemma_sampler = gm.text.ChatSampler(
	            model=gm.nn.Gemma3_4B(),
	            params=gm.ckpts.load_params(gm.ckpts.CheckpointPath.GEMMA3_4B_IT),
	        )
	        print(f"✓ Gemma Multimodal Model '{GEMMA_MULTIMODAL_MODEL}' loaded successfully.")
	    except Exception as load_error:
	        gemma_sampler = None
	        print(f"❌ ERREUR: Échec du chargement du modèle multimodal Gemma : {load_error!s}")
	        print("⚠️ La génération de descriptions multimodales échouera.")

	    # --- Sentence-Transformers (text embedding model) ---
	    print(f"✓ Initializing Sentence-Transformers Model '{SENTENCE_TRANSFORMER_MODEL}'...")
	    try:
	        text_embedding_model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
	        print(f"✓ Modèle d'embedding textuel Sentence-Transformers '{SENTENCE_TRANSFORMER_MODEL}' chargé avec succès.")
	    except Exception as load_error:
	        text_embedding_model = None
	        print(f"❌ ERREUR: Échec du chargement du modèle d'embedding textuel Sentence-Transformers : {load_error!s}")
	        print("⚠️ La génération d'embeddings textuels échouera.")


	def clean_text(text):
	    """Normalize spacing in *text* while keeping paragraph breaks.

	    Tabs become spaces, runs of spaces collapse to a single space, and runs
	    of three or more newlines collapse to one blank line. Leading and
	    trailing whitespace is stripped. Falsy input (``None`` or ``""``)
	    yields ``""``.
	    """
	    if text:
	        single_spaced = re.sub(r' +', ' ', text.replace('\t', ' '))
	        return re.sub(r'\n{3,}', '\n\n', single_spaced).strip()
	    return ""

	# --- PDF Processing Functions (Mostly unchanged from previous version, but updated to use global paths) ---
	import fitz # PyMuPDF
	import camelot # For table extraction
	import pandas as pd
	from bs4 import BeautifulSoup

	# Minimum size, as reported by PyMuPDF's extract_image(), that an embedded
	# image must have; extract_images_from_page() skips anything narrower or
	# shorter than these values.
	IMAGE_MIN_WIDTH = 100
	IMAGE_MIN_HEIGHT = 100

	def extract_page_data_pymupdf(pdf_path):
	    """Extract text, tables and images from every page of one PDF.

	    Args:
	        pdf_path: ``pathlib.Path`` of the PDF (``.name`` is used in the
	            output records and log messages).

	    Returns:
	        A list with one dict per successfully processed page, holding the
	        keys ``pdf_file``, ``page_number`` (1-based), ``text``, ``images``,
	        ``tables``, ``pdf_title``, ``pdf_subject`` and ``pdf_keywords``.
	        If an error occurs it is printed with its traceback and the pages
	        collected so far are returned; the function does not raise.
	    """
	    page_data_list = []
	    try:
	        # The context manager closes the document even when a page raises;
	        # the original only closed it on the success path.
	        with fitz.open(pdf_path) as doc:
	            metadata = doc.metadata or {}
	            # `or` (not a .get() default) so that a key that is present but
	            # empty/None still falls back to the default value.
	            pdf_title = metadata.get('title') or pdf_path.name
	            pdf_subject = metadata.get('subject') or 'Statistiques'
	            pdf_keywords = metadata.get('keywords') or ''

	            for page_num in range(len(doc)):
	                page = doc.load_page(page_num)
	                page_index = page_num + 1  # 1-based page number

	                print(f" Extraction des données de la page {page_index}...")

	                # Tables first, so their regions can be excluded below.
	                table_data = extract_tables_and_images_from_page(pdf_path, page, page_index)

	                table_regions = []
	                for item in table_data:
	                    rect = item.get('rect')
	                    if rect and len(rect) == 4:
	                        table_regions.append(fitz.Rect(rect))
	                    else:
	                        print(f" Warning: Invalid rect for table on page {page_index}")

	                if table_regions:
	                    # Keep only the text blocks that do not touch any table.
	                    kept_blocks = [
	                        block[4]
	                        for block in page.get_text("blocks")
	                        if not any(
	                            fitz.Rect(block[:4]).intersects(region)
	                            for region in table_regions
	                        )
	                    ]
	                    page_text = "".join(block_text + "\n" for block_text in kept_blocks)
	                else:
	                    page_text = page.get_text("text")

	                page_text = clean_text(page_text)

	                # Images overlapping a table region are skipped by the helper.
	                image_data = extract_images_from_page(
	                    pdf_path, page, page_index, excluded_rects=table_regions
	                )

	                page_data_list.append({
	                    'pdf_file': pdf_path.name,
	                    'page_number': page_index,
	                    'text': page_text,
	                    'images': image_data,
	                    'tables': [item for item in table_data if item['content_type'] == 'table'],
	                    'pdf_title': pdf_title,
	                    'pdf_subject': pdf_subject,
	                    'pdf_keywords': pdf_keywords
	                })
	    except Exception as e:
	        print(f"Erreur lors du traitement du PDF {pdf_path.name} avec PyMuPDF : {str(e)}")
	        traceback.print_exc()
	    return page_data_list


	# Stylesheet embedded in every saved table HTML file.
	_TABLE_CSS = """
	.table { border-collapse: collapse; width: 100%; margin-bottom: 1rem;}
	.table caption { caption-side: top; padding: 0.5rem; text-align: left; font-weight: bold; }
	.table th, .table td { border: 1px solid #ddd; padding: 8px; text-align: left; }
	.table th { background-color: #f2f2f2; font-weight: bold; }
	.table-striped tbody tr:nth-of-type(odd) { background-color: rgba(0,0,0,.05); }
	.table-responsive { overflow-x: auto; margin-bottom: 1rem; }
	"""


	def _table_rect_on_page(table, page):
	    """Return the table's bounding box as a ``fitz.Rect``, or ``None`` if unknown."""
	    bbox = table.parsing_report.get('page_bbox')
	    if bbox and len(bbox) == 4:
	        # Legacy key read by the original code; used as-is when present.
	        rect = fitz.Rect(bbox)
	    else:
	        # NOTE(review): Camelot's documented parsing_report holds only
	        # accuracy/whitespace/order/page, so the lookup above normally finds
	        # nothing. The bounding box is taken from the private `_bbox`
	        # attribute (x0, y0, x1, y1) instead — confirm against the installed
	        # Camelot version.
	        raw_bbox = getattr(table, '_bbox', None)
	        if not raw_bbox or len(raw_bbox) != 4:
	            return None
	        x0, y0, x1, y1 = raw_bbox
	        # Camelot reports PDF coordinates (origin bottom-left); PyMuPDF uses a
	        # top-left origin, so the y axis is flipped using the page height.
	        # Assumes an unrotated page whose origin is (0, 0) — TODO confirm.
	        page_height = page.rect.height
	        rect = fitz.Rect(x0, page_height - max(y0, y1), x1, page_height - min(y0, y1))
	    rect.normalize()
	    return None if rect.is_empty else rect


	def extract_tables_and_images_from_page(pdf_path, page, page_num):
	    """Extract the tables of one page with Camelot and snapshot each table area.

	    Camelot is run with the ``lattice`` flavor first and with ``stream`` when
	    lattice finds nothing. Tables whose accuracy is below 70 are skipped.
	    For each kept table an HTML file is written to ``TABLE_SAVE_DIR`` and,
	    when a bounding box is available, a PNG of the table area is written to
	    ``IMAGE_SAVE_DIR``.

	    Args:
	        pdf_path: ``pathlib.Path`` of the PDF being processed.
	        page: the PyMuPDF page object for ``page_num``.
	        page_num: 1-based page number (Camelot's page numbering).

	    Returns:
	        A list of dicts with the keys ``content_type`` (always ``'table'``),
	        ``table_html_url``, ``table_text_representation``, ``rect`` (list of
	        four floats or ``None``), ``accuracy``, ``image_bytes`` and
	        ``image_url``. On any error the error is printed and ``[]`` is
	        returned.
	    """
	    table_and_image_data = []
	    try:
	        tables = camelot.read_pdf(
	            str(pdf_path),
	            pages=str(page_num),
	            flavor='lattice',
	        )

	        if len(tables) == 0:
	            tables = camelot.read_pdf(
	                str(pdf_path),
	                pages=str(page_num),
	                flavor='stream'
	            )

	        safe_pdf_name = "".join(c if c.isalnum() else "_" for c in pdf_path.stem)

	        for i, table in enumerate(tables):
	            if table.accuracy < 70:
	                print(f" Skipping low accuracy table ({table.accuracy:.2f}%) on page {page_num}")
	                continue

	            table_rect = _table_rect_on_page(table, page)
	            if table_rect is None:
	                print(f" Warning: Invalid bounding box for table {i} on page {page_num}. Skipping image capture.")

	            table_html_filename = f"{safe_pdf_name}_p{page_num}_table{i}.html"
	            table_html_save_path = TABLE_SAVE_DIR / table_html_filename
	            relative_html_url_path = f"/static/{TABLE_SAVE_SUBDIR}/{table_html_filename}"

	            table_image_filename = f"{safe_pdf_name}_p{page_num}_table{i}.png"
	            table_image_save_path = IMAGE_SAVE_DIR / table_image_filename
	            relative_image_url_path = f"/static/{IMAGE_SAVE_SUBDIR}/{table_image_filename}"

	            df = table.df
	            soup = BeautifulSoup(df.to_html(index=False), 'html.parser')
	            table_tag = soup.find('table')
	            if table_tag is None:
	                print(f" Warning: Could not find table tag in HTML for table on page {page_num}. Skipping HTML save.")
	                continue

	            # <caption> must be a child of <table>; building it as a tag (not
	            # by string concatenation) also escapes the file name.
	            caption_tag = soup.new_tag('caption')
	            caption_tag.string = f"Table extrait de {pdf_path.name}, page {page_num}"
	            table_tag.insert(0, caption_tag)

	            table_tag['class'] = 'table table-bordered table-striped'
	            table_tag['style'] = 'width:100%; border-collapse:collapse;'

	            style_tag = soup.new_tag('style')
	            style_tag.string = _TABLE_CSS
	            soup.insert(0, style_tag)

	            div = soup.new_tag('div')
	            div['class'] = 'table-responsive'
	            table_tag.wrap(div)

	            with open(table_html_save_path, 'w', encoding='utf-8') as f:
	                f.write(str(soup))

	            table_image_bytes = None
	            if table_rect is not None:
	                try:
	                    pix = page.get_pixmap(clip=table_rect)
	                    # Output format passed positionally: the original keyword
	                    # `format=` is not the parameter's name in PyMuPDF.
	                    table_image_bytes = pix.tobytes("png")

	                    with open(table_image_save_path, "wb") as img_file:
	                        img_file.write(table_image_bytes)

	                except Exception as img_capture_e:
	                    print(f" Erreur lors de la capture d'image du tableau {i} page {page_num} : {img_capture_e}")
	                    traceback.print_exc()
	                    table_image_bytes = None

	            table_and_image_data.append({
	                'content_type': 'table',
	                'table_html_url': relative_html_url_path,
	                'table_text_representation': df.to_string(index=False),
	                'rect': [table_rect.x0, table_rect.y0, table_rect.x1, table_rect.y1] if table_rect is not None else None,
	                'accuracy': table.accuracy,
	                'image_bytes': table_image_bytes,
	                'image_url': relative_image_url_path if table_image_bytes else None
	            })

	        return table_and_image_data

	    except Exception as e:
	        print(f" Erreur lors de l'extraction des tableaux de la page {page_num} : {str(e)}")
	        traceback.print_exc()
	        return []


	def extract_images_from_page(pdf_path, page, page_num, excluded_rects=None):
	    """Extract and save the embedded images of one page.

	    Images smaller than ``IMAGE_MIN_WIDTH`` x ``IMAGE_MIN_HEIGHT``, images
	    whose position on the page cannot be determined, and images that
	    intersect one of ``excluded_rects`` are skipped. Each kept image is
	    written to ``IMAGE_SAVE_DIR``.

	    Args:
	        pdf_path: ``pathlib.Path`` of the PDF (``.stem`` is used to build
	            file names).
	        page: the PyMuPDF page object.
	        page_num: page number used in file names and log messages.
	        excluded_rects: optional iterable of ``fitz.Rect`` regions (e.g.
	            tables) whose overlapping images are ignored. ``None`` (the
	            default) means no exclusion.

	    Returns:
	        A list of dicts with the keys ``content_type`` (``'image'``),
	        ``image_url``, ``rect`` and ``image_bytes``. Errors on a single
	        image are printed and that image is skipped.
	    """
	    # None instead of a mutable `[]` default, which would be shared by calls.
	    excluded = list(excluded_rects) if excluded_rects else []
	    image_data = []

	    for img_index, img_info in enumerate(page.get_images(full=True)):
	        xref = img_info[0]
	        try:
	            base_image = page.parent.extract_image(xref)
	            image_bytes = base_image["image"]
	            image_ext = base_image["ext"]

	            if base_image["width"] < IMAGE_MIN_WIDTH or base_image["height"] < IMAGE_MIN_HEIGHT:
	                continue

	            # Only the first placement of the image on the page is considered.
	            img_rects = page.get_image_rects(xref)
	            if not img_rects:
	                print(f" Warning: Could not find rectangle for image {img_index} on page {page_num}. Skipping.")
	                continue
	            img_rect = img_rects[0]

	            if any(img_rect.intersects(region) for region in excluded):
	                print(f" Image {img_index} on page {page_num} is within an excluded region (e.g., table). Skipping.")
	                continue

	            safe_pdf_name = "".join(c if c.isalnum() else "_" for c in pdf_path.stem)
	            image_filename = f"{safe_pdf_name}_p{page_num}_img{img_index}.{image_ext}"
	            image_save_path = IMAGE_SAVE_DIR / image_filename
	            relative_url_path = f"/static/{IMAGE_SAVE_SUBDIR}/{image_filename}"

	            with open(image_save_path, "wb") as img_file:
	                img_file.write(image_bytes)

	            image_data.append({
	                'content_type': 'image',
	                'image_url': relative_url_path,
	                'rect': [img_rect.x0, img_rect.y0, img_rect.x1, img_rect.y1],
	                'image_bytes': image_bytes
	            })

	        except Exception as img_save_e:
	            print(f" Erreur lors du traitement de l'image {img_index} de la page {page_num} : {img_save_e}")
	            traceback.print_exc()

	    return image_data

	# --- Embedding and Description Generation Functions (Modified for Gemma and Sentence-Transformers) ---

	def _last_soft_boundary(window_text):
	    """Return the offset just past the last paragraph or sentence break, or 0."""
	    paragraph_break = window_text.rfind('\n\n')
	    if paragraph_break > 0:
	        return paragraph_break + 2
	    cut = 0
	    for match in re.finditer(r'[.!?]\s+', window_text):
	        cut = match.end()
	    return cut


	def token_chunking(text, max_tokens, encoding):
	    """Split *text* into chunks of at most ``max_tokens`` tokens.

	    Each chunk is cut at the last paragraph break (``"\\n\\n"``) inside its
	    token window, or failing that at the last sentence end (``.``, ``!`` or
	    ``?`` followed by whitespace). When the window has neither, the chunk is
	    cut at the token limit.

	    Args:
	        text: the text to split; falsy input yields ``[]``.
	        max_tokens: maximum number of tokens per chunk (must be >= 1).
	        encoding: object with ``encode(str) -> list`` and
	            ``decode(list) -> str`` (a tiktoken encoding in this file).

	    Returns:
	        The list of non-empty, whitespace-stripped chunk strings, in order.

	    Raises:
	        ValueError: if ``max_tokens`` is smaller than 1.
	    """
	    if not text:
	        return []
	    if max_tokens < 1:
	        raise ValueError("max_tokens must be at least 1")

	    tokens = encoding.encode(text)
	    total = len(tokens)
	    chunks = []
	    start = 0

	    while start < total:
	        end = min(start + max_tokens, total)

	        if end < total:
	            # Look for a boundary only inside the window that fits the token
	            # budget, so a chunk can never grow past max_tokens.
	            window_text = encoding.decode(tokens[start:end])
	            cut = _last_soft_boundary(window_text)
	            if cut:
	                prefix_tokens = len(encoding.encode(window_text[:cut]))
	                # Re-encoding a prefix may tokenize differently from the
	                # original stream; accept the cut only if it stays inside
	                # the window and makes progress.
	                if 0 < prefix_tokens <= end - start:
	                    end = start + prefix_tokens

	        piece = encoding.decode(tokens[start:end]).strip()
	        if piece:
	            chunks.append(piece)

	        start = end  # end > start always holds because max_tokens >= 1

	    return chunks


	def generate_multimodal_description(image_bytes, prompt_text, max_retries=5, delay=10):
	    """Generate a text description of an image with the Gemma chat sampler.

	    Args:
	        image_bytes: encoded image data (anything PIL can open).
	        prompt_text: instruction sent to the model; the image placeholder is
	            appended to it.
	        max_retries: maximum number of generation attempts.
	        delay: base delay in seconds for the exponential backoff applied
	            between failed attempts (no delay before the first attempt).

	    Returns:
	        The stripped description, or ``None`` when the sampler is not
	        initialized, the image cannot be decoded, or every attempt fails or
	        returns an empty string.
	    """
	    if gemma_sampler is None:
	        print(" Skipping multimodal description generation: Gemma sampler is not initialized.")
	        return None

	    # Decode to an RGB array of shape (H, W, 3).
	    try:
	        pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
	        gemma_image_input = jnp.asarray(np.asarray(pil_image))
	    except Exception as e:
	        print(f" Erreur lors de la conversion de l'image pour Gemma : {e}")
	        return None

	    # NOTE(review): per the Gemma library's ChatSampler documentation the
	    # image placeholder is "<start_of_image>", `images` takes a single
	    # (H, W, C) image or an (N, H, W, C) stack (no batch axis), and the
	    # length limit keyword is `max_new_tokens`. The original used "<img>",
	    # a 5-D array and `max_tokens` — confirm against the installed version.
	    full_prompt = f"{prompt_text} <start_of_image>"

	    for attempt in range(max_retries):
	        is_last_attempt = attempt == max_retries - 1
	        try:
	            out = gemma_sampler.chat(
	                full_prompt,
	                images=gemma_image_input,
	                max_new_tokens=500,  # limit response length
	            )
	            description = out.strip()
	            if description:
	                return description
	            print(f" Tentative {attempt+1}/{max_retries}: Réponse vide ou inattendue du modèle multimodal Gemma.")
	        except Exception as e:
	            error_msg = str(e)
	            print(f" Tentative {attempt+1}/{max_retries} échouée pour la description (Gemma) : {error_msg}")
	            if is_last_attempt:
	                print(f" Toutes les {max_retries} tentatives ont échoué pour la description Gemma.")
	                return None

	        # The model runs locally, so there is no rate limit to wait for
	        # before a call; the only sleep is this backoff between attempts.
	        if not is_last_attempt:
	            retry_delay = delay * (2 ** attempt) + random.uniform(1, 5)
	            print(f" Réessai dans {retry_delay:.2f}s...")
	            time.sleep(retry_delay)

	    print(f" Toutes les {max_retries} tentatives ont échoué pour la description (fin de boucle).")
	    return None


	def generate_text_embedding(text_content, max_retries=5, delay=5):
	    """Embed *text_content* with the Sentence-Transformers model.

	    Args:
	        text_content: the text to embed.
	        max_retries: maximum number of attempts when ``encode`` raises.
	        delay: base delay in seconds for the exponential backoff applied
	            between failed attempts (no delay before the first attempt).

	    Returns:
	        The embedding as a list of ``EMBEDDING_DIMENSION`` floats, or
	        ``None`` when the model is not initialized, the text is empty or
	        whitespace-only, the vector has an unexpected length, or every
	        attempt raises.
	    """
	    if text_embedding_model is None:
	        print(" Skipping text embedding generation: Sentence-Transformers model is not initialized.")
	        return None

	    if not text_content or not text_content.strip():
	        return None  # cannot embed empty text

	    for attempt in range(max_retries):
	        try:
	            # The model is local: no sleep before the call. The original
	            # slept `delay` seconds before every attempt, i.e. ~5 s for
	            # every single chunk.
	            embedding = text_embedding_model.encode(text_content, convert_to_numpy=True)
	            if embedding is not None and len(embedding) == EMBEDDING_DIMENSION:
	                return embedding.tolist()  # list, so it is JSON serializable
	            # A wrong-sized vector is not retried.
	            print(f" Tentative {attempt+1}/{max_retries}: Format d'embedding Sentence-Transformers inattendu. Réponse : {embedding}")
	            return None

	        except Exception as e:
	            error_msg = str(e)
	            print(f" Tentative {attempt+1}/{max_retries} échouée pour l'embedding (Sentence-Transformers) : {error_msg}")
	            if attempt < max_retries - 1:
	                retry_delay = delay * (2 ** attempt) + random.uniform(0.5, 2)
	                print(f" Réessai dans {retry_delay:.2f}s...")
	                time.sleep(retry_delay)
	                continue
	            print(f" Toutes les {max_retries} tentatives ont échoué pour l'embedding (Sentence-Transformers).")
	            return None

	    print(f" Toutes les {max_retries} tentatives ont échoué pour l'embedding (fin de boucle).")
	    return None


	# --- Main Processing Function ---

	def _build_chunk_record(page_data, chunk_id, content_type, text_content, embedding, extra=None):
	    """Assemble one output record; ``extra`` keys are placed right after the embedding."""
	    record = {
	        "pdf_file": page_data['pdf_file'],
	        "page_number": page_data['page_number'],
	        "chunk_id": chunk_id,
	        "content_type": content_type,
	        "text_content": text_content,
	        "embedding": embedding,
	    }
	    record.update(extra or {})
	    record.update({
	        "pdf_title": page_data.get('pdf_title'),
	        "pdf_subject": page_data.get('pdf_subject'),
	        "pdf_keywords": page_data.get('pdf_keywords'),
	    })
	    return record


	def process_pdfs_in_directory(directory):
	    """Run the extraction / description / embedding pipeline on a directory.

	    For every ``*.pdf`` directly inside *directory* (processed in sorted
	    order) the pages are extracted, then each table and image is described
	    with Gemma and the description embedded, and the page text is chunked
	    and embedded.

	    Args:
	        directory: ``pathlib.Path`` of the directory containing the PDFs.

	    Returns:
	        A list of record dicts (``pdf_file``, ``page_number``, ``chunk_id``,
	        ``content_type``, ``text_content``, ``embedding``, PDF metadata and,
	        for tables/images, their URLs). Items whose description or embedding
	        could not be produced are skipped. ``[]`` when no PDF is found.
	    """
	    all_embeddings_data = []
	    # Sorted so the processing order (and the output) is reproducible.
	    pdf_files = sorted(directory.glob("*.pdf"))
	    total_files = len(pdf_files)

	    if not pdf_files:
	        print(f"Aucun fichier PDF trouvé dans le répertoire : {directory}")
	        return []

	    for processed_files, pdf_file_path in enumerate(pdf_files, start=1):
	        print(f"\nTraitement de {pdf_file_path.name} ({processed_files}/{total_files})...")

	        page_data_list = extract_page_data_pymupdf(pdf_file_path)

	        if not page_data_list:
	            print(f" Aucune donnée extraite de {pdf_file_path.name}.")
	            continue

	        for page_data in page_data_list:
	            page_num = page_data['page_number']
	            page_text = page_data['text']
	            images = page_data['images']
	            tables = page_data['tables']
	            # Reset for every page: the summary print below reads it even
	            # when the page has no text. The original left it unbound (or
	            # stale from the previous page) in that case.
	            text_chunks = []

	            print(f" Génération des descriptions et embeddings pour la page {page_num}...")

	            # Tables: describe (from the image, else from the text), then embed.
	            for table_idx, table in enumerate(tables):
	                table_image_bytes = table.get('image_bytes')
	                table_text_repr = table.get('table_text_representation', '')

	                description = None
	                if table_image_bytes:
	                    prompt = "Décrivez en français le contenu et la structure de ce tableau. Mettez l'accent sur les données principales et les tendances si visibles."
	                    print(f" Page {page_num}: Génération de la description multimodale pour le tableau {table_idx}...")
	                    description = generate_multimodal_description(table_image_bytes, prompt)
	                elif table_text_repr:
	                    if gemma_sampler:
	                        prompt = f"Décrivez en français le contenu et la structure de ce tableau basé sur sa représentation textuelle:\n{table_text_repr[:1000]}..."
	                        print(f" Page {page_num}: Génération de la description textuelle pour le tableau {table_idx} (fallback via Gemma)...")
	                        try:
	                            # NOTE(review): the Gemma ChatSampler length limit
	                            # keyword is `max_new_tokens` (the original passed
	                            # `max_tokens`) — confirm against the installed
	                            # gemma version.
	                            out = gemma_sampler.chat(prompt, max_new_tokens=500)
	                            description = out.strip()
	                        except Exception as e:
	                            print(f" Erreur lors de la génération de description textuelle pour le tableau {table_idx} via Gemma: {e}")
	                            description = None
	                    else:
	                        print(" Skipping text description generation for table: Gemma sampler not initialized.")

	                if not description:
	                    print(f" Page {page_num}: Aucune description générée pour le tableau {table_idx}. Chunk ignoré.")
	                    continue

	                print(f" Page {page_num}: Description générée pour le tableau {table_idx}.")
	                embedding_vector = generate_text_embedding(description)
	                if embedding_vector is None:
	                    print(f" Page {page_num}: Échec de la génération de l'embedding pour la description du tableau {table_idx}. Chunk ignoré.")
	                    continue

	                all_embeddings_data.append(_build_chunk_record(
	                    page_data, f"table_{table_idx}", "table", description, embedding_vector,
	                    extra={
	                        "table_html_url": table.get('table_html_url'),
	                        "image_url": table.get('image_url'),
	                    },
	                ))
	                print(f" Page {page_num}: Embedding généré pour la description du tableau {table_idx}.")

	            # Images (non-table): describe, then embed the description.
	            for img_idx, image in enumerate(images):
	                image_bytes = image.get('image_bytes')
	                if not image_bytes:
	                    continue

	                prompt = "Décrivez en français le contenu de cette image. S'il s'agit d'un graphique, décrivez le type de graphique (histogramme, courbe, etc.), les axes, les légendes et les principales informations ou tendances visibles."
	                print(f" Page {page_num}: Génération de la description multimodale pour l'image {img_idx}...")
	                description = generate_multimodal_description(image_bytes, prompt)

	                if not description:
	                    print(f" Page {page_num}: Aucune description générée pour l'image {img_idx}. Chunk ignoré.")
	                    continue

	                print(f" Page {page_num}: Description générée pour l'image {img_idx}.")
	                embedding_vector = generate_text_embedding(description)
	                if embedding_vector is None:
	                    print(f" Page {page_num}: Échec de la génération de l'embedding pour la description de l'image {img_idx}. Chunk ignoré.")
	                    continue

	                all_embeddings_data.append(_build_chunk_record(
	                    page_data, f"image_{img_idx}", "image", description, embedding_vector,
	                    extra={"image_url": image.get('image_url')},
	                ))
	                print(f" Page {page_num}: Embedding généré pour la description de l'image {img_idx}.")

	            # Regular text: chunk, then embed every chunk.
	            if page_text:
	                try:
	                    encoding = tiktoken.get_encoding(ENCODING_NAME)
	                    text_chunks = token_chunking(page_text, MAX_TOKENS_NORMAL, encoding)
	                except Exception as e:
	                    # Fall back to embedding the whole page as one chunk.
	                    print(f"Erreur lors du chunking du texte de la page {page_num} : {e}. Utilisation du chunking simple.")
	                    text_chunks = [page_text]

	                for chunk_idx, chunk_content in enumerate(text_chunks):
	                    print(f" Page {page_num}: Génération de l'embedding pour le chunk de texte {chunk_idx}...")
	                    embedding_vector = generate_text_embedding(chunk_content)

	                    if embedding_vector is None:
	                        print(f" Page {page_num}: Échec de la génération de l'embedding pour le chunk de texte {chunk_idx}. Chunk ignoré.")
	                        continue

	                    all_embeddings_data.append(_build_chunk_record(
	                        page_data, f"text_{chunk_idx}", "text", chunk_content, embedding_vector,
	                    ))
	                    print(f" Page {page_num}: Chunk de texte {chunk_idx} traité avec succès.")

	            print(f" Page {page_num} terminée. Éléments traités : {len(tables)} tableaux, {len(images)} images, {len(text_chunks)} chunks de texte.")

	    return all_embeddings_data

	# --- Main Execution ---
	def main():
	    """Run the whole pipeline and return a process exit code.

	    Checks that ``PDF_DIRECTORY`` exists, creates the output directories,
	    loads both models, processes every PDF and writes the records to
	    ``EMBEDDINGS_FILE_PATH`` as JSON.

	    Returns:
	        ``1`` when the PDF directory is missing or a model failed to load,
	        ``0`` otherwise (including when nothing was generated or the JSON
	        file could not be written — those cases are only reported).
	    """
	    print("Démarrage du traitement PDF multimodal avec génération de descriptions (Gemma) et embeddings textuels multilingues (Sentence-Transformers)...")

	    # The input directory is not created automatically: the user is expected
	    # to provide it with the PDFs already in place.
	    if not PDF_DIRECTORY.is_dir():
	        print(f"❌ ERREUR: Répertoire PDF non trouvé ou n'est pas un répertoire : {PDF_DIRECTORY}. Veuillez créer un répertoire 'docs' et y placer vos PDFs.")
	        return 1

	    for output_dir in (OUTPUT_DIR, IMAGE_SAVE_DIR, TABLE_SAVE_DIR):
	        output_dir.mkdir(parents=True, exist_ok=True)
	    print(f"Répertoire de sortie : {OUTPUT_DIR}")
	    print(f"Répertoire de sauvegarde des images : {IMAGE_SAVE_DIR}")
	    print(f"Répertoire de sauvegarde des tableaux (HTML) : {TABLE_SAVE_DIR}")

	    initialize_models()

	    # Both models are required; stop if either failed to load.
	    if gemma_sampler is None or text_embedding_model is None:
	        print("Impossible de continuer car un ou plusieurs modèles n'ont pas pu être initialisés.")
	        return 1

	    final_embeddings = process_pdfs_in_directory(PDF_DIRECTORY)

	    if not final_embeddings:
	        print("\nAucun embedding n'a été généré.")
	        return 0

	    print(f"\nTotal d'embeddings générés : {len(final_embeddings)}.")
	    try:
	        with EMBEDDINGS_FILE_PATH.open('w', encoding='utf-8') as f:
	            json.dump(final_embeddings, f, indent=2, ensure_ascii=False)
	        print(f"Embeddings sauvegardés avec succès dans : {EMBEDDINGS_FILE_PATH}")
	    except Exception as e:
	        print(f"\nErreur lors de la sauvegarde du fichier JSON d'embeddings : {e}")
	        traceback.print_exc()
	    return 0


	# --- Main Execution ---
	if __name__ == "__main__":
	    # SystemExit instead of the `exit()` helper, which is injected by the
	    # `site` module for interactive use and is not guaranteed to exist.
	    # Raised only on failure so a successful run ends normally.
	    exit_code = main()
	    if exit_code:
	        raise SystemExit(exit_code)