Spaces:

lsottani
/

Prep_my_data

Sleeping

App Files Files Community

Prep_my_data / app.py

lsottani

Update app.py

4cc0fc9 verified about 2 months ago

raw

history blame contribute delete

5.25 kB

	#!/usr/bin/env python

	import os
	import re
	import tempfile
	from pathlib import Path

	import pdfplumber
	import docx
	import gradio as gr

	# Typographic characters and the plain-ASCII text that replaces them.
	# Built once at import time instead of on every regex match.
	_TYPOGRAPHIC_TABLE = str.maketrans({
	    "’": "'", "‘": "'", "“": '"', "”": '"',
	    "«": '"', "»": '"', "–": "-", "—": "-",
	    "…": "...", "œ": "oe", "Œ": "OE",
	    "æ": "ae", "Æ": "AE", "©": "(c)", "®": "(R)",
	    "™": "TM", "±": "+/-",
	    "×": "x", "÷": "/",
	})

	# Everything outside this class is dropped. The second line lists the
	# typographic characters that must survive until they are expanded by
	# _TYPOGRAPHIC_TABLE (the ones not already covered by the letter ranges).
	_DISALLOWED_RE = re.compile(
	    r'[^a-zA-ZÀ-ÿæ-œ0-9\s\.\,\:\;\!\?\-\_\'\"\\\(\)'
	    r'’‘“”«»–—…©®™±]'
	)

	_WHITESPACE_RE = re.compile(r'\s+')


	def clean_text_for_rag(text: str) -> str:
	    """Normalise and clean text for use in a RAG pipeline.

	    Three passes are applied:

	    1. Remove every character that is neither whitelisted (ASCII letters,
	       accented Latin letters, digits, whitespace and the punctuation
	       ``. , : ; ! ? - _ ' " \\ ( )``) nor a typographic character that has
	       an ASCII replacement.
	    2. Replace typographic characters (curly quotes, guillemets, dashes,
	       ellipsis, ligatures, (c)/(R)/TM, plus-minus, multiplication and
	       division signs) with their ASCII equivalents.
	    3. Collapse every run of whitespace into one space and trim the ends.

	    The filter runs *before* the replacement so that multi-character
	    replacements containing non-whitelisted characters (``+/-`` for the
	    plus-minus sign, ``/`` for the division sign) are kept intact instead
	    of being stripped down to ``-`` or nothing.

	    Args:
	        text: Raw text to clean.

	    Returns:
	        The cleaned, single-spaced text.
	    """
	    # Pass 1: drop unwanted symbols (literal "+", "/", "@", "§", "°", ...).
	    filtered = _DISALLOWED_RE.sub('', text)
	    # Pass 2: expand typographic characters to ASCII.
	    normalised = filtered.translate(_TYPOGRAPHIC_TABLE)
	    # Pass 3: non-breaking spaces and newlines are whitespace too, so they
	    # are folded into single regular spaces here.
	    return _WHITESPACE_RE.sub(' ', normalised).strip()


	def extract_and_clean_pdf(pdf_path: str) -> str:
	    """Open a PDF, gather the text of its pages and return it cleaned.

	    Args:
	        pdf_path: Path of the PDF file to read.

	    Returns:
	        The page texts joined by single spaces and passed through
	        ``clean_text_for_rag``.
	    """
	    print(f"[+] Extraction du PDF : {pdf_path}")
	    with pdfplumber.open(pdf_path) as document:
	        # Pages whose extraction result is falsy (no text) are skipped.
	        page_texts = [
	            content
	            for content in (page.extract_text() for page in document.pages)
	            if content
	        ]
	    return clean_text_for_rag(" ".join(page_texts))


	def extract_and_clean_docx(docx_path: str) -> str:
	    """Read a DOCX file and return the cleaned text of its paragraphs.

	    Args:
	        docx_path: Path of the DOCX file to read.

	    Returns:
	        The non-empty paragraph texts joined by single spaces and passed
	        through ``clean_text_for_rag``.
	    """
	    print(f"[+] Extraction du DOCX : {docx_path}")
	    document = docx.Document(docx_path)
	    stripped = (paragraph.text.strip() for paragraph in document.paragraphs)
	    # Paragraphs that are empty once stripped are left out.
	    return clean_text_for_rag(" ".join(chunk for chunk in stripped if chunk))

	def extract_and_clean_txt(txt_path: str) -> str:
	    """Read a plain-text file (txt, md, ...) and clean it line by line.

	    Args:
	        txt_path: Path of the UTF-8 encoded text file to read.

	    Returns:
	        The cleaned non-blank lines, joined with newlines.
	    """
	    print(f"[+] Lecture du fichier texte : {txt_path}")
	    with open(txt_path, "r", encoding="utf-8") as handle:
	        stripped_lines = [raw_line.strip() for raw_line in handle]
	    # Blank lines are skipped; every remaining line is cleaned on its own
	    # so the line structure of the input is kept.
	    return "\n".join(
	        clean_text_for_rag(content) for content in stripped_lines if content
	    )

	def process_file(input_file, output_name):
	    """Extract, clean and export an uploaded document as a Markdown file.

	    The extractor is chosen from the extension of the uploaded file:
	    ``.pdf`` and ``.docx`` use their dedicated extractors, anything else is
	    read as text.

	    Args:
	        input_file: The upload to process. Accepted forms are an object
	            exposing ``read()`` and ``name``, a path string, or a sequence
	            whose first item is such an object. ``None`` means that nothing
	            was uploaded.
	        output_name: Name requested for the result file. Only its final
	            path component is used; ``.md`` is appended when missing and
	            ``output.md`` is used when the name is empty or ``None``.

	    Returns:
	        The path of the written ``.md`` file (inside a fresh directory
	        under the system temp dir), or ``None`` when ``input_file`` is
	        ``None``.
	    """
	    if input_file is None:
	        return None

	    # Load the raw bytes and the original file name from whichever form
	    # the upload arrived in.
	    if hasattr(input_file, "read"):
	        data = input_file.read()
	        filename = input_file.name
	    elif isinstance(input_file, str):
	        filename = input_file
	        with open(input_file, "rb") as f:
	            data = f.read()
	    else:
	        filename = input_file[0].name
	        data = input_file[0].read()

	    suffix = os.path.splitext(filename)[1]

	    # Work on a uniquely named scratch copy in the temp dir: a fixed name
	    # would let two concurrent requests overwrite each other's upload.
	    fd, tmp_path = tempfile.mkstemp(prefix="upload_", suffix=suffix)
	    try:
	        with os.fdopen(fd, "wb") as f:
	            f.write(data)

	        ext = suffix.lower()
	        if ext == ".pdf":
	            cleaned_text = extract_and_clean_pdf(tmp_path)
	        elif ext == ".docx":
	            cleaned_text = extract_and_clean_docx(tmp_path)
	        else:
	            cleaned_text = extract_and_clean_txt(tmp_path)
	    finally:
	        # The scratch copy is not needed once the text is extracted;
	        # cleanup is best-effort and must not mask an extraction error.
	        try:
	            os.remove(tmp_path)
	        except OSError:
	            pass

	    # The output name is user-supplied: keep only its last path component
	    # so values such as "../../x" or "/etc/x" cannot escape the temp dir.
	    requested = (output_name or "").strip().replace("\\", "/")
	    safe_name = os.path.basename(requested)
	    if safe_name.lower() in ("", ".md"):
	        safe_name = "output.md"
	    elif not safe_name.lower().endswith(".md"):
	        safe_name += ".md"

	    # A fresh directory per call keeps the requested file name while
	    # avoiding collisions between requests that choose the same name.
	    out_dir = tempfile.mkdtemp(prefix="prep_my_data_")
	    out_path = os.path.join(out_dir, safe_name)

	    with open(out_path, "w", encoding="utf-8") as f:
	        f.write(cleaned_text)

	    return out_path

	# Gradio interface: an upload column (file, output name, submit button) on
	# the left and the downloadable cleaned file on the right.
	with gr.Blocks(title="Nettoyage de texte pour RAG") as demo:
	    gr.Markdown("# 📄 Nettoyage d'un fichier pour optimisation de vos pipelines RAG")
	    gr.Markdown(
	        "Déposez simplement votre fichier : nous nous chargeons d’extraire son contenu textuel, de le nettoyer "
	        "puis de vous le restituer en format markdown sous le nom que vous choisissez."
	    )

	    with gr.Row():
	        with gr.Column(scale=1):
	            input_file = gr.File(
	                label="Déposez votre fichier ici",
	                file_types=[".pdf", ".txt", ".md", ".docx"],
	                file_count="single",
	            )
	            output_name = gr.Textbox(
	                value="output.md",
	                label="Nom du fichier de sortie (en .md)",
	                placeholder="exemple.md",
	                interactive=True,
	            )
	            submit_btn = gr.Button("Traiter le fichier", variant="primary")
	        with gr.Column(scale=1):
	            output_file = gr.File(
	                label="Fichier nettoyé (.md)",
	                # Extensions need their leading dot, as in the input
	                # component above.
	                file_types=[".md"],
	            )

	    # Clicking the button runs process_file on the upload and the chosen
	    # name, and offers the returned path for download.
	    submit_btn.click(
	        fn=process_file,
	        inputs=[input_file, output_name],
	        outputs=output_file,
	    )

	    gr.Markdown(
	        """
	---
	Prétraitements effectués :
	- Suppression des symboles non imprimables et des caractères parasites
	- Conservation des lettres (y compris accentuées), chiffres, espaces et ponctuation simple
	- Normalisation des espaces pour un texte harmonieux
	- Export automatique au format `.md`

	"""
	    )

	# Start the web app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()