Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import os | |
| import pandas as pd | |
| from docx import Document | |
| import pdfplumber | |
| from tempfile import NamedTemporaryFile | |
def extract_text(file):
    """Extract the raw text content of an uploaded document.

    Supported extensions (matched case-insensitively): ``.docx``, ``.pdf``,
    ``.csv`` and ``.xlsx``.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``) or a
            file-like object exposing ``.name`` and a ``.read()`` that returns
            bytes. ``None`` (nothing uploaded) is accepted.

    Returns:
        On success, a dict with the keys ``nom_fichier`` (base name of the
        upload), ``type`` (a MIME type or ``"spreadsheet"``) and ``texte``
        (the extracted text). On failure, a human-readable message (``str``).
    """
    if file is None:
        return "Aucun fichier fourni."

    # NOTE(review): depending on the Gradio version, gr.File presumably hands
    # over either a path-like string or a temp-file wrapper -- confirm against
    # the installed version. Both shapes are handled here.
    is_path = isinstance(file, (str, os.PathLike))
    source_name = os.fspath(file) if is_path else file.name
    suffix = os.path.splitext(source_name)[1].lower()

    if suffix not in (".docx", ".pdf", ".csv", ".xlsx"):
        return "Type de fichier non supporté."

    tmp_path = None  # set only when we create a copy that we must delete
    try:
        if is_path:
            # The upload already lives on disk: read it in place and never
            # delete it, since this function does not own that file.
            work_path = source_name
        else:
            # File-like input: persist its content so the parsers below can
            # open it by path. Done inside the try so a failed read/write
            # still reaches the cleanup in `finally`.
            with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                tmp_path = tmp.name
                tmp.write(file.read())
            work_path = tmp_path

        if suffix == ".docx":
            doc = Document(work_path)
            texte = "\n".join(p.text for p in doc.paragraphs)
            filetype = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        elif suffix == ".pdf":
            with pdfplumber.open(work_path) as pdf:
                # extract_text() can yield None for a page without extractable
                # text; treat that as an empty page instead of crashing.
                # Each page is followed by a newline, as before.
                texte = "".join(
                    (page.extract_text() or "") + "\n" for page in pdf.pages
                )
            filetype = "application/pdf"
        else:
            if suffix == ".csv":
                df = pd.read_csv(work_path)
            else:
                df = pd.read_excel(work_path)
            texte = df.to_string(index=False)
            filetype = "spreadsheet"
    except Exception as e:
        return f"Erreur pendant l'extraction : {str(e)}"
    finally:
        if tmp_path is not None:
            os.remove(tmp_path)

    # Structure of the returned payload
    return {
        "nom_fichier": os.path.basename(source_name),
        "type": filetype,
        "texte": texte,
    }
# Gradio interface: a single file-upload input wired to extract_text, with the
# result rendered by the JSON output component. The app is launched as soon as
# this module runs.
demo = gr.Interface(
    extract_text,
    gr.File(label="Uploader un fichier (.docx, .pdf, .csv, .xlsx)"),
    "json",
    title="🧠 Extracteur de texte",
    description="Envoie un fichier et récupère son contenu brut",
)

demo.launch()