Spaces:

kebson
/

table_second_column_extractor

Runtime error

App Files Files Community

kebson committed on Dec 23, 2025

Commit

ce6a96f

verified ·

1 Parent(s): fe12926

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -64

app.py CHANGED Viewed

@@ -3,112 +3,118 @@ import numpy as np
 from paddleocr import PaddleOCR
 from sklearn.cluster import KMeans
 ocr = PaddleOCR(
     use_textline_orientation=True,
     lang="fr"
 )
-HEADER_EXACT = "DESIGNATIONS"
-def extract_column2_9_lines(image):
     if image is None:
         return "Aucune image fournie."
     img = np.array(image)
     result = ocr.predict(img)
-    if not result:
-        return "Aucun texte détecté."
     data = result[0]
-    texts = data.get("rec_texts", [])
-    boxes = data.get("dt_polys", [])
     elements = []
     for text, box in zip(texts, boxes):
         text = text.strip()
-        if len(text) < 2:
             continue
-        x = np.mean([p[0] for p in box])
-        y = np.mean([p[1] for p in box])
-        elements.append((x, y, text))
-    if len(elements) < 5:
-        return "Pas assez de données OCR."
-    # --- CLUSTER COLONNES ---
-    X = np.array([[e[0]] for e in elements])
-    kmeans = KMeans(n_clusters=min(7, len(elements)//6 + 2), random_state=42, n_init=10)
-    labels = kmeans.fit_predict(X)
-    columns = {}
-    for (x, y, t), lbl in zip(elements, labels):
-        columns.setdefault(lbl, []).append((x, y, t))
-    # --- COLONNE DESCRIPTION = max texte non numérique ---
-    def score(col):
-        return sum(len(t) for _,_,t in col if not any(c.isdigit() for c in t))
-    desc_col = max(columns.values(), key=score)
-    desc_col.sort(key=lambda e: e[1])  # top -> bottom
-    # --- LOCALISER L’EN-TÊTE ---
-    header_index = None
-    for i, (_, _, t) in enumerate(desc_col):
-        if t.upper() == HEADER_EXACT:
-            header_index = i
-            break
-    if header_index is None:
-        start_index = 0
-    else:
-        start_index = header_index + 1
-    content = desc_col[start_index:]
-    # --- SEUIL ADAPTATIF ---
-    ys = [y for _,y,_ in content]
-    Y_THRESHOLD = max(22, np.median(np.diff(sorted(ys))) * 1.2) if len(ys) > 1 else 30
-    # --- FUSION ---
-    lines = []
     current = ""
     last_y = None
-    for _, y, text in content:
         if last_y is None or abs(y - last_y) > Y_THRESHOLD:
             if current:
-                lines.append(current.strip())
             current = text
         else:
             current += " " + text
         last_y = y
     if current:
-        lines.append(current.strip())
-    # --- NETTOYAGE ---
-    final = []
-    for i, l in enumerate(lines):
-        if i == 0:
-            final.append(l)  # Toujours garder la 1ère vraie ligne
-            continue
-        if len(l) < 5:
-            continue
-        if sum(c.isdigit() for c in l) > len(l)/2:
-            continue
-        final.append(l)
-    final = final[:9]
-    return "\n".join([f"{i+1}. {l}" for i,l in enumerate(final)])
-# --- GRADIO ---
 demo = gr.Interface(
-    fn=extract_column2_9_lines,
     inputs=gr.Image(type="pil", label="Image du tableau"),
-    outputs=gr.Textbox(label="Colonne DESIGNATIONS"),
-    title="Extraction fiable de la colonne DESIGNATIONS",
 )
 demo.launch(server_name="0.0.0.0", server_port=7860)

 from paddleocr import PaddleOCR
 from sklearn.cluster import KMeans
+# -----------------------------
+# OCR
+# -----------------------------
 # Shared OCR engine, built once at import time: French language setting with
 # the `use_textline_orientation` option enabled.
 ocr = PaddleOCR(lang="fr", use_textline_orientation=True)
+# -----------------------------
+# Extraction de la 2e colonne
+# -----------------------------
def _collect_fragments(texts, boxes, min_chars=3):
    """Return ``(x_center, y_center, text)`` for every usable OCR fragment.

    Fragments whose stripped text is shorter than ``min_chars`` are dropped.
    """
    fragments = []
    for text, box in zip(texts, boxes):
        text = text.strip()
        if len(text) < min_chars:
            continue
        x_center = np.mean([point[0] for point in box])
        y_center = np.mean([point[1] for point in box])
        fragments.append((x_center, y_center, text))
    return fragments


def _split_into_columns(fragments, max_columns=6):
    """Cluster fragments by horizontal centre; return columns left to right."""
    xs = np.array([[x] for x, _, _ in fragments])
    # Estimated column count: between 2 and ``max_columns``, growing with the
    # number of fragments, and never more than the number of distinct x
    # positions (K-means cannot form more non-empty clusters than that).
    n_cols = min(max_columns, max(2, len(fragments) // 6), len(np.unique(xs)))
    if n_cols < 2:
        # Every fragment shares the same x centre: there is only one column.
        return [list(fragments)]

    labels = KMeans(n_clusters=n_cols, random_state=42, n_init=10).fit_predict(xs)
    columns = {}
    for fragment, label in zip(fragments, labels):
        columns.setdefault(label, []).append(fragment)
    return sorted(
        columns.values(),
        key=lambda column: np.mean([x for x, _, _ in column]),
    )


def _merge_rows(column, y_threshold=22):
    """Join fragments of one column into cells, reading top to bottom.

    A fragment starts a new cell when its vertical centre is more than
    ``y_threshold`` away from the previous fragment's centre; otherwise it is
    appended to the current cell, separated by a space.
    """
    cells = []
    last_y = None
    for _, y, text in sorted(column, key=lambda fragment: fragment[1]):
        if last_y is None or abs(y - last_y) > y_threshold:
            cells.append([text])
        else:
            cells[-1].append(text)
        last_y = y
    return [" ".join(parts).strip() for parts in cells]


def extract_second_column(image):
    """Return the text of the table's second column as a numbered list.

    The image is passed to the module-level ``ocr`` object. Detected
    fragments are grouped into columns by clustering their horizontal
    centres with K-means, the second column from the left is selected,
    vertically close fragments are merged into cells, and cells of five
    characters or fewer are dropped.

    Args:
        image: Image coming from the Gradio input (converted with
            ``np.array``), or ``None`` when nothing was provided.

    Returns:
        One ``"<n>. <text>"`` line per retained cell, joined with newlines,
        or a French status message when no result can be produced.
    """
    if image is None:
        return "Aucune image fournie."

    result = ocr.predict(np.array(image))
    if not result:
        return "OCR exécuté mais aucun texte détecté."

    # Use .get() so a result without these keys is reported as "no text"
    # instead of raising KeyError.
    data = result[0]
    texts = data.get("rec_texts", [])
    boxes = data.get("dt_polys", [])
    # len() rather than truthiness: the container type is not fixed here and
    # an array would make a bare truth test ambiguous.
    if len(texts) == 0:
        return "OCR exécuté mais aucun texte détecté."

    fragments = _collect_fragments(texts, boxes)
    if len(fragments) < 5:
        return "Pas assez de texte détecté."

    columns = _split_into_columns(fragments)
    if len(columns) < 2:
        return "Impossible de détecter la 2e colonne."

    # Index 1 is the second column counted from the left.
    cells = [cell for cell in _merge_rows(columns[1]) if len(cell) > 5]
    if not cells:
        return "Colonne détectée mais vide."

    return "\n".join(
        f"{number}. {cell}" for number, cell in enumerate(cells, start=1)
    )
+# -----------------------------
+# Interface Gradio
+# -----------------------------
 # Gradio UI: one image input wired to extract_second_column, with the
 # returned text shown in a text box.
 demo = gr.Interface(
     title="Extraction de la 2e colonne du tableau",
     description="La colonne cible est toujours la deuxième (texte uniquement)",
     fn=extract_second_column,
     inputs=gr.Image(type="pil", label="Image du tableau"),
     outputs=gr.Textbox(label="Contenu de la 2e colonne"),
 )
 # Serve the interface bound to 0.0.0.0 on port 7860.
 demo.launch(server_name="0.0.0.0", server_port=7860)