# Scraped page header (not part of the program):
#   evannh's picture
#   Update app.py · 59ce0f1 verified
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
import faiss
from wordcloud import WordCloud
from langdetect import detect
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords as nltk_stopwords
import nltk
import re
import tempfile
import os
import chardet
import csv
# Download the NLTK "stopwords" corpus at import time; the lookup on the next
# line reads from it, so this call has to come first.
nltk.download("stopwords")
# French stop words as a set; clean_text() uses it when the language is "fr".
fr_stopwords = set(nltk_stopwords.words("french"))
# Sentence-embedding model, created once at module level and reused by
# semantic_search() for both the corpus texts and the query.
model = SentenceTransformer("all-MiniLM-L12-v2")
def clean_text(text, lang):
    """Return *text* lowercased and reduced to its meaningful words.

    The text is lowercased and split into ``\\w+`` tokens. Tokens of one or
    two characters are dropped, as are stop words for the given language:
    the French NLTK list for ``"fr"``, scikit-learn's English list for
    ``"en"``, and nothing for any other value of *lang*.

    Args:
        text: The raw string to clean.
        lang: Language code selecting the stop-word list.

    Returns:
        The surviving tokens, in their original order, joined by single
        spaces.
    """
    tokens = re.findall(r"\b\w+\b", text.lower())

    if lang == "fr":
        ignored = fr_stopwords
    elif lang == "en":
        ignored = ENGLISH_STOP_WORDS
    else:
        ignored = set()

    kept = []
    for token in tokens:
        if token in ignored:
            continue
        if len(token) <= 2:
            continue
        kept.append(token)
    return " ".join(kept)
def detect_encoding(file_path):
    """Guess the text encoding of a file from its first 10,000 bytes.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The encoding name reported by ``chardet``, or ``"utf-8"`` when
        ``chardet`` reports no encoding or reports plain ASCII.
    """
    with open(file_path, "rb") as f:
        rawdata = f.read(10000)
    encoding = chardet.detect(rawdata)["encoding"]
    # Only a prefix of the file is sampled. If that prefix is pure ASCII the
    # guess is "ascii", and a strict ASCII decode would then fail on the first
    # accented character further down. UTF-8 decodes every ASCII file
    # identically, so it is a safe superset. It is also the fallback when
    # chardet cannot decide at all (encoding is None, e.g. for an empty file).
    if encoding is None or encoding.lower() == "ascii":
        return "utf-8"
    return encoding
def detect_separator(file_path, encoding):
    """Guess the field delimiter of a CSV file.

    The first 2048 characters of the file are handed to ``csv.Sniffer``.

    Args:
        file_path: Path of the CSV file.
        encoding: Text encoding used to open the file.

    Returns:
        The sniffed delimiter, or ``","`` when sniffing fails.
    """
    with open(file_path, "r", encoding=encoding) as handle:
        head = handle.read(2048)

    try:
        guessed = csv.Sniffer().sniff(head)
    except Exception:
        # Best-effort detection: default to a comma when the sniffer gives up.
        return ","
    return guessed.delimiter
def _new_temp_path(suffix):
    """Create an empty temporary file and return its path, handle closed."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        return tmp.name


def semantic_search(file, text_column, query, threshold, top_k):
    """Rank the rows of a CSV text column by similarity to a query.

    The CSV is read with an auto-detected encoding and separator. Every value
    of *text_column* and the query are embedded with the module-level model,
    using normalized embeddings. A FAISS inner-product index scores every
    text against the query. The function then builds a table of the best
    matches, a histogram of all scores, a word cloud of the best matches and
    a CSV export of the table.

    Args:
        file: Uploaded file object exposing a ``name`` path, or ``None``.
        text_column: Name of the column holding the texts.
        query: Search query.
        threshold: Similarity score from which a text counts as relevant.
        top_k: Number of best matches to return. Floats are truncated to an
            integer and negative values are treated as 0.

    Returns:
        A 5-tuple ``(summary, table, histogram_path, wordcloud_path,
        csv_path)``. When the file is missing, unreadable, empty, or lacks
        *text_column*, ``summary`` is an error message and the other four
        items are ``None``. ``wordcloud_path`` is also ``None`` when the best
        matches contain no drawable word.
    """
    if file is None:
        return "Erreur : aucun fichier fourni.", None, None, None, None

    try:
        encoding = detect_encoding(file.name)
        sep = detect_separator(file.name, encoding)
        df = pd.read_csv(
            file.name,
            sep=sep,
            encoding=encoding,
            quotechar='"',
            on_bad_lines="skip",
            engine="python",
        )
    except Exception as e:
        # UI boundary: any read/parse failure is shown to the user as text.
        return f"Erreur : {e}", None, None, None, None

    if text_column not in df.columns:
        return (
            f"Colonne '{text_column}' introuvable. Colonnes disponibles : {list(df.columns)}",
            None,
            None,
            None,
            None,
        )

    texts = df[text_column].fillna("").astype(str).tolist()
    if not texts:
        # Nothing to embed or search; also avoids dividing by zero below.
        return "Erreur : le fichier ne contient aucune ligne.", None, None, None, None

    # A slider can deliver a float, which is not a valid slice bound.
    top_k = max(0, int(top_k))

    embeddings = model.encode(texts, normalize_embeddings=True)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings.astype("float32"))

    query_vec = model.encode([query], normalize_embeddings=True).astype("float32")
    # Ask for every text so the histogram covers the full score distribution.
    scores, indices = index.search(query_vec, len(texts))
    sims = scores[0]

    matches = sims >= threshold
    percent = 100 * np.sum(matches) / len(sims)

    top_indices = indices[0][:top_k]
    top_scores = sims[:top_k]
    top_texts = [texts[i] for i in top_indices]
    df_result = pd.DataFrame({
        "Similarité": top_scores,
        "Texte": top_texts,
    })

    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        sns.histplot(sims, bins=30, ax=ax, kde=True)
        ax.axvline(threshold, color="red", linestyle="--", label=f"Seuil = {threshold}")
        ax.set_title("Distribution des similarités")
        ax.set_xlabel("Score de similarité")
        ax.set_ylabel("Nombre de textes")
        ax.legend()
        hist_path = _new_temp_path(".png")
        fig.savefig(hist_path, bbox_inches="tight")
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)

    try:
        lang = detect(" ".join(top_texts[:3]))
    except Exception:
        # Best-effort: fall back to English when detection fails.
        lang = "en"

    wc_text = " ".join(clean_text(t, lang) for t in top_texts)
    try:
        wc = WordCloud(width=800, height=400, background_color="white").generate(wc_text)
    except ValueError:
        # WordCloud refuses to draw when no word is left; skip the image
        # rather than failing the whole search.
        wc_path = None
    else:
        wc_path = _new_temp_path(".png")
        wc.to_file(wc_path)

    csv_path = _new_temp_path(".csv")
    df_result.to_csv(csv_path, index=False, encoding="utf-8")

    return (
        f"{percent:.2f}% des textes sont jugés pertinents (sim ≥ {threshold})",
        df_result,
        hist_path,
        wc_path,
        csv_path,
    )
# Gradio interface
# NOTE(review): the original indentation was lost; placing the file picker and
# the "load columns" button together inside the row is an assumption — confirm
# against the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("# 🔍 Recherche Sémantique avec Visualisation et Export CSV")

    # Inputs
    with gr.Row():
        file_input = gr.File(label="📁 Fichier CSV", file_types=[".csv"])
        load_columns_btn = gr.Button("🪄 Charger les colonnes")
    column_selector = gr.Dropdown(label="🧾 Sélectionne la colonne de texte", choices=[], interactive=True)
    query_input = gr.Textbox(label="🔎 Requête (ex : propos racistes)", value="propos racistes")
    threshold_input = gr.Slider(0.0, 1.0, value=0.35, label="Seuil de similarité")
    # step=1 keeps the slider on whole numbers; semantic_search() uses this
    # value as a slice bound, which must be an integer.
    topk_input = gr.Slider(1, 100, value=20, step=1, label="Nombre de résultats affichés")
    search_btn = gr.Button("⚙️ Lancer la recherche")

    # Outputs, in the order semantic_search() returns them
    result_text = gr.Textbox(label="📊 Résumé", lines=1)
    result_table = gr.Dataframe(label="📋 Textes les plus proches", wrap=True)
    result_plot = gr.Image(label="📈 Histogramme des similarités")
    result_wc = gr.Image(label="☁️ Nuage de mots")
    result_csv = gr.File(label="⬇️ Télécharger résultats CSV")

    def load_columns(file):
        """Fill the column dropdown with the sorted header names of *file*.

        Only the header row is parsed (``nrows=0``). On any failure the
        dropdown receives a single choice carrying the error message.
        """
        try:
            encoding = detect_encoding(file.name)
            sep = detect_separator(file.name, encoding)
            df = pd.read_csv(
                file.name,
                encoding=encoding,
                sep=sep,
                engine="python",
                on_bad_lines="skip",
                nrows=0,
            )
            return gr.update(choices=sorted(df.columns.tolist()))
        except Exception as e:
            return gr.update(choices=[f"Erreur : {e}"])

    load_columns_btn.click(fn=load_columns, inputs=[file_input], outputs=[column_selector])
    search_btn.click(
        fn=semantic_search,
        inputs=[file_input, column_selector, query_input, threshold_input, topk_input],
        outputs=[result_text, result_table, result_plot, result_wc, result_csv],
    )

if __name__ == "__main__":
    demo.launch()