Spaces:

evannh
/

test_recherche_semantique

Sleeping

App Files Files Community

test_recherche_semantique / app.py

evannh

Update app.py

59ce0f1 verified 6 months ago

raw

history blame contribute delete

5.41 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	from sentence_transformers import SentenceTransformer
	import faiss
	from wordcloud import WordCloud
	from langdetect import detect
	from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
	from nltk.corpus import stopwords as nltk_stopwords
	import nltk
	import re
	import tempfile
	import os
	import chardet
	import csv

	# Fetch the NLTK stop-word corpus at import time so the French list below can be loaded.
	nltk.download("stopwords")
	# French stop words kept as a set; clean_text() filters against it when lang == "fr".
	fr_stopwords = set(nltk_stopwords.words("french"))
	# Sentence-embedding model, created once at import and reused by every semantic_search() call.
	model = SentenceTransformer("all-MiniLM-L12-v2")

	def clean_text(text, lang):
	    """Lower-case *text* and strip stop words and very short tokens.

	    Args:
	        text: Raw text to normalise.
	        lang: Language code; ``"fr"`` selects the French stop-word set,
	            ``"en"`` the English one, anything else disables stop-word
	            filtering.

	    Returns:
	        The surviving tokens (longer than two characters and not stop
	        words) joined by single spaces.
	    """
	    tokens = re.findall(r"\b\w+\b", text.lower())

	    # Pick the stop-word collection matching the detected language.
	    if lang == "fr":
	        excluded = fr_stopwords
	    elif lang == "en":
	        excluded = ENGLISH_STOP_WORDS
	    else:
	        excluded = set()

	    kept = (tok for tok in tokens if len(tok) > 2 and tok not in excluded)
	    return " ".join(kept)

	def detect_encoding(file_path):
	    """Guess the text encoding of a file from its first 10 000 bytes.

	    Args:
	        file_path: Path of the file to inspect.

	    Returns:
	        An encoding name suitable for ``open()`` and ``pandas.read_csv``.
	        ``"utf-8"`` is returned when chardet cannot decide (it reports
	        ``None``, e.g. for an empty file) and also when it reports plain
	        ASCII.
	    """
	    with open(file_path, "rb") as f:
	        rawdata = f.read(10000)
	    encoding = chardet.detect(rawdata)["encoding"]
	    # Only a prefix of the file is sampled: an "ascii" verdict says nothing
	    # about later bytes. UTF-8 decodes every ASCII file identically and
	    # still copes with accented characters appearing further down.
	    if encoding is None or encoding.lower() == "ascii":
	        return "utf-8"
	    return encoding

	def detect_separator(file_path, encoding):
	    """Guess the field delimiter of a CSV file from its first 2048 characters.

	    Only the usual CSV delimiters (comma, semicolon, tab, pipe) are
	    considered. Without that restriction ``csv.Sniffer`` may elect any
	    character that happens to occur equally often on every sampled line,
	    including a letter, for files holding a single free-text column.

	    Args:
	        file_path: Path of the CSV file.
	        encoding: Text encoding used to open the file.

	    Returns:
	        The detected delimiter, or ``","`` when none can be determined.

	    Raises:
	        OSError: If the file cannot be opened.
	        UnicodeDecodeError: If the sample cannot be decoded with *encoding*.
	    """
	    sample_size = 2048
	    candidates = ",;\t|"

	    with open(file_path, "r", encoding=encoding) as f:
	        sample = f.read(sample_size)

	    # A full-size sample most likely stops in the middle of a row; drop that
	    # partial row so it does not skew the per-line delimiter counts.
	    if len(sample) == sample_size and "\n" in sample:
	        sample = sample[: sample.rfind("\n") + 1]

	    try:
	        return csv.Sniffer().sniff(sample, delimiters=candidates).delimiter
	    except csv.Error:
	        return ","  # fallback

	def _temp_path(suffix):
	    """Create an empty temporary file and return its path.

	    The handle is closed before returning so that other libraries can
	    reopen the path for writing; the file itself is kept (``delete=False``).
	    """
	    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
	        return tmp.name


	def semantic_search(file, text_column, query, threshold, top_k):
	    """Rank the rows of an uploaded CSV by semantic similarity to *query*.

	    Every value of *text_column* is embedded with the module-level ``model``;
	    embeddings are normalised and compared through an inner-product FAISS
	    index, so the scores are cosine similarities.

	    Args:
	        file: Uploaded file object; only its ``name`` attribute (a path) is used.
	        text_column: Name of the column holding the texts to search.
	        query: Free-text query.
	        threshold: Similarity from which a text counts as relevant.
	        top_k: Number of best matches to return. Coerced to an integer
	            (minimum 1) because the UI slider may deliver a float.

	    Returns:
	        A 5-tuple ``(summary, results, histogram_path, wordcloud_path,
	        csv_path)``. ``results`` is a DataFrame with the columns
	        ``"Similarité"`` and ``"Texte"``. ``wordcloud_path`` is ``None`` when
	        no word is left to draw. When the file cannot be read, the column is
	        missing or the file has no rows, the tuple is
	        ``(error_message, None, None, None, None)``.
	    """
	    try:
	        encoding = detect_encoding(file.name)
	        sep = detect_separator(file.name, encoding)
	        df = pd.read_csv(file.name, sep=sep, encoding=encoding, quotechar='"', on_bad_lines="skip", engine="python")
	    except Exception as e:
	        # UI boundary: any loading problem is reported in the summary box.
	        return f"Erreur : {e}", None, None, None, None

	    if text_column not in df.columns:
	        return f"Colonne '{text_column}' introuvable. Colonnes disponibles : {list(df.columns)}", None, None, None, None

	    texts = df[text_column].fillna("").astype(str).tolist()
	    if not texts:
	        # Nothing to embed or index; also avoids a division by zero below.
	        return "Erreur : le fichier ne contient aucune ligne de données.", None, None, None, None

	    embeddings = model.encode(texts, normalize_embeddings=True)
	    index = faiss.IndexFlatIP(embeddings.shape[1])
	    index.add(embeddings.astype("float32"))

	    query_vec = model.encode([query], normalize_embeddings=True).astype("float32")
	    # Ask for every row so the histogram covers the whole corpus; results
	    # come back sorted by decreasing similarity.
	    scores, indices = index.search(query_vec, len(texts))
	    sims = scores[0]
	    percent = 100 * np.sum(sims >= threshold) / len(sims)

	    # A float cannot be used as a slice bound.
	    top_k = max(1, int(top_k))
	    top_indices = indices[0][:top_k]
	    top_scores = sims[:top_k]
	    top_texts = [texts[i] for i in top_indices]

	    df_result = pd.DataFrame({
	        "Similarité": top_scores,
	        "Texte": top_texts,
	    })

	    # Histogram of all similarity scores with the threshold marked.
	    fig, ax = plt.subplots(figsize=(6, 4))
	    try:
	        sns.histplot(sims, bins=30, ax=ax, kde=True)
	        ax.axvline(threshold, color="red", linestyle="--", label=f"Seuil = {threshold}")
	        ax.set_title("Distribution des similarités")
	        ax.set_xlabel("Score de similarité")
	        ax.set_ylabel("Nombre de textes")
	        ax.legend()
	        hist_path = _temp_path(".png")
	        fig.savefig(hist_path, bbox_inches="tight")
	    finally:
	        # Release the figure even when plotting or saving fails.
	        plt.close(fig)

	    # Word cloud of the best matches, cleaned with the stop words of the
	    # language detected on the first three results.
	    try:
	        lang = detect(" ".join(top_texts[:3]))
	    except Exception:
	        # Detection fails on empty or non-linguistic text; default to English.
	        lang = "en"
	    wc_text = " ".join(clean_text(t, lang) for t in top_texts)
	    try:
	        cloud = WordCloud(width=800, height=400, background_color="white").generate(wc_text)
	    except ValueError:
	        # WordCloud refuses to draw when no word survives cleaning; the
	        # search results themselves are still valid, so only skip the image.
	        wc_path = None
	    else:
	        wc_path = _temp_path(".png")
	        cloud.to_file(wc_path)

	    csv_path = _temp_path(".csv")
	    df_result.to_csv(csv_path, index=False, encoding="utf-8")

	    return f"{percent:.2f}% des textes sont jugés pertinents (sim ≥ {threshold})", df_result, hist_path, wc_path, csv_path

	# Gradio interface
	with gr.Blocks() as demo:
	    gr.Markdown("# 🔍 Recherche Sémantique avec Visualisation et Export CSV")

	    with gr.Row():
	        file_input = gr.File(label="📁 Fichier CSV", file_types=[".csv"])
	        load_columns_btn = gr.Button("🪄 Charger les colonnes")

	    column_selector = gr.Dropdown(label="🧾 Sélectionne la colonne de texte", choices=[], interactive=True)

	    query_input = gr.Textbox(label="🔎 Requête (ex : propos racistes)", value="propos racistes")
	    threshold_input = gr.Slider(0.0, 1.0, value=0.35, label="Seuil de similarité")
	    # step=1 keeps the value an integer: semantic_search() uses it as a
	    # slice bound, and without an explicit step the slider can emit
	    # fractional values.
	    topk_input = gr.Slider(1, 100, value=20, step=1, label="Nombre de résultats affichés")

	    search_btn = gr.Button("⚙️ Lancer la recherche")

	    result_text = gr.Textbox(label="📊 Résumé", lines=1)
	    result_table = gr.Dataframe(label="📋 Textes les plus proches", wrap=True)
	    result_plot = gr.Image(label="📈 Histogramme des similarités")
	    result_wc = gr.Image(label="☁️ Nuage de mots")
	    result_csv = gr.File(label="⬇️ Télécharger résultats CSV")

	    def load_columns(file):
	        """Fill the column dropdown with the header names of the uploaded CSV.

	        The file is parsed with the same encoding and separator detection
	        as the search itself. Any failure (including no file uploaded) is
	        surfaced as a single "Erreur : ..." entry in the dropdown.
	        """
	        try:
	            encoding = detect_encoding(file.name)
	            sep = detect_separator(file.name, encoding)
	            df = pd.read_csv(file.name, encoding=encoding, sep=sep, engine="python", on_bad_lines="skip")
	            return gr.update(choices=sorted(df.columns.tolist()))
	        except Exception as e:
	            return gr.update(choices=[f"Erreur : {e}"])

	    load_columns_btn.click(fn=load_columns, inputs=[file_input], outputs=[column_selector])

	    # The five outputs match, in order, the 5-tuple returned by semantic_search().
	    search_btn.click(
	        fn=semantic_search,
	        inputs=[file_input, column_selector, query_input, threshold_input, topk_input],
	        outputs=[result_text, result_table, result_plot, result_wc, result_csv],
	    )

	if __name__ == "__main__":
	    demo.launch()