User111-ops committed on
Commit
44809e2
·
verified ·
1 Parent(s): e51aca9

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +10 -6
  2. app.py +91 -0
  3. requirements.txt +4 -0
README.md CHANGED
@@ -1,12 +1,16 @@
1
  ---
2
- title: Search App
3
- emoji: 🔥
4
- colorFrom: green
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.34.2
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
1
  ---
2
+ title: Haystack Search App
3
+ emoji: 🔍
4
+ colorFrom: gray
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: "3.50.2"
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ # 🔍 Haystack Search App
13
+
14
+ Cette application vous permet de rechercher intelligemment des passages pertinents à partir de fichiers PDF, DOCX et TXT.
15
+
16
+ Déposez vos fichiers, entrez un mot-clé ou une question, et l’IA (Haystack + RoBERTa) retrouve les extraits utiles.
app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import shutil
4
+ import tempfile
5
+ import uuid
6
+
7
+ import gradio as gr
8
+ from haystack.document_stores import InMemoryDocumentStore
9
+ from haystack.nodes import FARMReader, PreProcessor, TextConverter, PDFToTextConverter, DocxToTextConverter
10
+ from haystack.nodes import BM25Retriever
11
+ from haystack.pipelines import ExtractiveQAPipeline
12
+
13
# Uploaded files are staged in a dedicated folder under the system temp dir.
UPLOAD_DIR = os.path.join(tempfile.gettempdir(), "hf_uploads")
os.makedirs(UPLOAD_DIR, exist_ok=True)

# BM25Retriever queries the store's BM25 index; the in-memory store only
# builds that index when it is created with use_bm25=True (default is False).
document_store = InMemoryDocumentStore(use_bm25=True)
retriever = BM25Retriever(document_store=document_store)
# Extractive reader running on CPU (use_gpu=False).
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

# Cleans converted text and splits it into ~200-word passages that respect
# sentence boundaries.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=200,
    split_respect_sentence_boundary=True,
)

# Maps a lower-cased file extension to the converter that extracts its text.
converters = {
    ".pdf": PDFToTextConverter(remove_numeric_tables=True),
    ".txt": TextConverter(),
    ".docx": DocxToTextConverter(),
}
35
+
36
def clear_files():
    """Delete every staged upload and leave an empty upload directory behind.

    Returns:
        The (French) status message displayed in the UI.
    """
    # The temp directory may already have been removed by something else;
    # ignore_errors / exist_ok keep the "clear" action from crashing then.
    shutil.rmtree(UPLOAD_DIR, ignore_errors=True)
    os.makedirs(UPLOAD_DIR, exist_ok=True)
    return "📁 Tous les fichiers ont été supprimés."
40
+
41
def add_files(files):
    """Copy uploaded files with a supported extension into UPLOAD_DIR.

    Args:
        files: Uploads from the file component. Each item is either an object
            exposing its on-disk path as ``name`` or a plain path string.
            May be ``None`` or empty when nothing was selected.

    Returns:
        The (French) status message with the number of files actually stored;
        files whose extension has no converter are skipped and not counted.
    """
    if not files:
        return "0 fichier(s) ajouté(s)."

    os.makedirs(UPLOAD_DIR, exist_ok=True)
    added = 0
    for file in files:
        source_path = getattr(file, "name", file)
        # NOTE(review): the upload's name is presumably an absolute temp path.
        # os.path.join() discards UPLOAD_DIR when given an absolute path, so
        # only the base name is used to build the destination.
        file_name = os.path.basename(source_path)
        ext = os.path.splitext(file_name)[-1].lower()
        if ext not in converters:
            continue
        # Copy from disk instead of open(dest, "wb") + file.read(): if dest
        # resolved to the source, opening it for writing would truncate it.
        shutil.copyfile(source_path, os.path.join(UPLOAD_DIR, file_name))
        added += 1
    return f"{added} fichier(s) ajouté(s)."
49
+
50
def search_keyword(query):
    """Index every staged file and return the passages that best answer *query*.

    The document store is rebuilt from the contents of UPLOAD_DIR on each
    call, then the extractive QA pipeline is run (10 retrieved passages,
    5 answers).

    Args:
        query: Keyword or question typed by the user.

    Returns:
        A Markdown string listing each extract with its file name and path,
        or a (French) message when the query is empty or nothing was found.
    """
    if not query or not query.strip():
        return "⚠️ Entrez un mot-clé."

    docs = []
    for root, _, files in os.walk(UPLOAD_DIR):
        for file in files:
            converter = converters.get(os.path.splitext(file)[-1].lower())
            if converter is None:
                continue
            path = os.path.join(root, file)
            # convert() already returns a list of Documents, so it is handed
            # to the preprocessor as-is rather than wrapped in another list.
            converted = converter.convert(file_path=path, meta={"name": file, "path": path})
            docs.extend(preprocessor.process(converted))

    document_store.delete_documents()
    if not docs:
        # Nothing to index: skip the pipeline instead of querying an empty store.
        return "Aucun passage trouvé."
    document_store.write_documents(docs)

    prediction = pipeline.run(query=query, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}})

    results = []
    for ans in prediction["answers"]:
        # context / meta can be missing on an answer; fall back to empty values.
        context = (ans.context or "").strip()
        meta = ans.meta or {}
        results.append(f"**Extrait :** {context}\n\n**Fichier :** {meta.get('name')}\n**Chemin :** {meta.get('path')}\n\n---")

    return "\n".join(results) if results else "Aucun passage trouvé."
75
+
76
# Gradio interface: file upload / clear controls, a query box, and a single
# Markdown area that displays the status or search results of every action.
with gr.Blocks() as demo:
    gr.Markdown("# 🔍 Recherche intelligente dans vos documents (.pdf, .txt, .docx)")
    file_input = gr.File(
        label="Ajoutez vos fichiers ici",
        file_count="multiple",
        file_types=[".pdf", ".txt", ".docx"],
    )
    upload_btn = gr.Button("📁 Ajouter les fichiers")
    clear_btn = gr.Button("🗑️ Vider les fichiers")

    query = gr.Textbox(
        placeholder="Tapez un mot-clé ici...",
        label="Mot-clé ou question",
    )
    search_btn = gr.Button("🔎 Rechercher")
    output = gr.Markdown()

    # Each button writes its handler's return value into the shared output.
    upload_btn.click(add_files, inputs=file_input, outputs=output)
    clear_btn.click(clear_files, outputs=output)
    search_btn.click(search_keyword, inputs=query, outputs=output)

# Start the app only when executed as a script.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ farm-haystack==1.18.0
2
+ pandas<2.0.0
3
+ pydantic==1.10.13
4
+ gradio==3.50.2