kkg_suche / app.py
alexkueck's picture
Update app.py
c204eeb verified
raw
history blame
31.1 kB
import gradio as gr
import fitz # PyMuPDF
import os
import requests
from huggingface_hub import HfApi
import base64
from io import BytesIO
import urllib.parse
# Read the Hugging Face access tokens from environment variables (Space
# secrets): HF_WRITE authorises uploads, HF_READ is sent when downloading PDFs.
HF_TOKEN = os.getenv("HF_WRITE")
HF_READ = os.getenv("HF_READ")

# Fail fast when the write token is missing. The message names the variable
# that is actually read (HF_WRITE) so the operator knows which secret to set.
if HF_TOKEN is None:
    raise ValueError(
        "HF_WRITE environment variable not set. "
        "Please set the secret in your Hugging Face Space."
    )

# Repository name and repository type used for uploads and downloads.
repo_id = "alexkueck/kkg_suche"
repo_type = "space"
# Local folder that is scanned for PDF documents.
save_dir = "kkg_dokumente"

# Shared Hugging Face Hub API client.
api = HfApi()
def upload_pdf(file):
    """Upload a file chosen in the UI to the Hugging Face Space repository.

    Args:
        file: Value delivered by the ``gr.File`` input: either an object that
            exposes the local path as ``.name`` or a plain path string, or
            ``None`` when nothing was selected.

    Returns:
        A single status string for the status textbox.
    """
    if file is None:
        # Exactly one output component is wired to this handler, so return a
        # single value here, just like the success path does.
        return "Keine Datei hochgeladen."

    # Accept both temp-file wrappers (``.name``) and plain path strings.
    local_path = getattr(file, "name", file)
    # Strip the directory part; only the bare file name is used in the repo.
    filename = os.path.basename(local_path)

    # Upload the file into the documents folder of the Space repository.
    upload_path = f"kkg_dokumente/{filename}"
    api.upload_file(
        path_or_fileobj=local_path,
        path_in_repo=upload_path,
        repo_id=repo_id,
        repo_type=repo_type,
        token=HF_TOKEN,
    )
    return f"PDF '{filename}' erfolgreich hochgeladen."
def list_pdfs(directory=None):
    """Return the names of the PDF files found in a local folder.

    Args:
        directory: Folder to scan. Defaults to the module-level ``save_dir``.

    Returns:
        Alphabetically sorted list of file names whose extension is ``.pdf``
        (matched case-insensitively). Empty when the folder does not exist or
        is not a directory.
    """
    if directory is None:
        directory = save_dir
    # isdir (rather than exists) also guards against a plain file at that path,
    # which would make os.listdir raise.
    if not os.path.isdir(directory):
        return []
    # Sort so the dropdown order is stable instead of following the arbitrary
    # order returned by os.listdir.
    return sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(".pdf")
    )
def _render_pdf_pages(pdf_path):
    """Render every page of the PDF at *pdf_path* to a PNG data-URI string."""
    document = fitz.open(pdf_path)
    try:
        return [
            "data:image/png;base64,"
            + base64.b64encode(page.get_pixmap().tobytes("png")).decode("utf-8")
            for page in document
        ]
    finally:
        # Release the file handle even when rendering a page fails.
        document.close()


def display_pdf(selected_pdf, hf_token=None):
    """Download the selected PDF from the Space and render its pages as images.

    Args:
        selected_pdf: File name chosen in the dropdown; may be ``None`` or
            empty when nothing is selected.
        hf_token: Optional read token for the download. Falls back to the
            module-level ``HF_READ`` when omitted, so the handler can be
            wired with the dropdown as its only input.

    Returns:
        Tuple ``(images, status)``: a list with one base64 PNG data URI per
        page, and a status message. ``images`` is empty on failure.
    """
    if not selected_pdf:
        return [], "Keine PDF-Datei ausgewählt."

    # Use only the bare file name so the local write stays inside save_dir.
    pdf_name = os.path.basename(selected_pdf)
    pdf_path = os.path.join(save_dir, pdf_name)

    # URL of the PDF inside the Hugging Face Space repository.
    encoded_pdf_name = urllib.parse.quote(pdf_name)
    pdf_url = f"https://huggingface.co/spaces/{repo_id}/resolve/main/kkg_dokumente/{encoded_pdf_name}"
    download_error = f"Fehler beim Herunterladen der PDF-Datei von {pdf_url}"

    # Only send an Authorization header when a token is actually available.
    token = hf_token or HF_READ
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    try:
        response = requests.get(pdf_url, headers=headers, timeout=30)
    except requests.RequestException:
        return [], download_error
    if response.status_code != 200:
        return [], download_error

    # The local folder may not exist yet on a fresh checkout.
    os.makedirs(save_dir, exist_ok=True)
    with open(pdf_path, "wb") as pdf_file:
        pdf_file.write(response.content)

    # NOTE(review): confirm that gr.Gallery in the installed Gradio version
    # accepts base64 data URIs; it may expect file paths, URLs or image
    # objects instead.
    images = _render_pdf_pages(pdf_path)
    status = f"PDF '{selected_pdf}' erfolgreich geladen und verarbeitet."
    return images, status
# Build the two-tab Gradio UI: one tab uploads PDFs, the other lists and
# displays them.
with gr.Blocks() as demo:
    with gr.Tab("Upload PDF"):
        upload_pdf_file = gr.File(label="PDF-Datei hochladen")
        upload_status = gr.Textbox(label="Status")
        upload_button = gr.Button("Upload")
        upload_button.click(upload_pdf, inputs=upload_pdf_file, outputs=upload_status)

    with gr.Tab("PDF Auswahl und Anzeige"):
        pdf_dropdown = gr.Dropdown(label="Wählen Sie eine PDF-Datei", choices=list_pdfs())
        refresh_button = gr.Button("Liste aktualisieren")
        pdf_images = gr.Gallery(label="PDF-Seiten als Bilder")
        display_status = gr.Textbox(label="Status")
        display_button = gr.Button("Anzeigen")

        # A handler's return value is only applied to the components listed in
        # ``outputs``; the dropdown must be named there for the refreshed
        # choices to show up.
        refresh_button.click(
            lambda: gr.update(choices=list_pdfs()),
            inputs=None,
            outputs=pdf_dropdown,
        )
        # The dropdown is the only UI input, so the read token is passed
        # explicitly as display_pdf's second argument.
        display_button.click(
            lambda selected_pdf: display_pdf(selected_pdf, HF_READ),
            inputs=pdf_dropdown,
            outputs=[pdf_images, display_status],
        )

demo.launch(share=True)
"""
import gradio as gr
import os
from huggingface_hub import HfApi
import time
# Zugriff auf das Secret als Umgebungsvariable
HF_TOKEN = os.getenv("HF_WRITE")
# Überprüfen, ob das Secret geladen wurde
if HF_TOKEN is None:
raise ValueError("HF_TOKEN environment variable not set. Please set the secret in your Hugging Face Space.")
# Repository-Name und Typ
repo_id = "alexkueck/kkg_suche"
repo_type = "space"
# HfApi-Instanz erstellen
api = HfApi()
def upload_and_display_pdf(file):
if file is None:
return None, "Keine Datei hochgeladen."
# Extrahieren des Dateinamens aus dem vollen Pfad
filename = os.path.basename(file.name)
# Datei zum Hugging Face Space hochladen
upload_path = f"kkg_dokumente/{filename}"
api.upload_file(
path_or_fileobj=file.name,
path_in_repo=upload_path,
repo_id=repo_id,
repo_type=repo_type,
token=HF_TOKEN
)
# Kurze Verzögerung, um sicherzustellen, dass die Datei verfügbar ist
time.sleep(2)
# URL zur hochgeladenen PDF-Datei erstellen
pdf_url = f"https://huggingface.co/spaces/{repo_id}/resolve/main/{upload_path}"
# HTML mit eingebettetem PDF erstellen
html_content = f
<div style="width:100%; height:600px;">
<object data="{pdf_url}" type="application/pdf" width="100%" height="100%">
<p>Es sieht so aus, als ob Ihr Browser keine eingebetteten PDFs unterstützt.
Sie können stattdessen <a href="{pdf_url}">hier klicken, um die PDF-Datei herunterzuladen</a>.</p>
</object>
</div>
return html_content, f"Datei '{filename}' erfolgreich hochgeladen und im Space gespeichert."
# Gradio Interface erstellen
iface = gr.Interface(
fn=upload_and_display_pdf,
inputs=gr.File(label="PDF-Datei hochladen"),
outputs=[
gr.HTML(label="PDF-Anzeige"),
gr.Textbox(label="Status")
],
title="PDF Upload und Anzeige",
description="Laden Sie eine PDF-Datei hoch. Sie wird im 'kkg_dokumente' Ordner des Spaces gespeichert und hier angezeigt."
)
# App starten
iface.launch()
"""
# Working upload variant (disabled; kept for reference in the string literal below)
"""
import gradio as gr
import os
import fitz # PyMuPDF
import tempfile
from huggingface_hub import HfApi
import shutil
# Zugriff auf das Secret als Umgebungsvariable
HF_TOKEN = os.getenv("HF_WRITE")
# Überprüfen, ob das Secret geladen wurde
if HF_TOKEN is None:
raise ValueError("HF_TOKEN environment variable not set. Please set the secret in your Hugging Face Space.")
# Repository-Name
repo_id = "alexkueck/kkg_suche"
repo_type = "space"
# HfApi-Instanz erstellen
api = HfApi()
def upload_and_display_pdf(file):
if file is None:
return None, "Keine Datei hochgeladen."
# Extrahieren des Dateinamens aus dem vollen Pfad
filename = os.path.basename(file.name)
# Datei zum Hugging Face Space hochladen
upload_path = f"kkg_dokumente/{filename}"
api.upload_file(
path_or_fileobj=file.name,
path_in_repo=upload_path,
repo_id=repo_id,
repo_type=repo_type,
token=HF_TOKEN
)
# PDF in HTML umwandeln
doc = fitz.open(file.name)
html_content = ""
for page in doc:
html_content += page.get_text("html")
doc.close()
# Temporäre HTML-Datei erstellen
with tempfile.NamedTemporaryFile(delete=False, suffix=".html", mode="w", encoding="utf-8") as temp_file:
temp_file.write(html_content)
temp_html_path = temp_file.name
return temp_html_path, f"Datei '{filename}' erfolgreich hochgeladen und im Repository gespeichert."
# Gradio Interface erstellen
iface = gr.Interface(
fn=upload_and_display_pdf,
inputs=gr.File(label="PDF-Datei hochladen"),
outputs=[
gr.HTML(label="PDF-Inhalt"),
gr.Textbox(label="Status")
],
title="PDF Upload und Anzeige",
description="Laden Sie eine PDF-Datei hoch. Sie wird im 'kkg_dokumente' Ordner des Repositories gespeichert und hier angezeigt."
)
# App starten
iface.launch()
"""
"""
# Zugriff auf das Secret als Umgebungsvariable
HF_TOKEN = os.getenv("HF_WRITE")
# Überprüfen, ob das Secret geladen wurde
if HF_TOKEN is None:
raise ValueError("HF_TOKEN environment variable not set. Please set the secret in your Hugging Face Space.")
# Repository-Name
repo_id = "alexkueck/kkg_suche"
# Absoluter Pfad zum Verzeichnis mit den Dokumenten
DOCS_DIR = "kkg_dokumente"
# Funktion zum Extrahieren des Textes aus einer PDF-Datei
def extract_text_from_pdf(pdf_path):
doc = fitz.open(pdf_path)
text = []
for page in doc:
text.append(page.get_text())
return text
# Dynamische Erstellung der Dokumentenliste und Extraktion der Texte
documents = []
for file_name in os.listdir(DOCS_DIR):
if file_name.endswith(".pdf"):
pdf_path = os.path.join(DOCS_DIR, file_name)
pages_text = extract_text_from_pdf(pdf_path)
documents.append({"file": file_name, "pages": pages_text})
# TF-IDF Vectorizer vorbereiten
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([page for doc in documents for page in doc['pages']])
def display_document(doc_name):
if isinstance(doc_name, list):
doc_name = doc_name[0] # Nehmen Sie das erste Element, falls eine Liste übergeben wurde
file_path = os.path.join(DOCS_DIR, doc_name)
if not os.path.exists(file_path):
return f"<p>Fehler: Datei nicht gefunden - {file_path}</p>"
# Generieren Sie die URL für das PDF
file_url = f"file://{file_path}"
return f'<iframe src="{file_url}" width="100%" height="600px"></iframe>'
def search_documents(query):
if not query:
return [doc['file'] for doc in documents], "", []
query_vector = vectorizer.transform([query])
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
related_docs_indices = cosine_similarities.argsort()[::-1]
results = []
relevant_text = ""
relevant_pdfs = []
num_pages_per_doc = [len(doc['pages']) for doc in documents]
cumulative_pages = [sum(num_pages_per_doc[:i+1]) for i in range(len(num_pages_per_doc))]
for i in related_docs_indices:
if cosine_similarities[i] > 0:
doc_index = next(idx for idx, cumulative in enumerate(cumulative_pages) if i < cumulative)
page_index = i if doc_index == 0 else i - cumulative_pages[doc_index-1]
doc = documents[doc_index]
results.append(doc['file'])
page_content = doc['pages'][page_index]
index = page_content.lower().find(query.lower())
if index != -1:
start = max(0, index - 100)
end = min(len(page_content), index + 100)
relevant_text += f"Aus {doc['file']} (Seite {page_index + 1}):\n...{page_content[start:end]}...\n\n"
relevant_pdfs.append((doc['file'], page_index))
return results, relevant_text, relevant_pdfs
def update_display(doc_name):
return display_document(doc_name)
def search_and_update(query):
results, rel_text, relevant_pdfs = search_documents(query)
pdf_html = ""
for pdf, page in relevant_pdfs:
pdf_path = os.path.join(DOCS_DIR, pdf)
if not os.path.exists(pdf_path):
pdf_html += f"<p>Fehler: Datei nicht gefunden - {pdf_path}</p>"
else:
file_url = f"file://{pdf_path}"
pdf_html += f"<h3>{pdf} - Seite {page+1}</h3>"
pdf_html += f'<iframe src="{file_url}#page={page+1}" width="100%" height="600px"></iframe>'
return gr.update(choices=results, value=results[0] if results else None), rel_text, pdf_html
def upload_file(file):
local_file_path = file.name
target_path_in_space = f"kkg_dokumente/{file.orig_name}"
api = HfApi()
api.upload_file(
path_or_fileobj=local_file_path,
path_in_repo=target_path_in_space,
repo_id=repo_id,
token=HF_TOKEN,
repo_type="space"
)
return file.name
# Initialisieren der Gradio-Oberfläche
with gr.Blocks() as demo:
gr.Markdown("# Dokumentensuche und -anzeige")
query_input = gr.Textbox(label="Suchbegriff (leer lassen für alle Dokumente)")
file_input = gr.File(label="Dokument hochladen", file_types=[".pdf"], type="file")
with gr.Row():
with gr.Column(scale=2):
doc_dropdown = gr.Dropdown(choices=[doc['file'] for doc in documents], label="Dokumente", allow_custom_value=True)
doc_display = gr.HTML(label="Dokumentvorschau")
with gr.Column(scale=1):
relevant_text = gr.Textbox(label="Relevanter Text", lines=10)
pdf_display = gr.HTML()
query_input.submit(search_and_update, inputs=[query_input], outputs=[doc_dropdown, relevant_text, pdf_display])
doc_dropdown.change(update_display, inputs=[doc_dropdown], outputs=[doc_display])
file_input.upload(upload_file, inputs=file_input, outputs=[doc_dropdown])
demo.launch()
"""
"""
import gradio as gr
import os
import fitz # PyMuPDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Absoluter Pfad zum Verzeichnis mit den Dokumenten
DOCS_DIR = os.path.abspath("kkg_dokumente")
# Funktion zum Extrahieren des Textes aus einer PDF-Datei
def extract_text_from_pdf(pdf_path):
doc = fitz.open(pdf_path)
text = []
for page in doc:
text.append(page.get_text())
return text
# Dynamische Erstellung der Dokumentenliste und Extraktion der Texte
documents = []
for file_name in os.listdir(DOCS_DIR):
if file_name.endswith(".pdf"):
pdf_path = os.path.join(DOCS_DIR, file_name)
pages_text = extract_text_from_pdf(pdf_path)
documents.append({"file": file_name, "pages": pages_text})
# TF-IDF Vectorizer vorbereiten
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([page for doc in documents for page in doc['pages']])
def display_document(doc_name):
if isinstance(doc_name, list):
doc_name = doc_name[0] # Nehmen Sie das erste Element, falls eine Liste übergeben wurde
file_path = os.path.join(DOCS_DIR, doc_name)
if not os.path.exists(file_path):
return f"<p>Fehler: Datei nicht gefunden - {file_path}</p>"
# Generieren Sie die URL für das PDF
file_url = f"file://{file_path}"
return f'<iframe src="{file_url}" width="100%" height="600px"></iframe>'
def search_documents(query):
if not query:
return [doc['file'] for doc in documents], "", []
query_vector = vectorizer.transform([query])
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
related_docs_indices = cosine_similarities.argsort()[::-1]
results = []
relevant_text = ""
relevant_pdfs = []
num_pages_per_doc = [len(doc['pages']) for doc in documents]
cumulative_pages = [sum(num_pages_per_doc[:i+1]) for i in range(len(num_pages_per_doc))]
for i in related_docs_indices:
if cosine_similarities[i] > 0:
doc_index = next(idx for idx, cumulative in enumerate(cumulative_pages) if i < cumulative)
page_index = i if doc_index == 0 else i - cumulative_pages[doc_index-1]
doc = documents[doc_index]
results.append(doc['file'])
page_content = doc['pages'][page_index]
index = page_content.lower().find(query.lower())
if index != -1:
start = max(0, index - 100)
end = min(len(page_content), index + 100)
relevant_text += f"Aus {doc['file']} (Seite {page_index + 1}):\n...{page_content[start:end]}...\n\n"
relevant_pdfs.append((doc['file'], page_index))
return results, relevant_text, relevant_pdfs
def update_display(doc_name):
return display_document(doc_name)
def search_and_update(query):
results, rel_text, relevant_pdfs = search_documents(query)
pdf_html = ""
for pdf, page in relevant_pdfs:
pdf_path = os.path.join(DOCS_DIR, pdf)
if not os.path.exists(pdf_path):
pdf_html += f"<p>Fehler: Datei nicht gefunden - {pdf_path}</p>"
else:
file_url = f"file://{pdf_path}"
pdf_html += f"<h3>{pdf} - Seite {page+1}</h3>"
pdf_html += f'<iframe src="{file_url}#page={page+1}" width="100%" height="600px"></iframe>'
return gr.update(choices=results, value=results[0] if results else None), rel_text, pdf_html
def upload_file(file):
file_name = "uploaded_file.pdf"
file_path = os.path.join(DOCS_DIR, file_name)
# Debugging-Ausgabe: Überprüfen Sie, ob das Verzeichnis existiert
if not os.path.exists(DOCS_DIR):
print(f"Verzeichnis {DOCS_DIR} existiert nicht. Erstelle Verzeichnis.")
os.makedirs(DOCS_DIR)
# Debugging-Ausgabe: Dateiname und Pfad
print(f"Speichere Datei nach {file_path}")
with open(file_path, "wb") as f:
f.write(file)
# Überprüfen, ob die Datei korrekt gespeichert wurde
if os.path.exists(file_path):
print(f"Datei erfolgreich gespeichert: {file_path}")
else:
print(f"Fehler beim Speichern der Datei: {file_path}")
# Aktualisieren Sie die Dokumentenliste und die TF-IDF-Matrix
pages_text = extract_text_from_pdf(file_path)
documents.append({"file": file_name, "pages": pages_text})
global tfidf_matrix
tfidf_matrix = vectorizer.fit_transform([page for doc in documents for page in doc['pages']])
return gr.update(choices=[doc['file'] for doc in documents], value=file_name)
# Initialisieren der Gradio-Oberfläche
with gr.Blocks() as demo:
gr.Markdown("# Dokumentensuche und -anzeige")
query_input = gr.Textbox(label="Suchbegriff (leer lassen für alle Dokumente)")
file_input = gr.File(label="Dokument hochladen", file_types=[".pdf"], type="binary")
with gr.Row():
with gr.Column(scale=2):
doc_dropdown = gr.Dropdown(choices=[doc['file'] for doc in documents], label="Dokumente", allow_custom_value=True)
doc_display = gr.HTML(label="Dokumentvorschau")
with gr.Column(scale=1):
relevant_text = gr.Textbox(label="Relevanter Text", lines=10)
pdf_display = gr.HTML()
query_input.submit(search_and_update, inputs=[query_input], outputs=[doc_dropdown, relevant_text, pdf_display])
doc_dropdown.change(update_display, inputs=[doc_dropdown], outputs=[doc_display])
file_input.upload(upload_file, inputs=file_input, outputs=[doc_dropdown])
demo.launch()
"""
"""
import gradio as gr
import os
import fitz # PyMuPDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Absoluter Pfad zum Verzeichnis mit den Dokumenten
DOCS_DIR = os.path.abspath("kkg_dokumente")
# Funktion zum Extrahieren des Textes aus einer PDF-Datei
def extract_text_from_pdf(pdf_path):
doc = fitz.open(pdf_path)
text = []
for page in doc:
text.append(page.get_text())
return text
# Dynamische Erstellung der Dokumentenliste und Extraktion der Texte
documents = []
for file_name in os.listdir(DOCS_DIR):
if file_name.endswith(".pdf"):
pdf_path = os.path.join(DOCS_DIR, file_name)
pages_text = extract_text_from_pdf(pdf_path)
documents.append({"file": file_name, "pages": pages_text})
# TF-IDF Vectorizer vorbereiten
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([page for doc in documents for page in doc['pages']])
def display_document(doc_name):
if isinstance(doc_name, list):
doc_name = doc_name[0] # Nehmen Sie das erste Element, falls eine Liste übergeben wurde
file_path = os.path.join(DOCS_DIR, doc_name)
if not os.path.exists(file_path):
return f"<p>Fehler: Datei nicht gefunden - {file_path}</p>"
# Generieren Sie die URL für das PDF
file_url = f"file://{file_path}"
return f'<iframe src="{file_url}" width="100%" height="600px"></iframe>'
def search_documents(query):
if not query:
return [doc['file'] for doc in documents], "", []
query_vector = vectorizer.transform([query])
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
related_docs_indices = cosine_similarities.argsort()[::-1]
results = []
relevant_text = ""
relevant_pdfs = []
num_pages_per_doc = [len(doc['pages']) for doc in documents]
cumulative_pages = [sum(num_pages_per_doc[:i+1]) for i in range(len(num_pages_per_doc))]
for i in related_docs_indices:
if cosine_similarities[i] > 0:
doc_index = next(idx for idx, cumulative in enumerate(cumulative_pages) if i < cumulative)
page_index = i if doc_index == 0 else i - cumulative_pages[doc_index-1]
doc = documents[doc_index]
results.append(doc['file'])
page_content = doc['pages'][page_index]
index = page_content.lower().find(query.lower())
if index != -1:
start = max(0, index - 100)
end = min(len(page_content), index + 100)
relevant_text += f"Aus {doc['file']} (Seite {page_index + 1}):\n...{page_content[start:end]}...\n\n"
relevant_pdfs.append((doc['file'], page_index))
return results, relevant_text, relevant_pdfs
def update_display(doc_name):
return display_document(doc_name)
def search_and_update(query):
results, rel_text, relevant_pdfs = search_documents(query)
pdf_html = ""
for pdf, page in relevant_pdfs:
pdf_path = os.path.join(DOCS_DIR, pdf)
if not os.path.exists(pdf_path):
pdf_html += f"<p>Fehler: Datei nicht gefunden - {pdf_path}</p>"
else:
file_url = f"file://{pdf_path}"
pdf_html += f"<h3>{pdf} - Seite {page+1}</h3>"
pdf_html += f'<iframe src="{file_url}#page={page+1}" width="100%" height="600px"></iframe>'
return gr.update(choices=results, value=results[0] if results else None), rel_text, pdf_html
def upload_file(file):
file_path = os.path.join(DOCS_DIR, file.name)
with open(file_path, "wb") as f:
f.write(file.read())
# Aktualisieren Sie die Dokumentenliste und die TF-IDF-Matrix
pages_text = extract_text_from_pdf(file_path)
documents.append({"file": file.name, "pages": pages_text})
global tfidf_matrix
tfidf_matrix = vectorizer.fit_transform([page for doc in documents for page in doc['pages']])
return gr.update(choices=[doc['file'] for doc in documents], value=file.name)
# Initialisieren der Gradio-Oberfläche
with gr.Blocks() as demo:
gr.Markdown("# Dokumentensuche und -anzeige")
query_input = gr.Textbox(label="Suchbegriff (leer lassen für alle Dokumente)")
file_input = gr.File(label="Dokument hochladen", file_types=[".pdf"], type="binary")
with gr.Row():
with gr.Column(scale=2):
doc_dropdown = gr.Dropdown(choices=[doc['file'] for doc in documents], label="Dokumente", allow_custom_value=True)
doc_display = gr.HTML(label="Dokumentvorschau")
with gr.Column(scale=1):
relevant_text = gr.Textbox(label="Relevanter Text", lines=10)
pdf_display = gr.HTML()
query_input.submit(search_and_update, inputs=[query_input], outputs=[doc_dropdown, relevant_text, pdf_display])
doc_dropdown.change(update_display, inputs=[doc_dropdown], outputs=[doc_display])
file_input.upload(upload_file, inputs=file_input, outputs=[doc_dropdown])
demo.launch()
"""
# NOTE: marked as working ("funktioniert") by the author.
"""
import gradio as gr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Beispiel-Daten mit hartcodierten Texten
documents = [
{"file": "document1.pdf", "pages": ["Seite 1 Inhalt von Dokument 1", "Seite 2 Inhalt von Dokument 1"]},
{"file": "document2.pdf", "pages": ["Seite 1 Inhalt von Dokument 2", "Seite 2 Inhalt von Dokument 2"]}
]
# TF-IDF Vectorizer vorbereiten
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([page for doc in documents for page in doc['pages']])
def display_document(doc_name):
# Hartcodierter HTML-Inhalt zur Anzeige des Dokuments
hardcoded_html = f
<h1>{doc_name}</h1>
<p>Dies ist ein Beispieltext für die Anzeige des Dokuments {doc_name}.</p>
<iframe src="https://www.example.com" width="100%" height="600px"></iframe>
return hardcoded_html
def search_documents(query):
if not query:
return [doc['file'] for doc in documents], "", []
query_vector = vectorizer.transform([query])
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
related_docs_indices = cosine_similarities.argsort()[::-1]
results = []
relevant_text = ""
relevant_pdfs = []
num_pages_per_doc = [len(doc['pages']) for doc in documents]
cumulative_pages = [sum(num_pages_per_doc[:i+1]) for i in range(len(num_pages_per_doc))]
for i in related_docs_indices:
if cosine_similarities[i] > 0:
doc_index = next(idx for idx, cumulative in enumerate(cumulative_pages) if i < cumulative)
page_index = i if doc_index == 0 else i - cumulative_pages[doc_index-1]
doc = documents[doc_index]
results.append(doc['file'])
page_content = doc['pages'][page_index]
index = page_content.lower().find(query.lower())
if index != -1:
start = max(0, index - 100)
end = min(len(page_content), index + 100)
relevant_text += f"Aus {doc['file']} (Seite {page_index + 1}):\n...{page_content[start:end]}...\n\n"
relevant_pdfs.append((doc['file'], page_index))
return results, relevant_text, relevant_pdfs
def update_display(doc_name):
return display_document(doc_name)
def search_and_update(query):
results, rel_text, relevant_pdfs = search_documents(query)
pdf_html = ""
for pdf, page in relevant_pdfs:
# Hartcodierter HTML-Inhalt zur Anzeige der Suchergebnisse
pdf_html += f"<h3>{pdf} - Seite {page+1}</h3>"
pdf_html += f'<iframe src="https://www.example.com" width="100%" height="600px"></iframe>'
return results, rel_text, pdf_html
# Initialisieren der Gradio-Oberfläche
with gr.Blocks() as demo:
gr.Markdown("# Dokumentensuche und -anzeige")
query_input = gr.Textbox(label="Suchbegriff (leer lassen für alle Dokumente)")
with gr.Row():
with gr.Column(scale=2):
doc_dropdown = gr.Dropdown(choices=[doc['file'] for doc in documents], label="Dokumente")
doc_display = gr.HTML(label="Dokumentvorschau")
with gr.Column(scale=1):
relevant_text = gr.Textbox(label="Relevanter Text", lines=10)
pdf_display = gr.HTML()
query_input.submit(search_and_update, inputs=[query_input], outputs=[doc_dropdown, relevant_text, pdf_display])
doc_dropdown.change(update_display, inputs=[doc_dropdown], outputs=[doc_display])
demo.launch()
"""
"""
import gradio as gr
import os
import fitz # PyMuPDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Verwenden Sie den korrekten Pfad für die hochgeladenen Dateien in Ihrem Hugging Face Space
DOCS_DIR = os.path.abspath("kkg_dokumente")
# Funktion zum Extrahieren des Textes aus einer PDF-Datei
def extract_text_from_pdf(pdf_path):
doc = fitz.open(pdf_path)
text = []
for page in doc:
text.append(page.get_text())
return text
# Dynamische Erstellung der Dokumentenliste und Extraktion der Texte
documents = []
for file_name in os.listdir(DOCS_DIR):
if file_name.endswith(".pdf"):
pdf_path = os.path.join(DOCS_DIR, file_name)
pages_text = extract_text_from_pdf(pdf_path)
documents.append({"file": file_name, "pages": pages_text})
# TF-IDF Vectorizer vorbereiten
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([page for doc in documents for page in doc['pages']])
def display_document(doc_name):
file_path = os.path.join(DOCS_DIR, doc_name)
if not os.path.exists(file_path):
return f"<p>Fehler: Datei nicht gefunden - {file_path}</p>"
# Generieren Sie die URL für das PDF
file_url = f"{DOCS_DIR}/{doc_name}"
return f'<iframe src="{file_url}" width="100%" height="600px"></iframe>'
def search_documents(query):
if not query:
return [doc['file'] for doc in documents], "", []
query_vector = vectorizer.transform([query])
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
related_docs_indices = cosine_similarities.argsort()[::-1]
results = []
relevant_text = ""
relevant_pdfs = []
num_pages_per_doc = [len(doc['pages']) for doc in documents]
cumulative_pages = [sum(num_pages_per_doc[:i+1]) for i in range(len(num_pages_per_doc))]
for i in related_docs_indices:
if cosine_similarities[i] > 0:
doc_index = next(idx for idx, cumulative in enumerate(cumulative_pages) if i < cumulative)
page_index = i if doc_index == 0 else i - cumulative_pages[doc_index-1]
doc = documents[doc_index]
results.append(doc['file'])
page_content = doc['pages'][page_index]
index = page_content.lower().find(query.lower())
if index != -1:
start = max(0, index - 100)
end = min(len(page_content), index + 100)
relevant_text += f"Aus {doc['file']} (Seite {page_index + 1}):\n...{page_content[start:end]}...\n\n"
relevant_pdfs.append((doc['file'], page_index))
return results, relevant_text, relevant_pdfs
def update_display(doc_name):
return display_document(doc_name)
def search_and_update(query):
results, rel_text, relevant_pdfs = search_documents(query)
pdf_html = ""
for pdf, page in relevant_pdfs:
pdf_path = os.path.join(DOCS_DIR, pdf)
if not os.path.exists(pdf_path):
pdf_html += f"<p>Fehler: Datei nicht gefunden - {pdf_path}</p>"
else:
file_url = f"{DOCS_DIR}/{pdf}"
pdf_html += f"<h3>{pdf} - Seite {page+1}</h3>"
pdf_html += f'<iframe src="{file_url}#page={page+1}" width="100%" height="600px"></iframe>'
return gr.Dropdown.update(choices=results), rel_text, pdf_html
# Initialisieren der Gradio-Oberfläche
with gr.Blocks() as demo:
gr.Markdown("# Dokumentensuche und -anzeige")
query_input = gr.Textbox(label="Suchbegriff (leer lassen für alle Dokumente)")
with gr.Row():
with gr.Column(scale=2):
doc_dropdown = gr.Dropdown(choices=[doc['file'] for doc in documents], label="Dokumente")
doc_display = gr.HTML(label="Dokumentvorschau")
with gr.Column(scale=1):
relevant_text = gr.Textbox(label="Relevanter Text", lines=10)
pdf_display = gr.HTML()
query_input.submit(search_and_update, inputs=[query_input], outputs=[doc_dropdown, relevant_text, pdf_display])
doc_dropdown.change(update_display, inputs=[doc_dropdown], outputs=[doc_display])
demo.launch()
"""