SucheRAG

Sleeping

App Files Files Community

alexkueck committed on Jul 9, 2024

Commit

b48a2c8

verified ·

1 Parent(s): 1b849d5

Update utils.py

Browse files

Files changed (1) hide show

utils.py +28 -7

utils.py CHANGED Viewed

@@ -60,7 +60,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from chromadb.errors import InvalidDimensionException
 import fitz  # PyMuPDF
 import docx
-from huggingface_hub import hf_hub_download
 #import io
 #from PIL import Image, ImageDraw, ImageOps, ImageFont
 #import base64
@@ -314,12 +314,23 @@ def create_directory_loader(file_type, directory_path):
     def load(self):
         documents = []
-        # Annahme: directory_path ist jetzt ein Pfad innerhalb des Hugging Face Spaces
-        files = self.list_files_in_hf_space(self.directory_path)
-        for file in files:
-            if file.endswith(self.file_type):
-                file_path = self.access_pdf(file)
-                documents.extend(self.loader_func(file_path))
         return documents
     return CustomLoader(directory_path, file_type, loaders[file_type])
@@ -401,6 +412,7 @@ def document_loading_splitting():
     #os.makedirs(download_dir, exist_ok=True)
     # Dateien im Hugging Face Space auflisten
     files_in_repo = list_files_in_hf_repo(STORAGE_REPO_ID, "chroma/kkg/pdf/")
     print("hier.....................................")
     # Dateien aus dem Hugging Face Space mit der STORAGE_REPO_ID herunterladen
@@ -413,6 +425,15 @@ def document_loading_splitting():
             download_file_from_hf(file_name, local_file_path)
         print("file_name..................."+str(file_name))
         print("local_file_path..................."+str(local_file_path))
     # Erstellen von DirectoryLoader für jeden Dateityp
     pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
     word_loader = create_directory_loader('.word', CHROMA_WORD)

 from chromadb.errors import InvalidDimensionException
 import fitz  # PyMuPDF
 import docx
+from huggingface_hub import hf_hub_download, list_repo_files
 #import io
 #from PIL import Image, ImageDraw, ImageOps, ImageFont
 #import base64
     def load(self):
         documents = []
+            for file_path in self.file_list:
+                with tempfile.NamedTemporaryFile(delete=False, suffix=self.file_type) as temp_file:
+                    temp_path = temp_file.name
+                # Datei aus dem Hugging Face Space herunterladen
+                hf_hub_download(
+                    repo_id=STORAGE_REPO_ID,
+                    filename=file_path,
+                    repo_type="space",
+                    local_dir=os.path.dirname(temp_path),
+                    local_dir_use_symlinks=False
+                )
+                documents.extend(self.loader_func(temp_path))
+                # Temporäre Datei löschen
+                os.unlink(temp_path)
         return documents
     return CustomLoader(directory_path, file_type, loaders[file_type])
     #os.makedirs(download_dir, exist_ok=True)
     # Dateien im Hugging Face Space auflisten
+    """
     files_in_repo = list_files_in_hf_repo(STORAGE_REPO_ID, "chroma/kkg/pdf/")
     print("hier.....................................")
     # Dateien aus dem Hugging Face Space mit der STORAGE_REPO_ID herunterladen
             download_file_from_hf(file_name, local_file_path)
         print("file_name..................."+str(file_name))
         print("local_file_path..................."+str(local_file_path))
+    """
+    # Dateien im Hugging Face Space auflisten
+    files_in_repo = list_repo_files(repo_id=STORAGE_REPO_ID, repo_type="space")
+    pdf_files = [f for f in files_in_repo if f.endswith('.pdf') and f.startswith("chroma/kkg/pdf/")]
+    word_files = [f for f in files_in_repo if f.endswith('.docx') and f.startswith("chroma/kkg/word/")]
     # Erstellen von DirectoryLoader für jeden Dateityp
     pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
     word_loader = create_directory_loader('.word', CHROMA_WORD)