Spaces:

CoExperiences
/

aie4-final

Paused

App Files Files Community

danicafisher committed on Oct 18, 2024

Commit

72762da

•

1 Parent(s): b8f7893

Adds url function

Browse files

Files changed (1) hide show

helper_functions.py +43 -12

helper_functions.py CHANGED Viewed

@@ -1,22 +1,17 @@
-from langchain_community.document_loaders import PyMuPDFLoader, TextLoader, WebBaseLoader
 from langchain_community.vectorstores import Qdrant
 import os
-def process_file(file_or_url):
-    if isinstance(file_or_url, str) and file_or_url.startswith(('http://', 'https://')):
-        # Handle URL
-        loader = WebBaseLoader(file_or_url)
-        docs = loader.load()
-        documents.extend(docs)
     # save the file temporarily
-    temp_file = "./"+file_or_url.path
     with open(temp_file, "wb") as file:
-       file.write(file_or_url.content)
-       file_name = file_or_url.name
     documents = []
-    if file_or_url.path.endswith(".pdf"):
-        loader = PyMuPDFLoader(temp_file)
         docs = loader.load()
         documents.extend(docs)
     else:
@@ -25,6 +20,42 @@ def process_file(file_or_url):
         documents.extend(docs)
     return documents
 def add_to_qdrant(documents, embeddings, qdrant_client, collection_name):
     Qdrant.from_documents(

+from langchain_community.document_loaders import PyPDFLoader, TextLoader, UnstructuredURLLoader
 from langchain_community.vectorstores import Qdrant
 import os
+import requests
+def process_file(file):
     # save the file temporarily
+    temp_file = "./"+file.path
     with open(temp_file, "wb") as file:
+       file.write(file.content)
     documents = []
+    if file.path.endswith(".pdf"):
+        loader = PyPDFLoader(temp_file)
         docs = loader.load()
         documents.extend(docs)
     else:
         documents.extend(docs)
     return documents
+def load_documents_from_url(url):
+    try:
+        # Check if it's a PDF
+        if url.endswith(".pdf"):
+            try:
+                loader = PyPDFLoader(url)
+                return loader.load()
+            except Exception as e:
+                print(f"Error loading PDF from {url}: {e}")
+                return None
+        # Fetch the content and check for video pages
+        try:
+            response = requests.head(url, timeout=10)  # Timeout for fetching headers
+            content_type = response.headers.get('Content-Type', '')
+        except Exception as e:
+            print(f"Error fetching headers from {url}: {e}")
+            return None
+        # Ignore video content (flagged for now)
+        if 'video' in content_type:
+            return None
+        if 'youtube' in url:
+            return None
+        # Otherwise, treat it as an HTML page
+        try:
+            loader = UnstructuredURLLoader([url])
+            return loader.load()
+        except Exception as e:
+            print(f"Error loading HTML from {url}: {e}")
+            return None
+    except Exception as e:
+        print(f"General error loading from {url}: {e}")
+        return None
 def add_to_qdrant(documents, embeddings, qdrant_client, collection_name):
     Qdrant.from_documents(