danicafisher committed on
Commit
72762da
1 Parent(s): b8f7893

Adds url function

Browse files
Files changed (1) hide show
  1. helper_functions.py +43 -12
helper_functions.py CHANGED
@@ -1,22 +1,17 @@
1
- from langchain_community.document_loaders import PyMuPDFLoader, TextLoader, WebBaseLoader
2
  from langchain_community.vectorstores import Qdrant
3
  import os
 
4
 
5
- def process_file(file_or_url):
6
- if isinstance(file_or_url, str) and file_or_url.startswith(('http://', 'https://')):
7
- # Handle URL
8
- loader = WebBaseLoader(file_or_url)
9
- docs = loader.load()
10
- documents.extend(docs)
11
  # save the file temporarily
12
- temp_file = "./"+file_or_url.path
13
  with open(temp_file, "wb") as file:
14
- file.write(file_or_url.content)
15
- file_name = file_or_url.name
16
 
17
  documents = []
18
- if file_or_url.path.endswith(".pdf"):
19
- loader = PyMuPDFLoader(temp_file)
20
  docs = loader.load()
21
  documents.extend(docs)
22
  else:
@@ -25,6 +20,42 @@ def process_file(file_or_url):
25
  documents.extend(docs)
26
  return documents
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  def add_to_qdrant(documents, embeddings, qdrant_client, collection_name):
30
  Qdrant.from_documents(
 
1
+ from langchain_community.document_loaders import PyPDFLoader, TextLoader, UnstructuredURLLoader
2
  from langchain_community.vectorstores import Qdrant
3
  import os
4
+ import requests
5
 
6
+ def process_file(file):
 
 
 
 
 
7
  # save the file temporarily
8
+ temp_file = "./"+file.path
9
  with open(temp_file, "wb") as file:
10
+ file.write(file.content)
 
11
 
12
  documents = []
13
+ if file.path.endswith(".pdf"):
14
+ loader = PyPDFLoader(temp_file)
15
  docs = loader.load()
16
  documents.extend(docs)
17
  else:
 
20
  documents.extend(docs)
21
  return documents
22
 
23
def load_documents_from_url(url):
    """Load documents from a URL, picking a loader based on what it points to.

    PDF links are read with ``PyPDFLoader``. YouTube links, and resources
    whose ``Content-Type`` header mentions video, are skipped. Anything else
    is treated as an HTML page and read with ``UnstructuredURLLoader``.

    Args:
        url: Address of the resource to load.

    Returns:
        The value returned by the selected loader's ``load()``, or ``None``
        when the URL is skipped or any step fails. Failures are printed
        rather than raised, so callers must handle a ``None`` result.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    from urllib.parse import urlparse

    try:
        # Look at the path component only, case-insensitively, so links such
        # as ".../report.PDF?download=1" are still recognised as PDFs.
        if urlparse(url).path.lower().endswith(".pdf"):
            try:
                return PyPDFLoader(url).load()
            except Exception as e:
                print(f"Error loading PDF from {url}: {e}")
                return None

        # Ignore video content (flagged for now). The URL-based check runs
        # first because it needs no network round trip.
        if 'youtube' in url.lower():
            return None

        # Fetch only the headers to learn the content type. HEAD requests do
        # not follow redirects unless asked to, and the type of the final
        # resource is what matters here.
        try:
            response = requests.head(url, timeout=10, allow_redirects=True)
            content_type = response.headers.get('Content-Type', '')
        except Exception as e:
            print(f"Error fetching headers from {url}: {e}")
            return None

        if 'video' in content_type.lower():
            return None

        # Otherwise, treat it as an HTML page.
        try:
            return UnstructuredURLLoader([url]).load()
        except Exception as e:
            print(f"Error loading HTML from {url}: {e}")
            return None
    except Exception as e:
        # Last-resort guard (e.g. a non-string ``url``): keep the
        # best-effort contract of returning None instead of raising.
        print(f"General error loading from {url}: {e}")
        return None
58
+
59
 
60
  def add_to_qdrant(documents, embeddings, qdrant_client, collection_name):
61
  Qdrant.from_documents(