Rauhan committed
Commit c4a2d1f
1 Parent(s): e1ca8b7

UPDATE: web crawler

Files changed (3)
  1. app.py +4 -8
  2. functions.py +34 -1
  3. requirements.txt +3 -1
app.py CHANGED
@@ -1,11 +1,9 @@
 import io
-import re
 from functions import *
 from PyPDF2 import PdfReader
-from bs4 import BeautifulSoup
 from fastapi import FastAPI, File, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
-from langchain_community.document_loaders import RecursiveUrlLoader
+from langchain_community.document_loaders import UnstructuredURLLoader


 app = FastAPI(title = "ConversAI", root_path = "/api/v1")
@@ -52,12 +50,10 @@ async def addText(vectorstore: str, text: str):

 @app.post("/addWebsite")
 async def addWebsite(vectorstore: str, websiteUrl: str):
-    def bs4_extractor(html: str) -> str:
-        soup = BeautifulSoup(html, "lxml")
-        return re.sub(r"\n\n+", "\n\n", soup.text).strip()
-    loader = RecursiveUrlLoader(websiteUrl, max_depth=2, timeout = 60, extractor=bs4_extractor)
+    urls = getLinks(websiteUrl)
+    loader = UnstructuredURLLoader(urls=urls)
     docs = loader.load()
-    text = "\n\n".join([docs[doc].page_content for doc in range(len(docs))])
+    text = "\n\n\n\n".join([f"Metadata:\n{doc.metadata} \nPage Content:\n {doc.page_content}" for doc in docs])
     return addDocuments(text = text, vectorstore = vectorstore)
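For reference, a minimal sketch of exercising the updated endpoint. The host and port are assumptions about a local run, and "myVectorstore" is a hypothetical store name; since vectorstore and websiteUrl are plain str parameters, FastAPI expects them as query parameters:

import requests

# Minimal smoke test for the updated /addWebsite endpoint.
# Host and port are assumptions about a local deployment; the /api/v1
# root_path typically only applies when the app sits behind a proxy,
# so a bare local run serves /addWebsite directly.
response = requests.post(
    "http://localhost:8000/addWebsite",
    params={
        "vectorstore": "myVectorstore",       # hypothetical vectorstore name
        "websiteUrl": "https://example.com",  # site to crawl and ingest
    },
)
print(response.json())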
functions.py CHANGED
@@ -18,9 +18,14 @@ from langchain.retrievers.document_compressors import FlashrankRerank
 from supabase.client import create_client
 from qdrant_client import QdrantClient
 from langchain_groq import ChatGroq
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
 from supabase import create_client
 from dotenv import load_dotenv
 import os
+import time
+import requests
+

 load_dotenv("secrets.env")
 client = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])
@@ -243,4 +248,32 @@ def listTables(username: str):
     except Exception as e:
         return {
             "error": e
-        }
+        }
+
+
+def getLinks(url: str, timeout = 30):
+    # Crawl one level deep: collect same-domain links from the start page,
+    # then from every page it links to, within a total time budget.
+    start = time.time()
+    def getLinksFromPage(url: str):
+        response = requests.get(url)
+        htmlContent = response.content
+        soup = BeautifulSoup(htmlContent, "lxml")
+        anchorTags = soup.find_all("a")
+        allLinks = []
+        for tag in anchorTags:
+            # Keep only absolute links on the same domain; relative hrefs
+            # parse with an empty netloc and are skipped.
+            if "href" in tag.attrs and urlparse(tag.attrs["href"]).netloc == urlparse(url).netloc:
+                allLinks.append(tag.attrs["href"])
+        return allLinks
+    links = getLinksFromPage(url)
+    uniqueLinks = set()
+    for link in links:
+        # Stop following links once the time budget is exhausted.
+        if time.time() - start > timeout:
+            break
+        uniqueLinks = uniqueLinks.union(set(getLinksFromPage(link)))
+    # Drop a trailing slash and deduplicate before returning.
+    return list({x[:-1] if x.endswith("/") else x for x in uniqueLinks})
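One caveat of the same-domain filter in getLinks: urlparse(href).netloc is empty for site-relative links such as "/about", so only absolute URLs survive the check. A minimal sketch of a variant that resolves relative hrefs first; this is an illustration of the technique, not part of the commit, and getLinksFromPageResolved is a hypothetical name:

from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

def getLinksFromPageResolved(url: str):
    # Variant of getLinksFromPage that also keeps relative links by
    # resolving them against the page URL before the same-domain check.
    soup = BeautifulSoup(requests.get(url).content, "lxml")
    links = []
    for tag in soup.find_all("a", href=True):
        absolute = urljoin(url, tag["href"])  # "/about" -> "https://example.com/about"
        if urlparse(absolute).netloc == urlparse(url).netloc:
            links.append(absolute)
    return links

With relative links resolved this way, a crawl of a site that navigates via relative hrefs would no longer come back empty.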
requirements.txt CHANGED
@@ -14,4 +14,6 @@ lxml
 PyPDF2
 python-dotenv
 sentence-transformers
-supabase
+supabase
+unstructured
+urllib3