File size: 1,643 Bytes
75309ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import os
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.messages import AIMessage
from fake_useragent import UserAgent

# Pick one random User-Agent string at import time and publish it through the
# USER_AGENT environment variable for the rest of the process.
# NOTE(review): presumably this is consumed by the langchain loaders imported
# above -- confirm they read the variable when loading rather than at import
# time, since it is only set after those imports have already run.
ua = UserAgent()
os.environ["USER_AGENT"] = ua.random

def scraper(url: str, doc_type: str) -> dict:
    """Load a document and return its text wrapped in an ``AIMessage``.

    Args:
        url: Location of the document. For ``"html"`` it is loaded with
            ``AsyncChromiumLoader``; for ``"pdf"`` it is passed to
            ``PyPDFLoader``.
        doc_type: Kind of document to load, either ``"html"`` or ``"pdf"``.

    Returns:
        A dict of the form ``{"source": url, "content": AIMessage}``.
        For ``"html"`` the message holds the text extracted from ``<p>``
        tags of the first loaded document. For ``"pdf"`` it holds the text
        of all loaded chunks joined by blank lines. Errors are never raised
        to the caller: any failure, or an unsupported ``doc_type``, is
        reported as a human-readable message in ``content``.
    """
    if doc_type == "html":
        try:
            raw_docs = AsyncChromiumLoader([url]).load()
            # Keep only the text found inside <p> tags.
            docs_transformed = BeautifulSoupTransformer().transform_documents(
                raw_docs, tags_to_extract=["p"]
            )
            # A single URL was requested, so only the first document is used.
            result = {
                "source": url,
                "content": AIMessage(docs_transformed[0].page_content),
            }
            # Echo the result; the script entry point relies on this output.
            print(result)
            return result
        except Exception as e:
            # Best-effort boundary: report the failure instead of raising.
            return {"source": url, "content": AIMessage(f"Error scraping website: {e}")}
    elif doc_type == "pdf":
        try:
            pages = PyPDFLoader(url).load_and_split()
            # Message content must be text, not Document objects, so flatten
            # the chunks into one string (mirrors the HTML branch).
            text = "\n\n".join(page.page_content for page in pages)
            return {"source": url, "content": AIMessage(text)}
        except Exception as e:
            # Best-effort boundary: report the failure instead of raising.
            return {"source": url, "content": AIMessage(f"Error scraping PDF: {e}")}
    else:
        return {"source": url, "content": AIMessage("Unsupported document type, supported types are 'html' and 'pdf'.")}


if __name__ == "__main__":
    # Manual smoke test: scrape one known documentation page as HTML.
    demo_url = "https://python.langchain.com/v0.1/docs/modules/data_connection/document_loaders/pdf/"
    scraper(demo_url, "html")