import os

from langchain_community.document_loaders import AsyncChromiumLoader, PyPDFLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_core.messages import AIMessage
from fake_useragent import UserAgent

# Set a random User-Agent so scraping requests are less likely to be blocked.
ua = UserAgent()
os.environ["USER_AGENT"] = ua.random

def scraper(url: str, doc_type: str) -> dict:
    """Scrape a URL as HTML or PDF and return its source plus the extracted text as an AIMessage."""
    if doc_type == "html":
        try:
            # Load the page with a headless Chromium browser.
            loader = AsyncChromiumLoader([url])
            html = loader.load()
            # Transform: keep only the text inside <p> tags.
            bs_transformer = BeautifulSoupTransformer()
            docs_transformed = bs_transformer.transform_documents(html, tags_to_extract=["p"])
            result = {"source": url, "content": AIMessage(docs_transformed[0].page_content)}
            print(result)
            return result
        except Exception as e:
            return {"source": url, "content": AIMessage(f"Error scraping website: {str(e)}")}
    elif doc_type == "pdf":
        try:
            loader = PyPDFLoader(url)
            pages = loader.load_and_split()
            # AIMessage content must be a string, so join the page texts.
            text = "\n".join(page.page_content for page in pages)
            return {"source": url, "content": AIMessage(text)}
        except Exception as e:
            return {"source": url, "content": AIMessage(f"Error scraping PDF: {str(e)}")}
    else:
        return {"source": url, "content": AIMessage("Unsupported document type, supported types are 'html' and 'pdf'.")}


if __name__ == "__main__":
    scraper("https://python.langchain.com/v0.1/docs/modules/data_connection/document_loaders/pdf/", "html")