# Page banner captured together with this source: "Spaces: Sleeping"
############################################################################################################################################################### | |
# _____ _ ___ _ ___ | |
# |_ _|| |_ ___ | _ )(_) __ _ / __| __ _ _ __ _ _ __ ___ _ _ | |
# | | | ' \ / -_) | _ \| |/ _` | \__ \/ _|| '_|/ _` || '_ \/ -_)| '_| | |
# |_| |_||_|\___| |___/|_|\__, | |___/\__||_| \__,_|| .__/\___||_| | |
# |___/ |_| | |
# | |
############################################################################################################################################################## | |
# _ ______ _ _ _______ _ _ | |
# _ | | (_____ \ | | (_) (_______) (_) (_) | |
# _____ _ _ _| |_ | |__ ___ ____ _ _____) ) ___ __| | ____ _ ____ ___ _ _ _ _____ ___ _ ____ _ | |
# (____ || | | |(_ _)| _ \ / _ \ / ___)(_) | __ / / _ \ / _ | / ___)| | / _ | / _ \ | ||_|| |(____ | /___)| || _ \ | | | |
# / ___ || |_| | | |_ | | | || |_| || | _ | | \ \ | |_| |( (_| || | | |( (_| || |_| | | | | |/ ___ ||___ || || | | || | | |
# \_____||____/ \__)|_| |_| \___/ |_| (_) |_| |_| \___/ \____||_| |_| \___ | \___/ |_| |_|\_____|(___/ |_||_| |_||_| | |
# (_____| | |
############################################################################################################################################################### | |
# | |
# Last updated in: 8/20/2024 | |
# | |
############################################################################################################################################################### | |
# ------------------------------------------------------------------------------ | |
# IMPORTS | |
# ------------------------------------------------------------------------------ | |
import gradio as gr | |
from bs4 import BeautifulSoup as Soup | |
from langchain_community.document_loaders import (AsyncHtmlLoader, | |
NewsURLLoader, PubMedLoader, | |
PlaywrightURLLoader, | |
RecursiveUrlLoader, | |
SeleniumURLLoader, | |
UnstructuredURLLoader, | |
WebBaseLoader) | |
from selenium import webdriver | |
from selenium.common.exceptions import WebDriverException | |
from PIL import Image | |
from io import BytesIO | |
# ------------------------------------------------------------------------------ | |
# THE BIG SCRAPER METHOD | |
# ------------------------------------------------------------------------------ | |
def extractDataFromUrls(urls: str, loader_type: str):
    """Extract documents from the given URLs using the selected loader.

    Args:
        urls (str): Comma-separated URLs. Whitespace around each URL is
            stripped and empty entries are ignored.
        loader_type (str): Name of the loader to use for the extraction.

    Returns:
        tuple: Always a 3-tuple ``(json_data, documents, first_url)`` so the
        result lines up with the three Gradio outputs this callback feeds.
        On success ``json_data`` is a list with ``to_json()`` of every loaded
        item, ``documents`` is the list returned by the loader and
        ``first_url`` is the first URL that was supplied. When the loader is
        not implemented, no URL was given, or an exception occurs, the first
        two slots carry human-readable messages instead; ``first_url`` is
        ``None`` when no URL could be parsed.
    """
    first_url = None
    try:
        # "a, b," -> ["a", "b"]: a bare split(',') would keep " b" and "".
        url_list = [part.strip() for part in urls.split(',')]
        url_list = [url for url in url_list if url]
        if not url_list:
            return "No URLs provided", "No URLs provided", None
        first_url = url_list[0]

        # Factories are lambdas so a loader is only constructed once selected.
        # RecursiveURL and PubMed receive only the first entry.
        loader_factories = {
            'AsyncHtmlLoader': lambda: AsyncHtmlLoader(url_list),
            'UnstructuredURL': lambda: UnstructuredURLLoader(urls=url_list),
            'RecursiveURL': lambda: RecursiveUrlLoader(
                url=first_url,
                max_depth=2,
                extractor=lambda x: Soup(x, "html.parser").text,
            ),
            'SeleniumURL': lambda: SeleniumURLLoader(urls=url_list),
            'SeleniumURLH': lambda: SeleniumURLLoader(urls=url_list, headless=False),
            'PlaywrightURL': lambda: PlaywrightURLLoader(urls=url_list),
            'PubMed': lambda: PubMedLoader(first_url),
            'NewsURL': lambda: NewsURLLoader(url_list),
            'WebBaseLoader': lambda: WebBaseLoader(url_list),
        }
        make_loader = loader_factories.get(loader_type)
        if make_loader is None:
            return "Not Implemented. Development in Progress", "Work In Progress", first_url

        # Load data using the selected loader
        data = make_loader().load()
        # Convert data to JSON format
        jsonData = [item.to_json() for item in data]
        return jsonData, data, first_url
    except Exception as err:
        # Top-level UI boundary: report the failure instead of crashing the app.
        return "An Error Occurred. Contact Developer" + str(err), "Error Occured. Boom", first_url
# ------------------------------------------------------------------------------ | |
# WEB DATA AND SCREENSHOT | |
# ------------------------------------------------------------------------------ | |
def take_webdata(url):
    """Open ``url`` in headless Chrome and return a screenshot and the title.

    Args:
        url: Address of the page to open.

    Returns:
        tuple: ``(image, page_title)`` where ``image`` is a PIL image of the
        rendered page. If ``url`` is empty or the browser raises a
        ``WebDriverException``, a 1x1 placeholder image is returned together
        with whatever title was read before the failure (``""`` if none).
    """
    if not url:
        # Nothing to navigate to; skip starting a browser.
        return Image.new('RGB', (1, 1)), ""

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    # Pre-bind both names: the except/finally clauses below read them even
    # when the failure happens before they would otherwise be assigned.
    wd = None
    page_title = ""
    try:
        wd = webdriver.Chrome(options=options)
        wd.set_window_size(1080, 720)
        wd.get(url)
        # NOTE(review): an implicit wait set after get() presumably only
        # affects later element lookups, none of which happen here - confirm.
        wd.implicitly_wait(5)
        page_title = wd.title
        screenshot = wd.get_screenshot_as_png()
    except WebDriverException:
        return Image.new('RGB', (1, 1)), page_title
    finally:
        # Always release the browser, but only if it was actually started.
        if wd is not None:
            wd.quit()
    return Image.open(BytesIO(screenshot)), page_title
# ------------------------------------------------------------------------------ | |
# GRADIO | |
# ------------------------------------------------------------------------------ | |
# Define choices for the dropdown menu | |
# Loader names offered in the dropdown, in display order.
choices = [
    'AsyncHtmlLoader',
    'UnstructuredURL',
    'RecursiveURL',
    'PubMed',
    'WebBaseLoader',
    # The next three have no loader in extractDataFromUrls and produce its
    # "Not Implemented" message.
    'Scrapy',
    'PySpider',
    'Beautiful Soup',
    'SeleniumURL',
    'SeleniumURLH',
    'PlaywrightURL',
    'NewsURL',
]
# Create the Gradio interface | |
# Build the UI: URL + loader inputs on the left, screenshot + title on the
# right, extracted data below.
# NOTE(review): the nesting of json_output/text_output (tab level vs. inside
# the second column) is assumed here - confirm against the intended layout.
with gr.Blocks(theme="sudeepshouche/minimalist") as demo:
    # Carries the first URL from the extraction step to the screenshot step.
    extracted_url = gr.State()
    # The screenshot/title outputs are created once, as visible widgets, in
    # the layout below; no placeholder State components are needed for them.
    gr.Markdown("# The Big Scraper")
    with gr.Tab("Scraped"):
        with gr.Row():
            with gr.Column():
                url_input = gr.Textbox(label="Enter your comma separated URLs here")
                loader_dropdown = gr.Dropdown(choices=choices, label="Pick your Loader from here")
                btn = gr.Button("Extract Data")
            with gr.Column():
                screenshot_output = gr.Image(label="Screenshot")
                title_output = gr.Textbox(label="Page Title")
        json_output = gr.JSON(label="Extracted Data (JSON)")
        text_output = gr.Textbox(label="Extracted Data (Text)")
    # Step 1 extracts the data and stores the first URL; step 2 then takes
    # the screenshot and page title for that URL.
    btn.click(
        extractDataFromUrls,
        inputs=[url_input, loader_dropdown],
        outputs=[json_output, text_output, extracted_url],
    ).then(
        take_webdata,
        inputs=extracted_url,
        outputs=[screenshot_output, title_output],
        queue=True,
    )
# Launch the Gradio interface
demo.launch(share=True)