# Scrape a given URL recursively and load the downloaded pages as documents.
import os
import subprocess
from urllib.parse import urlparse

from langchain_community.document_loaders import DirectoryLoader
def runcmd(cmd, verbose=False, *args, **kwargs):
    """Run a shell command and return its exit code."""
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        shell=True,
    )
    std_out, std_err = process.communicate()
    if verbose:
        print(std_out.strip(), std_err)
    return process.returncode
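
# A minimal safer-alternative sketch, not part of the original snippet:
# passing the command as an argument list to subprocess.run avoids
# shell=True, and with it the shell-injection risk of interpolating a
# user-supplied URL into a command string. The name `runcmd_safe` is
# hypothetical.
def runcmd_safe(args: list, verbose: bool = False) -> int:
    """Run a command given as an argument list; return its exit code."""
    result = subprocess.run(args, capture_output=True, text=True)
    if verbose:
        print(result.stdout.strip(), result.stderr)
    return result.returncode

# Example: runcmd_safe(['wget', '--version'], verbose=True)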
def scrap_website(target_url: str, depth: int = 5):
    """Mirror a site with wget up to `depth` levels, then load the files."""
    target_domain = urlparse(target_url).netloc
    target_directory = './downloads/'
    # wget flags: ignore robots.txt, recurse to `depth` levels, skip files
    # that already exist, fetch page assets, save HTML with .html extensions,
    # rewrite links for local browsing, and stay on the target domain.
    command_this = (
        f'wget -e robots=off --recursive -l {depth} --no-clobber '
        f'--page-requisites --html-extension --convert-links '
        f'--restrict-file-names=windows --force-directories '
        f'--directory-prefix={target_directory} '
        f'--domains {target_domain} --no-parent {target_url}'
    )
    cmd_status = runcmd(command_this, verbose=True)
    if cmd_status == 0:
        # wget mirrors the site under <target_directory>/<domain>/...
        documents_path = os.path.realpath(os.path.join(target_directory, target_domain))
        loader = DirectoryLoader(documents_path, silent_errors=True, show_progress=True)
        docs = loader.load()
        return docs
    return []
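
# Hedged usage sketch, not from the original snippet: the URL and depth are
# placeholders. Assumes wget is installed and on PATH, and that the
# `unstructured` package (DirectoryLoader's default backend) is available.
if __name__ == '__main__':
    docs = scrap_website('https://example.com', depth=2)
    print(f'Loaded {len(docs)} documents')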