import gradio as gr
import requests
import bs4
from urllib.parse import urljoin
def link_find(url):
    # Fetch one page and build two parallel trees: node1 carries full detail
    # (title, link text, page text), node2 is a slim links-only mirror.
    # Initializing both up front avoids an UnboundLocalError on non-200 responses.
    node1 = {"URL": url, "TITLE": "", "STRING": "", "TEXT": "", "LINKS": [], "TREE": []}
    node2 = {"URL": url, "LINKS": [], "TREE": []}
    source = requests.get(url)
    if source.status_code == 200:
        print("YES")
        #soup = bs4.BeautifulSoup(source.content,'lxml')
        soup = bs4.BeautifulSoup(source.content, 'html.parser')
        # soup.title is a Tag; take its string so the node stays JSON-serializable
        node1["TITLE"] = soup.title.string if soup.title else ""
        node1["TEXT"] = soup.text
        for p in soup.find_all("a"):
            href = p.get('href')
            node1['LINKS'].append(href)
            node1['TREE'].append({"URL": href, "TITLE": p.get('title'),
                                  "STRING": p.string, "TEXT": "", "LINKS": [], "TREE": []})
            node2['LINKS'].append(href)
            node2['TREE'].append({"URL": href, "LINKS": [], "TREE": []})
    else:
        print("NO")
    return node1, node2
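# A minimal sketch of calling link_find on its own; example.com is a
# placeholder URL, not part of the Space:
#
#   full_node, slim_node = link_find("https://example.com")
#   print(full_node["TITLE"], len(full_node["LINKS"]))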
#https://huggingface.co/spaces/Omnibus/crawl
def sitemap(url, level):
    # Crawl up to `level` levels deep, following only relative (same-site)
    # hrefs; absolute http(s) links are recorded but not fetched.
    if not url:
        return {}, {}
    link1, link2 = link_find(url)
    if level >= 2:
        for i, ea in enumerate(link1['TREE']):
            print(ea)
            try:
                if ea['URL'] and not ea['URL'].startswith("http"):
                    # urljoin resolves both root-relative and page-relative hrefs
                    out_list1, out_list2 = link_find(urljoin(url, ea['URL']))
                    link1['TREE'][i] = out_list1
                    link2['TREE'][i] = out_list2
                    if level >= 3:
                        for n, na in enumerate(link1['TREE'][i]['TREE']):
                            print(na)
                            try:
                                if na['URL'] and not na['URL'].startswith("http"):
                                    # resolve against the parent page's absolute URL
                                    out_list1, out_list2 = link_find(urljoin(link1['TREE'][i]['URL'], na['URL']))
                                    link1['TREE'][i]['TREE'][n] = out_list1
                                    link2['TREE'][i]['TREE'][n] = out_list2
                            except Exception as e:
                                print(e)
            except Exception as e:
                print(e)
    return link1, link2
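# A quick way to exercise sitemap outside the UI, again with a placeholder
# URL; default=str keeps any non-JSON values printable:
#
#   import json
#   full_tree, slim_tree = sitemap("https://example.com", 2)
#   print(json.dumps(slim_tree, default=str, indent=2))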
def sitemap_OG(url, level):
    # Legacy single-tree version, kept for reference; the UI below is wired
    # to sitemap() instead. Only the full-detail tree is built here.
    if not url:
        return {}
    link1, _ = link_find(url)
    if level >= 2:
        for i, ea in enumerate(link1['TREE']):
            print(ea)
            try:
                if ea['URL'] and not ea['URL'].startswith("http"):
                    out_list, _ = link_find(urljoin(url, ea['URL']))
                    link1['TREE'][i] = out_list
                    if level >= 3:
                        for n, na in enumerate(link1['TREE'][i]['TREE']):
                            print(na)
                            try:
                                if na['URL'] and not na['URL'].startswith("http"):
                                    out_list1, _ = link_find(urljoin(link1['TREE'][i]['URL'], na['URL']))
                                    link1['TREE'][i]['TREE'][n] = out_list1
                            except Exception as e:
                                print(e)
            except Exception as e:
                print(e)
    return link1
with gr.Blocks() as app:
    with gr.Row():
        with gr.Column(scale=3):
            with gr.Row():
                inp = gr.Textbox(label="URL")
                # the crawler supports depths 1-3, so let the slider reach 3
                level = gr.Slider(label="Depth", minimum=1, maximum=3, step=1, value=1)
            btn = gr.Button("Crawl")
            outp = gr.JSON(label="Full tree")
        with gr.Column(scale=1):
            outmap = gr.JSON(label="Link map")
    btn.click(sitemap, [inp, level], [outp, outmap])
app.launch()
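# Run locally, Gradio serves the app at http://127.0.0.1:7860 by default; on a
# Hugging Face Space the launch() call is picked up by the Space runtime.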