# JSON-Crawl / app.py
# Omnibus's picture
# Update app.py
# 192d263 verified
# raw
# history blame
# 5.48 kB
import gradio as gr
import requests
import bs4
def link_find(url, timeout=10):
    """Fetch *url* and describe the page and its outgoing links.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests.get`` can block indefinitely on an unresponsive host.

    Returns:
        A ``(node1, node2)`` tuple of dicts. ``node1`` is the detailed node
        with keys ``URL``, ``TITLE``, ``STRING``, ``TEXT``, ``LINKS`` and
        ``TREE``; ``node2`` is the compact node with ``URL``, ``LINKS`` and
        ``TREE``. ``LINKS`` lists the absolute URL of every ``<a href=...>``
        on the page and ``TREE`` holds one not-yet-crawled child stub per
        link. If the response status is not 200, both nodes are returned
        with empty ``LINKS``/``TREE`` (and ``TITLE``/``STRING`` of ``None``).

    Raises:
        requests.RequestException: Propagated unchanged when the request
            itself fails (bad scheme, connection error, timeout).
    """
    # Local import keeps the block self-contained without touching the
    # file-level import list.
    from urllib.parse import urljoin

    # Build both nodes up front so there is always something to return,
    # whatever the HTTP status turns out to be.
    node1 = {"URL": url, "TITLE": None, "STRING": None, "TEXT": "", "LINKS": [], "TREE": []}
    node2 = {"URL": url, "LINKS": [], "TREE": []}

    source = requests.get(url, timeout=timeout)
    if source.status_code != 200:
        print("NO")
        return node1, node2

    print("YES")
    soup = bs4.BeautifulSoup(source.content, 'html.parser')

    # Store plain text rather than bs4 Tag objects: the nodes are meant to be
    # rendered as JSON and should not keep the whole parse tree alive.
    title_tag = soup.title
    # NOTE(review): this looks up a literal <description> element, exactly as
    # the original `soup.description` did; a <meta name="description"> may
    # have been intended -- confirm before changing.
    description_tag = soup.find("description")
    node1["TITLE"] = title_tag.get_text() if title_tag is not None else None
    node1["STRING"] = description_tag.get_text() if description_tag is not None else None
    node1["TEXT"] = soup.text

    # href=True skips anchors that have no href attribute at all.
    for anchor in soup.find_all("a", href=True):
        # urljoin covers every case: absolute URLs pass through, "//host/x"
        # inherits the page's scheme, "/x" is rooted at the page's host and
        # "x" is resolved against the page's path.
        uri = urljoin(url, anchor["href"])
        label = anchor.string
        node1['LINKS'].append(uri)
        node1['TREE'].append({
            "URL": uri,
            "TITLE": anchor.get('title'),
            "STRING": str(label) if label is not None else None,
            "TEXT": "",
            "LINKS": [],
            "TREE": [],
        })
        node2['TREE'].append({"URL": uri, "LINKS": [], "TREE": []})
        node2['LINKS'].append(uri)

    return node1, node2
#https://huggingface.co/spaces/Omnibus/crawl
def _expand_children(full_node, slim_node, remaining):
    """Replace each child stub of a node pair with that child's crawled node.

    ``remaining`` is the number of levels still to expand, counting this one.
    A failure on one child is printed and leaves that child's stub in place.
    """
    for idx, stub in enumerate(full_node['TREE']):
        print(stub)
        try:
            child_full, child_slim = link_find(stub['URL'])
            full_node['TREE'][idx] = child_full
            slim_node['TREE'][idx] = child_slim
            if remaining > 1:
                _expand_children(child_full, child_slim, remaining - 1)
        except Exception as e:
            # Best-effort crawl: one unreachable or malformed link must not
            # abort the rest of the map.
            print(e)


def sitemap(url, level):
    """Crawl *url* and build a link tree up to *level* levels deep.

    Args:
        url: Page to start from. An empty string or ``None`` yields an
            empty result instead of an error.
        level: Crawl depth. 1 fetches only the start page, 2 also fetches
            every page it links to, 3 (or more) goes one level further.

    Returns:
        A ``(link1, link2)`` tuple: the detailed and the compact tree as
        returned by ``link_find`` for the start page, with child stubs
        replaced by crawled nodes down to the requested depth. ``({}, {})``
        when no URL was given.
    """
    if not url:
        # Previously this path fell through to the return with link1/link2
        # never assigned.
        return {}, {}

    link1, link2 = link_find(url)

    # Depth is capped at 3 levels in total, matching the deepest crawl the
    # original nested loops performed.
    remaining = min(level, 3) - 1
    if remaining >= 1:
        _expand_children(link1, link2, remaining)
    return link1, link2
def sitemap_OG(url, level):
    """Earlier variant of ``sitemap``; nothing in the visible file calls it.

    It treats the result of ``link_find`` as an indexable sequence of node
    dicts and stores each crawled result under that node's ``'TREE'`` key.
    NOTE(review): ``link_find`` as visible here returns a 2-tuple of nodes,
    so this function appears to predate that return shape -- confirm before
    reusing it.

    Args:
        url: Page to start from. An empty string or ``None`` yields ``[]``.
        level: Crawl depth; 2 expands the first-level nodes, 3 or more also
            expands the nodes found beneath them.

    Returns:
        Whatever ``link_find(url)`` returned, with ``'TREE'`` entries filled
        in down to the requested depth, or ``[]`` when no URL was given.
    """
    if not url:
        # Previously this path reached the return with link1 never assigned.
        return []

    link1 = link_find(url)
    if level < 2:
        return link1

    # "<scheme>//<host>" of the start page, used to absolutise site-relative
    # links. It is loop-invariant, so compute it once; partition() never
    # raises, even if url contains no "//".
    scheme, _, rest = url.partition("//")
    origin = f"{scheme}//{rest.split('/')[0]}"

    def absolute(link):
        # Decide per link. The original kept the prefix from an earlier
        # relative link and glued it onto later absolute ones, and its
        # nested-level prefix was unbound until a relative link was seen.
        return link if link.startswith("http") else f"{origin}{link}"

    for i, ea in enumerate(link1):
        print(ea)
        try:
            link1[i]['TREE'] = link_find(absolute(ea['URL']))
            if level >= 3:
                for n, na in enumerate(link1[i]['TREE']):
                    print(na)
                    try:
                        link1[i]['TREE'][n]['TREE'] = link_find(absolute(na['URL']))
                    except Exception as e:
                        # Best-effort: skip links that cannot be crawled.
                        print(e)
        except Exception as e:
            print(e)
    return link1
# Gradio UI: a URL box and a depth slider feed sitemap(); its two return
# values are shown in two JSON panes.
# NOTE(review): the nesting of the layout containers below should be
# confirmed against the rendered UI.
with gr.Blocks() as app:
    with gr.Row():
        with gr.Column(scale=3):
            with gr.Row():
                inp=gr.Textbox(label="URL")
                # Passed to sitemap() as `level`; the UI only offers 1 or 2.
                level=gr.Slider(minimum=1,maximum=2,step=1,value=1)
            btn=gr.Button()
            outp=gr.JSON()
        with gr.Column(scale=1):
            outmap=gr.JSON()
    # sitemap(url, level) returns (link1, link2): link1 goes to outp,
    # link2 goes to outmap.
    btn.click(sitemap,[inp,level],[outp,outmap])
# Runs at import time; there is no __main__ guard.
app.launch()