# NOTE: lines above the imports were Hugging Face file-viewer residue
# (file size, commit hashes, line-number gutter) from the scraped page;
# converted to this comment so the module parses.
import gradio as gr
import requests
import bs4
def link_find(url):
    """Fetch ``url`` and build a one-level node dict describing the page.

    Returns a dict with keys URL/TITLE/STRING/TEXT/TREE; TREE holds one
    child stub per ``<a>`` tag found on the page.  On a non-200 response
    an empty skeleton node is returned instead of raising (the original
    hit UnboundLocalError on ``return node1`` in that branch).
    """
    # Pre-build a valid skeleton so every code path can return it.
    node1 = {"URL": url, "TITLE": "", "STRING": "", "TEXT": "", "TREE": []}
    source = requests.get(url)
    if source.status_code == 200:
        print("YES")
        # html.parser is stdlib; the lxml backend (commented out upstream)
        # would need an extra dependency.
        soup = bs4.BeautifulSoup(source.content, 'html.parser')
        # NOTE(review): soup.description resolves to the first <description>
        # tag (usually None in HTML) -- presumably the meta description was
        # intended; kept as-is to preserve output shape.  TODO confirm.
        node1 = {"URL": url,
                 "TITLE": soup.title,
                 "STRING": soup.description,
                 "TEXT": soup.text,
                 "TREE": []}
        for p in soup.find_all("a"):
            node1['TREE'].append({"URL": p.get('href'),
                                  "TITLE": p.get('title'),
                                  "STRING": p.string,
                                  "TEXT": "",
                                  "TREE": []})
    else:
        print("NO")
    return node1
#https://huggingface.co/spaces/Omnibus/crawl
def sitemap(url, level):
    """Crawl ``url`` up to ``level`` levels deep (1-3) and return a node tree.

    Level 1 fetches only the page itself; levels 2 and 3 follow relative
    ``<a href>`` links, resolving them against the scheme+host of ``url``.
    Returns {} when ``url`` is empty or None (the original raised
    UnboundLocalError in that case).
    """
    if not url:
        return {}

    def _base(u):
        # scheme + '//' + host of u, e.g. 'https://example.com'
        scheme = u.split("//")[0]
        host = u.split("//")[1].split("/")[0]
        return f'{scheme}//{host}'

    link1 = link_find(url)
    if level >= 2:
        for i, ea in enumerate(link1['TREE']):
            print(ea)
            try:
                # Only relative links need resolving against the base URL.
                if not ea['URL'].startswith("http"):
                    uri = _base(url)
                    print(uri)
                    # Replace the stub IN PLACE.  The original appended to
                    # link1['TREE'] while enumerating it (growing the list
                    # mid-iteration) and left the stub's TREE empty, which
                    # made the level-3 pass below a no-op.
                    link1['TREE'][i] = link_find(f"{uri}{ea['URL']}")
                    if level >= 3:
                        for n, na in enumerate(link1['TREE'][i]['TREE']):
                            print(na)
                            try:
                                if not na['URL'].startswith("http"):
                                    uri0 = _base(url)
                                    print(uri0)
                                    link1['TREE'][i]['TREE'][n] = link_find(
                                        f"{uri0}{na['URL']}")
                            except Exception as e:
                                # Best-effort crawl: log and keep going.
                                print(e)
            except Exception as e:
                print(e)
    return link1
def sitemap_OG(url, level):
    """Legacy version of :func:`sitemap`, kept for reference.

    Returns {} when ``url`` is empty or None (the original raised
    UnboundLocalError).  NOTE(review): ``enumerate(link1)`` iterates the
    node dict's KEYS (strings), so ``ea['URL']`` raises TypeError, which
    the except swallows -- levels >= 2 effectively do nothing here; the
    fixed logic lives in ``sitemap``.  Body kept as-is on purpose.
    """
    if not url:
        return {}
    uri = ""
    link1 = link_find(url)
    if level >= 2:
        for i, ea in enumerate(link1):
            print(ea)
            try:
                if not ea['URL'].startswith("http"):
                    uri1 = url.split("//")[0]
                    uri2 = url.split("//")[1]
                    uri3 = uri2.split("/")[0]
                    uri = f'{uri1}//{uri3}'
                    print(uri)
                    out_list = link_find(f"{uri}{ea['URL']}")
                    link1[i]['TREE'] = out_list
                    if level >= 3:
                        for n, na in enumerate(link1[i]['TREE']):
                            print(na)
                            try:
                                if not na['URL'].startswith("http"):
                                    uri11 = url.split("//")[0]
                                    uri22 = url.split("//")[1]
                                    uri33 = uri22.split("/")[0]
                                    uri0 = f'{uri11}//{uri33}'
                                    print(uri0)
                                    out_list1 = link_find(f"{uri0}{na['URL']}")
                                    link1[i]['TREE'][n]['TREE'] = out_list1
                            except Exception as e:
                                print(e)
            except Exception as e:
                print(e)
    return link1
# Gradio UI: take a URL plus a crawl depth and show the link tree as JSON.
with gr.Blocks() as app:
    with gr.Row():
        # URL to crawl and how many link levels deep to follow (1-3).
        inp=gr.Textbox(label="URL")
        level=gr.Slider(minimum=1,maximum=3,step=1,value=2)
    btn=gr.Button()
    outp=gr.JSON()
    # Clicking runs sitemap(inp, level) and renders the returned dict.
    btn.click(sitemap,[inp,level],outp)
app.launch()