import gradio as gr
import requests
import bs4

# https://huggingface.co/spaces/Omnibus/crawl


def link_find(url):
    # Fetch one page and return two dicts: node1 carries the page text plus
    # link metadata, node2 is a slimmer link-only tree. Both are initialized
    # up front so a failed request still returns something usable.
    node1 = {"URL": url, "TITLE": None, "STRING": None, "TEXT": "", "LINKS": [], "TREE": []}
    node2 = {"URL": url, "LINKS": [], "TREE": []}
    source = requests.get(url)
    if source.status_code == 200:
        print("YES")
        soup = bs4.BeautifulSoup(source.content, "html.parser")
        node1["TITLE"] = soup.title.string if soup.title else None
        # The original read soup.description, which looks for a <description>
        # tag that HTML pages don't have; read the meta description instead.
        desc = soup.find("meta", attrs={"name": "description"})
        node1["STRING"] = desc.get("content") if desc else None
        node1["TEXT"] = soup.text
        for a in soup.find_all("a"):
            href = a.get("href")
            if not href:
                # Skip <a> tags without an href (anchors, JS handlers).
                continue
            if href.startswith("//"):
                # Protocol-relative link: prepend the scheme of the page URL.
                scheme = url.split("//")[0]  # e.g. "https:"
                uri = f"{scheme}{href}"
                print(uri)
            elif href.startswith("/"):
                # Root-relative link: prepend the scheme and host of the page URL.
                scheme = url.split("//")[0]
                host = url.split("//")[1].split("/")[0]
                uri = f"{scheme}//{host}{href}"
                print(uri)
            else:
                uri = href
            node1["LINKS"].append(uri)
            node1["TREE"].append({
                "URL": uri,
                "TITLE": a.get("title"),
                "STRING": a.string,
                "TEXT": "",
                "LINKS": [],
                "TREE": [],
            })
            node2["LINKS"].append(uri)
            node2["TREE"].append({"URL": uri, "LINKS": [], "TREE": []})
    else:
        print("NO")
    return node1, node2


def sitemap(url, level):
    # Crawl up to `level` levels deep, replacing each link entry in place
    # with the subtree returned by link_find. link_find already stores
    # absolute URLs, so child pages can be fetched directly.
    link1 = {}
    link2 = {}
    if url:
        link1, link2 = link_find(url)
        if level >= 2:
            for i, ea in enumerate(link1["TREE"]):
                print(ea)
                try:
                    out_list1, out_list2 = link_find(ea["URL"])
                    link1["TREE"][i] = out_list1
                    link2["TREE"][i] = out_list2
                    if level >= 3:
                        for n, na in enumerate(link1["TREE"][i]["TREE"]):
                            print(na)
                            try:
                                out_list1, out_list2 = link_find(na["URL"])
                                link1["TREE"][i]["TREE"][n] = out_list1
                                link2["TREE"][i]["TREE"][n] = out_list2
                            except Exception as e:
                                print(e)
                except Exception as e:
                    print(e)
    return link1, link2


def sitemap_OG(url, level):
    # Legacy version kept for reference; it is not wired to the UI and
    # predates link_find returning a (node1, node2) pair.
    uri = ""
    uri0 = ""
    link1 = []
    if url:
        link1 = link_find(url)
        if level >= 2:
            for i, ea in enumerate(link1):
                print(ea)
                try:
                    if not ea["URL"].startswith("http"):
                        uri1 = url.split("//")[0]
                        uri2 = url.split("//")[1]
                        uri3 = uri2.split("/")[0]
                        uri = f"{uri1}//{uri3}"
                        print(uri)
                    out_list = link_find(f"{uri}{ea['URL']}")
                    link1[i]["TREE"] = out_list
                    if level >= 3:
                        for n, na in enumerate(link1[i]["TREE"]):
                            print(na)
                            try:
                                if not na["URL"].startswith("http"):
                                    uri11 = url.split("//")[0]
                                    uri22 = url.split("//")[1]
                                    uri33 = uri22.split("/")[0]
                                    uri0 = f"{uri11}//{uri33}"
                                    print(uri0)
                                out_list1 = link_find(f"{uri0}{na['URL']}")
                                link1[i]["TREE"][n]["TREE"] = out_list1
                            except Exception as e:
                                print(e)
                except Exception as e:
                    print(e)
    return link1


with gr.Blocks() as app:
    with gr.Row():
        with gr.Column(scale=3):
            with gr.Row():
                inp = gr.Textbox(label="URL")
                # sitemap() handles up to three levels, so the slider goes to
                # 3 (the original stopped at 2).
                level = gr.Slider(minimum=1, maximum=3, step=1, value=1, label="Depth")
            btn = gr.Button()
            outp = gr.JSON()
        with gr.Column(scale=1):
            outmap = gr.JSON()
    btn.click(sitemap, [inp, level], [outp, outmap])
app.launch()
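# --- Usage sketch (an assumption, not part of the original app) -------------
# A minimal way to exercise sitemap() without the Gradio UI; app.launch()
# above blocks, so run this in a separate session. "https://example.com" is
# a placeholder URL, not one used by this project.
#
#   full_tree, link_tree = sitemap("https://example.com", level=2)
#   print(link_tree["LINKS"])  # top-level links found on the page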