import gradio as gr
import requests
import bs4
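# --- Hedged stand-ins --------------------------------------------------------
# sort_doc() below depends on get_sen_list, proc_sen, proc_nouns and a
# control_json dict that are not defined in this file (they presumably live in
# another module of the Space). The minimal sketches here are assumptions,
# added only so the function is runnable in isolation; swap in the real
# helpers where available.
control_json = {
    # Assumed shape: 62 key characters give a base-62 positional key, and
    # 'leng' holds the alphabet length (mirroring the original field name).
    "control": "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789",
    "leng": 62,
}

def get_sen_list(text):
    # Naive sentence split on periods; the real Space may use an NLP library.
    return [s.strip() for s in str(text).split(".") if s.strip()]

def proc_sen(sen_list, i):
    # Minimal sentence object: the sentence plus a crude "noun" guess
    # (capitalized words). Purely illustrative.
    sentence = sen_list[i]
    nouns = [w for w in sentence.split() if w[:1].isupper()]
    return {"sentence": sentence, "nouns": nouns}

def proc_nouns(json_out):
    # Invert the sentence index: map each noun to the keys that mention it.
    noun_list = {}
    for key, obj in json_out.items():
        for noun in obj.get("nouns", []):
            noun_list.setdefault(noun, []).append(key)
    return noun_list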
def sort_doc(text, steps_in=0, control=None):
    # Index every sentence of `text` under a fixed-length key drawn from the
    # characters in control_json['control'] (a base-N counter, one character
    # per step), then build a noun -> keys lookup over the result.
    text = str(text)
    ########################################
    sen_list = get_sen_list(text)
    sen_obj_box = []
    for ii, ee in enumerate(sen_list):
        sen_obj = proc_sen(sen_list, ii)
        sen_obj_box.append(sen_obj)
    sen_list = sen_obj_box
    ######################################
    key_cnt = len(sen_obj_box)
    print(key_cnt)
    # The key alphabet is needed on every path, so define it before branching
    # (the original only set it when steps_in was 0, which raised a NameError
    # whenever a step count was passed in).
    control_char = list(control_json['control'])
    char_len = len(control_char)
    if not steps_in:
        # How many key characters are needed to give every sentence a unique
        # key: the smallest n with char_len**n > key_cnt.
        n_cnt = 0
        nx = key_cnt
        while True:
            if nx >= 1:
                n_cnt += 1
                nx = nx / char_len
            else:
                print("#######")
                print(n_cnt)
                print(nx)
                print("#######")
                steps = n_cnt
                break
    if steps_in:
        steps = steps_in
    if control:
        control_len = control_json['leng'] - steps
        control_char_val = list(control_json['control'][:control_len])
        control_val = list(control_json['control'][control_len:])
        val_len = len(control_val)
    json_out = {}
    noun_list = {}
    big_cnt = 0
    cnt = 0
    go = True
    # One counter per key position, e.g. [0, 0, 0] for three-character keys.
    step_cont_box = []
    for ii in range(steps):
        print(ii)
        step_cont_box.append(0)
    pos = len(step_cont_box) - 1
    if go:
        for i, ea in enumerate(sen_list):
            if go:
                if cnt > char_len - 1:
                    # The last position overflowed: reset it and carry one into
                    # the position to its left, like incrementing a base-N
                    # number. (The original carried unconditionally on the
                    # first slot, which produced duplicate keys.)
                    go1 = True
                    for ii, ev in enumerate(step_cont_box):
                        if go:
                            if ev >= char_len - 1:
                                step_cont_box[ii] = 0
                                if go1 == True:
                                    step_cont_box[ii - 1] = step_cont_box[ii - 1] + 1
                                    go1 = False
                    cnt = 1
                else:
                    step_cont_box[pos] = cnt
                    cnt += 1
                print(step_cont_box)
                # Render the position counters as a key string,
                # e.g. [0, 1, 2] -> "abc".
                out_js = ""
                for iii, j in enumerate(step_cont_box):
                    print(j)
                    out_js = out_js + control_char[j]
                sen_obj = sen_obj_box[i]
                json_out[out_js] = sen_obj
                print("#################")
                print(out_js)
                print(sen_obj)
                print("#################")
                big_cnt += 1
                if big_cnt == key_cnt:
                    print("DONE")
                    go = False
    noun_list = proc_nouns(json_out)
    return json_out, noun_list
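# Example (illustrative, using the stand-in helpers above):
#   json_out, noun_list = sort_doc("Alice met Bob. Bob waved back.")
# json_out maps keys like "aa", "ab" to sentence objects; noun_list maps
# nouns such as "Bob" to the keys of the sentences that contain them.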
def link_find(url):
    # Fetch `url` and return two nested dicts describing its links:
    # node1 carries page text and titles, node2 only the link structure.
    source = requests.get(url)
    if source.status_code == 200:
        print("YES")
        soup = bs4.BeautifulSoup(source.content, 'html.parser')
        rawt = soup.text
        # Page title and <meta name="description">, stored as plain strings so
        # the dicts stay JSON-serializable for the gr.JSON outputs. (The
        # original stored the Tag object and read soup.description, which
        # looks for a nonexistent <description> element.)
        title = soup.title.string if soup.title else ""
        meta_desc = soup.find("meta", attrs={"name": "description"})
        description = meta_desc.get("content", "") if meta_desc else ""
        node1 = {"URL": url, "TITLE": title, "STRING": description, "TEXT": rawt, "LINKS": [], "TREE": []}
        node2 = {"URL": url, "LINKS": [], "TREE": []}
        for p in soup.find_all("a"):
            url0 = p.get('href')
            if not url0:
                continue  # anchor without an href
            if url0.startswith("//"):
                # Protocol-relative link: reuse this page's scheme.
                print(url0)
                uri1 = url.split("//")[0]
                uri = f'{uri1}{url0}'
                print(uri)
            elif url0.startswith("/"):
                # Root-relative link: prepend this page's scheme and host.
                uri1 = url.split("//")[0]
                uri2 = url.split("//")[1]
                uri3 = uri2.split("/")[0]
                uri = f'{uri1}//{uri3}'
                uri = f'{uri}{url0}'
                print(uri)
            else:
                uri = url0
            node1['LINKS'].append(uri)
            node1['TREE'].append({"URL": uri, "TITLE": p.get('title'), "STRING": p.string, "TEXT": "", "LINKS": [], "TREE": []})
            node2['TREE'].append({"URL": uri, "LINKS": [], "TREE": []})
            node2['LINKS'].append(uri)
    else:
        # Non-200 response: return empty nodes so callers still get the
        # expected pair (the original fell through to undefined names here).
        print("NO")
        node1 = {"URL": url, "TITLE": "", "STRING": "", "TEXT": "", "LINKS": [], "TREE": []}
        node2 = {"URL": url, "LINKS": [], "TREE": []}
    return node1, node2
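# Example (illustrative): fetch one page and inspect its outgoing links.
#   node1, node2 = link_find("https://example.com")
#   print(node1["TITLE"], len(node1["LINKS"]))
# node2 is the same tree with the text and title fields stripped out.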
#https://huggingface.co/spaces/Omnibus/crawl
def sitemap(url, level):
    # Crawl up to `level` pages deep (1-3), replacing each link node with the
    # full tree returned by link_find. Empty trees are returned when no URL is
    # given so the gr.JSON outputs always receive something.
    link1, link2 = {}, {}
    if url != "" and url is not None:
        link1, link2 = link_find(url)
        if level >= 2:
            for i, ea in enumerate(link1['TREE']):
                print(ea)
                try:
                    # link_find already stores absolute URLs in each node, so
                    # the child page can be fetched directly.
                    out_list1, out_list2 = link_find(ea['URL'])
                    link1['TREE'][i] = out_list1
                    link2['TREE'][i] = out_list2
                    if level >= 3:
                        for n, na in enumerate(link1['TREE'][i]['TREE']):
                            print(na)
                            try:
                                out_list1, out_list2 = link_find(na['URL'])
                                link1['TREE'][i]['TREE'][n] = out_list1
                                link2['TREE'][i]['TREE'][n] = out_list2
                            except Exception as e:
                                print(e)
                except Exception as e:
                    print(e)
    return link1, link2
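# Example (illustrative): crawl two levels deep from a starting page.
#   full_tree, link_tree = sitemap("https://example.com", 2)
# full_tree mirrors the large gr.JSON output on the left of the UI below;
# link_tree feeds the smaller map on the right.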
def sitemap_OG(url, level):
    # Legacy version, not wired into the UI below. It was written against an
    # earlier link_find that returned a single list of link dicts, so it is
    # kept only for reference and would need updating to run against the
    # two-node link_find above.
    uri = ""
    if url != "" and url is not None:
        link1 = link_find(url)
        if level >= 2:
            for i, ea in enumerate(link1):
                print(ea)
                try:
                    if not ea['URL'].startswith("http"):
                        uri1 = url.split("//")[0]
                        uri2 = url.split("//")[1]
                        uri3 = uri2.split("/")[0]
                        uri = f'{uri1}//{uri3}'
                        print(uri)
                    out_list = link_find(f"{uri}{ea['URL']}")
                    link1[i]['TREE'] = out_list
                    if level >= 3:
                        for n, na in enumerate(link1[i]['TREE']):
                            print(na)
                            try:
                                if not na['URL'].startswith("http"):
                                    uri11 = url.split("//")[0]
                                    uri22 = url.split("//")[1]
                                    uri33 = uri22.split("/")[0]
                                    uri0 = f'{uri11}//{uri33}'
                                    print(uri0)
                                out_list1 = link_find(f"{uri0}{na['URL']}")
                                link1[i]['TREE'][n]['TREE'] = out_list1
                            except Exception as e:
                                print(e)
                except Exception as e:
                    print(e)
    return link1
with gr.Blocks() as app:
    with gr.Row():
        with gr.Column(scale=3):
            with gr.Row():
                inp = gr.Textbox(label="URL")
                # Crawl depth. sitemap() also handles level 3 if this
                # slider's maximum is raised.
                level = gr.Slider(label="Level", minimum=1, maximum=2, step=1, value=1)
            btn = gr.Button()
            outp = gr.JSON()
        with gr.Column(scale=1):
            outmap = gr.JSON()
    btn.click(sitemap, [inp, level], [outp, outmap])
app.launch()