import gradio as gr
import requests
import bs4
######## Load Database ########
import os
from huggingface_hub import HfApi, upload_file
import json
import uuid
token=os.environ.get("HF_TOKEN")
username="omnibus"
dataset_name="tmp"
save_data=f'https://huggingface.co/datasets/{username}/{dataset_name}/raw/main/'
api=HfApi(token="")
filename="urls"
filename2="pages"
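# init(): fetch the previously saved index (crawl/<filename>.json) from the dataset repo; returns {} if the file is missing.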
def init(filename=filename,save_data=save_data):
    #if filename==None:
    # filename=filename
    r = requests.get(f'{save_data}crawl/{filename}.json')
    print(f'status code main:: {r.status_code}')
    if r.status_code==200:
        lod = json.loads(r.text)
    else:
        lod={}
    return lod
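# sort_doc(): assign each item of in_list a key built from the 62-character 'control' alphabet (0-9, a-z, A-Z),
# continuing the key sequence from the last key already present in the saved index. Note that the prev_list
# argument is effectively ignored: the function reloads the saved index via init() on entry.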
def sort_doc(in_list: list, steps_in: int, control: int=0, prev_list: str=None):
    prev_list=init()
    control_json={'control':'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ','char':'','leng':62}
    key_cnt=len(in_list)
    print(key_cnt)
    #if isinstance(in_list, str):
    # in_list=eval(in_list)
    control_char=list(control_json['control'])
    char_len=len(control_char)
    if not steps_in:
        n_cnt=0
        nx=key_cnt
        while True:
            if nx >= 1:
                n_cnt+=1
                nx = nx/char_len
            else:
                steps=n_cnt
                break
    if steps_in:
        steps=steps_in
    if control:
        control_len=control_json['leng']-control
        control_char=list(control_json['control'][:control_len])
        control_val=list(control_json['control'][control_len:])
        val_len=len(control_val)
        control_val_box=[]
        for ea in control_val:
            control_val_box.append(ea)
        print(f'CONTROL_VAL_BOX:: {control_val_box}')
        #prev_list=control_val_box
    json_out={}
    big_cnt=0
    cnt=0
    go=True
    step_cont_box=[]
    if prev_list:
        print("LOD")
        last_key=list(prev_list.keys())[-1]
        print(last_key)
        for ea_dig in last_key:
            ea_dig=control_json['control'].index(ea_dig)
            ea_dig=int(ea_dig)
            print(f'{ea_dig} :: {list(control_json["control"][ea_dig])[0]}')
            #step_cont_box.append(int(list(control_json["control"][ea_dig])[0]))
            step_cont_box.append(ea_dig)
        print(step_cont_box)
        cnt=int(step_cont_box[-1])+1
    if not prev_list:
        print("NOT LOD")
        for ii in range(steps):
            print(ii)
            step_cont_box.append(0)
    pos=len(step_cont_box)-1
    if go:
        for i, ea in enumerate(in_list):
            if go:
                if cnt > char_len-1:
                    #print(step_cont_box)
                    go1=True
                    for ii,ev in enumerate(step_cont_box):
                        if go:
                            if ev >= char_len-1:
                                step_cont_box[ii]=0
                                if go1==True:
                                    step_cont_box[ii-1]=step_cont_box[ii-1]+1
                                    go1=False
                    cnt=1
                else:
                    step_cont_box[pos]=cnt
                    cnt+=1
                #print(step_cont_box)
                out_js=""
                for iii,j in enumerate(step_cont_box):
                    print(j)
                    out_js = out_js+control_char[j]
                json_out[out_js]=in_list[i]
                big_cnt+=1
                if big_cnt==key_cnt:
                    print("DONE")
                    go=False
    return json_out
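# Keying sketch (assuming the saved index is still empty, so init() returns {}):
#   sort_doc(["a.com","b.com","c.com"], 1)  ->  {"0": "a.com", "1": "b.com", "2": "c.com"}
# On a later run the last stored key is read back and counting resumes from the next character.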
#############################
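# sort_doc_OG(): earlier variant of sort_doc that loads the saved index itself, merges new keys into it,
# and skips items that are already stored as values.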
def sort_doc_OG(in_list,steps_in=0,control=None):
    r = requests.get(f'{save_data}crawl/{filename}.json')
    print(f'status code main:: {r.status_code}')
    if r.status_code==200:
        lod = json.loads(r.text)
        #print(f'lod:: {lod}')
        #lod[0]['comment']=lod[0]['comment']+1
        #lod[0]['comment_list'].append({'user':persona[persona2]['name'],'datetime':'','comment':output,'reply_list':[]})
    else:
        lod={}
    control_json={'control':'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ','char':'','leng':62}
    text=str(in_list)
    key_cnt=len(in_list)
    print(key_cnt)
    control_char=list(control_json['control'])
    char_len=len(control_char)
    if not steps_in:
        n_cnt=0
        nx=key_cnt
        while True:
            if nx >= 1:
                n_cnt+=1
                nx = nx/char_len
            else:
                print("#######")
                print(n_cnt)
                print(nx)
                print("#######")
                steps=n_cnt
                break
    if steps_in:
        steps=steps_in
    if control:
        control_len=control_json['leng']-steps
        control_char_val=list(control_json['control'][:control_len])
        control_val=list(control_json['control'][control_len:])
        val_len=len(control_val)
    json_out=lod
    noun_list={}
    step_list=[]
    big_cnt=0
    cnt=0
    go=True
    step_cont_box=[]
    if lod:
        print("LOD")
        last_key=list(lod.keys())[-1]
        print(last_key)
        for ea_dig in last_key:
            ea_dig=control_json['control'].index(ea_dig)
            ea_dig=int(ea_dig)
            print(f'{ea_dig} :: {list(control_json["control"][ea_dig])[0]}')
            #step_cont_box.append(int(list(control_json["control"][ea_dig])[0]))
            step_cont_box.append(ea_dig)
        print(step_cont_box)
        cnt=int(step_cont_box[-1])+1
    if not lod:
        print("NOT LOD")
        for ii in range(steps):
            print(ii)
            step_cont_box.append(0)
        #print (step_cont_box)
    mod=0
    pos=len(step_cont_box)-1
    if go:
        for i, ea in enumerate(in_list):
            if go and ea not in list(lod.values()):
                if cnt > char_len-1:
                    #print(step_cont_box)
                    go1=True
                    for ii,ev in enumerate(step_cont_box):
                        if go:
                            if ev >= char_len-1:
                                step_cont_box[ii]=0
                                if go1==True:
                                    step_cont_box[ii-1]=step_cont_box[ii-1]+1
                                    go1=False
                    cnt=1
                else:
                    step_cont_box[pos]=cnt
                    cnt+=1
                #print(step_cont_box)
                out_js=""
                for iii,j in enumerate(step_cont_box):
                    print(j)
                    out_js = out_js+control_char[j]
                sen_obj=in_list[i]
                json_out[out_js]=sen_obj
                #print ("#################")
                #print (out_js)
                #print (sen_obj)
                #print ("#################")
                big_cnt+=1
                if big_cnt==key_cnt:
                    print("DONE")
                    go=False
    #noun_list=proc_nouns(json_out)
    return json_out
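# link_find(): fetch a page and parse it with BeautifulSoup. Returns two dicts: node1 with the page title,
# text and one entry per <a> tag, and node2 with just the link skeleton. Protocol-relative and root-relative
# hrefs are resolved against the page URL; every resolved link is also appended to the module-level link_box.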
link_box = []
def link_find(url):
    node1={}
    node2={}
    out = []
    print(f'Try URL:: {url}')
    source = requests.get(url)
    if source.status_code ==200:
        print("YES")
        #soup = bs4.BeautifulSoup(source.content,'lxml')
        soup = bs4.BeautifulSoup(source.content,'html.parser')
        rawp=(f'RAW TEXT RETURNED: {soup.text}')
        cnt=0
        cnt+=len(rawp)
        rawt=soup.text
        #out.append(rawp)
        #out.append("HTML fragments: ")
        node1 = {"URL":url,"TITLE":soup.title,"STRING":soup.description,"TEXT":rawt,"LINKS":[],"TREE":[]}
        node2 = {"URL":url,"LINK_KEY":[],"LINKS":[],"TREE":[]}
        q=("a","p","span","content","article")
        for p in soup.find_all("a"):
            url0=p.get('href')
            try:
                if url0.startswith("//"):
                    print(url0)
                    uri1=url.split("//")[0]
                    #uri2=url.split("//")[1]
                    #uri3=uri2.split("/")[0]
                    #uri=f'{uri1}//{uri3}'
                    uri=f'{uri1}{url0}'
                    #print(uri)
                elif url0.startswith("/") and not url0.startswith("//"):
                    uri1=url.split("//")[0]
                    uri2=url.split("//")[1]
                    uri3=uri2.split("/")[0]
                    uri=f'{uri1}//{uri3}'
                    uri=f'{uri}{url0}'
                    #print(uri)
                else:
                    uri=url0
                node1['LINKS'].append(uri)
                node1['TREE'].append({"URL":uri,"TITLE":p.get('title'),"STRING":p.string,"TEXT":"","LINKS":[],"TREE":[]})
                node2['TREE'].append({"URL":uri,"LINKS":[],"TREE":[]})
                node2['LINKS'].append(uri)
                #node2['LINK_KEY'].append(uri_key)
                link_box.append(uri)
                #out.append({"URL":p.get('href'),"TITLE":p.get('title'),"STRING":p.string,"TEXT":"","TREE":[]})
            except Exception as e:
                print (e)
    else:
        print("NO")
        pass
    return node1,node2
#https://huggingface.co/spaces/Omnibus/crawl
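# sitemap_test(): crawl a list of seed URLs (following links up to `level` deep), key the scheme/host parts
# of the collected links with sort_doc, and upload the resulting index to the dataset repo.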
def sitemap_test(url,file_state,level):
    url_page=[]
    url_front=[]
    url_json=[]
    for each_url in url:
        uri=""
        uri0=""
        if url != "" and url != None:
            link1,link2=link_find(each_url)
            if level >=2:
                for i,ea in enumerate(link1['TREE']):
                    print(ea)
                    try:
                        out_list1,out_list2=link_find(f"{uri}{ea['URL']}")
                        link1['TREE'][i]=out_list1
                        link2['TREE'][i]=out_list2
                        #link1['TREE'].append(out_list)
                        if level>=3:
                            for n,na in enumerate(link1['TREE'][i]['TREE']):
                                print(na)
                                try:
                                    out_list1,out_list2=link_find(f"{uri0}{na['URL']}")
                                    link1['TREE'][i]['TREE'][n]=out_list1
                                    link2['TREE'][i]['TREE'][n]=out_list2
                                    #link1['TREE'][i]['TREE'].append(out_list1)
                                except Exception as e:
                                    print (e)
                    except Exception as e:
                        print (e)
        try:
            for ea_link in link2['LINKS']:
                print(ea_link)
                try:
                    url_list=ea_link.split("/")
                    url_front.append(url_list[:3])
                    #url_front.append(f'{url_list[0]}//{url_list[2]}')
                except Exception as e:
                    print(e)
        except Exception as e:
            print(e)
    uri_key=sort_doc(url_front,file_state,8)
    ######## Save Database ########
    uid=uuid.uuid4()
    with open(f'{uid}.json', 'w') as f:
        json_hist=json.dumps(uri_key, indent=4)
        f.write(json_hist)
        f.close()
    upload_file(
        path_or_fileobj =f"{uid}.json",
        path_in_repo = f"crawl/{filename}.json",
        repo_id =f"{username}/{dataset_name}",
        repo_type = "dataset",
        token=token,
    )
    #################################
    return link1,link2,uri_key
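# sitemap(): same flow as sitemap_test but for a single URL; keys the page's outgoing links with sort_doc
# and uploads the index.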
def sitemap(url,file_state,level):
    uri=""
    uri0=""
    if url != "" and url != None:
        link1,link2=link_find(url)
        if level >=2:
            for i,ea in enumerate(link1['TREE']):
                print(ea)
                try:
                    #if not ea['URL'].startswith("http"):
                    # uri1=url.split("//")[0]
                    # uri2=url.split("//")[1]
                    # uri3=uri2.split("/")[0]
                    # uri=f'{uri1}//{uri3}'
                    # print(uri)
                    out_list1,out_list2=link_find(f"{uri}{ea['URL']}")
                    link1['TREE'][i]=out_list1
                    link2['TREE'][i]=out_list2
                    #link1['TREE'].append(out_list)
                    if level>=3:
                        for n,na in enumerate(link1['TREE'][i]['TREE']):
                            print(na)
                            try:
                                #if not na['URL'].startswith("http"):
                                # uri11=url.split("//")[0]
                                # uri22=url.split("//")[1]
                                # uri33=uri22.split("/")[0]
                                # uri0=f'{uri11}//{uri33}'
                                # print(uri0)
                                out_list1,out_list2=link_find(f"{uri0}{na['URL']}")
                                link1['TREE'][i]['TREE'][n]=out_list1
                                link2['TREE'][i]['TREE'][n]=out_list2
                                #link1['TREE'][i]['TREE'].append(out_list1)
                            except Exception as e:
                                print (e)
                except Exception as e:
                    print (e)
    #url_page=[]
    #url_front=[]
    #url_json=[]
    #for ea_link in link2['TREE']:
    # url_list=ea_link['URL'].split("/")
    # url_front.append(f'{url_list[1]}//{url_list[3]}')
    #url_front.append("".join(x for x in url_list[:3]))
    #url_page.append("/".join(z for z in url_list[3:]))
    #print(f'URL_FRONT:: {url_front}')
    #url_key=sort
    #for each_link in uri_key.keys():
    # out_file=init(f'{each_link}.json')
    uri_key=sort_doc(link2['LINKS'],file_state,8)
    #uri_key=sort_doc(url_front,file_state,8)
    ######## Save Database ########
    uid=uuid.uuid4()
    #for ea in list(uri_key.keys()):
    # #if not uri_key[ea] in list(lod.values()):
    # lod[ea]=uri_key[ea]
    with open(f'{uid}.json', 'w') as f:
        json_hist=json.dumps(uri_key, indent=4)
        f.write(json_hist)
        f.close()
    upload_file(
        path_or_fileobj =f"{uid}.json",
        path_in_repo = f"crawl/{filename}.json",
        repo_id =f"{username}/{dataset_name}",
        repo_type = "dataset",
        token=token,
    )
    #################################
    return link1,link2,uri_key
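# sitemap_OG(): earlier single-URL version, kept for reference; it appears to have been written against an
# older link_find that returned a single node rather than the (node1, node2) pair returned above.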
def sitemap_OG(url,level):
    uri=""
    if url != "" and url != None:
        link1=link_find(url)
        if level >=2:
            for i,ea in enumerate(link1):
                print(ea)
                try:
                    if not ea['URL'].startswith("http"):
                        uri1=url.split("//")[0]
                        uri2=url.split("//")[1]
                        uri3=uri2.split("/")[0]
                        uri=f'{uri1}//{uri3}'
                        print(uri)
                    out_list=link_find(f"{uri}{ea['URL']}")
                    link1[i]['TREE']=out_list
                    if level>=3:
                        for n,na in enumerate(link1[i]['TREE']):
                            print(na)
                            try:
                                if not na['URL'].startswith("http"):
                                    uri11=url.split("//")[0]
                                    uri22=url.split("//")[1]
                                    uri33=uri22.split("/")[0]
                                    uri0=f'{uri11}//{uri33}'
                                    print(uri0)
                                out_list1=link_find(f"{uri0}{na['URL']}")
                                link1[i]['TREE'][n]['TREE']=out_list1
                            except Exception as e:
                                print (e)
                except Exception as e:
                    print (e)
    return link1
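# test(): read seed URLs from ./seed.txt and run sitemap_test over them at level 1.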
def test():
    seed_box=[]
    with open("./seed.txt") as f:
        this = f.readlines()
        f.close()
    for ea in this:
        ea=ea.strip().strip("\n")
        seed_box.append(ea)
        #print(ea)
    try:
        a,b,c = sitemap_test(seed_box,None,1)
    except Exception as e:
        print (e)
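# Gradio UI: URL textbox, depth slider and a crawl button wired to sitemap; the JSON panels show the page
# tree, the link skeleton and the keyed index.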
with gr.Blocks() as app:
    file_state=gr.State()
    with gr.Row():
        with gr.Column(scale=3):
            with gr.Row():
                inp=gr.Textbox(label="URL")
                level=gr.Slider(minimum=1,maximum=1,step=1,value=1)
            btn=gr.Button()
            test_btn=gr.Button("Test")
            key_json=gr.JSON()
            outp=gr.JSON()
        with gr.Column(scale=1):
            outmap=gr.JSON()
    #test_btn.click(test,None,None)
    btn.click(sitemap,[inp,file_state,level],[outp,outmap,key_json])
app.launch()