import gradio as gr
import requests
import bs4

######## Load Database ########
import os
from huggingface_hub import HfApi, upload_file
import json
import uuid

token=os.environ.get("HF_TOKEN")
username="omnibus"
dataset_name="tmp"
save_data=f'https://huggingface.co/datasets/{username}/{dataset_name}/raw/main/'
api=HfApi(token="")
filename="urls"
filename2="pages"

def init(filename=filename,save_data=save_data):
    # Load the previously saved crawl index from the dataset repo ({} if missing).
    #if filename==None:
    #    filename=filename
    r = requests.get(f'{save_data}crawl/{filename}.json')
    print(f'status code main:: {r.status_code}')
    if r.status_code==200:
        lod = json.loads(r.text)
    else:
        lod={}
    return lod

def sort_doc(in_list: list, steps_in: int, control: int=0, prev_list: str=None):
    # Assign every item of in_list a fixed-width base-62 key, continuing from the
    # last key already present in the saved index.
    prev_list=init()  # always reload the saved index (overrides the prev_list argument)
    control_json={'control':'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ','char':'','leng':62}
    key_cnt=len(in_list)
    print(key_cnt)
    #if isinstance(in_list, str):
    #    in_list=eval(in_list)
    control_char=list(control_json['control'])
    char_len=len(control_char)
    if not steps_in:
        # derive the key width from the number of items
        n_cnt=0
        nx=key_cnt
        while True:
            if nx >= 1:
                n_cnt+=1
                nx = nx/char_len
            else:
                steps=n_cnt
                break
    if steps_in:
        steps=steps_in
    if control:
        # reserve the last `control` characters of the alphabet
        control_len=control_json['leng']-control
        control_char=list(control_json['control'][:control_len])
        control_val=list(control_json['control'][control_len:])
        val_len=len(control_val)
        control_val_box=[]
        for ea in control_val:
            control_val_box.append(ea)
        print(f'CONTROL_VAL_BOX:: {control_val_box}')
        #prev_list=control_val_box
    json_out={}
    big_cnt=0
    cnt=0
    go=True
    step_cont_box=[]
    if prev_list:
        # resume counting from the last key in the saved index
        print("LOD")
        last_key=list(prev_list.keys())[-1]
        print(last_key)
        for ea_dig in last_key:
            ea_dig=control_json['control'].index(ea_dig)
            ea_dig=int(ea_dig)
            print(f'{ea_dig} :: {list(control_json["control"][ea_dig])[0]}')
            #step_cont_box.append(int(list(control_json["control"][ea_dig])[0]))
            step_cont_box.append(ea_dig)
        print(step_cont_box)
        cnt=int(step_cont_box[-1])+1
    if not prev_list:
        print("NOT LOD")
        for ii in range(steps):
            print(ii)
            step_cont_box.append(0)
    pos=len(step_cont_box)-1
    if go:
        for i, ea in enumerate(in_list):
            if go:
                if cnt > char_len-1:
                    # carry into the next digit
                    #print(step_cont_box)
                    go1=True
                    for ii,ev in enumerate(step_cont_box):
                        if go:
                            if ev >= char_len-1:
                                step_cont_box[ii]=0
                                if go1==True:
                                    step_cont_box[ii-1]=step_cont_box[ii-1]+1
                                    go1=False
                    cnt=1
                else:
                    step_cont_box[pos]=cnt
                    cnt+=1
                #print(step_cont_box)
                out_js=""
                for iii,j in enumerate(step_cont_box):
                    print(j)
                    out_js = out_js+control_char[j]
                json_out[out_js]=in_list[i]
                big_cnt+=1
                if big_cnt==key_cnt:
                    print("DONE")
                    go=False
    return json_out
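# Illustrative sketch of the key scheme (assumption: the saved index fetched by
# init() is empty, so numbering starts from zero; the URLs are placeholders):
#
#   sort_doc(["https://a.example", "https://b.example", "https://c.example"], steps_in=2)
#   # -> {"00": "https://a.example", "01": "https://b.example", "02": "https://c.example"}
#
# Each entry gets a fixed-width key drawn from control_json['control'] (base 62),
# so a later crawl can resume numbering after the last key already in the dataset.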
#############################

def sort_doc_OG(in_list,steps_in=0,control=None):
    r = requests.get(f'{save_data}crawl/{filename}.json')
    print(f'status code main:: {r.status_code}')
    if r.status_code==200:
        lod = json.loads(r.text)
        #print(f'lod:: {lod}')
        #lod[0]['comment']=lod[0]['comment']+1
        #lod[0]['comment_list'].append({'user':persona[persona2]['name'],'datetime':'','comment':output,'reply_list':[]})
    else:
        lod={}
    control_json={'control':'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ','char':'','leng':62}
    text=str(in_list)
    key_cnt=len(in_list)
    print(key_cnt)
    control_char=list(control_json['control'])
    char_len=len(control_char)
    if not steps_in:
        n_cnt=0
        nx=key_cnt
        while True:
            if nx >= 1:
                n_cnt+=1
                nx = nx/char_len
            else:
                print("#######")
                print(n_cnt)
                print(nx)
                print("#######")
                steps=n_cnt
                break
    if steps_in:
        steps=steps_in
    if control:
        control_len=control_json['leng']-steps
        control_char_val=list(control_json['control'][:control_len])
        control_val=list(control_json['control'][control_len:])
        val_len=len(control_val)
    json_out=lod
    noun_list={}
    step_list=[]
    big_cnt=0
    cnt=0
    go=True
    step_cont_box=[]
    if lod:
        print("LOD")
        last_key=list(lod.keys())[-1]
        print(last_key)
        for ea_dig in last_key:
            ea_dig=control_json['control'].index(ea_dig)
            ea_dig=int(ea_dig)
            print(f'{ea_dig} :: {list(control_json["control"][ea_dig])[0]}')
            #step_cont_box.append(int(list(control_json["control"][ea_dig])[0]))
            step_cont_box.append(ea_dig)
        print(step_cont_box)
        cnt=int(step_cont_box[-1])+1
    if not lod:
        print("NOT LOD")
        for ii in range(steps):
            print(ii)
            step_cont_box.append(0)
        #print (step_cont_box)
    mod=0
    pos=len(step_cont_box)-1
    if go:
        for i, ea in enumerate(in_list):
            if go and ea not in list(lod.values()):
                if cnt > char_len-1:
                    #print(step_cont_box)
                    go1=True
                    for ii,ev in enumerate(step_cont_box):
                        if go:
                            if ev >= char_len-1:
                                step_cont_box[ii]=0
                                if go1==True:
                                    step_cont_box[ii-1]=step_cont_box[ii-1]+1
                                    go1=False
                    cnt=1
                else:
                    step_cont_box[pos]=cnt
                    cnt+=1
                #print(step_cont_box)
                out_js=""
                for iii,j in enumerate(step_cont_box):
                    print(j)
                    out_js = out_js+control_char[j]
                sen_obj=in_list[i]
                json_out[out_js]=sen_obj
                #print ("#################")
                #print (out_js)
                #print (sen_obj)
                #print ("#################")
                big_cnt+=1
                if big_cnt==key_cnt:
                    print("DONE")
                    go=False
    #noun_list=proc_nouns(json_out)
    return json_out

link_box = []

def link_find(url):
    node1={}
    node2={}
    out = []
    print(f'Try URL:: {url}')
    source = requests.get(url)
    if source.status_code ==200:
        print("YES")
        #soup = bs4.BeautifulSoup(source.content,'lxml')
        soup = bs4.BeautifulSoup(source.content,'html.parser')
        rawp=(f'RAW TEXT RETURNED: {soup.text}')
        cnt=0
        cnt+=len(rawp)
        rawt=soup.text
        #out.append(rawp)
        #out.append("HTML fragments: ")
        node1 = {"URL":url,"TITLE":soup.title,"STRING":soup.description,"TEXT":rawt,"LINKS":[],"TREE":[]}
        node2 = {"URL":url,"LINK_KEY":[],"LINKS":[],"TREE":[]}
        q=("a","p","span","content","article")
        for p in soup.find_all("a"):
            url0=p.get('href')
            try:
                if url0.startswith("//"):
                    print(url0)
                    uri1=url.split("//")[0]
                    #uri2=url.split("//")[1]
                    #uri3=uri2.split("/")[0]
                    #uri=f'{uri1}//{uri3}'
                    uri=f'{uri1}{url0}'
                    #print(uri)
                elif url0.startswith("/") and not url0.startswith("//"):
                    uri1=url.split("//")[0]
                    uri2=url.split("//")[1]
                    uri3=uri2.split("/")[0]
                    uri=f'{uri1}//{uri3}'
                    uri=f'{uri}{url0}'
                    #print(uri)
                else:
                    uri=url0
                node1['LINKS'].append(uri)
                node1['TREE'].append({"URL":uri,"TITLE":p.get('title'),"STRING":p.string,"TEXT":"","LINKS":[],"TREE":[]})
                node2['TREE'].append({"URL":uri,"LINKS":[],"TREE":[]})
                node2['LINKS'].append(uri)
                #node2['LINK_KEY'].append(uri_key)
                link_box.append(uri)
                #out.append({"URL":p.get('href'),"TITLE":p.get('title'),"STRING":p.string,"TEXT":"","TREE":[]})
            except Exception as e:
                print (e)
    else:
        print("NO")
        pass
    return node1,node2
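# Reference sketch (not used by link_find): the manual "//" and "/" handling above
# is roughly what the standard library's urljoin provides.
#
#   from urllib.parse import urljoin
#   urljoin("https://example.com/docs/page", "/about")               # -> "https://example.com/about"
#   urljoin("https://example.com/docs/page", "//cdn.example.com/x")  # -> "https://cdn.example.com/x"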
#https://huggingface.co/spaces/Omnibus/crawl

def sitemap_test(url,file_state,level):
    # Crawl a list of seed URLs to the requested depth, key the collected URL
    # prefixes with sort_doc, and push the result to the dataset repo.
    url_page=[]
    url_front=[]
    url_json=[]
    for each_url in url:
        uri=""
        uri0=""
        if url != "" and url != None:
            link1,link2=link_find(each_url)
            if level >=2:
                for i,ea in enumerate(link1['TREE']):
                    print(ea)
                    try:
                        out_list1,out_list2=link_find(f"{uri}{ea['URL']}")
                        link1['TREE'][i]=out_list1
                        link2['TREE'][i]=out_list2
                        #link1['TREE'].append(out_list)
                        if level>=3:
                            for n,na in enumerate(link1['TREE'][i]['TREE']):
                                print(na)
                                try:
                                    out_list1,out_list2=link_find(f"{uri0}{na['URL']}")
                                    link1['TREE'][i]['TREE'][n]=out_list1
                                    link2['TREE'][i]['TREE'][n]=out_list2
                                    #link1['TREE'][i]['TREE'].append(out_list1)
                                except Exception as e:
                                    print (e)
                    except Exception as e:
                        print (e)
        try:
            for ea_link in link2['LINKS']:
                print(ea_link)
                try:
                    url_list=ea_link.split("/")
                    url_front.append(url_list[:3])
                    #url_front.append(f'{url_list[0]}//{url_list[2]}')
                except Exception as e:
                    print(e)
        except Exception as e:
            print(e)
    uri_key=sort_doc(url_front,file_state,8)

    ######## Save Database ########
    uid=uuid.uuid4()
    with open(f'{uid}.json', 'w') as f:
        json_hist=json.dumps(uri_key, indent=4)
        f.write(json_hist)
        f.close()
    upload_file(
        path_or_fileobj =f"{uid}.json",
        path_in_repo = f"crawl/{filename}.json",
        repo_id =f"{username}/{dataset_name}",
        repo_type = "dataset",
        token=token,
    )
    #################################
    return link1,link2,uri_key

def sitemap(url,file_state,level):
    # Crawl a single URL to the requested depth, key the collected links with
    # sort_doc, and push the result to the dataset repo.
    uri=""
    uri0=""
    if url != "" and url != None:
        link1,link2=link_find(url)
        if level >=2:
            for i,ea in enumerate(link1['TREE']):
                print(ea)
                try:
                    #if not ea['URL'].startswith("http"):
                    #    uri1=url.split("//")[0]
                    #    uri2=url.split("//")[1]
                    #    uri3=uri2.split("/")[0]
                    #    uri=f'{uri1}//{uri3}'
                    #    print(uri)
                    out_list1,out_list2=link_find(f"{uri}{ea['URL']}")
                    link1['TREE'][i]=out_list1
                    link2['TREE'][i]=out_list2
                    #link1['TREE'].append(out_list)
                    if level>=3:
                        for n,na in enumerate(link1['TREE'][i]['TREE']):
                            print(na)
                            try:
                                #if not na['URL'].startswith("http"):
                                #    uri11=url.split("//")[0]
                                #    uri22=url.split("//")[1]
                                #    uri33=uri22.split("/")[0]
                                #    uri0=f'{uri11}//{uri33}'
                                #    print(uri0)
                                out_list1,out_list2=link_find(f"{uri0}{na['URL']}")
                                link1['TREE'][i]['TREE'][n]=out_list1
                                link2['TREE'][i]['TREE'][n]=out_list2
                                #link1['TREE'][i]['TREE'].append(out_list1)
                            except Exception as e:
                                print (e)
                except Exception as e:
                    print (e)
    #url_page=[]
    #url_front=[]
    #url_json=[]
    #for ea_link in link2['TREE']:
    #    url_list=ea_link['URL'].split("/")
    #    url_front.append(f'{url_list[1]}//{url_list[3]}')
    #    #url_front.append("".join(x for x in url_list[:3]))
    #    #url_page.append("/".join(z for z in url_list[3:]))
    #print(f'URL_FRONT:: {url_front}')
    #url_key=sort
    #for each_link in uri_key.keys():
    #    out_file=init(f'{each_link}.json')
    uri_key=sort_doc(link2['LINKS'],file_state,8)
    #uri_key=sort_doc(url_front,file_state,8)

    ######## Save Database ########
    uid=uuid.uuid4()
    #for ea in list(uri_key.keys()):
    #    #if not uri_key[ea] in list(lod.values()):
    #    lod[ea]=uri_key[ea]
    with open(f'{uid}.json', 'w') as f:
        json_hist=json.dumps(uri_key, indent=4)
        f.write(json_hist)
        f.close()
    upload_file(
        path_or_fileobj =f"{uid}.json",
        path_in_repo = f"crawl/{filename}.json",
        repo_id =f"{username}/{dataset_name}",
        repo_type = "dataset",
        token=token,
    )
    #################################
    return link1,link2,uri_key

def sitemap_OG(url,level):
    # Earlier single-tree variant kept for reference.
    uri=""
    if url != "" and url != None:
        link1=link_find(url)
        if level >=2:
            for i,ea in enumerate(link1):
                print(ea)
                try:
                    if not ea['URL'].startswith("http"):
                        uri1=url.split("//")[0]
                        uri2=url.split("//")[1]
                        uri3=uri2.split("/")[0]
                        uri=f'{uri1}//{uri3}'
                        print(uri)
                    out_list=link_find(f"{uri}{ea['URL']}")
                    link1[i]['TREE']=out_list
                    if level>=3:
                        for n,na in enumerate(link1[i]['TREE']):
                            print(na)
                            try:
                                if not na['URL'].startswith("http"):
                                    uri11=url.split("//")[0]
                                    uri22=url.split("//")[1]
                                    uri33=uri22.split("/")[0]
                                    uri0=f'{uri11}//{uri33}'
                                    print(uri0)
                                out_list1=link_find(f"{uri0}{na['URL']}")
                                link1[i]['TREE'][n]['TREE']=out_list1
                            except Exception as e:
                                print (e)
                except Exception as e:
                    print (e)
    return link1

def test():
    # Run sitemap_test over the URLs listed in ./seed.txt.
    seed_box=[]
    with open("./seed.txt") as f:
        this = f.readlines()
        f.close()
    for ea in this:
        ea=ea.strip().strip("\n")
        seed_box.append(ea)
        #print(ea)
    try:
        a,b,c = sitemap_test(seed_box,None,1)
    except Exception as e:
        print (e)
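# test() expects a local ./seed.txt with one start URL per line, for example
# (placeholder contents, not provided with the app):
#
#   https://example.com
#   https://example.org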
with gr.Blocks() as app:
    file_state=gr.State()
    with gr.Row():
        with gr.Column(scale=3):
            with gr.Row():
                inp=gr.Textbox(label="URL")
                level=gr.Slider(minimum=1,maximum=1,step=1,value=1)  # crawl depth (capped at 1 by this slider)
            btn=gr.Button()
            test_btn=gr.Button("Test")
            key_json=gr.JSON()
            outp=gr.JSON()
        with gr.Column(scale=1):
            outmap=gr.JSON()
    #test_btn.click(test,None,None)
    # file_state supplies the steps_in argument of sort_doc via sitemap()
    btn.click(sitemap,[inp,file_state,level],[outp,outmap,key_json])
app.launch()