File size: 8,448 Bytes
ae4e988 0d9e44b 0e81052 0d9e44b 707168a 0d9e44b 707168a 0e81052 707168a 8786019 707168a 0e81052 707168a 0e81052 707168a 0d9e44b 707168a 0d9e44b 707168a 437cf54 1b682b7 437cf54 573c6d4 75dab34 366c803 0d9e44b 827c354 437cf54 8849abb 192d263 250787d 192d263 e89aaf6 192d263 0bd2623 efc018b 8849abb 51d8734 192d263 448ec0d 8849abb 0d9e44b e30ed28 437cf54 1b682b7 62f0b09 0d9e44b a57fdc7 c30642b 437cf54 f0e1870 b962252 8849abb 6660108 a57fdc7 f0e1870 3b1a5cc f0e1870 8849abb a57fdc7 6c531ab f0e1870 3b1a5cc f0e1870 8849abb a57fdc7 f0e1870 0d9e44b 3b1a5cc ae4e988 3b44741 a57fdc7 5e66ae2 a57fdc7 0d9e44b a57fdc7 0d9e44b ae4e988 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 |
import gradio as gr
import requests
import bs4
def sort_doc(in_list, steps_in=8, control=None):
    """Key every item of *in_list* by its position, written in base 62.

    The key of the item at index ``i`` is ``i`` encoded with the alphabet
    ``0-9a-zA-Z`` and left-padded with ``'0'`` to a fixed width, so the first
    items of a default call get ``'00000000'``, ``'00000001'``, ...

    Args:
        in_list: Sized iterable (list, string, ...) whose items become the
            values of the returned mapping, in iteration order.
        steps_in: Key width in characters. When falsy (``0``/``None``) the
            width is derived from ``len(in_list)`` instead.
        control: Accepted for backward compatibility; it has no influence on
            the result.

    Returns:
        dict mapping each base-62 key to the corresponding item. Empty input
        yields an empty dict.

    Notes:
        An index too large for the requested width produces a longer key
        rather than colliding with an earlier one.
    """
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    base = len(alphabet)

    if steps_in:
        width = steps_in
    else:
        # Auto-size: one digit per time the item count can be divided by the
        # base while staying >= 1.
        width = 0
        remaining = len(in_list)
        while remaining >= 1:
            width += 1
            remaining //= base

    json_out = {}
    for index, item in enumerate(in_list):
        # Encoding the index directly makes every carry correct, which a
        # manually incremented digit array does not guarantee.
        digits = []
        value = index
        while value:
            value, digit = divmod(value, base)
            digits.append(alphabet[digit])
        key = ''.join(reversed(digits)).rjust(width, '0')
        json_out[key] = item
    return json_out
# Every link resolved by link_find() is also appended here, across all calls.
link_box = []


def link_find(url):
    """Fetch *url* and describe the page plus every hyperlink found on it.

    Args:
        url: Address of the page to download with ``requests.get``.

    Returns:
        A pair ``(node1, node2)`` of dicts describing the same page:

        * ``node1`` - detailed node with ``URL``, ``TITLE`` (``soup.title``),
          ``STRING`` (``soup.description``), ``TEXT`` (the page text),
          ``LINKS`` (resolved link URLs) and ``TREE`` (one stub node per
          link, carrying the anchor's ``title`` attribute and string).
        * ``node2`` - compact node with ``URL``, ``LINK_KEY`` (left empty),
          ``LINKS`` and ``TREE`` (one ``URL``-only stub per link).

        When the response status is not 200 both nodes are returned with
        their link lists empty, so callers can always unpack the pair and
        index ``TREE``.

    Side effects:
        Appends every resolved link to the module-level ``link_box`` and
        prints progress information.
    """
    # Built up front so a failed request still yields a well-formed pair.
    node1 = {"URL": url, "TITLE": None, "STRING": None, "TEXT": "", "LINKS": [], "TREE": []}
    node2 = {"URL": url, "LINK_KEY": [], "LINKS": [], "TREE": []}

    source = requests.get(url)
    if source.status_code != 200:
        print("NO")
        return node1, node2

    print("YES")
    soup = bs4.BeautifulSoup(source.content, 'html.parser')
    node1["TITLE"] = soup.title
    node1["STRING"] = soup.description
    node1["TEXT"] = soup.text

    for anchor in soup.find_all("a"):
        href = anchor.get('href')
        if href is None:
            # <a> without an href (e.g. a named anchor) has nothing to follow.
            continue

        if href.startswith("//"):
            # Protocol-relative link: reuse the scheme of the page itself.
            print(href)
            scheme = url.split("//")[0]
            uri = f'{scheme}{href}'
            print(uri)
        elif href.startswith("/"):
            # Root-relative link: prepend scheme and host of the page.
            scheme = url.split("//")[0]
            host = url.split("//")[1].split("/")[0]
            uri = f'{scheme}//{host}{href}'
            print(uri)
        else:
            # Anything else is passed through unchanged.
            uri = href

        node1['LINKS'].append(uri)
        node1['TREE'].append({"URL": uri, "TITLE": anchor.get('title'), "STRING": anchor.string, "TEXT": "", "LINKS": [], "TREE": []})
        node2['TREE'].append({"URL": uri, "LINKS": [], "TREE": []})
        node2['LINKS'].append(uri)
        link_box.append(uri)

    return node1, node2
#https://huggingface.co/spaces/Omnibus/crawl
def sitemap(url, level):
    """Crawl *url* up to *level* layers deep and return the link trees.

    Args:
        url: Start page. ``None`` or an empty string means "nothing to do".
        level: Crawl depth. ``1`` fetches only the start page, ``>= 2`` also
            fetches every link found on it, ``>= 3`` additionally fetches the
            links found on those pages.

    Returns:
        A triple ``(link1, link2, uri_key)``: the detailed tree and the
        compact tree produced by ``link_find``, plus the mapping returned by
        ``sort_doc``. For a missing URL all three are empty dicts.

    A page that cannot be fetched or parsed is reported with ``print`` and
    its stub node is left in the tree; the crawl then continues with the
    next link.
    """
    if url is None or url == "":
        # Nothing to crawl: hand back empty results instead of failing on
        # names that were never assigned.
        return {}, {}, {}

    link1, link2 = link_find(url)
    if level >= 2:
        for i, child in enumerate(link1['TREE']):
            print(child)
            try:
                page1, page2 = link_find(child['URL'])
                link1['TREE'][i] = page1
                link2['TREE'][i] = page2
                if level >= 3:
                    for n, grandchild in enumerate(link1['TREE'][i]['TREE']):
                        print(grandchild)
                        try:
                            sub1, sub2 = link_find(grandchild['URL'])
                            link1['TREE'][i]['TREE'][n] = sub1
                            link2['TREE'][i]['TREE'][n] = sub2
                        except Exception as e:
                            # Best effort: keep the stub and move on.
                            print(e)
            except Exception as e:
                # Best effort: keep the stub and move on.
                print(e)

    # NOTE(review): the key map is built from an empty string, so it is
    # always empty - confirm which link list was meant to be keyed here.
    uri_key = sort_doc("")
    return link1, link2, uri_key
def sitemap_OG(url, level):
    """Earlier variant of the crawler that nests results under ``'TREE'``.

    Args:
        url: Start page. ``None`` or an empty string yields an empty list.
        level: Crawl depth. ``>= 2`` fetches the pages behind the first-layer
            nodes, ``>= 3`` also fetches the pages behind their children.

    Returns:
        Whatever ``link_find(url)`` returned, with the ``'TREE'`` entries of
        its nodes filled in according to *level*.

    NOTE(review): this function iterates over the result of ``link_find`` as
    a sequence of node dicts, whereas ``link_find`` in this file returns a
    ``(node1, node2)`` pair - it appears to predate that change; confirm
    before reusing it.
    """
    if url is None or url == "":
        return []

    def site_prefix(link):
        # Absolute links are fetched as they are; everything else is
        # resolved against the scheme and host of the start page. Computed
        # per link so one link's prefix is never reused for another.
        if link.startswith("http"):
            return ""
        scheme = url.split("//")[0]
        host = url.split("//")[1].split("/")[0]
        prefix = f'{scheme}//{host}'
        print(prefix)
        return prefix

    link1 = link_find(url)
    if level >= 2:
        for i, ea in enumerate(link1):
            print(ea)
            try:
                out_list = link_find(f"{site_prefix(ea['URL'])}{ea['URL']}")
                link1[i]['TREE'] = out_list
                if level >= 3:
                    for n, na in enumerate(link1[i]['TREE']):
                        print(na)
                        try:
                            out_list1 = link_find(f"{site_prefix(na['URL'])}{na['URL']}")
                            link1[i]['TREE'][n]['TREE'] = out_list1
                        except Exception as e:
                            # Best effort: report and continue with the next link.
                            print(e)
            except Exception as e:
                # Best effort: report and continue with the next link.
                print(e)
    return link1
# Gradio front end: a URL box and a depth slider feed sitemap(); its three
# results are shown in JSON panels.
with gr.Blocks() as app:
    with gr.Row():
        # Wide left column: inputs, trigger button, key map and detailed tree.
        with gr.Column(scale=3):
            with gr.Row():
                inp = gr.Textbox(label="URL")
                level = gr.Slider(minimum=1, maximum=2, step=1, value=1)
            btn = gr.Button()
            key_json = gr.JSON()
            outp = gr.JSON()
        # Narrow right column: compact tree.
        with gr.Column(scale=1):
            outmap = gr.JSON()
    # sitemap returns (detailed tree, compact tree, key map) in this order.
    btn.click(
        fn=sitemap,
        inputs=[inp, level],
        outputs=[outp, outmap, key_json],
    )
app.launch()