Update app.py
app.py
CHANGED
@@ -2,29 +2,16 @@ import gradio as gr
 import requests
 import bs4
 
-def sort_doc(text,steps_in=0,control=None):
-    text=str(
+def sort_doc(in_list,steps_in=8,control=None):
+    text=str(in_list)
 
     ########################################
-    sen_list=
-    key_cnt=len(sen_list)
-
-    sen_obj_box=[]
-    for ii,ee in enumerate(sen_list):
-        sen_obj=proc_sen(sen_list,ii)
-        sen_obj_box.append(sen_obj)
-
-    sen_list=sen_obj_box
+    sen_list=in_list
     ######################################
-    key_cnt=len(
+    key_cnt=len(sen_list)
     print(key_cnt)
-    #noun_cnt=len(noun_box)
-    #print(noun_cnt)
-
-
 
     if not steps_in:
-
         control_char=list(control_json['control'])
         char_len=len(control_char)
         n_cnt=0
@@ -49,8 +36,6 @@ def sort_doc(text,steps_in=0,control=None):
     control_val=list(control_json['control'][control_len:])
     val_len=len(control_val)
 
-
-
     json_out={}
     noun_list={}
     step_list=[]
@@ -92,7 +77,7 @@ def sort_doc(text,steps_in=0,control=None):
         for iii,j in enumerate(step_cont_box):
             print(j)
             out_js = out_js+control_char[j]
-        sen_obj=
+        sen_obj=sen_list[i]
         #sen_obj=proc_sen(sen_list,i)
 
         #json_out[out_js]={'nouns':ea}
@@ -106,10 +91,11 @@ def sort_doc(text,steps_in=0,control=None):
         if big_cnt==key_cnt:
             print("DONE")
             go=False
-    noun_list=proc_nouns(json_out)
-    return json_out
+    #noun_list=proc_nouns(json_out)
+    return json_out
 
 
+link_box = []
 
 def link_find(url):
     out = []
@@ -126,7 +112,7 @@ def link_find(url):
     #out.append(rawp)
     #out.append("HTML fragments: ")
     node1 = {"URL":url,"TITLE":soup.title,"STRING":soup.description,"TEXT":rawt,"LINKS":[],"TREE":[]}
-    node2 = {"URL":url,"LINKS":[],"TREE":[]}
+    node2 = {"URL":url,"LINK_KEY":[],"LINKS":[],"TREE":[]}
 
     q=("a","p","span","content","article")
     for p in soup.find_all("a"):
@@ -152,12 +138,14 @@ def link_find(url):
             node1['TREE'].append({"URL":uri,"TITLE":p.get('title'),"STRING":p.string,"TEXT":"","LINKS":[],"TREE":[]})
             node2['TREE'].append({"URL":uri,"LINKS":[],"TREE":[]})
             node2['LINKS'].append(uri)
-
+            #node2['LINK_KEY'].append(uri_key)
+            link_box.append(uri)
             #out.append({"URL":p.get('href'),"TITLE":p.get('title'),"STRING":p.string,"TEXT":"","TREE":[]})
 
         else:
             print("NO")
             pass
+
     return node1,node2
     #https://huggingface.co/spaces/Omnibus/crawl
 
@@ -199,7 +187,9 @@ def sitemap(url,level):
             print (e)
         except Exception as e:
             print (e)
-
+    uri_key=sort_doc(uri)
+
+    return link1,link2,uri_key
 
 
 
@@ -243,8 +233,9 @@ with gr.Blocks() as app:
         inp=gr.Textbox(label="URL")
         level=gr.Slider(minimum=1,maximum=2,step=1,value=1)
         btn=gr.Button()
+        key_json=gr.JSON()
         outp=gr.JSON()
     with gr.Column(scale=1):
         outmap=gr.JSON()
-    btn.click(sitemap,[inp,level],[outp,outmap])
+    btn.click(sitemap,[inp,level],[outp,outmap,key_json])
     app.launch()
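The rewritten sort_doc builds each key by appending characters from control_json['control'] (out_js = out_js+control_char[j]), which appears to encode an item's index as a string over that character set, i.e. a base-N key. A minimal sketch of that idea, assuming a hypothetical ten-character alphabet (the Space's real control string does not appear in these hunks, and index_to_key is an illustrative name, not a function in app.py):

import string

# Hypothetical alphabet; app.py reads its own from control_json['control'].
CONTROL = string.ascii_lowercase[:10]

def index_to_key(i, alphabet=CONTROL):
    # Encode a non-negative index as a key string, most significant
    # digit first, mirroring out_js = out_js + control_char[j].
    base = len(alphabet)
    digits = []
    while True:
        digits.append(alphabet[i % base])
        i //= base
        if i == 0:
            break
    return "".join(reversed(digits))

print(index_to_key(0))   # a
print(index_to_key(42))  # ec, i.e. 4*10 + 2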
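On the UI side, the patch adds a third gr.JSON component (key_json) and rewires btn.click so the three values returned by sitemap (link1, link2, uri_key) map positionally onto [outp, outmap, key_json]. A minimal, self-contained sketch of that three-output wiring, with a stub in place of the real crawler (fake_sitemap and its payloads are illustrative only, not part of the Space):

import gradio as gr

def fake_sitemap(url, level):
    # Stub with the same return shape as sitemap(url, level).
    node1 = {"URL": url, "TEXT": "", "TREE": []}   # page tree with text
    node2 = {"URL": url, "LINKS": [], "TREE": []}  # link-only tree
    uri_key = {}                                   # key index from sort_doc
    return node1, node2, uri_key

with gr.Blocks() as demo:
    inp = gr.Textbox(label="URL")
    level = gr.Slider(minimum=1, maximum=2, step=1, value=1)
    btn = gr.Button()
    key_json = gr.JSON()
    outp = gr.JSON()
    outmap = gr.JSON()
    # Return order maps positionally: node1 -> outp, node2 -> outmap,
    # uri_key -> key_json.
    btn.click(fake_sitemap, [inp, level], [outp, outmap, key_json])

demo.launch()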