Omnibus committed
Commit 0d9e44b
Parent: 707168a

Update app.py

Files changed (1): app.py +17 -26
app.py CHANGED
@@ -2,29 +2,16 @@ import gradio as gr
 import requests
 import bs4
 
-def sort_doc(text,steps_in=0,control=None):
-    text=str(text)
+def sort_doc(in_list,steps_in=8,control=None):
+    text=str(in_list)
 
     ########################################
-    sen_list=get_sen_list(text)
-    key_cnt=len(sen_list)
-
-    sen_obj_box=[]
-    for ii,ee in enumerate(sen_list):
-        sen_obj=proc_sen(sen_list,ii)
-        sen_obj_box.append(sen_obj)
-
-    sen_list=sen_obj_box
+    sen_list=in_list
     ######################################
-    key_cnt=len(sen_obj_box)
+    key_cnt=len(sen_list)
     print(key_cnt)
-    #noun_cnt=len(noun_box)
-    #print(noun_cnt)
-
-
 
     if not steps_in:
-
        control_char=list(control_json['control'])
        char_len=len(control_char)
        n_cnt=0
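
Note on this hunk: sort_doc no longer builds its own sentence list from raw text (the get_sen_list/proc_sen preprocessing is dropped); it now keys whatever list it is handed, and steps_in defaults to a non-zero value, so the "if not steps_in:" sizing branch is skipped unless the caller opts in. A minimal standalone sketch of the keying idea, assuming control_json['control'] is a string used as a key alphabet (only that much is visible in the diff; the alphabet below is a placeholder):

control_json = {"control": "abcdefghij"}  # placeholder key alphabet

def make_keys(in_list):
    control_char = list(control_json["control"])
    char_len = len(control_char)
    json_out = {}
    for i, item in enumerate(in_list):
        # encode the index in base char_len, one control char per digit,
        # analogous to the out_js accumulation in the loop further down
        key, n = "", i
        while True:
            key = control_char[n % char_len] + key
            n //= char_len
            if n == 0:
                break
        json_out[key] = item
    return json_out

print(make_keys(["https://example.com/a", "https://example.com/b"]))
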
@@ -49,8 +36,6 @@ def sort_doc(text,steps_in=0,control=None):
        control_val=list(control_json['control'][control_len:])
        val_len=len(control_val)
 
-
-
    json_out={}
    noun_list={}
    step_list=[]
@@ -92,7 +77,7 @@ def sort_doc(text,steps_in=0,control=None):
            for iii,j in enumerate(step_cont_box):
                print(j)
                out_js = out_js+control_char[j]
-            sen_obj=sen_obj_box[i]
+            sen_obj=sen_list[i]
            #sen_obj=proc_sen(sen_list,i)
 
            #json_out[out_js]={'nouns':ea}
@@ -106,10 +91,11 @@ def sort_doc(text,steps_in=0,control=None):
        if big_cnt==key_cnt:
            print("DONE")
            go=False
-    noun_list=proc_nouns(json_out)
-    return json_out, noun_list
+    #noun_list=proc_nouns(json_out)
+    return json_out
 
 
+link_box = []
 
 def link_find(url):
    out = []
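
link_box is introduced at module scope and appended to inside link_find, so it accumulates across every call for the lifetime of the process; in a shared Gradio Space that means results from different requests pile into the same list. A small illustration of the behavior (the function name here is hypothetical):

link_box = []  # module-level: survives across calls

def collect(uris):
    # mimics link_find appending each discovered uri
    for uri in uris:
        link_box.append(uri)

collect(["https://example.com/a"])
collect(["https://example.com/b"])
print(len(link_box))  # 2 -- both calls' results, not just the last
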
@@ -126,7 +112,7 @@ def link_find(url):
    #out.append(rawp)
    #out.append("HTML fragments: ")
    node1 = {"URL":url,"TITLE":soup.title,"STRING":soup.description,"TEXT":rawt,"LINKS":[],"TREE":[]}
-    node2 = {"URL":url,"LINKS":[],"TREE":[]}
+    node2 = {"URL":url,"LINK_KEY":[],"LINKS":[],"TREE":[]}
 
    q=("a","p","span","content","article")
    for p in soup.find_all("a"):
@@ -152,12 +138,14 @@ def link_find(url):
                node1['TREE'].append({"URL":uri,"TITLE":p.get('title'),"STRING":p.string,"TEXT":"","LINKS":[],"TREE":[]})
                node2['TREE'].append({"URL":uri,"LINKS":[],"TREE":[]})
                node2['LINKS'].append(uri)
-
+                #node2['LINK_KEY'].append(uri_key)
+                link_box.append(uri)
                #out.append({"URL":p.get('href'),"TITLE":p.get('title'),"STRING":p.string,"TEXT":"","TREE":[]})
 
            else:
                print("NO")
                pass
+
    return node1,node2
 #https://huggingface.co/spaces/Omnibus/crawl
 
@@ -199,7 +187,9 @@ def sitemap(url,level):
                print (e)
        except Exception as e:
            print (e)
-    return link1,link2
+    uri_key=sort_doc(uri)
+
+    return link1,link2,uri_key
 
 
 
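The new uri_key=sort_doc(uri) call passes a single value rather than a list, and since sort_doc now uses its argument as-is (sen_list=in_list), the argument's type decides what gets keyed: a string is indexed character by character, a list per element. A quick illustration of the difference (values are placeholders):

url = "https://example.com"
links = [url, url + "/a"]

print(len(url), url[0])      # 19 'h' -- a string would be keyed per character
print(len(links), links[0])  # 2 'https://example.com' -- a list, per link

If per-link keys are the intent, passing the accumulated link_box (or another list of URLs) would give one key per URL.
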
@@ -243,8 +233,9 @@ with gr.Blocks() as app:
            inp=gr.Textbox(label="URL")
            level=gr.Slider(minimum=1,maximum=2,step=1,value=1)
            btn=gr.Button()
+            key_json=gr.JSON()
            outp=gr.JSON()
        with gr.Column(scale=1):
            outmap=gr.JSON()
-    btn.click(sitemap,[inp,level],[outp,outmap])
+    btn.click(sitemap,[inp,level],[outp,outmap,key_json])
    app.launch()
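
btn.click now lists three outputs; Gradio maps a handler's return values onto the outputs positionally, so sitemap's (link1, link2, uri_key) land in (outp, outmap, key_json). A self-contained sketch of the same wiring with a stand-in handler:

import gradio as gr

def fake_sitemap(url, level):
    # stand-in returning three values in sitemap's order
    node1 = {"URL": url, "TEXT": ""}
    node2 = {"URL": url, "LINKS": []}
    uri_key = {"key": url}
    return node1, node2, uri_key

with gr.Blocks() as demo:
    inp = gr.Textbox(label="URL")
    level = gr.Slider(minimum=1, maximum=2, step=1, value=1)
    btn = gr.Button()
    key_json = gr.JSON()
    outp = gr.JSON()
    outmap = gr.JSON()
    # return values map positionally onto this outputs list
    btn.click(fake_sitemap, [inp, level], [outp, outmap, key_json])

demo.launch()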
 