Omnibus committed on
Commit
1b682b7
1 Parent(s): 3b1a5cc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -3
app.py CHANGED
@@ -6,6 +6,7 @@ def link_find(url):
6
  out = []
7
  source = requests.get(url)
8
  if source.status_code ==200:
 
9
  #soup = bs4.BeautifulSoup(source.content,'lxml')
10
  soup = bs4.BeautifulSoup(source.content,'html.parser')
11
 
@@ -15,7 +16,7 @@ def link_find(url):
15
  rawt=soup.text
16
  #out.append(rawp)
17
  #out.append("HTML fragments: ")
18
- node1 = ({"URL":url,"TITLE":soup.title,"STRING":soup.description,"TEXT":rawt,"TREE":[]})
19
 
20
  q=("a","p","span","content","article")
21
  for p in soup.find_all("a"):
@@ -23,6 +24,7 @@ def link_find(url):
23
  #out.append({"URL":p.get('href'),"TITLE":p.get('title'),"STRING":p.string,"TEXT":"","TREE":[]})
24
 
25
  else:
 
26
  pass
27
  return node1
28
  #https://huggingface.co/spaces/Omnibus/crawl
@@ -42,7 +44,7 @@ def sitemap(url,level):
42
  uri=f'{uri1}//{uri3}'
43
  print(uri)
44
  out_list=link_find(f"{uri}{ea['URL']}")
45
- #link1[i]['TREE']=out_list
46
  if level>=3:
47
  for n,na in enumerate(link1['TREE'][i]['TREE']):
48
  print(na)
@@ -54,7 +56,7 @@ def sitemap(url,level):
54
  uri0=f'{uri11}//{uri33}'
55
  print(uri0)
56
  out_list1=link_find(f"{uri0}{na['URL']}")
57
- #link1[i]['TREE'][n]['TREE']=out_list1
58
  except Exception as e:
59
  print (e)
60
  except Exception as e:
 
6
  out = []
7
  source = requests.get(url)
8
  if source.status_code ==200:
9
+ print("YES")
10
  #soup = bs4.BeautifulSoup(source.content,'lxml')
11
  soup = bs4.BeautifulSoup(source.content,'html.parser')
12
 
 
16
  rawt=soup.text
17
  #out.append(rawp)
18
  #out.append("HTML fragments: ")
19
+ node1 = {"URL":url,"TITLE":soup.title,"STRING":soup.description,"TEXT":rawt,"TREE":[]}
20
 
21
  q=("a","p","span","content","article")
22
  for p in soup.find_all("a"):
 
24
  #out.append({"URL":p.get('href'),"TITLE":p.get('title'),"STRING":p.string,"TEXT":"","TREE":[]})
25
 
26
  else:
27
+ print("NO")
28
  pass
29
  return node1
30
  #https://huggingface.co/spaces/Omnibus/crawl
 
44
  uri=f'{uri1}//{uri3}'
45
  print(uri)
46
  out_list=link_find(f"{uri}{ea['URL']}")
47
+ link1['TREE'][i]['TREE']=out_list
48
  if level>=3:
49
  for n,na in enumerate(link1['TREE'][i]['TREE']):
50
  print(na)
 
56
  uri0=f'{uri11}//{uri33}'
57
  print(uri0)
58
  out_list1=link_find(f"{uri0}{na['URL']}")
59
+ link1['TREE'][i]['TREE'][n]['TREE']=out_list1
60
  except Exception as e:
61
  print (e)
62
  except Exception as e: