import gradio as gr
import urllib.request
import requests
import bs4
import lxml
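
# A small BeautifulSoup scraping demo: dump a page's tags, search for one tag
# name, or list its links, behind a Gradio UI defined at the bottom of the file.
# (lxml is imported so the 'lxml' parser backend used below is available.)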

def find_all(url, q=None, num=None):
    rawp = []
    source = urllib.request.urlopen(url).read()
    soup = bs4.BeautifulSoup(source, 'lxml')
    # title of the page
    print(soup.title)
    # the tag's name:
    print(soup.title.name)
    # the tag's string contents:
    print(soup.title.string)
    # beginning navigation:
    print(soup.title.parent.name)
    # getting specific values:
    print(soup.p)
    print(soup.find_all('p'))
    # walk every tag, recording its string, its parent, and its direct children
    for tag in soup.find_all():
        try:
            rawp.append({
                tag.name: tag.string,
                "parent": tag.parent.name,
                "children": [child.name for child in tag.find_all(recursive=False)],
            })
        except Exception as e:
            print(e)
            rawp.append({tag.name: tag.string})
    # print every hyperlink on the page
    for link in soup.find_all('a'):
        print(link.get('href'))
    print(soup.get_text())
    return rawp
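
# For example, find_all("https://example.com") would print the page's title and
# first paragraph and return entries like
# {"html": None, "parent": "[document]", "children": ["head", "body"]}.
# ("https://example.com" is only an illustrative URL.)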


def find_it(url, q=None, num=None):
    out = []
    source = urllib.request.urlopen(url).read()
    soup = bs4.BeautifulSoup(source, 'lxml')
    # collect every tag matching the query, plus its parent's name
    for p in soup.find_all(q):
        out.append([{q: p.string}, {"parent": p.parent.name}])
    # print every hyperlink on the page
    for link in soup.find_all('a'):
        print(link.get('href'))
    print(soup.get_text())
    return out
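
# For example, find_it("https://example.com", "p") would return something like
# [[{"p": "Example paragraph text."}, {"parent": "body"}], ...], one entry per
# <p> tag found. (Illustrative URL and text.)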
    
def find_it2(url):
    # same link listing, but fetched with requests instead of urllib
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = bs4.BeautifulSoup(response.content, 'lxml')
        out = 'URL Links:\n' + '\n'.join(p.text for p in soup.find_all('a'))
        return out
    except Exception as e:
        print(e)
        return str(e)
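
# find_it2 is a requests-based alternative that is not wired into the UI below;
# calling find_it2("https://example.com") would return the text of every link
# on that page, prefixed with "URL Links:".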


with gr.Blocks() as app:
    with gr.Row():
        inp = gr.Textbox(label="URL")
        q = gr.Textbox(label="Tag name", value="p")
        num = gr.Number(value=1)
    with gr.Row():
        all_btn = gr.Button("Load")
        find_btn = gr.Button("Find")
    with gr.Row():
        rawp = gr.JSON()
        outp = gr.JSON()

    all_btn.click(find_all, [inp, q, num], [rawp])
    find_btn.click(find_it, [inp, q, num], [outp])

app.launch()