Niv Sardi committed · Commit 485f76b · Parent(s): 05802f8

implement basic python crawler

Signed-off-by: Niv Sardi <xaiki@evilgiggle.com>
- README.org +15 -0
- crawler/common/selectors.py +7 -0
- crawler/entity.py +46 -0
- crawler/main.py +50 -0
- crawler/screenshot.py +28 -0
- crawler/vendor.py +70 -0
- detect.js +7 -2
README.org
ADDED
@@ -0,0 +1,15 @@
#+TITLE: Spoof Detect

* yolo
https://github.com/ModelDepot/tfjs-yolo-tiny
https://github.com/Hyuto/yolov5-tfjs

** augmentation
https://github.com/srp-31/Data-Augmentation-for-Object-Detection-YOLO-


* providers
http://www.bcra.gov.ar/SistemasFinancierosYdePagos/Proveedores-servicios-de-pago-ofrecen-cuentas-de-pago.asp
http://www.bcra.gov.ar/SistemasFinancierosYdePagos/Proveedores-servicios-de-billeteras-digitales-Interoperables.asp

http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp
crawler/common/selectors.py
ADDED
@@ -0,0 +1,7 @@
#!/usr/bin/env python3

logo = "img[src*=logo]"
logosbancos = "img[src*=logosbancos]"

entity_http = "p.post-pagina-interior a[target=_blank][href*=http]"
entity_mailto = "p.post-pagina-interior a[target=_blank][href*=mailto]"
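These selectors are shared by the crawler modules below. As a rough sketch of how they are meant to be applied against a BCRA entity page (the entity code '00330' is invented here, purely for illustration):

import requests
from bs4 import BeautifulSoup

from common import selectors

URL = "http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp"
page = requests.post(URL, data={'bco': '00330'})  # invented entity code
soup = BeautifulSoup(page.content, "html.parser")

print(soup.select_one(selectors.logosbancos))  # the bank's logo <img>, or None
print(soup.select_one(selectors.entity_http))  # link to the bank's own site, or None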
crawler/entity.py
ADDED
@@ -0,0 +1,46 @@
#!/usr/bin/env python3

class Entity():
    def __init__(self, name, bco, url=None, logo=None):
        self.name = name
        self.bco = bco
        self.url = url
        self.logo = logo

    def __repr__(self):
        return f"""
Entity:
    name: {self.name}
    bco: {self.bco}
    url: {self.url}
    logo: {self.logo}
"""

    @classmethod
    def from_list(cls, l):
        # apply() is Python 2 only; unpack the list into the constructor instead
        return cls(*l)

    @classmethod
    def from_dict(cls, d):
        self = cls(None, None)

        for f in d.keys():
            setattr(self, f, d[f])
        return self

    @classmethod
    def row_names(cls):
        return ['name', 'bco', 'url', 'logo']

    @property
    def DATA_PATH(self):
        return f"./data/{self.bco}"

    def to_row(self):
        return [self.name, self.bco, self.url, self.logo]

if __name__ == '__main__':
    e = Entity.from_dict({'url': 'blah'})
    assert(e.url == 'blah')
    print(e)
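For context, from_list is the inverse of to_row; a minimal round-trip sketch (the row values are invented):

from entity import Entity

row = ['Banco Ejemplo', '00000', 'http://example.com', None]  # invented values
e = Entity.from_list(row)
assert e.to_row() == row
assert e.DATA_PATH == './data/00000'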
crawler/main.py
ADDED
@@ -0,0 +1,50 @@
import csv

import requests
from bs4 import BeautifulSoup
from progress.bar import ChargingBar

from entity import Entity
from common import selectors

URL = "http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp"
page = requests.get(URL)

soup = BeautifulSoup(page.content, "html.parser")

options = soup.find(class_="form-control").find_all('option')
with open('entidades.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(Entity.row_names())

    bar = ChargingBar('Processing', max=len(options) - 1)  # options[0] is skipped below
    for o in options[1:]:
        e = Entity(
            name = o.text,
            bco = o.attrs['value']
        )
        page = requests.post(URL, data={'bco': e.bco})
        soup = BeautifulSoup(page.content, "html.parser")
        try:
            img = soup.select_one(selectors.logosbancos).attrs['src']
            img = img.replace("../", "https://www.bcra.gob.ar/")
        except AttributeError as err:
            print('img', e.name, err)
            img = None
        e.logo = img

        a = soup.select_one(selectors.entity_http)
        try:
            a = a.attrs['href']
        except AttributeError:
            a = soup.select_one(selectors.entity_mailto)
            try:
                a = 'http://' + a.attrs['href'].split('@')[1]
            except (AttributeError, IndexError):
                # no mailto link either, or a malformed href
                print('ERROR', a)

        e.url = a
        writer.writerow(e.to_row())
        bar.next()
    bar.finish()
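To sanity-check the output, entidades.csv can be read back through Entity.from_dict, the same way vendor.py consumes it below:

import csv

from entity import Entity

with open('entidades.csv', newline='') as csvfile:
    for d in csv.DictReader(csvfile):
        print(Entity.from_dict(d))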
crawler/screenshot.py
ADDED
@@ -0,0 +1,28 @@
#!/usr/bin/env python3
#
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

from common import selectors
from entity import Entity

options = webdriver.FirefoxOptions()
options.add_argument("--headless")
options.add_argument("--window-size=1920x8000")

driver = webdriver.Firefox(options=options)

def sc_entity(e: Entity):
    print(e)
    driver.get(e.url)
    driver.save_screenshot(f"{e.DATA_PATH}/screenshot.png")
    driver.save_full_page_screenshot(f"{e.DATA_PATH}/screenshot.full.png")

    logos = driver.find_elements(By.CSS_SELECTOR, selectors.logo)
    with open(f"{e.DATA_PATH}/logo.pos", 'w') as f:
        for i in logos:
            f.write(repr(i.rect))
            print(i.get_attribute('src'), i.rect)

if __name__ == '__main__':
    sc_entity(Entity.from_dict({'url': 'http://www.bbva.com.ar', 'bco': 'debug'}))
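Note that this module opens a single headless Firefox at import time and never quits it. A sketch of batch use with cleanup (assumes geckodriver is on PATH, entidades.csv exists, and the per-entity data directories were already created, e.g. by vendor.py):

import csv

from entity import Entity
import screenshot

with open('entidades.csv', newline='') as csvfile:
    for d in csv.DictReader(csvfile):
        screenshot.sc_entity(Entity.from_dict(d))
screenshot.driver.quit()  # release the browser once done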
crawler/vendor.py
ADDED
@@ -0,0 +1,70 @@
#!/usr/bin/env python3
import pathlib
import ssl
import shutil
import csv
import concurrent.futures
import requests
from bs4 import BeautifulSoup
from progress.bar import ChargingBar

from entity import Entity
from common import selectors
import screenshot

def query_vendor_site(e: Entity):
    pathlib.Path(f"./data/{e.bco}").mkdir(parents=True, exist_ok=True)

    ssl_url = e.url.split("/")[2]
    try:
        page = requests.get(e.url)
    except Exception:
        # retry over https; replace only the scheme so an https url isn't mangled
        page = requests.get(e.url.replace('http://', 'https://', 1))
    soup = BeautifulSoup(page.content, "html.parser")

    logos = soup.select(selectors.logo)
    cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)

    cert_fn = f"{e.DATA_PATH}/cert"
    with open(cert_fn, 'w') as f:
        f.write(cert)
    i = 0
    lfn = []
    for l in logos:
        src = l.attrs['src']
        ext = src.split('.')[-1].split('/')[-1]
        try:
            res = requests.get(src, stream=True)
        except Exception:
            # relative src: resolve it against the entity url
            res = requests.get(f"{e.url}/{src}", stream=True)

        fn = f"{e.DATA_PATH}/{i}.{ext}"
        with open(fn, "wb") as f:
            shutil.copyfileobj(res.raw, f)
        lfn.append(fn)
        i += 1
    screenshot.sc_entity(e)
    # return the cert path, not the last logo path, alongside the logo list
    return (cert_fn, lfn)

def from_csv(fn):
    with open(fn, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            futures = {executor.submit(query_vendor_site, e): e for e in [Entity.from_dict(d) for d in reader]}
            bar = ChargingBar('Processing', max=len(futures))
            for f in concurrent.futures.as_completed(futures):
                entity = futures[f]
                try:
                    (cert, logos) = f.result()
                except Exception as exc:
                    print('%r generated an exception: %s' % (entity, exc))
                else:
                    print(cert, logos)
                bar.next()
            bar.finish()

#query_vendor_site('http://www.bancoprovincia.com.ar', 'debug')
#exit()

if __name__ == '__main__':
    from_csv('entidades.csv')
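One caveat: query_vendor_site runs in worker threads but ends by calling screenshot.sc_entity, which drives the single shared Selenium session, and Selenium drivers are not thread-safe. A lock around the screenshot step is one way to keep the workers from interleaving (a sketch, not part of this commit):

import threading

import screenshot

sc_lock = threading.Lock()

def sc_entity_serialized(e):
    # only one worker may touch the shared driver at a time
    with sc_lock:
        screenshot.sc_entity(e)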
detect.js
CHANGED
@@ -1,3 +1,8 @@
+let run = () => {
+
+
+}
+
 // Set up a mutation observer to listen for title changes
 // Will fire if framework AJAX stuff switches page title
 let createObserver = function() {
@@ -8,7 +13,7 @@ let createObserver = function() {
     console.log('Mutations!', mutations)
     observer.disconnect()
     observer = null
-
+    run()
     createObserver()
   })
 
@@ -20,4 +25,4 @@ let createObserver = function() {
 createObserver()
 
 // Kick off initial page load check
-
+run()