|
|
|
import time |
|
import ujson |
|
from random import randint |
|
from typing import Dict, List, Any |
|
|
|
import requests |
|
from bs4 import BeautifulSoup |
|
from selenium import webdriver |
|
from selenium.common.exceptions import NoSuchElementException |
|
from webdriver_manager.chrome import ChromeDriverManager |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def write_authors(list1, file_name):
    """Write every entry of *list1* to *file_name*, one entry per line.

    The file is opened in write mode with UTF-8 encoding, so any existing
    content is replaced. Each entry is followed by a newline character.

    Args:
        list1: Iterable of strings (e.g. profile URLs). Any iterable is
            accepted; it does not need to support ``len()`` or indexing.
        file_name: Path of the text file to create or overwrite.
    """
    with open(file_name, 'w', encoding='utf-8') as f:
        # Iterate the entries directly instead of indexing via range(len(...)).
        f.writelines(entry + '\n' for entry in list1)
|
|
|
|
|
# Absolute URL prefix that identifies a person-profile link in the listing markup.
_PROFILE_URL_PREFIX = 'https://pureportal.coventry.ac.uk/en/persons/'
# CSS selector of the heading element holding the profile owner's name.
_AUTHOR_HEADING_SELECTOR = "div[class='header person-details']>h1"
# Upper bound (seconds) for each plain HTTP fetch so a stalled server cannot hang the run.
_REQUEST_TIMEOUT_SECONDS = 30


def _collect_profile_links(driver, max_profiles):
    """Walk the paginated listing in *driver* and return at most *max_profiles* profile URLs."""
    links = []
    while True:
        soup = BeautifulSoup(driver.page_source, "lxml")
        for anchor in soup.findAll('a', class_='link person'):
            markup = str(anchor)
            start = markup.find(_PROFILE_URL_PREFIX)
            if start == -1:
                # No absolute profile URL in this anchor; slicing from -1
                # would produce a bogus one-character "URL".
                continue
            links.append(markup[start:].split('"')[0])

        if len(links) >= max_profiles:
            break
        try:
            next_button = driver.find_element_by_css_selector(".nextLink")
        except NoSuchElementException:
            # Last page of the listing: there is no "next" control.
            break
        if not next_button.is_enabled():
            break
        driver.execute_script("arguments[0].click();", next_button)

    # A whole page is added at a time, so trim any overshoot past the cap.
    return links[:max(max_profiles, 0)]


def _open_research_output(driver):
    """Follow the profile's "research output" button if present; return True when followed."""
    for button in driver.find_elements_by_css_selector(".portal_link.btn-primary.btn-large"):
        if "research output" in button.text.lower():
            driver.execute_script("arguments[0].click();", button)
            driver.get(driver.current_url)
            # Stop here: after navigating away, the remaining button handles
            # refer to the previous page and can no longer be queried.
            return True
    return False


def _parse_publications(container, author):
    """Turn the result rows inside *container* into publication dicts credited to *author*."""
    records = []
    if container is None:
        return records
    for row in container.findAll('div', attrs={'class': 'result-container'}):
        try:
            title_link = row.h3.a
            record = {
                'name': title_link.text,
                'pub_url': title_link['href'],
                'cu_author': author,
                'date': row.find("span", class_="date").text,
            }
        except (AttributeError, KeyError, TypeError):
            # A row without a title link or date must not discard the
            # remaining rows of the same profile.
            print("Skipping malformed publication entry for", author)
            continue
        print("Publication Name :", record['name'])
        print("Publication URL :", record['pub_url'])
        print("CU Author :", record['cu_author'])
        print("Date :", record['date'])
        print("\n")
        records.append(record)
    return records


def _scrape_profile(driver, link):
    """Load one profile page and return the list of publication dicts found for it."""
    driver.get(link)
    on_output_page = _open_research_output(driver)
    author = driver.find_element_by_css_selector(_AUTHOR_HEADING_SELECTOR).text

    page_url = driver.current_url if on_output_page else link
    response = requests.get(page_url, timeout=_REQUEST_TIMEOUT_SECONDS)
    soup = BeautifulSoup(response.content, 'lxml')

    if on_output_page:
        # Dedicated research-output page: publications are in a results list.
        container = soup.find('ul', attrs={'class': 'list-results'})
    else:
        # No research-output button was followed: read the publications
        # block embedded in the profile page itself.
        container = soup.find('div', attrs={'class': 'relation-list relation-list-publications'})
    return _parse_publications(container, author)


def initCrawlerScraper(seed, max_profiles=500, authors_file='Authors_URL.txt',
                       results_file='scraper_results.json'):
    """Crawl the person listing at *seed* and scrape each profile's publications.

    The function starts a headless Chrome session, pages through the listing
    collecting profile URLs, writes those URLs to *authors_file*, visits each
    profile to extract its publications, and finally dumps all publication
    records as JSON to *results_file*. Progress is reported with ``print``.

    Args:
        seed: URL of the first page of the person listing.
        max_profiles: Maximum number of profile URLs to collect and scrape.
        authors_file: Path of the text file receiving one profile URL per line.
        results_file: Path of the JSON file receiving the publication records.
            Each record has the keys ``name``, ``pub_url``, ``cu_author`` and
            ``date``.

    Returns:
        None. Results are delivered through the two output files.
    """
    options = webdriver.ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--incognito')
    options.headless = True
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

    pub_data = []
    try:
        driver.get(seed)
        print("Crawler has begun...")
        links = _collect_profile_links(driver, max_profiles)
        print("Crawler has found ", len(links), " pureportal profiles")
        write_authors(links, authors_file)

        print("Scraping publication data for ", len(links), " pureportal profiles...")
        for link in links:
            # Pause between profiles to avoid hammering the portal.
            time.sleep(1)
            try:
                pub_data.extend(_scrape_profile(driver, link))
            except Exception as exc:
                # Best-effort crawl: one broken profile must not abort the
                # run, but the failure is reported instead of being hidden.
                print("Skipping profile", link, "-", exc)

        print("Crawler has scrapped data for ", len(pub_data), " pureportal publications")
    finally:
        # Always release the browser process, even when the crawl fails.
        driver.quit()

    with open(results_file, 'w', encoding='utf-8') as f:
        ujson.dump(pub_data, f)
|
|
|
|
|
# Run the crawl against the Coventry University person listing, capped at 500 profiles.
initCrawlerScraper(
    seed='https://pureportal.coventry.ac.uk/en/organisations/coventry-university/persons/',
    max_profiles=500,
)
|
|
|
|
|
|
|
|