|
const fs = require('fs'); |
|
const db = require('./db'); |
|
const config = require('./config'); |
|
const { getName } = require('country-list'); |
|
const YAML = require('yaml'); |
|
const js2xml = require('xml-js'); |
|
|
|
describe('Scrapers', function () { |
|
|
|
// Raise Jasmine's default per-spec timeout to 600000 ms (10 minutes).
jasmine.DEFAULT_TIMEOUT_INTERVAL = 600000; |
|
|
|
// Accumulator shared by every spec in this suite: scraper specs append proxy
// records to bySource[<scraperId>] and to byType[<bucket>], and the 'Save'
// spec serialises the result.
let proxies = { lastUpdated: new Date(), bySource: {}, byType: { socks5: [], socks4: [], http: [] } } |
|
|
|
it('Proxynova Scraper', function () {
    // Visits the per-country proxy tables on proxynova.com one page at a time.
    // Each table row with enough cells becomes a record
    // { ip, port, country, anonymity, type } that is appended to
    // proxies.bySource.proxynova and to the matching proxies.byType bucket.
    // The promise chain is returned so the spec is only considered finished
    // once every page has been processed.
    const scraperId = 'proxynova';
    const countryCodes = [
        'bd', 'br', 'cl', 'cn', 'co', 'fr', 'de', 'hk', 'in', 'id',
        'jp', 'ke', 'nl', 'pl', 'ru', 'rs', 'kr', 'tw', 'th', 'ua',
        'gb', 'us', 've', 'ir', 'tr', 'na', 'mz', 'it', 'eg', 'bg',
    ];
    // Built from the country codes so that no URL carries stray whitespace.
    const pages = countryCodes.map((code) => `https://www.proxynova.com/proxy-server-list/country-${code}/`);
    let pageIndex = 0;
    let proxyFound = 0;
    proxies.bySource[scraperId] = [];

    function logFailure(err) {
        console.log("Exception Occured! in " + scraperId, err.stack);
    }

    // Files a completed record under its source and its type bucket.
    function store(proxy) {
        const bucket = getTypeMapping(proxy.type);
        proxies.bySource[scraperId].push(proxy);
        if (proxies.byType[bucket] == null) proxies.byType[bucket] = [];
        proxies.byType[bucket].push(proxy);
        proxyFound++;
    }

    // Reads one <tr>. Rows with fewer than 7 cells are skipped because the
    // record needs cells 0, 1, 5 and 6.
    function readRow(row) {
        return row.findElements(by.tagName('td')).then((cols) => {
            if (cols.length < 7) return undefined;
            const proxy = {};
            // Each step returns the next getText() promise so the fields are
            // filled strictly in order before the record is stored.
            return cols[0].getText()
                .then((text) => {
                    proxy.ip = text;
                    return cols[1].getText();
                })
                .then((text) => {
                    proxy.port = text;
                    return cols[5].getText();
                })
                .then((text) => {
                    proxy.country = text;
                    return cols[6].getText();
                })
                .then((text) => {
                    proxy.anonymity = text;
                    proxy.type = 'HTTP/HTTPS';
                    store(proxy);
                })
                .catch(logFailure);
        });
    }

    // Loads the next page, reads all of its rows, then recurses until the
    // page list is exhausted. A failure on a page is logged and ends the run
    // for this scraper.
    function loadPage() {
        const page = pages[pageIndex++];
        const navigation = browser.driver.get(page);

        console.log(scraperId + " Visiting... " + page);
        return navigation
            .then(() => browser.driver.findElement(by.tagName('tbody')).findElements(by.tagName('tr')))
            .then((rows) => Promise.all(rows.map(readRow)))
            .then(() => {
                if (pageIndex < pages.length) {
                    return loadPage();
                }
                console.log(`Got ${proxyFound} proxies from ${scraperId}`);
                return undefined;
            })
            .catch(logFailure);
    }

    return loadPage();
});
|
|
|
it('US-proxy.org Scraper', function () {
    // Visits the free-proxy-list.net family of sites one page at a time and
    // turns each table row into a record { ip, port, country, anonymity, type }
    // appended to proxies.bySource.usproxy and to the matching byType bucket.
    // The promise chain is returned so the spec waits for all pages.
    const scraperId = 'usproxy';
    const pages = [
        'https://free-proxy-list.net/',
        'https://www.socks-proxy.net/',
        'https://www.us-proxy.org/',
        'https://free-proxy-list.net/uk-proxy.html',
        'https://www.sslproxies.org/',
        'https://free-proxy-list.net/anonymous-proxy.html',
    ];
    let pageIndex = 0;
    let proxyFound = 0;
    proxies.bySource[scraperId] = [];

    // Files a completed record under its source and its type bucket.
    function store(proxy) {
        const bucket = getTypeMapping(proxy.type);
        proxies.bySource[scraperId].push(proxy);
        if (proxies.byType[bucket] == null) proxies.byType[bucket] = [];
        proxies.byType[bucket].push(proxy);
        proxyFound++;
    }

    // Reads one <tr>. Rows with fewer than 5 cells (for example header rows
    // without <td>) are skipped because the record needs cells 0, 1 and 4.
    function readRow(row) {
        return row.findElements(by.tagName('td')).then((cols) => {
            if (cols.length < 5) return undefined;
            const proxy = {};
            return cols[0].getText()
                .then((text) => {
                    proxy.ip = text;
                    return cols[1].getText();
                })
                .then((text) => {
                    proxy.port = text;
                    return cols[4].getText();
                })
                .then((text) => {
                    // NOTE(review): country and anonymity are both taken from
                    // cell 4, exactly as before. This looks like a copy-paste
                    // slip (country presumably lives in another column) --
                    // confirm against the live table layout before changing.
                    proxy.country = text;
                    proxy.anonymity = text;
                    // NOTE(review): the type is hard-coded even though the
                    // page list includes socks-proxy.net -- verify.
                    proxy.type = 'HTTP/HTTPS';
                    store(proxy);
                })
                .catch((err) => {
                    console.log("Exception Occured! in " + scraperId, err);
                });
        });
    }

    // Loads the next page, reads all of its rows, then recurses until the
    // page list is exhausted. A failure on a page is logged and ends the run
    // for this scraper.
    function loadPage() {
        const page = pages[pageIndex++];
        const navigation = browser.driver.get(page);

        console.log(scraperId + " Visiting... " + page);
        return navigation
            .then(() => browser.driver.findElement(by.className('table-responsive fpl-list')).findElements(by.tagName('tr')))
            .then((rows) => Promise.all(rows.map(readRow)))
            .then(() => {
                if (pageIndex < pages.length) {
                    return loadPage();
                }
                console.log(`Got ${proxyFound} proxies from ${scraperId}`);
                return undefined;
            })
            .catch((err) => {
                console.log("Exception Occured! in " + scraperId, err.stack);
            });
    }

    return loadPage();
});
|
|
|
it('openproxy.space Scraper', function () {
    // Loads the openproxy.space HTTP list and reads the page's __NUXT__
    // global. The code expects __NUXT__.data[0].data to be a list of
    // { code, items } entries where each item is an "ip:port" string; every
    // item becomes a record appended to proxies.bySource.openproxy and to the
    // matching byType bucket.
    //
    // scraperId is declared outside the try block so that the catch handler
    // can reference it (a block-scoped declaration inside `try` is not
    // visible in `catch`).
    const scraperId = 'openproxy';
    const page = 'https://openproxy.space/list/http';
    let proxyFound = 0;

    const logFailure = (error) => {
        console.log("Exception Occured! in " + scraperId, error.stack);
    };

    // Files a completed record under its source and its type bucket.
    const store = (proxy) => {
        const bucket = getTypeMapping(proxy.type);
        proxies.bySource[scraperId].push(proxy);
        if (proxies.byType[bucket] == null) proxies.byType[bucket] = [];
        proxies.byType[bucket].push(proxy);
        proxyFound++;
    };

    try {
        proxies.bySource[scraperId] = [];

        const navigation = browser.driver.get(page);

        console.log(scraperId + " Visiting... " + page);
        // Returned so the spec waits for the script result.
        return navigation
            .then(() => browser.driver.executeScript('return __NUXT__'))
            .then((nuxt) => {
                const proxyList = nuxt.data[0].data;
                proxyList.forEach((country) => {
                    country.items.forEach((proxyItem) => {
                        const [ip, port] = proxyItem.split(':');
                        const proxy = {};
                        proxy.country = getName(country.code);
                        proxy.ip = ip;
                        proxy.port = port;
                        proxy.type = 'unknown';
                        proxy.anonymity = 'unknown';
                        store(proxy);
                    });
                });
                console.log(`Got ${proxyFound} proxies from ${scraperId}`);
            })
            .catch(logFailure);
    } catch (error) {
        // Synchronous failures while scheduling the browser commands.
        logFailure(error);
        return undefined;
    }
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
it('freeproxy.world Scraper', function () {
    // Walks the paginated freeproxy.world listing (page=0 through page=50)
    // and turns each table row into a record
    // { ip, port, country, type, anonymity } appended to
    // proxies.bySource.freeproxy and to the matching byType bucket.
    // The promise chain is returned so the spec waits for all pages.
    const scraperId = 'freeproxy';
    const pages = [];
    for (let i = 0; i <= 50; i++) {
        pages.push("https://www.freeproxy.world/?type=&anonymity=&country=&speed=&port=&page=" + i);
    }
    let pageIndex = 0;
    let proxyFound = 0;
    proxies.bySource[scraperId] = [];

    // Files a completed record under its source and its type bucket.
    // proxyFound is incremented here only, once per stored record, so the
    // final "Got N proxies" line matches what was actually collected.
    function store(proxy) {
        const bucket = getTypeMapping(proxy.type);
        proxies.bySource[scraperId].push(proxy);
        if (proxies.byType[bucket] == null) proxies.byType[bucket] = [];
        proxies.byType[bucket].push(proxy);
        proxyFound++;
    }

    // Reads one <tr>. Rows with fewer than 7 cells are skipped because the
    // record needs cells 0, 1, 2, 5 and 6.
    function readRow(row) {
        return row.findElements(by.tagName('td')).then((cols) => {
            if (cols.length < 7) return undefined;
            const proxy = {};
            return cols[0].getText()
                .then((text) => {
                    proxy.ip = text;
                    return cols[1].getText();
                })
                .then((text) => {
                    proxy.port = text;
                    return cols[2].getText();
                })
                .then((text) => {
                    proxy.country = text;
                    return cols[5].getText();
                })
                .then((text) => {
                    proxy.type = text;
                    return cols[6].getText();
                })
                .then((text) => {
                    proxy.anonymity = text;
                    store(proxy);
                })
                .catch((err) => {
                    console.log("Exception Occured! in " + scraperId, err);
                });
        });
    }

    // Loads the next page, reads all of its rows, then recurses until the
    // page list is exhausted. A failure on a page is logged and ends the run
    // for this scraper.
    function loadPage() {
        const page = pages[pageIndex++];
        const navigation = browser.driver.get(page);

        console.log(scraperId + " Visiting... " + page);
        return navigation
            .then(() => browser.driver.findElement(by.tagName('tbody')).findElements(by.tagName('tr')))
            .then((rows) => Promise.all(rows.map(readRow)))
            .then(() => {
                if (pageIndex < pages.length) {
                    return loadPage();
                }
                console.log(`Got ${proxyFound} proxies from ${scraperId}`);
                return undefined;
            })
            .catch((err) => {
                console.log("Exception Occured! in " + scraperId, err.stack);
            });
    }

    return loadPage();
});
|
|
|
it('geonode.com Scraper', function () {
    // Opens the geonode proxy-list API URL in the browser, reads the body
    // text, parses it as JSON and converts every entry of its `data` array
    // into a record { country, ip, port, type, anonymity } appended to
    // proxies.bySource.geonode and to the matching byType bucket.
    // The promise chain is returned (and ends in a catch, like the other
    // scrapers) so the spec waits for the result and a browser failure is
    // logged instead of being left unhandled.
    const scraperId = 'geonode';
    const page = 'https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc&filterLastChecked=60';
    let proxyFound = 0;
    proxies.bySource[scraperId] = [];

    const navigation = browser.driver.get(page);

    console.log(scraperId + " Visiting... " + page);
    return navigation
        .then(() => browser.driver.executeScript('return document.getElementsByTagName("body")[0].innerText'))
        .then((bodyText) => {
            try {
                const proxyList = JSON.parse(bodyText).data;
                proxyList.forEach((proxyItem) => {
                    const proxy = {};
                    proxy.country = getName(proxyItem.country);
                    proxy.ip = proxyItem.ip;
                    proxy.port = proxyItem.port;
                    proxy.type = proxyItem.protocols.join(" | ");
                    proxy.anonymity = proxyItem.anonymityLevel;
                    proxyFound++;

                    const bucket = getTypeMapping(proxy.type);
                    proxies.bySource[scraperId].push(proxy);
                    if (proxies.byType[bucket] == null) proxies.byType[bucket] = [];
                    proxies.byType[bucket].push(proxy);
                });
            } catch (error) {
                // Malformed JSON or an unexpected payload shape; records
                // stored before the failure are kept.
                console.log('Error occured in ' + scraperId, error);
            }
            console.log(`Got ${proxyFound} proxies from ${scraperId}`);
        })
        .catch((err) => {
            console.log("Exception Occured! in " + scraperId, err.stack);
        });
});
|
|
|
it('Save', function () {
    // Serialises everything collected in `proxies`:
    //   - generated/raw.json            the whole accumulator
    //   - proxies.{csv,json,txt,yaml,xml}   all records from every source
    //   - generated/<type>_proxies.*    one set per socks5 / socks4 / http
    // Each record is also handed to saveIntoDb, and README.md's proxy-count
    // line is refreshed when more than 100 records were collected.
    //
    // Files are written synchronously so they are on disk before the spec
    // returns, and a failed write is reported instead of being logged as a
    // success.
    const xmlOptions = { compact: true, ignoreComment: true, spaces: 4 };

    // Writes one file; `label` is the name used in the log line.
    function writeOutput(filePath, label, content) {
        try {
            fs.writeFileSync(filePath, content);
            console.log("Successfully saved " + label);
        } catch (err) {
            console.log("Failed to save " + label, err);
        }
    }

    // Writes `list` in all five formats as <directory><baseName>.<ext>.
    function writeAllFormats(directory, baseName, list) {
        const target = directory + baseName;
        writeOutput(target + '.csv', baseName + '.csv', getCSV(list).join('\n'));
        writeOutput(target + '.json', baseName + '.json', JSON.stringify(list, null, 2));
        writeOutput(target + '.txt', baseName + '.txt', getTXT(list).join('\n'));
        writeOutput(target + '.yaml', baseName + '.yaml', YAML.stringify(list));
        writeOutput(target + '.xml', baseName + '.xml', js2xml.js2xml(list, xmlOptions));
    }

    writeOutput('generated/raw.json', 'raw.json', JSON.stringify(proxies, null, 2));

    // Flatten every per-source list into one array, persisting each record
    // to the database along the way.
    const jsonArray = [];
    Object.keys(proxies.bySource).forEach((source) => {
        const items = proxies.bySource[source];
        if (!Array.isArray(items)) return;
        items.forEach((item) => {
            jsonArray.push(item);
            saveIntoDb(item);
        });
    });

    writeAllFormats('', 'proxies', jsonArray);
    ['socks5', 'socks4', 'http'].forEach((type) => {
        writeAllFormats('generated/', type + '_proxies', proxies.byType[type]);
    });

    if (jsonArray.length > 100) {
        const startMarker = '<!-- dynamic-count-start -->';
        const endMarker = '<!-- dynamic-count-end -->';
        const README = fs.readFileSync('README.md', 'utf-8');
        const startIndex = README.indexOf(startMarker);
        const endIndex = README.indexOf(endMarker);

        // The replaced span starts one character after the start marker and
        // stops one character before the end marker (the surrounding line
        // breaks are kept).
        const lineStart = startIndex + startMarker.length + 1;
        const lineEnd = endIndex - 1;

        if (startIndex === -1 || endIndex === -1 || lineEnd < lineStart) {
            // Without both markers in the right order there is no span to
            // replace; leave the README untouched rather than corrupt it.
            console.log("README.md proxy count markers not found; README left unchanged");
            return;
        }

        const countLine = '## Current Proxy Count: ' + (Math.floor(jsonArray.length / 100) * 100) + '+ π';
        // Splice by position so only the text between the markers changes.
        fs.writeFileSync('README.md', README.slice(0, lineStart) + countLine + README.slice(lineEnd));
    }
})
|
|
|
/**
 * Maps a scraped proxy type label to the key of the bucket it is stored
 * under in `proxies.byType`.
 *
 * The SOCKS labels map to 'socks5' / 'socks4' without any surrounding
 * whitespace, so they land in the buckets the suite pre-declares rather
 * than in separate, space-prefixed keys.
 *
 * @param {*} proxyType - Type label as scraped (matching is case-sensitive).
 * @returns {*} 'http', 'socks5' or 'socks4' for the known labels; any other
 *     value is returned unchanged and becomes its own bucket key.
 */
function getTypeMapping(proxyType) {
    switch (proxyType) {
        case 'HTTP/HTTPS':
        case 'unknown':
        case 'HTTPS':
        case 'HTTP':
            return 'http';
        case 'SOCKS5,SOCKS4':
        case 'SOCKS5':
            return 'socks5';
        case 'SOCKS4':
            return 'socks4';
        default:
            return proxyType;
    }
}
|
|
|
/**
 * Renders proxy records as CSV lines in the column order
 * ip, port, country, anonymity, type (no header line).
 *
 * A field that contains a comma, a double quote or a line break is wrapped
 * in double quotes with embedded quotes doubled, so values such as
 * "Korea, Republic of" stay in a single column. Fields without those
 * characters are emitted unchanged; null/undefined become an empty field.
 *
 * @param {Array<Object>} jsonArray - Proxy records.
 * @returns {string[]} One CSV line per record.
 */
function getCSV(jsonArray) {
    const escapeField = (value) => {
        if (value === null || value === undefined) return '';
        const text = String(value);
        if (!/[",\r\n]/.test(text)) return text;
        return '"' + text.replace(/"/g, '""') + '"';
    };

    return jsonArray.map((item) => {
        const values = [item.ip, item.port, item.country, item.anonymity, item.type];
        return values.map(escapeField).join(',');
    });
}
|
|
|
/**
 * Renders proxy records as plain "ip:port" lines.
 *
 * @param {Array<Object>} jsonArray - Proxy records; only `ip` and `port` are read.
 * @returns {string[]} One "ip:port" string per record, in input order.
 */
function getTXT(jsonArray) {
    // join() is used (rather than a template) so a missing ip or port
    // renders as an empty segment.
    return jsonArray.map(({ ip, port }) => [ip, port].join(':'));
}
|
|
|
/**
 * Inserts one proxy record into `proxies_tb` when `config.SAVE_TO_DB` is
 * truthy; does nothing otherwise. Query errors are logged, not thrown.
 *
 * The values come from scraped web pages, so they are passed as bound
 * parameters instead of being concatenated into the SQL text.
 * NOTE(review): this assumes `db.query` accepts the mysql-style
 * (sql, values, callback) signature with `?` placeholders -- confirm
 * against ./db.
 *
 * @param {Object} proxy - Record with ip, port, country, type and anonymity.
 */
function saveIntoDb(proxy) {
    if (!config.SAVE_TO_DB) return;

    const sql = "INSERT into proxies_tb (`proxy`,`port`,`country`,`type`,`anonymity`) VALUES(?, ?, ?, ?, ?)";
    const values = [proxy.ip, proxy.port, proxy.country, proxy.type, proxy.anonymity];

    db.query(sql, values, (err) => {
        if (err) console.log(err);
    });
}
|
}); |