Create wuzzuf_scraper.py
wuzzuf_scraper.py

import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Headless Chrome options so Selenium can run without a display; this assumes
# a chromedriver compatible with the installed Chrome is on the PATH.
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Wuzzuf scraper: collect up to job_num postings matching job_type and return
# them as a pandas DataFrame (also written to an Excel file).
def Wuzzuf_scrapping(job_type, job_num):
    # build the search URL from the words of the job title, e.g. "data analyst"
    link1 = 'https://wuzzuf.net/search/jobs/?a=navbl&q=' + '%20'.join(job_type.split())
    title = []
    location = []
    country = []
    job_description = []
    Job_Requirements = []
    company_name = []
    links = []
    Job_type = []          # collected per job but not exported in the final frame
    Career_Level = []
    company_logo = []
    Job_Categories = []
    Skills_And_Tools = []
    Experience_Needed = []
    post_time = []
    Title = []
    pages_num = np.ceil(job_num / 15)   # Wuzzuf shows 15 jobs per results page

    # one shared headless browser session for all job detail pages
    driver = webdriver.Chrome(options=options)
    # driver.implicitly_wait(10)

    for i in range(int(pages_num)):
        link_new = link1 + '&start=' + str(i)   # Wuzzuf paginates by page index
        data = requests.get(link_new)
        soup = BeautifulSoup(data.content, 'html.parser')
        Title = soup.find_all('h2', {'class': 'css-m604qf'})

        # extract the summary info for every job card on this page
        for x in range(len(Title)):
            # job title: keep the text before any "(" or "-" qualifier
            t = re.split(r'\(|-', Title[x].find('a').text)
            title.append(t[0].strip())
            # location reads "area, city, country"; the last part is the country
            loc = re.split(',', soup.find_all('span', {'class': 'css-5wys0k'})[x].text)
            r = ""
            for part in loc[:-1]:
                r = r + ', ' + part.strip()
            location.append(r.replace(',', '', 1).strip())
            country.append(loc[-1].strip())
            links.append('https://wuzzuf.net' + Title[x].find('a').attrs['href'])
            # company name, stripped of punctuation and the trailing dash
            m = " ".join(re.findall(r"[a-zA-Z\d+]+", soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('a').text))
            company_name.append(m)
            # job type tags (e.g. Full Time); one posting can carry several
            c = soup.find_all('div', {'class': 'css-1lh32fc'})[x].find_all('span')
            if len(c) == 1:
                Job_type.append(c[0].text)
            else:
                Job_type.append([tag.text for tag in c])
            # career level and years of experience sit in the same tag row
            n = soup.find_all('div', {'class': 'css-y4udm8'})[x].find_all('div')[1].find_all(['a', 'span'])
            Career_Level.append(n[0].text)

            # keep only digits, "-" and "+" from e.g. "· 3-5 Yrs of Exp"
            yy = n[1].text.replace('·', ' ').strip()
            yy = re.findall(r'[0-9-+]*', yy)
            y1 = ""
            for part in yy:
                if part:
                    y1 = y1 + part
            if y1 != "":
                Experience_Needed.append(y1)
            else:
                Experience_Needed.append("Not Specified")
            # posting time is the first div inside the company block
            posted = soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('div')
            post_time.append(posted.text)

            # the company logo comes from the job page's og:image meta tag
            data1 = requests.get(links[x])
            soup1 = BeautifulSoup(data1.content, 'html.parser')
            company_logo.append(soup1.find_all('meta', {'property': "og:image"})[0]['content'])
            # time.sleep(4)  # optional: throttle requests
            # Job_Categories, Skills_And_Tools, job_description and the job
            # requirements are rendered client-side, so load the job page in
            # the shared Selenium session (the XPaths mirror Wuzzuf's layout)
            driver.get(links[x])
            Job_Categories.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[5]').text.split("\n")[1:])
            Skills_And_Tools.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[6]').text.split("\n")[1:])
            job_description.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[3]').text.split("\n")[1:])
            requirements = driver.find_elements(By.XPATH, '//*[@id="app"]/div/main/section[4]/div')

            # the requirements section is either several titled sub-lists
            # ("Requirements", "Benefits", ...) or one flat list
            new = requirements[0].text.split("\n\n")
            if len(new) != 1:
                dict_other = {}
                for block in new:
                    items = block.split('\n')
                    dict_other[items[0]] = [k.replace("\u202f", " ") for k in items[1:]]
                Job_Requirements.append(dict_other)
            else:
                nn = new[0].replace("\u202f", " ")
                Job_Requirements.append(nn.split('\n'))
    driver.quit()

    # combine everything into one data frame
    df = pd.DataFrame({
        'Title': title,
        'Location': location,
        'country': country,
        'URLs': links,
        'Company_Name': company_name,
        'Career_Level': Career_Level,
        'post_time': post_time,
        'Experience_Needed': Experience_Needed,
        'Company_Logo': company_logo,
        'Job_Categories': Job_Categories,
        'Skills_And_Tools': Skills_And_Tools,
        'job_description': job_description,
        'Job_Requirements': Job_Requirements,
    })

    df[:job_num].to_excel('WUZZUF_scrapping.xlsx', index=False)
    return df[:job_num]
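
A minimal usage sketch, assuming Chrome and a matching chromedriver are installed; the search term and job count below are illustrative:

from wuzzuf_scraper import Wuzzuf_scrapping

df = Wuzzuf_scrapping("data analyst", 30)   # about two result pages of 15 jobs
print(df.head())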