Adr740 committed on
Commit 6caef3b · verified · 1 Parent(s): 44a0ce2

Upload 5 files

Files changed (5)
  1. app.py +32 -0
  2. config.py +21 -0
  3. full_scraping_script_v3.py +305 -0
  4. logs.py +49 -0
  5. requirements.txt +17 -0
app.py ADDED
@@ -0,0 +1,32 @@
+ import gradio as gr
+ import os
+ from functools import partial
+ from full_scraping_script_v3 import run_scrapping
+ import gdown
+ from config import json_url_id
+ download_url = f'https://drive.google.com/uc?id={json_url_id}'
+ output = 'secret_google_service_account.json'
+ gdown.download(download_url, output, quiet=False)
+
+ title = ""
+ with gr.Blocks(title=title, theme='nota-ai/theme', css="footer {visibility: hidden}") as demo:
+     gr.Markdown(f"## {title}")
+
+     with gr.Row():
+         with gr.Column(scale=6):
+             with gr.Row():
+                 with gr.Column(scale=8):
+                     password = gr.Textbox(lines=1, label="Tatooine password")
+                     cookie = gr.Textbox(lines=1, label="Cookie")
+                 with gr.Column(scale=1):
+                     chat_submit_button = gr.Button(value="Submit ▶")
+             with gr.Row():
+                 listing = gr.Markdown("Waiting for password...")
+
+     fn_chat = run_scrapping
+
+     chat_submit_button.click(fn=fn_chat, inputs=[password, cookie], outputs=[listing])
+
+ demo.launch(max_threads=40)
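
The click handler above forwards the two textbox values positionally to run_scrapping and renders the returned string in the listing Markdown component. A minimal, self-contained sketch of the same Blocks wiring pattern; the handler and component names below are illustrative placeholders, not identifiers from this commit:

import gradio as gr

def handler(password: str, cookie: str) -> str:
    # Stand-in for run_scrapping: gate on the password, then report status.
    if password != "expected-password":
        return "Wrong password"
    return "Scraping done!"

with gr.Blocks() as demo:
    pwd = gr.Textbox(lines=1, label="Password")
    ck = gr.Textbox(lines=1, label="Cookie")
    btn = gr.Button("Submit")
    out = gr.Markdown("Waiting for password...")
    # inputs map positionally onto the handler's arguments; the return value updates `out`
    btn.click(fn=handler, inputs=[pwd, ck], outputs=[out])

if __name__ == "__main__":
    demo.launch()
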
config.py ADDED
@@ -0,0 +1,21 @@
+ import os
+
+ openai_api_key = os.environ.get("openai_api_key")
+ proxycurl_api_key = os.environ.get("proxycurl_api_key")
+
+ ftp_host = os.environ.get("ftp_host")
+ ftp_user = os.environ.get("ftp_user")
+ ftp_pass = os.environ.get("ftp_pass")
+ csv_export_name = os.environ.get("csv_export_name")
+ password = os.environ.get("password")
+
+ folder_id = os.environ.get("folder_id")
+ json_url_id = os.environ.get("json_url_id")
+
+ places_morocco = [["Casablanca, Morocco"], ["Tanger, Morocco"], ["Rabat, Morocco"], ["Marrakech, Morocco"], ["Agadir, Morocco"], ["Kenitra, Morocco"]]
+ job_title_morocco = "software engineer"
+ amount_morocco = 70
+
+ places_world = [["United Kingdom"], ["France"], ["Canada"], ["Belgium"], ["Germany"], ["Saudi Arabia"], ["United arab emirates"]]
+ job_title_world = "software engineer remote"
+ amount_world = 70
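
Every credential in config.py comes from an environment variable, so nothing sensitive is stored in the repository; on a Hugging Face Space these would typically be defined as Space secrets. A minimal sketch of wiring them up for a local run, with purely hypothetical placeholder values:

import os

# Hypothetical values for a local test run; set these before importing config
os.environ.setdefault("openai_api_key", "sk-placeholder")
os.environ.setdefault("proxycurl_api_key", "proxycurl-placeholder")
os.environ.setdefault("ftp_host", "ftp.example.com")
os.environ.setdefault("ftp_user", "ftp-user")
os.environ.setdefault("ftp_pass", "ftp-pass")
os.environ.setdefault("csv_export_name", "jobs_export.csv")
os.environ.setdefault("password", "local-test-password")
os.environ.setdefault("folder_id", "drive-folder-id")
os.environ.setdefault("json_url_id", "drive-file-id")

import config  # values are read once at import time

print(config.job_title_morocco, config.amount_morocco)
print(config.places_world)
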
full_scraping_script_v3.py ADDED
@@ -0,0 +1,305 @@
+ import os
+ import json
+ import pandas as pd
+ import ftplib
+ import requests
+ import logging
+
+ from openai import OpenAI
+ from linkedin_jobs_scraper import LinkedinScraper
+ from linkedin_jobs_scraper.events import Events, EventData, EventMetrics
+ from linkedin_jobs_scraper.query import Query, QueryOptions, QueryFilters
+ from linkedin_jobs_scraper.filters import RelevanceFilters, TimeFilters, TypeFilters, ExperienceLevelFilters, \
+     OnSiteOrRemoteFilters, SalaryBaseFilters
+
+ from config import *
+ from logs import save_logs
+
+ client = OpenAI(api_key=openai_api_key)
+
+ # The LinkedIn li_at cookie is read from the LI_AT_COOKIE environment variable (set in run_scrapping)
+
+ # Change root logger level (default is WARN)
+ logging.basicConfig(level=logging.INFO)
+
+
+ def scrape(location=["Morocco"], job_title="Développeur java", page_offset=2, amount=30, remote=False, horizon="month"):
+
+     if remote:
+         onsite = [OnSiteOrRemoteFilters.REMOTE]
+     else:
+         onsite = [OnSiteOrRemoteFilters.REMOTE, OnSiteOrRemoteFilters.HYBRID, OnSiteOrRemoteFilters.ON_SITE]
+
+     if horizon == "month":
+         timespan = TimeFilters.MONTH
+     elif horizon == "day":
+         timespan = TimeFilters.DAY
+     else:
+         timespan = TimeFilters.MONTH
+
+     result = []
+
+     # Fired once for each successfully processed job
+     def on_data(data: EventData):
+         print('[ON_DATA]', data.title, data.company, data.company_link, data.date, data.link, data.insights,
+               len(data.description))
+         result.append(data)
+
+     # Fired once for each page (25 jobs)
+     def on_metrics(metrics: EventMetrics):
+         print('[ON_METRICS]', str(metrics))
+
+     def on_error(error):
+         print('[ON_ERROR]', error)
+
+     def on_end():
+         print('[ON_END]')
+
+     scraper = LinkedinScraper(
+         chrome_executable_path=None,  # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver)
+         chrome_binary_location=None,  # Custom path to Chrome/Chromium binary (e.g. /foo/bar/chrome-mac/Chromium.app/Contents/MacOS/Chromium)
+         chrome_options=None,  # Custom Chrome options here
+         headless=True,  # Overrides headless mode only if chrome_options is None
+         max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
+         slow_mo=0.5,  # Slow down the scraper to avoid 'Too many requests 429' errors (in seconds)
+         page_load_timeout=40  # Page load timeout (in seconds)
+     )
+
+     # Add event listeners
+     scraper.on(Events.DATA, on_data)
+     scraper.on(Events.ERROR, on_error)
+     scraper.on(Events.END, on_end)
+
+     queries = [
+         Query(
+             query=job_title,
+             options=QueryOptions(
+                 locations=location,
+                 apply_link=False,  # Try to extract apply link (easy applies are skipped). If set to True, scraping is slower because an additional page must be navigated. Default to False.
+                 skip_promoted_jobs=True,  # Skip promoted jobs. Default to False.
+                 page_offset=page_offset,  # How many pages to skip
+                 limit=amount,
+                 filters=QueryFilters(
+                     # company_jobs_url='https://www.linkedin.com/jobs/search/?f_C=1441%2C17876832%2C791962%2C2374003%2C18950635%2C16140%2C10440912&geoId=92000000',  # Filter by companies.
+                     relevance=RelevanceFilters.RECENT,
+                     time=timespan,
+                     # type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
+                     on_site_or_remote=onsite,
+                 )
+             )
+         ),
+     ]
+
+     scraper.run(queries)
+     return result
+
+
+ def upload_pic(image_url, image_filename):
+     image_filename = str(image_filename) + ".png"
+
+     # Download the image
+     response = requests.get(image_url)
+     image_data = response.content
+
+     # Save the image locally
+     with open("img.png", "wb") as file:
+         file.write(image_data)
+
+     # Upload the image to the FTP server
+     with ftplib.FTP(ftp_host) as ftp:
+         ftp.login(ftp_user, ftp_pass)
+         with open("img.png", "rb") as file:
+             ftp.storbinary(f"STOR {image_filename}", file)
+
+     print(f"{image_filename} uploaded successfully to {ftp_host}")
+
+
+ def upload_ftp(filename):
+     # Upload the file to the FTP server
+     with ftplib.FTP(ftp_host) as ftp:
+         ftp.login(ftp_user, ftp_pass)
+         with open(filename, "rb") as file:
+             ftp.storbinary(f"STOR {filename}", file)
+
+     print(f"{filename} uploaded successfully to {ftp_host}")
+
+
+ def run_scrapping(input_password, cookie=""):
+     if len(cookie) > 10:
+         os.environ['LI_AT_COOKIE'] = cookie
+
+     if input_password != password:
+         return "Wrong password"
+
+     # Scrape on-site/hybrid/remote offers posted in Morocco
+     jobs = []
+     for place in places_morocco:
+         job_unit = scrape(place, job_title_morocco, 0, amount_morocco, horizon="day")
+         jobs += job_unit
+         # break
+
+     # Scrape remote offers posted in the rest of the world
+     jobs_remote = []
+     for place in places_world:
+         job_unit = scrape(place, job_title_world, 0, amount_world, remote=True, horizon="day")
+         jobs_remote += job_unit
+
+     df = pd.DataFrame(jobs + jobs_remote)
+     df_tatooine = pd.DataFrame(columns=["titre", "slug", "description", "job_type", "type_contrat", "experience", "pays", "ville", "salaire", "url", "entreprise_info", "entreprise_logo", "seo_description"])
+     df_tatooine["titre"] = df["title"]
+     df_tatooine["description"] = df["description"]
+     # df_tatooine["pays"] = df["place"]
+     # df_tatooine["ville"] = df["place"].str.split(",")[:]
+     df_tatooine[['ville', 'pays']] = df['location'].str.rsplit(', ', n=1, expand=True)
+     df_tatooine["url"] = df["link"]
+     df_tatooine = df_tatooine.fillna('')
+     df_tatooine["job_id"] = df["job_id"]
+
+     # Concatenate title and description as the raw text sent to the model
+     raw_content = []
+     titles = df_tatooine["titre"].to_list()
+     descs = df_tatooine["description"].to_list()
+     for i in range(len(titles)):
+         raw_content.append(titles[i] + descs[i])
+
+     seo_descs = []
+     job_types = []
+     type_contrats = []
+     response_rythmes = []
+     for i, content in enumerate(raw_content):
+         # First call: generate the SEO description
+         response = client.chat.completions.create(
+             model="gpt-4o-mini",
+             messages=[
+                 {
+                     "role": "system",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": "Your task will be to convert a job offer into a short description for SEO as it will be posted on a job board called tatooinejobs. You don't answer anything else than the SEO description, make sure it is highly converting and convincing. Make sure it's not too heavy and easy to read. \nDig into any human psychological biases you can find to make sure people will click on it. Make sure it's mainly oriented to promote the job board TatooineJobs."
+                         }
+                     ]
+                 },
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": content
+                         }
+                     ]
+                 }
+             ],
+             temperature=1,
+             max_tokens=3656,
+             top_p=1,
+             frequency_penalty=0,
+             presence_penalty=0,
+             response_format={
+                 "type": "text"
+             }
+         ).choices[0].message.content
+         # Second call: classify contract type, work rhythm and work mode as JSON
+         response_types = client.chat.completions.create(
+             model="gpt-4o-mini",
+             messages=[
+                 {
+                     "role": "system",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": "Your task is to determine from a job offer its type of contract and work. You need to output a json with the following fields:\n\n{\n\"job_type\": [ ...] # to be chosen from [CDI, CDD, Freelance, Stage],\n\"rythme_travail\" : [...] # to be chosen from [Full-time, Part-time] ,\n\"type_contrat\" : [...] # to be chosen from [Hybrid, Full-remote, On-site] \n }\n\nFill in the most likely to be"
+                         }
+                     ]
+                 },
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": content
+                         }
+                     ]
+                 }
+             ],
+             temperature=1,
+             max_tokens=1567,
+             top_p=1,
+             frequency_penalty=0,
+             presence_penalty=0,
+             response_format={
+                 "type": "json_object"
+             }
+         ).choices[0].message.content
+
+         # Parse the JSON answer returned by the model
+         parsed_types = json.loads(response_types)
+         response_job_type = parsed_types["job_type"]
+         response_type_contrat = parsed_types["type_contrat"]
+         response_rythme = parsed_types["rythme_travail"]
+         seo_descs.append(response)
+         job_types.append(str(response_job_type))
+         type_contrats.append(str(response_type_contrat))
+         response_rythmes.append(str(response_rythme))
+
+         if i % 10 == 0:
+             print(f"Processed {i+1} job offers out of {len(raw_content)}")
+
+     df_tatooine["seo_description"] = seo_descs
+     df_tatooine['job_type'] = type_contrats
+     df_tatooine['type_contrat'] = job_types
+     df_tatooine['rythme_travail'] = response_rythmes
+     df_tatooine["entreprise_info"] = df["company"]
+
+     # Resolve each company's logo through the Proxycurl company lookup API
+     profile_pic = []
+     headers = {'Authorization': 'Bearer ' + proxycurl_api_key}
+
+     api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company/resolve'
+
+     for i, company in enumerate(df_tatooine["entreprise_info"].to_list()):
+         params = {
+             'company_name': company,
+             'enrich_profile': 'enrich',
+         }
+
+         response = requests.get(api_endpoint,
+                                 params=params,
+                                 headers=headers)
+         content = response.json()
+         if i == 0:
+             print(content)
+
+         if i % 5 == 0:
+             print(f"Processed {i+1} links out of {len(df_tatooine)}")
+         profile_pic.append(content["profile"]["profile_pic_url"])
+         # profile_pic.append("")
+
+     # Upload each logo to the FTP server, named after the job id
+     df_tatooine["entreprise_logo"] = profile_pic
+     for rec in df_tatooine.to_dict(orient="records"):
+         job_id = rec["job_id"]
+         url = rec["entreprise_logo"]
+         try:
+             upload_pic(url, job_id)
+         except Exception:
+             pass
+
+     # Strip the list formatting left over from str(...) on the model's answers
+     df_tatooine["job_type"] = df_tatooine.job_type.str.replace("['", "").str.replace("']", "").str.replace("'", "")
+     df_tatooine["type_contrat"] = df_tatooine.type_contrat.str.replace("['", "").str.replace("']", "").str.replace("'", "")
+     df_tatooine["rythme_travail"] = df_tatooine.rythme_travail.str.replace("['", "").str.replace("']", "").str.replace("'", "")
+
+     # Outside Morocco only the country was scraped (no city), so move it from "ville" to "pays"
+     df_not_morocco = df_tatooine[~df_tatooine['pays'].isin(["Morocco"])]
+     df_morocco = df_tatooine[df_tatooine['pays'].isin(["Morocco"])]
+     df_not_morocco['pays'] = df_not_morocco['ville']
+     df_not_morocco['ville'] = None
+     new_df = pd.concat([df_morocco, df_not_morocco])
+     mapper = {
+         "Morocco": "Maroc",
+         "United Kingdom": "Royaume-Uni",
+         "Belgium": "Belgique",
+         "Germany": "Allemagne",
+         "Saudi Arabia": "Arabie saoudite",
+         "United arab emirates": "Émirats arabes unis",
+     }
+     new_df['pays'] = new_df.pays.replace(mapper)
+
+     # Export the listing, push it to the FTP server, then archive a copy in Google Drive
+     new_df.to_csv(csv_export_name, sep=";")
+     upload_ftp(csv_export_name)
+     save_logs(csv_export_name, folder_id)
+     return "Scraping done!"
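
run_scrapping ties the whole pipeline together: scrape LinkedIn per location, generate an SEO description and a contract-type classification for each offer with gpt-4o-mini, resolve company logos through Proxycurl, push logos and the final CSV over FTP, then archive the CSV to Google Drive. The scrape helper can also be exercised on its own; a minimal sketch, assuming the dependencies are installed, the config.py environment variables are set, and a valid li_at cookie is available (the cookie value below is a placeholder):

import os
from full_scraping_script_v3 import scrape

# linkedin-jobs-scraper reads the LinkedIn session cookie from LI_AT_COOKIE (placeholder value here)
os.environ["LI_AT_COOKIE"] = "AQEDAR..."

# Fetch a handful of recent offers for a single location
offers = scrape(location=["Casablanca, Morocco"], job_title="software engineer",
                page_offset=0, amount=5, remote=False, horizon="day")

for offer in offers:
    print(offer.title, "|", offer.company, "|", offer.link)
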
logs.py ADDED
@@ -0,0 +1,49 @@
+ import os
+ from google.oauth2 import service_account
+ from googleapiclient.discovery import build
+ from googleapiclient.http import MediaFileUpload
+ from datetime import datetime
+
+ def save_logs(filename, folder_id=""):
+
+     # Get the current date and time
+     now = datetime.now()
+     # filename = str(now).replace(":","").replace(" ","").replace("-","").replace(".","")+".txt"
+     # with open(filename, 'w') as file:
+     #     file.write(to_save)
+
+     # Path to the service account key file
+     SERVICE_ACCOUNT_FILE = 'secret_google_service_account.json'
+
+     # Define the required scopes
+     SCOPES = ['https://www.googleapis.com/auth/drive.file']
+
+     # Authenticate using the service account key file
+     credentials = service_account.Credentials.from_service_account_file(
+         SERVICE_ACCOUNT_FILE, scopes=SCOPES)
+
+     # Build the Google Drive API client
+     service = build('drive', 'v3', credentials=credentials)
+
+     # Specify the folder ID where you want to upload the file
+
+     # Metadata of the file to be uploaded
+     file_metadata = {
+         'name': filename,       # Name of the file to be uploaded
+         'parents': [folder_id]  # Folder ID
+     }
+
+     # Path to the file you want to upload
+     file_path = filename
+
+     # Create a MediaFileUpload object to upload the file
+     media = MediaFileUpload(file_path)
+
+     # Use the Drive API to upload the file
+     file = service.files().create(
+         body=file_metadata,
+         media_body=media,
+         fields='id'
+     ).execute()
+
+     # Print the file ID of the uploaded file
+     print('Saved in Google Drive - File ID: %s' % file.get('id'))
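
save_logs is a thin wrapper around the Drive v3 files().create upload: it authenticates with the service-account key downloaded by app.py and drops the given file into the folder identified by folder_id. A minimal usage sketch; the file name and folder ID are placeholders, and the target folder must be shared with the service account:

from logs import save_logs

# Requires secret_google_service_account.json next to logs.py
# and a Drive folder shared with that service account.
save_logs("jobs_export.csv", folder_id="1AbCdEfGhIjKlMnOpQrStUv")
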
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ linkedin-jobs-scraper
+ openai
+ gradio
+ requests
+ bs4
+ openai
+ pandas
+ numpy
+ bs4
+ nltk
+ tiktoken
+ pdf2image
+ gdown
+ google-auth
+ google-auth-oauthlib
+ google-auth-httplib2
+ google-api-python-client