daswer123 committed on
Commit
f1c202b
1 Parent(s): 334f8e3

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +131 -0
  2. funcs.py +88 -0
  3. module.py +143 -0
  4. requirements.txt +43 -0
app.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from module import generate_resume_urls
3
+
4
+ import os
5
+ import tempfile
6
+ import shutil
7
+
8
def write_result_to_file(result, filename="result.txt"):
    """Join *result* (a list of strings) with blank lines and write it to
    *filename* inside a freshly created temp directory.

    Returns the full path of the written file (handed to a gr.File output).

    NOTE(review): the temp directory is never cleaned up — acceptable for a
    short-lived Gradio Space, but it leaks one directory per call.
    """
    text = "\n\n".join(result)
    temp_dir = tempfile.mkdtemp()
    file_path = os.path.join(temp_dir, filename)

    # encoding pinned so non-ASCII text survives on any platform/locale;
    # the redundant explicit f.close() inside the `with` block was dropped.
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(text)

    return file_path
19
+
20
def get_links(job_search_status_enable,gender_enable,salary_enable,education_level_enable,experience_enable,age_enable,
              links,use_filters,use_inc_values,job_search_status,gender,age_from,age_to,salary_from,salary_to,education_level,experience):
    """Gradio callback: expand every input URL into many filtered search URLs.

    Returns a (path_to_result_file, "Кол-во ссылок: N") tuple feeding the
    File and Label outputs.
    """
    result_links = []
    cout_slogan = "Кол-во ссылок: "
    # One source URL per non-empty input line.
    links_list = [link for link in links.split("\n") if link]

    if not use_filters:
        # No manual filters: let the generator produce its maximal default expansion.
        for link in links_list:
            result_links += generate_resume_urls(link)
        return write_result_to_file(result_links), cout_slogan + str(len(result_links))

    # FIX: the original rebuilt selected_filters/filter_values inside a
    # per-link loop, appending duplicate filter names once per link.  The
    # selection depends only on the checkbox state, so build it exactly once.
    selected_filters = []
    filter_values = {}

    if age_enable:
        selected_filters.append("age")
        if age_to - age_from <= 0:
            # Empty or inverted range: fall back to the single lower bound.
            filter_values["age"] = [age_from]
        else:
            # Inclusive range [age_from, age_to].
            filter_values["age"] = list(range(age_from, age_to)) + [age_to]

    if job_search_status_enable:
        selected_filters.append("job_search_status")
        filter_values["job_search_status"] = job_search_status

    if gender_enable:
        selected_filters.append("gender")
        filter_values["gender"] = gender

    if salary_enable:
        selected_filters.append("salary")
        filter_values["salary_from"] = salary_from
        filter_values["salary_to"] = salary_to

    if education_level_enable:
        selected_filters.append("education_level")
        filter_values["education_level"] = education_level

    if experience_enable:
        selected_filters.append("experience")
        filter_values["experience"] = experience

    for link in links_list:
        result_links += generate_resume_urls(link, selected_filters, filter_values, use_inc_values)

    return write_result_to_file(result_links), cout_slogan + str(len(result_links))
82
+
83
# Gradio UI: one text area of source URLs in, a file of expanded URLs out.
with gr.Blocks() as demo:
    gr.Markdown("# Модуль для увеличения кол-ва ссылок за счет добавления фильтра")

    with gr.Row():
        links = gr.TextArea(label="Введите ссылки", placeholder="https://hh.ru/search/resume?text=Региональный+визуальный+мерчендайзер")

    # Master switch: when off, get_links expands every URL with the default filter set.
    use_filters = gr.Checkbox(label="Использовать Фильтры ( если выключенно, создается максимально кол-во ссылок )")
    with gr.Accordion("Фильтры", open=True):
        # Filter toggles — each checkbox enables one filter axis in get_links.
        use_inc_values = gr.Checkbox(label="Добавить к списку резюме, список без указанного возраста и зарплаты")
        with gr.Row():
            with gr.Row():
                job_search_status_enable = gr.Checkbox(label="Статус поиска работы", value=True)
                gender_enable = gr.Checkbox(label="Пол", value=True)
            with gr.Row():
                age_enable = gr.Checkbox(label="Возраст", value=True)
            with gr.Row():
                salary_enable = gr.Checkbox(label="Зарплата", value=True)

            education_level_enable = gr.Checkbox(label="Образование", value=True)
            experience_enable = gr.Checkbox(label="Опыт работы", value=True)

        with gr.Accordion("Настройка фильтров", open=False):
            with gr.Row():
                job_search_status = gr.CheckboxGroup(["unknown", "not_looking_for_job", "looking_for_offers", "active_search", "has_job_offer", "accepted_job_offer"], label="Статус поиска работы")
                gender = gr.Radio(["male", "female", "both"], label="Пол")
            with gr.Row():
                age_from = gr.Slider(18, 70, value=18, step=1, label="Возраст от")
                # FIX: default was 71, which exceeds the slider maximum of 70.
                age_to = gr.Slider(18, 70, value=70, step=1, label="Возраст до")
            with gr.Row():
                salary_from = gr.Slider(0, 1000000000, step=1, value=0, label="Зарплата от")
                salary_to = gr.Slider(0, 1000000000, step=1, value=1000000000, label="Зарплата до")

            education_level = gr.CheckboxGroup(["secondary", "special_secondary", "unfinished_higher", "bachelor", "master", "doctor", "candidate", "higher"], label="Уровень образования")
            experience = gr.CheckboxGroup(["noExperience", "between1And3", "between3And6", "moreThan6"], label="Опыт работы")

    with gr.Row():
        get_result_btn = gr.Button("Получить")
    with gr.Row():
        result_field = gr.File(label="Результат", interactive=False)
    with gr.Row():
        result_count_field = gr.Label(value="Кол-во ссылок")

    # Inputs must stay in exactly this order — get_links takes them positionally.
    get_result_btn.click(
        fn=get_links,
        inputs=[job_search_status_enable, gender_enable, salary_enable, education_level_enable, experience_enable, age_enable,
                links, use_filters, use_inc_values, job_search_status, gender, age_from, age_to, salary_from, salary_to, education_level, experience],
        outputs=[result_field, result_count_field],
    )

demo.launch()
funcs.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import urllib.parse
2
+ import scrapy
3
+ from scrapy.crawler import CrawlerProcess
4
+ from bs4 import BeautifulSoup
5
+ import json
6
+
7
+ from module_1_pre.funcs import generate_resume_urls, generate_specific_urls
8
+
9
+
10
class ResumeSpider(scrapy.Spider):
    """Scrapy spider that walks hh.ru resume-search result pages.

    For every URL produced by generate_resume_urls(base_url) it checks
    whether the page actually lists resumes.  Result sets with fewer than
    250 pager pages are appended to resume_urls.json; larger sets are split
    further by employment type via generate_specific_urls and re-crawled.
    """

    name = "resume_spider"

    # Retry transient HTTP 400s from hh.ru up to 5 times; keep Scrapy quiet.
    custom_settings = {
        'RETRY_HTTP_CODES': [400],
        'RETRY_TIMES': 5,
        'LOG_ENABLED': False
    }

    def __init__(self, base_url=None, *args, **kwargs):
        super(ResumeSpider, self).__init__(*args, **kwargs)
        # Seed search URL the whole crawl expands from.
        self.base_url = base_url

    def start_requests(self):
        # One request per filter combination generated from the seed URL.
        urls = generate_resume_urls(self.base_url)
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_general)

    def parse_general(self, response):
        """Record small result sets; split oversized ones into specific URLs."""
        soup = BeautifulSoup(response.text, 'html.parser')
        main_element = soup.find('main', class_='resume-serp-content')

        # A page counts as non-empty only if at least one resume card is present.
        if main_element and main_element.find('div', attrs={'data-resume-id': True}):
            page_count = self.get_page_count(soup)

            if page_count < 250:
                result = {
                    'url': response.url,
                    'pages': page_count
                }
                # FIX: encoding was missing here (parse_specific already had it),
                # which could crash on Cyrillic URLs under a non-UTF-8 locale.
                with open('resume_urls.json', 'a', encoding='utf-8') as f:
                    json.dump(result, f, ensure_ascii=False, indent=4)
                    f.write('\n')
            else:
                # Too many pages: narrow the search by employment type.
                self.base_url = response.url
                specific_urls = generate_specific_urls(self.base_url)
                for url in specific_urls:
                    yield scrapy.Request(url=url, callback=self.parse_specific)
        else:
            print(f'No resumes found on page: {response.url}')

    def parse_specific(self, response):
        """Record a narrowed (employment-filtered) result page unconditionally."""
        soup = BeautifulSoup(response.text, 'html.parser')
        page_count = self.get_page_count(soup)

        result = {
            'url': response.url,
            'pages': page_count
        }
        # NOTE(review): appending ",\n" after each object does not by itself
        # produce a valid JSON document; downstream must wrap/strip the output.
        with open('resume_urls.json', 'a', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=4, separators=(',', ': '))
            f.write(',\n')  # Add comma and newline for valid JSON array

    def get_page_count(self, soup):
        """Return the highest page number shown in the pager, defaulting to 1."""
        pager = soup.find('div', class_='pager')
        if pager:
            page_links = pager.find_all('a', attrs={'data-qa': 'pager-page'})
            # FIX: guard against a pager div with no page links — the original
            # indexed [-1] unconditionally and could raise IndexError.
            if page_links:
                return int(page_links[-1].text.strip())
        return 1
70
+
71
+
72
def stage1(base_url):
    """Run ResumeSpider for *base_url* in-process; blocks until the crawl ends."""
    crawler = CrawlerProcess()
    crawler.crawl(ResumeSpider, base_url)
    crawler.start()
76
+
77
+
78
+
79
+
80
+ # for url in urls:
81
+ # result = generate_resume_urls(url)
82
+ # print(result)
83
+ # print("\n")
84
+
85
+ # lens = len(['https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=unknown&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=not_looking_for_job&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=looking_for_offers&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=active_search&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=has_job_offer&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&job_search_status=accepted_job_offer&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&experience=moreThan6&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&experience=between3And6&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 
'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&experience=between1And3&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&experience=noExperience&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&gender=male&label=only_with_gender&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&gender=female&label=only_with_gender&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=higher&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=unfinished_higher&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=master&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=bachelor&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80', 
'https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&logic=normal&pos=full_text&exp_period=all_time&education_level=special_secondary&text=%D0%9A%D0%B0%D1%81%D1%81%D0%B8%D1%80'])
86
+ # print(lens)
87
+
88
+
module.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import urllib.parse
2
+
3
+ import urllib.parse
4
+
5
def generate_resume_urls(base_url, selected_filters=None, filter_values=None, use_incredible_values=True):
    """Expand a single hh.ru resume-search URL into one URL per filter combination.

    Args:
        base_url: either a ``/search/resume?text=...`` URL or a pretty
            ``/resumes/<slug>`` URL (the slug becomes the search text).
        selected_filters: iterable of filter names to vary; None means all.
        filter_values: overrides for the default value sets below.
            NOTE(review): non-salary values are used verbatim — passing a plain
            string (e.g. gender="both" from the UI radio) would be iterated
            per-character; callers should pass lists. TODO confirm upstream.
        use_incredible_values: also emit URLs with impossible age/salary
            bounds, which on hh.ru matches resumes lacking those fields.

    Returns:
        list[str] of fully-built search URLs.
    """
    parsed_url = urllib.parse.urlparse(base_url)
    query_params = urllib.parse.parse_qs(parsed_url.query)

    # Search text comes either from a pretty /resumes/<slug> path or ?text=.
    if "/resumes/" in parsed_url.path:
        search_text = parsed_url.path.split("/")[-1].replace("_", " ")
    else:
        search_text = query_params.get("text", [""])[0]

    encoded_text = urllib.parse.quote(search_text)

    # Default value set for every supported filter axis.
    filter_options = {
        "job_search_status": ["unknown", "not_looking_for_job", "looking_for_offers", "active_search", "has_job_offer", "accepted_job_offer"],
        "gender": ["male", "female"],
        "age": list(range(18, 71)),
        "salary" : False,
        "salary_from": 0,
        "salary_to": 1000000000,
        "education_level": ["secondary", "special_secondary", "unfinished_higher", "bachelor", "master", "doctor", "candidate", "higher"],
        "experience": ["noExperience", "between1And3", "between3And6", "moreThan6"]
    }

    # Apply caller overrides; salary keys are taken as-is, other keys only
    # when they carry a non-None, non-empty value.
    if filter_values:
        for key, value in filter_values.items():
            if key in ("salary", "salary_from", "salary_to"):
                filter_options[key] = value
            elif value is not None and len(value) > 0:
                filter_options[key] = value

    if filter_values is None:
        filter_values = {}

    if selected_filters is None:
        selected_filters = filter_options.keys()

    def _axis(name):
        # Values to iterate for one filter axis; [None] disables the axis.
        return filter_options[name] if name in selected_filters else [None]

    def _common_suffix(url):
        # Query params shared by every generated URL.
        if "salary" in selected_filters:
            url += f"&label=only_with_salary"
            url += f"&salary_from={filter_options['salary_from']}&salary_to={filter_options['salary_to']}"
        if "area" in query_params:
            url += f"&area={query_params['area'][0]}"
        if "university" in query_params:
            url += f"&university={query_params['university'][0]}"
        return url

    generated_urls = []

    for status in _axis("job_search_status"):
        for gender in _axis("gender"):
            for age in _axis("age"):
                for experience in _axis("experience"):
                    for education in _axis("education_level"):
                        url = f"https://hh.ru/search/resume?search_period=0&order_by=relevance&filter_exp_period=all_time&relocation=living_or_relocation&job_search_status_changed_by_user=true&logic=normal&pos=full_text&exp_period=all_time&text={encoded_text}"

                        if status:
                            url += f"&job_search_status={status}"
                        if gender:
                            url += f"&gender={gender}"
                        if age:
                            url += f"&label=only_with_age"
                            url += f"&age_from={age}&age_to={age}"
                        if experience:
                            url += f"&experience={experience}"
                        if education:
                            url += f"&education_level={education}"

                        generated_urls.append(_common_suffix(url))

    if use_incredible_values:
        # Deliberately impossible age/salary bounds match resumes that did
        # not fill in those fields at all.
        for status in _axis("job_search_status"):
            for gender in _axis("gender"):
                for experience in _axis("experience"):
                    for education in _axis("education_level"):
                        url = f"https://hh.ru/search/resume?search_period=0&order_by=relevance&salary_from=10000000&salary_to=1000000000000&filter_exp_period=all_time&relocation=living_or_relocation&job_search_status_changed_by_user=true&logic=normal&pos=full_text&exp_period=all_time&age_from=100000000&age_to=1000000&text={encoded_text}"

                        if status:
                            url += f"&job_search_status={status}"
                        if gender:
                            url += f"&gender={gender}"
                        if experience:
                            url += f"&experience={experience}"
                        if education:
                            url += f"&education_level={education}"

                        if "area" in query_params:
                            url += f"&area={query_params['area'][0]}"
                        if "university" in query_params:
                            url += f"&university={query_params['university'][0]}"

                        generated_urls.append(url)

    # Carry any remaining query params of the source URL through to every
    # generated URL.  BUG FIX: the original appended to the loop variable
    # (a no-op for immutable str), silently dropping these params.
    passthrough = [k for k in query_params
                   if k not in ["text", "job_search_status", "experience", "gender", "age_from", "age_to", "education_level", "area", "university"]]
    if passthrough:
        suffix = "".join(f"&{key}={query_params[key][0]}" for key in passthrough)
        generated_urls = [url + suffix for url in generated_urls]

    return generated_urls
115
+
116
+
117
def generate_specific_urls(base_url):
    """Return five variants of *base_url*, one per hh.ru employment type.

    Every other query parameter of the original URL (text, area,
    university, ...) is preserved; only ``employment`` is set/replaced.

    Cleanup vs. original: removed the unused experience_options list and
    two no-op self-assignments of the area/university params.
    """
    parsed_url = urllib.parse.urlparse(base_url)
    query_params = urllib.parse.parse_qs(parsed_url.query)

    employment_options = ["full", "part", "project", "volunteer", "probation"]

    specific_urls = []
    for emp in employment_options:
        # Overwrite the employment param for this variant; everything else
        # in query_params rides along untouched.
        query_params["employment"] = [emp]
        new_query_string = urllib.parse.urlencode(query_params, doseq=True)
        specific_urls.append(parsed_url._replace(query=new_query_string).geturl())

    return specific_urls
140
+
141
+
142
+ # urls = generate_resume_urls("https://hh.ru/resumes/generalnyj-direktor-stroitelnoj-kompanii",use_incredible_values=True)
143
+ # print(len(urls))
requirements.txt ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ attrs==23.2.0
2
+ Automat==22.10.0
3
+ beautifulsoup4==4.12.3
4
+ certifi==2024.2.2
5
+ cffi==1.16.0
6
+ charset-normalizer==3.3.2
7
+ colorama==0.4.6
8
+ constantly==23.10.4
9
+ crochet==2.1.1
10
+ cryptography==42.0.5
11
+ cssselect==1.2.0
12
+ filelock==3.13.3
13
+ hyperlink==21.0.0
14
+ idna==3.6
15
+ incremental==22.10.0
16
+ itemadapter==0.8.0
17
+ itemloaders==1.1.0
18
+ jmespath==1.0.1
19
+ lxml==5.1.0
20
+ packaging==24.0
21
+ parsel==1.9.0
22
+ Protego==0.3.0
23
+ pyasn1==0.6.0
24
+ pyasn1_modules==0.4.0
25
+ pycparser==2.21
26
+ PyDispatcher==2.0.7
27
+ pyOpenSSL==24.1.0
28
+ queuelib==1.6.2
29
+ requests==2.31.0
30
+ requests-file==2.0.0
31
+ Scrapy==2.11.1
32
+ service-identity==24.1.0
33
+ six==1.16.0
34
+ soupsieve==2.5
35
+ tldextract==5.1.2
36
+ tqdm==4.66.2
37
+ Twisted==24.3.0
38
+ twisted-iocpsupport==1.0.4
39
+ typing_extensions==4.10.0
40
+ urllib3==2.2.1
41
+ w3lib==2.1.2
42
+ wrapt==1.16.0
43
+ zope.interface==6.2