gperdrizet committed on
Commit
df6f062
·
verified ·
1 Parent(s): 540a3f7

Cleaned up

Browse files
functions/__init__.py CHANGED
@@ -5,6 +5,6 @@ This package contains modules for data acquisition, processing, and analysis
5
  of LinkedIn profiles, GitHub profiles, and job postings.
6
  """
7
 
8
- from .data_acquisition import get_linkedin_profile_html
9
 
10
  __all__ = ['get_linkedin_profile_html']
 
5
  of LinkedIn profiles, GitHub profiles, and job postings.
6
  """
7
 
8
+ from .context_acquisition import get_linkedin_profile_html
9
 
10
  __all__ = ['get_linkedin_profile_html']
functions/context_acquisition.py CHANGED
@@ -5,14 +5,15 @@ Functions for acquiring context from various sources including LinkedIn profiles
5
  GitHub profiles, and job postings using browser automation.
6
  """
7
 
 
 
 
8
  from selenium import webdriver
9
  from selenium.webdriver.chrome.options import Options
10
  from selenium.webdriver.common.by import By
11
  from selenium.webdriver.support.ui import WebDriverWait
12
  from selenium.webdriver.support import expected_conditions as EC
13
  from selenium.common.exceptions import TimeoutException, WebDriverException
14
- import time
15
- import logging
16
 
17
  # Set up logging
18
  logging.basicConfig(level=logging.INFO)
@@ -35,14 +36,14 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
35
  WebDriverException: If there's an issue with the browser automation
36
  TimeoutException: If the page takes too long to load
37
  """
38
-
39
  # Validate LinkedIn URL
40
  if not profile_url or not isinstance(profile_url, str):
41
  raise ValueError("Profile URL must be a non-empty string")
42
-
43
  if "linkedin.com/in/" not in profile_url:
44
  raise ValueError("URL must be a valid LinkedIn profile URL (containing 'linkedin.com/in/')")
45
-
46
  # Configure Chrome options for headless browsing
47
  chrome_options = Options()
48
  chrome_options.add_argument("--headless") # Run in background
@@ -50,53 +51,66 @@ def get_linkedin_profile_html(profile_url: str, wait_time: int = 10) -> str:
50
  chrome_options.add_argument("--disable-dev-shm-usage")
51
  chrome_options.add_argument("--disable-gpu")
52
  chrome_options.add_argument("--window-size=1920,1080")
53
- chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
54
-
 
55
  driver = None
56
  try:
57
  # Initialize the Chrome driver
58
  logger.info("Initializing browser for URL: %s", profile_url)
59
  driver = webdriver.Chrome(options=chrome_options)
60
  driver.set_page_load_timeout(30)
61
-
62
  # Navigate to the LinkedIn profile
63
  logger.info("Navigating to LinkedIn profile...")
64
  driver.get(profile_url)
65
-
66
  # Wait for the page to load
67
  # Look for common LinkedIn profile elements
68
  wait = WebDriverWait(driver, wait_time)
69
-
70
  try:
71
  # Wait for either the main content or login prompt
72
  wait.until(
73
  EC.any_of(
74
- EC.presence_of_element_located((By.CSS_SELECTOR, ".pv-top-card")), # Profile header
75
- EC.presence_of_element_located((By.CSS_SELECTOR, ".profile-section")), # Profile section
76
- EC.presence_of_element_located((By.CSS_SELECTOR, ".authwall")), # Auth wall
77
- EC.presence_of_element_located((By.CSS_SELECTOR, ".public-profile")), # Public profile
 
 
 
 
 
 
 
 
 
 
 
 
78
  )
79
  )
80
  except TimeoutException:
81
  logger.warning("Standard LinkedIn elements not found, proceeding with current page state")
82
-
83
  # Additional wait to ensure dynamic content loads
84
  time.sleep(2)
85
-
86
  # Get the page HTML
87
  html_content = driver.page_source
88
-
89
  logger.info("Successfully retrieved HTML content (%d characters)", len(html_content))
90
  return html_content
91
-
92
  except WebDriverException as e:
93
  logger.error("WebDriver error occurred: %s", str(e))
94
  raise WebDriverException(f"Browser automation failed: {str(e)}") from e
95
-
96
  except Exception as e:
97
  logger.error("Unexpected error occurred: %s", str(e))
98
  raise RuntimeError(f"Failed to retrieve LinkedIn profile: {str(e)}") from e
99
-
100
  finally:
101
  # Always clean up the driver
102
  if driver:
@@ -123,5 +137,5 @@ def setup_chrome_driver_options() -> Options:
123
  chrome_options.add_argument("--disable-blink-features=AutomationControlled")
124
  chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
125
  chrome_options.add_experimental_option('useAutomationExtension', False)
126
-
127
  return chrome_options
 
5
  GitHub profiles, and job postings using browser automation.
6
  """
7
 
8
+ import time
9
+ import logging
10
+
11
  from selenium import webdriver
12
  from selenium.webdriver.chrome.options import Options
13
  from selenium.webdriver.common.by import By
14
  from selenium.webdriver.support.ui import WebDriverWait
15
  from selenium.webdriver.support import expected_conditions as EC
16
  from selenium.common.exceptions import TimeoutException, WebDriverException
 
 
17
 
18
  # Set up logging
19
  logging.basicConfig(level=logging.INFO)
 
36
  WebDriverException: If there's an issue with the browser automation
37
  TimeoutException: If the page takes too long to load
38
  """
39
+
40
  # Validate LinkedIn URL
41
  if not profile_url or not isinstance(profile_url, str):
42
  raise ValueError("Profile URL must be a non-empty string")
43
+
44
  if "linkedin.com/in/" not in profile_url:
45
  raise ValueError("URL must be a valid LinkedIn profile URL (containing 'linkedin.com/in/')")
46
+
47
  # Configure Chrome options for headless browsing
48
  chrome_options = Options()
49
  chrome_options.add_argument("--headless") # Run in background
 
51
  chrome_options.add_argument("--disable-dev-shm-usage")
52
  chrome_options.add_argument("--disable-gpu")
53
  chrome_options.add_argument("--window-size=1920,1080")
54
+ chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
55
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
56
+
57
  driver = None
58
  try:
59
  # Initialize the Chrome driver
60
  logger.info("Initializing browser for URL: %s", profile_url)
61
  driver = webdriver.Chrome(options=chrome_options)
62
  driver.set_page_load_timeout(30)
63
+
64
  # Navigate to the LinkedIn profile
65
  logger.info("Navigating to LinkedIn profile...")
66
  driver.get(profile_url)
67
+
68
  # Wait for the page to load
69
  # Look for common LinkedIn profile elements
70
  wait = WebDriverWait(driver, wait_time)
71
+
72
  try:
73
  # Wait for either the main content or login prompt
74
  wait.until(
75
  EC.any_of(
76
+ EC.presence_of_element_located(( # Profile header
77
+ By.CSS_SELECTOR,
78
+ ".pv-top-card"
79
+ )),
80
+ EC.presence_of_element_located(( # Profile section
81
+ By.CSS_SELECTOR,
82
+ ".profile-section"
83
+ )),
84
+ EC.presence_of_element_located(( # Auth wall
85
+ By.CSS_SELECTOR,
86
+ ".authwall"
87
+ )),
88
+ EC.presence_of_element_located(( # Public profile
89
+ By.CSS_SELECTOR,
90
+ ".public-profile"
91
+ )),
92
  )
93
  )
94
  except TimeoutException:
95
  logger.warning("Standard LinkedIn elements not found, proceeding with current page state")
96
+
97
  # Additional wait to ensure dynamic content loads
98
  time.sleep(2)
99
+
100
  # Get the page HTML
101
  html_content = driver.page_source
102
+
103
  logger.info("Successfully retrieved HTML content (%d characters)", len(html_content))
104
  return html_content
105
+
106
  except WebDriverException as e:
107
  logger.error("WebDriver error occurred: %s", str(e))
108
  raise WebDriverException(f"Browser automation failed: {str(e)}") from e
109
+
110
  except Exception as e:
111
  logger.error("Unexpected error occurred: %s", str(e))
112
  raise RuntimeError(f"Failed to retrieve LinkedIn profile: {str(e)}") from e
113
+
114
  finally:
115
  # Always clean up the driver
116
  if driver:
 
137
  chrome_options.add_argument("--disable-blink-features=AutomationControlled")
138
  chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
139
  chrome_options.add_experimental_option('useAutomationExtension', False)
140
+
141
  return chrome_options
functions/data_acquisition.py ADDED
File without changes
resumate.py CHANGED
@@ -37,7 +37,7 @@ def process_inputs(linkedin_url, github_url, job_post_url):
37
  try:
38
  result += "Attempting to retrieve LinkedIn profile...\n"
39
  html_content = get_linkedin_profile_html(linkedin_url)
40
- result += f"✅ Successfully retrieved LinkedIn profile HTML ({len(html_content)} characters)\n"
41
  except Exception as e: # pylint: disable=broad-exception-caught
42
  result += f"❌ Failed to retrieve LinkedIn profile: {str(e)}\n"
43
 
 
37
  try:
38
  result += "Attempting to retrieve LinkedIn profile...\n"
39
  html_content = get_linkedin_profile_html(linkedin_url)
40
+ result += f"LinkedIn profile HTML ({len(html_content)} characters)\n"
41
  except Exception as e: # pylint: disable=broad-exception-caught
42
  result += f"❌ Failed to retrieve LinkedIn profile: {str(e)}\n"
43