Spaces:
Runtime error
Runtime error
import time | |
from typing import Dict, List | |
from bs4 import BeautifulSoup | |
from bs4.element import Tag | |
from loguru import logger | |
from selenium.webdriver.common.by import By | |
from llm_engineering.domain.documents import PostDocument | |
from llm_engineering.domain.exceptions import ImproperlyConfigured | |
from llm_engineering.settings import settings | |
from .base import BaseSeleniumCrawler | |
class LinkedInCrawler(BaseSeleniumCrawler):
    """Selenium crawler that collects the posts of a LinkedIn profile.

    The crawler is deprecated by default: while ``is_deprecated`` is true,
    both ``login()`` and ``extract()`` raise ``DeprecationWarning`` before
    touching the driver.
    """

    model = PostDocument

    # Fixed pause (seconds) after each navigation before the page source is read.
    _PAGE_SETTLE_SECONDS = 5

    def __init__(self, scroll_limit: int = 5, is_deprecated: bool = True) -> None:
        """Create the crawler.

        Args:
            scroll_limit (int): Forwarded unchanged to ``BaseSeleniumCrawler.__init__``.
            is_deprecated (bool): When true, ``login()`` and ``extract()`` refuse to run.
        """
        super().__init__(scroll_limit)

        self._is_deprecated = is_deprecated

    def set_extra_driver_options(self, options) -> None:
        """Enable the experimental ``detach`` option on the given driver options.

        NOTE(review): presumably this keeps the browser window alive after the
        driver process ends -- confirm against the driver documentation.
        """
        options.add_experimental_option("detach", True)

    def login(self) -> None:
        """Log in to LinkedIn with the credentials from ``settings``.

        Raises:
            DeprecationWarning: If the crawler was created with ``is_deprecated=True``.
            ImproperlyConfigured: If ``LINKEDIN_USERNAME`` or ``LINKEDIN_PASSWORD`` is unset.
        """
        if self._is_deprecated:
            raise DeprecationWarning(
                "As LinkedIn has updated its security measures, the login() method is no longer supported."
            )

        # Validate the configuration before navigating: there is no point in
        # loading the login page when the form cannot be filled in.
        if not settings.LINKEDIN_USERNAME or not settings.LINKEDIN_PASSWORD:
            raise ImproperlyConfigured(
                "LinkedIn scraper requires the {LINKEDIN_USERNAME} and {LINKEDIN_PASSWORD} settings."
            )

        self.driver.get("https://www.linkedin.com/login")
        self.driver.find_element(By.ID, "username").send_keys(settings.LINKEDIN_USERNAME)
        self.driver.find_element(By.ID, "password").send_keys(settings.LINKEDIN_PASSWORD)
        self.driver.find_element(By.CSS_SELECTOR, ".login__form_action_container button").click()

    def extract(self, link: str, **kwargs) -> None:
        """Scrape the posts of the profile at ``link`` and bulk-insert them.

        Args:
            link (str): URL of the LinkedIn profile.
            **kwargs: Must contain ``user``, an object exposing ``id`` and
                ``full_name``; both are copied onto every stored post.

        Raises:
            DeprecationWarning: If the crawler was created with ``is_deprecated=True``.
            KeyError: If ``user`` is missing from ``kwargs``.
        """
        if self._is_deprecated:
            raise DeprecationWarning(
                "As LinkedIn has updated its feed structure, the extract() method is no longer supported."
            )

        if self.model.link is not None:
            old_model = self.model.find(link=link)
            if old_model is not None:
                logger.info(f"Post already exists in the database: {link}")

                return

        # Fail fast: look the author up before the slow scrape instead of
        # discovering a missing argument only after all the work is done.
        user = kwargs["user"]

        logger.info(f"Starting scrapping data for profile: {link}")

        try:
            self.login()

            soup = self._get_page_content(link)

            # NOTE(review): this profile data is collected but never stored.
            # The calls are kept because they drive the browser through the
            # same navigation sequence as before.
            data = {  # noqa
                "Name": self._scrape_section(soup, "h1", class_="text-heading-xlarge"),
                "About": self._scrape_section(soup, "div", class_="display-flex ph5 pv3"),
                "Main Page": self._scrape_section(soup, "div", {"id": "main-content"}),
                "Experience": self._scrape_experience(link),
                "Education": self._scrape_education(link),
            }

            # The detail pages above navigated away; return to the profile.
            self.driver.get(link)
            time.sleep(self._PAGE_SETTLE_SECONDS)

            button = self.driver.find_element(
                By.CSS_SELECTOR, ".app-aware-link.profile-creator-shared-content-view__footer-action"
            )
            button.click()

            # Scrolling and scraping posts
            self.scroll_page()
            soup = BeautifulSoup(self.driver.page_source, "html.parser")
            post_elements = soup.find_all(
                "div",
                class_="update-components-text relative update-components-update-v2__commentary",
            )
            buttons = soup.find_all("button", class_="update-components-image__image-link")
            post_images = self._extract_image_urls(buttons)

            posts = self._extract_posts(post_elements, post_images)
            logger.info(f"Found {len(posts)} posts for profile: {link}")
        finally:
            # Close the window on failure too, so an error does not leak it.
            self.driver.close()

        # `posts` maps "Post_<i>" -> post data. Iterate the values: iterating
        # the dict itself would store only the key labels and drop the content.
        self.model.bulk_insert(
            [
                PostDocument(platform="linkedin", content=post, author_id=user.id, author_full_name=user.full_name)
                for post in posts.values()
            ]
        )

        logger.info(f"Finished scrapping data for profile: {link}")

    def _scrape_section(self, soup: BeautifulSoup, *args, **kwargs) -> str:
        """Return the stripped text of the first element matching the query.

        Args:
            soup (BeautifulSoup): Parsed page to search.
            *args: Positional arguments forwarded to ``soup.find``.
            **kwargs: Keyword arguments forwarded to ``soup.find``.

        Returns:
            str: The element's text, or an empty string when nothing matches.
        """
        parent_div = soup.find(*args, **kwargs)

        return parent_div.get_text(strip=True) if parent_div else ""

    def _extract_image_urls(self, buttons: List[Tag]) -> Dict[str, str]:
        """
        Extracts image URLs from button elements.

        Args:
            buttons (List[Tag]): A list of BeautifulSoup Tag objects representing buttons.

        Returns:
            Dict[str, str]: A dictionary mapping ``"Post_<i>"`` to the image URL,
                where ``i`` is the position of the button in ``buttons``.
        """
        # NOTE(review): keys use the button's index, which matches a post's
        # index only if every post has exactly one image button -- confirm.
        post_images = {}
        for i, button in enumerate(buttons):
            img_tag = button.find("img")
            if img_tag and "src" in img_tag.attrs:
                post_images[f"Post_{i}"] = img_tag["src"]
            else:
                logger.warning("No image found in this button")

        return post_images

    def _get_page_content(self, url: str) -> BeautifulSoup:
        """Open ``url``, pause for the page to settle, and return its parsed source."""
        self.driver.get(url)
        time.sleep(self._PAGE_SETTLE_SECONDS)

        return BeautifulSoup(self.driver.page_source, "html.parser")

    def _extract_posts(self, post_elements: List[Tag], post_images: Dict[str, str]) -> Dict[str, Dict[str, str]]:
        """
        Extracts post texts and combines them with their respective images.

        Args:
            post_elements (List[Tag]): A list of BeautifulSoup Tag objects representing post elements.
            post_images (Dict[str, str]): A dictionary containing image URLs mapped by post index.

        Returns:
            Dict[str, Dict[str, str]]: A dictionary keyed by ``"Post_<i>"`` whose values hold
                the post ``"text"`` and, when one was found, its ``"image"`` URL.
        """
        posts_data = {}
        for i, post_element in enumerate(post_elements):
            key = f"Post_{i}"
            post_data = {"text": post_element.get_text(strip=True, separator="\n")}
            if key in post_images:
                post_data["image"] = post_images[key]
            posts_data[key] = post_data

        return posts_data

    def _scrape_details_section(self, profile_url: str, details_path: str, section_id: str) -> str:
        """Return the text of one ``<section>`` on a profile ``/details/`` page.

        Args:
            profile_url (str): Profile URL, with or without a trailing slash.
            details_path (str): Path segment under ``/details/``.
            section_id (str): ``id`` attribute of the section to read.

        Returns:
            str: The section's stripped text, or an empty string when it is absent.
        """
        # Strip a trailing slash so the joined URL never contains "//details".
        details_url = f"{profile_url.rstrip('/')}/details/{details_path}/"
        soup = self._get_page_content(details_url)

        return self._scrape_section(soup, "section", {"id": section_id})

    def _scrape_experience(self, profile_url: str) -> str:
        """Scrapes the Experience section of the LinkedIn profile."""
        return self._scrape_details_section(profile_url, "experience", "experience-section")

    def _scrape_education(self, profile_url: str) -> str:
        """Scrapes the Education section of the LinkedIn profile."""
        return self._scrape_details_section(profile_url, "education", "education-section")