import hashlib
import json
import os
from typing import Dict, List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


class HTMLScraper:
    """Scrapes ITMO master's program pages and collects links to curriculum PDFs."""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.program_urls = {
            'ai': 'https://abit.itmo.ru/program/master/ai',
            'ai_product': 'https://abit.itmo.ru/program/master/ai_product',
        }

    def scrape_programs(self) -> Dict:
        """Scrape every configured program page; a failure on one page skips it rather than aborting the run."""
        programs = {}
        for program_id, url in self.program_urls.items():
            try:
                print(f'Scraping program {program_id}...')
                programs[program_id] = self._scrape_program_page(url, program_id)
            except Exception as e:
                print(f'Error while scraping {program_id}: {e}')
        return programs

    def _scrape_program_page(self, url: str, program_id: str) -> Dict:
        response = self.session.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        title = self._extract_title(soup)
        description = self._extract_description(soup)
        pdf_links = self._extract_pdf_links(soup, url)

        return {
            'id': program_id,
            'title': title,
            'description': description,
            'url': url,
            'pdf_links': pdf_links,
            # Content hash lets later runs detect whether the page changed.
            'hash': self._calculate_hash(response.content),
        }

    def _extract_title(self, soup: BeautifulSoup) -> str:
        # Prefer the page's <h1>; fall back to the <title> tag.
        title_elem = soup.find('h1') or soup.find('title')
        return title_elem.get_text().strip() if title_elem else ''

    def _extract_description(self, soup: BeautifulSoup) -> str:
        # Try increasingly generic selectors and accept the first match that
        # looks like real prose (more than 50 characters), capped at 500.
        desc_selectors = [
            '.program-description', '.description',
            '.program-info', 'p', '.content',
        ]
        for selector in desc_selectors:
            elem = soup.select_one(selector)
            if elem:
                text = elem.get_text().strip()
                if len(text) > 50:
                    return text[:500]
        return ''

    def _extract_pdf_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        pdf_links = []
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text().strip().lower()
            if self._is_pdf_link(href, text):
                pdf_links.append({
                    'url': self._make_absolute_url(href, base_url),
                    'text': text,
                    'filename': self._extract_filename(href),
                })
        return pdf_links

    def _is_pdf_link(self, href: str, text: str) -> bool:
        # The Russian keywords match link text on abit.itmo.ru:
        # 'учебный план' means 'curriculum', 'программа' means 'program'.
        pdf_indicators = [
            'учебный план', 'curriculum', 'plan',
            'pdf', '.pdf', 'программа', 'program',
        ]
        href_lower = href.lower()
        return any(indicator in href_lower or indicator in text
                   for indicator in pdf_indicators)

    def _make_absolute_url(self, href: str, base_url: str) -> str:
        # urljoin handles absolute, root-relative, and page-relative hrefs
        # correctly, unlike naive string concatenation against the page URL.
        return urljoin(base_url, href)

    def _extract_filename(self, href: str) -> str:
        # Take the last path segment and drop any query string.
        filename = href.split('/')[-1].split('?')[0]
        if not filename.endswith('.pdf'):
            filename += '.pdf'
        return filename

    def _calculate_hash(self, content: bytes) -> str:
        return hashlib.sha256(content).hexdigest()

    def save_programs(self, programs: Dict, output_path: str = 'data/processed/programs.json'):
        # Fall back to '.' so a bare filename does not break makedirs.
        os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(programs, f, ensure_ascii=False, indent=2)
        print(f'Programs saved to {output_path}')


def main():
    scraper = HTMLScraper()
    programs = scraper.scrape_programs()
    scraper.save_programs(programs)
    for program in programs.values():
        print(f'\n{program["title"]}:')
        print(f'PDF links found: {len(program["pdf_links"])}')
        for link in program['pdf_links']:
            print(f'  - {link["filename"]}: {link["url"]}')


if __name__ == '__main__':
    main()