import hashlib
import os
import re
import tempfile
from typing import Dict, List, Optional
from urllib.parse import urlparse

import pdfplumber
import requests
| |
|
class PDFParser:
    """Download curriculum PDFs and extract course records from them.

    Courses are looked up in page tables first; when too few are found the
    plain page text is scanned line by line as a fallback. Every course is a
    dict with the keys ``id``, ``program_id``, ``semester``, ``name``,
    ``credits``, ``hours``, ``type``, ``source_pdf`` and ``source_page``.
    """

    # Directory (relative to the working directory) downloads are saved to.
    RAW_DATA_DIR = 'data/raw'

    # "<n> семестр" with n in 1..4; the look-behind keeps "11 семестр" from
    # being read as semester 1. Also matches inflected forms (семестра, ...).
    _SEMESTER_RE = re.compile(r'(?<!\d)([1-4])\s+семестр')

    # Course name: starts with an uppercase Cyrillic letter. Ё/ё are listed
    # explicitly because they lie outside the А-Я / а-я code point ranges.
    _NAME = r'([А-ЯЁ][А-ЯЁа-яё\s\-\(\)]+?)'

    # (compiled pattern, meaning of the captured numbers), tried in order.
    _LINE_PATTERNS = (
        (re.compile(_NAME + r'\s+(\d+)\s+(\d+)'), 'both'),
        (re.compile(_NAME + r'\s+(\d+)\s*кр'), 'credits'),
        (re.compile(_NAME + r'\s+(\d+)\s*ч'), 'hours'),
    )

    # Cell values that are column captions rather than course names.
    _HEADER_CELLS = frozenset({'название', 'дисциплина', 'курс', 'предмет'})

    # Substrings that mark a course as elective.
    _ELECTIVE_MARKERS = ('по выбору', 'электив', 'факультатив')

    def __init__(self):
        """Create an HTTP session with a browser-like User-Agent header."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def download_pdf(self, url: str, filename: str) -> Optional[str]:
        """Download a PDF and store it under ``data/raw``.

        Args:
            url: Address of the PDF.
            filename: File name to save the download as.

        Returns:
            Path of the saved file, or ``None`` if the download or the write
            failed (the error is printed, not raised).
        """
        try:
            print(f'Скачивание PDF: {filename}')
            # The context manager releases the streamed connection back to
            # the pool even when writing the file fails half-way.
            with self.session.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()

                os.makedirs(self.RAW_DATA_DIR, exist_ok=True)
                filepath = os.path.join(self.RAW_DATA_DIR, filename)
                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)

            print(f'PDF сохранен: {filepath}')
            return filepath

        except Exception as e:
            # Best-effort boundary: callers only check for None.
            print(f'Ошибка скачивания PDF {url}: {e}')
            return None

    def parse_pdf(self, filepath: str, program_id: str,
                  min_table_courses: int = 5) -> List[Dict]:
        """Parse a PDF and return the de-duplicated courses found in it.

        Args:
            filepath: Path of the PDF to open.
            program_id: Identifier copied into every course record.
            min_table_courses: If table extraction yields fewer courses than
                this, the page text is scanned as well.

        Returns:
            List of course dicts; an empty list if parsing failed (the error
            is printed, not raised).
        """
        try:
            print(f'Парсинг PDF: {filepath}')
            courses: List[Dict] = []

            with pdfplumber.open(filepath) as pdf:
                table_courses = self._extract_from_tables(pdf, program_id)
                if table_courses:
                    courses.extend(table_courses)
                    print(f'Извлечено из таблиц: {len(table_courses)} курсов')

                # Tables gave too little: fall back to line-based text scan.
                if len(courses) < min_table_courses:
                    text_courses = self._extract_from_text(pdf, program_id)
                    courses.extend(text_courses)
                    print(f'Извлечено из текста: {len(text_courses)} курсов')

            courses = self._deduplicate_courses(courses)

            # The row/line parsers do not know the file name, so the source
            # is stamped here for every record.
            source_pdf = os.path.basename(filepath)
            for course in courses:
                course['source_pdf'] = source_pdf

            print(f'Всего извлечено курсов: {len(courses)}')
            return courses

        except Exception as e:
            print(f'Ошибка парсинга PDF {filepath}: {e}')
            return []

    def _extract_from_tables(self, pdf, program_id: str) -> List[Dict]:
        """Collect courses from every table on every page of ``pdf``.

        The semester detected in a table header carries over to following
        tables and pages until another one is detected. A failure on one page
        is printed and the remaining pages are still processed.
        """
        courses = []
        current_semester = 1

        for page_num, page in enumerate(pdf.pages):
            try:
                for table in page.extract_tables():
                    if not table or len(table) < 2:
                        continue

                    current_semester = self._detect_semester_from_table(
                        table, current_semester)

                    # The first row is treated as the header and skipped.
                    for row in table[1:]:
                        if not row or len(row) < 2:
                            continue
                        course = self._parse_table_row(
                            row, program_id, current_semester, page_num + 1)
                        if course:
                            courses.append(course)

            except Exception as e:
                print(f'Ошибка обработки страницы {page_num + 1}: {e}')
                continue

        return courses

    def _extract_from_text(self, pdf, program_id: str) -> List[Dict]:
        """Collect courses from the plain text of every page of ``pdf``.

        The semester detected on a page carries over to following pages until
        another one is detected. A failure on one page is printed and the
        remaining pages are still processed.
        """
        courses = []
        current_semester = 1

        for page_num, page in enumerate(pdf.pages):
            try:
                text = page.extract_text()
                if not text:
                    continue

                current_semester = self._detect_semester_from_text(
                    text, current_semester)
                courses.extend(self._parse_text_for_courses(
                    text, program_id, current_semester, page_num + 1))

            except Exception as e:
                print(f'Ошибка обработки текста страницы {page_num + 1}: {e}')
                continue

        return courses

    def _find_semester(self, text: str, fallback: int) -> int:
        """Return the semester number (1-4) mentioned in ``text``.

        If several are mentioned the lowest one wins; if none is mentioned
        ``fallback`` is returned.
        """
        found = [int(number) for number in self._SEMESTER_RE.findall(text.lower())]
        return min(found) if found else fallback

    def _detect_semester_from_table(self, table: List[List], current_semester: int) -> int:
        """Detect the semester from the first (header) row of ``table``.

        Returns ``current_semester`` when the header is missing or names no
        semester.
        """
        if not table or not table[0]:
            return current_semester

        header_text = ' '.join(str(cell) for cell in table[0] if cell)
        return self._find_semester(header_text, current_semester)

    def _detect_semester_from_text(self, text: str, current_semester: int) -> int:
        """Detect the semester from page text, else return ``current_semester``."""
        return self._find_semester(text, current_semester)

    @staticmethod
    def _make_course_id(program_id: str, semester: int, course_name: str) -> str:
        """Build a course id that is stable and distinct per course name.

        A short digest of the name is used so that different courses of the
        same program and semester do not share an id.
        """
        digest = hashlib.sha1(course_name.encode('utf-8')).hexdigest()[:10]
        return f'{program_id}_{semester}_{digest}'

    def _parse_table_row(self, row: List, program_id: str, semester: int, page: int) -> Optional[Dict]:
        """Turn one table row into a course dict.

        The name is the first cell longer than 10 characters that is not a
        pure number. Numeric cells in 1..12 are read as credits and in
        18..216 as hours (the last matching cell wins).

        Returns:
            The course dict (``source_pdf`` is left empty for the caller to
            fill), or ``None`` if the row has fewer than two cells or no
            course name.
        """
        if not row or len(row) < 2:
            return None

        clean_row = [str(cell).strip() if cell else '' for cell in row]

        course_name = next(
            (cell for cell in clean_row
             if cell
             and cell.lower() not in self._HEADER_CELLS
             and len(cell) > 10
             and not cell.isdigit()),
            '',
        )
        if not course_name:
            return None

        credits = 0
        hours = 0
        for cell in clean_row:
            if not cell.isdigit():
                continue
            num = int(cell)
            if 1 <= num <= 12:
                credits = num
            elif 18 <= num <= 216:
                hours = num

        row_text = ' '.join(clean_row).lower()
        is_elective = any(marker in row_text for marker in self._ELECTIVE_MARKERS)

        return {
            'id': self._make_course_id(program_id, semester, course_name),
            'program_id': program_id,
            'semester': semester,
            'name': course_name,
            'credits': credits,
            'hours': hours,
            'type': 'elective' if is_elective else 'required',
            'source_pdf': '',
            'source_page': page
        }

    def _parse_text_for_courses(self, text: str, program_id: str, semester: int, page: int) -> List[Dict]:
        """Scan ``text`` line by line and return the courses recognised.

        Lines shorter than 10 characters after stripping are ignored.
        """
        courses = []

        for line in text.split('\n'):
            line = line.strip()
            if len(line) < 10:
                continue

            course = self._extract_course_from_line(line, program_id, semester, page)
            if course:
                courses.append(course)

        return courses

    def _extract_course_from_line(self, line: str, program_id: str, semester: int, page: int) -> Optional[Dict]:
        """Extract a course from one text line.

        Recognised shapes, tried in order: ``<name> <credits> <hours>``,
        ``<name> <n> кр...`` (credits) and ``<name> <n> ч...`` (hours). For
        the single-number shapes the unit suffix decides which field is set.

        Returns:
            The course dict (``source_pdf`` is left empty for the caller to
            fill), or ``None`` if no shape matches with a name of at least
            5 characters.
        """
        for pattern, kind in self._LINE_PATTERNS:
            match = pattern.search(line)
            if not match:
                continue

            course_name = match.group(1).strip()
            if len(course_name) < 5:
                continue

            credits = 0
            hours = 0
            if kind == 'both':
                credits, hours = int(match.group(2)), int(match.group(3))
            elif kind == 'credits':
                credits = int(match.group(2))
            else:
                hours = int(match.group(2))

            line_lower = line.lower()
            is_elective = any(marker in line_lower for marker in self._ELECTIVE_MARKERS)

            return {
                'id': self._make_course_id(program_id, semester, course_name),
                'program_id': program_id,
                'semester': semester,
                'name': course_name,
                'credits': credits,
                'hours': hours,
                'type': 'elective' if is_elective else 'required',
                'source_pdf': '',
                'source_page': page
            }

        return None

    def _deduplicate_courses(self, courses: List[Dict]) -> List[Dict]:
        """Drop repeated courses, keeping the first occurrence of each.

        Two courses are the same when name, semester and program id match.
        """
        seen = set()
        unique_courses = []

        for course in courses:
            # A tuple key cannot be confused by underscores inside the values.
            key = (course['name'], course['semester'], course['program_id'])
            if key not in seen:
                seen.add(key)
                unique_courses.append(course)

        return unique_courses
| |
|
def main():
    """Smoke run: download a sample curriculum PDF and print the first courses."""
    pdf_parser = PDFParser()

    # Placeholder inputs for a manual run.
    sample_url = "https://example.com/test.pdf"
    target_name = "test_curriculum.pdf"

    local_path = pdf_parser.download_pdf(sample_url, target_name)
    if not local_path:
        # Download failed; the parser has already printed the reason.
        return

    parsed = pdf_parser.parse_pdf(local_path, 'test_program')
    print(f'Извлечено курсов: {len(parsed)}')
    # Show at most the first five records.
    for item in parsed[:5]:
        print(f"- {item['name']} ({item['semester']} семестр, {item['credits']} кредитов)")


# Run the demo only when the file is executed as a script.
if __name__ == '__main__':
    main()
| |
|