Spaces:

balancet1
/

task-parser

Running

App Files Files Community

Artem Nikolaev committed 14 days ago

Commit

ca8ebf7

1 Parent(s): 3f3962f

PDF Task Parser with FastAPI and Google integration

Browse files

Files changed (26) hide show

Dockerfile +14 -0
backend.py +257 -0
requirements.txt +17 -0
src/__init__.py +0 -0
src/__pycache__/__init__.cpython-314.pyc +0 -0
src/__pycache__/excel_exporter.cpython-314.pyc +0 -0
src/__pycache__/google_calendar.cpython-314.pyc +0 -0
src/__pycache__/google_sheets.cpython-314.pyc +0 -0
src/__pycache__/main_with_calendar.cpython-314.pyc +0 -0
src/__pycache__/parser.cpython-314.pyc +0 -0
src/__pycache__/summarizer.cpython-314.pyc +0 -0
src/excel_exporter.py +105 -0
src/google_calendar.py +190 -0
src/google_sheets.py +127 -0
src/parser.py +653 -0
src/summarizer.py +122 -0
web/.DS_Store +0 -0
web/icons/.DS_Store +0 -0
web/icons/calendarLogo.svg +16 -0
web/icons/favicon.svg +7 -0
web/icons/sheetsLogo-svgrepo-com.svg +7 -0
web/icons/titleIcon.svg +37 -0
web/icons/upload.svg +27 -0
web/index.html +92 -0
web/script.js +230 -0
web/style.css +253 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,14 @@

+FROM python:3.11-slim
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+RUN mkdir -p temp_uploads output
+EXPOSE 7860
+CMD ["uvicorn", "backend:app", "--host", "0.0.0.0", "--port", "7860"]

backend.py ADDED Viewed

	@@ -0,0 +1,257 @@

+from fastapi import FastAPI, UploadFile, File, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse, HTMLResponse
+import tempfile
+import os
+import sys
+import base64
+import uvicorn
+import pandas as pd
+from io import BytesIO
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from src.parser import TaskParser
+from src.excel_exporter import ExcelExporter
+from src.summarizer import TaskSummarizer
+from src.google_sheets import GoogleSheetsExporter
+from src.google_calendar import GoogleCalendarExporter
+import json
+GOOGLE_CREDS = os.environ.get("GOOGLE_CREDENTIALS")
+if GOOGLE_CREDS:
+    os.makedirs("credentials", exist_ok=True)
+    with open("credentials/google-credentials.json", "w") as f:
+        f.write(GOOGLE_CREDS)
+    print("✅ Google-ключ загружен из секретов")
+# FastAPI application object; all routes below are registered on it.
+app = FastAPI(title="PDF Task Parser API")
+# CORS is fully open: any origin, method and header is accepted.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Directory created at import time (relative to the working directory).
+TEMP_DIR = "temp_uploads"
+os.makedirs(TEMP_DIR, exist_ok=True)
+summarizer = None
+def get_summarizer():
+    global summarizer
+    if summarizer is None:
+        try:
+            print("🔄 Загрузка суммаризатора...")
+            summarizer = TaskSummarizer()
+            print("✅ Суммаризатор загружен")
+        except Exception as e:
+            print(f"⚠️ Суммаризатор не загружен: {e}")
+            summarizer = False
+    return summarizer if summarizer is not False else None
+@app.get("/")
+async def root():
+    return {"message": "PDF Task Parser API", "status": "running"}
+@app.get("/app")
+async def get_app():
+    html_path = os.path.join("web", "index.html")
+    if os.path.exists(html_path):
+        with open(html_path, "r", encoding="utf-8") as f:
+            return HTMLResponse(content=f.read())
+    return HTMLResponse(content="<h1>index.html not found</h1>", status_code=404)
+@app.get("/style.css")
+async def get_css():
+    with open("web/style.css", "r", encoding="utf-8") as f:
+        return HTMLResponse(content=f.read(), media_type="text/css")
+@app.get("/script.js")
+async def get_js():
+    with open("web/script.js", "r", encoding="utf-8") as f:
+        return HTMLResponse(content=f.read(), media_type="application/javascript")
+@app.get("/web/icons/{icon_name}")
+async def get_icon(icon_name: str):
+    icon_path = os.path.join("web", "icons", icon_name)
+    if os.path.exists(icon_path):
+        with open(icon_path, "rb") as f:
+            return HTMLResponse(content=f.read(), media_type="image/svg+xml")
+    return HTMLResponse(status_code=404)
+@app.post("/parse-batch")
+async def parse_batch(request: Request):
+    print("\n" + "="*60)
+    print("🔍 ПОЛУЧЕН ЗАПРОС НА ПАРСИНГ")
+    print("="*60)
+    form = await request.form()
+    files = form.getlist("files")
+    export_to_sheets = form.get("export_to_sheets", "false").lower() == "true"
+    export_to_calendar = form.get("export_to_calendar", "false").lower() == "true"
+    sheets_url = form.get("sheets_url", "")
+    calendar_id = form.get("calendar_id", "")
+    print(f"📄 Файлов: {len(files)}")
+    print(f"📊 Экспорт в Sheets: {export_to_sheets}")
+    print(f"📅 Экспорт в Calendar: {export_to_calendar}")
+    print(f"🔗 URL Sheets: {sheets_url}")
+    print(f"📆 ID Calendar: {calendar_id}")
+    print("="*60 + "\n")
+    all_results = []
+    all_tasks_data = []
+    sheets_export_status = None
+    calendar_export_status = None
+    all_dfs = []
+    for file in files:
+        file_ext = os.path.splitext(file.filename)[1].lower()
+        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp:
+            content = await file.read()
+            tmp.write(content)
+            tmp_path = tmp.name
+        try:
+            parser = TaskParser(tmp_path)
+            text = parser.extract_text()
+            tasks = parser.parse_tasks(text)
+            if tasks:
+                summarizer = get_summarizer()
+                for task in tasks:
+                    task['source'] = file.filename
+                    if summarizer:
+                        try:
+                            task['summary'] = summarizer.summarize(task['full_description'])
+                        except Exception:
+                            task['summary'] = task['full_description'][:100] + "..."
+                    else:
+                        task['summary'] = task['full_description'][:100] + "..."
+                    all_tasks_data.append(task)
+                df_data = []
+                for task in tasks:
+                    df_data.append({
+                        '№': task['number'],
+                        'Краткое описание': task.get('summary', ''),
+                        'Описание': task['full_description'],
+                        'Ответственный': task.get('responsible', ''),
+                        'Срок': task.get('due_date_str', '')
+                    })
+                df = pd.DataFrame(df_data)
+                all_dfs.append({
+                    "df": df,
+                    "filename": file.filename,
+                    "tasks": tasks
+                })
+                all_results.append({
+                    "filename": file.filename,
+                    "tasks": tasks,
+                    "count": len(tasks)
+                })
+            os.remove(tmp_path)
+        except Exception as e:
+            print(f"Ошибка в {file.filename}: {e}")
+            if os.path.exists(tmp_path):
+                os.remove(tmp_path)
+    if not all_tasks_data:
+        return JSONResponse({
+            "success": False,
+            "error": "Задачи не найдены ни в одном файле"
+        })
+    # ===== ЭКСПОРТ В GOOGLE SHEETS =====
+    if export_to_sheets and sheets_url:
+        try:
+            print("📊 Экспорт в Google Sheets...")
+            sheets_exporter = GoogleSheetsExporter()
+            if sheets_exporter.use_existing_spreadsheet(sheets_url):
+                for item in all_dfs:
+                    sheet_name = os.path.splitext(item['filename'])[0][:30]
+                    sheet_name = sheet_name.replace(' ', '_').replace('/', '_')
+                    sheets_exporter.export_dataframe(item['df'], sheet_name)
+                sheets_export_status = "success"
+                print("✅ Экспорт в Google Sheets выполнен")
+            else:
+                sheets_export_status = "error: таблица не найдена"
+                print("❌ Таблица не найдена")
+        except Exception as e:
+            sheets_export_status = f"error: {str(e)}"
+            print(f"❌ Ошибка Sheets: {e}")
+    # ===== ЭКСПОРТ В GOOGLE CALENDAR =====
+    if export_to_calendar and calendar_id:
+        try:
+            print(f"📅 Экспорт в Google Calendar...")
+            print(f"   ID календаря: {calendar_id}")
+            print(f"   Количество задач: {len(all_tasks_data)}")
+            calendar_exporter = GoogleCalendarExporter(calendar_id=calendar_id)
+            calendar_exporter.create_events_from_tasks(all_tasks_data)
+            calendar_export_status = "success"
+            print("✅ Экспорт в Google Calendar выполнен")
+        except Exception as e:
+            calendar_export_status = f"error: {str(e)}"
+            print(f"❌ Ошибка Calendar: {e}")
+    else:
+        print(f"⚠️ Экспорт в Calendar пропущен: export_to_calendar={export_to_calendar}, calendar_id={calendar_id}")
+    # ===== СОЗДАЁМ EXCEL =====
+    exporter = ExcelExporter()
+    for item in all_dfs:
+        sheet_name = os.path.splitext(item['filename'])[0][:30]
+        sheet_name = sheet_name.replace(' ', '_').replace('/', '_').replace('\\', '_')
+        exporter.add_sheet(item['df'], sheet_name)
+    all_df_data = []
+    for task in all_tasks_data:
+        all_df_data.append({
+            'Источник': task.get('source', ''),
+            '№': task['number'],
+            'Краткое описание': task.get('summary', ''),
+            'Описание': task['full_description'],
+            'Ответственный': task.get('responsible', ''),
+            'Срок': task.get('due_date_str', '')
+        })
+    all_df = pd.DataFrame(all_df_data)
+    exporter.add_sheet(all_df, "Все задачи")
+    excel_buffer = BytesIO()
+    exporter.save_to_buffer(excel_buffer)
+    excel_bytes = excel_buffer.getvalue()
+    excel_base64 = base64.b64encode(excel_bytes).decode('ascii')
+    total_stats = {
+        "total": len(all_tasks_data),
+        "with_responsible": sum(1 for t in all_tasks_data if t.get('responsible')),
+        "with_date": sum(1 for t in all_tasks_data if t.get('due_date_str')),
+        "files_count": len(all_results)
+    }
+    return {
+        "success": True,
+        "tasks": all_tasks_data,
+        "statistics": total_stats,
+        "excel_base64": excel_base64,
+        "files": [{"name": r["filename"], "count": r["count"]} for r in all_results],
+        "sheets_export": sheets_export_status,
+        "calendar_export": calendar_export_status
+    }
+# Direct execution (python backend.py): serve the app with uvicorn on port 8000.
+if __name__ == "__main__":
+    print("🚀 Запуск PDF Task Parser API")
+    uvicorn.run(app, host="0.0.0.0", port=8000)

requirements.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+fastapi==0.135.1
+uvicorn==0.42.0
+python-multipart==0.0.22
+pandas==3.0.1
+openpyxl==3.1.5
+pdfplumber==0.11.9
+python-docx==1.2.0
+google-auth==2.48.0
+google-api-python-client==2.190.0
+gspread==6.2.1
+transformers==5.2.0
+torch==2.10.0
+sentencepiece==0.2.1
+tiktoken==0.12.0
+python-dateutil==2.9.0.post0
+certifi==2026.2.25
+requests==2.32.5

src/__init__.py ADDED Viewed

File without changes

src/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (143 Bytes). View file

src/__pycache__/excel_exporter.cpython-314.pyc ADDED Viewed

Binary file (6.23 kB). View file

src/__pycache__/google_calendar.cpython-314.pyc ADDED Viewed

Binary file (9.01 kB). View file

src/__pycache__/google_sheets.cpython-314.pyc ADDED Viewed

Binary file (6.5 kB). View file

src/__pycache__/main_with_calendar.cpython-314.pyc ADDED Viewed

Binary file (12.3 kB). View file

src/__pycache__/parser.cpython-314.pyc ADDED Viewed

Binary file (32.6 kB). View file

src/__pycache__/summarizer.cpython-314.pyc ADDED Viewed

Binary file (6.36 kB). View file

src/excel_exporter.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import pandas as pd
+from openpyxl import Workbook
+from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
+from datetime import datetime
+import os
+from io import BytesIO
+class ExcelExporter:
+    def __init__(self, filename=None):
+        self.filename = filename
+        self.base_filename = "output/tasks.xlsx"
+        if filename and os.path.exists(filename):
+            from openpyxl import load_workbook
+            self.wb = load_workbook(filename)
+        else:
+            self.wb = Workbook()
+            if "Sheet" in self.wb.sheetnames:
+                self.wb.remove(self.wb["Sheet"])
+    def add_sheet(self, df, sheet_name: str):
+        if sheet_name in self.wb.sheetnames:
+            self.wb.remove(self.wb[sheet_name])
+        ws = self.wb.create_sheet(title=sheet_name)
+        headers = list(df.columns)
+        for col_idx, header in enumerate(headers, 1):
+            ws.cell(row=1, column=col_idx, value=header)
+        for row_idx, row in df.iterrows():
+            for col_idx, value in enumerate(row, 1):
+                cell = ws.cell(row=row_idx + 2, column=col_idx, value=value)
+                if isinstance(value, (datetime, pd.Timestamp)):
+                    cell.number_format = 'DD.MM.YYYY'
+        self._apply_formatting(ws, len(df.columns), len(df))
+    def _apply_formatting(self, ws, num_columns, num_rows):
+        header_font = Font(name='Arial', size=12, bold=True, color='FFFFFF')
+        header_fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')
+        header_alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)
+        cell_alignment = Alignment(horizontal='left', vertical='center', wrap_text=True)
+        date_alignment = Alignment(horizontal='center', vertical='center')
+        border = Border(
+            left=Side(style='thin'),
+            right=Side(style='thin'),
+            top=Side(style='thin'),
+            bottom=Side(style='thin')
+        )
+        for col in range(1, num_columns + 1):
+            cell = ws.cell(row=1, column=col)
+            cell.font = header_font
+            cell.fill = header_fill
+            cell.alignment = header_alignment
+            cell.border = border
+        for row in range(2, num_rows + 2):
+            for col in range(1, num_columns + 1):
+                cell = ws.cell(row=row, column=col)
+                cell.border = border
+                col_letter = ws.cell(row=1, column=col).value
+                if col_letter in ['Срок', 'Дата']:
+                    cell.alignment = date_alignment
+                else:
+                    cell.alignment = cell_alignment
+        for col in ws.columns:
+            max_length = 0
+            col_letter = col[0].column_letter
+            for cell in col:
+                try:
+                    if len(str(cell.value)) > max_length:
+                        max_length = len(str(cell.value))
+                except:
+                    pass
+            adjusted_width = min(max_length + 2, 80)
+            ws.column_dimensions[col_letter].width = adjusted_width
+        ws.freeze_panes = 'A2'
+    def save(self, filename=None):
+        if filename is None:
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            filename = f"output/tasks_{timestamp}.xlsx"
+        os.makedirs(os.path.dirname(filename), exist_ok=True)
+        for sheet in self.wb.worksheets:
+            for row in sheet.iter_rows():
+                for cell in row:
+                    if cell.value is not None:
+                        _ = cell.value
+        self.wb.save(filename)
+        print(f"✅ Excel файл сохранен: {filename}")
+        return filename
+    def save_to_buffer(self, buffer):
+        self.wb.save(buffer)
+        return buffer

src/google_calendar.py ADDED Viewed

	@@ -0,0 +1,190 @@

+from google.oauth2 import service_account
+from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
+import datetime
+import os
+import sys
+class GoogleCalendarExporter:
+    """
+    Класс для экспорта задач в Google Calendar
+    """
+    def __init__(self, credentials_path='credentials/google-credentials.json', calendar_id='primary'):
+        """
+        Инициализация подключения к Google Calendar
+        Args:
+            credentials_path: путь к JSON-файлу с ключами сервисного аккаунта
+            calendar_id: ID календаря ('primary' для основного или конкретный ID)
+        """
+        self.credentials_path = credentials_path
+        self.calendar_id = calendar_id
+        self.service = None
+        # Проверяем наличие файла с ключами
+        if not os.path.exists(credentials_path):
+            print(f"❌ Файл с ключами не найден: {credentials_path}")
+            print("💡 Убедитесь, что файл лежит в папке credentials/")
+            sys.exit(1)
+        self._authenticate()
+    def _authenticate(self):
+        """Аутентификация в Google Calendar API через сервисный аккаунт"""
+        try:
+            # Определяем права доступа (нужны для записи)
+            SCOPES = ['https://www.googleapis.com/auth/calendar']  # Полный доступ к календарю [citation:6]
+            # Загружаем ключи сервисного аккаунта [citation:5]
+            credentials = service_account.Credentials.from_service_account_file(
+                self.credentials_path,
+                scopes=SCOPES
+            )
+            # Создаем сервис для работы с Calendar API [citation:6]
+            self.service = build('calendar', 'v3', credentials=credentials)
+            print("✅ Успешная аутентификация в Google Calendar")
+        except Exception as e:
+            print(f"❌ Ошибка аутентификации: {e}")
+            sys.exit(1)
+    def create_event_from_task(self, task):
+        """
+        Создает событие в календаре из задачи
+        Args:
+            task: словарь с данными задачи (number, summary, full_description, responsible, due_date, due_date_str)
+        Returns:
+            ссылка на созданное событие или None при ошибке
+        """
+        if not self.service:
+            print("❌ Сервис не инициализирован")
+            return None
+        # Проверяем наличие даты
+        if not task.get('due_date'):
+            print(f"⚠️ Задача #{task.get('number', '?')} пропущена: нет даты")
+            return None
+        try:
+            # Формируем событие
+            due_date = task['due_date']
+            # Создаем событие на целый день (если не указано время) [citation:6]
+            # Или можно задать конкретное время, например 10:00
+            event_date = due_date.strftime('%Y-%m-%d')
+            # Берем краткое описание или начало полного
+            summary = task.get('summary', '')
+            if not summary:
+                summary = task.get('full_description', '')[:50] + '...'
+            # Формируем описание события
+            description = f"""
+📋 Задача #{task.get('number', '?')}
+📝 Полное описание:
+{task.get('full_description', '')}
+👤 Ответственный: {task.get('responsible', 'Не указан')}
+📅 Срок: {task.get('due_date_str', '')}
+🔗 Создано автоматически парсером задач
+            """.strip()
+            # Создаем событие [citation:6]
+            event = {
+                'summary': f"Задача #{task['number']}: {summary}",
+                'description': description,
+                'start': {
+                    'date': event_date,  # Целый день
+                },
+                'end': {
+                    'date': event_date,  # Целый день
+                },
+                'reminders': {
+                    'useDefault': True  # Использовать стандартные напоминания
+                }
+            }
+            # Добавляем время, если нужно (например, сделать на 10:00)
+            # event['start']['dateTime'] = f"{event_date}T10:00:00+03:00"
+            # event['end']['dateTime'] = f"{event_date}T11:00:00+03:00"
+            # Создаем событие в календаре [citation:1]
+            created_event = self.service.events().insert(
+                calendarId=self.calendar_id,
+                body=event
+            ).execute()
+            print(f"✅ Событие создано: {created_event.get('htmlLink')}")
+            return created_event.get('htmlLink')
+        except HttpError as e:
+            print(f"❌ Ошибка API при создании события для задачи #{task.get('number', '?')}: {e}")
+            return None
+        except Exception as e:
+            print(f"❌ Неожиданная ошибка для задачи #{task.get('number', '?')}: {e}")
+            return None
+    def create_events_from_tasks(self, tasks):
+        """
+        Создает события для списка задач
+        Args:
+            tasks: список словарей с задачами
+        Returns:
+            список ссылок на созданные события
+        """
+        results = []
+        print(f"\n📅 Создание событий в календаре (ID: {self.calendar_id})...")
+        for task in tasks:
+            event_link = self.create_event_from_task(task)
+            if event_link:
+                results.append({
+                    'task_number': task.get('number'),
+                    'task_summary': task.get('summary', '')[:30] + '...',
+                    'event_link': event_link
+                })
+        print(f"\n✅ Создано {len(results)} событий из {len(tasks)} задач")
+        return results
+    def check_calendar_access(self):
+        """Проверяет доступ к календарю (выводит список ближайших событий)"""
+        try:
+            now = datetime.datetime.utcnow().isoformat() + 'Z'
+            events_result = self.service.events().list(
+                calendarId=self.calendar_id,
+                timeMin=now,
+                maxResults=5,
+                singleEvents=True,
+                orderBy='startTime'
+            ).execute()
+            events = events_result.get('items', [])
+            if not events:
+                print("📭 В календаре нет предстоящих событий")
+            else:
+                print(f"📅 Ближайшие события в календаре:")
+                for event in events:
+                    start = event['start'].get('dateTime', event['start'].get('date'))
+                    print(f"   • {start}: {event.get('summary', 'Без названия')}")
+            return True
+        except HttpError as e:
+            print(f"❌ Ошибка доступа к календарю: {e}")
+            print("💡 Проверьте, что:")
+            print("   1. Calendar API включен в Google Cloud Console")
+            print("   2. Календарь расшарен на email сервисного аккаунта")
+            print("   3. Calendar ID указан правильно")
+            return False

src/google_sheets.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import gspread
+from google.oauth2.service_account import Credentials
+from google.auth.exceptions import GoogleAuthError
+import pandas as pd
+from datetime import datetime
+import os
+import sys
+class GoogleSheetsExporter:
+    """
+    Класс для экспорта задач в Google Sheets (исправленная версия)
+    """
+    def __init__(self, credentials_path='credentials/google-credentials.json'):
+        """
+        Инициализация подключения к Google Sheets
+        """
+        self.credentials_path = credentials_path
+        self.client = None
+        self.spreadsheet = None
+        # Проверяем наличие файла с ключами
+        if not os.path.exists(credentials_path):
+            print(f"❌ Файл с ключами не найден: {credentials_path}")
+            print("💡 Убедитесь, что файл лежит в папке credentials/")
+            sys.exit(1)
+        self._authenticate()
+    def _authenticate(self):
+        """Аутентификация в Google Sheets API"""
+        try:
+            # Определяем права доступа
+            scope = [
+                'https://www.googleapis.com/auth/spreadsheets',
+                'https://www.googleapis.com/auth/drive'
+            ]
+            # Загружаем ключи
+            credentials = Credentials.from_service_account_file(
+                self.credentials_path,
+                scopes=scope
+            )
+            # Авторизуемся
+            self.client = gspread.authorize(credentials)
+            print("✅ Успешная аутентификация в Google Sheets")
+        except Exception as e:
+            print(f"❌ Ошибка аутентификации: {e}")
+            sys.exit(1)
+    def use_existing_spreadsheet(self, spreadsheet_identifier):
+        """
+        ИСПРАВЛЕНО: Открывает существующую таблицу (по URL, ID или названию)
+        Args:
+            spreadsheet_identifier: URL, ID или название таблицы
+        """
+        try:
+            # Пробуем открыть по URL
+            if spreadsheet_identifier.startswith('https://'):
+                self.spreadsheet = self.client.open_by_url(spreadsheet_identifier)
+                print(f"✅ Открыта таблица по URL")
+            else:
+                # Пробуем открыть по ID или названию
+                try:
+                    self.spreadsheet = self.client.open_by_key(spreadsheet_identifier)
+                except:
+                    self.spreadsheet = self.client.open(spreadsheet_identifier)
+            print(f"✅ Таблица: {self.spreadsheet.title}")
+            return self.spreadsheet
+        except gspread.SpreadsheetNotFound:
+            print(f"❌ Таблица не найдена. Проверьте:")
+            print(f"   1. Правильно ли вы скопировали ссылку/ID")
+            print(f"   2. Расшарили ли таблицу на email сервисного аккаунта")
+            return None
+        except Exception as e:
+            print(f"❌ Ошибка при открытии таблицы: {e}")
+            return None
+    def export_dataframe(self, df, sheet_name='Tasks', clear_sheet=True):
+        """
+        Экспортирует DataFrame в открытую Google таблицу
+        """
+        if self.spreadsheet is None:
+            print("❌ Сначала откройте таблицу через use_existing_spreadsheet()")
+            return False
+        try:
+            # Проверяем, существует ли лист с таким названием
+            try:
+                worksheet = self.spreadsheet.worksheet(sheet_name)
+                if clear_sheet:
+                    worksheet.clear()
+                    print(f"🧹 Лист '{sheet_name}' очищен")
+            except gspread.WorksheetNotFound:
+                # Создаем новый лист
+                worksheet = self.spreadsheet.add_worksheet(
+                    title=sheet_name,
+                    rows=max(100, len(df) + 10),
+                    cols=len(df.columns) + 5
+                )
+                print(f"📄 Создан новый лист: '{sheet_name}'")
+            # Подготавливаем данные
+            headers = df.columns.tolist()
+            data = df.values.tolist()
+            all_data = [headers] + data
+            # Записываем
+            worksheet.update('A1', all_data)
+            print(f"✅ Записано {len(df)} строк в лист '{sheet_name}'")
+            return worksheet
+        except Exception as e:
+            print(f"❌ Ошибка при экспорте: {e}")
+            return False
+    def get_shareable_link(self):
+        """Возвращает ссылку на таблицу"""
+        if self.spreadsheet:
+            return self.spreadsheet.url
+        return None

src/parser.py ADDED Viewed

	@@ -0,0 +1,653 @@

+import pdfplumber
+import re
+from datetime import datetime
+from typing import List, Dict, Optional
+import os
+import subprocess
+import shutil
+class TaskParser:
+    def __init__(self, file_path: str):
+        """Store the file path and set up the keyword lists used by the parser.
+
+        Args:
+            file_path: path of the document; its lower-cased extension is
+                kept in ``file_extension``.
+        """
+        self.file_path = file_path
+        self.tasks = []
+        self.file_extension = os.path.splitext(file_path)[1].lower()
+        # ========== FIELD SETTINGS (MAY BE ADJUSTED) ==========
+        # Keywords used to find the due date
+        self.date_keywords = ['Срок', 'Дата', 'Дедлайн', 'Due', 'Выполнить до']
+        # Keywords used to find the responsible person
+        self.resp_keywords = ['Отв.', 'Исполнитель', 'Ответственный', 'Исп.']
+        # Textual "done" statuses
+        self.status_keywords = ['выполнено', 'выполнен', 'сделано', 'готово']
+        # Separators between a keyword and its value (regex fragment)
+        self.separators = r'\s*(?:—|–|-|:)?\s*'
+        # Words that mark the end of the task section
+        self.after_keywords = [
+            'Протокол вёл',
+            'Лист согласования',
+            'Стр.',
+            'Page',
+            'Ознакомлены',
+            'Подписи',
+            'УТВЕРЖДАЮ',
+            'СОГЛАСОВАНО',
+            'От АО «ТАНЕКО»:',
+            'От ООО «НТЦ Татнефть»:',
+            'От ООО «ЭПИК»:'
+        ]
+        # Words that are ignored before the 'РЕШИЛИ:' marker
+        self.before_keywords = [
+            'СЛУШАЛИ:',
+            'ВЫСТУПИЛИ:',
+            'ДОКЛАДЫВАЛИ:',
+            'ОБСУЖДАЛИ:',
+            'ПОВЕСТКА ДНЯ:',
+            'ПРИСУТСТВОВАЛИ:'
+        ]
+        # ====================================================
+    def extract_text(self) -> str:
+        """Извлекает текст из файла (поддерживает PDF, DOCX и DOC)"""
+        if self.file_extension == '.pdf':
+            return self._extract_from_pdf()
+        elif self.file_extension == '.docx':
+            return self._extract_from_docx()
+        elif self.file_extension == '.doc':
+            return self._extract_from_doc()
+        else:
+            print(f"❌ Неподдерживаемый формат файла: {self.file_extension}")
+            print("   Поддерживаются: .pdf, .docx, .doc")
+            return ""
+    def _extract_from_pdf(self) -> str:
+        full_text = ""
+        try:
+            with pdfplumber.open(self.file_path) as pdf:
+                for page in pdf.pages:
+                    text = page.extract_text()
+                    if text:
+                        full_text += text + "\n"
+            print(f"✅ Извлечено {len(full_text)} символов из PDF")
+            return full_text
+        except Exception as e:
+            print(f"❌ Ошибка при чтении PDF: {e}")
+            return ""
+    def _extract_from_docx(self) -> str:
+        try:
+            from docx import Document
+            doc = Document(self.file_path)
+            full_text = []
+            for para in doc.paragraphs:
+                text = para.text.strip()
+                if text:
+                    try:
+                        import xml.etree.ElementTree as ET
+                        if para._element.xpath('.//w:numPr'):
+                            full_text.append(f"¶ {text}")
+                        else:
+                            full_text.append(text)
+                    except:
+                        full_text.append(text)
+            for table in doc.tables:
+                for row in table.rows:
+                    row_text = []
+                    for cell in row.cells:
+                        if cell.text.strip():
+                            row_text.append(cell.text)
+                    if row_text:
+                        full_text.append(' | '.join(row_text))
+            result = '\n'.join(full_text)
+            print(f"✅ Извлечено {len(result)} символов из Word документа (.docx)")
+            return result
+        except ImportError:
+            print("❌ Библиотека python-docx не установлена")
+            print("   Установите: pip install python-docx")
+            return ""
+        except Exception as e:
+            print(f"❌ Ошибка при чтении Word документа: {e}")
+            return ""
+    def _extract_from_doc(self) -> str:
+        if shutil.which('antiword'):
+            try:
+                result = subprocess.run(['antiword', self.file_path],
+                                       capture_output=True, text=True)
+                if result.returncode == 0:
+                    print(f"✅ Извлечено {len(result.stdout)} символов из Word .doc файла")
+                    return result.stdout
+            except Exception as e:
+                print(f"⚠️ Ошибка antiword: {e}")
+        if shutil.which('soffice'):
+            try:
+                import tempfile
+                temp_dir = tempfile.mkdtemp()
+                result = subprocess.run([
+                    'soffice', '--headless', '--convert-to', 'txt',
+                    '--outdir', temp_dir, self.file_path
+                ], capture_output=True, text=True)
+                if result.returncode == 0:
+                    base_name = os.path.basename(self.file_path).replace('.doc', '.txt')
+                    txt_file = os.path.join(temp_dir, base_name)
+                    if os.path.exists(txt_file):
+                        with open(txt_file, 'r', encoding='utf-8', errors='ignore') as f:
+                            content = f.read()
+                        os.remove(txt_file)
+                        os.rmdir(temp_dir)
+                        print(f"✅ Извлечено {len(content)} символов из Word .doc файла (через LibreOffice)")
+                        return content
+            except Exception as e:
+                print(f"⚠️ Ошибка при конвертации через LibreOffice: {e}")
+        print("❌ Не удалось извлечь текст из .doc файла.")
+        print("   Установите: brew install antiword")
+        return ""
+    def parse_tasks(self, text: str) -> List[Dict]:
+        lines = text.split('\n')
+        has_resheno = False
+        resheno_index = -1
+        for i, line in enumerate(lines[:100]):
+            if 'РЕШИЛИ:' in line:
+                has_resheno = True
+                resheno_index = i
+                print(f"✅ Найден маркер 'РЕШИЛИ:' в строке {i}")
+                break
+        if self.file_extension == '.pdf':
+            print("📄 PDF файл: использую простой парсинг")
+            self.tasks = self._parse_pdf_simple(lines)
+        elif has_resheno:
+            print("📝 Word файл с РЕШИЛИ: использую парсинг протокола")
+            self.tasks = self._parse_word_protocol(lines, resheno_index)
+        else:
+            print("📄 Простой список: использую базовый парсинг")
+            self.tasks = self._parse_simple_list(lines)
+        return self.tasks
+    def _parse_pdf_simple(self, lines: List[str]) -> List[Dict]:
+        tasks = []
+        current_task = None
+        current_description = []
+        решили_index = -1
+        for i, line in enumerate(lines):
+            if 'РЕШИЛИ:' in line:
+                решили_index = i
+                break
+        start_index = решили_index + 1 if решили_index != -1 else 0
+        i = start_index
+        while i < len(lines):
+            line = lines[i].strip()
+            if not line:
+                i += 1
+                continue
+            stop_parsing = False
+            for keyword in self.after_keywords:
+                if keyword in line[:30]:
+                    stop_parsing = True
+                    break
+            if stop_parsing:
+                break
+            task_match = re.match(r'^(\d+)\.\s+(.*)', line)
+            if task_match:
+                if current_task:
+                    full_desc = ' '.join(current_description)
+                    full_desc = re.sub(r'\s+', ' ', full_desc)
+                    # Очищаем описание от метаданных
+                    full_desc = re.sub(r'[;,\s]*Отв\.:\s*[^\.]+?(?:\.|$)', '', full_desc)
+                    full_desc = re.sub(r'[;,\s]*Отв\.:\s*[^С]+?(?:\s+Срок|$)', '', full_desc)
+                    full_desc = re.sub(r'[;,\s]*Отв\.:\s*[^\n]+', '', full_desc)
+                    full_desc = re.sub(r'[;,\s]*Срок\s*[—–-]?\s*\d{2}\.\d{2}\.\d{4}', '', full_desc)
+                    full_desc = re.sub(r'[;,\s]*Срок\s*[—–-]?\s*до\s+конца\s+года', '', full_desc)
+                    full_desc = re.sub(r'[;,\s]*С\b', '', full_desc)
+                    full_desc = re.sub(r'\s+', ' ', full_desc)
+                    full_desc = re.sub(r'\s*[;,]?\s*$', '', full_desc)
+                    full_desc = full_desc.strip()
+                    current_task['full_description'] = full_desc
+                    tasks.append(current_task)
+                task_num = task_match.group(1)
+                task_text = task_match.group(2)
+                current_task = {
+                    'number': int(task_num),
+                    'full_description': '',
+                    'responsible': '',
+                    'due_date': None,
+                    'due_date_str': ''
+                }
+                current_description = [task_text]
+                i += 1
+            elif current_task:
+                current_description.append(line)
+                if 'Отв.:' in line:
+                    resp_match = re.search(r'Отв\.:\s*([^С]+?)(?:\s+Срок|$)', line)
+                    if not resp_match:
+                        resp_match = re.search(r'Отв\.:\s*([^\n]+)', line)
+                    if resp_match:
+                        responsible = resp_match.group(1).strip()
+                        # Обрезаем до ключевых слов
+                        stop_words = self.date_keywords + ['Выполнено', 'Приложение', 'приложение', 'Протокол'] + self.status_keywords
+                        for stop_word in stop_words:
+                            if stop_word in responsible:
+                                responsible = responsible.split(stop_word)[0].strip()
+                                break
+                        # Дополнительная очистка от "Срок" и "Выполнено" в любом регистре
+                        responsible = re.sub(r'\s+Срок.*$', '', responsible, flags=re.IGNORECASE)
+                        responsible = re.sub(r'\s+Выполнено.*$', '', responsible, flags=re.IGNORECASE)
+                        responsible = re.sub(r'\s+до\s+конца\s+года.*$', '', responsible, flags=re.IGNORECASE)
+                        responsible = re.sub(r'\s+', ' ', responsible)
+                        current_task['responsible'] = responsible
+                if 'Срок' in line or any(word in line.lower() for word in self.status_keywords + ['до конца года']):
+                    line_lower = line.lower()
+                    # Проверяем на статусы выполнения
+                    if any(word in line_lower for word in self.status_keywords):
+                        current_task['due_date_str'] = 'Выполнено'
+                    elif 'до конца года' in line_lower:
+                        current_task['due_date_str'] = 'до конца года'
+                    else:
+                        date_match = re.search(r'Срок\s*[—–-]?\s*(\d{2}\.\d{2}\.\d{4})', line)
+                        if date_match:
+                            date_str = date_match.group(1).strip()
+                            current_task['due_date_str'] = date_str
+                            try:
+                                current_task['due_date'] = datetime.strptime(date_str, '%d.%m.%Y').date()
+                            except ValueError:
+                                pass
+                i += 1
+            else:
+                i += 1
+        if current_task:
+            full_desc = ' '.join(current_description)
+            full_desc = re.sub(r'\s+', ' ', full_desc)
+            # Очищаем описание от метаданных
+            full_desc = re.sub(r'[;,\s]*Отв\.:\s*[^\.]+?(?:\.|$)', '', full_desc)
+            full_desc = re.sub(r'[;,\s]*Отв\.:\s*[^С]+?(?:\s+Срок|$)', '', full_desc)
+            full_desc = re.sub(r'[;,\s]*Отв\.:\s*[^\n]+', '', full_desc)
+            full_desc = re.sub(r'[;,\s]*Срок\s*[—–-]?\s*\d{2}\.\d{2}\.\d{4}', '', full_desc)
+            full_desc = re.sub(r'[;,\s]*Срок\s*[—–-]?\s*до\s+конца\s+года', '', full_desc)
+            full_desc = re.sub(r'[;,\s]*С\b', '', full_desc)
+            full_desc = re.sub(r'\s+', ' ', full_desc)
+            full_desc = re.sub(r'\s*[;,]?\s*$', '', full_desc)
+            full_desc = full_desc.strip()
+            current_task['full_description'] = full_desc
+            tasks.append(current_task)
+        return tasks
+    def _parse_word_protocol(self, all_lines: List[str], start_idx: int) -> List[Dict]:
+        tasks = []
+        решили_pos = -1
+        for i, line in enumerate(all_lines):
+            if 'РЕШИЛИ:' in line:
+                решили_pos = i
+                break
+        if решили_pos == -1:
+            return []
+        task_lines = []
+        i = решили_pos + 1
+        while i < len(all_lines) and not all_lines[i].strip():
+            i += 1
+        started = False
+        while i < len(all_lines):
+            line = all_lines[i].strip()
+            stop_found = False
+            for keyword in self.after_keywords:
+                if keyword in line[:30]:
+                    stop_found = True
+                    break
+            if stop_found:
+                break
+            if re.match(r'^\d+$', line):
+                i += 1
+                continue
+            is_service = False
+            for keyword in self.before_keywords:
+                if keyword in line:
+                    is_service = True
+                    break
+            if is_service:
+                i += 1
+                continue
+            if not started and (re.match(r'^\d+[.\t]', line) or line.startswith('¶')):
+                started = True
+            if started and line:
+                task_lines.append(line)
+            i += 1
+        if task_lines:
+            last_line = task_lines[-1]
+            for keyword in self.after_keywords:
+                if keyword in last_line:
+                    task_lines[-1] = last_line.split(keyword)[0].strip()
+                    break
+        if task_lines and not task_lines[-1]:
+            task_lines.pop()
+        i = 0
+        task_counter = 1
+        while i < len(task_lines):
+            line = task_lines[i]
+            is_task_start = False
+            task_number = None
+            description = None
+            match = re.match(r'^(\d+)[.\t]\s*(.*)', line)
+            if match:
+                is_task_start = True
+                task_number = int(match.group(1))
+                description = match.group(2)
+            if not is_task_start and line.startswith('¶'):
+                is_task_start = True
+                task_number = task_counter
+                description = re.sub(r'^¶\s*', '', line)
+            if not is_task_start:
+                has_resp = any(k in line for k in self.resp_keywords)
+                has_date = any(k in line for k in self.date_keywords)
+                is_service = False
+                for keyword in self.before_keywords:
+                    if keyword in line:
+                        is_service = True
+                        break
+                if not has_resp and not has_date and not is_service and len(line) > 20:
+                    is_task_start = True
+                    task_number = task_counter
+                    description = line
+            if is_task_start and description:
+                i += 1
+                responsible = ""
+                due_date_str = ""
+                due_date = None
+                while i < len(task_lines) and not task_lines[i].strip():
+                    i += 1
+                collected_resp = False
+                collected_date = False
+                while i < len(task_lines) and not (collected_resp and collected_date):
+                    current = task_lines[i].strip()
+                    if not current:
+                        i += 1
+                        continue
+                    next_is_task = False
+                    if re.match(r'^\d+[.\t]', current):
+                        next_is_task = True
+                    elif current.startswith('¶'):
+                        next_is_task = True
+                    else:
+                        has_resp_next = any(k in current for k in self.resp_keywords)
+                        has_date_next = any(k in current for k in self.date_keywords)
+                        is_service_next = any(k in current for k in self.before_keywords)
+                        if not has_resp_next and not has_date_next and not is_service_next and len(current) > 20:
+                            next_is_task = True
+                    if next_is_task:
+                        break
+                    if not collected_resp:
+                        for keyword in self.resp_keywords:
+                            if keyword in current:
+                                resp_parts = current.split(keyword)
+                                if len(resp_parts) > 1:
+                                    resp_text = resp_parts[1].strip()
+                                    stop_words = self.date_keywords + ['Выполнено', 'Приложение', 'приложение', 'Протокол'] + self.status_keywords
+                                    for stop_word in stop_words:
+                                        if stop_word in resp_text.lower():
+                                            resp_text = resp_text.split(stop_word)[0].strip()
+                                            break
+                                    # Дополнительная очистка
+                                    resp_text = re.sub(r'\s+Срок.*$', '', resp_text, flags=re.IGNORECASE)
+                                    resp_text = re.sub(r'\s+Выполнено.*$', '', resp_text, flags=re.IGNORECASE)
+                                    resp_text = re.sub(r'\s+до\s+конца\s+года.*$', '', resp_text, flags=re.IGNORECASE)
+                                    responsible = re.sub(r'\s+', ' ', resp_text)
+                                    responsible = re.sub(r'^:\s*', '', responsible)
+                                    collected_resp = True
+                                    for d_keyword in self.date_keywords:
+                                        if d_keyword in current:
+                                            date_parts = current.split(d_keyword)
+                                            if len(date_parts) > 1:
+                                                date_text = date_parts[1].strip()
+                                                date_text = re.sub(r'^\s*[—–-]?\s*', '', date_text)
+                                                due_date_str = re.sub(r'\s+', ' ', date_text)
+                                                date_match = re.search(r'(\d{2}\.\d{2}\.\d{4})', date_text)
+                                                if date_match:
+                                                    try:
+                                                        due_date = datetime.strptime(date_match.group(1), '%d.%m.%Y').date()
+                                                    except ValueError:
+                                                        pass
+                                                collected_date = True
+                                            break
+                                i += 1
+                                break
+                    if not collected_date and i < len(task_lines):
+                        current = task_lines[i].strip()
+                        current_lower = current.lower()
+                        # Проверяем на статусы выполнения
+                        if any(word in current_lower for word in self.status_keywords):
+                            due_date_str = 'Выполнено'
+                            collected_date = True
+                            i += 1
+                        elif 'до конца года' in current_lower:
+                            due_date_str = 'до конца года'
+                            collected_date = True
+                            i += 1
+                        else:
+                            for keyword in self.date_keywords:
+                                if keyword in current:
+                                    date_parts = current.split(keyword)
+                                    if len(date_parts) > 1:
+                                        date_text = date_parts[1].strip()
+                                        date_text = re.sub(r'^\s*[—–-]?\s*', '', date_text)
+                                        due_date_str = re.sub(r'\s+', ' ', date_text)
+                                        date_match = re.search(r'(\d{2}\.\d{2}\.\d{4})', date_text)
+                                        if date_match:
+                                            try:
+                                                due_date = datetime.strptime(date_match.group(1), '%d.%m.%Y').date()
+                                            except ValueError:
+                                                pass
+                                        collected_date = True
+                                        i += 1
+                                    break
+                    if not (collected_resp or collected_date):
+                        i += 1
+                task = {
+                    'number': task_number,
+                    'full_description': description,
+                    'responsible': responsible,
+                    'due_date': due_date,
+                    'due_date_str': due_date_str
+                }
+                tasks.append(task)
+                task_counter += 1
+            else:
+                i += 1
+        return tasks
+    def _parse_simple_list(self, lines: List[str]) -> List[Dict]:
+        tasks = []
+        current_task = None
+        current_description = []
+        for line in lines:
+            line = line.strip()
+            if not line:
+                continue
+            task_match = re.match(r'^(\d+)\.\s+(.*)', line)
+            if task_match:
+                if current_task:
+                    full_desc = ' '.join(current_description)
+                    full_desc = re.sub(r'\s+', ' ', full_desc)
+                    current_task['full_description'] = full_desc
+                    tasks.append(current_task)
+                task_num = task_match.group(1)
+                task_text = task_match.group(2)
+                current_task = {
+                    'number': int(task_num),
+                    'full_description': '',
+                    'responsible': '',
+                    'due_date': None,
+                    'due_date_str': ''
+                }
+                current_description = [task_text]
+            elif current_task:
+                current_description.append(line)
+                if 'Срок' in line or any(word in line.lower() for word in self.status_keywords + ['до конца года']):
+                    line_lower = line.lower()
+                    if any(word in line_lower for word in self.status_keywords):
+                        current_task['due_date_str'] = 'Выполнено'
+                    elif 'до конца года' in line_lower:
+                        current_task['due_date_str'] = 'до конца года'
+                    else:
+                        date_match = re.search(rf'Срок\s*[—–-]?\s*(\d{{2}}\.\d{{2}}\.\d{{4}})', line)
+                        if date_match:
+                            date_str = date_match.group(1).strip()
+                            current_task['due_date_str'] = date_str
+                            try:
+                                current_task['due_date'] = datetime.strptime(date_str, '%d.%m.%Y').date()
+                            except ValueError:
+                                pass
+                for keyword in self.resp_keywords:
+                    if keyword in line:
+                        resp_match = re.search(rf'{re.escape(keyword)}\s*[—–-]?\s*([^С]+?)(?:\s+Срок|$)', line)
+                        if not resp_match:
+                            resp_match = re.search(rf'{re.escape(keyword)}\s*[—–-]?\s*([^\n]+)', line)
+                        if resp_match:
+                            responsible = resp_match.group(1).strip()
+                            stop_words = self.date_keywords + ['Выполнено', 'Приложение', 'приложение'] + self.status_keywords
+                            for stop_word in stop_words:
+                                if stop_word in responsible.lower():
+                                    responsible = responsible.split(stop_word)[0].strip()
+                                    break
+                            # Дополнительная очистка
+                            responsible = re.sub(r'\s+Срок.*$', '', responsible, flags=re.IGNORECASE)
+                            responsible = re.sub(r'\s+Выполнено.*$', '', responsible, flags=re.IGNORECASE)
+                            responsible = re.sub(r'\s+до\s+конца\s+года.*$', '', responsible, flags=re.IGNORECASE)
+                            responsible = re.sub(r'\s+', ' ', responsible)
+                            current_task['responsible'] = responsible
+                            break
+        if current_task:
+            full_desc = ' '.join(current_description)
+            full_desc = re.sub(r'\s+', ' ', full_desc)
+            current_task['full_description'] = full_desc
+            tasks.append(current_task)
+        return tasks
+    def print_tasks(self):
+        if not self.tasks:
+            print("❌ Задачи не найдены")
+            return
+        print(f"\n📋 Найдено задач: {len(self.tasks)}\n")
+        print("=" * 80)
+        for task in self.tasks:
+            print(f"Задача #{task['number']}")
+            print(f"📝 Описание: {task['full_description'][:100]}...")
+            print(f"👤 Ответственный: {task['responsible'] or '❌ НЕТ'}")
+            print(f"📅 Срок: {task['due_date_str'] or '❌ НЕТ'}")
+            print("-" * 40)
+    def to_dataframe(self):
+        import pandas as pd
+        data = []
+        for task in self.tasks:
+            data.append({
+                '№': task['number'],
+                'Описание': task['full_description'],
+                'Ответственный': task.get('responsible', 'Не указан'),
+                'Срок': task.get('due_date_str', 'Не указан'),
+                'Дата (для сортировки)': task.get('due_date')
+            })
+        df = pd.DataFrame(data)
+        return df

src/summarizer.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import re
+from typing import Optional
+class TaskSummarizer:
+    def __init__(self, model_name="cointegrated/rut5-base-absum"):
+        """
+        Инициализация модели для суммаризации
+        """
+        self.model_name = model_name
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.tokenizer = None
+        self.model = None
+        print(f"🔄 Загрузка модели {model_name}...")
+        print(f"📱 Устройство: {self.device}")
+        try:
+            # Загружаем токенизатор и модель
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+            self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+            self.model.to(self.device)
+            self.model.eval()  # Режим оценки (не обучения)
+            print("✅ Модель успешно загружена!")
+        except Exception as e:
+            print(f"❌ Ошибка при загрузке модели: {e}")
+            print("💡 Попробуйте выполнить: pip install --upgrade transformers torch")
+            raise
+    def summarize(self, text: str, max_length: int = 50, min_length: int = 10) -> str:
+        """
+        Создает краткую суммаризацию текста задачи
+        Args:
+            text: Полный текст задачи
+            max_length: Максимальная длина суммаризации
+            min_length: Минимальная длина суммаризации
+        Returns:
+            Краткое описание задачи
+        """
+        if not text or len(text) < 20:
+            return text
+        try:
+            # Очищаем текст от лишних символов
+            text = self._clean_text(text)
+            # Токенизируем входной текст
+            inputs = self.tokenizer(
+                text,
+                max_length=512,
+                truncation=True,
+                return_tensors="pt"
+            ).to(self.device)
+            # Генерируем суммаризацию
+            with torch.no_grad():  # Отключаем вычисление градиентов для экономии памяти
+                summary_ids = self.model.generate(
+                    inputs.input_ids,
+                    max_length=max_length,
+                    min_length=min_length,
+                    num_beams=4,  # Поиск с лучом для лучшего качества
+                    length_penalty=2.0,  # Штраф за длину
+                    early_stopping=True,
+                    no_repeat_ngram_size=3  # Избегаем повторений
+                )
+            # Декодируем результат
+            summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+            # Постобработка
+            summary = self._postprocess_summary(summary)
+            return summary
+        except Exception as e:
+            print(f"⚠️ Ошибка при суммаризации: {e}")
+            # Возвращаем первые 100 символов как запасной вариант
+            return text[:100] + "..."
+    def _clean_text(self, text: str) -> str:
+        """Очищает текст от лишних символов"""
+        # Удаляем номер задачи в начале (если есть)
+        text = re.sub(r'^\d+\.\s*', '', text)
+        # Удаляем информацию об ответственном и сроке
+        text = re.sub(r'Отв\.:.*?Срок\s*-\s*\d{2}\.\d{2}\.\d{4}', '', text)
+        text = re.sub(r'Отв\.:.*$', '', text, flags=re.MULTILINE)
+        text = re.sub(r'Срок\s*-\s*\d{2}\.\d{2}\.\d{4}', '', text)
+        # Удаляем лишние пробелы
+        text = re.sub(r'\s+', ' ', text)
+        return text.strip()
+    def _postprocess_summary(self, summary: str) -> str:
+        """Постобработка сгенерированной суммаризации"""
+        # Убираем лишние пробелы
+        summary = re.sub(r'\s+', ' ', summary)
+        # Убираем точку в конце, если её нет
+        if summary and not summary.endswith(('.', '!', '?')):
+            summary += '.'
+        # Делаем первую букву заглавной
+        if summary:
+            summary = summary[0].upper() + summary[1:]
+        return summary
+    def summarize_batch(self, texts, max_length=50, min_length=10):
+        """
+        Суммаризация нескольких текстов (для эффективности)
+        """
+        summaries = []
+        for text in texts:
+            summary = self.summarize(text, max_length, min_length)
+            summaries.append(summary)
+        return summaries

web/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

web/icons/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

web/icons/calendarLogo.svg ADDED Viewed

web/icons/favicon.svg ADDED Viewed

web/icons/sheetsLogo-svgrepo-com.svg ADDED Viewed

web/icons/titleIcon.svg ADDED Viewed

web/icons/upload.svg ADDED Viewed

web/index.html ADDED Viewed

	@@ -0,0 +1,92 @@

+<!DOCTYPE html>
+<html lang="ru">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Парсер задач</title>
+    <link rel="preconnect" href="https://fonts.googleapis.com">
+    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+    <link href="https://fonts.googleapis.com/css2?family=Roboto:wght@100..900&display=swap" rel="stylesheet">
+    <!-- The icon is an SVG file, so the declared type must be image/svg+xml, not image/x-icon -->
+    <link rel="icon" href="/task-parser/web/icons/favicon.svg" type="image/svg+xml">
+    <link rel="stylesheet" href="/task-parser/web/style.css">
+</head>
+<body>
+    <div class="container">
+        <header>
+            <div class="title">
+                <img src="/task-parser/web/icons/titleIcon.svg" alt="">
+                <h1>Твой парсер</h1>
+            </div>
+            <p>Загрузи PDF или Word документ, чтобы извлечь все задачи</p>
+        </header>
+        <main>
+            <!-- Drop zone: script.js opens the hidden file input on click and accepts dropped files -->
+            <div id="dropZone" class="upload-area">
+                <div class="upload-item">
+                    <img src="/task-parser/web/icons/upload.svg" alt="">
+                    <div class="upload-text">Перетащи сюда файлы</div>
+                    <div class="upload-advice">или кликни для выбора</div>
+                </div>
+            </div>
+            <input type="file" id="fileInput" style="display:none" multiple accept=".pdf,.docx,.doc">
+            <!-- List of chosen files; hidden until at least one file is selected -->
+            <div id="selectedFiles" class="selected-files" style="display:none">
+                <strong>Выбрано файлов:</strong> <span id="filesCount">0</span>
+                <div id="filesList" class="files-list"></div>
+            </div>
+            <!-- Optional export targets; each text input is enabled by its checkbox in script.js -->
+            <div class="upload-choice">
+                <div class="option-card">
+                    <div class="option-choice">
+                        <img src="/task-parser/web/icons/sheetsLogo-svgrepo-com.svg" alt="">
+                        <h3>Google Sheets</h3>
+                    </div>
+                    <label>
+                        <input type="checkbox" id="exportSheets"> Экспортировать
+                    </label>
+                    <input type="text" id="sheetsUrl" class="option-input" placeholder="URL таблицы" disabled>
+                </div>
+                <div class="option-card">
+                    <div class="option-choice">
+                        <img src="/task-parser/web/icons/calendarLogo.svg" alt="">
+                        <h3>Google Calendar</h3>
+                    </div>
+                    <label>
+                        <input type="checkbox" id="exportCalendar"> Экспортировать
+                    </label>
+                    <input type="text" id="calendarId" class="option-input" placeholder="ID календаря" disabled>
+                </div>
+            </div>
+            <button id="processBtn" class="btn" disabled>Обработать документы</button>
+        </main>
+        <section>
+            <div id="loader" class="loader"></div>
+            <!-- Results area: statistic cards, the task table and the Excel download button -->
+            <div id="results" class="results">
+                <h2>Результаты обработки</h2>
+                <div id="stats" class="stats"></div>
+                <div class="table-wrapper">
+                    <table id="tasksTable">
+                        <thead>
+                            <tr>
+                                <th>№</th>
+                                <th>Краткое описание</th>
+                                <th>Описание</th>
+                                <th>Ответственный</th>
+                                <th>Срок</th>
+                            </tr>
+                        </thead>
+                        <tbody id="tasksBody"></tbody>
+                    </table>
+                </div>
+                <button id="downloadBtn" class="download-btn" style="display:none">Скачать Excel</button>
+            </div>
+        </section>
+    </div>
+    <script src="/task-parser/web/script.js"></script>
+</body>
+</html>

web/script.js ADDED Viewed

	@@ -0,0 +1,230 @@

+// References to the page elements, looked up once by id
+const dropZone = document.getElementById('dropZone');
+const fileInput = document.getElementById('fileInput');
+const selectedFilesDiv = document.getElementById('selectedFiles');
+const filesCountSpan = document.getElementById('filesCount');
+const filesListDiv = document.getElementById('filesList');
+const processBtn = document.getElementById('processBtn');
+const loader = document.getElementById('loader');
+const resultsDiv = document.getElementById('results');
+const tasksBody = document.getElementById('tasksBody');
+const statsDiv = document.getElementById('stats');
+const downloadBtn = document.getElementById('downloadBtn');
+const exportSheets = document.getElementById('exportSheets');
+const sheetsUrl = document.getElementById('sheetsUrl');
+const exportCalendar = document.getElementById('exportCalendar');
+const calendarId = document.getElementById('calendarId');
+// Files currently chosen by the user; modified by addFiles and removeFile
+let selectedFiles = [];
+// NOTE(review): presumably holds the Excel payload for the download button — confirm against the code that assigns it
+let excelData = null;
+function updateFilesList() {
+    if (selectedFiles.length === 0) {
+        selectedFilesDiv.style.display = 'none';
+        processBtn.disabled = true;
+        return;
+    }
+    selectedFilesDiv.style.display = 'block';
+    filesCountSpan.textContent = selectedFiles.length;
+    let filesHtml = '';
+    for (let i = 0; i < selectedFiles.length; i++) {
+        const file = selectedFiles[i];
+        filesHtml += '<div class="file-item"><span>📄 ' + file.name + '</span><button onclick="removeFile(' + i + ')">✕</button></div>';
+    }
+    filesListDiv.innerHTML = filesHtml;
+    processBtn.disabled = false;
+}
+window.removeFile = function(index) {
+    selectedFiles.splice(index, 1);
+    updateFilesList();
+};
+function addFiles(files) {
+    const allowed = ['.pdf', '.docx', '.doc'];
+    const valid = [];
+    for (let i = 0; i < files.length; i++) {
+        const file = files[i];
+        const ext = '.' + file.name.split('.').pop().toLowerCase();
+        if (allowed.includes(ext)) {
+            valid.push(file);
+        }
+    }
+    if (valid.length === 0) {
+        alert('Поддерживаются только PDF, DOCX, DOC');
+        return;
+    }
+    for (let i = 0; i < valid.length; i++) {
+        const newFile = valid[i];
+        let exists = false;
+        for (let j = 0; j < selectedFiles.length; j++) {
+            if (selectedFiles[j].name === newFile.name && selectedFiles[j].size === newFile.size) {
+                exists = true;
+                break;
+            }
+        }
+        if (!exists) {
+            selectedFiles.push(newFile);
+        }
+    }
+    updateFilesList();
+}
+dropZone.addEventListener('click', function() {
+    fileInput.click();
+});
+dropZone.addEventListener('dragover', function(e) {
+    e.preventDefault();
+});
+dropZone.addEventListener('drop', function(e) {
+    e.preventDefault();
+    if (e.dataTransfer.files.length) {
+        addFiles(e.dataTransfer.files);
+    }
+});
+fileInput.addEventListener('change', function(e) {
+    if (e.target.files.length) {
+        addFiles(e.target.files);
+    }
+    fileInput.value = '';
+});
+exportSheets.addEventListener('change', function() {
+    sheetsUrl.disabled = !exportSheets.checked;
+});
+exportCalendar.addEventListener('change', function() {
+    calendarId.disabled = !exportCalendar.checked;
+});
+function displayResults(tasks, stats, filesInfo) {
+    let statsHtml = '<div class="stat-card"><div class="stat-value">' + stats.total + '</div><div class="stat-label">Всего задач</div></div>';
+    statsHtml += '<div class="stat-card"><div class="stat-value">' + stats.with_responsible + '</div><div class="stat-label">С ответственным</div></div>';
+    statsHtml += '<div class="stat-card"><div class="stat-value">' + stats.with_date + '</div><div class="stat-label">С датой</div></div>';
+    if (filesInfo && filesInfo.length) {
+        statsHtml += '<div class="stat-card"><div class="stat-value">' + filesInfo.length + '</div><div class="stat-label">Обработано файлов</div></div>';
+    }
+    statsDiv.innerHTML = statsHtml;
+    let tasksHtml = '';
+    for (let i = 0; i < tasks.length; i++) {
+        const task = tasks[i];
+        const summary = task.summary || (task.full_description || '').substring(0, 80);
+        const description = (task.full_description || '').substring(0, 100);
+        tasksHtml += '<tr>';
+        tasksHtml += '<td>' + task.number + '</td>';
+        tasksHtml += '<td>' + summary + ((task.full_description || '').length > 80 ? '...' : '') + '</td>';
+        tasksHtml += '<td>' + description + ((task.full_description || '').length > 100 ? '...' : '') + '</td>';
+        tasksHtml += '<td>' + (task.responsible || '-') + '</td>';
+        tasksHtml += '<td>' + (task.due_date_str || '-') + '</td>';
+        tasksHtml += '</tr>';
+    }
+    tasksBody.innerHTML = tasksHtml;
+    resultsDiv.style.display = 'block';
+}
+function setupDownload(data) {
+    if (!data) return;
+    downloadBtn.style.display = 'inline-block';
+    downloadBtn.onclick = function() {
+        try {
+            const binaryString = atob(data);
+            const bytes = new Uint8Array(binaryString.length);
+            for (let i = 0; i < binaryString.length; i++) {
+                bytes[i] = binaryString.charCodeAt(i);
+            }
+            const blob = new Blob([bytes], { type: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' });
+            const url = URL.createObjectURL(blob);
+            const a = document.createElement('a');
+            a.href = url;
+            const now = new Date();
+            const year = now.getFullYear();
+            const month = String(now.getMonth() + 1).padStart(2, '0');
+            const day = String(now.getDate()).padStart(2, '0');
+            const hours = String(now.getHours()).padStart(2, '0');
+            const minutes = String(now.getMinutes()).padStart(2, '0');
+            const seconds = String(now.getSeconds()).padStart(2, '0');
+            const timestamp = year + '-' + month + '-' + day + 'T' + hours + '-' + minutes + '-' + seconds;
+            a.download = 'tasks_' + timestamp + '.xlsx';
+            a.click();
+            URL.revokeObjectURL(url);
+        } catch (err) {
+            console.error('Ошибка скачивания:', err);
+            alert('Ошибка при скачивании файла');
+        }
+    };
+}
+processBtn.addEventListener('click', async function() {
+    if (selectedFiles.length === 0) return;
+    processBtn.disabled = true;
+    loader.style.display = 'block';
+    resultsDiv.style.display = 'none';
+    downloadBtn.style.display = 'none';
+    const formData = new FormData();
+    for (let i = 0; i < selectedFiles.length; i++) {
+        formData.append('files', selectedFiles[i]);
+    }
+    formData.append('export_to_sheets', exportSheets.checked);
+    formData.append('export_to_calendar', exportCalendar.checked);
+    formData.append('sheets_url', sheetsUrl.value);
+    formData.append('calendar_id', calendarId.value);
+    // ===== ПРОВЕРКА: что отправляем =====
+    console.log('Отправляю FormData:');
+    for (let pair of formData.entries()) {
+        console.log(pair[0], '=', pair[1]);
+    }
+    // ===================================
+    console.log('Отправляю:', {
+        files: selectedFiles.length,
+        export_to_calendar: exportCalendar.checked,
+        calendar_id: calendarId.value
+    });
+    try {
+        const response = await fetch('/parse-batch', {
+            method: 'POST',
+            body: formData
+        });
+        console.log('Ответ получен, статус:', response.status);
+        if (!response.ok) {
+            throw new Error('HTTP ошибка: ' + response.status);
+        }
+        const data = await response.json();
+        console.log('Данные:', data);
+        if (data.success) {
+            if (data.excel_base64) {
+                excelData = data.excel_base64;
+                setupDownload(excelData);
+            }
+            displayResults(data.tasks, data.statistics, data.files);
+            if (data.calendar_export === 'success') {
+                alert('Задачи добавлены в Google Calendar');
+            } else if (data.calendar_export) {
+                alert('Ошибка Calendar: ' + data.calendar_export);
+            }
+        } else {
+            alert('Ошибка: ' + (data.error || 'Неизвестная ошибка'));
+        }
+    } catch (err) {
+        console.error('Ошибка:', err);
+        alert('Ошибка соединения: ' + err.message + '\n\nПроверь, что сервер запущен (python3 backend.py)');
+    } finally {
+        processBtn.disabled = false;
+        loader.style.display = 'none';
+    }
+});

web/style.css ADDED Viewed

	@@ -0,0 +1,253 @@

+/* ---- Reset and page base ---- */
+/* Strip default spacing from every element and size boxes by their border. */
+* {
+    margin: 0;
+    padding: 0;
+    box-sizing: border-box;
+}
+/* Full-height gradient background with white text. */
+body {
+    font-family: 'Roboto', sans-serif;
+    background: linear-gradient(135deg, #6b6bae 0%, #192955 100%);
+    color: white;
+    padding: 20px;
+    min-height: 100vh;
+}
+/* Centered content column, capped at 1200px wide. */
+.container {
+    max-width: 1200px;
+    margin: 0 auto;
+    width: 100%;
+}
+/* Title row: children centered on both axes. */
+.title {
+    display: flex;
+    justify-content: center;
+    align-items: center;
+}
+.title img {
+    height: 45px;
+}
+h1 {
+    text-align: center;
+    margin-bottom: 10px;
+}
+/* Centered, slightly dimmed paragraph text. */
+p {
+    text-align: center;
+    margin-bottom: 30px;
+    opacity: 0.9;
+}
+/* ---- Upload area and selected-files list ---- */
+/* Dashed, clickable box; background and border colour animate on hover. */
+.upload-area {
+    border: 3px dashed #667eea;
+    border-radius: 15px;
+    background: #2a2a3a;
+    text-align: center;
+    padding: 50px;
+    cursor: pointer;
+    transition: all 0.4s ease;
+    margin-bottom: 20px;
+}
+.upload-area:hover {
+    background: #333344;
+    border-color: #764ba2;
+}
+.upload-area img {
+    max-width: 100%;
+    height: 90px;
+}
+.upload-text {
+    font-size: 18px;
+    margin-bottom: 10px;
+}
+/* Smaller, dimmed hint text. */
+.upload-advice {
+    font-size: 13px;
+    opacity: 0.6;
+}
+/* Panel wrapping the selected-files list. */
+.selected-files {
+    background: #1e2a1e;
+    padding: 15px;
+    border-radius: 10px;
+    margin-bottom: 20px;
+}
+/* Scrolls vertically once the rows exceed 120px in height. */
+.files-list {
+    margin-top: 10px;
+    max-height: 120px;
+    overflow-y: auto;
+}
+/* One row per file: content pushed to opposite ends of the row. */
+.file-item {
+    background: #2a3a2a;
+    padding: 8px 12px;
+    border-radius: 8px;
+    margin-bottom: 5px;
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+}
+/* Round red button inside a file row. */
+.file-item button {
+    background: #c62828;
+    color: white;
+    border: none;
+    border-radius: 50%;
+    width: 24px;
+    height: 24px;
+    cursor: pointer;
+}
+/* ---- Option cards ---- */
+/* Cards sit side by side and wrap onto a new line when space runs out. */
+.upload-choice {
+    display: flex;
+    gap: 20px;
+    margin-bottom: 20px;
+    flex-wrap: wrap;
+}
+/* Each card shares the row equally but never shrinks below 250px. */
+.option-card {
+    flex: 1;
+    background: #2a2a3a;
+    padding: 20px;
+    border-radius: 10px;
+    min-width: 250px;
+}
+.option-choice {
+    display: flex;
+    gap: 5px;
+}
+.option-choice img {
+    max-width: 100%;
+    height: 30px;
+}
+.option-card h3 {
+    margin-bottom: 15px;
+}
+/* Full-width, borderless dark text field. */
+.option-input {
+    width: 100%;
+    padding: 10px;
+    margin-top: 10px;
+    border-radius: 8px;
+    border: none;
+    background: #1e1e2a;
+    color: white;
+}
+/* ---- Primary button and loading spinner ---- */
+/* Full-width gradient pill button. */
+.btn {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    color: white;
+    border: none;
+    padding: 15px 30px;
+    border-radius: 50px;
+    font-size: 16px;
+    font-weight: 600;
+    cursor: pointer;
+    width: 100%;
+    transition: transform 0.3s;
+}
+/* Lift on hover, but only while the button is enabled. */
+.btn:hover:not(:disabled) {
+    transform: translateY(-2px);
+}
+.btn:disabled {
+    opacity: 0.5;
+    cursor: not-allowed;
+}
+/* Hidden by default. NOTE(review): presumably shown from script while a request runs; confirm. */
+.loader {
+    display: none;
+    text-align: center;
+    margin: 20px 0;
+}
+/* The spinner itself: a ring with one coloured edge, rotated by the spin animation. */
+.loader::before {
+    content: "";
+    display: inline-block;
+    width: 40px;
+    height: 40px;
+    border: 4px solid #f3f3f3;
+    border-top: 4px solid #667eea;
+    border-radius: 50%;
+    animation: spin 1s linear infinite;
+}
+/* One full clockwise turn per cycle. */
+@keyframes spin {
+    0% { transform: rotate(0deg); }
+    100% { transform: rotate(360deg); }
+}
+/* ---- Results panel, statistic cards and table ---- */
+/* Hidden by default. NOTE(review): presumably revealed from script once results exist; confirm. */
+.results {
+    display: none;
+    margin-top: 30px;
+    background: #2a2a3a;
+    padding: 20px;
+    border-radius: 15px;
+}
+/* Cards share the row equally and wrap when space runs out. */
+.stats {
+    display: flex;
+    gap: 20px;
+    margin-bottom: 20px;
+    flex-wrap: wrap;
+}
+.stat-card {
+    flex: 1;
+    background: #1e1e2a;
+    padding: 20px;
+    border-radius: 10px;
+    text-align: center;
+}
+/* Large accent-coloured value. */
+.stat-value {
+    font-size: 32px;
+    font-weight: bold;
+    color: #667eea;
+}
+/* Lets wide content scroll horizontally inside the wrapper. */
+.table-wrapper {
+    overflow-x: auto;
+}
+table {
+    width: 100%;
+    border-collapse: collapse;
+}
+/* Left-aligned cells separated by a thin bottom border. */
+th, td {
+    padding: 12px;
+    text-align: left;
+    border-bottom: 1px solid #3a3a4a;
+}
+th {
+    background: #1e1e2a;
+}
+/* Green pill button. */
+.download-btn {
+    background: #4caf50;
+    color: white;
+    border: none;
+    padding: 12px 24px;
+    border-radius: 25px;
+    margin-top: 20px;
+    cursor: pointer;
+}
+/* ---- Narrow viewports (768px and below) ---- */
+/* Stack the option cards and the statistic cards vertically instead of side by side. */
+@media (max-width: 768px) {
+    .upload-choice {
+        flex-direction: column;
+    }
+    .stats {
+        flex-direction: column;
+    }
+}