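# NOTE: the lines below are residue from the web page this file was
# extracted from; they are not part of the module.
#   muryshev's picture
#   init
#   57cf043
#   raw
#   history blame
#   6.88 kB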
import logging
import os
import shutil
from pathlib import Path
from fastapi import HTTPException, UploadFile
from sqlalchemy.orm import Session
from common.common import get_source_format
from common.configuration import Configuration
from common.constants import PROCESSING_FORMATS
from components.parser.xml.xml_parser import XMLParser
from components.dbo.models.dataset import Dataset
from components.dbo.models.dataset_document import DatasetDocument
from components.dbo.models.document import Document
from schemas.document import Document as DocumentSchema
from schemas.document import DocumentDownload
from components.services.dataset import DatasetService
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class DocumentService:
    """
    Service for working with documents.

    Source files are stored under ``documents_path`` as
    ``<document id>.<source format>``; the ``Document`` and
    ``DatasetDocument`` rows are created and removed together with them.
    """

    def __init__(
        self,
        dataset_service: DatasetService,
        config: Configuration,
        db: Session,
    ):
        """
        Store the collaborators used by the service.

        Args:
            dataset_service: Used to resolve the current dataset and to
                refuse requests while processing is in progress.
            config: Configuration; only
                ``config.db_config.files.documents_path`` is read here.
            db: Called with no arguments to obtain a session that is used
                as a context manager (``with self.db() as session``).
        """
        logger.info("Initializing DocumentService")
        self.db = db
        self.dataset_service = dataset_service
        self.xml_parser = XMLParser()
        self.documents_path = Path(config.db_config.files.documents_path)

    def get_document(
        self,
        document_id: int,
        dataset_id: int | None = None,
    ) -> DocumentDownload:
        """
        Return download information for a document by its identifier.

        Args:
            document_id: Identifier of the requested document.
            dataset_id: Dataset the document must belong to. When omitted,
                the current dataset reported by the dataset service is used.

        Returns:
            The download file name (first 40 characters of the title plus
            the source format as extension) and the path of the stored file.

        Raises:
            HTTPException: 404 when the document is not linked to the
                dataset or its row is missing.
        """
        logger.info(f"Getting document info for ID: {document_id}")
        if dataset_id is None:
            dataset_id = self.dataset_service.get_current_dataset().dataset_id

        self.dataset_service.raise_if_processing()

        with self.db() as session:
            document_in_dataset = (
                session.query(DatasetDocument)
                .filter(
                    DatasetDocument.dataset_id == dataset_id,
                    DatasetDocument.document_id == document_id,
                )
                .first()
            )
            if not document_in_dataset:
                logger.warning(f"Document not found: {document_id}")
                raise HTTPException(status_code=404, detail="Document not found")

            # NOTE(review): the row is looked up through ``Document.id`` but
            # its key is read back through ``document.document_id`` below --
            # confirm against the model that both refer to the same column.
            document = (
                session.query(Document)
                .filter(Document.id == document_id)
                .first()
            )
            if document is None:
                # The link exists but the document row does not; answer with
                # a 404 instead of failing on attribute access.
                logger.warning(f"Document not found: {document_id}")
                raise HTTPException(status_code=404, detail="Document not found")

            result = DocumentDownload(
                filename=f'{document.title[:40]}.{document.source_format}',
                filepath=self.documents_path
                / f'{document.document_id}.{document.source_format}',
            )

        logger.debug(f"Retrieved document: {result.filename}")
        return result

    def add_document(self, dataset_id: int, file: UploadFile) -> DocumentSchema:
        """
        Add a document to a dataset.

        The upload is written to a scratch file under ``<cwd>/tmp``, parsed,
        registered in the database and then moved into ``documents_path``.
        The scratch file is removed on every exit path.

        Args:
            dataset_id: Identifier of the target dataset; it must be a draft.
            file: The uploaded file.

        Returns:
            The schema of the newly created document.

        Raises:
            HTTPException: 400 for an unusable file name, an unparsable file
                or an unknown source format; 404 when the dataset does not
                exist; 403 when the dataset is not a draft.
        """
        self.dataset_service.raise_if_processing()

        # The file name comes from the client: keep only its last path
        # component so it cannot escape the scratch directory.
        safe_name = self._safe_upload_name(file.filename)
        if safe_name is None:
            raise HTTPException(status_code=400, detail='Invalid file name')

        file_location = Path.cwd() / 'tmp' / safe_name
        file_location.parent.mkdir(parents=True, exist_ok=True)

        try:
            with open(file_location, 'wb') as buffer:
                # Stream the upload instead of loading it fully into memory.
                shutil.copyfileobj(file.file, buffer)

            source_format = get_source_format(file.filename)
            logger.info(f"Parsing file: {file_location}")
            logger.info(f"Source format: {source_format}")

            try:
                parsed = self.xml_parser.parse(file_location, include_content=False)
            except Exception as exc:
                logger.exception("Failed to parse uploaded file")
                raise HTTPException(
                    status_code=400, detail="Invalid XML file, service can't parse it"
                ) from exc

            with self.db() as session:
                dataset = (
                    session.query(Dataset).filter(Dataset.id == dataset_id).first()
                )
                if not dataset:
                    raise HTTPException(status_code=404, detail='Dataset not found')
                if not dataset.is_draft:
                    raise HTTPException(status_code=403, detail='Dataset is not draft')

                # Reject unsupported formats before anything is written to
                # the database.
                if source_format not in PROCESSING_FORMATS:
                    logger.error(f"Unknown source format: {source_format}")
                    raise HTTPException(
                        status_code=400, detail='Unknown document format'
                    )

                document = Document(
                    title=parsed.name,
                    owner=parsed.owner,
                    status=parsed.status,
                    source_format=source_format,
                )
                logger.info(f"Document: {document}")
                session.add(document)
                # Flush so the generated document key is available for the
                # link row and for the stored file name.
                session.flush()
                logger.info(f"Document ID: {document.document_id}")

                link = DatasetDocument(
                    dataset_id=dataset_id,
                    document_id=document.document_id,
                )
                session.add(link)

                stored_path = (
                    self.documents_path / f'{document.document_id}.{source_format}'
                )
                logger.info(f"Moving file to: {stored_path}")
                shutil.move(file_location, stored_path)

                try:
                    session.commit()
                except Exception:
                    # Without a committed row the stored file would be
                    # orphaned, so take it back out before propagating.
                    stored_path.unlink(missing_ok=True)
                    raise

                session.refresh(document)
                result = DocumentSchema(
                    id=document.document_id,
                    name=document.title,
                    owner=document.owner,
                    status=document.status,
                )
        finally:
            self._remove_temp_upload(file_location)

        logger.debug(f"Retrieved document: {result.name}")
        return result

    def delete_document(self, dataset_id: int, document_id: int) -> None:
        """
        Remove a document from a dataset.

        The dataset/document link is always removed. When no other dataset
        references the document afterwards, the document row and its stored
        file are removed as well.

        Args:
            dataset_id: Identifier of the dataset; it must be a draft.
            document_id: Identifier of the document to remove.

        Raises:
            HTTPException: 404 when the document is not linked to the
                dataset or the dataset does not exist; 403 when the dataset
                is not a draft.
        """
        self.dataset_service.raise_if_processing()

        orphan_path = None
        with self.db() as session:
            dataset_document = (
                session.query(DatasetDocument)
                .filter(
                    DatasetDocument.dataset_id == dataset_id,
                    DatasetDocument.document_id == document_id,
                )
                .first()
            )
            if not dataset_document:
                raise HTTPException(status_code=404, detail='Document not found')

            dataset = (
                session.query(Dataset).filter(Dataset.id == dataset_id).first()
            )
            if dataset is None:
                raise HTTPException(status_code=404, detail='Dataset not found')
            if not dataset.is_draft:
                raise HTTPException(status_code=403, detail='Dataset is not draft')

            # Remove the link and flush BEFORE counting the remaining
            # references; counting first would always include the link being
            # removed, so the orphan clean-up below could never run.
            session.delete(dataset_document)
            session.flush()

            remaining_links = (
                session.query(DatasetDocument)
                .filter(DatasetDocument.document_id == document_id)
                .count()
            )
            if remaining_links == 0:
                document = (
                    session.query(Document)
                    .filter(Document.id == document_id)
                    .first()
                )
                if document is not None:
                    orphan_path = (
                        self.documents_path
                        / f'{document_id}.{document.source_format}'
                    )
                    session.delete(document)

            session.commit()

        # Touch the file only after the commit succeeded, and tolerate a
        # file that is already gone.
        if orphan_path is not None:
            orphan_path.unlink(missing_ok=True)

    @staticmethod
    def _safe_upload_name(filename: str | None) -> str | None:
        """Return the last path component of *filename*, or None if unusable."""
        if not filename:
            return None
        name = Path(filename).name
        # Path('..').name is '..' and Path('.').name is '', neither of which
        # names a file.
        if name in ('', '.', '..'):
            return None
        return name

    @staticmethod
    def _remove_temp_upload(file_location: Path) -> None:
        """Best-effort removal of the scratch file and its directory if empty."""
        try:
            file_location.unlink(missing_ok=True)
            # rmdir() refuses to remove a non-empty directory, so the scratch
            # directory survives while other files are still in it.
            file_location.parent.rmdir()
        except OSError as exc:
            logger.debug(f"Temporary upload cleanup skipped: {exc}")