| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| from __future__ import absolute_import |
| from __future__ import print_function |
| from __future__ import unicode_literals |
| from io import BytesIO |
| from itertools import takewhile |
| import json |
| import logging |
| import inspect |
|
|
| from .. import recordstream |
| from ..bintype import ERROREVENT |
| from ..bintype import resolve_type_events |
| from ..bintype import resolve_values_from_stream |
| from ..dataio import ParseError |
| from ..dataio import dumpbytes |
| from ..recordstream import nth |
| from ..tagids import tagnames |
| from ..treeop import STARTEVENT |
| from ..treeop import ENDEVENT |
| from ..treeop import prefix_ancestors_from_level |
| from ..utils import JsonObjects |
|
|
| from ._shared import tag_models |
| from ._shared import RecordModel |
| from ._shared import BinStorageId |
| from ._shared import COLORREF |
| from ._shared import Margin |
| from .controlchar import CHID |
| from .controlchar import ControlChar |
| from .tagid16_document_properties import DocumentProperties |
| from .tagid17_id_mappings import IdMappings |
| from .tagid18_bin_data import BinData |
| from .tagid20_border_fill import BorderFill |
| from .tagid19_face_name import FaceName |
| from .tagid21_char_shape import CharShape |
| from .tagid21_char_shape import LanguageStruct |
| from .tagid22_tab_def import TabDef |
| from .tagid23_numbering import Numbering |
| from .tagid24_bullet import Bullet |
| from .tagid25_para_shape import ParaShape |
| from .tagid26_style import Style |
| from .tagid27_doc_data import DocData |
| from .tagid28_distribute_doc_data import DistributeDocData |
| from .tagid30_compatible_document import CompatibleDocument |
| from .tagid31_layout_compatibility import LayoutCompatibility |
| from .tagid32_unknown import TagModel32 |
| from .tagid50_para_header import Paragraph |
| from .tagid51_para_text import ParaText |
| from .tagid51_para_text import ParaTextChunks |
| from .tagid52_para_char_shape import ParaCharShape |
| from .tagid53_para_line_seg import ParaLineSeg |
| from .tagid53_para_line_seg import ParaLineSegList |
| from .tagid53_para_line_seg import LineSeg |
| from .tagid54_para_range_tag import ParaRangeTag |
| from .tagid55_ctrl_header import Control |
| from .controls.bookmark_control import BookmarkControl |
| from .controls.columns_def import ColumnsDef |
| from .controls.common_controls import CommonControl |
| from .controls.dutmal import Dutmal |
| from .controls.field import Field |
| from .controls.field import FieldUnknown |
| from .controls.field import FieldDate |
| from .controls.field import FieldDocDate |
| from .controls.field import FieldPath |
| from .controls.field import FieldBookmark |
| from .controls.field import FieldMailMerge |
| from .controls.field import FieldCrossRef |
| from .controls.field import FieldFormula |
| from .controls.field import FieldClickHere |
| from .controls.field import FieldClickHereData |
| from .controls.field import FieldSummary |
| from .controls.field import FieldUserInfo |
| from .controls.field import FieldHyperLink |
| from .controls.field import FieldMemo |
| from .controls.field import FieldPrivateInfoSecurity |
| from .controls.gshape_object_control import GShapeObjectControl |
| from .controls.header_footer import HeaderFooter |
| from .controls.header_footer import Header |
| from .controls.header_footer import Footer |
| from .controls.hidden_comment import HiddenComment |
| from .controls.index_marker import IndexMarker |
| from .controls.note import Note |
| from .controls.note import FootNote |
| from .controls.note import EndNote |
| from .controls.numbering import AutoNumbering |
| from .controls.numbering import NewNumbering |
| from .controls.page_hide import PageHide |
| from .controls.page_number_position import PageNumberPosition |
| from .controls.page_odd_even import PageOddEven |
| from .controls.section_def import SectionDef |
| from .controls.table_control import TableControl |
| from .controls.tcps_control import TCPSControl |
| from .tagid56_list_header import ListHeader |
| from .tagid56_list_header import TableCaption |
| from .tagid56_list_header import TableCell |
| from .tagid56_list_header import TextboxParagraphList |
| from .tagid56_list_header import HeaderParagraphList |
| from .tagid56_list_header import FooterParagraphList |
| from .tagid57_page_def import PageDef |
| from .tagid58_footnote_shape import FootnoteShape |
| from .tagid59_page_border_fill import PageBorderFill |
| from .tagid60_shape_component import ShapeComponent |
| from .tagid61_table import TableBody |
| from .tagid62_shape_component_line import ShapeLine |
| from .tagid63_shape_component_rectangle import ShapeRectangle |
| from .tagid64_shape_component_ellipse import ShapeEllipse |
| from .tagid65_shape_component_arc import ShapeArc |
| from .tagid66_shape_component_polygon import ShapePolygon |
| from .tagid67_shape_component_curve import ShapeCurve |
| from .tagid68_shape_component_ole import ShapeOLE |
| from .tagid69_shape_component_picture import ShapePicture |
| from .tagid70_shape_component_container import ShapeContainer |
| from .tagid71_ctrl_data import ControlData |
| from .tagid72_ctrl_eqedit import EqEdit |
| from .tagid74_shape_component_textart import ShapeTextArt |
| from .tagid75_form_object import FormObject |
| from .tagid76_memo_shape import MemoShape |
| from .tagid77_memo_list import MemoList |
| from .tagid78_forbidden_char import ForbiddenChar |
| from .tagid79_chart_data import ChartData |
| from .tagid99_shape_component_unknown import ShapeUnknown |
|
|
| |
# Bare references to names imported above; each line only evaluates the name
# and discards the result.
# NOTE(review): presumably these exist so that linters do not report the
# re-exported imports as unused -- confirm before removing any of them.
RecordModel
BinStorageId
COLORREF
Margin
DocumentProperties
BinData
BorderFill
CharShape
LanguageStruct
TabDef
Numbering
Bullet
ParaShape
Style
DocData
DistributeDocData
CompatibleDocument
LayoutCompatibility
TagModel32
Paragraph
ParaText
ParaTextChunks
ParaCharShape
ParaLineSeg
ParaLineSegList
LineSeg
ParaRangeTag
Control
ListHeader
TableCaption
TableCell
TextboxParagraphList
PageDef
FootnoteShape
PageBorderFill
ShapeComponent
TableBody
ShapeLine
ShapeRectangle
ShapeEllipse
ShapeArc
ShapePolygon
ShapeCurve
ShapeOLE
ShapePicture
ShapeContainer
ControlData
EqEdit
ShapeTextArt
FormObject
MemoShape
MemoList
ForbiddenChar
ChartData
ShapeUnknown
CHID
ControlChar
BookmarkControl
ColumnsDef
CommonControl
Dutmal
Field
FieldUnknown
FieldDate
FieldDocDate
FieldPath
FieldBookmark
FieldMailMerge
FieldCrossRef
FieldFormula
FieldClickHere
FieldClickHereData
FieldSummary
FieldUserInfo
FieldHyperLink
FieldMemo
FieldPrivateInfoSecurity
GShapeObjectControl
HeaderFooter
Header
HeaderParagraphList
Footer
FooterParagraphList
HiddenComment
IndexMarker
Note
FootNote
EndNote
AutoNumbering
NewNumbering
PageHide
PageNumberPosition
PageOddEven
SectionDef
TableControl
TCPSControl
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
class UnknownTagModel(RecordModel):
    ''' Fallback record model.

    resolve_model_events() uses this type when a record's tagid has no entry
    in tag_models. It declares nothing beyond what RecordModel provides.
    '''
|
|
|
|
class Text(object):
    ''' Empty marker class.

    NOTE(review): nothing in the visible part of this module uses it;
    presumably it is referenced from other modules -- confirm.
    '''
|
|
|
|
def _check_tag_models():
    ''' Assert that every tag id listed in tagnames has a model in tag_models.

    :raises AssertionError: naming the first tag found without a model
    '''
    for tag_id, tag_name in tagnames.items():
        is_registered = tag_id in tag_models
        assert is_registered, 'RecordModel for %s is missing!' % tag_name
|
|
|
|
# Run the consistency check once, when this module is executed.
_check_tag_models()
|
|
|
|
def init_record_parsing_context(base, record):
    ''' Build the context used to parse a single record.

    The new context is a shallow copy of ``base`` with two entries set:

    - ``context['record']``: the record itself
    - ``context['stream']``: a BytesIO over ``record['payload']``

    ``base`` itself is left untouched.

    :param base: the base context to be shallow-copied into the new one
    :param record: the record to be parsed; must have a 'payload' item
    :returns: the new context
    '''
    payload_stream = BytesIO(record['payload'])
    context = dict(base)
    context['record'] = record
    context['stream'] = payload_stream
    return context
|
|
|
|
def parse_models(context, records):
    ''' Parse records into models, yielding only the models.

    Thin wrapper around parse_models_intern() that drops the per-record
    context from each (context, model) pair.

    :param context: the base parsing context
    :param records: iterable of records
    :returns: a generator of models
    '''
    for _record_context, model in parse_models_intern(context, records):
        yield model
|
|
|
|
def parse_models_intern(context, records):
    ''' Parse records into (record context, model) pairs.

    Each record gets its own context, created from ``context`` by
    init_record_parsing_context(), and is parsed by
    parse_models_with_parent(). Any bytes left unread in the record's payload
    stream after parsing are stored in ``model['unparsed']``.

    :param context: the base parsing context; copied for each record
    :param records: iterable of records
    :returns: a generator of (record context, model) pairs
    '''
    # The generator expression is evaluated lazily and looks up ``context``
    # each time it produces an item. The loop below therefore must NOT rebind
    # ``context``; otherwise every record after the first would be
    # initialized from the previous record's context instead of the base one.
    context_models = ((init_record_parsing_context(context, record), record)
                      for record in records)
    context_models = parse_models_with_parent(context_models)
    for record_context, model in context_models:
        unparsed = record_context['stream'].read()
        if unparsed:
            model['unparsed'] = unparsed
        yield record_context, model
|
|
|
|
def parse_models_with_parent(context_models):
    ''' Parse each model after linking its context to its parent.

    The pairs are keyed by ``model['level']`` and passed through
    prefix_ancestors_from_level(); the last ancestor reported for a pair is
    stored in ``context['parent']`` before parse_model() is called. A pair of
    empty dicts is supplied as the root item.

    :param context_models: iterable of (context, model) pairs
    :returns: a generator of the same pairs, parsed in place
    '''
    keyed_by_level = ((model['level'], (context, model))
                      for context, model in context_models)
    root_pair = ({}, {})
    with_ancestors = prefix_ancestors_from_level(keyed_by_level, root_pair)
    for ancestors, pair in with_ancestors:
        context, model = pair
        context['parent'] = ancestors[-1]
        parse_model(context, model)
        yield context, model
|
|
|
|
def parse_model(context, model):
    ''' Determine the model type from the record's HWPTAG and parse it.

    Installs ``context['resolve_values']`` for the record's payload stream,
    runs resolve_model_events() guarded by raise_on_errorevent(), and stores
    the collected events in ``model['binevents']``. resolve_model_events()
    fills in ``model['type']`` and ``model['content']`` along the way.

    :param context: the record's parsing context; must contain 'stream'
    :param model: the model dict, updated in place
    :raises ParseError: if an error event is encountered
    '''
    payload_stream = context['stream']
    context['resolve_values'] = resolve_values_from_stream(payload_stream)
    checked_events = raise_on_errorevent(
        context, resolve_model_events(context, model))
    model['binevents'] = list(checked_events)

    logger.debug('model: %s', model['type'].__name__)
    logger.debug('%s', model['content'])
|
|
|
|
def raise_on_errorevent(context, events):
    ''' Pass events through, raising ParseError after an error event.

    Every event, including the error event itself, is yielded first; the
    ParseError is raised when the generator is resumed after that. The raised
    error carries the original exception (``cause``), the 'path', 'treegroup'
    and 'record' entries of ``context`` (None when absent), the item's
    'bin_offset' (``offset``) and all events seen so far (``binevents``).

    :param context: the parsing context
    :param events: iterable of (event, item) pairs
    :returns: a generator of (event, item) pairs
    :raises ParseError: after an ERROREVENT has been yielded
    '''
    seen_events = []
    for ev, item in events:
        yield ev, item
        seen_events.append((ev, item))
        if ev is not ERROREVENT:
            continue
        cause = item['exception']
        error = ParseError('can\'t parse %s' % item['type'])
        error.cause = cause
        error.path = context.get('path')
        error.treegroup = context.get('treegroup')
        error.record = context.get('record')
        error.offset = item.get('bin_offset')
        error.binevents = seen_events
        raise error
|
|
|
|
def resolve_models(context, records):
    ''' Generate the parse events of the given records.

    For each record a fresh context is created from ``context`` with
    'record' (the record) and 'model' (a dict copy of the record) added. The
    generator then yields:

    - ``(STARTEVENT, record_context)``, with 'parent', 'type' (RecordModel)
      and 'name' (the record's 'tagname') set
    - every event of resolve_model_events() for the record
    - ``(ENDEVENT, record_context)``, with 'value' set to the item of the
      last event

    resolve_model_events() reads ``record_context['resolve_values']``, which
    this function does not set: the consumer is expected to install it when it
    receives the STARTEVENT.

    :param context: the base context; copied for each record
    :param records: iterable of records
    :returns: a generator of (event, item) pairs
    '''
    # The generator expression is lazy and looks up ``context`` each time it
    # produces an item, so the loops below use their own names. Rebinding
    # ``context`` there would make each record's context a copy of the
    # previous record's context (carrying its stale 'value', 'stream', ...)
    # instead of a copy of the base context.
    record_contexts = (dict(context, record=record, model=dict(record))
                       for record in records)

    level_prefixed = ((record_context['model']['level'], record_context)
                      for record_context in record_contexts)
    root_item = {}
    ancestors_prefixed = prefix_ancestors_from_level(level_prefixed, root_item)
    for ancestors, record_context in ancestors_prefixed:
        parent = ancestors[-1]
        # (parent context, parent model); the root item has no 'model'
        record_context['parent'] = parent, parent.get('model', {})

        record_frame = record_context['record']
        record_context['type'] = RecordModel
        record_context['name'] = record_frame['tagname']
        yield STARTEVENT, record_context
        for model_event in resolve_model_events(record_context,
                                                record_context['model']):
            yield model_event
        # the item of the last model event becomes the record's value
        _event, last_item = model_event
        record_context['value'] = last_item
        yield ENDEVENT, record_context
|
|
|
|
def resolve_model_events(context, model):
    ''' Generate the parse events of a single model.

    Steps, in order:

    1. ``model['type']`` is looked up in tag_models by ``model['tagid']``,
       falling back to UnknownTagModel.
    2. The events of that type are yielded; the value of the last one
       becomes ``model['content']``.
    3. If the type has a non-empty ``extension_types`` mapping and its
       ``get_extension_key()`` selects an extension, the events of each class
       returned by get_extension_mro() are yielded, their values are merged
       into ``model['content']`` and ``model['type']`` becomes the extension.
    4. If the context has a 'parent' and the parent's type defines
       ``on_child``, it is called with the parent content, the parent context
       and the ``(context, model)`` pair.

    :param context: the record's context; must contain 'resolve_values'
    :param model: the model dict, updated in place; must contain 'tagid'
    :returns: a generator of (event, item) pairs
    '''
    resolve_values = context['resolve_values']

    base_type = tag_models.get(model['tagid'], UnknownTagModel)
    model['type'] = base_type

    base_events = resolve_type_events(base_type, context, resolve_values)
    for ev, item in base_events:
        yield ev, item

    # the item of the last event carries the parsed value of the whole type
    model['content'] = item['value']

    extension_types = getattr(model['type'], 'extension_types', None)
    if extension_types:
        extension_key = model['type'].get_extension_key(context, model)
        extension = extension_types.get(extension_key)
        if extension is not None:
            # The members of the base type have been read above; only the
            # classes between the extension and the base type that declare
            # their own 'attributes' are resolved here.
            for cls in get_extension_mro(extension, model['type']):
                for ev, item in resolve_type_events(cls, context,
                                                    resolve_values):
                    yield ev, item
                model['content'].update(item['value'])
            model['type'] = extension

    if 'parent' in context:
        parent_context, parent_model = context['parent']
        parent_type = parent_model.get('type')
        parent_content = parent_model.get('content')

        notify_parent = getattr(parent_type, 'on_child', None)
        if notify_parent:
            notify_parent(parent_content, parent_context, (context, model))
|
|
|
|
def get_extension_mro(cls, up_to_cls=None):
    ''' List the classes of an extension that declare their own attributes.

    Walks the method resolution order of ``cls`` and stops before
    ``up_to_cls`` (walks the whole MRO if it is None or never reached). Only
    classes with 'attributes' in their own ``__dict__`` are kept, and they
    are returned base-most first.

    :param cls: the extension class
    :param up_to_cls: the class at which to stop (exclusive)
    :returns: an iterator of classes
    '''
    lineage = takewhile(lambda klass: klass is not up_to_cls,
                        inspect.getmro(cls))
    declaring = [klass for klass in lineage
                 if 'attributes' in klass.__dict__]
    return reversed(declaring)
|
|
|
|
class ModelJsonEncoder(json.JSONEncoder):
    ''' JSON encoder that also accepts byte strings. '''

    def default(self, obj):
        ''' Encode ``bytes`` as text by decoding them as latin1.

        Anything else is delegated to json.JSONEncoder.default(), which
        raises TypeError.
        '''
        if not isinstance(obj, bytes):
            return super(ModelJsonEncoder, self).default(obj)
        return obj.decode('latin1')
|
|
|
|
def model_to_json(model, *args, **kwargs):
    ''' Convert a model to a JSON string.

    Works on a shallow copy of ``model``: 'type' is replaced by the type's
    name, 'payload' (and 'unparsed', if present) by the list produced from
    dumpbytes(), and 'binevents' is dropped.

    :param model: the model dict; must contain 'type' and 'payload'
    :param args: passed on to json.dumps()
    :param kwargs: passed on to json.dumps(); 'cls' is always overridden
        with ModelJsonEncoder
    :returns: the JSON string
    '''
    kwargs['cls'] = ModelJsonEncoder
    exported = dict(model)
    exported['type'] = exported['type'].__name__
    exported['payload'] = list(dumpbytes(exported['payload']))
    if 'unparsed' in exported:
        exported['unparsed'] = list(dumpbytes(exported['unparsed']))
    exported.pop('binevents', None)
    return json.dumps(exported, *args, **kwargs)
|
|
|
|
def chain_iterables(iterables):
    ''' Lazily yield the items of each iterable, one iterable after another.

    :param iterables: iterable of iterables
    :returns: a generator of the items
    '''
    flattened = (element for group in iterables for element in group)
    for element in flattened:
        yield element
|
|
|
|
class ModelStream(recordstream.RecordStream):
    ''' A record stream whose records can be parsed into models. '''

    def models(self, **kwargs):
        ''' Parse the records of this stream into models.

        ``kwargs`` is used as the base parsing context. 'version' defaults to
        ``self.version`` and 'path' to ``self.path`` when the stream has one.

        :param kwargs: base parsing context; if 'treegroup' is given (and not
            None), only the records of that tree group are parsed
        :returns: an iterable of models
        '''
        kwargs.setdefault('version', self.version)
        try:
            kwargs.setdefault('path', self.path)
        except AttributeError:
            # this stream has no path; parse without one
            pass
        treegroup = kwargs.get('treegroup', None)
        if treegroup is not None:
            records = self.records_treegroup(treegroup)
            models = parse_models(kwargs, records)
        else:
            groups = self.models_treegrouped(**kwargs)
            models = chain_iterables(groups)
        return models

    def models_treegrouped(self, **kwargs):
        ''' iterable of iterable of the models, grouped by the top-level tree

        Each group is parsed with its own copy of the context, with
        'treegroup' set to the index of the group.
        '''
        kwargs.setdefault('version', self.version)
        for group_idx, records in enumerate(self.records_treegrouped()):
            # parse_models() is lazy: give every group its own context
            # snapshot, so that a group consumed after later groups have been
            # produced still sees its own 'treegroup' index.
            group_context = dict(kwargs, treegroup=group_idx)
            yield parse_models(group_context, records)

    def model(self, idx):
        ''' Return the idx-th model of this stream, as selected by nth(). '''
        return nth(self.models(), idx)

    def models_json(self, **kwargs):
        ''' Wrap the models in a JsonObjects, serialized by model_to_json().

        :param kwargs: passed on to models()
        '''
        models = self.models(**kwargs)
        return JsonObjects(models, model_to_json)

    def other_formats(self):
        ''' Extend the base class's formats with a '.models' entry. '''
        d = super(ModelStream, self).other_formats()
        d['.models'] = self.models_json().open
        return d

    def parse_model_events(self):
        ''' Generate the parse events of all the records in this stream.

        Drives resolve_models() group by group. For the RecordModel-typed
        start/end events it manages the record's payload stream:

        - on STARTEVENT, 'stream' and 'resolve_values' are installed in the
          item before the record's members are resolved
        - on ENDEVENT, 'leftover' is set to the stream offset reached and
          the bytes that remain unread

        :returns: a generator of (event, item) pairs
        '''
        base_context = dict(version=self.version)

        for group_idx, records in enumerate(self.records_treegrouped()):
            group_context = dict(base_context, treegroup=group_idx)
            for x in resolve_models(group_context, records):
                event, item = x
                if item['type'] is RecordModel:
                    if event is STARTEVENT:
                        record_frame = item['record']
                        stream = BytesIO(record_frame['payload'])
                        resolve_values = resolve_values_from_stream(stream)
                        item['stream'] = stream
                        item['resolve_values'] = resolve_values
                    elif event is ENDEVENT:
                        stream = item['stream']
                        # tell() must be evaluated before read()
                        item['leftover'] = {
                            'offset': stream.tell(),
                            'bytes': stream.read()
                        }
                yield x
|
|
|
|
class DocInfo(ModelStream):
    ''' Model stream with lookups for id mappings, face names and character
    shapes.

    Every property and method here re-iterates ``self.models()``; nothing is
    cached.
    '''

    @property
    def idmappings(self):
        ''' The first model of type IdMappings, or None if there is none. '''
        for model in self.models():
            if model['type'] is IdMappings:
                return model
        return None

    @property
    def facenames_by_lang(self):
        ''' The FaceName models, split by language.

        The FaceName models are taken in stream order and sliced into
        consecutive runs, one per language, whose lengths are the
        ``<lang>_fonts`` values of the IdMappings content.

        :returns: a dict mapping language code to a list of FaceName models
        '''
        facenames = [m for m in self.models()
                     if m['type'] is FaceName]
        # Look the id mappings up once: every access to self.idmappings
        # iterates over the models again.
        font_counts = self.idmappings['content']
        languages = 'ko', 'en', 'cn', 'jp', 'other', 'symbol', 'user'
        facenames_by_lang = dict()
        offset = 0
        for lang in languages:
            n_fonts = font_counts[lang + '_fonts']
            facenames_by_lang[lang] = facenames[offset:offset + n_fonts]
            offset += n_fonts
        return facenames_by_lang

    @property
    def charshapes(self):
        ''' A generator of the models of type CharShape, in stream order. '''
        return (m for m in self.models()
                if m['type'] is CharShape)

    def get_charshape(self, charshape_id):
        ''' Return the CharShape model at position ``charshape_id``. '''
        return nth(self.charshapes, charshape_id)

    def charshape_lang_facename(self, charshape_id, lang):
        ''' Return the FaceName model a character shape uses for a language.

        :param charshape_id: position of the CharShape model
        :param lang: language code, a key of facenames_by_lang
        '''
        charshape = self.get_charshape(charshape_id)
        lang_facename_offset = charshape['content']['font_face'][lang]
        return self.facenames_by_lang[lang][lang_facename_offset]
|
|
|
|
class Sections(recordstream.Sections):
    ''' Sections variant that sets ``section_class`` to ModelStream. '''

    section_class = ModelStream
|
|
|
|
class Hwp5File(recordstream.Hwp5File):
    ''' Hwp5File variant that sets ``docinfo_class`` to DocInfo and
    ``bodytext_class`` to Sections.
    '''

    docinfo_class = DocInfo
    bodytext_class = Sections
|
|
|
|
def create_context(file=None, **context):
    ''' Build a parsing context from keyword arguments.

    When ``file`` is given, ``file.fileheader.version`` is stored as
    'version', replacing any 'version' passed as a keyword.

    :param file: optional object with a ``fileheader.version`` attribute
    :param context: the initial context entries
    :returns: the context dict
    :raises AssertionError: if the resulting context has no 'version'
    '''
    file_given = file is not None
    if file_given:
        context.update(version=file.fileheader.version)
    assert 'version' in context
    return context
|
|