This is a fine-tuned proof-of-concept (PoC) of the markuplm-base model for parsing news attributes from web pages: author, publication date, title, content, etc.

Inference example

The code accepts a URL as input, loads the web page, and returns a JSON object with the extracted data (author, publication date, title, and content).
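For reference, the returned dictionary has the shape sketched below. The URL and field values here are purely illustrative, and the exact type of "title" depends on what the repository's rank_titles helper returns for the collected candidates:

{
    "model_name": "OxMarkupLM.pt",
    "url": "https://example.com/news/some-article",
    "title": "...",
    "author": ["Jane Doe"],
    "date": ["January 1, 2024"],
    "content": ["First paragraph of the article ...", "Second paragraph ..."]
}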


import os

import torch
from transformers import MarkupLMForTokenClassification, MarkupLMProcessor

# `utils` is a helper module shipped with this repository (HTML fetching/cleaning,
# node/XPath extraction, sliding-window splitting).
import utils
from utils import rank_titles  # assumption: rank_titles (title re-ranking) also lives in utils

id2label = {
    0: "none", 1: "title", 2: "content", 3: "author", 4: "date",
    5: "header", 6: "footer", 7: "rail", 8: "advertisement", 9: "navigation",
}
label2id = {label: idx for idx, label in id2label.items()}


def eval(url):
    current_dir = os.path.dirname(os.path.abspath(__file__))

    model_folder = os.path.join(current_dir, 'models')  # the models folder is in the repository root
    model_name = 'OxMarkupLM.pt'
    model_path = os.path.join(model_folder, model_name)

    # The processor is only used for tokenization; the HTML is turned into
    # nodes/xpaths by the repository helpers, so built-in HTML parsing is disabled.
    processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base")
    processor.parse_html = False

    model = MarkupLMForTokenClassification.from_pretrained(
        model_path, id2label=id2label, label2id=label2id
    )

    # Fetch and clean the page, extract text nodes with their XPaths,
    # and split them into windows that fit the 512-token limit.
    html = utils.clean_html(utils.get_html_content(url))
    data = [utils.extract_nodes_and_feautures(html)]
    example = utils.split_sliding_data(data, 10, 0)

    title, author, date, content = [], [], [], []
    for chunk in example:
        nodes, xpaths = chunk['nodes'], chunk['xpaths']
        encoding = processor(
            nodes=nodes, xpaths=xpaths, return_offsets_mapping=True,
            padding="max_length", truncation=True, max_length=512, return_tensors="pt"
        )
        offset_mapping = encoding.pop("offset_mapping")
        with torch.no_grad():
            logits = model(**encoding).logits

        predictions = logits.argmax(-1)
        processed_words = []

        # Keep only the first sub-token of each node (character offset 0) and route
        # the node text into the bucket that matches its predicted label.
        for pred_id, word_id, offset in zip(predictions[0].tolist(), encoding.word_ids(0), offset_mapping[0].tolist()):
            if word_id is not None and offset[0] == 0:
                if pred_id == 1:
                    title.append(nodes[word_id])
                elif pred_id == 2 and word_id not in processed_words:
                    processed_words.append(word_id)
                    content.append(nodes[word_id])
                elif pred_id == 3:
                    author.append(nodes[word_id])
                elif pred_id == 4:
                    date.append(nodes[word_id])

    # Pick the best title candidate given the extracted content (repository helper).
    title = rank_titles(title, '\n'.join(content))
    return {
        "model_name": model_name,
        "url": url,
        "title": title,
        "author": author,
        "date": date,
        "content": content,
    }
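
A minimal way to call the function, assuming the repository's helper modules are importable and the URL below is replaced with a real article page. Note that the function name shadows Python's built-in eval, so consider renaming it in your own code:

import json

result = eval("https://example.com/news/some-article")  # placeholder URL
print(json.dumps(result, indent=2, ensure_ascii=False))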