{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "aa1a8952", "metadata": {}, "outputs": [], "source": [ "#import libraries\n", "from transformers import pipeline" ] }, { "cell_type": "code", "execution_count": 1, "id": "5493cee5", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9b94e18c676540d1ad8312149bdbcc7c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "model.safetensors: 0%| | 0.00/431M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "586b43b1b9e2484c9b8636f9b41ab5e2", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer_config.json: 0%| | 0.00/1.19k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d6c722044e074fd58b9cf1c67fa48551", "version_major": 2, "version_minor": 0 }, "text/plain": [ "vocab.txt: 0%| | 0.00/213k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "95002c472f794e3b9b96dcfba697395e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer.json: 0%| | 0.00/669k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b71e5e8241de454c9bd5f7066b3f5413", "version_major": 2, "version_minor": 0 }, "text/plain": [ "special_tokens_map.json: 0%| | 0.00/125 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Hugging Face Hub id of the checkpoint to load\n", "model_name = 'koakande/bert-finetuned-ner'\n", "\n", "# Build the token-classification pipeline from that checkpoint.\n", "# With aggregation_strategy='simple' each result is a grouped entity\n", "# record (entity_group, score, word, start, end).\n", "model = pipeline(\n", "    task='token-classification',\n", "    model=model_name,\n", "    aggregation_strategy='simple',\n", ")" ] }, { "cell_type": "code", "execution_count": 3, "id": "f59488e3", "metadata": {}, "outputs": [ {
"data": { "text/plain": [ "'koakande/bert-finetuned-ner'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model_name" ] }, { "cell_type": "code", "execution_count": 4, "id": "e6b97a0e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'entity_group': 'PER',\n", " 'score': 0.99741244,\n", " 'word': 'Kabeer',\n", " 'start': 12,\n", " 'end': 18},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9985826,\n", " 'word': 'OVO',\n", " 'start': 61,\n", " 'end': 64},\n", " {'entity_group': 'LOC',\n", " 'score': 0.99884343,\n", " 'word': 'UK',\n", " 'start': 72,\n", " 'end': 74}]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "msg = \"Hello, I am Kabeer. I work as a machine learning engineer at OVO in the UK\"\n", "model(msg)" ] }, { "cell_type": "code", "execution_count": 5, "id": "7c54c0ca", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"Hello, I am Kabeer. I work as a machine learning engineer at OVO in the UK\"" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# write a prediction method for the model\n", "def predict_entities(text):\n", " # Use the loaded model to identify entities in the text\n", " entities = model(text)\n", " # Highlight identified entities in the input text\n", " highlighted_text = text\n", " for entity in entities:\n", " entity_text = text[entity['start']:entity['end']]\n", " replacement = f\"{entity_text}\"\n", " highlighted_text = highlighted_text.replace(entity_text, replacement)\n", " return highlighted_text\n", "\n", "predict_entities(msg)" ] }, { "cell_type": "code", "execution_count": 6, "id": "4d784554", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Running on local URL: http://127.0.0.1:7860\n", "\n", "To create a public link, set `share=True` in `launch()`.\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "