{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "overview-notes",
   "metadata": {},
   "source": [
    "# Entity extraction with `email_parser.nlp`\n",
    "\n",
    "This notebook calls three helpers from `email_parser.nlp` on a short sample string:\n",
    "\n",
    "1. `f_detect_language(text)` to get a language code for `text`.\n",
    "2. `f_ner(text, lang=...)` to get a DataFrame of entities (columns `entity`, `value`, `start`, `end`, `score`).\n",
    "3. `f_detect_email_signature(text, lang=...)`.\n",
    "\n",
    "The last section labels every whitespace-separated token of `text` with the entity that overlaps it.\n",
    "\n",
    "Outputs are intentionally cleared: run **Restart Kernel -> Run All** to regenerate them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "spiritual-swift",
   "metadata": {},
   "outputs": [],
   "source": [
    "%config Completer.use_jedi = False\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "stopped-single",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): `tensorflow` and `pipeline` are not referenced by any cell below;\n",
    "# presumably they are leftovers or imported for side effects - confirm before removing.\n",
    "import tensorflow\n",
    "import regex\n",
    "from transformers import pipeline\n",
    "\n",
    "from email_parser import nlp"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-sample-input",
   "metadata": {},
   "source": [
    "## Sample input and language detection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "studied-oracle",
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"\"\"tel: 512 222 5555\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "pacific-walter",
   "metadata": {},
   "outputs": [],
   "source": [
    "lang = nlp.f_detect_language(text)\n",
    "lang"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-entities",
   "metadata": {},
   "source": [
    "## Entity extraction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "every-gardening",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_result = nlp.f_ner(text, lang=lang)\n",
    "df_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "delayed-overhead",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): the language is hard-coded to \"fr\" here although the detected\n",
    "# language is available in `lang` - confirm this is intentional.\n",
    "nlp.f_detect_email_signature(text, lang=\"fr\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-token-labels",
   "metadata": {},
   "source": [
    "## Token-level entity labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "frozen-jones",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a list of (token, entity) pairs: one pair per whitespace-separated token\n",
    "# of `text`, labelled with the first row of `df_result` whose character span\n",
    "# overlaps the token (None when no entity overlaps).\n",
    "#\n",
    "# Overlap is used instead of \"entity fits inside the token\" so that every word of\n",
    "# a multi-word entity such as \"512 222 5555\" receives the label.\n",
    "# Assumes `start`/`end` are character offsets into `text` with `end` exclusive\n",
    "# - TODO confirm against `nlp.f_ner`.\n",
    "list_values = []\n",
    "for match in regex.finditer(r\"\\S+\", text):  # raw string: no invalid \"\\s\" escape, no empty tokens\n",
    "    token_start, token_end = match.span()\n",
    "    df_entity = df_result.query(f\"start < {token_end} & end > {token_start}\").head(1)\n",
    "    entity = df_entity[\"entity\"].iloc[0] if len(df_entity) == 1 else None\n",
    "    list_values.append((match.group(), entity))\n",
    "list_values"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}