{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing pipeline checks\n",
    "\n",
    "Scratch notebook that\n",
    "\n",
    "1. converts the English test set from pandas to vaex, and\n",
    "2. runs `PreprocessingPipeline` stage by stage on the Chinese test set.\n",
    "\n",
    "Outputs are cleared on purpose: the previously saved outputs came from two different kernel sessions. Use **Restart Kernel → Run All** to regenerate them.\n",
    "\n",
    "**Observation from the last saved run (first Chinese row):** `pipe.pre` and the lemmatizer returned the text unchanged, while `pipe.post` lowercased the Latin tokens and dropped their digits (`mate7` → `mate`, `MP4` → `mp`) and left the Chinese tokens intact."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import re\n",
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "import vaex\n",
    "\n",
    "# `src` is imported from the parent directory, so it has to be on sys.path first.\n",
    "sys.path.append(\"..\")\n",
    "from src.preprocessing import PreprocessingPipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Single place to change where the test files are read from.\n",
    "DATA_DIR = Path(\"../data\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "----\n",
    "## Test vaex\n",
    "\n",
    "Check that `vaex.from_pandas` accepts both the full English frame and a 1,000-row slice of it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_en = pd.read_csv(DATA_DIR / \"test_en.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vaex.from_pandas(reviews_en)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_en_small = reviews_en.iloc[:1000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vaex.from_pandas(reviews_en_small)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "----\n",
    "## Preprocessing pipeline on the Chinese test set\n",
    "\n",
    "The Chinese data gets its own name (`reviews_zh`) so it never silently replaces the English frame loaded above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_zh = pd.read_excel(DATA_DIR / \"test_chinese.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Steps applied before lemmatization. Commented entries are available toggles.\n",
    "pre_steps = [\n",
    "    \"normalize_unicode\",\n",
    "    \"normalize_acronyms\",\n",
    "    \"normalize_bullet_points\",\n",
    "    \"normalize_hyphenated_words\",\n",
    "    \"normalize_quotation_marks\",\n",
    "    \"normalize_whitespaces\",\n",
    "    \"normalize_repeating_words\",\n",
    "    \"normalize_repeating_chars\",\n",
    "    \"normalize_useless_spaces\",\n",
    "    # \"replace_currency_symbols\",\n",
    "    # \"replace_emails\",\n",
    "    # \"replace_emojis\",\n",
    "    # \"replace_hashtags\",\n",
    "    # \"replace_numbers\",\n",
    "    # \"replace_phone_numbers\",\n",
    "    # \"replace_urls\",\n",
    "    # \"replace_user_handles\",\n",
    "    # \"remove_accents\",\n",
    "    # \"remove_brackets\",\n",
    "    # \"remove_html_tags\",\n",
    "    # \"remove_non_words\",\n",
    "    # \"remove_punctuation\",\n",
    "    # \"lowercase\",\n",
    "    \"strip\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Steps applied after lemmatization. Commented entries are available toggles.\n",
    "post_steps = [\n",
    "    \"lowercase\",\n",
    "    # \"replace_currency_symbols\",\n",
    "    # \"replace_urls\",\n",
    "    # \"replace_emails\",\n",
    "    # \"replace_user_handles\",\n",
    "    # \"replace_hashtags\",\n",
    "    # \"replace_emojis\",\n",
    "    # \"replace_phone_numbers\",\n",
    "    # \"replace_numbers\",\n",
    "    # \"remove_html_tags\",\n",
    "    # \"remove_accents\",\n",
    "    # \"remove_brackets\",\n",
    "    \"remove_non_words\",\n",
    "    # \"remove_numbers\",\n",
    "    # \"remove_punctuation\",\n",
    "    \"normalize_repeating_words\",\n",
    "    \"normalize_repeating_chars\",\n",
    "    \"normalize_useless_spaces\",\n",
    "    \"strip\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe = PreprocessingPipeline(\n",
    "    language=\"Chinese\",\n",
    "    lemmatization_step=\"Spacy lemmatizer (keep stopwords)\",  # alternative: \"Disable lemmatizer\"\n",
    "    pre_steps=pre_steps,\n",
    "    post_steps=post_steps,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Stage-by-stage output on one row\n",
    "\n",
    "Each cell adds one stage (`pre` → `nlp` + `lemma` → `post`) so the effect of every stage can be compared against the raw text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_text = reviews_zh.text[0]\n",
    "sample_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe.pre(sample_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe.lemma(pipe.nlp(pipe.pre(sample_text)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe.post(pipe.lemma(pipe.nlp(pipe.pre(sample_text))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe.post"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Full frame through vaex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_zh_processed = pipe.vaex_process(reviews_zh, \"text\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_zh_processed"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "----\n",
    "## Available components\n",
    "\n",
    "Positions of the selected steps inside the full list of pipeline components, plus the component and lemmatizer names themselves."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "steps = list(PreprocessingPipeline.pipeline_components().keys())\n",
    "default_pre_steps_idx = [steps.index(i) for i in pre_steps]\n",
    "default_post_steps_idx = [steps.index(i) for i in post_steps]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "default_pre_steps_idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "default_post_steps_idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `steps` already holds the component names, so sort that list directly.\n",
    "sorted(steps)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(PreprocessingPipeline.lemmatization_component().keys())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "----\n",
    "## Regex scratch: ASCII-letters-only filter\n",
    "\n",
    "`[^A-Za-z]+` matches every run of characters outside ASCII `A-Z` / `a-z`, so digits, and any non-Latin script such as Chinese, are replaced by the substitution string. With a single space as replacement, `\"Mimmo23\"` becomes `\"Mimmo \"`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "_re_non_words = re.compile(\"[^A-Za-z]+\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "_re_non_words.sub(\" \", \"Mimmo23\")"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "aa7efd0b3ada76bb0689aa8ed0b61d7de788847e3d11d2d142fc5800c765982f"
  },
  "kernelspec": {
   "display_name": "Python 3.7.11 64-bit ('wordify': conda)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.11"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}