{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "32744c39-dae0-4f8c-beea-af8cf934f977", "metadata": {}, "outputs": [], "source": [ "from paraphrase_metrics import metrics as pm\n", "import pandas as pd\n", "import spacy\n", "from tqdm import tqdm\n", "nlp = spacy.load(\"en_core_web_sm\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "c219268a-d25b-4b27-b0eb-0bb578ec3450", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
og_s1og_s2new_s1new_s2og_labelnew_labelremarks
0Amrozi accused his brother, whom he called \"th...Referring to him as only \"the witness\", Amrozi...Amrozi accused his brother, whom he called \"th...Referring to him as only \"the witness\", Amrozi...11no need to correct
1Yucaipa owned Dominick's before selling the ch...Yucaipa bought Dominick's in 1995 for $693 mil...Yucaipa owned Dominick's before selling the ch...Yucaipa bought Dominick's in 1995 for $693 mil...00no need to correct
2They had published an advertisement on the Int...On June 10, the ship's owners had published an...They had published an advertisement on the Int...On June 10, the ship's owners had published an...11no need to correct
3Around 0335 GMT, Tab shares were up 19 cents, ...Tab shares jumped 20 cents, or 4.6%, to set a ...Around 0335 GMT, Tab shares were up 19 cents, ...Tab shares jumped 20 cents, or 4.6%, to set a ...00no need to correct
4The stock rose $2.11, or about 11 percent, to ...PG&E Corp. shares jumped $1.63 or 8 percent to...The stock rose $2.11, or about 11 percent, to ...PG&E Corp. shares jumped $1.63 or 8 percent to...10can't correct
\n", "
" ], "text/plain": [ " og_s1 \\\n", "0 Amrozi accused his brother, whom he called \"th... \n", "1 Yucaipa owned Dominick's before selling the ch... \n", "2 They had published an advertisement on the Int... \n", "3 Around 0335 GMT, Tab shares were up 19 cents, ... \n", "4 The stock rose $2.11, or about 11 percent, to ... \n", "\n", " og_s2 \\\n", "0 Referring to him as only \"the witness\", Amrozi... \n", "1 Yucaipa bought Dominick's in 1995 for $693 mil... \n", "2 On June 10, the ship's owners had published an... \n", "3 Tab shares jumped 20 cents, or 4.6%, to set a ... \n", "4 PG&E Corp. shares jumped $1.63 or 8 percent to... \n", "\n", " new_s1 \\\n", "0 Amrozi accused his brother, whom he called \"th... \n", "1 Yucaipa owned Dominick's before selling the ch... \n", "2 They had published an advertisement on the Int... \n", "3 Around 0335 GMT, Tab shares were up 19 cents, ... \n", "4 The stock rose $2.11, or about 11 percent, to ... \n", "\n", " new_s2 og_label new_label \\\n", "0 Referring to him as only \"the witness\", Amrozi... 1 1 \n", "1 Yucaipa bought Dominick's in 1995 for $693 mil... 0 0 \n", "2 On June 10, the ship's owners had published an... 1 1 \n", "3 Tab shares jumped 20 cents, or 4.6%, to set a ... 0 0 \n", "4 PG&E Corp. shares jumped $1.63 or 8 percent to... 
1 0 \n", "\n", " remarks \n", "0 no need to correct \n", "1 no need to correct \n", "2 no need to correct \n", "3 no need to correct \n", "4 can't correct " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Which MRPC split to score; it is used to build both the input and output file names.\n",
 "split = \"train\"\n",
 "df = pd.read_csv(f\"./mrpc_{split}_corrected.csv\")\n",
 "df.head()" ] }, { "cell_type": "code", "execution_count": 3, "id": "36cba8a1-8997-4563-9c33-4d4f6049ec5a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 4409/4409 [00:53<00:00, 82.98it/s]\n" ] } ], "source": [
 "def score_pair(text_a, text_b):\n",
 "    \"\"\"Parse both sentences with spaCy and return their (pm.wpd, pm.ld) scores.\"\"\"\n",
 "    doc_a, doc_b = nlp(text_a), nlp(text_b)\n",
 "    return pm.wpd(doc_a, doc_b), pm.ld(doc_a, doc_b)\n",
 "\n",
 "\n",
 "# One score list per output column, filled in row order.\n",
 "og_wpd_list, og_ld_list = [], []\n",
 "new_wpd_list, new_ld_list = [], []\n",
 "\n",
 "# The row index is not needed, only the sentence columns of each row.\n",
 "for _, row in tqdm(df.iterrows(), total=len(df)):\n",
 "    # original pair\n",
 "    og_wpd, og_ld = score_pair(row['og_s1'], row['og_s2'])\n",
 "    og_wpd_list.append(og_wpd)\n",
 "    og_ld_list.append(og_ld)\n",
 "\n",
 "    # new pair\n",
 "    new_wpd, new_ld = score_pair(row['new_s1'], row['new_s2'])\n",
 "    new_wpd_list.append(new_wpd)\n",
 "    new_ld_list.append(new_ld)" ] }, { "cell_type": "code", "execution_count": 4, "id": "afcf311c-3c44-4d08-b8df-8decf051e315", "metadata": {}, "outputs": [], "source": [
 "# Attach the per-row scores as new columns next to the sentence pairs.\n",
 "df[\"og_wpd\"] = og_wpd_list\n",
 "df[\"og_ld\"] = og_ld_list\n",
 "df[\"new_wpd\"] = new_wpd_list\n",
 "df[\"new_ld\"] = new_ld_list" ] }, { "cell_type": "code", "execution_count": 5, "id": "9e8c3562-58ad-43d0-b887-218769a885b7", "metadata": {}, "outputs": [], "source": [
 "# Write the input columns plus the four score columns; the DataFrame index is not saved.\n",
 "df.to_csv(f\"./mrpc_{split}_scores.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "05aa6437-c11c-4f06-9e23-a300d26e128d", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", 
"version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }