Spaces:

tlkh
/

paraphrase-metrics-mrpc

Runtime error

File size: 8,403 Bytes

df3852e

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "32744c39-dae0-4f8c-beea-af8cf934f977",
   "metadata": {},
   "outputs": [],
   "source": [
    "from paraphrase_metrics import metrics as pm\n",
    "import pandas as pd\n",
    "import spacy\n",
    "from tqdm import tqdm\n",
    "# Name of the spaCy pipeline package that every later cell parses sentences with.\n",
    "SPACY_MODEL = \"en_core_web_sm\"\n",
    "nlp = spacy.load(SPACY_MODEL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c219268a-d25b-4b27-b0eb-0bb578ec3450",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>og_s1</th>\n",
       "      <th>og_s2</th>\n",
       "      <th>new_s1</th>\n",
       "      <th>new_s2</th>\n",
       "      <th>og_label</th>\n",
       "      <th>new_label</th>\n",
       "      <th>remarks</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Amrozi accused his brother, whom he called \"th...</td>\n",
       "      <td>Referring to him as only \"the witness\", Amrozi...</td>\n",
       "      <td>Amrozi accused his brother, whom he called \"th...</td>\n",
       "      <td>Referring to him as only \"the witness\", Amrozi...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>no need to correct</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Yucaipa owned Dominick's before selling the ch...</td>\n",
       "      <td>Yucaipa bought Dominick's in 1995 for $693 mil...</td>\n",
       "      <td>Yucaipa owned Dominick's before selling the ch...</td>\n",
       "      <td>Yucaipa bought Dominick's in 1995 for $693 mil...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>no need to correct</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>They had published an advertisement on the Int...</td>\n",
       "      <td>On June 10, the ship's owners had published an...</td>\n",
       "      <td>They had published an advertisement on the Int...</td>\n",
       "      <td>On June 10, the ship's owners had published an...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>no need to correct</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Around 0335 GMT, Tab shares were up 19 cents, ...</td>\n",
       "      <td>Tab shares jumped 20 cents, or 4.6%, to set a ...</td>\n",
       "      <td>Around 0335 GMT, Tab shares were up 19 cents, ...</td>\n",
       "      <td>Tab shares jumped 20 cents, or 4.6%, to set a ...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>no need to correct</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>The stock rose $2.11, or about 11 percent, to ...</td>\n",
       "      <td>PG&amp;E Corp. shares jumped $1.63 or 8 percent to...</td>\n",
       "      <td>The stock rose $2.11, or about 11 percent, to ...</td>\n",
       "      <td>PG&amp;E Corp. shares jumped $1.63 or 8 percent to...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>can't correct</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               og_s1  \\\n",
       "0  Amrozi accused his brother, whom he called \"th...   \n",
       "1  Yucaipa owned Dominick's before selling the ch...   \n",
       "2  They had published an advertisement on the Int...   \n",
       "3  Around 0335 GMT, Tab shares were up 19 cents, ...   \n",
       "4  The stock rose $2.11, or about 11 percent, to ...   \n",
       "\n",
       "                                               og_s2  \\\n",
       "0  Referring to him as only \"the witness\", Amrozi...   \n",
       "1  Yucaipa bought Dominick's in 1995 for $693 mil...   \n",
       "2  On June 10, the ship's owners had published an...   \n",
       "3  Tab shares jumped 20 cents, or 4.6%, to set a ...   \n",
       "4  PG&E Corp. shares jumped $1.63 or 8 percent to...   \n",
       "\n",
       "                                              new_s1  \\\n",
       "0  Amrozi accused his brother, whom he called \"th...   \n",
       "1  Yucaipa owned Dominick's before selling the ch...   \n",
       "2  They had published an advertisement on the Int...   \n",
       "3  Around 0335 GMT, Tab shares were up 19 cents, ...   \n",
       "4  The stock rose $2.11, or about 11 percent, to ...   \n",
       "\n",
       "                                              new_s2  og_label  new_label  \\\n",
       "0  Referring to him as only \"the witness\", Amrozi...         1          1   \n",
       "1  Yucaipa bought Dominick's in 1995 for $693 mil...         0          0   \n",
       "2  On June 10, the ship's owners had published an...         1          1   \n",
       "3  Tab shares jumped 20 cents, or 4.6%, to set a ...         0          0   \n",
       "4  PG&E Corp. shares jumped $1.63 or 8 percent to...         1          0   \n",
       "\n",
       "              remarks  \n",
       "0  no need to correct  \n",
       "1  no need to correct  \n",
       "2  no need to correct  \n",
       "3  no need to correct  \n",
       "4       can't correct  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Dataset split to process; it selects which corrected CSV is read here.\n",
    "split = \"train\"\n",
    "corrected_csv = f\"./mrpc_{split}_corrected.csv\"\n",
    "df = pd.read_csv(corrected_csv)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "36cba8a1-8997-4563-9c33-4d4f6049ec5a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 4409/4409 [00:53<00:00, 82.98it/s]\n"
     ]
    }
   ],
   "source": [
    "def _score_pair(text_a, text_b):\n",
    "    \"\"\"Parse two sentences with `nlp` and return their `(pm.wpd, pm.ld)` scores.\"\"\"\n",
    "    doc_a, doc_b = nlp(text_a), nlp(text_b)\n",
    "    return pm.wpd(doc_a, doc_b), pm.ld(doc_a, doc_b)\n",
    "\n",
    "\n",
    "# One list per (pair version, metric); each receives one entry per row of df.\n",
    "og_wpd_list, og_ld_list = [], []\n",
    "new_wpd_list, new_ld_list = [], []\n",
    "\n",
    "# Walk the four sentence columns in lockstep, in row order.\n",
    "sentence_columns = zip(df['og_s1'], df['og_s2'], df['new_s1'], df['new_s2'])\n",
    "for og_a, og_b, new_a, new_b in tqdm(sentence_columns, total=len(df)):\n",
    "    # original pair\n",
    "    wpd_score, ld_score = _score_pair(og_a, og_b)\n",
    "    og_wpd_list.append(wpd_score)\n",
    "    og_ld_list.append(ld_score)\n",
    "\n",
    "    # new pair\n",
    "    wpd_score, ld_score = _score_pair(new_a, new_b)\n",
    "    new_wpd_list.append(wpd_score)\n",
    "    new_ld_list.append(ld_score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "afcf311c-3c44-4d08-b8df-8decf051e315",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add each score list to df in place, as a column named by its key below.\n",
    "score_columns = {\n",
    "    \"og_wpd\": og_wpd_list,\n",
    "    \"og_ld\": og_ld_list,\n",
    "    \"new_wpd\": new_wpd_list,\n",
    "    \"new_ld\": new_ld_list,\n",
    "}\n",
    "for column_name, scores in score_columns.items():\n",
    "    df[column_name] = scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "9e8c3562-58ad-43d0-b887-218769a885b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write df to the per-split scores CSV; the row index is not written.\n",
    "scores_csv = f\"./mrpc_{split}_scores.csv\"\n",
    "df.to_csv(scores_csv, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05aa6437-c11c-4f06-9e23-a300d26e128d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}