{ "cells": [
 { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# Two CSVs that are joined side by side further down (row i of one with row i of the other).\n", "df_input = pd.read_csv('sampled_data.csv')\n", "df_inferenced = pd.read_csv('inference_output.csv')" ] },
 { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1000\n", "1000\n" ] } ], "source": [ "# Row counts of both frames; they must agree for the positional join below.\n", "print(len(df_input))\n", "print(len(df_inferenced))" ] },
 { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "# pd.concat(axis=1) aligns on the index, so frames of unequal length would be padded\n", "# with NaN instead of raising. Fail loudly here rather than evaluate misaligned rows.\n", "assert len(df_input) == len(df_inferenced), 'input and inference row counts differ'\n", "df_combined = pd.concat([df_input, df_inferenced], axis=1)" ] },
 { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titletextlabelOutputTokens UsedFinish Reason
0Live at Truthdig: Robert Scheer and Thomas Fra...Live at Truthdig: Robert Scheer and Thomas Fra...0Real265stop
1The Mirage of a Return to Manufacturing Greatn...Half a century ago, harvesting California’s 2....1Real1627stop
2British PM expected to offer to fill post-Brex...(Reuters) - The British government has told Ge...1fake200stop
3Checkmating ObamaOriginally published by the Jerusalem Post . \\...0fake2166stop
4Thirty-eight injured in police charges in Cata...MADRID (Reuters) - Emergency services have att...1Real176stop
\n", "
" ], "text/plain": [ " title \\\n", "0 Live at Truthdig: Robert Scheer and Thomas Fra... \n", "1 The Mirage of a Return to Manufacturing Greatn... \n", "2 British PM expected to offer to fill post-Brex... \n", "3 Checkmating Obama \n", "4 Thirty-eight injured in police charges in Cata... \n", "\n", " text label Output \\\n", "0 Live at Truthdig: Robert Scheer and Thomas Fra... 0 Real \n", "1 Half a century ago, harvesting California’s 2.... 1 Real \n", "2 (Reuters) - The British government has told Ge... 1 fake \n", "3 Originally published by the Jerusalem Post . \\... 0 fake \n", "4 MADRID (Reuters) - Emergency services have att... 1 Real \n", "\n", " Tokens Used Finish Reason \n", "0 265 stop \n", "1 1627 stop \n", "2 200 stop \n", "3 2166 stop \n", "4 176 stop " ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_combined.head()" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['stop', 'length'], dtype=object)" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_combined[\"Finish Reason\"].unique()" ] },
 { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "994" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Drop rows whose finish reason is 'length' (presumably truncated generations - confirm).\n", "# .copy() gives an independent frame, so later column assignments never touch a slice.\n", "df_combined = df_combined.loc[df_combined[\"Finish Reason\"] != \"length\"].copy()\n", "len(df_combined)" ] },
 { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "# Remove the columns listed below. drop() without inplace returns a new frame\n", "# instead of mutating the filtered one in place.\n", "df_combined = df_combined.drop(columns=[\"title\", \"text\", \"Tokens Used\", \"Finish Reason\"])" ] },
 { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "# Normalise the text in `Output` (trim, lower-case) and encode it as real -> 1, fake -> 0.\n", "# Values outside the mapping become NaN.\n", "# pandas deprecates `df.loc[:, col] = values` for replacing a whole column; plain column\n", "# assignment is the documented replacement and does not warn.\n", "df_combined[\"Output\"] = df_combined[\"Output\"].str.strip().str.lower().map({\"real\": 1, \"fake\": 0})" ] },
 { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "994" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(df_combined)" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.7323943661971831\n", "F1 Score: 0.5969696969696969\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\kimi\\AppData\\Local\\Temp\\ipykernel_31372\\2541391757.py:14: MatplotlibDeprecationWarning: The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-