Spaces:

danieldux
/

isco_hierarchical_accuracy

Sleeping

App Files Files Community

danieldux commited on Apr 3, 2024

Commit

cb3e43c

1 Parent(s): d726519

tests notebook

Browse files

Files changed (1) hide show

tests.ipynb +275 -452

tests.ipynb CHANGED Viewed

@@ -166,6 +166,41 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "import os\n",
     "from datasets import load_dataset\n",
@@ -178,34 +213,45 @@
     "if hf_token is None:\n",
     "    raise ValueError(\"HF_TOKEN environment variable is not set.\")\n",
     "\n",
     "# Load the dataset\n",
     "test_data_subset = (\n",
-    "    load_dataset(\n",
-    "        \"ICILS/multilingual_parental_occupations\", split=\"test\", token=hf_token\n",
-    "    )\n",
-    "    .shuffle(seed=42)\n",
-    "    .select(range(100))\n",
-    ")\n",
-    "test_data = load_dataset(\n",
-    "    \"ICILS/multilingual_parental_occupations\", split=\"test\", token=hf_token\n",
-    ")\n",
-    "\n",
-    "validation_data = load_dataset(\n",
-    "    \"ICILS/multilingual_parental_occupations\", split=\"validation\", token=hf_token\n",
     ")\n",
     "\n",
     "# Initialize the pipeline\n",
-    "pipe = pipeline(\"text-classification\", model=\"ICILS/XLM-R-ISCO\", token=hf_token)\n",
-    "\n",
-    "# Define the mapping from ISCO_CODE_TITLE to ISCO codes\n",
-    "def extract_isco_code(isco_code_title: str):\n",
-    "    # ISCO_CODE_TITLE is a string like \"7412 Electrical Mechanics and Fitters\" so we need to extract the first part for the evaluation.\n",
-    "    return isco_code_title.split()[0]\n",
     "\n",
     "# Initialize the hierarchical accuracy measure\n",
     "hierarchical_accuracy = evaluate.load(\"danieldux/isco_hierarchical_accuracy\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -215,29 +261,35 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Accuracy: 0.8611914401388086, Hierarchical Precision: 0.989010989010989, Hierarchical Recall: 0.9836065573770492, Hierarchical F-measure: 0.9863013698630136\n",
-      "Evaluation results saved to isco_test_results.json\n"
      ]
     }
    ],
    "source": [
     "# Evaluate the model\n",
     "predictions = []\n",
     "references = []\n",
-    "for example in test_data:\n",
     "\n",
     "    # Predict\n",
     "    prediction = pipe(\n",
     "        example[\"JOB_DUTIES\"]\n",
     "    )  # Use the key \"JOB_DUTIES\" for the text data\n",
-    "    predicted_label = extract_isco_code(prediction[0][\"label\"])\n",
     "    predictions.append(predicted_label)\n",
     "\n",
     "    # Reference\n",
@@ -248,10 +300,158 @@
     "test_results = hierarchical_accuracy.compute(predictions=predictions, references=references)\n",
     "\n",
     "# Save the results to a JSON file\n",
-    "with open(\"isco_test_results.json\", \"w\") as f:\n",
     "    json.dump(test_results, f)\n",
     "\n",
-    "print(\"Evaluation results saved to isco_test_results.json\")"
    ]
   },
   {
@@ -309,9 +509,16 @@
     "# Inter rater agreement"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 70,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -332,6 +539,13 @@
     "grouped_df = isco_rel_df.groupby('LANGUAGE')"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -371,19 +585,14 @@
     "results_df.loc[len(results_df)] = average_row\n",
     "\n",
     "\n",
-    "results_df.to_csv('language_results.csv', index=False)"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
-   "outputs": [],
    "source": [
-    "# create a dataframe with samples where ISCO and ISCO_REL the same\n",
-    "isco_rel_df_same = isco_rel_df[isco_rel_df['ISCO'] == isco_rel_df['ISCO_REL']]\n",
-    "\n",
-    "isco_rel_df_same"
    ]
   },
   {
@@ -392,467 +601,81 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# create a dataframe with samples where ISCO and ISCO_REL are different\n",
-    "isco_rel_df_diff = isco_rel_df[isco_rel_df['ISCO'] != isco_rel_df['ISCO_REL']]\n",
     "\n",
-    "isco_rel_df_diff"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 64,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Make a list of all values in ISCO and ISCO_REL columns\n",
-    "coder1 = list(isco_rel_df['ISCO'])\n",
-    "coder2 = list(isco_rel_df['ISCO_REL'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Compute the hierarchical accuracy\n",
-    "reliability_results = hierarchical_accuracy.compute(predictions=coder2, references=coder1)\n",
     "\n",
-    "# Save the results to a JSON file\n",
-    "with open(\"isco_rel_results.json\", \"w\") as f:\n",
-    "    json.dump(reliability_results, f)\n",
     "\n",
-    "print(\"Evaluation results saved to isco_rel_results.json\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Giskard model testing"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
    "source": [
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "from scipy.special import softmax\n",
-    "from datasets import load_dataset\n",
-    "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
-    "\n",
-    "from giskard import Dataset, Model, scan, testing, GiskardClient, Suite"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>IDSTUD</th>\n",
-       "      <th>JOB_DUTIES</th>\n",
-       "      <th>ISCO</th>\n",
-       "      <th>ISCO_REL</th>\n",
-       "      <th>ISCO_TITLE</th>\n",
-       "      <th>ISCO_CODE_TITLE</th>\n",
-       "      <th>COUNTRY</th>\n",
-       "      <th>LANGUAGE</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>10670109</td>\n",
-       "      <td>forældre 1:   Han arbejder som med-chef sammen...</td>\n",
-       "      <td>7412</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Electrical Mechanics and Fitters</td>\n",
-       "      <td>7412 Electrical Mechanics and Fitters</td>\n",
-       "      <td>DNK</td>\n",
-       "      <td>da</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>10130106</td>\n",
-       "      <td>asistente de parbulo y basica. ayudaba en la e...</td>\n",
-       "      <td>5312</td>\n",
-       "      <td>5312</td>\n",
-       "      <td>Teachers' Aides</td>\n",
-       "      <td>5312 Teachers' Aides</td>\n",
-       "      <td>CHL</td>\n",
-       "      <td>es</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>10740120</td>\n",
-       "      <td>trabajaba en el campo como capatas. aveces cui...</td>\n",
-       "      <td>6121</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Livestock and Dairy Producers</td>\n",
-       "      <td>6121 Livestock and Dairy Producers</td>\n",
-       "      <td>URY</td>\n",
-       "      <td>es</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>10170109</td>\n",
-       "      <td>gas abastible. vende gas abastible</td>\n",
-       "      <td>9621</td>\n",
-       "      <td>5243</td>\n",
-       "      <td>Messengers, Package Deliverers and Luggage Por...</td>\n",
-       "      <td>9621 Messengers, Package Deliverers and Luggag...</td>\n",
-       "      <td>CHL</td>\n",
-       "      <td>es</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>11480109</td>\n",
-       "      <td>jordbruk. sår potatis tar upp potatis plogar h...</td>\n",
-       "      <td>6111</td>\n",
-       "      <td>6111</td>\n",
-       "      <td>Field Crop and Vegetable Growers</td>\n",
-       "      <td>6111 Field Crop and Vegetable Growers</td>\n",
-       "      <td>FIN</td>\n",
-       "      <td>sv</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>495</th>\n",
-       "      <td>11780107</td>\n",
-       "      <td>acountent mannager|she mannages calls for jobs...</td>\n",
-       "      <td>1211</td>\n",
-       "      <td>9998</td>\n",
-       "      <td>Finance Managers</td>\n",
-       "      <td>1211 Finance Managers</td>\n",
-       "      <td>AUS</td>\n",
-       "      <td>en</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>496</th>\n",
-       "      <td>10850104</td>\n",
-       "      <td>geometra/muratore. proggetta case e le restaura</td>\n",
-       "      <td>3112</td>\n",
-       "      <td>3112</td>\n",
-       "      <td>Civil Engineering Technicians</td>\n",
-       "      <td>3112 Civil Engineering Technicians</td>\n",
-       "      <td>ITA</td>\n",
-       "      <td>it</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>497</th>\n",
-       "      <td>11460111</td>\n",
-       "      <td>fa parte della misericordia. Trasporta i malat...</td>\n",
-       "      <td>3258</td>\n",
-       "      <td>3258</td>\n",
-       "      <td>Ambulance Workers</td>\n",
-       "      <td>3258 Ambulance Workers</td>\n",
-       "      <td>ITA</td>\n",
-       "      <td>it</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>498</th>\n",
-       "      <td>10340111</td>\n",
-       "      <td>사회복지사. 회사에서 복지원 관리</td>\n",
-       "      <td>2635</td>\n",
-       "      <td>2635</td>\n",
-       "      <td>Social Work and Counselling Professionals</td>\n",
-       "      <td>2635 Social Work and Counselling Professionals</td>\n",
-       "      <td>KOR</td>\n",
-       "      <td>ko</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>499</th>\n",
-       "      <td>10370105</td>\n",
-       "      <td>자영업. 가게를 운영하신다.</td>\n",
-       "      <td>5221</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Shopkeepers</td>\n",
-       "      <td>5221 Shopkeepers</td>\n",
-       "      <td>KOR</td>\n",
-       "      <td>ko</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>500 rows × 8 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       IDSTUD                                         JOB_DUTIES  ISCO  \\\n",
-       "0    10670109  forældre 1:   Han arbejder som med-chef sammen...  7412   \n",
-       "1    10130106  asistente de parbulo y basica. ayudaba en la e...  5312   \n",
-       "2    10740120  trabajaba en el campo como capatas. aveces cui...  6121   \n",
-       "3    10170109                 gas abastible. vende gas abastible  9621   \n",
-       "4    11480109  jordbruk. sår potatis tar upp potatis plogar h...  6111   \n",
-       "..        ...                                                ...   ...   \n",
-       "495  11780107  acountent mannager|she mannages calls for jobs...  1211   \n",
-       "496  10850104    geometra/muratore. proggetta case e le restaura  3112   \n",
-       "497  11460111  fa parte della misericordia. Trasporta i malat...  3258   \n",
-       "498  10340111                                 사회복지사. 회사에서 복지원 관리  2635   \n",
-       "499  10370105                                    자영업. 가게를 운영하신다.  5221   \n",
-       "\n",
-       "    ISCO_REL                                         ISCO_TITLE  \\\n",
-       "0       None                   Electrical Mechanics and Fitters   \n",
-       "1       5312                                    Teachers' Aides   \n",
-       "2       None                      Livestock and Dairy Producers   \n",
-       "3       5243  Messengers, Package Deliverers and Luggage Por...   \n",
-       "4       6111                   Field Crop and Vegetable Growers   \n",
-       "..       ...                                                ...   \n",
-       "495     9998                                   Finance Managers   \n",
-       "496     3112                      Civil Engineering Technicians   \n",
-       "497     3258                                  Ambulance Workers   \n",
-       "498     2635          Social Work and Counselling Professionals   \n",
-       "499     None                                        Shopkeepers   \n",
-       "\n",
-       "                                       ISCO_CODE_TITLE COUNTRY LANGUAGE  \n",
-       "0                7412 Electrical Mechanics and Fitters     DNK       da  \n",
-       "1                                 5312 Teachers' Aides     CHL       es  \n",
-       "2                   6121 Livestock and Dairy Producers     URY       es  \n",
-       "3    9621 Messengers, Package Deliverers and Luggag...     CHL       es  \n",
-       "4                6111 Field Crop and Vegetable Growers     FIN       sv  \n",
-       "..                                                 ...     ...      ...  \n",
-       "495                              1211 Finance Managers     AUS       en  \n",
-       "496                 3112 Civil Engineering Technicians     ITA       it  \n",
-       "497                             3258 Ambulance Workers     ITA       it  \n",
-       "498     2635 Social Work and Counselling Professionals     KOR       ko  \n",
-       "499                                   5221 Shopkeepers     KOR       ko  \n",
-       "\n",
-       "[500 rows x 8 columns]"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
-    "MODEL_NAME = \"ICILS/XLM-R-ISCO\"\n",
-    "# DATASET_CONFIG = {\"path\": \"tweet_eval\", \"name\": \"sentiment\", \"split\": \"validation\"}\n",
-    "TEXT_COLUMN = \"JOB_DUTIES\"\n",
-    "TARGET_COLUMN = \"ISCO_CODE_TITLE\"\n",
-    "\n",
-    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
-    "model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)\n",
-    "\n",
-    "label2id: dict = model.config.label2id\n",
-    "id2label: dict = model.config.id2label\n",
-    "# LABEL_MAPPING = id2label.items()\n",
-    "\n",
-    "# raw_data = load_dataset(**DATASET_CONFIG).to_pandas().iloc[:500]\n",
-    "raw_data = load_dataset(\"ICILS/multilingual_parental_occupations\", split=\"test\").to_pandas().iloc[:500]\n",
-    "# raw_data = raw_data.replace({\"ISCO_CODE_TITLE\": LABEL_MAPPING})\n",
-    "raw_data[\"ISCO\"] = raw_data[\"ISCO\"].astype(str)\n",
-    "raw_data[\"ISCO_REL\"] = raw_data[\"ISCO_REL\"].astype(str)\n",
     "\n",
-    "raw_data"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "2024-03-15 01:07:06,923 pid:166193 MainThread giskard.datasets.base INFO     Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.\n",
-      "2024-03-15 01:07:06,925 pid:166193 MainThread giskard.models.automodel INFO     Your 'prediction_function' is successfully wrapped by Giskard's 'PredictionFunctionModel' wrapper class.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/dux/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/giskard/datasets/base/__init__.py:466: UserWarning: The column ISCO is declared as numeric but has 'object' as data type. To avoid potential future issues, make sure to cast this column to the correct data type.\n",
-      "  warning(\n"
-     ]
-    }
-   ],
    "source": [
-    "giskard_dataset = Dataset(\n",
-    "    df=raw_data,  # A pandas.DataFrame that contains the raw data (before all the pre-processing steps) and the actual ground truth variable (target).\n",
-    "    target=TARGET_COLUMN,  # Ground truth variable.\n",
-    "    name=\"ISCO-08 Parental Occupation Corpus\",  # Optional.\n",
-    ")\n",
-    "\n",
-    "def prediction_function(df: pd.DataFrame) -> np.ndarray:\n",
-    "    encoded_input = tokenizer(list(df[TEXT_COLUMN]), padding=True, return_tensors=\"pt\")\n",
-    "    output = model(**encoded_input)\n",
-    "    return softmax(output[\"logits\"].detach().numpy(), axis=1)\n",
-    "\n",
     "\n",
-    "giskard_model = Model(\n",
-    "    model=prediction_function,  # A prediction function that encapsulates all the data pre-processing steps and that\n",
-    "    model_type=\"classification\",  # Either regression, classification or text_generation.\n",
-    "    name=\"XLM-R ISCO\",  # Optional\n",
-    "    classification_labels=list(label2id.keys()),  # Their order MUST be identical to the prediction_function's\n",
-    "    feature_names=[TEXT_COLUMN],  # Default: all columns of your dataset\n",
-    ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "2024-03-15 01:07:10,228 pid:166193 MainThread giskard.datasets.base INFO     Casting dataframe columns from {'JOB_DUTIES': 'object'} to {'JOB_DUTIES': 'object'}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "2024-03-15 01:07:12,838 pid:166193 MainThread giskard.utils.logging_utils INFO     Predicted dataset with shape (10, 8) executed in 0:00:02.617399\n",
-      "2024-03-15 01:07:12,848 pid:166193 MainThread giskard.datasets.base INFO     Casting dataframe columns from {'JOB_DUTIES': 'object'} to {'JOB_DUTIES': 'object'}\n",
-      "2024-03-15 01:07:13,007 pid:166193 MainThread giskard.utils.logging_utils INFO     Predicted dataset with shape (1, 8) executed in 0:00:00.166843\n",
-      "2024-03-15 01:07:13,015 pid:166193 MainThread giskard.datasets.base INFO     Casting dataframe columns from {'JOB_DUTIES': 'object'} to {'JOB_DUTIES': 'object'}\n",
-      "2024-03-15 01:07:13,017 pid:166193 MainThread giskard.utils.logging_utils INFO     Predicted dataset with shape (10, 8) executed in 0:00:00.009517\n",
-      "2024-03-15 01:07:13,029 pid:166193 MainThread giskard.datasets.base INFO     Casting dataframe columns from {'JOB_DUTIES': 'object'} to {'JOB_DUTIES': 'object'}\n"
-     ]
-    },
-    {
-     "ename": "",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
-      "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
-      "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
-      "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
-     ]
-    }
-   ],
    "source": [
-    "results = scan(giskard_model, giskard_dataset)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'results' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m display(\u001b[43mresults\u001b[49m)\n\u001b[1;32m      3\u001b[0m \u001b[38;5;66;03m# Save it to a file\u001b[39;00m\n\u001b[1;32m      4\u001b[0m results\u001b[38;5;241m.\u001b[39mto_html(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mscan_report.html\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
-      "\u001b[0;31mNameError\u001b[0m: name 'results' is not defined"
-     ]
-    }
-   ],
-   "source": [
-    "display(results)\n",
-    "\n",
-    "# Save it to a file\n",
-    "results.to_html(\"scan_report.html\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "GiskardError",
-     "evalue": "No details or messages available.",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mGiskardError\u001b[0m                              Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[2], line 10\u001b[0m\n\u001b[1;32m      7\u001b[0m project_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mxlmr_isco\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m      9\u001b[0m \u001b[38;5;66;03m# Create a giskard client to communicate with Giskard\u001b[39;00m\n\u001b[0;32m---> 10\u001b[0m client \u001b[38;5;241m=\u001b[39m \u001b[43mGiskardClient\u001b[49m\u001b[43m(\u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/giskard/client/giskard_client.py:153\u001b[0m, in \u001b[0;36mGiskardClient.__init__\u001b[0;34m(self, url, key, hf_token)\u001b[0m\n\u001b[1;32m    150\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m hf_token:\n\u001b[1;32m    151\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_session\u001b[38;5;241m.\u001b[39mcookies[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mspaces-jwt\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m hf_token\n\u001b[0;32m--> 153\u001b[0m server_settings: ServerInfo \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_server_info\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    155\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m server_settings\u001b[38;5;241m.\u001b[39mserverVersion \u001b[38;5;241m!=\u001b[39m giskard\u001b[38;5;241m.\u001b[39m__version__:\n\u001b[1;32m    156\u001b[0m     warning(\n\u001b[1;32m    157\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYour giskard client version (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mgiskard\u001b[38;5;241m.\u001b[39m__version__\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m) does not match the hub version \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    158\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mserver_settings\u001b[38;5;241m.\u001b[39mserverVersion\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m). \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    159\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease upgrade your client to the latest version. 
\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    160\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpip install \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgiskard[hub]>=2.0.0b\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m -U\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m    161\u001b[0m     )\n",
-      "File \u001b[0;32m~/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/giskard/client/giskard_client.py:417\u001b[0m, in \u001b[0;36mGiskardClient.get_server_info\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    416\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_server_info\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m ServerInfo:\n\u001b[0;32m--> 417\u001b[0m     resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_session\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/public-api/ml-worker-connect\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m    418\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m    419\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m ServerInfo\u001b[38;5;241m.\u001b[39mparse_obj(resp\u001b[38;5;241m.\u001b[39mjson())\n",
-      "File \u001b[0;32m~/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/requests/sessions.py:602\u001b[0m, in \u001b[0;36mSession.get\u001b[0;34m(self, url, **kwargs)\u001b[0m\n\u001b[1;32m    594\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"Sends a GET request. Returns :class:`Response` object.\u001b[39;00m\n\u001b[1;32m    595\u001b[0m \n\u001b[1;32m    596\u001b[0m \u001b[38;5;124;03m:param url: URL for the new :class:`Request` object.\u001b[39;00m\n\u001b[1;32m    597\u001b[0m \u001b[38;5;124;03m:param \\*\\*kwargs: Optional arguments that ``request`` takes.\u001b[39;00m\n\u001b[1;32m    598\u001b[0m \u001b[38;5;124;03m:rtype: requests.Response\u001b[39;00m\n\u001b[1;32m    599\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    601\u001b[0m kwargs\u001b[38;5;241m.\u001b[39msetdefault(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mallow_redirects\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m--> 602\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mGET\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/requests_toolbelt/sessions.py:76\u001b[0m, in \u001b[0;36mBaseUrlSession.request\u001b[0;34m(self, method, url, *args, **kwargs)\u001b[0m\n\u001b[1;32m     74\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Send the request after generating the complete URL.\"\"\"\u001b[39;00m\n\u001b[1;32m     75\u001b[0m url \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcreate_url(url)\n\u001b[0;32m---> 76\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mBaseUrlSession\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m     77\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m     78\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/requests/sessions.py:589\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m    584\u001b[0m send_kwargs \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m    585\u001b[0m     \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimeout\u001b[39m\u001b[38;5;124m\"\u001b[39m: timeout,\n\u001b[1;32m    586\u001b[0m     \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mallow_redirects\u001b[39m\u001b[38;5;124m\"\u001b[39m: allow_redirects,\n\u001b[1;32m    587\u001b[0m }\n\u001b[1;32m    588\u001b[0m send_kwargs\u001b[38;5;241m.\u001b[39mupdate(settings)\n\u001b[0;32m--> 589\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprep\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43msend_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    591\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n",
-      "File \u001b[0;32m~/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/requests/sessions.py:703\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m    700\u001b[0m start \u001b[38;5;241m=\u001b[39m preferred_clock()\n\u001b[1;32m    702\u001b[0m \u001b[38;5;66;03m# Send the request\u001b[39;00m\n\u001b[0;32m--> 703\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43madapter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    705\u001b[0m \u001b[38;5;66;03m# Total elapsed time of the request (approximately)\u001b[39;00m\n\u001b[1;32m    706\u001b[0m elapsed \u001b[38;5;241m=\u001b[39m preferred_clock() \u001b[38;5;241m-\u001b[39m start\n",
-      "File \u001b[0;32m~/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/requests/adapters.py:538\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m    535\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    536\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[0;32m--> 538\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbuild_response\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresp\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/miniconda3/envs/autogenstudio/lib/python3.11/site-packages/giskard/client/giskard_client.py:107\u001b[0m, in \u001b[0;36mErrorHandlingAdapter.build_response\u001b[0;34m(self, req, resp)\u001b[0m\n\u001b[1;32m    105\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msuper\u001b[39m(ErrorHandlingAdapter, \u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39mbuild_response(req, resp)\n\u001b[1;32m    106\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _get_status(resp) \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m400\u001b[39m:\n\u001b[0;32m--> 107\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m explain_error(resp)\n\u001b[1;32m    109\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n",
-      "\u001b[0;31mGiskardError\u001b[0m: No details or messages available."
-     ]
-    }
-   ],
    "source": [
-    "import giskard\n",
-    "from datasets import load_dataset\n",
-    "\n",
-    "dataset = load_dataset(\"ICILS/multilingual_parental_occupations\", split=\"test\")\n",
-    "\n",
-    "# Replace this with your own data & model creation.\n",
-    "# df = giskard.demo.titanic_df()\n",
-    "df = dataset\n",
-    "demo_data_preprocessing_function, demo_sklearn_model = giskard.demo.titanic_pipeline()\n",
-    "\n",
-    "# Wrap your Pandas DataFrame\n",
-    "giskard_dataset = giskard.Dataset(df=df,\n",
-    "                                  target=\"ISCO_CODE_TITLE\",\n",
-    "                                  name=\"ISCO-08 Parental Occupation Corpus\",\n",
-    "                                  cat_columns=['LANGUAGE', 'COUNTRY'])\n",
-    "\n",
-    "# Wrap your model\n",
-    "def prediction_function(df):\n",
-    "    preprocessed_df = demo_data_preprocessing_function(df)\n",
-    "    return demo_sklearn_model.predict_proba(preprocessed_df)\n",
-    "\n",
-    "giskard_model = giskard.Model(model=prediction_function,\n",
-    "                              model_type=\"classification\",\n",
-    "                              name=\"Titanic model\",\n",
-    "                              classification_labels=demo_sklearn_model.classes_,\n",
-    "                              feature_names=['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])\n",
-    "\n",
-    "# Then apply the scan\n",
-    "results = giskard.scan(giskard_model, giskard_dataset)\n",
-    "\n",
-    "\n",
-    "# Create a Giskard client\n",
-    "client = giskard.GiskardClient(\n",
-    "    url=\"https://danieldux-giskard.hf.space\",  # URL of your Giskard instance\n",
-    "    key=\"<Generate your API Key on the Giskard Hub settings page first>\")\n",
     "\n",
     "\n",
-    "# Upload an automatically created test suite to the current project ✉️\n",
-    "results.generate_test_suite(\"Test suite created by scan\").upload(client, \"xlmr_isco\")\n"
    ]
   }
  ],

    "execution_count": null,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "from datasets import load_dataset, get_dataset_config_names, get_dataset_infos, get_dataset_split_names\n",
+    "\n",
+    "dataset = load_dataset(\"ICILS/multilingual_parental_occupations\", \"ilo\")\n",
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4634a4a344384ef28d182adeea1f5afc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading builder script:   0%|          | 0.00/13.4k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ISCO CSV file downloaded\n",
+      "Weighted ISCO hierarchy dictionary created as isco_hierarchy\n"
+     ]
+    }
+   ],
    "source": [
     "import os\n",
     "from datasets import load_dataset\n",
     "if hf_token is None:\n",
     "    raise ValueError(\"HF_TOKEN environment variable is not set.\")\n",
     "\n",
+    "test_split = load_dataset(\"ICILS/multilingual_parental_occupations\", \"icils\", split=\"test\", token=hf_token)\n",
+    "validation_split = load_dataset(\"ICILS/multilingual_parental_occupations\", \"icils\", split=\"validation\", token=hf_token)\n",
+    "\n",
     "# Take a reproducible 100-example sample of the test split\n",
     "test_data_subset = (\n",
+    "   test_split.shuffle(seed=42).select(range(100))\n",
     ")\n",
     "\n",
     "# Initialize the pipeline\n",
+    "model = \"danieldux/XLM-R-ISCO-v2\" # ICILS/XLM-R-ISCO\n",
+    "pipe = pipeline(\"text-classification\", model=model, token=hf_token)\n",
     "\n",
     "# Initialize the hierarchical accuracy measure\n",
     "hierarchical_accuracy = evaluate.load(\"danieldux/isco_hierarchical_accuracy\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['IDSTUD', 'JOB_DUTIES', 'ISCO', 'ISCO_REL', 'ISCO_TITLE', 'ISCO_CODE_TITLE', 'COUNTRY', 'LANGUAGE'],\n",
+       "    num_rows: 100\n",
+       "})"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test_data_subset"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
   },
   {
    "cell_type": "code",
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "2024-03-31--01:29\n",
+      "Evaluation results saved to test_split_results-2024-03-31--01:29.json\n"
      ]
     }
    ],
    "source": [
+    "import datetime\n",
+    "\n",
+    "stamp = datetime.datetime.now().strftime(\"%Y-%m-%d--%H:%M\")\n",
+    "print(stamp)\n",
+    "\n",
     "# Evaluate the model\n",
     "predictions = []\n",
     "references = []\n",
+    "for example in test_data_subset:\n",
     "\n",
     "    # Predict\n",
     "    prediction = pipe(\n",
     "        example[\"JOB_DUTIES\"]\n",
     "    )  # Use the key \"JOB_DUTIES\" for the text data\n",
+    "    # predicted_label = extract_isco_code(prediction[0][\"label\"])\n",
+    "    predicted_label = prediction[0][\"label\"]\n",
     "    predictions.append(predicted_label)\n",
     "\n",
     "    # Reference\n",
     "test_results = hierarchical_accuracy.compute(predictions=predictions, references=references)\n",
     "\n",
     "# Save the results to a JSON file\n",
+    "with open(f\"test_split_results-{stamp}.json\", \"w\") as f:\n",
     "    json.dump(test_results, f)\n",
     "\n",
+    "print(f\"Evaluation results saved to test_split_results-{stamp}.json\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'accuracy': 0.82,\n",
+       " 'hierarchical_precision': 0.9090909090909091,\n",
+       " 'hierarchical_recall': 0.8839779005524862,\n",
+       " 'hierarchical_fmeasure': 0.8963585434173669}"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test_results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 0.8523316062176166, Hierarchical Precision: 0.9711751662971175, Hierarchical Recall: 0.9733333333333334, Hierarchical F-measure: 0.9722530521642619\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_376175/1380879571.py:30: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
+      "  results_df = pd.concat(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 0.8549323017408124, Hierarchical Precision: 0.9425981873111783, Hierarchical Recall: 0.96, Hierarchical F-measure: 0.9512195121951218\n",
+      "Accuracy: 0.817351598173516, Hierarchical Precision: 0.9076305220883534, Hierarchical Recall: 0.9377593360995851, Hierarchical F-measure: 0.9224489795918367\n",
+      "Accuracy: 0.8160919540229885, Hierarchical Precision: 0.9140893470790378, Hierarchical Recall: 0.9204152249134948, Hierarchical F-measure: 0.9172413793103448\n",
+      "Accuracy: 0.7801724137931034, Hierarchical Precision: 0.8776978417266187, Hierarchical Recall: 0.9207547169811321, Hierarchical F-measure: 0.8987108655616942\n",
+      "Accuracy: 0.8200836820083682, Hierarchical Precision: 0.9007352941176471, Hierarchical Recall: 0.9176029962546817, Hierarchical F-measure: 0.9090909090909092\n",
+      "Accuracy: 0.5149253731343284, Hierarchical Precision: 0.7487684729064039, Hierarchical Recall: 0.8, Hierarchical F-measure: 0.7735368956743003\n",
+      "Accuracy: 0.9, Hierarchical Precision: 0.9244444444444444, Hierarchical Recall: 0.9285714285714286, Hierarchical F-measure: 0.9265033407572383\n",
+      "Accuracy: 0.9030612244897959, Hierarchical Precision: 0.9509803921568627, Hierarchical Recall: 0.9603960396039604, Hierarchical F-measure: 0.9556650246305418\n",
+      "Accuracy: 0.7836538461538461, Hierarchical Precision: 0.9047619047619048, Hierarchical Recall: 0.8916967509025271, Hierarchical F-measure: 0.8981818181818182\n",
+      "Accuracy: 0.8707865168539326, Hierarchical Precision: 0.9269406392694064, Hierarchical Recall: 0.9441860465116279, Hierarchical F-measure: 0.9354838709677419\n",
+      "Accuracy: 0.9230769230769231, Hierarchical Precision: 0.9, Hierarchical Recall: 0.9473684210526315, Hierarchical F-measure: 0.9230769230769231\n",
+      "   Language  Accuracy  Hierarchical Precision  Hierarchical Recall  \\\n",
+      "0        sv  0.923077                0.900000             0.947368   \n",
+      "1        ko  0.870787                0.926941             0.944186   \n",
+      "2        pt  0.783654                0.904762             0.891697   \n",
+      "3        kk  0.903061                0.950980             0.960396   \n",
+      "4        ru  0.900000                0.924444             0.928571   \n",
+      "5        de  0.514925                0.748768             0.800000   \n",
+      "6        fi  0.820084                0.900735             0.917603   \n",
+      "7        da  0.780172                0.877698             0.920755   \n",
+      "8        fr  0.816092                0.914089             0.920415   \n",
+      "9        it  0.817352                0.907631             0.937759   \n",
+      "10       es  0.854932                0.942598             0.960000   \n",
+      "11       en  0.852332                0.971175             0.973333   \n",
+      "\n",
+      "    Hierarchical F1  \n",
+      "0          0.923077  \n",
+      "1          0.935484  \n",
+      "2          0.898182  \n",
+      "3          0.955665  \n",
+      "4          0.926503  \n",
+      "5          0.773537  \n",
+      "6          0.909091  \n",
+      "7          0.898711  \n",
+      "8          0.917241  \n",
+      "9          0.922449  \n",
+      "10         0.951220  \n",
+      "11         0.972253  \n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Per-language evaluation of the classifier on the full test split.\n",
+    "# `test_split`, `pipe` and `hierarchical_accuracy` come from the setup cell.\n",
+    "test_data_df = test_split.to_pandas()\n",
+    "\n",
+    "# Collect one record per language and build the DataFrame once at the end;\n",
+    "# concatenating onto an empty frame inside the loop is quadratic and raises\n",
+    "# a pandas FutureWarning.\n",
+    "language_records = []\n",
+    "\n",
+    "# Iterate over unique languages\n",
+    "for language in test_data_df[\"LANGUAGE\"].unique():\n",
+    "    # Filter test data for the current language. A dedicated name is used so the\n",
+    "    # 100-example `test_data_subset` Dataset from the setup cell is not replaced\n",
+    "    # by a DataFrame.\n",
+    "    language_df = test_data_df[test_data_df[\"LANGUAGE\"] == language]\n",
+    "\n",
+    "    # Evaluate the model for the current language\n",
+    "    predictions = []\n",
+    "    references = []\n",
+    "    for example in language_df.to_dict(\"records\"):\n",
+    "        # Predict. Only the first whitespace-separated token of the label is kept,\n",
+    "        # so a bare code and a code-plus-title label both reduce to the code.\n",
+    "        prediction = pipe(example[\"JOB_DUTIES\"])\n",
+    "        predicted_label = prediction[0][\"label\"].split()[0]\n",
+    "        predictions.append(predicted_label)\n",
+    "\n",
+    "        # Reference\n",
+    "        references.append(example[\"ISCO\"])\n",
+    "\n",
+    "    # Compute the hierarchical accuracy for the current language\n",
+    "    test_results = hierarchical_accuracy.compute(\n",
+    "        predictions=predictions, references=references\n",
+    "    )\n",
+    "\n",
+    "    # Record the metrics for the current language\n",
+    "    language_records.append(\n",
+    "        {\n",
+    "            \"Language\": language,\n",
+    "            \"Accuracy\": test_results[\"accuracy\"],\n",
+    "            \"Hierarchical Precision\": test_results[\"hierarchical_precision\"],\n",
+    "            \"Hierarchical Recall\": test_results[\"hierarchical_recall\"],\n",
+    "            \"Hierarchical F1\": test_results[\"hierarchical_fmeasure\"],\n",
+    "        }\n",
+    "    )\n",
+    "\n",
+    "# Reverse so the most recently evaluated language comes first, which keeps the\n",
+    "# row order of the previous prepend-in-loop implementation.\n",
+    "results_df = pd.DataFrame(\n",
+    "    language_records[::-1],\n",
+    "    columns=[\n",
+    "        \"Language\",\n",
+    "        \"Accuracy\",\n",
+    "        \"Hierarchical Precision\",\n",
+    "        \"Hierarchical Recall\",\n",
+    "        \"Hierarchical F1\",\n",
+    "    ],\n",
+    ")\n",
+    "\n",
+    "# Print the evaluation results\n",
+    "print(results_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results_df.to_csv('model_language_results.csv', index=False)"
    ]
   },
   {
     "# Inter-rater agreement"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## All ICILS 2018 data"
+   ]
+  },
   {
    "cell_type": "code",
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
     "grouped_df = isco_rel_df.groupby('LANGUAGE')"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### By language"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
     "results_df.loc[len(results_df)] = average_row\n",
     "\n",
     "\n",
+    "results_df.to_csv('inter-rater_language_results.csv', index=False)"
    ]
   },
   {
+   "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "## Training data"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "import pandas as pd\n",
     "\n",
+    "test_data_df = test_data.to_pandas()\n",
+    "unknown_reliability_samples = test_data_df[test_data_df['ISCO_REL'].isna() | test_data_df['ISCO_REL'].isin([\"9998\", \"9999\"])]\n",
     "\n",
+    "# Exclude unknown reliability samples from test_data_df\n",
+    "test_split_rel_df = test_data_df[~test_data_df['ISCO_REL'].isna() & ~test_data_df['ISCO_REL'].isin([\"9998\", \"9999\"])]\n",
     "\n",
+    "# Group the DataFrame by LANGUAGE column\n",
+    "test_split_rel_grouped_df = test_split_rel_df.groupby('LANGUAGE')"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "## Validation data"
    ]
   },
   {
+   "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "## Test data"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
+    "# create a dataframe with samples where ISCO and ISCO_REL are the same\n",
+    "isco_rel_df_same = isco_rel_df[isco_rel_df['ISCO'] == isco_rel_df['ISCO_REL']]\n",
     "\n",
+    "isco_rel_df_same"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
+    "# create a dataframe with samples where ISCO and ISCO_REL are different\n",
+    "isco_rel_df_diff = isco_rel_df[isco_rel_df['ISCO'] != isco_rel_df['ISCO_REL']]\n",
     "\n",
+    "isco_rel_df_diff"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 64,
    "metadata": {},
+   "outputs": [],
    "source": [
+    "# Make a list of all values in ISCO and ISCO_REL columns\n",
+    "coder1 = list(isco_rel_df['ISCO'])\n",
+    "coder2 = list(isco_rel_df['ISCO_REL'])"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
+    "# Compute the hierarchical accuracy\n",
+    "reliability_results = hierarchical_accuracy.compute(predictions=coder2, references=coder1)\n",
     "\n",
+    "# Save the results to a JSON file\n",
+    "with open(\"isco_rel_results.json\", \"w\") as f:\n",
+    "    json.dump(reliability_results, f)\n",
     "\n",
+    "print(\"Evaluation results saved to isco_rel_results.json\")"
    ]
   }
  ],