In [1]:
# Enable the autoreload extension so edits to imported project modules are
# picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

# Resolve the project directory. When google.colab is importable, mount Google
# Drive and use the project folder there; otherwise use the parent of the
# current working directory.
# NOTE(review): `workding_dir` looks like a typo of `working_dir`; the name is
# read by a later cell, so it has to be renamed everywhere at once.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    workding_dir = "/content/drive/MyDrive/logical-reasoning/"
except ModuleNotFoundError:
    workding_dir = str(Path.cwd().parent)

In [3]:
import os
import sys

# Work from the project directory so the relative "datasets/..." paths used by
# the following cells resolve.
os.chdir(workding_dir)
# Make project-local packages importable. The membership guard keeps sys.path
# free of duplicate entries when this cell is executed more than once.
if workding_dir not in sys.path:
    sys.path.append(workding_dir)
print("workding dir:", workding_dir)

workding dir: /Users/inflaton/Library/CloudStorage/GoogleDrive-dh.huang.2023@smu.edu.sg/My Drive/logical-reasoning


In [4]:
from dotenv import find_dotenv, load_dotenv

# Look for a ".env" file first; when none is found (empty result), fall back to
# ".env.example".
found_dotenv = find_dotenv(".env") or find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
# override=True: values from the file replace variables already set in the
# process environment.
load_dotenv(found_dotenv, override=True)

loading env vars from: /Users/inflaton/Library/CloudStorage/GoogleDrive-dh.huang.2023@smu.edu.sg/My Drive/logical-reasoning/.env


True

In [5]:
import pandas as pd


# Load dev.csv and show how many distinct values the title, puzzle and truth
# columns hold (missing values are not counted).
df_dev = pd.read_csv("datasets/mgtv/dev.csv")
tuple(df_dev[col].nunique() for col in ("title", "puzzle", "truth"))

(5, 5, 5)

In [7]:
from llm_toolkit.translation_utils import translate
import pandas as pd


def translate_df(df, cache_path=None):
    """Translate the text columns of ``df`` in place and persist new translations.

    Args:
        df: DataFrame containing the columns ``text``, ``title``, ``label``,
            ``puzzle`` and ``truth``. Each of these columns is overwritten with
            the values returned by ``translate``.
        cache_path: Optional path to a CSV file with ``chinese`` and ``english``
            columns. If the file exists it seeds the translation cache. Whenever
            a path is given, the cache (existing rows followed by any entries
            added during this call) is written back to it.

    Returns:
        The same ``df`` object that was passed in, with the five columns
        replaced.

    Note:
        ``translate(x, cache_dict)`` is assumed to store every new translation
        in ``cache_dict`` keyed by the source text -- TODO confirm against
        ``llm_toolkit.translation_utils``.
    """
    if cache_path and os.path.exists(cache_path):
        cache_df = pd.read_csv(cache_path)
    else:
        cache_df = pd.DataFrame(columns=["chinese", "english"])

    cache_dict = dict(zip(cache_df["chinese"], cache_df["english"]))
    # Remember which keys came from the file so that, after translating, the
    # entries added during this call can be told apart from the existing ones.
    known_keys = set(cache_dict)

    for column in ("text", "title", "label", "puzzle", "truth"):
        df[column] = df[column].apply(lambda x: translate(x, cache_dict))

    if cache_path:
        new_rows = [
            {"chinese": chinese, "english": english}
            for chinese, english in cache_dict.items()
            if chinese not in known_keys
        ]
        if new_rows:
            new_df = pd.DataFrame(new_rows, columns=["chinese", "english"])
            # Skip the concat when there is nothing to append to; concatenating
            # with an empty frame is unnecessary.
            if cache_df.empty:
                cache_df = new_df
            else:
                cache_df = pd.concat([cache_df, new_df], ignore_index=True)

        cache_df.to_csv(cache_path, index=False)

    return df

loading /Users/inflaton/Library/CloudStorage/GoogleDrive-dh.huang.2023@smu.edu.sg/My Drive/logical-reasoning/llm_toolkit/translation_utils.py


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/inflaton/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/inflaton/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/inflaton/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
# Translate the dev data; translate_df overwrites the columns of df_dev in
# place, so re-running this cell feeds already-translated text back in.
df_dev = translate_df(df_dev, "datasets/mgtv/unique_translations.csv")

In [9]:
# Save the translated dev data alongside the source CSV (row index omitted).
df_dev.to_csv("datasets/mgtv/dev_en.csv", index=False)

In [10]:
import pandas as pd

# Load train.csv; `df` is translated in place by a later cell.
df = pd.read_csv("datasets/mgtv/train.csv")

In [11]:
# Translate the train data, using the same cache file as the dev data.
df = translate_df(df, "datasets/mgtv/unique_translations.csv")

In [12]:
# Save the translated train data alongside the source CSV (row index omitted).
df.to_csv("datasets/mgtv/train_en.csv", index=False)

In [13]:
# Frequency of each label value in the translated train data.
df["label"].value_counts()

label
No                       11783
Yes                       6591
Unimportant               5076
Incorrect questioning      921
Correct answer             629
Name: count, dtype: int64

In [14]:
# Preview the first rows of the translated train data.
df.head()

Unnamed: 0,text,label,answer,title,puzzle,truth
0,Did the thief believe in the gods?,No,,The Mystery of the Vanishing Pumpkins,"In the village of Zhen, there is a legend that...",The truth turned out to be related to an old f...
1,Did they steal the pumpkins to ensure a bounti...,No,,The Mystery of the Vanishing Pumpkins,"In the village of Zhen, there is a legend that...",The truth turned out to be related to an old f...
2,The villagers like pumpkins too.,Unimportant,,The Mystery of the Vanishing Pumpkins,"In the village of Zhen, there is a legend that...",The truth turned out to be related to an old f...
3,People in the village need to use pumpkins as ...,No,,The Mystery of the Vanishing Pumpkins,"In the village of Zhen, there is a legend that...",The truth turned out to be related to an old f...
4,Were they stolen from the village?,Yes,,The Mystery of the Vanishing Pumpkins,"In the village of Zhen, there is a legend that...",The truth turned out to be related to an old f...


In [15]:
# Re-read the untranslated train.csv (df was overwritten by translate_df) and
# the translation cache, to check the cache against the source text.
df_cn = pd.read_csv("datasets/mgtv/train.csv")
df_cache = pd.read_csv("datasets/mgtv/unique_translations.csv")

In [16]:
# List the columns of the source data.
df_cn.columns

Index(['text', 'label', 'answer', 'title', 'puzzle', 'truth'], dtype='object')

In [17]:
# Backfill df_cache with every source string from the selected columns that is
# missing from it, pairing each with the value found at the same row of the
# translated frame `df`.
# NOTE(review): assumes `df` and `df_cn` share the same row index (both are
# read from train.csv) -- confirm if either frame is filtered or re-indexed.
known_chinese = set(df_cache["chinese"])  # set lookup instead of a full scan per value
new_records = []
for col in ["text", "title", "puzzle", "truth"]:
    # drop_duplicates keeps the first occurrence of each value together with its
    # row label; missing values are skipped because they cannot be matched.
    first_seen = df_cn[col].dropna().drop_duplicates()
    for row_label, chinese in first_seen.items():
        if chinese in known_chinese:
            continue
        new_records.append({"chinese": chinese, "english": df.loc[row_label, col]})
        # A string shared by several columns is added only once.
        known_chinese.add(chinese)

# Append all missing rows with a single concat instead of one concat per row.
if new_records:
    df_cache = pd.concat(
        [df_cache, pd.DataFrame(new_records)],
        ignore_index=True,
    )

count = len(new_records)

count

0

In [18]:
import re

# Function to check if an English translation contains Chinese characters
def contains_chinese(text):
    """Report whether ``text`` holds any character in the range U+4E00..U+9FFF.

    The argument is converted with ``str()`` before matching, so non-string
    values such as ``None`` or numbers are accepted.
    """
    as_string = str(text)
    hit = re.search(r"[\u4e00-\u9fff]", as_string)
    return hit is not None


# Keep only the cache rows whose English text still has Chinese characters in it
untranslated_mask = df_cache["english"].apply(contains_chinese)
partial_translations = df_cache.loc[untranslated_mask]

partial_translations.head()

Unnamed: 0,chinese,english


In [19]:
# (rows, columns) of the cache entries whose English text still contains Chinese characters.
partial_translations.shape

(0, 2)