"""Convert the ALMA-R-Preference parquet file into a bilingual text corpus.

First download the dataset from haoranxu/ALMA-R-Preference
(https://huggingface.co/datasets/haoranxu/ALMA-R-Preference) and place the
parquet file next to this script.  The text file written by this script can
then be used to make your language imatrix.dat.
"""
from typing import Iterable, Mapping, Optional, TextIO

import pandas as pd

# Default input/output locations, relative to the current working directory.
PARQUET_PATH = 'haoranxu-ALMA-R-Preference.parquet'
OUTPUT_PATH = 'haoranxu-ALMA-R-Preference-en-zh--zh-en.txt'


def write_translation_pairs(
    translations: Iterable[Optional[Mapping[str, str]]],
    out: TextIO,
) -> int:
    """Write every English/Chinese pair to *out* in both directions.

    Each usable entry produces two records separated by blank lines: one
    with the English text first and one with the Chinese text first.

    Args:
        translations: Iterable of mappings read from the ``translation``
            column; each is looked up under the ``'en'`` and ``'zh'`` keys.
        out: Writable text stream that receives the records.

    Returns:
        The number of entries that were written (each yields two records).
    """
    written = 0
    for item in translations:
        # Null / non-mapping rows have no ``.get`` and used to crash the
        # whole export; skip them instead.
        if not isinstance(item, Mapping):
            continue
        en_text = item.get('en')
        zh_text = item.get('zh')
        # Only keep entries where both sides are present and non-empty.
        if not (en_text and zh_text):
            continue
        out.write(f"English: {en_text}\nChinese: {zh_text}\n\n")
        out.write(f"Chinese: {zh_text}\nEnglish: {en_text}\n\n")
        written += 1
    return written


def main(parquet_path: str = PARQUET_PATH, output_path: str = OUTPUT_PATH) -> None:
    """Read the parquet file and save the bilingual pairs as UTF-8 text.

    Args:
        parquet_path: Path of the downloaded parquet file to read.
        output_path: Path of the text file to create (overwritten if present).
    """
    df = pd.read_parquet(parquet_path)
    # Show the available columns so a schema mismatch is easy to spot.
    print(df.columns)

    with open(output_path, 'w', encoding='utf-8') as f:
        write_translation_pairs(df['translation'], f)


if __name__ == "__main__":
    main()