GGUF
ALMA-7B-R-gguf / parquet to.txt-ALMA-make imatrix.py
DataSoul's picture
Upload parquet to.txt-ALMA-make imatrix.py
cbf7dfa verified
import pandas as pd
# Prerequisite: download the parquet file from the haoranxu/ALMA-R-Preference dataset (https://huggingface.co/datasets/haoranxu/ALMA-R-Preference)
# Parquet to txt
# Default input/output locations; both can be overridden via convert().
SOURCE_PARQUET = 'haoranxu-ALMA-R-Preference.parquet'
OUTPUT_TXT = 'haoranxu-ALMA-R-Preference-en-zh--zh-en.txt'


def write_translation_pairs(translations, out):
    """Write each English/Chinese pair to *out* in both directions.

    Args:
        translations: Iterable of dict-like records read with ``.get('en')``
            and ``.get('zh')``. ``None`` entries are skipped.
        out: Any object with a ``write(str)`` method (e.g. an open text file).

    Returns:
        The number of records that were written (each one produces two
        blocks: English->Chinese and Chinese->English).
    """
    written = 0
    for item in translations:
        # A null cell would otherwise raise AttributeError on .get().
        if item is None:
            continue
        en_text = item.get('en')
        zh_text = item.get('zh')
        # Only emit a pair when both sides are present and non-empty.
        if en_text and zh_text:
            out.write(f"English: {en_text}\nChinese: {zh_text}\n\n")
            out.write(f"Chinese: {zh_text}\nEnglish: {en_text}\n\n")
            written += 1
    return written


def convert(parquet_path=SOURCE_PARQUET, txt_path=OUTPUT_TXT):
    """Read *parquet_path* and dump its ``translation`` column to *txt_path*.

    Args:
        parquet_path: Path of the parquet file to read.
        txt_path: Path of the UTF-8 text file to create (overwritten if it
            already exists).

    Returns:
        The number of translation records written.
    """
    df = pd.read_parquet(parquet_path)
    # Show the available columns so a missing 'translation' column is easy
    # to diagnose.
    print(df.columns)
    with open(txt_path, 'w', encoding='utf-8') as f:
        return write_translation_pairs(df['translation'], f)


if __name__ == "__main__":
    convert()
# The resulting text file can then be used to build an imatrix.dat for your language pair.