kurd-spell-app / create_asosoft_benchmark.py
ariahmed's picture
Upload folder using huggingface_hub
e489264 verified
import pandas as pd
from tqdm import tqdm
from ckb_helpers import *
# Build a spelling benchmark from (misspelled word, correct word) pairs.
#
# Inputs:
#   data/asotest.csv - columns `text` (misspelled word) and `summary`
#                      (correct word).
#   data/data.txt    - sentence corpus, read into a single `text` column.
#   train.csv        - training set; its `summary` column holds sentences
#                      that must not reappear in the benchmark.
# Output:
#   asosoft_spell.csv - columns `text` (sentence with the misspelling
#                       injected) and `summary` (process_text of the
#                       original sentence).
df = pd.read_csv('data/asotest.csv')
data_df = pd.read_csv('data/data.txt', names=['text'])
train_df = pd.read_csv('train.csv')

# The train-set exclusion does not depend on the word being looked up, so
# apply it once here instead of re-filtering inside every loop iteration.
# NOTE(review): raw corpus text is compared against train `summary` values;
# if those were normalised with process_text this check may miss overlaps -
# confirm against how train.csv was produced.
data_df = data_df[~data_df['text'].isin(train_df['summary'])].copy()

data = []
pbar = tqdm(df.itertuples(), total=len(df))
for row in pbar:
    incorrect_word = row.text
    correct_word = row.summary

    # Empty CSV cells are read as NaN (float); such rows cannot be searched
    # for or substituted, so skip them instead of crashing in str.contains.
    if not isinstance(correct_word, str) or not isinstance(incorrect_word, str):
        continue

    # Look up corpus sentences that contain correct_word. regex=False makes
    # this a literal substring search, consistent with the literal
    # str.replace below; otherwise regex metacharacters in the word would
    # mis-match or raise re.error.
    # NOTE(review): the search is case-insensitive while the replace below is
    # case-sensitive; for cased scripts a match may be left unmodified.
    sentences = data_df[
        data_df['text'].str.contains(correct_word, case=False, na=False, regex=False)
    ]
    pbar.set_description(f"Rows found after cross checking train data: {len(sentences)} for {correct_word}")
    if sentences.empty:
        continue

    # Use only the first matching sentence for this word pair.
    sentence_index = sentences.index[0]
    sentence = sentences['text'].iloc[0]
    new_sentence = sentence.replace(correct_word, incorrect_word)
    data.append({"text": new_sentence, "summary": process_text(sentence)})

    # Drop that row so the final dataset doesn't include the same sentence
    # for two different incorrect words.
    data_df.drop(index=sentence_index, inplace=True)

df = pd.DataFrame(data)
df.to_csv('asosoft_spell.csv', index=False)