Gosse Minnema
Add sociofillmore code, load dataset via private dataset repo
b11ac48
raw
history blame
No virus
1.44 kB
import pandas as pd
import numpy as np
import random
# Fix the seed of the stdlib RNG so the random train/dev/test assignment made
# in preprocess_annotated() is the same on every run.
random.seed(1996)
# CSV with the annotated headlines; read by preprocess_annotated() (and, at
# present, by preprocess_all() as well).
CORPUS_ANNOTATED = "data/migration/corpus_with_frames_and_orientation.csv"
# NOTE(review): not referenced anywhere in the visible code; presumably the
# intended input of preprocess_all() -- confirm.
CORPUS_ALL = "data/migration/corpus_all.csv"
# Per-row probability of landing in the dev split. Rows are assigned
# independently, so the realised split sizes only approximate these ratios.
RATIO_DEV = 0.05
# Per-row probability of landing in the test split; the remaining rows
# (1 - RATIO_DEV - RATIO_TEST) go to train.
RATIO_TEST = 0.25
def preprocess_annotated():
    """Randomly split the annotated corpus into train/dev/test CSV files.

    Reads ``CORPUS_ANNOTATED`` (latin1-encoded CSV) and draws one number from
    the module-level ``random`` generator per row, in row order. A row goes to
    dev when the draw is below ``RATIO_DEV``, to test when it is below
    ``RATIO_DEV + RATIO_TEST``, and to train otherwise. The three subsets are
    then written with ``DataFrame.to_csv`` under
    ``output/migration/preprocess/``.

    The output directory is not created here, so it has to exist already.
    Progress and split sizes are reported on stdout.
    """
    print("Loading corpus...")
    annotated = pd.read_csv(CORPUS_ANNOTATED, encoding="latin1")
    n_rows = len(annotated)
    print(f"\tfound {n_rows} annotated headlines")

    # Positional row indices collected per split.
    buckets = {"train": [], "dev": [], "test": []}

    print("Making random train/dev/test split...")
    dev_cutoff = RATIO_DEV
    test_cutoff = RATIO_DEV + RATIO_TEST
    for position in range(n_rows):
        # Exactly one draw per row keeps the split reproducible under the
        # module-level seed.
        draw = random.random()
        if draw < dev_cutoff:
            split_name = "dev"
        elif draw < test_cutoff:
            split_name = "test"
        else:
            split_name = "train"
        buckets[split_name].append(position)

    print(f"\tassigned {len(buckets['train'])} samples to train")
    print(f"\tassigned {len(buckets['dev'])} samples to dev")
    print(f"\tassigned {len(buckets['test'])} samples to test")

    # Same write order as before: train, then dev, then test.
    targets = (
        ("train", "output/migration/preprocess/annotations_train.csv"),
        ("dev", "output/migration/preprocess/annotations_dev.csv"),
        ("test", "output/migration/preprocess/annotations_test.csv"),
    )
    subsets = [(annotated.iloc[buckets[name]], path) for name, path in targets]
    for subset, path in subsets:
        subset.to_csv(path)
def preprocess_all():
    """Load a corpus CSV and walk over its rows without processing them.

    Currently a stub: every row is visited via ``DataFrame.iterrows`` but
    nothing is done with it and nothing is returned or written.

    NOTE(review): this reads ``CORPUS_ANNOTATED`` even though ``CORPUS_ALL``
    is defined at module level and otherwise unused -- confirm which file is
    intended before adding real processing here.
    """
    corpus = pd.read_csv(CORPUS_ANNOTATED, encoding="latin1")
    for _index, _headline in corpus.iterrows():
        # Per-row processing has not been implemented yet.
        continue
# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    # The annotated train/dev/test split is currently disabled; re-enable the
    # next line to regenerate the annotations_{train,dev,test}.csv files.
    # preprocess_annotated()
    preprocess_all()