Gosse Minnema
Add sociofillmore code, load dataset via private dataset repo
b11ac48
raw
history blame
711 Bytes
import pandas as pd
# Per-text metadata table, loaded once when this module is imported.
# The first CSV column is used as the DataFrame index, and "text_id" is forced
# to str so that lookups compare string IDs rather than parsed numbers.
# NOTE(review): the path is relative, so it presumably resolves against the
# process's working directory -- confirm how callers launch this.
texts_meta = pd.read_csv("output/crashes/split_data/split_dev10.texts.meta.csv", index_col=0, dtype={"text_id": str})
def is_a_dutch_text(doc_id, exclude_frisian=True):
    """Return True if the text with the given ID is tagged as Dutch.

    The ID is looked up in the "text_id" column of the module-level
    ``texts_meta`` table; only the first matching row is consulted.

    Args:
        doc_id: Identifier compared (with ``==``) against ``text_id``.
        exclude_frisian: When truthy, texts whose provider is
            "omropfryslan.nl" are rejected regardless of their language tag.

    Returns:
        True when a matching row exists, passes the optional provider
        filter, and has language "nl"; False otherwise (including when
        no row matches ``doc_id``).
    """
    id_matches = texts_meta["text_id"] == doc_id
    doc_rows = texts_meta[id_matches]

    # Unknown document: nothing to inspect.
    if len(doc_rows) < 1:
        return False

    # Providers publishing mainly in Frisian are filtered out by name, because
    # langdetect labels their texts as Dutch and the language tag alone
    # cannot tell them apart.
    if exclude_frisian and doc_rows["provider"].iloc[0] == "omropfryslan.nl":
        return False

    return bool(doc_rows["language"].iloc[0] == "nl")