import pandas as pd

# Per-text metadata table, loaded once at import time. The first CSV column
# becomes the index and "text_id" is read as strings.
texts_meta = pd.read_csv(
    "output/crashes/split_data/split_dev10.texts.meta.csv",
    index_col=0,
    dtype={"text_id": str},
)


def is_a_dutch_text(doc_id, exclude_frisian=True):
    """Return whether the text with id *doc_id* is marked as Dutch.

    The decision is based on the first row of ``texts_meta`` whose
    ``text_id`` equals *doc_id*.

    Args:
        doc_id: Identifier compared against the ``text_id`` column (which is
            loaded as ``str``, so a string id is presumably expected).
        exclude_frisian: When true, texts from the provider
            ``omropfryslan.nl`` are rejected regardless of their language tag.

    Returns:
        ``True`` if a matching row exists, it is not filtered out by the
        provider check, and its ``language`` value is ``"nl"``; ``False``
        otherwise (including when no row matches *doc_id*).
    """
    matches = texts_meta[texts_meta["text_id"] == doc_id]

    # Unknown document: nothing to inspect.
    if len(matches) < 1:
        return False

    if exclude_frisian:
        # Exclude news providers publishing mainly in Frisian.
        # (NB these texts are recognized as Dutch by langdetect, hence the
        # need for a provider filter.)
        provider = matches["provider"].iloc[0]
        if provider == "omropfryslan.nl":
            return False

    language = matches["language"].iloc[0]
    if language == "nl":
        return True
    return False