File size: 711 Bytes
b11ac48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import pandas as pd

# Per-text metadata table, read from CSV when this module-level statement runs.
# The first CSV column is used as the DataFrame index, and "text_id" is forced
# to str instead of letting pandas infer a dtype (presumably so ids compare
# equal to string document ids -- verify against callers).
texts_meta = pd.read_csv("output/crashes/split_data/split_dev10.texts.meta.csv", index_col=0, dtype={"text_id": str})


def is_a_dutch_text(doc_id, exclude_frisian=True):
    """Report whether the text identified by *doc_id* counts as Dutch.

    The id is looked up in the module-level ``texts_meta`` table by comparing
    it with the ``text_id`` column. When several rows match, only the first
    one is consulted.

    Args:
        doc_id: Identifier compared for equality against
            ``texts_meta["text_id"]``.
        exclude_frisian: If true, a text whose ``provider`` is
            ``omropfryslan.nl`` is rejected regardless of its language tag.

    Returns:
        ``True`` when a matching row exists, is not rejected by the provider
        filter, and has ``language`` equal to ``"nl"``; ``False`` otherwise,
        including when no row matches *doc_id*.
    """
    id_matches = texts_meta["text_id"] == doc_id
    candidates = texts_meta[id_matches]

    # Unknown document id: nothing to inspect.
    if candidates.empty:
        return False

    # Providers publishing mainly in Frisian are filtered by name, because
    # langdetect labels their texts as Dutch and the language tag alone
    # cannot tell them apart.
    if exclude_frisian and candidates["provider"].iloc[0] == "omropfryslan.nl":
        return False

    return bool(candidates["language"].iloc[0] == "nl")