File size: 1,876 Bytes
78f7e42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from ast import literal_eval

def make_lang_list(row):
    """Return the languages of *row* as a list.

    Args:
        row: Mapping-like object (dict, DataFrame row, ...) with a
            ``"languages"`` entry. The entry is normally the string form of
            a Python literal (e.g. ``"['en', 'fr']"``), the placeholder
            ``"none"``, ``None``, or an already-parsed list/tuple/set.

    Returns:
        A list of languages; empty when the row has none.

    Raises:
        ValueError, SyntaxError: If the string is not a valid Python literal.
        TypeError: If the literal is a non-empty, non-iterable scalar.
    """
    languages = row["languages"]

    # Already parsed (e.g. the frame went through this step before):
    # literal_eval would reject a non-string, so hand back a list directly.
    if isinstance(languages, (list, tuple, set)):
        return list(languages)

    if languages is None or languages == "none":
        return []

    parsed = literal_eval(languages)

    # Empty containers and falsy scalars ("[]", "{}", "False", ...) all mean
    # "no languages".
    if not parsed:
        return []

    # A single quoted tag such as "'en'" parses to a str; wrap it so callers
    # count one language instead of the number of characters.
    if isinstance(parsed, str):
        return [parsed]

    return list(parsed)

def language_count(row):
    """Return the length of the ``"languages"`` entry of *row*."""
    langs = row["languages"]
    return len(langs)

def process_for_lang(data, modality):
    """Compute per-model language information for one modality.

    Args:
        data: DataFrame with at least ``modality`` and ``languages`` columns.
            ``languages`` is parsed with ``make_lang_list``, so it presumably
            holds stringified Python literals -- confirm against the loader.
        modality: ``"NLP"``, ``"Audio"`` or ``"Multimodal"`` to keep only the
            rows of that modality; any other value keeps every row.

    Returns:
        A tuple ``(data, no_lang_count, total_langs, unique_langs)``:
        ``data`` is a new DataFrame (the input is left untouched) whose
        ``languages`` column holds lists and which gains ``language_count``
        and ``multilingual`` (0/1) columns; ``no_lang_count`` is the number
        of rows without language information; ``unique_langs`` is the array
        of distinct languages and ``total_langs`` its length.
    """
    # Filter by modality
    modality_values = {"NLP": "nlp", "Audio": "audio", "Multimodal": "multimodal"}
    if modality in modality_values:
        data = data[data["modality"] == modality_values[modality]]

    # Work on a private copy: without it the caller's frame is rewritten in
    # place when no filter applies (a second call would then receive lists
    # instead of strings), and the assignments below would otherwise target
    # a filtered slice.
    data = data.copy()

    # Remove rows without languages
    data.loc[data.languages == "False", 'languages'] = None
    data.loc[data.languages == {}, 'languages'] = None

    # Count of rows that have no languages
    no_lang_count = data["languages"].isna().sum()

    # As the languages column might have multiple languages,
    # we need to convert it to a list. We then count the number of languages.
    # Column-wise ``map`` is used rather than ``DataFrame.apply(axis=1)``
    # because the latter returns a DataFrame (not a Series) when the frame
    # has no rows, which cannot be assigned to a single column.
    data["languages"] = data["languages"].fillna('none')
    data["languages"] = data["languages"].map(
        lambda raw: make_lang_list({"languages": raw})
    )
    data["language_count"] = data["languages"].map(len)

    # Just keep the models with at least one language
    models_with_langs = data[data["language_count"] > 0]
    langs = models_with_langs["languages"].explode()
    langs = langs[langs != {}]
    unique_langs = langs.unique()
    total_langs = len(unique_langs)

    # Flag models that carry the explicit "multilingual" tag.
    data['multilingual'] = data["languages"].map(
        lambda model_langs: int("multilingual" in model_langs)
    )

    return data, no_lang_count, total_langs, unique_langs

def filter_multilinguality(data, linguality):
    """Filter *data* according to the selected multilinguality option.

    ``"Just Multilingual"`` keeps rows whose ``multilingual`` column is 1 or
    whose ``language_count`` is greater than 1; ``"Three or more languages"``
    keeps rows with ``language_count`` of at least 3; any other value returns
    *data* unchanged.
    """
    if linguality == "Three or more languages":
        return data[data["language_count"] >= 3]

    if linguality != "Just Multilingual":
        return data

    # Either the explicit tag or more than one listed language qualifies.
    tagged = data["multilingual"] == 1
    several = data["language_count"] > 1
    return data[tagged | several]