"""Language statistics helpers for a table of models with a ``languages`` column."""

from ast import literal_eval

# Maps the modality label passed by callers to the value stored in the
# ``modality`` column. Labels that are not listed disable modality filtering.
_MODALITY_VALUES = {
    "NLP": "nlp",
    "Audio": "audio",
    "Multimodal": "multimodal",
}

# Sentinel written in place of missing language data before parsing.
_NO_LANGUAGES = "none"


def _is_language_placeholder(value):
    """Return True for raw values that stand for "no languages" ("False" or {})."""
    if isinstance(value, str):
        return value == "False"
    return isinstance(value, dict) and not value


def _is_empty_dict(value):
    """Return True when *value* is an empty dict."""
    return isinstance(value, dict) and not value


def _parse_languages(value):
    """Turn one raw ``languages`` cell into a collection of language tags.

    The ``"none"`` sentinel yields an empty list. Anything else is parsed with
    ``ast.literal_eval``; a literal that evaluates to a single string is
    wrapped in a list so that it counts as one language instead of being
    measured character by character.

    Raises:
        ValueError, SyntaxError: propagated from ``literal_eval`` when the
            value is not a valid Python literal.
    """
    if isinstance(value, str) and value == _NO_LANGUAGES:
        return []
    parsed = literal_eval(value)
    if isinstance(parsed, str):
        return [parsed]
    return parsed


def make_lang_list(row):
    """Parse the ``languages`` entry of *row* into a list of language tags.

    Args:
        row: Any mapping-like object supporting ``row["languages"]``.

    Returns:
        ``[]`` for the ``"none"`` sentinel, otherwise the evaluated literal
        (a bare string literal is returned as a one-element list).
    """
    return _parse_languages(row["languages"])


def language_count(row):
    """Return the number of entries in ``row["languages"]``."""
    return len(row["languages"])


def process_for_lang(data, modality: str):
    """Parse the ``languages`` column and derive per-row language statistics.

    The input frame is never modified; all work happens on a copy.

    Args:
        data: DataFrame with at least the ``modality`` and ``languages``
            columns. ``languages`` holds string literals such as
            ``"['en', 'fr']"``; missing values, ``"False"`` and ``{}`` all
            mean "no languages".
        modality: ``"NLP"``, ``"Audio"`` or ``"Multimodal"`` to keep only the
            matching rows; any other value keeps every row.

    Returns:
        A 4-tuple ``(data, no_lang_count, total_langs, unique_langs)``:

        * ``data`` - the filtered copy, with ``languages`` converted to lists
          and the new integer columns ``language_count`` and ``multilingual``
          (1 when the ``"multilingual"`` tag is present, else 0);
        * ``no_lang_count`` - number of rows without language information;
        * ``total_langs`` - number of distinct language tags;
        * ``unique_langs`` - array of those distinct tags, in order of first
          appearance.
    """
    # Filter by modality, then copy so the caller's frame is left untouched
    # and later column assignments never target a slice of it.
    wanted_modality = _MODALITY_VALUES.get(modality)
    if wanted_modality is not None:
        data = data[data["modality"] == wanted_modality]
    data = data.copy()

    # Rows without languages: real missing values plus the placeholders.
    raw_languages = data["languages"]
    placeholder = raw_languages.map(_is_language_placeholder).astype(bool)
    missing = raw_languages.isna() | placeholder
    no_lang_count = missing.sum()

    # Column-wise ``Series.map`` is used instead of row-wise
    # ``DataFrame.apply`` so that an empty frame (e.g. a modality with no
    # rows) is processed like any other.
    data["languages"] = raw_languages.mask(missing, _NO_LANGUAGES).map(
        _parse_languages
    )
    data["language_count"] = data["languages"].map(len).astype("int64")

    # Only rows with at least one language contribute to the tag list.
    models_with_langs = data[data["language_count"] > 0]
    langs = models_with_langs["languages"].explode()
    langs = langs[~langs.map(_is_empty_dict).astype(bool)]
    unique_langs = langs.unique()
    total_langs = len(unique_langs)

    data["multilingual"] = (
        data["languages"].map(lambda tags: int("multilingual" in tags)).astype("int64")
    )

    return data, no_lang_count, total_langs, unique_langs


def filter_multilinguality(data, linguality: str):
    """Select rows according to how many languages they cover.

    Args:
        data: DataFrame carrying the ``multilingual`` and ``language_count``
            columns.
        linguality: ``"Just Multilingual"`` keeps rows tagged multilingual or
            listing more than one language; ``"Three or more languages"``
            keeps rows listing at least three; any other value returns
            ``data`` unchanged.

    Returns:
        The filtered DataFrame, or ``data`` itself when no filter applies.
    """
    if linguality == "Just Multilingual":
        has_multilingual_tag = data["multilingual"] == 1
        has_several_languages = data["language_count"] > 1
        return data[has_multilingual_tag | has_several_languages]
    if linguality == "Three or more languages":
        return data[data["language_count"] >= 3]
    return data