Fill-Mask
Transformers
PyTorch
esm
Inference Endpoints
File size: 4,985 Bytes
ffaff91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import pandas as pd
import numpy as np
from fuson_plm.utils.logging import log_update

def clean_rows_and_cols(df: pd.DataFrame) -> pd.DataFrame:
    """
    Deletes empty rows and columns, logging the row/column counts before and after.

    Args:
        df (pd.DataFrame): input DataFrame to be cleaned. It is not modified in place.

    Returns:
        pd.DataFrame: new DataFrame without the rows and columns whose entries are all missing
    """
    # Delete rows with no data
    log_update(f"\trow cleaning...\n\t\toriginal # rows: {len(df)}")
    log_update("\t\tdropping rows where all entries are np.nan...")
    df = df.dropna(how='all')
    log_update(f"\t\tnew # rows: {len(df)}")

    # Delete columns with no data
    log_update(f"\tcolumn cleaning...\n\t\toriginal # columns: {len(df.columns)}")
    log_update("\t\tdropping columns where all entries are np.nan...")
    df = df.dropna(axis=1, how='all')
    log_update(f"\t\tnew # columns: {len(df.columns)}")
    # Column labels are not guaranteed to be strings (e.g. integer labels), so
    # stringify them before joining; str.join raises TypeError on non-str items.
    log_update(f"\t\tcolumn names: {','.join(map(str, df.columns))}")

    return df

def check_columns_for_listlike(df: pd.DataFrame, cols_of_interest: list, delimiters: set):
    """
    Checks if a column contains any listlike items

    Args:
        df (pd.DataFrame): DataFrame to be investigated
        cols_of_interest (list): columns in df to be investigated for list-containing potential
        delimiters (set): set of potential delimiting strings to search for. A column with any of these strings is considered listlike.

    Returns:
        dict: maps each column name to False when nothing listlike was found, otherwise to
            the set of everything found in that column: delimiters found in string items,
            plus the type of any non-string, non-missing item
            e.g., { 'col1': {',',';'},
                    'col2': {'|'},
                    'col3': False   }
    """
    # return the delimiters/listlike things found for each column
    return_dict = {}

    log_update("\tchecking if any of our columns of interest look listlike (contain list objects or delimiters)...")
    for col in cols_of_interest:
        found = set()
        # Iterate the values directly instead of going through value_counts():
        # value_counts() must hash every value (so it fails on list items and on
        # the per-item set results), and the column name it produces after
        # reset_index() differs between pandas versions.
        # Missing values are never listlike, so they are skipped up front.
        for item in df[col].dropna():
            result = check_item_for_listlike(item, delimiters)
            if isinstance(result, type):
                # Non-string item: record its type as a single element
                # (a type object cannot be unioned into a set).
                found.add(result)
            else:
                # String item: result is a (possibly empty) collection of delimiters.
                found.update(result)

        # Keep the established contract: False when nothing was found.
        return_dict[col] = found if found else False

        # display the return dict
        log_update(f"\t\tcolumn name: {col}\tlistlike: {return_dict[col]}")

    return return_dict

def check_item_for_listlike(x, delimiters: set):
    """
    Checks if an item looks like it contains a list of items, rather than an individual item, based on string delimiters.

    Args:
        x: the item to check. Any dtype.
        delimiters: a set of delimiters to check for. e.g., {',', ';', '|', '\t', ' ', ':', '-', '/', '\\', '\n'}

    Returns:
        If x is a string: whatever find_delimiters returns for it (the delimiters contained in the string; may be empty)
        If x is None or a float NaN: an empty set (a missing value is not listlike)
        Otherwise: the type of x
    """
    if isinstance(x, str):
        return find_delimiters(x, delimiters)

    # if it's None, it's not listlike, it's just empty. return an empty set because it has no delimiters.
    if x is None:
        return set()

    # if it's nan, it's not listlike, it's just empty. isinstance (rather than an exact
    # type comparison) also catches NumPy float scalars such as np.float64 / np.float32.
    if isinstance(x, (float, np.floating)) and np.isnan(x):
        return set()

    return type(x)

def find_delimiters(seq: str, delimiters: set) -> set:
    """
    Find and return the set of delimiters that occur in a sequence. Helper method for check_item_for_listlike.

    Args:
        seq (str): The sequence you wish to search for delimiters.
        delimiters (set): a set of delimiters to check for. e.g., {',', ';', '|', '\t', ' ', ':', '-', '/', '\\', '\n'}
            Delimiters may be longer than one character. Empty strings and non-string entries are ignored.

    Returns:
        set: The delimiters that appear in seq. An empty set if none appear.
    """
    # Substring search (rather than intersecting with set(seq)) lets multi-character
    # delimiters match. Skipping '' matters because '' is a substring of every string.
    # A set comprehension always yields a real set, including when nothing is found.
    return {d for d in delimiters if isinstance(d, str) and d and d in seq}

def find_invalid_chars(seq: str, valid_chars: set) -> set:
    """
    Report which characters of a sequence fall outside an allowed alphabet.

    Args:
        seq (str): The sequence to inspect.
        valid_chars (set): The characters that are allowed to appear in seq.

    Returns:
        The set of characters present in seq but absent from valid_chars.
        When every character is valid, the empty string '' is returned instead of an empty set.
    """
    # e.g. seq="AAXCC", valid_chars={A,C}: {A,X,C} - {A,C} = {X}
    offending = set(seq).difference(valid_chars)

    # Nothing left over means every character of seq is in valid_chars.
    return offending if offending else ''