|  | import pandas as pd | 
					
						
						|  | import io | 
					
						
						|  | import gradio as gr | 
					
						
						|  | import requests | 
					
						
						|  | from constants import ( | 
					
						
						|  | REQUIRED_COLUMNS, | 
					
						
						|  | ASSAY_LIST, | 
					
						
						|  | CV_COLUMN, | 
					
						
						|  | SEQUENCES_FILE_DICT, | 
					
						
						|  | GDPa1_path, | 
					
						
						|  | ) | 
					
						
						|  | from evaluation import evaluate | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def validate_username(username: str) -> bool: | 
					
						
						|  | """ | 
					
						
						|  | Validate that the username corresponds to a real Hugging Face profile. | 
					
						
						|  | Just check https://huggingface.co/username exists. | 
					
						
						|  |  | 
					
						
						|  | Parameters | 
					
						
						|  | ---------- | 
					
						
						|  | username: str | 
					
						
						|  | The username to validate | 
					
						
						|  |  | 
					
						
						|  | Returns | 
					
						
						|  | ------- | 
					
						
						|  | bool | 
					
						
						|  | True if the username is valid and profile exists, False otherwise | 
					
						
						|  |  | 
					
						
						|  | Raises | 
					
						
						|  | ------ | 
					
						
						|  | gr.Error: If username is invalid or profile doesn't exist | 
					
						
						|  | """ | 
					
						
						|  | username = username.strip() | 
					
						
						|  | if username == "": | 
					
						
						|  | raise gr.Error("β Please provide a username.") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | profile_url = f"https://huggingface.co/{username}" | 
					
						
						|  |  | 
					
						
						|  | try: | 
					
						
						|  | response = requests.get(profile_url, timeout=10) | 
					
						
						|  |  | 
					
						
						|  | if response.status_code == 200: | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if "profile" in response.text.lower() or "models" in response.text.lower(): | 
					
						
						|  | return True | 
					
						
						|  | else: | 
					
						
						|  | raise gr.Error( | 
					
						
						|  | f"β '{username}' does not appear to be a valid Hugging Face user profile" | 
					
						
						|  | ) | 
					
						
						|  | elif response.status_code == 404: | 
					
						
						|  | raise gr.Error( | 
					
						
						|  | f"β Hugging Face user '{username}' does not exist. Please check the username or create an account at https://huggingface.co. This is used to track unique submissions." | 
					
						
						|  | ) | 
					
						
						|  | else: | 
					
						
						|  | raise gr.Error( | 
					
						
						|  | f"β Unable to verify username '{username}'. Please try again later." | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | except requests.exceptions.Timeout: | 
					
						
						|  | raise gr.Error("β Timeout while checking username. Please try again.") | 
					
						
						|  | except requests.exceptions.ConnectionError: | 
					
						
						|  | raise gr.Error( | 
					
						
						|  | "β Unable to connect to Hugging Face. Please check your internet connection." | 
					
						
						|  | ) | 
					
						
						|  | except requests.exceptions.RequestException as e: | 
					
						
						|  | raise gr.Error(f"β Error validating username: {str(e)}") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def validate_csv_can_be_read(file_content: str) -> pd.DataFrame: | 
					
						
						|  | """ | 
					
						
						|  | Validate that the CSV file can be read and parsed. | 
					
						
						|  |  | 
					
						
						|  | Parameters | 
					
						
						|  | ---------- | 
					
						
						|  | file_content: str | 
					
						
						|  | The content of the uploaded CSV file. | 
					
						
						|  |  | 
					
						
						|  | Returns | 
					
						
						|  | ------- | 
					
						
						|  | pd.DataFrame | 
					
						
						|  | The parsed DataFrame if successful. | 
					
						
						|  |  | 
					
						
						|  | Raises | 
					
						
						|  | ------ | 
					
						
						|  | gr.Error: If CSV cannot be read or parsed | 
					
						
						|  | """ | 
					
						
						|  | try: | 
					
						
						|  |  | 
					
						
						|  | df = pd.read_csv(io.StringIO(file_content)) | 
					
						
						|  | return df | 
					
						
						|  |  | 
					
						
						|  | except pd.errors.EmptyDataError: | 
					
						
						|  | raise gr.Error("β CSV file is empty or contains no valid data") | 
					
						
						|  | except pd.errors.ParserError as e: | 
					
						
						|  | raise gr.Error(f"β Invalid CSV format<br><br>" f"Error: {str(e)}") | 
					
						
						|  | except UnicodeDecodeError: | 
					
						
						|  | raise gr.Error( | 
					
						
						|  | "β File encoding error<br><br>" | 
					
						
						|  | "Your file appears to have an unsupported encoding.<br>" | 
					
						
						|  | "Please save your CSV file with UTF-8 encoding and try again." | 
					
						
						|  | ) | 
					
						
						|  | except Exception as e: | 
					
						
						|  | raise gr.Error(f"β Unexpected error reading CSV file: {str(e)}") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def validate_cv_submission( | 
					
						
						|  | df: pd.DataFrame, submission_type: str = "GDPa1_cross_validation" | 
					
						
						|  | ) -> None: | 
					
						
						|  | """Validate cross-validation submission""" | 
					
						
						|  |  | 
					
						
						|  | if CV_COLUMN not in df.columns: | 
					
						
						|  | raise gr.Error(f"β CV submissions must include a '{CV_COLUMN}' column") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | expected_cv_df = pd.read_csv(SEQUENCES_FILE_DICT[submission_type])[ | 
					
						
						|  | ["antibody_name", CV_COLUMN] | 
					
						
						|  | ] | 
					
						
						|  | antibody_check = expected_cv_df.merge( | 
					
						
						|  | df[["antibody_name", CV_COLUMN]], | 
					
						
						|  | on="antibody_name", | 
					
						
						|  | how="left", | 
					
						
						|  | suffixes=("_expected", "_submitted"), | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | fold_mismatches = antibody_check[ | 
					
						
						|  | antibody_check[f"{CV_COLUMN}_expected"] | 
					
						
						|  | != antibody_check[f"{CV_COLUMN}_submitted"] | 
					
						
						|  | ] | 
					
						
						|  | if len(fold_mismatches) > 0: | 
					
						
						|  | examples = [] | 
					
						
						|  | for _, row in fold_mismatches.head(3).iterrows(): | 
					
						
						|  | examples.append( | 
					
						
						|  | f"{row['antibody_name']} (expected fold {row[f'{CV_COLUMN}_expected']}, got {row[f'{CV_COLUMN}_submitted']})" | 
					
						
						|  | ) | 
					
						
						|  | raise gr.Error( | 
					
						
						|  | f"β Fold assignments don't match canonical CV folds: {'; '.join(examples)}" | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def validate_full_dataset_submission(df: pd.DataFrame) -> None: | 
					
						
						|  | """Validate full dataset submission""" | 
					
						
						|  | if CV_COLUMN in df.columns: | 
					
						
						|  | raise gr.Error( | 
					
						
						|  | f"β Your submission contains a '{CV_COLUMN}' column. " | 
					
						
						|  | "Please select 'Cross-Validation Predictions' if you want to submit CV results." | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_assay_columns(df: pd.DataFrame) -> list[str]: | 
					
						
						|  | """Get all assay columns from the DataFrame""" | 
					
						
						|  | return [col for col in df.columns if col in ASSAY_LIST] | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def validate_dataframe(df: pd.DataFrame, submission_type: str = "GDPa1") -> None: | 
					
						
						|  | """ | 
					
						
						|  | Validate the DataFrame content and structure. | 
					
						
						|  |  | 
					
						
						|  | Parameters | 
					
						
						|  | ---------- | 
					
						
						|  | df: pd.DataFrame | 
					
						
						|  | The DataFrame to validate. | 
					
						
						|  | submission_type: str | 
					
						
						|  | Type of submission: "GDPa1" or "GDPa1_cross_validation" | 
					
						
						|  |  | 
					
						
						|  | Raises | 
					
						
						|  | ------ | 
					
						
						|  | gr.Error: If validation fails | 
					
						
						|  | """ | 
					
						
						|  | if submission_type not in SEQUENCES_FILE_DICT.keys(): | 
					
						
						|  | raise ValueError(f"Invalid submission type: {submission_type}") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | missing_columns = set(REQUIRED_COLUMNS) - set(df.columns) | 
					
						
						|  | if missing_columns: | 
					
						
						|  | raise gr.Error(f"β Missing required columns: {', '.join(missing_columns)}") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | assay_columns = get_assay_columns(df) | 
					
						
						|  | if len(assay_columns) < 1: | 
					
						
						|  | raise gr.Error( | 
					
						
						|  | f"β CSV should include at least one of the following assay columns: {', '.join(ASSAY_LIST)}. Found columns: {', '.join(df.columns)}" | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | submission_columns = REQUIRED_COLUMNS + assay_columns | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if df.empty: | 
					
						
						|  | raise gr.Error("β CSV file is empty") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | for col in submission_columns: | 
					
						
						|  | missing_count = df[col].isnull().sum() | 
					
						
						|  | if missing_count > 0: | 
					
						
						|  | raise gr.Error(f"β Column '{col}' contains {missing_count} missing values") | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | n_duplicates = df["antibody_name"].duplicated().sum() | 
					
						
						|  | if n_duplicates > 0: | 
					
						
						|  | raise gr.Error( | 
					
						
						|  | f"β CSV should have only one row per antibody. Found {n_duplicates} duplicates." | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | example_df = pd.read_csv(SEQUENCES_FILE_DICT[submission_type]) | 
					
						
						|  |  | 
					
						
						|  | unrecognized_antibodies = set(df["antibody_name"]) - set( | 
					
						
						|  | example_df["antibody_name"].tolist() | 
					
						
						|  | ) | 
					
						
						|  | if unrecognized_antibodies: | 
					
						
						|  | raise gr.Error( | 
					
						
						|  | f"β Found unrecognized antibody names: {', '.join(unrecognized_antibodies)}" | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | missing_antibodies = set(example_df["antibody_name"].tolist()) - set( | 
					
						
						|  | df["antibody_name"] | 
					
						
						|  | ) | 
					
						
						|  | if missing_antibodies: | 
					
						
						|  | raise gr.Error( | 
					
						
						|  | f"β Missing predictions for {len(missing_antibodies)} antibodies: {', '.join(missing_antibodies)}" | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | if submission_type.endswith("_cross_validation"): | 
					
						
						|  | validate_cv_submission(df, submission_type) | 
					
						
						|  | else: | 
					
						
						|  | validate_full_dataset_submission(df) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | df_gdpa1 = pd.read_csv(GDPa1_path) | 
					
						
						|  | if submission_type in ["GDPa1", "GDPa1_cross_validation"]: | 
					
						
						|  | results_df = evaluate( | 
					
						
						|  | predictions_df=df, target_df=df_gdpa1, dataset_name=submission_type | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | if results_df["spearman"].max() > 0.9: | 
					
						
						|  | raise gr.Error( | 
					
						
						|  | message="β οΈ Your submission shows abnormally high correlations (>0.9) on the public set. " | 
					
						
						|  | "Please check that you're not overfitting/don't have data leakage on the public set and are using cross-validation if training a new model.\n" | 
					
						
						|  | "This will result in a better model for eventually submitting to the heldout test set.\n" | 
					
						
						|  | "If you think this is a mistake, please contact antibodycompetition@ginkgobioworks.com.", | 
					
						
						|  | duration=30, | 
					
						
						|  | title="Data Leakage Warning", | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def validate_csv_file(file_content: str, submission_type: str = "GDPa1") -> None: | 
					
						
						|  | """ | 
					
						
						|  | Validate the uploaded CSV file. | 
					
						
						|  |  | 
					
						
						|  | Parameters | 
					
						
						|  | ---------- | 
					
						
						|  | file_content: str | 
					
						
						|  | The content of the uploaded CSV file. | 
					
						
						|  | submission_type: str | 
					
						
						|  | Type of submission: "GDPa1" or "GDPa1_cross_validation" | 
					
						
						|  |  | 
					
						
						|  | Raises | 
					
						
						|  | ------ | 
					
						
						|  | gr.Error: If validation fails | 
					
						
						|  | """ | 
					
						
						|  | df = validate_csv_can_be_read(file_content) | 
					
						
						|  | validate_dataframe(df, submission_type) | 
					
						
						|  |  |