nikraf
/

directionality_probe

Feature Extraction

Model card Files Files and versions

directionality_probe / protify /benchmarks /proteingym /data_loader.py

nikraf's picture

Upload folder using huggingface_hub

714cf46 verified 3 days ago

history blame contribute delete

1.97 kB

	import pandas as pd
	from typing import Optional
	from huggingface_hub import hf_hub_download


	def _load_parquet_by_dms(repo_id: str, dms_id: str) -> Optional[pd.DataFrame]:
	    """
	    Fetch one assay's parquet shard from the Hub and read it into a DataFrame.

	    The shard is requested from the dataset repo ``repo_id`` under the path
	    ``by_dms_id/{dms_id}.parquet`` (``dms_id`` is passed through ``str()``
	    first). Nothing is caught here, so any error raised while downloading or
	    reading the file propagates to the caller; no code path returns ``None``
	    even though the annotation allows it.
	    """
	    shard_name = f"by_dms_id/{str(dms_id)}.parquet"
	    cached_file = hf_hub_download(repo_id=repo_id, filename=shard_name, repo_type="dataset")
	    return pd.read_parquet(cached_file)


	def load_proteingym_dms(dms_id: str, mode: str, repo_id: str = "GleghornLab/ProteinGym_DMS") -> pd.DataFrame:
	    """
	    Load a single ProteinGym DMS assay, filtering rows and columns by ``mode``.

	    Modes:
	    - "benchmark": all non-indel rows; keeps the ``mutant`` column.
	    - "indels": indel rows only; the ``mutant`` column is dropped.
	    - "singles": non-indel rows with ``num_mutations == 1``; keeps ``mutant``.
	    - "multiples": non-indel rows with ``num_mutations > 1``; keeps ``mutant``.

	    Args:
	        dms_id: Assay identifier; selects the shard to load.
	        mode: One of the four mode names listed above.
	        repo_id: Hub dataset repository to load the shard from.

	    Returns:
	        The filtered frame with a fresh 0..n-1 index, restricted to
	        ``DMS_id``, ``mutated_seq``, ``target_seq``, ``DMS_score`` and
	        ``DMS_score_bin`` (plus ``mutant`` for every mode except "indels").

	    Raises:
	        ValueError: If ``mode`` is not one of the supported names.
	    """
	    valid_modes = ("benchmark", "indels", "singles", "multiples")
	    # Reject unknown modes up front: previously they fell through every branch
	    # and silently returned the unfiltered shard. Checking before the load also
	    # avoids a pointless download.
	    if mode not in valid_modes:
	        raise ValueError(f"Unknown mode {mode!r}; expected one of {valid_modes}")

	    df = _load_parquet_by_dms(repo_id=repo_id, dms_id=dms_id)

	    base_columns = ["DMS_id", "mutated_seq", "target_seq", "DMS_score", "DMS_score_bin"]

	    if mode == "indels":
	        # Indels only, no mutant triplet column.
	        # Explicit elementwise comparison kept on purpose: the column dtype is
	        # not visible here, and `~`/truthiness could behave differently on a
	        # non-boolean column.
	        df = df[df["is_indel"] == True]  # noqa: E712
	        columns = base_columns
	    else:
	        # Every remaining mode works on substitution (non-indel) rows.
	        df = df[df["is_indel"] == False]  # noqa: E712
	        if mode == "singles":
	            df = df[df["num_mutations"] == 1]
	        elif mode == "multiples":
	            df = df[df["num_mutations"] > 1]
	        columns = base_columns + ["mutant"]

	    return df[columns].reset_index(drop=True)