# UbiquitinAI / prediction.py
# Author: shubham142000
# Last commit: "Update prediction.py" (3d44518, verified)
import pandas as pd
import joblib
# The 20 standard amino acids, in the column order of the AAC feature frame.
_AMINO_ACIDS = 'ACDEFGHIKLMNPQRSTVWY'


def _compute_aac_features(sequence):
    """
    Compute the Amino Acid Composition (AAC) features for a given sequence.

    Each feature is the number of occurrences of one standard amino acid
    divided by the total sequence length. Matching is case-sensitive, and
    characters outside the 20 standard letters are not counted but still
    contribute to the length used as the denominator.

    Parameters:
    sequence (str): A string representing the amino acid sequence.

    Returns:
    pd.DataFrame: Single-row DataFrame with columns ``AAC_A`` ... ``AAC_Y``
    (one per standard amino acid) holding the residue frequencies.

    Raises:
    ValueError: If the sequence is empty (frequencies would be undefined).
    """
    seq_length = len(sequence)
    if seq_length == 0:
        # Without this guard the frequency step divides by zero and the
        # caller only sees an unexplained ZeroDivisionError.
        raise ValueError(
            "sequence must contain at least one residue to compute AAC features"
        )

    # Start every amino acid at zero so absent residues still get a column.
    aac_counts = {f"AAC_{aa}": 0 for aa in _AMINO_ACIDS}
    for residue in sequence:
        key = f"AAC_{residue}"
        if key in aac_counts:
            aac_counts[key] += 1

    # Convert raw counts to frequencies relative to the full sequence length.
    aac_features = {
        name: count / seq_length for name, count in aac_counts.items()
    }
    return pd.DataFrame([aac_features])


def predict_sequence_label(sequence):
    """
    Predict the label for a given amino acid sequence using the saved model.

    The sequence is converted to AAC features and passed to the model stored
    in ``model.joblib`` (described by the original author as a Random Forest
    model). The path is relative, so it is resolved against the current
    working directory, and the model is loaded on every call.

    Parameters:
    sequence (str): A string representing the amino acid sequence.

    Returns:
    The first element of the model's prediction for the sequence
    (documented by the original author as an int label, 0 or 1).

    Raises:
    ValueError: If the sequence is empty.
    """
    # Validate and featurize first so bad input fails before any model I/O.
    aac_features_df = _compute_aac_features(sequence)

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files that come from a trusted source.
    saved_model = joblib.load('model.joblib')

    prediction = saved_model.predict(aac_features_df)
    return prediction[0]
# Example usage:
# sequence = "YOUR_AMINO_ACID_SEQUENCE_HERE"
# print(predict_sequence_label(sequence))