# File size: 8,268 Bytes
import requests
import json
import pprint
import time
import sys
import os
import numpy as np

def check_internet_connectivity():
    """Probe https://huggingface.co to see whether this machine is online.

    Returns:
        True when the GET request completes with HTTP 200; False for any
        other status code or when the request raises.
    """
    print("Testing internet connectivity...")
    try:
        status = requests.get("https://huggingface.co", timeout=5).status_code
        print(f"Connection to huggingface.co: Status {status}")
        reachable = status == 200
    except Exception as exc:
        # Best-effort probe: report the failure instead of propagating it.
        print(f"Error connecting to huggingface.co: {exc}")
        reachable = False
    return reachable

def check_model_repository():
    """Probe the allenai/longformer-base-4096 page on huggingface.co.

    Returns:
        True when the GET request completes with HTTP 200; False for any
        other status code or when the request raises.
    """
    print("Testing connection to model repository...")
    repo_url = "https://huggingface.co/allenai/longformer-base-4096"
    try:
        status = requests.get(repo_url, timeout=5).status_code
        print(f"Connection to model repository: Status {status}")
        reachable = status == 200
    except Exception as exc:
        # Best-effort probe: report the failure instead of propagating it.
        print(f"Error connecting to model repository: {exc}")
        reachable = False
    return reachable

def check_debug_endpoint(api_url):
    """Query the API's debug endpoint and print a diagnostic summary.

    The debug URL is derived from ``api_url`` by replacing ``/predict`` with
    ``/debug``.

    Args:
        api_url: URL of the prediction endpoint.

    Returns:
        The decoded JSON payload when the endpoint answers with HTTP 200;
        ``None`` for any other status or when fetching/reporting raises.
    """
    debug_url = api_url.replace("/predict", "/debug")
    print(f"Checking debug endpoint at {debug_url}...")
    try:
        response = requests.get(debug_url, timeout=10)

        # Guard clause: anything other than 200 is reported and abandoned.
        if response.status_code != 200:
            print(f"Error accessing debug endpoint: Status {response.status_code}")
            print(response.text)
            return None

        debug_info = response.json()
        print("Debug information retrieved:")
        print(f"- API Status: {debug_info.get('api_status', 'Unknown')}")
        print(f"- Model Loaded: {debug_info.get('model_loaded', 'Unknown')}")
        print(f"- Cache Directory Exists: {debug_info.get('model_cache_exists', 'Unknown')}")
        print(f"- Temp Directory Writable: {debug_info.get('tmp_directory_writable', 'Unknown')}")

        # Internet connectivity as reported by the server
        internet_check = debug_info.get('internet_connectivity', {})
        print(f"- Server Internet Connectivity: {internet_check.get('status', 'Unknown')}")
        internet_message = internet_check.get('message')
        if internet_message:
            print(f"  Message: {internet_message}")

        # Tokenizer self-test as reported by the server
        tokenizer_test = debug_info.get('tokenizer_test', {})
        print(f"- Tokenizer Test: {tokenizer_test.get('status', 'Unknown')}")
        tokenizer_message = tokenizer_test.get('message')
        if tokenizer_message:
            print(f"  Message: {tokenizer_message}")

        # Disk usage is only printed when the server marks the section 'ok'
        disk_space = debug_info.get('disk_space', {})
        if disk_space.get('status') == 'ok':
            print(f"- Disk Space: Total: {disk_space.get('total_gb', 0):.2f} GB, Used: {disk_space.get('used_gb', 0):.2f} GB, Free: {disk_space.get('free_gb', 0):.2f} GB ({disk_space.get('percent_used', 0):.1f}% used)")

        return debug_info
    except Exception as exc:
        # Covers request failures, JSON decoding and unexpected payload shapes.
        print(f"Exception when accessing debug endpoint: {exc}")
        return None

# Prediction endpoint of the API hosted on Hugging Face Spaces.
API_URL = "https://angusfung-kickstarter-success-prediction.hf.space/predict"

# Example campaign payload (same layout as an input.json file would have).
campaign_data = {
    # Free-text fields
    "raw_description": "Introducing the AquaGo: The Smart, Eco-Friendly Portable Water Purifier! Clean water is a basic human right — yet for millions around the world, it's a daily struggle. Whether you're an outdoor adventurer, traveling to remote areas, or preparing for emergencies, access to safe drinking water should never be a compromise. That's why we created **AquaGo**, a revolutionary portable water purifier that combines cutting-edge filtration technology, smart sensors, and sustainable materials — all packed into a sleek, lightweight design you can take anywhere.",
    "raw_blurb": "AquaGo is a smart, eco-friendly portable water purifier that delivers clean, safe drinking water anywhere.",
    "raw_risks": "Bringing a product to market involves complex engineering, regulatory approvals, and safety testing. Delays may occur due to certification or supply chain issues.",
    # Categorical fields
    "raw_subcategory": "Gadgets",
    "raw_category": "Technology",
    "raw_country": "Canada",
    # Numeric campaign fields
    "funding_goal": 2000,
    "image_count": 8,
    "video_count": 3,
    "campaign_duration": 90,
    # Numeric creator-history fields
    "previous_projects_count": 5,
    "previous_success_rate": 0.4,
    "previous_pledged": 18745.33,
    "previous_funding_goal": 23564.99,
}

def predict_success(data, max_retries=3, retry_delay=10):
    """Send data to the prediction API and return the result, with retries.

    A request is retried when it raises, or when the server answers HTTP 500
    with "Can't load tokenizer" in the body. Any other non-200 status is
    treated as final.

    Args:
        data: Payload sent as the JSON body of the POST request.
        max_retries: Maximum number of attempts. With a value of 0 or less
            no request is sent.
        retry_delay: Seconds to wait between two attempts. No wait happens
            after the final attempt, since nothing follows it.

    Returns:
        The decoded JSON response on HTTP 200, otherwise ``None``.
    """
    for attempt in range(1, max_retries + 1):
        try:
            # Make the POST request to the API
            print(f"Sending request to: {API_URL} (Attempt {attempt}/{max_retries})")
            response = requests.post(API_URL, json=data, timeout=60)

            if response.status_code == 200:
                return response.json()

            print(f"Error: {response.status_code}")
            print(response.text)

            tokenizer_not_ready = (
                response.status_code == 500
                and "Can't load tokenizer" in response.text
            )
            if not tokenizer_not_ready:
                # For other errors, don't retry
                return None
            wait_reason = "The model might be downloading. "
        except Exception as e:
            # Deliberately broad: any failure of this attempt (connection,
            # timeout, JSON decoding) is reported and retried.
            print(f"Exception occurred: {e}")
            wait_reason = ""

        # Only announce and perform the wait when another attempt follows;
        # sleeping after the last attempt would just delay returning None.
        if attempt < max_retries:
            print(f"{wait_reason}Waiting {retry_delay} seconds before retry...")
            time.sleep(retry_delay)

    return None

def display_results(results):
    """Print the prediction results in a user-friendly report.

    Args:
        results: Mapping with the keys ``success_probability``,
            ``predicted_outcome`` and ``shap_values`` (factor name ->
            contribution, given as a number or a numeric string), and
            optionally ``longformer_embedding``. A falsy value only prints
            a notice.

    Raises:
        KeyError: If a required key is missing from ``results``.
        ValueError: If a SHAP value cannot be converted to ``float``.
    """
    if not results:
        print("No results to display.")
        return

    print("\n===== KICKSTARTER SUCCESS PREDICTION =====\n")
    print(f"Success Probability: {results['success_probability']:.2%}")
    print(f"Predicted Outcome: {results['predicted_outcome']}")

    print("\n----- TOP INFLUENCING FACTORS -----")
    # Convert every SHAP value to float once, so that ranking, sign check and
    # the ':.4f' formatting all see a number even for string-encoded values.
    numeric_shap = {
        factor: float(value)
        for factor, value in results['shap_values'].items()
    }
    # Top 5 factors by absolute magnitude
    top_factors = sorted(
        numeric_shap.items(),
        key=lambda item: abs(item[1]),
        reverse=True,
    )[:5]

    for factor, value in top_factors:
        impact = "POSITIVE" if value > 0 else "NEGATIVE"
        print(f"{factor}: {value:.4f} ({impact})")

    print("\n----- ALL SHAP VALUES -----")
    # The raw mapping is printed as received.
    pprint.pprint(results['shap_values'], indent=2)

    # Display Longformer embedding information if available
    if 'longformer_embedding' in results:
        embedding = np.array(results['longformer_embedding'])
        print("\n----- LONGFORMER EMBEDDING -----")
        print(f"Embedding Shape: {embedding.shape}")
        print(f"First 10 values: {embedding[:10]}")

        # Basic statistics are best-effort: a failure is reported, not raised.
        try:
            print(f"Mean: {np.mean(embedding):.4f}")
            print(f"Std: {np.std(embedding):.4f}")
            print(f"Min: {np.min(embedding):.4f}")
            print(f"Max: {np.max(embedding):.4f}")
        except Exception as e:
            print(f"Error calculating embedding statistics: {e}")

# Script entry point: client-side diagnostics first, then one sample prediction.
if __name__ == "__main__":
    print("==== DIAGNOSTICS ====")
    print("Testing connectivity from client machine...")
    internet_ok = check_internet_connectivity()
    repo_ok = check_model_repository()

    # Called for its printed report; the returned payload is not used below.
    debug_info = check_debug_endpoint(API_URL)

    print("\n==== PREDICTION TEST ====")
    # Each failed client-side check yields one warning, in this fixed order.
    client_checks = (
        (internet_ok, "WARNING: Internet connectivity issues detected on client machine."),
        (repo_ok, "WARNING: Cannot access model repository from client machine."),
    )
    for check_passed, warning in client_checks:
        if not check_passed:
            print(warning)

    print("Sending prediction request...")
    display_results(predict_success(campaign_data, max_retries=2, retry_delay=10))