# angusfung's picture
# Initial setup with Longformer embedding feature
# 7812756
import requests
import json
import pprint
import time
import sys
import os
import numpy as np
def check_internet_connectivity():
    """Probe https://huggingface.co to see whether this machine is online.

    Returns:
        bool: True only when the site answers with HTTP 200. Any other
        status code, or any exception raised while probing, yields False.
    """
    print("Testing internet connectivity...")
    try:
        reply = requests.get("https://huggingface.co", timeout=5)
        status = reply.status_code
        print(f"Connection to huggingface.co: Status {status}")
        reachable = status == 200
        return reachable
    except Exception as err:
        # Best-effort diagnostic: report the problem instead of crashing.
        print(f"Error connecting to huggingface.co: {err!s}")
        return False
def check_model_repository():
    """Probe the Longformer model page on the Hugging Face Hub.

    Returns:
        bool: True only when the repository page answers with HTTP 200.
        Any other status code, or any exception raised while probing,
        yields False.
    """
    print("Testing connection to model repository...")
    # Public page of the model repository being probed.
    repo_url = "https://huggingface.co/allenai/longformer-base-4096"
    try:
        reply = requests.get(repo_url, timeout=5)
        status = reply.status_code
        print(f"Connection to model repository: Status {status}")
        reachable = status == 200
        return reachable
    except Exception as err:
        # Best-effort diagnostic: report the problem instead of crashing.
        print(f"Error connecting to model repository: {err!s}")
        return False
def check_debug_endpoint(api_url):
    """Fetch the service's debug endpoint and print a diagnostic summary.

    The debug URL is derived from ``api_url`` by replacing ``/predict``
    with ``/debug``.

    Args:
        api_url: URL of the prediction endpoint.

    Returns:
        The decoded JSON payload when the endpoint answers with HTTP 200;
        ``None`` for any other status, or when the request or the report
        itself raises.
    """
    debug_url = api_url.replace("/predict", "/debug")
    print(f"Checking debug endpoint at {debug_url}...")
    try:
        response = requests.get(debug_url, timeout=10)
        if response.status_code != 200:
            print(f"Error accessing debug endpoint: Status {response.status_code}")
            print(response.text)
            return None

        debug_info = response.json()
        print("Debug information retrieved:")

        # Flat fields, each printed as "- <label>: <value>".
        headline_fields = (
            ("API Status", "api_status"),
            ("Model Loaded", "model_loaded"),
            ("Cache Directory Exists", "model_cache_exists"),
            ("Temp Directory Writable", "tmp_directory_writable"),
        )
        for label, key in headline_fields:
            print(f"- {label}: {debug_info.get(key, 'Unknown')}")

        # Nested checks that carry a status and an optional message.
        nested_checks = (
            ("Server Internet Connectivity", "internet_connectivity"),
            ("Tokenizer Test", "tokenizer_test"),
        )
        for label, key in nested_checks:
            section = debug_info.get(key, {})
            print(f"- {label}: {section.get('status', 'Unknown')}")
            note = section.get('message')
            if note:
                print(f" Message: {note}")

        # Disk usage is only reported when the server marked it as "ok".
        disk_space = debug_info.get('disk_space', {})
        if disk_space.get('status') == 'ok':
            total = disk_space.get('total_gb', 0)
            used = disk_space.get('used_gb', 0)
            free = disk_space.get('free_gb', 0)
            percent = disk_space.get('percent_used', 0)
            print(f"- Disk Space: Total: {total:.2f} GB, Used: {used:.2f} GB, Free: {free:.2f} GB ({percent:.1f}% used)")

        return debug_info
    except Exception as err:
        print(f"Exception when accessing debug endpoint: {err!s}")
        return None
# API endpoint on Hugging Face Spaces
API_URL = "https://angusfung-kickstarter-success-prediction.hf.space/predict"

# Sample input data (similar to what would be in input.json).
# The em dashes in raw_description were previously stored as mojibake
# (UTF-8 bytes decoded with the wrong code page); they are restored here.
campaign_data = {
    "raw_description": "Introducing the AquaGo: The Smart, Eco-Friendly Portable Water Purifier! Clean water is a basic human right — yet for millions around the world, it's a daily struggle. Whether you're an outdoor adventurer, traveling to remote areas, or preparing for emergencies, access to safe drinking water should never be a compromise. That's why we created **AquaGo**, a revolutionary portable water purifier that combines cutting-edge filtration technology, smart sensors, and sustainable materials — all packed into a sleek, lightweight design you can take anywhere.",
    "raw_blurb": "AquaGo is a smart, eco-friendly portable water purifier that delivers clean, safe drinking water anywhere.",
    "raw_risks": "Bringing a product to market involves complex engineering, regulatory approvals, and safety testing. Delays may occur due to certification or supply chain issues.",
    "raw_subcategory": "Gadgets",
    "raw_category": "Technology",
    "raw_country": "Canada",
    "funding_goal": 2000,
    "image_count": 8,
    "video_count": 3,
    "campaign_duration": 90,
    "previous_projects_count": 5,
    "previous_success_rate": 0.4,
    "previous_pledged": 18745.33,
    "previous_funding_goal": 23564.99,
}
def predict_success(data, max_retries=3, retry_delay=10):
    """POST campaign data to the prediction API, retrying on transient failures.

    The request goes to the module-level ``API_URL``. An attempt is retried
    when it raises, or when the server answers HTTP 500 with a body that
    contains "Can't load tokenizer". Any other non-200 response stops
    immediately.

    Args:
        data: JSON-serialisable payload sent as the request body.
        max_retries: Maximum number of attempts; ``0`` makes no request.
        retry_delay: Seconds to wait between attempts.

    Returns:
        The decoded JSON response on HTTP 200, otherwise ``None``.
    """
    for attempt in range(max_retries):
        # Waiting is only useful when another attempt will follow; sleeping
        # after the final attempt merely delays returning None.
        has_attempts_left = attempt < max_retries - 1
        try:
            # Make the POST request to the API
            print(f"Sending request to: {API_URL} (Attempt {attempt + 1}/{max_retries})")
            response = requests.post(API_URL, json=data, timeout=60)

            if response.status_code == 200:
                return response.json()

            print(f"Error: {response.status_code}")
            print(response.text)

            tokenizer_not_ready = (
                response.status_code == 500
                and "Can't load tokenizer" in response.text
            )
            if not tokenizer_not_ready:
                # For other errors, don't retry
                return None

            if has_attempts_left:
                print(f"The model might be downloading. Waiting {retry_delay} seconds before retry...")
                time.sleep(retry_delay)
        except Exception as e:
            # Best-effort client: report the failure and move to the next attempt.
            print(f"Exception occurred: {str(e)}")
            if has_attempts_left:
                print(f"Waiting {retry_delay} seconds before retry...")
                time.sleep(retry_delay)
    return None
def display_results(results):
    """Print a human-readable report of a prediction response.

    Args:
        results: Mapping returned by the prediction API. The keys
            ``success_probability``, ``predicted_outcome`` and
            ``shap_values`` are read unconditionally;
            ``longformer_embedding`` is optional. A falsy value (``None``
            or an empty mapping) prints a short notice instead.

    Returns:
        None. All output is written to stdout.
    """
    if not results:
        print("No results to display.")
        return

    print("\n===== KICKSTARTER SUCCESS PREDICTION =====\n")
    print(f"Success Probability: {results['success_probability']:.2%}")
    print(f"Predicted Outcome: {results['predicted_outcome']}")

    print("\n----- TOP INFLUENCING FACTORS -----")
    # Coerce every SHAP value to float once, so that values delivered as
    # numeric strings are sorted, classified and formatted consistently
    # (formatting a str with ":.4f" would raise ValueError).
    numeric_shap = {
        factor: float(value)
        for factor, value in results['shap_values'].items()
    }
    # Get the top 5 factors by absolute magnitude
    top_factors = sorted(
        numeric_shap.items(),
        key=lambda item: abs(item[1]),
        reverse=True,
    )[:5]
    for factor, value in top_factors:
        impact = "POSITIVE" if value > 0 else "NEGATIVE"
        print(f"{factor}: {value:.4f} ({impact})")

    print("\n----- ALL SHAP VALUES -----")
    # The full dump shows the values exactly as the API returned them.
    pp = pprint.PrettyPrinter(indent=2)
    pp.pprint(results['shap_values'])

    # Display Longformer embedding information if available
    if 'longformer_embedding' in results:
        embedding = np.array(results['longformer_embedding'])
        print("\n----- LONGFORMER EMBEDDING -----")
        print(f"Embedding Shape: {embedding.shape}")
        print(f"First 10 values: {embedding[:10]}")
        # Statistics are best-effort: a non-numeric or empty embedding must
        # not abort the rest of the report.
        try:
            print(f"Mean: {np.mean(embedding):.4f}")
            print(f"Std: {np.std(embedding):.4f}")
            print(f"Min: {np.min(embedding):.4f}")
            print(f"Max: {np.max(embedding):.4f}")
        except Exception as e:
            print(f"Error calculating embedding statistics: {str(e)}")
# Script entry point: run client-side diagnostics, then one sample prediction.
if __name__ == "__main__":
    print("==== DIAGNOSTICS ====")
    print("Testing connectivity from client machine...")
    internet_ok = check_internet_connectivity()
    repo_ok = check_model_repository()
    debug_info = check_debug_endpoint(API_URL)

    print("\n==== PREDICTION TEST ====")
    # Each failed client-side probe produces one warning, in probe order.
    client_probes = (
        (internet_ok, "WARNING: Internet connectivity issues detected on client machine."),
        (repo_ok, "WARNING: Cannot access model repository from client machine."),
    )
    for probe_ok, warning in client_probes:
        if not probe_ok:
            print(warning)

    # The prediction is attempted even when the probes failed.
    print("Sending prediction request...")
    results = predict_success(campaign_data, max_retries=2, retry_delay=10)
    display_results(results)