Spaces:

angusfung
/

Kickstarter-prediction-embedding

Sleeping

App Files Files Community

Kickstarter-prediction-embedding / test.py

angusfung

Initial setup with Longformer embedding feature

7812756 3 months ago

raw

history blame contribute delete

8.27 kB

	import requests
	import json
	import pprint
	import time
	import sys
	import os
	import numpy as np

	def check_internet_connectivity():
	    """Report whether https://huggingface.co answers with HTTP 200.

	    Sends a single GET with a 5 second timeout and prints the outcome.

	    Returns:
	        True when the response status is 200; False for any other status
	        or when the request raises.
	    """
	    print("Testing internet connectivity...")
	    reachable = False
	    try:
	        response = requests.get("https://huggingface.co", timeout=5)
	        print(f"Connection to huggingface.co: Status {response.status_code}")
	        reachable = response.status_code == 200
	    except Exception as e:
	        # Best-effort diagnostic: a failed probe is reported, never raised.
	        print(f"Error connecting to huggingface.co: {str(e)}")
	    return reachable

	def check_model_repository():
	    """Report whether the Longformer model page on Hugging Face is reachable.

	    Sends a single GET (5 second timeout) for the
	    allenai/longformer-base-4096 repository page and prints the outcome.

	    Returns:
	        True when the response status is 200; False for any other status
	        or when the request raises.
	    """
	    print("Testing connection to model repository...")
	    url = "https://huggingface.co/allenai/longformer-base-4096"
	    repository_ok = False
	    try:
	        response = requests.get(url, timeout=5)
	        print(f"Connection to model repository: Status {response.status_code}")
	        repository_ok = response.status_code == 200
	    except Exception as e:
	        # Best-effort diagnostic: a failed probe is reported, never raised.
	        print(f"Error connecting to model repository: {str(e)}")
	    return repository_ok

	def check_debug_endpoint(api_url):
	    """Fetch the API's debug endpoint and print a summary of what it reports.

	    The debug URL is obtained by replacing "/predict" with "/debug" in
	    ``api_url``. The request uses a 10 second timeout.

	    Args:
	        api_url: URL of the prediction endpoint.

	    Returns:
	        The decoded JSON payload of the debug endpoint when it answers with
	        HTTP 200; None for any other status or when the request, the JSON
	        decoding, or the printing of the summary raises.
	    """
	    debug_url = api_url.replace("/predict", "/debug")
	    print(f"Checking debug endpoint at {debug_url}...")
	    try:
	        response = requests.get(debug_url, timeout=10)

	        # Anything but 200: show the raw reply and give up.
	        if response.status_code != 200:
	            print(f"Error accessing debug endpoint: Status {response.status_code}")
	            print(response.text)
	            return None

	        debug_info = response.json()
	        print("Debug information retrieved:")
	        print(f"- API Status: {debug_info.get('api_status', 'Unknown')}")
	        print(f"- Model Loaded: {debug_info.get('model_loaded', 'Unknown')}")
	        print(f"- Cache Directory Exists: {debug_info.get('model_cache_exists', 'Unknown')}")
	        print(f"- Temp Directory Writable: {debug_info.get('tmp_directory_writable', 'Unknown')}")

	        # Internet connectivity as reported by the server
	        internet_check = debug_info.get('internet_connectivity', {})
	        print(f"- Server Internet Connectivity: {internet_check.get('status', 'Unknown')}")
	        internet_message = internet_check.get('message')
	        if internet_message:
	            print(f" Message: {internet_message}")

	        # Tokenizer test as reported by the server
	        tokenizer_test = debug_info.get('tokenizer_test', {})
	        print(f"- Tokenizer Test: {tokenizer_test.get('status', 'Unknown')}")
	        tokenizer_message = tokenizer_test.get('message')
	        if tokenizer_message:
	            print(f" Message: {tokenizer_message}")

	        # Disk usage is only printed when the server marks the check as 'ok'
	        disk_space = debug_info.get('disk_space', {})
	        if disk_space.get('status') == 'ok':
	            print(
	                f"- Disk Space: Total: {disk_space.get('total_gb', 0):.2f} GB, "
	                f"Used: {disk_space.get('used_gb', 0):.2f} GB, "
	                f"Free: {disk_space.get('free_gb', 0):.2f} GB "
	                f"({disk_space.get('percent_used', 0):.1f}% used)"
	            )

	        return debug_info
	    except Exception as e:
	        # Best-effort diagnostic: any failure is reported, never raised.
	        print(f"Exception when accessing debug endpoint: {str(e)}")
	        return None

	# Prediction endpoint of the API hosted on Hugging Face Spaces
	API_URL = "https://angusfung-kickstarter-success-prediction.hf.space/predict"

	# Example request payload (similar to what input.json would contain)
	campaign_data = {
	    # Text fields
	    "raw_description": "Introducing the AquaGo: The Smart, Eco-Friendly Portable Water Purifier! Clean water is a basic human right — yet for millions around the world, it's a daily struggle. Whether you're an outdoor adventurer, traveling to remote areas, or preparing for emergencies, access to safe drinking water should never be a compromise. That's why we created AquaGo, a revolutionary portable water purifier that combines cutting-edge filtration technology, smart sensors, and sustainable materials — all packed into a sleek, lightweight design you can take anywhere.",
	    "raw_blurb": "AquaGo is a smart, eco-friendly portable water purifier that delivers clean, safe drinking water anywhere.",
	    "raw_risks": "Bringing a product to market involves complex engineering, regulatory approvals, and safety testing. Delays may occur due to certification or supply chain issues.",

	    # Category and country labels
	    "raw_subcategory": "Gadgets",
	    "raw_category": "Technology",
	    "raw_country": "Canada",

	    # Campaign figures
	    "funding_goal": 2000,
	    "image_count": 8,
	    "video_count": 3,
	    "campaign_duration": 90,

	    # previous_* figures
	    "previous_projects_count": 5,
	    "previous_success_rate": 0.4,
	    "previous_pledged": 18745.33,
	    "previous_funding_goal": 23564.99,
	}

	def predict_success(data, max_retries=3, retry_delay=10):
	    """POST ``data`` to the prediction API, retrying on transient failures.

	    The request goes to the module-level ``API_URL`` with a 60 second
	    timeout. An attempt is retried when the request (or decoding of its
	    JSON body) raises, or when the server answers 500 with
	    "Can't load tokenizer" in the body. Any other non-200 answer ends the
	    call immediately.

	    Args:
	        data: JSON-serialisable payload sent as the request body.
	        max_retries: Maximum number of attempts to make.
	        retry_delay: Seconds to wait between two attempts.

	    Returns:
	        The decoded JSON response on HTTP 200, otherwise None.
	    """
	    for attempt in range(max_retries):
	        try:
	            # Make the POST request to the API
	            print(f"Sending request to: {API_URL} (Attempt {attempt + 1}/{max_retries})")
	            response = requests.post(API_URL, json=data, timeout=60)

	            if response.status_code == 200:
	                return response.json()

	            print(f"Error: {response.status_code}")
	            print(response.text)

	            tokenizer_not_ready = (
	                response.status_code == 500
	                and "Can't load tokenizer" in response.text
	            )
	            if not tokenizer_not_ready:
	                # For other errors, don't retry
	                return None

	            wait_notice = (
	                "The model might be downloading. "
	                f"Waiting {retry_delay} seconds before retry..."
	            )
	        except Exception as e:
	            print(f"Exception occurred: {str(e)}")
	            wait_notice = f"Waiting {retry_delay} seconds before retry..."

	        # Only announce and pay for the delay when another attempt follows;
	        # sleeping after the final attempt just postpones returning None.
	        if attempt + 1 < max_retries:
	            print(wait_notice)
	            time.sleep(retry_delay)

	    return None

	def display_results(results):
	    """Print a prediction result in a human-readable layout.

	    Args:
	        results: Mapping with the keys 'success_probability',
	            'predicted_outcome' and 'shap_values' (a mapping of factor
	            name to a value convertible with ``float``), and optionally
	            'longformer_embedding'. A falsy value (None, empty dict)
	            prints a placeholder message instead.

	    Returns:
	        None. All output goes to stdout.
	    """
	    if not results:
	        print("No results to display.")
	        return

	    print("\n===== KICKSTARTER SUCCESS PREDICTION =====\n")
	    print(f"Success Probability: {results['success_probability']:.2%}")
	    print(f"Predicted Outcome: {results['predicted_outcome']}")

	    print("\n----- TOP INFLUENCING FACTORS -----")
	    # Get the top 5 factors by absolute magnitude
	    top_factors = sorted(
	        results['shap_values'].items(),
	        key=lambda item: abs(float(item[1])),
	        reverse=True,
	    )[:5]

	    for factor, value in top_factors:
	        # Convert once and format the converted number: the sort key above
	        # already accepts numeric strings, so formatting must accept them too.
	        magnitude = float(value)
	        impact = "POSITIVE" if magnitude > 0 else "NEGATIVE"
	        print(f"{factor}: {magnitude:.4f} ({impact})")

	    print("\n----- ALL SHAP VALUES -----")
	    pp = pprint.PrettyPrinter(indent=2)
	    pp.pprint(results['shap_values'])

	    # Display Longformer embedding information if available
	    if 'longformer_embedding' in results:
	        embedding = np.array(results['longformer_embedding'])
	        print("\n----- LONGFORMER EMBEDDING -----")
	        print(f"Embedding Shape: {embedding.shape}")
	        print(f"First 10 values: {embedding[:10]}")

	        # Basic statistics; a failure here (e.g. non-numeric data) is
	        # reported rather than aborting the rest of the output.
	        try:
	            print(f"Mean: {np.mean(embedding):.4f}")
	            print(f"Std: {np.std(embedding):.4f}")
	            print(f"Min: {np.min(embedding):.4f}")
	            print(f"Max: {np.max(embedding):.4f}")
	        except Exception as e:
	            print(f"Error calculating embedding statistics: {str(e)}")

	# Script entry point: run client-side diagnostics, then one prediction request
	if __name__ == "__main__":
	    print("==== DIAGNOSTICS ====")
	    print("Testing connectivity from client machine...")
	    internet_ok = check_internet_connectivity()
	    repo_ok = check_model_repository()

	    debug_info = check_debug_endpoint(API_URL)

	    print("\n==== PREDICTION TEST ====")
	    # Failed client-side checks only produce warnings; the prediction
	    # request is sent regardless.
	    client_checks = (
	        (internet_ok, "WARNING: Internet connectivity issues detected on client machine."),
	        (repo_ok, "WARNING: Cannot access model repository from client machine."),
	    )
	    for check_passed, warning in client_checks:
	        if not check_passed:
	            print(warning)

	    print("Sending prediction request...")
	    results = predict_success(campaign_data, max_retries=2, retry_delay=10)
	    display_results(results)