"""
Campaign Data Processor for Kickstarter Prediction

This module handles the preprocessing of raw Kickstarter campaign data,
generating text embeddings and preparing numerical features for prediction.

Key functionality:
- Longformer embeddings for project descriptions
- Sentence transformer embeddings for blurbs and risk statements
- GloVe embeddings for categories and countries
- Normalization of numerical features

Author: Angus Fung
Date: April 2025
"""

import os
# Set gensim data directory to a writable location at the very start
os.environ['GENSIM_DATA_DIR'] = '/tmp/gensim-data'
try:
    os.makedirs('/tmp/gensim-data', exist_ok=True)
    print(f"Created directory at {os.environ['GENSIM_DATA_DIR']}")
except Exception as e:
    print(f"Error creating gensim directory: {str(e)}")

import numpy as np
from typing import Dict, List, Tuple
import torch
from transformers import AutoTokenizer, AutoModel
import gc
import gensim.downloader

class CampaignProcessor:
    """
    Processor for Kickstarter campaign data.
    
    This class handles the preprocessing of raw campaign data, transforming
    text and categorical features into embeddings using various NLP models
    and preparing numerical features for the prediction model.
    """
    
    def __init__(self, data: List[Dict], lazy_load: bool = False):
        """
        Initialize the CampaignProcessor.
        
        Args:
            data (List[Dict]): List of campaign dictionaries to process
            lazy_load (bool): If True, models will be loaded only when needed
                             rather than at initialization time
        """
        self.data = data
        self.categories = sorted(set(camp.get('raw_category', '') for camp in self.data))
        self.lazy_load = lazy_load
        
        # Country name to ISO alpha-2 code mapping
        # First letter of each word is capitalized in the original data
        self.country_to_alpha2 = {
            # Main ISO country names
            "United States": "US",
            "United Kingdom": "GB",
            "Canada": "CA",
            "Australia": "AU",
            "New Zealand": "NZ",
            "Germany": "DE",
            "France": "FR",
            "Italy": "IT",
            "Spain": "ES",
            "Netherlands": "NL",
            "Sweden": "SE",
            "Denmark": "DK",
            "Norway": "NO",
            "Ireland": "IE",
            "Switzerland": "CH",
            "Austria": "AT",
            "Belgium": "BE",
            "Luxembourg": "LU",
            "Hong Kong": "HK",
            "Singapore": "SG",
            "Mexico": "MX",
            "Japan": "JP",
            "China": "CN",
            "Brazil": "BR",
            "India": "IN",
            "South Korea": "KR",
            "South Africa": "ZA",
            "Argentina": "AR",
            "Poland": "PL",
            "Portugal": "PT",
            "Russia": "RU",
            "Greece": "GR",
            "Czech Republic": "CZ",
            "Finland": "FI",
            "Hungary": "HU",
            "Romania": "RO",
            "Thailand": "TH",
            "Turkey": "TR",
            "Ukraine": "UA",
            "Colombia": "CO",
            "Chile": "CL",
            "Peru": "PE",
            "Malaysia": "MY",
            "Vietnam": "VN",
            "Indonesia": "ID",
            "Philippines": "PH",
            "United Arab Emirates": "AE",
            "Saudi Arabia": "SA",
            "Israel": "IL",
            "Egypt": "EG",
            "Nigeria": "NG",
            "Kenya": "KE",
            
            # Common variants and abbreviations
            "USA": "US",
            "U.S.A.": "US",
            "U.S.": "US",
            "UK": "GB",
            "U.K.": "GB",
            "Great Britain": "GB",
            "England": "GB",
            "Republic Of Korea": "KR",
            "Korea": "KR",
            "Republic Of China": "CN",
            "Republic Of India": "IN",
            "UAE": "AE",
            "Russia": "RU",
            "Russian Federation": "RU",
            "The Netherlands": "NL",
            "Holland": "NL",
            "Republic Of Ireland": "IE",
            "Czech": "CZ",
            "Czechia": "CZ",
        }
        
        # Create a lowercase version of the dictionary for case-insensitive lookups
        self.country_to_alpha2_lower = {k.lower(): v for k, v in self.country_to_alpha2.items()}
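        # e.g. both "United Kingdom" and "united kingdom" now resolve to "GB"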
        
        # Initialize model variables (to be loaded later)
        self.tokenizer = None  # Longformer tokenizer for descriptions
        self.model = None  # Longformer model for descriptions
        self.RiskandBlurb_tokenizer = None  # MiniLM tokenizer for blurb and risk
        self.RiskandBlurb_model = None  # MiniLM model for blurb and risk
        self.glove = None  # GloVe word vectors for categories and countries
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # Load models at initialization if not using lazy loading
        if not lazy_load:
            self._load_models()
    
    def _load_models(self):
        """
        Load all NLP models required for processing campaign data.
        
        This method loads:
        - Longformer for description embeddings
        - MiniLM for blurb and risk embeddings
        - GloVe for category and country embeddings
        
        Models are cached to avoid reloading and moved to the appropriate device.
        """
        print("Loading NLP models...")
        # Cache models locally to avoid downloading every time
        cache_dir = "/tmp/model_cache"
        os.environ["TRANSFORMERS_CACHE"] = cache_dir
        os.environ["HF_HOME"] = cache_dir
        
        try:
            os.makedirs(cache_dir, exist_ok=True)
            print(f"Created cache directory at {cache_dir}")
        except Exception as e:
            print(f"Error creating cache directory: {str(e)}")
        
        # Initialize Longformer model and tokenizer (for processing description)
        model_name = "allenai/longformer-base-4096"
        print(f"Loading {model_name}...")
        
        try:
            # Add internet connectivity check
            try:
                import requests
                print("Testing internet connectivity...")
                response = requests.get("https://huggingface.co", timeout=5)
                if response.status_code == 200:
                    print("Successfully connected to huggingface.co")
                else:
                    print(f"Error connecting to huggingface.co: {response.status_code}")
            except Exception as e:
                print(f"Network connectivity test failed: {str(e)}")
            
            # Check if directory exists and is writable
            if os.path.exists(cache_dir):
                print(f"Cache directory {cache_dir} exists")
                if os.access(cache_dir, os.W_OK):
                    print(f"Cache directory {cache_dir} is writable")
                else:
                    print(f"Cache directory {cache_dir} is not writable")
            else:
                print(f"Cache directory {cache_dir} does not exist")
            
            # Load the tokenizer with an explicit cache_dir parameter
            # (AutoTokenizer is already imported at module level)
            print(f"Initializing tokenizer from {model_name} with cache_dir={cache_dir}")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
            print("Tokenizer loaded successfully")
            
            # Load the model with an explicit cache_dir parameter
            print(f"Initializing model from {model_name} with cache_dir={cache_dir}")
            self.model = AutoModel.from_pretrained(model_name, cache_dir=cache_dir)
            print("Model loaded successfully")
        except Exception as e:
            print(f"Error loading Longformer model: {str(e)}")
            # Re-raise: the processor cannot function without the description model
            raise

        try:
            # Initialize minilm model and tokenizer (for processing risk and blurb)
            RiskandBlurb_model_name = "sentence-transformers/all-MiniLM-L6-v2"
            print(f"Loading {RiskandBlurb_model_name}...")
            self.RiskandBlurb_tokenizer = AutoTokenizer.from_pretrained(RiskandBlurb_model_name, cache_dir=cache_dir)
            self.RiskandBlurb_model = AutoModel.from_pretrained(RiskandBlurb_model_name, cache_dir=cache_dir)
            print(f"RiskandBlurb model loaded successfully")
        except Exception as e:
            print(f"Error loading minilm model: {str(e)}")
            raise e

        try:
            # Load GloVe model for country and subcategory embeddings
            print("Loading GloVe model...")
            # GENSIM_DATA_DIR is already set at the top of the file
            print(f"Using GENSIM_DATA_DIR: {os.environ.get('GENSIM_DATA_DIR', 'Not set')}")
            
            self.glove = gensim.downloader.load('glove-wiki-gigaword-100')
            print("GloVe model loaded successfully")
        except Exception as e:
            print(f"Error loading GloVe model: {str(e)}")
            raise
        
        try:
            # Move models to device
            self.model = self.model.to(self.device)
            self.RiskandBlurb_model = self.RiskandBlurb_model.to(self.device)
            print("All models loaded successfully.")
        except Exception as e:
            print(f"Error moving models to device: {str(e)}")
            raise

    def _ensure_models_loaded(self):
        """
        Ensure all required models are loaded.
        
        This is called before any processing to make sure models are ready,
        particularly important when using lazy loading.
        """
        if any(m is None for m in (self.model, self.tokenizer, self.RiskandBlurb_model,
                                   self.RiskandBlurb_tokenizer, self.glove)):
            self._load_models()

    def _process_text_embedding(self, text: str, max_length: int, tokenizer: AutoTokenizer, model: AutoModel) -> np.ndarray:
        """
        Generate embedding for text using the specified model and tokenizer.
        
        This method handles tokenization, model inference, and pooling to 
        create a single vector representation of the input text.
        
        Args:
            text (str): Text to embed
            max_length (int): Maximum token length for the model
            tokenizer (AutoTokenizer): Tokenizer to use
            model (AutoModel): Model to use for embedding generation
            
        Returns:
            np.ndarray: Embedding vector for the text
        """
        # Clean up memory before processing
        if self.device.type == 'cuda':
            torch.cuda.empty_cache()
        gc.collect()

        # Tokenize the text
        inputs = tokenizer(text, 
                        padding=True, 
                        truncation=True, 
                        max_length=max_length, 
                        return_tensors="pt")
        
        # Move inputs to device
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        # Generate embeddings
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Mean pooling - take average of all token embeddings
        attention_mask = inputs['attention_mask']
        token_embeddings = outputs.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sentence_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
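        # Illustrative shapes for a single input of N tokens: token_embeddings is
        # (1, N, H) with H the hidden size (768 for Longformer, 384 for MiniLM),
        # input_mask_expanded is (1, N, H), and sentence_embeddings is (1, H).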
        
        # Convert to numpy array
        embedding = sentence_embeddings.cpu().numpy()
        
        # Clean up to prevent memory leaks
        del inputs, outputs, token_embeddings, sentence_embeddings
        if self.device.type == 'cuda':
            torch.cuda.empty_cache()
        gc.collect()
        
        return embedding[0]

    def _get_glove_embedding(self, text: str, dim: int = 100) -> np.ndarray:
        """
        Generate GloVe embedding for a text by averaging word vectors.
        
        Args:
            text (str): Text to embed
            dim (int): Dimension of the GloVe embeddings
            
        Returns:
            np.ndarray: Averaged GloVe embedding for the text
        """
        if not text:
            return np.zeros(dim)
            
        # Normalize and split text
        text = text.lower().strip()
        words = text.split()
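        # e.g. "tabletop games" splits into ["tabletop", "games"] and the result
        # is the mean of their two GloVe vectors; out-of-vocabulary words are skipped.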
        vectors = []
        
        # Collect vectors for words that exist in the vocabulary
        for word in words:
            if word in self.glove:
                vectors.append(self.glove[word])
        
        # Average vectors if any exist, otherwise return zeros
        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.zeros(dim)

    def process_description_embedding(self, campaign: Dict, idx: int) -> Tuple[np.ndarray, int]:
        """
        Process the project description to generate a Longformer embedding.
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            Tuple containing:
                - np.ndarray: Longformer embedding of the description
                - int: Word count of the description
        """
        self._ensure_models_loaded()
        
        try:
            text = campaign.get("raw_description", '')
            description_length = len(text.split())
            embedding = self._process_text_embedding(text, 4096, self.tokenizer, self.model)
            return embedding, description_length
        except Exception as e:
            print(f"Error processing description: {str(e)}")
            return np.zeros(768), 0

    def process_riskandchallenges_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
        """
        Process the risks and challenges section to generate a MiniLM embedding.
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            np.ndarray: MiniLM embedding of the risks section
        """
        self._ensure_models_loaded()
        
        try:
            text = campaign.get("raw_risks", '')
            return self._process_text_embedding(text, 512, self.RiskandBlurb_tokenizer, self.RiskandBlurb_model)
        except Exception as e:
            print(f"Error processing risk statement: {str(e)}")
            return np.zeros(384)

    def process_blurb(self, campaign: Dict, idx: int) -> np.ndarray:
        """
        Process the project blurb to generate a MiniLM embedding.
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            np.ndarray: MiniLM embedding of the blurb
        """
        self._ensure_models_loaded()
        
        try:
            text = campaign.get("raw_blurb", '')
            return self._process_text_embedding(text, 512, self.RiskandBlurb_tokenizer, self.RiskandBlurb_model)
        except Exception as e:
            print(f"Error processing blurb: {str(e)}")
            return np.zeros(384)

    def process_category(self, campaign: Dict) -> List[int]:
        """
        Process the project category into a one-hot encoding.
        
        Args:
            campaign (Dict): Campaign data
            
        Returns:
            List[int]: One-hot encoding of the category
        """
        try:
            # All categories in the dataset
            fixed_categories = [
                "Art", "Comics", "Crafts", "Dance", "Design", "Fashion",
                "Film & Video", "Food", "Games", "Journalism", "Music",
                "Photography", "Publishing", "Technology", "Theater",
            ]

            category = campaign.get('raw_category', '')
            # Create one-hot encoding
            encoding = [1 if cat == category else 0 for cat in fixed_categories]
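            # e.g. a "Games" campaign yields a 15-dim vector with a 1 at index 8
            # (the position of "Games" in fixed_categories) and zeros elsewhere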
            return encoding
        except Exception as e:
            print(f"Error processing category: {str(e)}")
            return [0] * 15

    def process_subcategory_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
        """
        Process the project subcategory to generate a GloVe embedding.
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            np.ndarray: GloVe embedding of the subcategory
        """
        self._ensure_models_loaded()
        
        try:
            subcategory = campaign.get('raw_subcategory', '')
            return self._get_glove_embedding(subcategory)
        except Exception as e:
            print(f"Error processing subcategory: {str(e)}")
            return np.zeros(100)

    def _convert_country_to_alpha2(self, country_name: str) -> str:
        """
        Convert a country name to its ISO alpha-2 code.
        
        This helper method handles the conversion with proper logging:
        1. Tries exact match first
        2. Falls back to case-insensitive match
        3. Returns original string if no match found
        
        Args:
            country_name (str): Country name to convert
            
        Returns:
            str: ISO alpha-2 code (e.g., "US") or original country name if no match
        """
        if not country_name:
            return ""
            
        # Try exact match first
        alpha2_code = self.country_to_alpha2.get(country_name)
        
        # If no exact match, try case-insensitive match
        if not alpha2_code:
            alpha2_code = self.country_to_alpha2_lower.get(country_name.lower())
        
        # Log results
        if alpha2_code:
            print(f"Country conversion: '{country_name}' β†’ '{alpha2_code}'")
            return alpha2_code
        else:
            print(f"Country conversion failed: '{country_name}' not found in dictionary")
            return country_name

    def process_country_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
        """
        Process the project country to generate a GloVe embedding.
        
        This method:
        1. Extracts the country name from campaign data
        2. Converts full country name to ISO alpha-2 code (e.g., "United States" → "US")
        3. Generates an embedding using GloVe for the standardized country code
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            np.ndarray: GloVe embedding of the country (as alpha-2 code)
        """
        self._ensure_models_loaded()
        
        try:
            # Extract country name from campaign data
            country_name = campaign.get('raw_country', '')
            
            # Convert to alpha-2 code using helper method
            alpha2_code = self._convert_country_to_alpha2(country_name)
            
            # Generate embedding using standardized country code
            return self._get_glove_embedding(alpha2_code)
            
        except Exception as e:
            print(f"Error processing country for campaign {idx}: {str(e)}")
            return np.zeros(100)

    def process_funding_goal(self, campaign: Dict, idx: int) -> float:
        """
        Process campaign funding goal with logarithmic compression.
        
        Applies a log10(1 + goal) transformation to compress extreme values while
        preserving relative differences between funding goals.
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            float: The transformed funding goal
        """
        try:
            goal = float(campaign.get('funding_goal', 0))
            
            # Log1p transformation, good for general compression while preserving relative differences
            transformed_goal = np.log1p(goal)/np.log(10)
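            # Worked example: goal = 9999 → log10(1 + 9999) = 4.0; goal = 0 → 0.0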
            
            return transformed_goal
            
        except Exception as e:
            print(f"Error processing funding goal for campaign {idx}: {str(e)}")
            return 0.0

    def process_previous_funding_goal(self, campaign: Dict, idx: int) -> float:
        """
        Process previous campaign funding goal with logarithmic compression.
        
        Applies a log10(1 + goal) transformation to compress extreme values while
        preserving relative differences between previous funding goals.
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            float: The transformed previous funding goal
        """
        try:
            # Get the raw value
            previous_goal = float(campaign.get('previous_funding_goal', 0))
            
            # Log the source value
            print(f"Previous funding goal raw value: {previous_goal}")
            
            # Apply logarithmic transformation if value is positive
            if previous_goal > 0:
                transformed_goal = np.log1p(previous_goal)/np.log(10)
                print(f"Applied log transformation to previous_funding_goal: {previous_goal} β†’ {transformed_goal}")
                return transformed_goal
            else:
                print(f"No transformation applied to previous_funding_goal: value is {previous_goal}")
                return 0.0
            
        except Exception as e:
            print(f"Error processing previous funding goal for campaign {idx}: {str(e)}")
            return 0.0

    def process_previous_pledged(self, campaign: Dict, idx: int) -> float:
        """
        Process previous campaign pledged amount with logarithmic compression.
        
        Applies a log10(1 + pledged) transformation to compress extreme values while
        preserving relative differences between previous pledged amounts.
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            float: The transformed previous pledged amount
        """
        try:
            # Get the raw value
            pledged = float(campaign.get('previous_pledged', 0))
            
            # Log the source value
            print(f"Previous pledged raw value: {pledged}")
            
            # Apply logarithmic transformation if value is positive
            if pledged > 0:
                transformed_pledge = np.log1p(pledged)/np.log(10)
                print(f"Applied log transformation to previous_pledged: {pledged} β†’ {transformed_pledge}")
                return transformed_pledge
            else:
                print(f"No transformation applied to previous_pledged: value is {pledged}")
                return 0.0
            
        except Exception as e:
            print(f"Error processing pledge amount for campaign {idx}: {str(e)}")
            return 0.0

    def calculate_previous_success_rate(self, campaign: Dict, idx: int) -> float:
        """
        Calculate success rate of creator's previous campaigns.
        
        Computes the ratio of successful previous projects to total previous projects.
        Can use either direct 'previous_success_rate' or calculate from 
        'previous_successful_projects' and 'previous_projects_count'.
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            float: The previous success rate (0-1)
        """
        try:
            # First check if success rate is provided directly
            if 'previous_success_rate' in campaign:
                # Log the direct usage for debugging
                rate = float(campaign.get('previous_success_rate', 0))
                print(f"Using provided previous_success_rate directly: {rate}")
                return rate
                
            # Otherwise calculate from successful projects and total projects
            previousProjects = float(campaign.get('previous_projects_count', 0))
            previousSuccessfulProjects = float(campaign.get('previous_successful_projects', 0))
            
            # Log the values used for calculation (for debugging)
            print(f"Calculating success rate from: projects={previousProjects}, successful={previousSuccessfulProjects}")
            
            if previousProjects == 0.0:
                return 0.0
            else:
                previous_success_rate = previousSuccessfulProjects / previousProjects
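                # e.g. 3 successful out of 4 previous projects → a rate of 0.75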
                return previous_success_rate
                
        except Exception as e:
            print(f"Error calculating previous success rate for campaign {idx}: {str(e)}")
            return 0.0

    def process_campaign(self, campaign: Dict, idx: int) -> Dict:
        """
        Process a single campaign to prepare all required features for prediction.
        
        This is the main method that processes a raw campaign and prepares
        all features (embeddings and numerical) for the prediction model.
        
        Processing steps include:
        - Text embedding generation using appropriate models
        - Category and country embedding through GloVe
        - Logarithmic transformation of monetary values
        - Normalization of numerical features
        
        Args:
            campaign (Dict): Raw campaign data
            idx (int): Index of the campaign
            
        Returns:
            Dict: Processed data with all features ready for prediction
        """
        self._ensure_models_loaded()
        
        # Log the incoming campaign data for debugging
        print(f"Processing campaign {idx} with keys: {list(campaign.keys())}")
        
        # Generate embeddings for text fields
        description_embedding, calculated_description_length = self.process_description_embedding(campaign, idx)
        
        # Use existing value for description_length if present, otherwise use calculated
        description_length = campaign.get('description_length', calculated_description_length)
        
        # Create processed data dictionary with embeddings and numerical features
        result = {
            'description_embedding': description_embedding.tolist(),
            'description_length': description_length,
            'blurb_embedding': self.process_blurb(campaign, idx).tolist(),
            'risk_embedding': self.process_riskandchallenges_embedding(campaign, idx).tolist(),
            'category_embedding': self.process_category(campaign),
            'subcategory_embedding': self.process_subcategory_embedding(campaign, idx).tolist(),
            'country_embedding': self.process_country_embedding(campaign, idx).tolist()
        }
        
        # Process financial features with logarithmic transformation
        result['funding_goal'] = self.process_funding_goal(campaign, idx)
        result['previous_funding_goal'] = self.process_previous_funding_goal(campaign, idx)
        result['previous_pledged'] = self.process_previous_pledged(campaign, idx)
        
        # Calculate success rate based on previous projects
        # Ensure both direct values and calculated values are handled
        result['previous_success_rate'] = self.calculate_previous_success_rate(campaign, idx)
        
        # Extract simple integer features, with specific handling for previous_projects_count
        for field in ['image_count', 'video_count', 'campaign_duration']:
            result[field] = int(campaign.get(field, 0))
        
        # Special handling for previous_projects_count to ensure consistency
        if 'previous_projects_count' in campaign:
            # Use the value directly from input
            result['previous_projects_count'] = int(campaign.get('previous_projects_count', 0))
            print(f"Using provided previous_projects_count: {result['previous_projects_count']}")
        else:
            # Default to 0 if not provided
            result['previous_projects_count'] = 0
        
        # Log the final result for debugging
        print(f"Processed campaign with previous metrics: " +
              f"count={result.get('previous_projects_count')}, " +
              f"rate={result.get('previous_success_rate')}, " +
              f"pledged={result.get('previous_pledged')}, " +
              f"goal={result.get('previous_funding_goal')}")
        
        return result
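

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): the field names below mirror the
    # keys read by the methods above, and running it downloads several large
    # models, so network access and sufficient memory are assumed.
    sample_campaign = {
        "raw_description": "A smart water bottle that tracks daily hydration.",
        "raw_blurb": "Stay hydrated with real-time tracking.",
        "raw_risks": "Manufacturing delays are the main risk.",
        "raw_category": "Technology",
        "raw_subcategory": "Gadgets",
        "raw_country": "United States",
        "funding_goal": 50000,
        "image_count": 5,
        "video_count": 1,
        "campaign_duration": 30,
    }
    processor = CampaignProcessor(data=[sample_campaign], lazy_load=True)
    features = processor.process_campaign(sample_campaign, idx=0)
    print(f"Feature keys: {sorted(features.keys())}")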