"""
Campaign Data Processor for Kickstarter Prediction

This module handles the preprocessing of raw Kickstarter campaign data,
generating text embeddings and preparing numerical features for prediction.

Key functionality:
- Longformer embeddings for project descriptions
- Sentence transformer embeddings for blurbs and risk statements
- GloVe embeddings for categories and countries
- Normalization of numerical features

Author: Angus Fung
Date: April 2025
"""

import os
# Set gensim data directory to a writable location at the very start
os.environ['GENSIM_DATA_DIR'] = '/tmp/gensim-data'
try:
    os.makedirs('/tmp/gensim-data', exist_ok=True)
    print(f"Created directory at {os.environ['GENSIM_DATA_DIR']}")
except Exception as e:
    print(f"Error creating gensim directory: {str(e)}")

import numpy as np
from typing import Dict, List, Tuple
import torch
from transformers import AutoTokenizer, AutoModel
import gc
import gensim.downloader

class CampaignProcessor:
    """
    Processor for Kickstarter campaign data.
    
    This class handles the preprocessing of raw campaign data, transforming
    text and categorical features into embeddings using various NLP models
    and preparing numerical features for the prediction model.
    """
    
    def __init__(self, data: List[Dict], lazy_load: bool = False):
        """
        Initialize the CampaignProcessor.
        
        Args:
            data (List[Dict]): List of campaign dictionaries to process
            lazy_load (bool): If True, models will be loaded only when needed
                             rather than at initialization time
        """
        self.data = data
        self.categories = sorted(set(camp.get('raw_category', '') for camp in self.data))
        self.lazy_load = lazy_load
        
        # Country name to ISO alpha-2 code mapping
        # First letter of each word is capitalized in the original data
        self.country_to_alpha2 = {
            # Main ISO country names
            "United States": "US",
            "United Kingdom": "GB",
            "Canada": "CA",
            "Australia": "AU",
            "New Zealand": "NZ",
            "Germany": "DE",
            "France": "FR",
            "Italy": "IT",
            "Spain": "ES",
            "Netherlands": "NL",
            "Sweden": "SE",
            "Denmark": "DK",
            "Norway": "NO",
            "Ireland": "IE",
            "Switzerland": "CH",
            "Austria": "AT",
            "Belgium": "BE",
            "Luxembourg": "LU",
            "Hong Kong": "HK",
            "Singapore": "SG",
            "Mexico": "MX",
            "Japan": "JP",
            "China": "CN",
            "Brazil": "BR",
            "India": "IN",
            "South Korea": "KR",
            "South Africa": "ZA",
            "Argentina": "AR",
            "Poland": "PL",
            "Portugal": "PT",
            "Russia": "RU",
            "Greece": "GR",
            "Czech Republic": "CZ",
            "Finland": "FI",
            "Hungary": "HU",
            "Romania": "RO",
            "Thailand": "TH",
            "Turkey": "TR",
            "Ukraine": "UA",
            "Colombia": "CO",
            "Chile": "CL",
            "Peru": "PE",
            "Malaysia": "MY",
            "Vietnam": "VN",
            "Indonesia": "ID",
            "Philippines": "PH",
            "United Arab Emirates": "AE",
            "Saudi Arabia": "SA",
            "Israel": "IL",
            "Egypt": "EG",
            "Nigeria": "NG",
            "Kenya": "KE",
            
            # Common variants and abbreviations
            "USA": "US",
            "U.S.A.": "US",
            "U.S.": "US",
            "UK": "GB",
            "U.K.": "GB",
            "Great Britain": "GB",
            "England": "GB",
            "Republic Of Korea": "KR",
            "Korea": "KR",
            "Republic Of China": "CN",
            "Republic Of India": "IN",
            "UAE": "AE",
            "Russia": "RU",
            "Russian Federation": "RU",
            "The Netherlands": "NL",
            "Holland": "NL",
            "Republic Of Ireland": "IE",
            "Czech": "CZ",
            "Czechia": "CZ",
        }
        
        # Create a lowercase version of the dictionary for case-insensitive lookups
        self.country_to_alpha2_lower = {k.lower(): v for k, v in self.country_to_alpha2.items()}
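        # e.g. both "United Kingdom" and "united kingdom" now resolve to "GB"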
        
        # Initialize model variables (to be loaded later)
        self.tokenizer = None  # Longformer tokenizer for descriptions
        self.model = None  # Longformer model for descriptions
        self.RiskandBlurb_tokenizer = None  # MiniLM tokenizer for blurb and risk
        self.RiskandBlurb_model = None  # MiniLM model for blurb and risk
        self.glove = None  # GloVe word vectors for categories and countries
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # Load models at initialization if not using lazy loading
        if not lazy_load:
            self._load_models()
    
    def _load_models(self):
        """
        Load all NLP models required for processing campaign data.
        
        This method loads:
        - Longformer for description embeddings
        - MiniLM for blurb and risk embeddings
        - GloVe for category and country embeddings
        
        Models are cached to avoid reloading and moved to the appropriate device.
        """
        print("Loading NLP models...")
        # Cache models locally to avoid downloading every time
        cache_dir = "/tmp/model_cache"
        os.environ["TRANSFORMERS_CACHE"] = cache_dir
        os.environ["HF_HOME"] = cache_dir
        
        try:
            os.makedirs(cache_dir, exist_ok=True)
            print(f"Created cache directory at {cache_dir}")
        except Exception as e:
            print(f"Error creating cache directory: {str(e)}")
        
        # Initialize Longformer model and tokenizer (for processing description)
        model_name = "allenai/longformer-base-4096"
        print(f"Loading {model_name}...")
        
        try:
            # Add internet connectivity check
            try:
                import requests
                print("Testing internet connectivity...")
                response = requests.get("https://huggingface.co", timeout=5)
                if response.status_code == 200:
                    print("Successfully connected to huggingface.co")
                else:
                    print(f"Error connecting to huggingface.co: {response.status_code}")
            except Exception as e:
                print(f"Network connectivity test failed: {str(e)}")
            
            # Check if directory exists and is writable
            if os.path.exists(cache_dir):
                print(f"Cache directory {cache_dir} exists")
                if os.access(cache_dir, os.W_OK):
                    print(f"Cache directory {cache_dir} is writable")
                else:
                    print(f"Cache directory {cache_dir} is not writable")
            else:
                print(f"Cache directory {cache_dir} does not exist")
            
            # Load the tokenizer with an explicit cache_dir parameter
            # (AutoTokenizer is already imported at module level)
            print(f"Initializing tokenizer from {model_name} with cache_dir={cache_dir}")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
            print("Tokenizer loaded successfully")
            
            # Load the model with an explicit cache_dir parameter
            print(f"Initializing model from {model_name} with cache_dir={cache_dir}")
            self.model = AutoModel.from_pretrained(model_name, cache_dir=cache_dir)
            print("Model loaded successfully")
        except Exception as e:
            print(f"Error loading Longformer model: {str(e)}")
            # Re-raise: the processor cannot function without the description model
            raise

        try:
            # Initialize minilm model and tokenizer (for processing risk and blurb)
            RiskandBlurb_model_name = "sentence-transformers/all-MiniLM-L6-v2"
            print(f"Loading {RiskandBlurb_model_name}...")
            self.RiskandBlurb_tokenizer = AutoTokenizer.from_pretrained(RiskandBlurb_model_name, cache_dir=cache_dir)
            self.RiskandBlurb_model = AutoModel.from_pretrained(RiskandBlurb_model_name, cache_dir=cache_dir)
            print(f"RiskandBlurb model loaded successfully")
        except Exception as e:
            print(f"Error loading minilm model: {str(e)}")
            raise e

        try:
            # Load GloVe model for country and subcategory embeddings
            print("Loading GloVe model...")
            # GENSIM_DATA_DIR is already set at the top of the file
            print(f"Using GENSIM_DATA_DIR: {os.environ.get('GENSIM_DATA_DIR', 'Not set')}")
            
            self.glove = gensim.downloader.load('glove-wiki-gigaword-100')
            print("GloVe model loaded successfully")
        except Exception as e:
            print(f"Error loading GloVe model: {str(e)}")
            raise
        
        try:
            # Move models to device
            self.model = self.model.to(self.device)
            self.RiskandBlurb_model = self.RiskandBlurb_model.to(self.device)
            print("All models loaded successfully.")
        except Exception as e:
            print(f"Error moving models to device: {str(e)}")
            raise

    def _ensure_models_loaded(self):
        """
        Ensure all required models are loaded.
        
        This is called before any processing to make sure models are ready,
        particularly important when using lazy loading.
        """
        if any(m is None for m in (self.model, self.tokenizer, self.RiskandBlurb_model,
                                   self.RiskandBlurb_tokenizer, self.glove)):
            self._load_models()

    def _process_text_embedding(self, text: str, max_length: int, tokenizer: AutoTokenizer, model: AutoModel) -> np.ndarray:
        """
        Generate embedding for text using the specified model and tokenizer.
        
        This method handles tokenization, model inference, and pooling to 
        create a single vector representation of the input text.
        
        Args:
            text (str): Text to embed
            max_length (int): Maximum token length for the model
            tokenizer (AutoTokenizer): Tokenizer to use
            model (AutoModel): Model to use for embedding generation
            
        Returns:
            np.ndarray: Embedding vector for the text
        """
        # Clean up memory before processing
        if self.device.type == 'cuda':
            torch.cuda.empty_cache()
        gc.collect()

        # Tokenize the text
        inputs = tokenizer(text, 
                        padding=True, 
                        truncation=True, 
                        max_length=max_length, 
                        return_tensors="pt")
        
        # Move inputs to device
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        # Generate embeddings
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Mean pooling - take average of all token embeddings
        attention_mask = inputs['attention_mask']
        token_embeddings = outputs.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sentence_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
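        # Illustrative shapes for a single input of N tokens: token_embeddings is
        # (1, N, H) with H the hidden size (768 for Longformer, 384 for MiniLM),
        # input_mask_expanded is (1, N, H), and sentence_embeddings is (1, H).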
        
        # Convert to numpy array
        embedding = sentence_embeddings.cpu().numpy()
        
        # Clean up to prevent memory leaks
        del inputs, outputs, token_embeddings, sentence_embeddings
        if self.device.type == 'cuda':
            torch.cuda.empty_cache()
        gc.collect()
        
        return embedding[0]

    def _get_glove_embedding(self, text: str, dim: int = 100) -> np.ndarray:
        """
        Generate GloVe embedding for a text by averaging word vectors.
        
        Args:
            text (str): Text to embed
            dim (int): Dimension of the GloVe embeddings
            
        Returns:
            np.ndarray: Averaged GloVe embedding for the text
        """
        if not text:
            return np.zeros(dim)
            
        # Normalize and split text
        text = text.lower().strip()
        words = text.split()
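        # e.g. "tabletop games" splits into ["tabletop", "games"] and the result
        # is the mean of their two GloVe vectors; out-of-vocabulary words are skipped.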
        vectors = []
        
        # Collect vectors for words that exist in the vocabulary
        for word in words:
            if word in self.glove:
                vectors.append(self.glove[word])
        
        # Average vectors if any exist, otherwise return zeros
        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.zeros(dim)

    def process_description_embedding(self, campaign: Dict, idx: int) -> Tuple[np.ndarray, int]:
        """
        Process the project description to generate a Longformer embedding.
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            Tuple containing:
                - np.ndarray: Longformer embedding of the description
                - int: Word count of the description
        """
        self._ensure_models_loaded()
        
        try:
            text = campaign.get("raw_description", '')
            description_length = len(text.split())
            embedding = self._process_text_embedding(text, 4096, self.tokenizer, self.model)
            return embedding, description_length
        except Exception as e:
            print(f"Error processing description: {str(e)}")
            return np.zeros(768), 0

    def process_riskandchallenges_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
        """
        Process the risks and challenges section to generate a MiniLM embedding.
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            np.ndarray: MiniLM embedding of the risks section
        """
        self._ensure_models_loaded()
        
        try:
            text = campaign.get("raw_risks", '')
            return self._process_text_embedding(text, 512, self.RiskandBlurb_tokenizer, self.RiskandBlurb_model)
        except Exception as e:
            print(f"Error processing risk statement: {str(e)}")
            return np.zeros(384)

    def process_blurb(self, campaign: Dict, idx: int) -> np.ndarray:
        """
        Process the project blurb to generate a MiniLM embedding.
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            np.ndarray: MiniLM embedding of the blurb
        """
        self._ensure_models_loaded()
        
        try:
            text = campaign.get("raw_blurb", '')
            return self._process_text_embedding(text, 512, self.RiskandBlurb_tokenizer, self.RiskandBlurb_model)
        except Exception as e:
            print(f"Error processing blurb: {str(e)}")
            return np.zeros(384)

    def process_category(self, campaign: Dict) -> List[int]:
        """
        Process the project category into a one-hot encoding.
        
        Args:
            campaign (Dict): Campaign data
            
        Returns:
            List[int]: One-hot encoding of the category
        """
        try:
            # All categories in the dataset
            fixed_categories = [
                "Art", "Comics", "Crafts", "Dance", "Design", "Fashion",
                "Film & Video", "Food", "Games", "Journalism", "Music",
                "Photography", "Publishing", "Technology", "Theater",
            ]

            category = campaign.get('raw_category', '')
            # Create one-hot encoding
            encoding = [1 if cat == category else 0 for cat in fixed_categories]
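            # e.g. a "Games" campaign yields a 15-dim vector with a 1 at index 8
            # (the position of "Games" in fixed_categories) and zeros elsewhere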
            return encoding
        except Exception as e:
            print(f"Error processing category: {str(e)}")
            return [0] * 15

    def process_subcategory_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
        """
        Process the project subcategory to generate a GloVe embedding.
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            np.ndarray: GloVe embedding of the subcategory
        """
        self._ensure_models_loaded()
        
        try:
            subcategory = campaign.get('raw_subcategory', '')
            return self._get_glove_embedding(subcategory)
        except Exception as e:
            print(f"Error processing subcategory: {str(e)}")
            return np.zeros(100)

    def _convert_country_to_alpha2(self, country_name: str) -> str:
        """
        Convert a country name to its ISO alpha-2 code.
        
        This helper method handles the conversion with proper logging:
        1. Tries exact match first
        2. Falls back to case-insensitive match
        3. Returns original string if no match found
        
        Args:
            country_name (str): Country name to convert
            
        Returns:
            str: ISO alpha-2 code (e.g., "US") or original country name if no match
        """
        if not country_name:
            return ""
            
        # Try exact match first
        alpha2_code = self.country_to_alpha2.get(country_name)
        
        # If no exact match, try case-insensitive match
        if not alpha2_code:
            alpha2_code = self.country_to_alpha2_lower.get(country_name.lower())
        
        # Log results
        if alpha2_code:
            print(f"Country conversion: '{country_name}' β†’ '{alpha2_code}'")
            return alpha2_code
        else:
            print(f"Country conversion failed: '{country_name}' not found in dictionary")
            return country_name

    def process_country_embedding(self, campaign: Dict, idx: int) -> np.ndarray:
        """
        Process the project country to generate a GloVe embedding.
        
        This method:
        1. Extracts the country name from campaign data
        2. Converts full country name to ISO alpha-2 code (e.g., "United States" → "US")
        3. Generates an embedding using GloVe for the standardized country code
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            np.ndarray: GloVe embedding of the country (as alpha-2 code)
        """
        self._ensure_models_loaded()
        
        try:
            # Extract country name from campaign data
            country_name = campaign.get('raw_country', '')
            
            # Convert to alpha-2 code using helper method
            alpha2_code = self._convert_country_to_alpha2(country_name)
            
            # Generate embedding using standardized country code
            return self._get_glove_embedding(alpha2_code)
            
        except Exception as e:
            print(f"Error processing country for campaign {idx}: {str(e)}")
            return np.zeros(100)

    def process_funding_goal(self, campaign: Dict, idx: int) -> float:
        """
        Process campaign funding goal with logarithmic compression.
        
        Applies a log10(1 + goal) transformation to compress extreme values while
        preserving relative differences between funding goals.
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            float: The transformed funding goal
        """
        try:
            goal = float(campaign.get('funding_goal', 0))
            
            # Log1p transformation, good for general compression while preserving relative differences
            transformed_goal = np.log1p(goal)/np.log(10)
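            # Worked example: goal = 9999 → log10(1 + 9999) = 4.0; goal = 0 → 0.0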
            
            return transformed_goal
            
        except Exception as e:
            print(f"Error processing funding goal for campaign {idx}: {str(e)}")
            return 0.0

    def process_previous_funding_goal(self, campaign: Dict, idx: int) -> float:
        """
        Process previous campaign funding goal with logarithmic compression.
        
        Applies a log10(1 + goal) transformation to compress extreme values while
        preserving relative differences between previous funding goals.
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            float: The transformed previous funding goal
        """
        try:
            # Get the raw value
            previous_goal = float(campaign.get('previous_funding_goal', 0))
            
            # Log the source value
            print(f"Previous funding goal raw value: {previous_goal}")
            
            # Apply logarithmic transformation if value is positive
            if previous_goal > 0:
                transformed_goal = np.log1p(previous_goal)/np.log(10)
                print(f"Applied log transformation to previous_funding_goal: {previous_goal} β†’ {transformed_goal}")
                return transformed_goal
            else:
                print(f"No transformation applied to previous_funding_goal: value is {previous_goal}")
                return 0.0
            
        except Exception as e:
            print(f"Error processing previous funding goal for campaign {idx}: {str(e)}")
            return 0.0

    def process_previous_pledged(self, campaign: Dict, idx: int) -> float:
        """
        Process previous campaign pledged amount with logarithmic compression.
        
        Applies a log10(1 + pledged) transformation to compress extreme values while
        preserving relative differences between previous pledged amounts.
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            float: The transformed previous pledged amount
        """
        try:
            # Get the raw value
            pledged = float(campaign.get('previous_pledged', 0))
            
            # Log the source value
            print(f"Previous pledged raw value: {pledged}")
            
            # Apply logarithmic transformation if value is positive
            if pledged > 0:
                transformed_pledge = np.log1p(pledged)/np.log(10)
                print(f"Applied log transformation to previous_pledged: {pledged} β†’ {transformed_pledge}")
                return transformed_pledge
            else:
                print(f"No transformation applied to previous_pledged: value is {pledged}")
                return 0.0
            
        except Exception as e:
            print(f"Error processing pledge amount for campaign {idx}: {str(e)}")
            return 0.0

    def calculate_previous_success_rate(self, campaign: Dict, idx: int) -> float:
        """
        Calculate success rate of creator's previous campaigns.
        
        Computes the ratio of successful previous projects to total previous projects.
        Can use either direct 'previous_success_rate' or calculate from 
        'previous_successful_projects' and 'previous_projects_count'.
        
        Args:
            campaign (Dict): Campaign data
            idx (int): Index of the campaign
            
        Returns:
            float: The previous success rate (0-1)
        """
        try:
            # First check if success rate is provided directly
            if 'previous_success_rate' in campaign:
                # Log the direct usage for debugging
                rate = float(campaign.get('previous_success_rate', 0))
                print(f"Using provided previous_success_rate directly: {rate}")
                return rate
                
            # Otherwise calculate from successful projects and total projects
            previousProjects = float(campaign.get('previous_projects_count', 0))
            previousSuccessfulProjects = float(campaign.get('previous_successful_projects', 0))
            
            # Log the values used for calculation (for debugging)
            print(f"Calculating success rate from: projects={previousProjects}, successful={previousSuccessfulProjects}")
            
            if previousProjects == 0.0:
                return 0.0
            else:
                previous_success_rate = previousSuccessfulProjects / previousProjects
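                # e.g. 3 successful out of 4 previous projects → a rate of 0.75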
                return previous_success_rate
                
        except Exception as e:
            print(f"Error calculating previous success rate for campaign {idx}: {str(e)}")
            return 0.0

    def process_campaign(self, campaign: Dict, idx: int) -> Dict:
        """
        Process a single campaign to prepare all required features for prediction.
        
        This is the main method that processes a raw campaign and prepares
        all features (embeddings and numerical) for the prediction model.
        
        Processing steps include:
        - Text embedding generation using appropriate models
        - Category and country embedding through GloVe
        - Logarithmic transformation of monetary values
        - Normalization of numerical features
        
        Args:
            campaign (Dict): Raw campaign data
            idx (int): Index of the campaign
            
        Returns:
            Dict: Processed data with all features ready for prediction
        """
        self._ensure_models_loaded()
        
        # Log the incoming campaign data for debugging
        print(f"Processing campaign {idx} with keys: {list(campaign.keys())}")
        
        # Generate embeddings for text fields
        description_embedding, calculated_description_length = self.process_description_embedding(campaign, idx)
        
        # Use existing value for description_length if present, otherwise use calculated
        description_length = campaign.get('description_length', calculated_description_length)
        
        # Create processed data dictionary with embeddings and numerical features
        result = {
            'description_embedding': description_embedding.tolist(),
            'description_length': description_length,
            'blurb_embedding': self.process_blurb(campaign, idx).tolist(),
            'risk_embedding': self.process_riskandchallenges_embedding(campaign, idx).tolist(),
            'category_embedding': self.process_category(campaign),
            'subcategory_embedding': self.process_subcategory_embedding(campaign, idx).tolist(),
            'country_embedding': self.process_country_embedding(campaign, idx).tolist()
        }
        
        # Process financial features with logarithmic transformation
        result['funding_goal'] = self.process_funding_goal(campaign, idx)
        result['previous_funding_goal'] = self.process_previous_funding_goal(campaign, idx)
        result['previous_pledged'] = self.process_previous_pledged(campaign, idx)
        
        # Calculate success rate based on previous projects
        # Ensure both direct values and calculated values are handled
        result['previous_success_rate'] = self.calculate_previous_success_rate(campaign, idx)
        
        # Extract simple integer features, with specific handling for previous_projects_count
        for field in ['image_count', 'video_count', 'campaign_duration']:
            result[field] = int(campaign.get(field, 0))
        
        # Special handling for previous_projects_count to ensure consistency
        if 'previous_projects_count' in campaign:
            # Use the value directly from input
            result['previous_projects_count'] = int(campaign.get('previous_projects_count', 0))
            print(f"Using provided previous_projects_count: {result['previous_projects_count']}")
        else:
            # Default to 0 if not provided
            result['previous_projects_count'] = 0
        
        # Log the final result for debugging
        print(f"Processed campaign with previous metrics: " +
              f"count={result.get('previous_projects_count')}, " +
              f"rate={result.get('previous_success_rate')}, " +
              f"pledged={result.get('previous_pledged')}, " +
              f"goal={result.get('previous_funding_goal')}")
        
        return result
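

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): the field names below mirror the
    # keys read by the methods above, and running it downloads several large
    # models, so network access and sufficient memory are assumed.
    sample_campaign = {
        "raw_description": "A smart water bottle that tracks daily hydration.",
        "raw_blurb": "Stay hydrated with real-time tracking.",
        "raw_risks": "Manufacturing delays are the main risk.",
        "raw_category": "Technology",
        "raw_subcategory": "Gadgets",
        "raw_country": "United States",
        "funding_goal": 50000,
        "image_count": 5,
        "video_count": 1,
        "campaign_duration": 30,
    }
    processor = CampaignProcessor(data=[sample_campaign], lazy_load=True)
    features = processor.process_campaign(sample_campaign, idx=0)
    print(f"Feature keys: {sorted(features.keys())}")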