Instructions to use ModelForge/spam-classifier with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Scikit-learn
How to use ModelForge/spam-classifier with Scikit-learn:
from huggingface_hub import hf_hub_download import joblib model = joblib.load( hf_hub_download("ModelForge/spam-classifier", "sklearn_model.joblib") ) # only load pickle files from sources you trust # read more about it here https://skops.readthedocs.io/en/stable/persistence.html - Notebooks
- Google Colab
- Kaggle
| # Authors: The scikit-learn developers | |
| # SPDX-License-Identifier: BSD-3-Clause | |
| """ | |
| Random utility function | |
| ======================= | |
| This module complements missing features of ``numpy.random``. | |
| The module contains: | |
| * Several algorithms to sample integers without replacement. | |
| * Fast rand_r alternative based on xor shifts | |
| """ | |
| import numpy as np | |
| from . import check_random_state | |
| from ._typedefs cimport intp_t | |
| cdef uint32_t DEFAULT_SEED = 1 | |
| # Compatibility type to always accept the default int type used by NumPy, both | |
| # before and after NumPy 2. On Windows, `long` does not always match `inp_t`. | |
| # See the comments in the `sample_without_replacement` Python function for more | |
| # details. | |
| ctypedef fused default_int: | |
| intp_t | |
| long | |
| cpdef _sample_without_replacement_check_input(default_int n_population, | |
| default_int n_samples): | |
| """ Check that input are consistent for sample_without_replacement""" | |
| if n_population < 0: | |
| raise ValueError('n_population should be greater than 0, got %s.' | |
| % n_population) | |
| if n_samples > n_population: | |
| raise ValueError('n_population should be greater or equal than ' | |
| 'n_samples, got n_samples > n_population (%s > %s)' | |
| % (n_samples, n_population)) | |
| cpdef _sample_without_replacement_with_tracking_selection( | |
| default_int n_population, | |
| default_int n_samples, | |
| random_state=None): | |
| r"""Sample integers without replacement. | |
| Select n_samples integers from the set [0, n_population) without | |
| replacement. | |
| Time complexity: | |
| - Worst-case: unbounded | |
| - Average-case: | |
| O(O(np.random.randint) * \sum_{i=1}^n_samples 1 / | |
| (1 - i / n_population))) | |
| <= O(O(np.random.randint) * | |
| n_population * ln((n_population - 2) | |
| /(n_population - 1 - n_samples))) | |
| <= O(O(np.random.randint) * | |
| n_population * 1 / (1 - n_samples / n_population)) | |
| Space complexity of O(n_samples) in a python set. | |
| Parameters | |
| ---------- | |
| n_population : int | |
| The size of the set to sample from. | |
| n_samples : int | |
| The number of integer to sample. | |
| random_state : int, RandomState instance or None, default=None | |
| If int, random_state is the seed used by the random number generator; | |
| If RandomState instance, random_state is the random number generator; | |
| If None, the random number generator is the RandomState instance used | |
| by `np.random`. | |
| Returns | |
| ------- | |
| out : ndarray of shape (n_samples,) | |
| The sampled subsets of integer. | |
| """ | |
| _sample_without_replacement_check_input(n_population, n_samples) | |
| cdef default_int i | |
| cdef default_int j | |
| cdef default_int[::1] out = np.empty((n_samples, ), dtype=int) | |
| rng = check_random_state(random_state) | |
| rng_randint = rng.randint | |
| # The following line of code are heavily inspired from python core, | |
| # more precisely of random.sample. | |
| cdef set selected = set() | |
| for i in range(n_samples): | |
| j = rng_randint(n_population) | |
| while j in selected: | |
| j = rng_randint(n_population) | |
| selected.add(j) | |
| out[i] = j | |
| return np.asarray(out) | |
| cpdef _sample_without_replacement_with_pool(default_int n_population, | |
| default_int n_samples, | |
| random_state=None): | |
| """Sample integers without replacement. | |
| Select n_samples integers from the set [0, n_population) without | |
| replacement. | |
| Time complexity: O(n_population + O(np.random.randint) * n_samples) | |
| Space complexity of O(n_population + n_samples). | |
| Parameters | |
| ---------- | |
| n_population : int | |
| The size of the set to sample from. | |
| n_samples : int | |
| The number of integer to sample. | |
| random_state : int, RandomState instance or None, default=None | |
| If int, random_state is the seed used by the random number generator; | |
| If RandomState instance, random_state is the random number generator; | |
| If None, the random number generator is the RandomState instance used | |
| by `np.random`. | |
| Returns | |
| ------- | |
| out : ndarray of shape (n_samples,) | |
| The sampled subsets of integer. | |
| """ | |
| _sample_without_replacement_check_input(n_population, n_samples) | |
| cdef default_int i | |
| cdef default_int j | |
| cdef default_int[::1] out = np.empty((n_samples,), dtype=int) | |
| cdef default_int[::1] pool = np.empty((n_population,), dtype=int) | |
| rng = check_random_state(random_state) | |
| rng_randint = rng.randint | |
| # Initialize the pool | |
| for i in range(n_population): | |
| pool[i] = i | |
| # The following line of code are heavily inspired from python core, | |
| # more precisely of random.sample. | |
| for i in range(n_samples): | |
| j = rng_randint(n_population - i) # invariant: non-selected at [0,n-i) | |
| out[i] = pool[j] | |
| pool[j] = pool[n_population - i - 1] # move non-selected item into vacancy | |
| return np.asarray(out) | |
| cpdef _sample_without_replacement_with_reservoir_sampling( | |
| default_int n_population, | |
| default_int n_samples, | |
| random_state=None | |
| ): | |
| """Sample integers without replacement. | |
| Select n_samples integers from the set [0, n_population) without | |
| replacement. | |
| Time complexity of | |
| O((n_population - n_samples) * O(np.random.randint) + n_samples) | |
| Space complexity of O(n_samples) | |
| Parameters | |
| ---------- | |
| n_population : int | |
| The size of the set to sample from. | |
| n_samples : int | |
| The number of integer to sample. | |
| random_state : int, RandomState instance or None, default=None | |
| If int, random_state is the seed used by the random number generator; | |
| If RandomState instance, random_state is the random number generator; | |
| If None, the random number generator is the RandomState instance used | |
| by `np.random`. | |
| Returns | |
| ------- | |
| out : ndarray of shape (n_samples,) | |
| The sampled subsets of integer. The order of the items is not | |
| necessarily random. Use a random permutation of the array if the order | |
| of the items has to be randomized. | |
| """ | |
| _sample_without_replacement_check_input(n_population, n_samples) | |
| cdef default_int i | |
| cdef default_int j | |
| cdef default_int[::1] out = np.empty((n_samples, ), dtype=int) | |
| rng = check_random_state(random_state) | |
| rng_randint = rng.randint | |
| # This cython implementation is based on the one of Robert Kern: | |
| # http://mail.scipy.org/pipermail/numpy-discussion/2010-December/ | |
| # 054289.html | |
| # | |
| for i in range(n_samples): | |
| out[i] = i | |
| for i from n_samples <= i < n_population: | |
| j = rng_randint(0, i + 1) | |
| if j < n_samples: | |
| out[j] = i | |
| return np.asarray(out) | |
| cdef _sample_without_replacement(default_int n_population, | |
| default_int n_samples, | |
| method="auto", | |
| random_state=None): | |
| """Sample integers without replacement. | |
| Private function for the implementation, see sample_without_replacement | |
| documentation for more details. | |
| """ | |
| _sample_without_replacement_check_input(n_population, n_samples) | |
| all_methods = ("auto", "tracking_selection", "reservoir_sampling", "pool") | |
| ratio = <double> n_samples / n_population if n_population != 0.0 else 1.0 | |
| # Check ratio and use permutation unless ratio < 0.01 or ratio > 0.99 | |
| if method == "auto" and ratio > 0.01 and ratio < 0.99: | |
| rng = check_random_state(random_state) | |
| return rng.permutation(n_population)[:n_samples] | |
| if method == "auto" or method == "tracking_selection": | |
| # TODO the pool based method can also be used. | |
| # however, it requires special benchmark to take into account | |
| # the memory requirement of the array vs the set. | |
| # The value 0.2 has been determined through benchmarking. | |
| if ratio < 0.2: | |
| return _sample_without_replacement_with_tracking_selection( | |
| n_population, n_samples, random_state) | |
| else: | |
| return _sample_without_replacement_with_reservoir_sampling( | |
| n_population, n_samples, random_state) | |
| elif method == "reservoir_sampling": | |
| return _sample_without_replacement_with_reservoir_sampling( | |
| n_population, n_samples, random_state) | |
| elif method == "pool": | |
| return _sample_without_replacement_with_pool(n_population, n_samples, | |
| random_state) | |
| else: | |
| raise ValueError('Expected a method name in %s, got %s. ' | |
| % (all_methods, method)) | |
| def sample_without_replacement( | |
| object n_population, object n_samples, method="auto", random_state=None): | |
| """Sample integers without replacement. | |
| Select n_samples integers from the set [0, n_population) without | |
| replacement. | |
| Parameters | |
| ---------- | |
| n_population : int | |
| The size of the set to sample from. | |
| n_samples : int | |
| The number of integer to sample. | |
| random_state : int, RandomState instance or None, default=None | |
| If int, random_state is the seed used by the random number generator; | |
| If RandomState instance, random_state is the random number generator; | |
| If None, the random number generator is the RandomState instance used | |
| by `np.random`. | |
| method : {"auto", "tracking_selection", "reservoir_sampling", "pool"}, \ | |
| default='auto' | |
| If method == "auto", the ratio of n_samples / n_population is used | |
| to determine which algorithm to use: | |
| If ratio is between 0 and 0.01, tracking selection is used. | |
| If ratio is between 0.01 and 0.99, numpy.random.permutation is used. | |
| If ratio is greater than 0.99, reservoir sampling is used. | |
| The order of the selected integers is undefined. If a random order is | |
| desired, the selected subset should be shuffled. | |
| If method =="tracking_selection", a set based implementation is used | |
| which is suitable for `n_samples` <<< `n_population`. | |
| If method == "reservoir_sampling", a reservoir sampling algorithm is | |
| used which is suitable for high memory constraint or when | |
| O(`n_samples`) ~ O(`n_population`). | |
| The order of the selected integers is undefined. If a random order is | |
| desired, the selected subset should be shuffled. | |
| If method == "pool", a pool based algorithm is particularly fast, even | |
| faster than the tracking selection method. However, a vector containing | |
| the entire population has to be initialized. | |
| If n_samples ~ n_population, the reservoir sampling method is faster. | |
| Returns | |
| ------- | |
| out : ndarray of shape (n_samples,) | |
| The sampled subsets of integer. The subset of selected integer might | |
| not be randomized, see the method argument. | |
| Examples | |
| -------- | |
| >>> from sklearn.utils.random import sample_without_replacement | |
| >>> sample_without_replacement(10, 5, random_state=42) | |
| array([8, 1, 5, 0, 7]) | |
| """ | |
| cdef: | |
| intp_t n_pop_intp, n_samples_intp | |
| long n_pop_long, n_samples_long | |
| # On most platforms `np.int_ is np.intp`. However, before NumPy 2 the | |
| # default integer `np.int_` was a long which is 32bit on 64bit windows | |
| # while `intp` is 64bit on 64bit platforms and 32bit on 32bit ones. | |
| if np.int_ is np.intp: | |
| # Branch always taken on NumPy >=2 (or when not on 64bit windows). | |
| # Cython has different rules for conversion of values to integers. | |
| # For NumPy <1.26.2 AND Cython 3, this first branch requires `int()` | |
| # called explicitly to allow e.g. floats. | |
| n_pop_intp = int(n_population) | |
| n_samples_intp = int(n_samples) | |
| return _sample_without_replacement( | |
| n_pop_intp, n_samples_intp, method, random_state) | |
| else: | |
| # Branch taken on 64bit windows with Numpy<2.0 where `long` is 32bit | |
| n_pop_long = n_population | |
| n_samples_long = n_samples | |
| return _sample_without_replacement( | |
| n_pop_long, n_samples_long, method, random_state) | |
| def _our_rand_r_py(seed): | |
| """Python utils to test the our_rand_r function""" | |
| cdef uint32_t my_seed = seed | |
| return our_rand_r(&my_seed) | |