# Source code for PermutationImportance.abstract_runner

"""The general algorithm for all of the data-based variable importance methods
is the same, regardless of whether the method is Sequential Selection or 
Permutation Importance or something else. This is represented in the 
``abstract_variable_importance`` function. All of the different methods we 
provide use this function under the hood and the only difference between them is
the ``selection_strategy`` object, which is detailed in 
:mod:`PermutationImportance.selection_strategies`. Typically, you will not need 
to use this method but can instead use one of the methods imported directly into 
the top package of **PermutationImportance**.

If you wish to implement your own variable importance method, you will need to
devise your own ``selection_strategy``. We recommend using
:mod:`PermutationImportance.selection_strategies` as a template for implementing 
your own variable importance method."""

import numpy as np
import multiprocessing as mp

from .data_verification import verify_data, determine_variable_names
from .multiprocessing_utils import pool_imap_unordered
from .result import ImportanceResult
from .scoring_strategies import verify_scoring_strategy
from .utils import add_ranks_to_dict, get_data_subset


def abstract_variable_importance(training_data, scoring_data, scoring_fn, scoring_strategy, selection_strategy, variable_names=None, nimportant_vars=None, method=None, njobs=1):
    """Performs an abstract variable importance over data given a particular
    set of functions for scoring, determining optimal variables, and selecting
    data

    :param training_data: a 2-tuple ``(inputs, outputs)`` for training in the
        ``scoring_fn``
    :param scoring_data: a 2-tuple ``(inputs, outputs)`` for scoring in the
        ``scoring_fn``
    :param scoring_fn: a function to be used for scoring. Should be of the form
        ``(training_data, scoring_data) -> some_value``
    :param scoring_strategy: a function to be used for determining optimal
        variables. Should be of the form ``([some_value]) -> index``
    :param selection_strategy: a callable invoked once per round as
        ``selection_strategy(training_data, scoring_data, num_vars,
        important_vars)``. Its return value is iterated to obtain the data to
        score for each candidate variable. Its ``name`` attribute (or, failing
        that, its ``__name__``) supplies the default for ``method``
    :param variable_names: an optional list for variable names. If not given,
        will use names of columns of data (if pandas dataframe) or column
        indices
    :param nimportant_vars: number of variables to compute importance for.
        Defaults to all variables and is capped at the number of variables
    :param method: a string for the name of the method used. Defaults to the
        name of the ``selection_strategy`` if not given
    :param njobs: an integer for the number of workers to use. If zero or
        negative, will use ``num_cpus + njobs`` (but never fewer than 1).
        Defaults to 1
    :returns: :class:`PermutationImportance.result.ImportanceResult` object 
        which contains the results for each run
    :raises AttributeError: if ``method`` is not given and the
        ``selection_strategy`` has neither a ``name`` nor a ``__name__``
    """

    training_data = verify_data(training_data)
    scoring_data = verify_data(scoring_data)
    scoring_strategy = verify_scoring_strategy(scoring_strategy)
    variable_names = determine_variable_names(scoring_data, variable_names)
    num_vars = len(variable_names)

    # There are never more rounds than variables: once every variable has been
    # selected there is nothing left to rank.
    if nimportant_vars is None:
        nimportant_vars = num_vars
    else:
        nimportant_vars = min(nimportant_vars, num_vars)

    if method is None:
        # Only fall back to ``__name__`` when ``name`` is really missing, so
        # that strategy objects without a ``__name__`` (e.g. instances) work.
        try:
            method = selection_strategy.name
        except AttributeError:
            method = selection_strategy.__name__

    if njobs <= 0:
        # Non-positive values are relative to the cpu count; clamp so that a
        # large negative value still leaves one worker.
        njobs = max(1, mp.cpu_count() + njobs)

    # Indices (into variable_names) of the variables selected so far, in order
    important_vars = list()

    # Compute the original score over all the data
    original_score = scoring_fn(training_data, scoring_data)
    result_obj = ImportanceResult(method, variable_names, original_score)
    for _ in range(nimportant_vars):
        selection_iter = selection_strategy(
            training_data, scoring_data, num_vars, important_vars)
        if njobs == 1:
            result = _singlethread_iteration(
                selection_iter, scoring_fn)
        else:
            result = _multithread_iteration(
                selection_iter, scoring_fn, njobs)
        next_result = add_ranks_to_dict(
            result, variable_names, scoring_strategy)
        # The winner of this round is the key whose entry has the smallest
        # first element (presumably the rank added by add_ranks_to_dict).
        best_var = min(
            next_result.keys(), key=lambda key: next_result[key][0])
        # NOTE(review): the elementwise comparison assumes variable_names is a
        # numpy array, as returned by determine_variable_names - confirm.
        best_index = np.flatnonzero(variable_names == best_var)[0]
        result_obj.add_new_results(
            next_result, next_important_variable=best_var)
        important_vars.append(best_index)

    return result_obj


[docs]def _singlethread_iteration(selection_iterator, scoring_fn):
    """Handles a single pass of the abstract variable importance algorithm, 
    assuming a single worker thread

    :param selection_iterator: an iterator which yields triples
        ``(variable, training_data, scoring_data)``. Typically a 
        :class:`PermutationImportance.selection_strategies.SelectionStrategy`
    :param scoring_fn: a function to be used for scoring. Should be of the form
        ``(training_data, scoring_data) -> float``
    :returns: a dict of ``{var: score}``
    """
    result = dict()
    for var, training_data, scoring_data in selection_iterator:
        score = scoring_fn(training_data, scoring_data)
        result[var] = score
    return result


def _multithread_iteration(selection_iterator, scoring_fn, njobs):
    """Handles a single pass of the abstract variable importance algorithm,
    distributing the scoring over several workers via
    ``pool_imap_unordered``

    :param selection_iterator: an iterator which yields triples
        ``(variable, training_data, scoring_data)``. Typically a 
        :class:`PermutationImportance.selection_strategies.SelectionStrategy`
    :param scoring_fn: a function to be used for scoring. Should be of the form
        ``(training_data, scoring_data) -> float``
    :param njobs: number of parallel jobs, forwarded to
        ``pool_imap_unordered``
    :returns: a dict of ``{var: score}``
    """
    # pool_imap_unordered yields (variable, score) pairs; keying the dict by
    # variable means the order in which pairs arrive does not matter.
    return {
        var: score
        for var, score in pool_imap_unordered(
            scoring_fn, selection_iterator, njobs)
    }