Source code for PermutationImportance.abstract_runner

"""The general algorithm for all of the data-based variable importance methods
is the same, regardless of whether the method is Sequential Selection or 
Permutation Importance or something else. This is represented in the 
``abstract_variable_importance`` function. All of the different methods we 
provide use this function under the hood and the only difference between them is
the ``selection_strategy`` object, which is detailed in 
:mod:`PermutationImportance.selection_strategies`. Typically, you will not need
to call this function directly; instead, use one of the methods imported into
the top-level package of **PermutationImportance**.

If you wish to implement your own variable importance method, you will need to
devise your own ``selection_strategy``. We recommend using
:mod:`PermutationImportance.selection_strategies` as a template for implementing 
your own variable importance method."""

import numpy as np
import multiprocessing as mp

from .data_verification import verify_data, determine_variable_names
from .multiprocessing_utils import pool_imap_unordered
from .result import ImportanceResult
from .scoring_strategies import verify_scoring_strategy
from .utils import add_ranks_to_dict, get_data_subset


def abstract_variable_importance(training_data, scoring_data, scoring_fn, scoring_strategy, selection_strategy, variable_names=None, nimportant_vars=None, method=None, njobs=1):
    """Performs an abstract variable importance over data given a particular
    set of functions for scoring, determining optimal variables, and selecting
    data

    :param training_data: a 2-tuple ``(inputs, outputs)`` for training in the
        ``scoring_fn``
    :param scoring_data: a 2-tuple ``(inputs, outputs)`` for scoring in the
        ``scoring_fn``
    :param scoring_fn: a function to be used for scoring. Should be of the form
        ``(training_data, scoring_data) -> some_value``
    :param scoring_strategy: a function to be used for determining optimal
        variables. Should be of the form ``([some_value]) -> index``
    :param selection_strategy: a callable invoked once per round as
        ``selection_strategy(training_data, scoring_data, num_vars,
        important_vars)``; the object it returns is iterated to obtain the
        datasets to score in that round. See
        :mod:`PermutationImportance.selection_strategies`
    :param variable_names: an optional list for variable names. If not given,
        will use names of columns of data (if pandas dataframe) or column
        indices
    :param nimportant_vars: number of variables to compute importance for.
        Defaults to all variables. Values larger than the number of variables
        are capped at the number of variables
    :param method: a string for the name of the method used. Defaults to the
        ``name`` attribute of the ``selection_strategy`` (or its ``__name__``
        when it has no ``name``) if not given
    :param njobs: an integer for the number of threads to use. If zero or
        negative, will use ``num_cpus + njobs`` (but never fewer than 1).
        Defaults to 1
    :returns: :class:`PermutationImportance.result.ImportanceResult` object
        which contains the results for each run
    """
    training_data = verify_data(training_data)
    scoring_data = verify_data(scoring_data)
    scoring_strategy = verify_scoring_strategy(scoring_strategy)
    variable_names = determine_variable_names(scoring_data, variable_names)
    num_vars = len(variable_names)

    if nimportant_vars is None:
        nimportant_vars = num_vars
    else:
        # Once every variable has been selected there is nothing left to
        # evaluate, so additional rounds would only operate on empty results.
        nimportant_vars = min(nimportant_vars, num_vars)

    if method is None:
        # Look up ``__name__`` lazily: a strategy that exposes ``name`` but has
        # no ``__name__`` (e.g. a callable instance) must not fail here.
        try:
            method = selection_strategy.name
        except AttributeError:
            method = selection_strategy.__name__

    if njobs <= 0:
        # A strongly negative request must still leave at least one worker.
        njobs = max(1, mp.cpu_count() + njobs)

    # Indices of the variables already deemed important, in selection order
    important_vars = list()

    # Compute the original score over all the data
    original_score = scoring_fn(training_data, scoring_data)
    result_obj = ImportanceResult(method, variable_names, original_score)

    for _ in range(nimportant_vars):
        selection_iter = selection_strategy(
            training_data, scoring_data, num_vars, important_vars)
        if njobs == 1:
            result = _singlethread_iteration(selection_iter, scoring_fn)
        else:
            result = _multithread_iteration(selection_iter, scoring_fn, njobs)

        next_result = add_ranks_to_dict(
            result, variable_names, scoring_strategy)
        # The winner of this round is the entry whose first element is the
        # smallest (presumably the rank assigned by add_ranks_to_dict).
        best_var = min(next_result, key=lambda name: next_result[name][0])
        # Translate the winning name back to its positional index
        best_index = np.flatnonzero(variable_names == best_var)[0]
        result_obj.add_new_results(
            next_result, next_important_variable=best_var)
        important_vars.append(best_index)

    return result_obj
[docs]def _singlethread_iteration(selection_iterator, scoring_fn): """Handles a single pass of the abstract variable importance algorithm, assuming a single worker thread :param selection_iterator: an iterator which yields triples ``(variable, training_data, scoring_data)``. Typically a :class:`PermutationImportance.selection_strategies.SelectionStrategy` :param scoring_fn: a function to be used for scoring. Should be of the form ``(training_data, scoring_data) -> float`` :returns: a dict of ``{var: score}`` """ result = dict() for var, training_data, scoring_data in selection_iterator: score = scoring_fn(training_data, scoring_data) result[var] = score return result
def _multithread_iteration(selection_iterator, scoring_fn, njobs):
    """Runs one pass of the abstract variable importance algorithm by handing
    the scoring work to ``pool_imap_unordered`` with ``njobs`` workers

    :param selection_iterator: an iterator which yields triples
        ``(variable, training_data, scoring_data)``. Typically a
        :class:`PermutationImportance.selection_strategies.SelectionStrategy`
    :param scoring_fn: a function to be used for scoring. Should be of the form
        ``(training_data, scoring_data) -> float``
    :param njobs: number of processes to use
    :returns: a dict of ``{var: score}``
    """
    # Each item produced by the pool is an ``(index, score)`` pair; collect
    # them into a mapping as they arrive.
    scored_pairs = pool_imap_unordered(scoring_fn, selection_iterator, njobs)
    return {index: score for index, score in scored_pairs}