# Source code for PermutationImportance.selection_strategies

"""Each of the various variable importance methods uses the same code to compute
successively important variables. The only difference between each of these 
methods is the data which is provided to the scoring function. The 
``SelectionStrategy`` handles the process of converting the original training
and scoring data to the form required for each of the individual variables. This
is done by using the current list of important variables to generate a sequence
of triples ``(variable, training_data, scoring_data)``, each of which will later
be passed to the scoring function to determine the score for that variable.

Below, ``SelectionStrategy`` encapsulates the base functionality which houses the
parameters necessary to produce the generator as well as the default method for
providing only the datasets which are necessary to be evaluated. Each of the
other classes extends this base class to implement a particular variable 
importance method.

If you wish to design your own variable importance method, you will want to
extend the ``SelectionStrategy`` base class in the same way as the other 
strategies.

-----
"""

import numpy as np
import pandas as pd

from .utils import get_data_subset, make_data_from_columns

# Names exported by ``from ... import *``: the three concrete strategies
# followed by the base class which they all extend.
__all__ = ["SequentialForwardSelectionStrategy",
           "SequentialBackwardSelectionStrategy",
           "PermutationImportanceSelectionStrategy",
           "SelectionStrategy"]


class SelectionStrategy(object):
    """Base class which stores the data and bookkeeping shared by all
    strategies.

    Iterating over a strategy lazily yields one triple
    ``(variable, training_data, scoring_data)`` for every variable index in
    ``range(num_vars)`` which is not already listed in ``important_vars``.
    Subclasses only need to override ``generate_datasets``."""

    name = "Abstract Selection Strategy"

    def __init__(self, training_data, scoring_data, num_vars, important_vars):
        """Initializes the object by storing the data and keeping track of other
        important information

        :param training_data: (training_inputs, training_outputs)
        :param scoring_data: (scoring_inputs, scoring_outputs)
        :param num_vars: integer for the total number of variables
        :param important_vars: a sequence (e.g. list or tuple) of the indices
            of variables which are already considered important
        """
        self.training_data = training_data
        self.scoring_data = scoring_data
        self.num_vars = num_vars
        self.important_vars = important_vars

    def generate_datasets(self, important_variables):
        """Builds the datasets needed to evaluate one candidate variable.

        The base class has no strategy of its own, so subclasses must
        override this method.

        :param important_variables: list of the variable indices already
            considered important, followed by the index of the variable
            currently being evaluated
        :returns: (training_data, scoring_data)
        :raises NotImplementedError: always, when called on a class which
            does not override this method
        """
        raise NotImplementedError(
            "Please implement a strategy for generating datasets on class %s" % self.name)

    def generate_all_datasets(self):
        """By default, loops over all variables not yet considered important

        :yields: triples ``(variable, training_data, scoring_data)``, where
            the two datasets come from ``generate_datasets`` called with the
            important variables plus ``variable``
        """
        for var in range(self.num_vars):
            if var not in self.important_vars:
                # Copy into a fresh list before appending. Using ``+``
                # directly on the stored sequence raises TypeError for a
                # tuple and adds element-wise for a NumPy array.
                candidate_vars = list(self.important_vars) + [var]
                training_data, scoring_data = self.generate_datasets(
                    candidate_vars)
                yield (var, training_data, scoring_data)

    def __iter__(self):
        """Iterating over the strategy is the same as calling
        ``generate_all_datasets``."""
        return self.generate_all_datasets()


class SequentialForwardSelectionStrategy(SelectionStrategy):
    """Sequential Forward Selection tests each variable which is not yet
    considered important by adding that column to the columns of the
    variables which already are. This means that the shape of the training
    data will be ``(num_rows, num_important_vars + 1)``."""

    name = "Sequential Forward Selection"

    def generate_datasets(self, important_variables):
        """Restricts the training and scoring inputs to the given columns.

        :param important_variables: indices of the columns to keep (the
            important variables plus the one being evaluated)
        :returns: (training_data, scoring_data)
        """
        train_x, train_y = self.training_data
        score_x, score_y = self.scoring_data

        # The same column selection is applied to both sets of inputs
        # (training first, then scoring); the outputs pass through untouched.
        train_subset, score_subset = [
            get_data_subset(inputs, None, important_variables)
            for inputs in (train_x, score_x)]

        training_data = (train_subset, train_y)
        scoring_data = (score_subset, score_y)
        return training_data, scoring_data


class SequentialBackwardSelectionStrategy(SelectionStrategy):
    """Sequential Backward Selection tests each variable which is not yet
    considered important by removing that column, along with the columns of
    the variables which already are, from the data. This means that the shape
    of the training data will be
    ``(num_rows, num_vars - num_important_vars - 1)``."""

    name = "Sequential Backward Selection"

    def generate_datasets(self, important_variables):
        """Drops the given columns from the training and scoring inputs.

        :param important_variables: indices of the columns to leave out (the
            important variables plus the one being evaluated)
        :returns: (training_data, scoring_data)
        """
        train_x, train_y = self.training_data
        score_x, score_y = self.scoring_data

        # Keep every column index which was NOT handed to us
        kept_columns = []
        for col in range(self.num_vars):
            if col not in important_variables:
                kept_columns.append(col)

        train_subset = get_data_subset(train_x, None, kept_columns)
        score_subset = get_data_subset(score_x, None, kept_columns)
        return (train_subset, train_y), (score_subset, score_y)


class PermutationImportanceSelectionStrategy(SelectionStrategy):
    """Permutation Importance tests each variable which is not yet considered
    important by shuffling that column in addition to the columns of the
    variables which are considered important. The shape of the data stays
    constant, but at each step, one additional column is permuted."""

    name = "Permutation Importance"

    def __init__(self, training_data, scoring_data, num_vars, important_vars):
        """Stores the data like the base class and additionally prepares a
        row-shuffled version of the scoring inputs

        :param training_data: (training_inputs, training_outputs)
        :param scoring_data: (scoring_inputs, scoring_outputs)
        :param num_vars: integer for the total number of variables
        :param important_vars: a list of the indices of variables which are
            already considered important
        """
        super(PermutationImportanceSelectionStrategy, self).__init__(
            training_data, scoring_data, num_vars, important_vars)

        scoring_inputs, _unused_outputs = self.scoring_data
        # One random row order is drawn per strategy instance, so every
        # permuted column is taken from the same shuffled dataset.
        row_order = np.random.permutation(len(scoring_inputs))
        # NOTE: per the original author, get_data_subset returns a copy here
        self.shuffled_scoring_inputs = get_data_subset(
            scoring_inputs, row_order)

        # Remember the original index so it can be handed back when the data
        # is reassembled; only pandas DataFrames carry one.
        if isinstance(scoring_inputs, pd.DataFrame):
            self.original_index = scoring_inputs.index
        else:
            self.original_index = None

    def generate_datasets(self, important_variables):
        """Reassembles the scoring inputs column by column, taking the given
        columns from the shuffled data and all others from the original data.
        The training data is returned unchanged.

        :param important_variables: indices of the columns to permute (the
            important variables plus the one being evaluated)
        :returns: (training_data, scoring_data)
        """
        scoring_inputs, scoring_outputs = self.scoring_data

        columns = []
        for col in range(self.num_vars):
            if col in important_variables:
                source = self.shuffled_scoring_inputs
            else:
                source = scoring_inputs
            columns.append(get_data_subset(source, None, [col]))

        complete_scoring_inputs = make_data_from_columns(
            columns, index=self.original_index)
        return self.training_data, (complete_scoring_inputs, scoring_outputs)