Source code for teex.featureImportance.data

""" Module for synthetic and real datasets with available ground truth feature importance explanations. Also contains
methods and classes for decisionRule data manipulation.

All of the datasets must be instanced first. Then, when sliced, they all return the observations, labels and ground
truth explanations, respectively. """

from math import isnan

import numpy as np
from scipy.spatial.distance import cdist
from sklearn.preprocessing import MinMaxScaler
from sympy import parse_expr, re, diff, Symbol

from teex._baseClasses._baseDatasets import _SyntheticDataset
from teex._baseClasses._baseClassifier import _BaseClassifier
from teex._utils._arrays import _scale_array
from teex._utils._misc import _generate_feature_names


[docs]class TransparentLinearClassifier(_BaseClassifier):
    """ Used on the higher level data generation class :class:`SenecaFI` (**use that and get it from there
    preferably**).

    Transparent, linear classifier with feature importances as explanations. This class also generates labeled
    data according to the generated random linear expression. Presented in [Evaluating local explanation methods on
    ground truth, Riccardo Guidotti, 2021]. """

    def __init__(self, randomState: int = 888):
        super().__init__()
        self.randomState = randomState
        # SymPy expression
        self.expression = None
        self.derivatives = None

        self.X = None
        self.y = None
        self.classIndices = None  # {class_0: [X_indices_class_0], class_1: [X_indices_class_1]}

        # Scalers for predicting probabilities
        self._scalerNeg = MinMaxScaler(feature_range=[0., 0.5])
        self._scalerPos = MinMaxScaler(feature_range=[0.5, 1.])

[docs]    def fit(self, nFeatures=None, featureNames=None, nSamples=100) -> None:
        """ Generates a random linear expression and random data labeled by the linear expression as a binary
        dataset.

        :param nFeatures: (int) number of features in the data.
        :param featureNames: (array-like) names of the features in the data.
        :param nSamples: (int) number of samples for the generated data.
        :return: (ndarray, ndarray) data of shape (n, m) and their respective labels of shape (n)
        """
        if featureNames is None and nFeatures is None:
            raise ValueError('The number of features or feature names should be provided.')
        elif featureNames is None:
            self.featureNames = _generate_feature_names(nFeatures)
        elif nFeatures is None:
            self.featureNames = featureNames
        elif len(featureNames) != nFeatures:
            raise ValueError("Provide all of the features' names.")

        self.expression = self._generate_expression()
        self.derivatives = self._differentiate_expression(self.expression)

        self.X, self.y = self._generate_data(nSamples=nSamples)

        self.classIndices = {dataClass: np.argwhere(self.y == dataClass).squeeze() for dataClass in np.unique(self.y)}

        # fit the scalers
        self._scalerNeg.fit(self.X[self.classIndices[0]].reshape(-1, 1))
        self._scalerPos.fit(self.X[self.classIndices[1]].reshape(-1, 1))

        return self.X, self.y

[docs]    def predict(self, data):
        """ Predicts label for observations. Class 1 if f(x) > 0 and 0 otherwise where x is a point to label and f()
        is the generated classification expression.

        :param data: (ndarray) observations to label, shape (k, m).
        :return: (ndarray) array of length n with binary labels.
        """
        return np.argmax(self.predict_proba(data), axis=1)

[docs]    def predict_proba(self, data):
        """ Get class probabilities by evaluating the expression f at 'data', normalizing the result and
        setting the probabilities as 1 - norm(f(data)), norm(f(data)).

        :param data: (ndarray) observations for which to obtain probabilities, shape (k, m).
        :return: (ndarray) array of shape (n, 2) with predicted class probabilities.
        """
        probs = []
        for point in data:
            value = self._evaluate_expression({f: v for f, v in zip(self.featureNames, point)})
            if isnan(value):
                value = 0
            else:
                if value > 0:
                    value = self._scalerPos.transform(np.array(value, dtype=np.float32).reshape(-1, 1))[0][0]
                else:
                    value = self._scalerNeg.transform(np.array(value, dtype=np.float32).reshape(-1, 1))[0][0]
            # bound all possible values
            value = max(min(value, 1.), 0.)
            probs.append([1 - value, value])
        return np.array(probs)

[docs]    def explain(self, data, newLabels=None):
        """ Get feature importance explanation as the gradient of the expression evaluated at the point (from the
        n 'training' observations) with the same class as 'obs' and closest to the decision boundary f = 0.

        The procedure is as follows: for each data observation x to explain, get the observation z from the 'training'
        data that is closer to the decision boundary and is of different class than x. Then, get the observation t from
        the 'training' data that is closer to z but of the same class as x. Finally, return the explanation for x as the
        gradient vector of f evaluated at t.

        :param data: (ndarray) array of k observations and m features, shape (k, m).
        :param newLabels: (ndarray, optional) precomputed data labels (binary ints) for 'data'. Shape (k).
        :return: (ndarray) (k, m) array of feature importance explanations. """

        if len(data.shape) != 2:
            raise ValueError('Observations to explain should have shape (k, m).')
        if newLabels is None:
            # compute labels
            newLabels = self.predict(data)

        distances = cdist(data, self.X, metric='euclidean')  # (k, n) where n is len(self.X)
        explanations = []
        for index, obs in enumerate(data):
            # get closest point of different class
            obsClass = newLabels[index]
            maskedDistances = distances[index].copy()
            maskedDistances[self.classIndices[obsClass]] = np.inf
            closestNot = np.argmin(maskedDistances)
            # get closest point to point of different class (same class as original data point)
            notObsClass = int(not newLabels[index])
            maskedDistances = cdist(self.X[closestNot].reshape(1, -1), self.X).squeeze()
            maskedDistances[self.classIndices[notObsClass]] = np.inf
            closest = np.argmin(maskedDistances)
            # evaluate gradient at 'closest'
            exp = self._evaluate_derivatives({f: v for f, v in zip(self.featureNames, self.X[closest])})
            explanations.append(exp)
        exps = np.array(explanations, dtype=np.float32)

        for i in range(len(self.featureNames)):
            # scale to (-1, 1) by feature max. and min. importance values
            exps[:, i] = np.round(np.interp(exps[:, i], (np.amin(exps[:, i]), np.amax(exps[:, i])), (-1, +1)), 4)

        return exps

    def _generate_expression(self):
        """ Generate a random linear expression following the procedure described in ["Evaluating local explanation
         methods on ground truth", Riccardo Guidotti 2020]. """

        unaryOps = ['{f}', '-{f}', '{f} ** 2', '{f} ** 3', 'sqrt({f})', 'log({f})', 'sign({f}', 'sin({f})', 'cos({f})',
                    'tan({f})', 'sinh({f})', 'cosh({f})', 'tanh({f})', 'asin({f})', 'acos({f})', 'atan({f})']
        binaryOps = ['{f1} + {f2}', '{f1} - {f2}', '{f1} * {f2}', '{f1} / {f2}', '{f1} ** {f2}']

        rng = np.random.default_rng(self.randomState)
        features = set(self.featureNames)
        expr = []
        for feature in features:
            if rng.uniform() < 0.5:
                expr.append(rng.choice(unaryOps).format(f=feature))
            else:
                # binary op
                op = rng.choice(binaryOps)
                # choose second feature
                feature2 = rng.choice(list(features - set(feature)))
                # decide order of set
                if rng.uniform() < 0.5:
                    expr.append(op.format(f1=feature, f2=feature2))
                else:
                    expr.append(op.format(f1=feature2, f2=feature))

        return parse_expr('+'.join(expr))

    def _evaluate_expression(self, values: dict):
        return re(self.expression.evalf(subs=values))

    def _evaluate_derivatives(self, values: dict):
        """ Returns a list as the gradient vector of n features at a point 'values'. """
        grad = []
        for feature in values.keys():
            try:
                value = float(re(self.derivatives[feature].evalf(subs=values)))
            except TypeError or KeyError:
                # expression is not defined or feature does not play a role in the expression
                value = 0
            grad.append(value)
        return grad

    def _generate_data(self, nSamples):
        """ Generates two ndarrays of containing artificial data and its labels of shape nSamples * nFeatures and
        nFeatures, respectively. """
        rng = np.random.default_rng(self.randomState)
        data = np.array([rng.normal(scale=1, size=nSamples) for _ in range(len(self.featureNames))]).T
        labels = []
        for obs in data:
            labels.append(1 if self._evaluate_expression({f: v for f, v in zip(self.featureNames, obs)}) > 0 else 0)
        return data, np.array(labels, dtype=int)

    @staticmethod
    def _differentiate_expression(expression):
        """ Returns a dict with the first order _derivatives of a sympy expression w.r.t to each variable. """
        return {str(feature): diff(expression, feature) for feature in expression.atoms(Symbol)}


[docs]class SenecaFI(_SyntheticDataset):
    """ Generate synthetic binary classification tabular data with ground truth feature importance explanations. This
    method was presented in [Evaluating local explanation methods on ground truth, Riccardo Guidotti, 2021].

    From this class one can also obtain a trained transparent model (instance of :class:`TransparentLinearClassifier`).
    When sliced, this object will return

        - X (ndarray) of shape (nSamples, nFeatures) or (nFeatures). Generated data.
        - y (ndarray) of shape (nSamples,) or int. Generated binary data labels.
        - explanations (ndarray) of shape (nSamples, nFeatures) or (nFeatures). Generated g.t. feature importance
          explanations. For each explanation, the values are normalised to the [-1, 1] range.

    :param nSamples: (int) number of samples to be generated.
    :param nFeatures: (int) total number of features in the generated data.
    :param featureNames: (array-like) names of the generated features. If not provided, a list with the generated
        feature names will be returned by the function.
    :param randomState: (int) random state seed. """

    def __init__(self, nSamples: int = 200, nFeatures: int = 3, featureNames=None, randomState: int = 888) -> None:
        self.nSamples = nSamples
        self.nFeatures = nFeatures
        self.featureNames = _generate_feature_names(nFeatures) if featureNames is None else featureNames
        self.randomState = randomState

        self.X, self.y, self.exp, self.transparentModel = self._gen_seneca_dataset_fi()

    def __getitem__(self, item):
        if isinstance(item, (slice, int)):
            return self.X[item], self.y[item], self.exp[item]
        else:
            raise TypeError('Invalid argument type.')

    def __len__(self) -> int:
        return len(self.y)

    def _gen_seneca_dataset_fi(self):
        # explanations as gradient vectors around a decision boundary
        classifier = TransparentLinearClassifier(randomState=self.randomState)
        data, targets = classifier.fit(nSamples=self.nSamples, featureNames=self.featureNames)
        explanations = classifier.explain(data, newLabels=targets)

        return data, targets, explanations, classifier


[docs]def lime_to_feature_importance(exp, nFeatures, label=1):
    """ Convert from a ``lime.explanation.Explanation`` object to a np.array feature importance vector.

    :param lime.explanation.Explanation exp: explanation to convert to vector.
    :param label: (int, str) label of lime explanation. If lime explanations are generated by default, then it will be
        1.
    :param int nFeatures: number of features in the explanation
    :return: feature importance vector
    :rtype: np.ndarray """

    fiExp = np.zeros(nFeatures)
    for index, fi in exp.local_exp[label]:
        fiExp[index] = fi
    return fiExp


[docs]def scale_fi_bounds(x: np.ndarray, verbose: bool = False):
    """ Map values of an 1D or 2D np.ndarray on certain conditions. The mapping is on a by-column basis. That is, each
    column will be separately scaled.::

        (for each column in ``x``)
        if values in the range [-1, 1] or [0, 1]       -> do nothing
        else:
            case 1: if values in the [0, inf] range    -> map to [0, 1]
            case 2: if values in the [-inf, 0] range   -> map to [-1, 1]
            case 3: if values in the [-inf, inf] range -> map to [-1, 1] """

    if len(x.shape) == 1 or (len(x.shape) == 2 and x.shape[0] == 1):
        # 1D
        return _scale_array(x, verbose)
    elif len(x.shape) == 2 and x.shape[0] != 1:
        # 2D
        totalNegVals = False
        for i in range(x.shape[1]):
            x[:, i], negVals = _scale_array(x[:, i], verbose)
            if negVals is True:
                totalNegVals = True
        return x, totalNegVals
    else:
        raise ValueError('Shape of array not supported.')