## Software for masking + explainability
## (c) Vicenç Torra
## Current version: 20221201
## 
## References:
##  A. Bozorgpanah, V. Torra, L. Aliahmadipour,
##    Privacy and explainability: the effects of data protection on Shapley values,
##    Technologies 10:6 (2022) 125
##    https://doi.org/10.3390/technologies10060125 
##
## The software computes the Shapley values for machine learning models,
##   we use our own implementation to build a game, and we use our own
##   implementation to compute the Shapley value for a game. 
##
## Main functions:
##
## Computation of results: 
##    meILShapleyMeansXp (shapleyX, meanShapleyX, xp, y, testX, testY, modelRegression, listColumnNames)  ## shapley-based-divergence(original,masked)
##    testMeILMeanShapley (x, y, testX, testY, modelRegression, listColumnNames, masking, loParameters)   ## results for masking using several parameters
## Plotting: 
##    plotLoResults (lox, mmethod, loLoParams, which=[0,1,2,3], myFileName="", whichCurves=[], wLeg=[], pltYLim=[])
##

## To execute this file we need to run these other files: 
## exec(open("prog.vectors.matrices.web.txt").read())
## exec(open("prog.choquet.web.txt").read())
## exec(open("prog.sdc.web.txt").read())


import random
import numpy as np
import numpy.linalg as lina
import matplotlib.pyplot as plt
import statistics
import math
import sklearn
import time

import pandas as pd
from scipy.stats import spearmanr

## -------------------------------------------------------------------------
##   Build games, compute Shapley values, and interactions
## -------------------------------------------------------------------------

def meBuildGame (myModel, nInputs, meanInput, meanOutput, realInput):
    """ 
    Example: 
    meBuildGame (meModelExample, 4, [2,2,2,2], meModelExample([2,2,2,2]), [1,2,3,4])
    ciShapley(meBuildGame (meModelExample, 4, [2,2,2,2], meModelExample([2,2,2,2]), [1,2,3,4]),4)
    meModelExample([2,2,2,2]) + sum(ciShapley(meBuildGame (meModelExample, 4, [2,2,2,2], meModelExample([2,2,2,2]), [1,2,3,4]),4)) == meModelExample([1,2,3,4])
    """
    game = [0]*(2**nInputs)
    mySetA = firstSet(nInputs)
    set2IntA = 0
    while notLastSet(mySetA):
        mySetA = nextSet(mySetA, nInputs)
        set2IntA = set2IntA+1
        mySetInputs = [x[1] if x[0]==0 else x[2] for x in zip(mySetA, meanInput, realInput)]
        game[set2IntA] = myModel(mySetInputs) - meanOutput
    return(game)


def meComputeAllShapley(myModel, nInputs, meanInput, meanOutput, data):
    allShapley = []
    for nExample in range(0,len(data)):
        allShapley.append(ciShapley (meBuildGame (myModel, nInputs, meanInput, meanOutput, data[nExample]),nInputs))
    return(allShapley)


def meComputeAll2Interactions(myModel, nInputs, meanInput, meanOutput, data):
    allShapley = []
    for nExample in range(0,len(data)):
        allShapley.append(ci2Interactions (meBuildGame (myModel, nInputs, meanInput, meanOutput, data[nExample]),nInputs))
    return(allShapley)


## -----------------------------------------
## Auxiliary functions for building / applying models
## -----------------------------------------


## Function:
##   Simple example of a "model" to be used in testing
##   ML models will have this structure: given an input list return a number
def meModelExample (x):
    """ 
    Example: 
    meModelExample([1,2,3,4])
    """
    return (vectorProduct(x,[0.1,0.2,0.3,0.4]))

## Function: 
##    A function so that we can build the game from models learnt using sklearn,
def meMyModelPredict (listColumnNames, model):
    """
    Function: 
    # Given a model of sklearn, 
    # and list of column names (as in a dataframe for sklearn)
    # return a function that given a list computes its output
    # This function is to compute Shapley values for models in sklearn
    """
    def myModel (myInputs):
        oneRowDataFrame = pd.DataFrame(np.array([myInputs]), columns = listColumnNames)
        return (model.predict(oneRowDataFrame)[0])
    return(myModel)

## Function: 
##    A function so that we can build the game from models learnt using sklearn,
##    when we are using non-linear regression using polynomials
def meMyPolyModelPredict (poly, model):
    """
    Function: 
    # Given polynomial features
    # and a linear model based on these features
    # return a function that given a list corresponding to the 
    # standard input (not to the polynomial features),
    # it computes its output. 
    # This function is to be used to compute the game and the Shapley values
    """
    def myModel (myInputs):
        return (model.predict(poly.fit_transform([myInputs]))[0])
    return(myModel)


## ------------------------------------------------
## Functions to prepare and perform the experiments
## ------------------------------------------------

## Function:
##    Given a training set (trX, trY), compute Shapley for a subset (subsetX) given a sklearn model.
## Input:
##    subsetX: list of records for which we compute the Shapley
##    listColumnNames: names of the columns/attributes (inputs) for applying the model
##    also2Interactions: should we also compute interactions (not only Shapley)
## Return:
##    a list of all Shapley (a vector), or a list of all interactions (2D array)
def meTestAllShapleySklearn (trX, trY, subsetX, sklearnModel, listColumnNames, also2Interactions=False):
    model = sklearnModel()
    model.fit(trX, trY)
    #explainer = shap.Explainer(model.predict, pd.DataFrame(np.array(trX), columns = listColumnNames))
    #shap_values = explainer(pd.DataFrame(np.array(subsetX), columns = listColumnNames))
    meanInput = matrixColumnMeans(trX)
    # return (meComputeAllShapley(meMyModelPredict(listColumnNames, model), len(listColumnNames), meanInput, mean(trY), np.array(subsetX)), shap_values)
    if also2Interactions:
        return (meComputeAllShapley(meMyModelPredict(listColumnNames, model), len(listColumnNames), meanInput, mean(trY), np.array(subsetX)),
                meComputeAll2Interactions(meMyModelPredict(listColumnNames, model), len(listColumnNames), meanInput, mean(trY), np.array(subsetX)))
    return meComputeAllShapley(meMyModelPredict(listColumnNames, model), len(listColumnNames), meanInput, mean(trY), np.array(subsetX))

## Function:
##    Given a training set (trX, trY), compute Shapley for a subset (subsetX) given a sklearn model.
##    In this case, polynomial features up to degree 2 are used
def meTestAllShapleyPolyMod (trX, trY, subsetX, sklearnModel, listColumnNames, also2Interactions=False):
    # Data is extended to a given degree
    poly = sklearn.preprocessing.PolynomialFeatures(degree=2)
    trX_ = poly.fit_transform(trX)
    # generate the regression object
    model = sklearnModel() #                                         sklearn.linear_model.LinearRegression()
    #preform the actual regression
    model.fit(trX_, trY)
    meanInput = matrixColumnMeans(trX)
    if also2Interactions:
        return (meComputeAllShapley(meMyPolyModelPredict(poly,model), len(listColumnNames), meanInput, mean(trY), np.array(subsetX)),
                meComputeAll2Interactions(meMyPolyModelPredict(poly,model), len(listColumnNames), meanInput, mean(trY), np.array(subsetX)))
    return meComputeAllShapley(meMyPolyModelPredict(poly,model), len(listColumnNames), meanInput, mean(trY), np.array(subsetX))


## Function: 
##    Building training and test datasets. 
## Input:
##    data: input attributes, as list of records
##    target: output variable, as list of values
##    nPoints: n records for training
def meMyTrainTest (data, target, nPoints):  ## data and target
    selected = np.random.choice(range(0,len(data)),nPoints,replace=False)
    trainX, testX = matrixSelection(data,selected)
    # trainX, testX = matrixSelection(data,selected)
    trainY, testY = matrixSelection(target,selected)
    #X = np.matrix(data)[selected]
    #y = np.array(target)[np.random.choice(range(0,len(data)),nPoints,replace=False)]
    return trainX, trainY, testX, testY


## Compare this implementation with Shap.
## Function: 
##    similar to meTestAllShapleySklearn, but also returning
##    shapley values computed from Shap package, to compare both results
## Use the following to compare: 
##    trX, trY, teX, teY = meMyTrainTest(california_housing.data.values.tolist(), california_housing.target.values.tolist(), 2)
##    myResult = meTestAllShapleySklearnCompareOurShap (trX, trY, matrixSelection(teX,[0,1,2])[0], sklearn.linear_model.LinearRegression, california_housing.feature_names)
## Example:
##    Compare, first computed here and the second computed using Shap package
##       myResult[0][0] and myResult[1][0]
##       myResult[0][1] and myResult[1][1]
##       myResult[0][2] and myResult[1][2]
def meTestAllShapleySklearnCompareOurShap (trX, trY, subsetX, sklearnModel, listColumnNames):
    model = sklearnModel()
    model.fit(trX, trY)
    explainer = shap.Explainer(model.predict, pd.DataFrame(np.array(trX), columns = listColumnNames))
    shap_values = explainer(pd.DataFrame(np.array(subsetX), columns = listColumnNames))
    meanInput = matrixColumnMeans(trX)
    return (meComputeAllShapley(meMyModelPredict(listColumnNames, model), len(listColumnNames), meanInput, mean(trY), np.array(subsetX)), shap_values.values)


## -----------------------------------------------------------------------------------------------
## Main functions to compute Shapley values for the data and the regression model  ---------------
## -----------------------------------------------------------------------------------------------


def meILShapleyMeansXp (shapleyX, meanShapleyX, xp, y, testX, testY, modelRegression, listColumnNames):
    """
    Function: 
    (1) Compares mean shap values for x and xp, for a given model and a testX set of records
    (2) Compares shap values for x and xp, and computes mean
    (3) Returns mean shap 
    Here, the mean shap of x is assumed to be known: meanShapleyX
    It uses the norm and the spearman rank correlation coefficient.
    """
    allShapley_p = meTestAllShapleySklearn (xp, y, testX, modelRegression, listColumnNames)
    meanShapley_p = matrixColumnMeansAbs(allShapley_p)
    shapleyMean = mean(list(map(lambda shapley1, shapley2: fNorm(shapley1, shapley2), shapleyX, allShapley_p)))
    shapleyMeanR = mean(list(map(lambda shapley1, shapley2: spearmanr(shapley1, shapley2)[0], shapleyX, allShapley_p)))
    # print(meanShapley_p)
    return (fNorm (meanShapleyX, meanShapley_p),spearmanr(meanShapleyX, meanShapley_p)[0], shapleyMean, shapleyMeanR, meanShapley_p)

def testMeILMeanShapley (x, y, testX, testY, modelRegression, listColumnNames, masking, loParameters):
    allShapley_x = meTestAllShapleySklearn (x, y, testX, modelRegression, listColumnNames)
    meanShapley_x = matrixColumnMeansAbs(allShapley_x)
    print("Mean Shap original")
    print(meanShapley_x)
    SDCerror = [(0,0,0,0,meanShapley_x)]
    for k in loParameters:
        print (k)
        SDCerror.append(meILShapleyMeansXp (allShapley_x, meanShapley_x, masking(x,k), y, testX, testY, modelRegression, listColumnNames))
    return(SDCerror)

## test = testMeILMeanShapley (trDX, trDY, teDX, teDY, sklearn.linear_model.LinearRegression,diabetes.feature_names, mdav, [2,3])

def plotLoResults (lox, mmethod, loLoParams, which=[0,1,2,3], myFileName="", whichCurves=[], wLeg=[], pltYLim=[]):
    plt.close()
    plt.show()
    # plt.clf()
    plt.ion()
    if wLeg == []:
        wLeg = [""]*len(lox)
    if whichCurves == []:
        whichCurves = list(range(0,len(lox)))
    if (1 in which) or (3 in which):
        plt.ylim([-1,1])
    if (pltYLim!=[]):
        plt.ylim(pltYLim)
    #print(wLeg)
    #print(wLeg[1])
    #print("Curves="+str(whichCurves))
    for i in whichCurves:
        #print("Here:")
        #print(i)
        if 0 in which:
            plt.plot(loLoParams[i],(list(map(lambda x:x[0], lox[i]))), label=wLeg[i]+"dm") # "d(m(S),m(S'))")
        if 1 in which:
            plt.plot(loLoParams[i],(list(map(lambda x:x[1], lox[i]))), label=wLeg[i]+"Rm")
        if 2 in which:
            plt.plot(loLoParams[i],(list(map(lambda x:x[2], lox[i]))), label=wLeg[i]+"md") # (d(s,s))")
        if 3 in which:
            plt.plot(loLoParams[i],(list(map(lambda x:x[3], lox[i]))), label=wLeg[i]+"mR") # (R(s,s))")
    plt.legend()
    plt.xlabel('parameters')
    plt.ylabel('distances')
    plt.title(mmethod)
    if myFileName!="":
        plt.savefig(myFileName, format='eps')
    return(0)

##
## Our implementation vs Shap implementation
## 
def meATestFromShap():
    """
    Function: This is to reproduce an example from:
       https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html
    with my shap calculations
    """
    # 
    # a classic housing price dataset
    X,y = shap.datasets.california(n_points=1000)
    # 
    X100 = shap.utils.sample(X, 100) # 100 instances for use as the background distribution
    # 
    # a simple linear model
    model = sklearn.linear_model.LinearRegression()
    model.fit(X, y)
    # 
    print("Model coefficients:\n")
    for i in range(X.shape[1]):
        print(X.columns[i], "=", model.coef_[i].round(5))
    #
    # compute the SHAP values for the linear model
    explainer = shap.Explainer(model.predict, X100)
    shap_values = explainer(X)
    #
    sample_ind = 20
    shap.partial_dependence_plot(
        "MedInc", model.predict, X100, model_expected_value=True,
        feature_expected_value=True, ice=False,
        shap_values=shap_values[sample_ind:sample_ind+1,:]
    )
    #
    print("SHAP uses the following example: sample_ind = 20, with prediction 2.84553.")
    print("  Observe: prediction(sample_ind=20)="+
          str(model.predict(pd.DataFrame(np.array([list(X.iloc[sample_ind])]), columns = list(X.columns)))))
    print("SHAP has E(f(x))=2.215, my average of output is 2.0723. Observe: "+str(y.mean())+", prediction(mean)="+
          str(model.predict(pd.DataFrame(np.array([list(X.mean())]), columns = list(X.columns)))))
    #
    # model.predict(pd.DataFrame(np.array([list(X.iloc[sample_ind])]), columns = list(X.columns)))
    ### array([2.84553121])
    # model.predict(pd.DataFrame(np.array([list(X.mean())]), columns = list(X.columns)))
    ### array([2.07236148])
    # y.mean()
    ### 2.07236148
    print("My shap values for this instance:"+
          str(ciShapley (meBuildGame (meMyModelPredict(list(X.columns), model), 8, list(X.mean()), 2.215, list(X.iloc[sample_ind])),8)))
    ## [1.0610125371096228, -0.2360828191399687, -0.13854656707724222, -0.10003378218727547, 0.052154055383100764, -0.02189353187644981, 0.6328362370928394, -0.6189149174119997]
    print("Their shap values for this instance:"+
          str(shap_values[sample_ind].values))
    ## array([0.92038342,-0.2210084 ,-0.11332664,-0.04333158,0.0743623 ,0.00746375,0.53112064, -0.52487211])
    print("Prediction goal:"+str(model.predict(pd.DataFrame(np.array([list(X.iloc[sample_ind])]), columns = list(X.columns)))))
    # array([2.84553121])
    print("This is correct in my case (mean value + my shapley):"+
          str(2.215 + sum(ciShapley (meBuildGame (meMyModelPredict(list(X.columns), model), 8, list(X.mean()), 2.215, list(X.iloc[sample_ind])),8))))
    ## 2.8455312118926273
    print("This is also correct in their case (mean value + their shapley):"+
          str(2.215 + sum(shap_values[sample_ind].values)))
    ## 2.8457913791810583
    

def meAnotherExampleNonLinear ():
    import xgboost
    X,y = shap.datasets.california(n_points=1000)
    X100 = shap.utils.sample(X, 100) # 100 instances for use as the background distribution
    sample_ind = 20
    # a classic adult census dataset price dataset
    X_adult,y_adult = shap.datasets.adult()
    # a simple linear logistic model
    model_adult = sklearn.linear_model.LogisticRegression(max_iter=10000)
    model_adult.fit(X_adult, y_adult)
    # compute the SHAP values for the linear model
    background_adult = shap.maskers.Independent(X_adult, max_samples=100)
    explainer = shap.Explainer(model_adult_proba, background_adult)
    shap_values_adult = explainer(X_adult[:1000])
    ## END OF PRELIMINARIES ==========================
    model_xgb = xgboost.XGBRegressor(n_estimators=100, max_depth=2).fit(X, y)
    #
    # explain the GAM model with SHAP
    explainer_xgb = shap.Explainer(model_xgb, X100)
    shap_values_xgb = explainer_xgb(X)
    #
    # make a standard partial dependence plot with a single SHAP value overlaid
    fig,ax = shap.partial_dependence_plot(
        "MedInc", model_xgb.predict, X100, model_expected_value=True,
        feature_expected_value=True, show=False, ice=False,
        shap_values=shap_values_xgb[sample_ind:sample_ind+1,:]
    )
    # 
    # train XGBoost model
    model = xgboost.XGBClassifier(n_estimators=100, max_depth=2).fit(X_adult, y_adult*1, eval_metric="logloss")
    # 
    # compute SHAP values
    explainer = shap.Explainer(model, background_adult)
    shap_values = explainer(X_adult)
    # 
    # set a display version of the data to use for plotting (has string values)
    shap_values.display_data = shap.datasets.adult(display=True)[0].values
    shap.plots.bar(shap_values)
    return(0)

###
### The last two functions are to test SHAP and compare SHAP and this implementation
### 
# import shap
### a classic adult census dataset price dataset
#X_adult,y_adult = shap.datasets.adult()
#
### a simple linear logistic model
#model_adult = sklearn.linear_model.LogisticRegression(max_iter=10000)
#model_adult.fit(X_adult, y_adult)
#
#def model_adult_proba(x):
#    return model_adult.predict_proba(x)[:,1]
#
#def model_adult_log_odds(x):
#    p = model_adult.predict_log_proba(x)
#    return p[:,1] - p[:,0]
#
#
### EXAMPLE: Comparison
#
# meATestFromShap()
###
### EXAMPLE: testing SHAP
###
# meAnotherExampleNonLinear ()