## Software for masking + explainability ## (c) Vicenç Torra ## Current version: 20221201 ## ## References: ## A. Bozorgpanah, V. Torra, L. Aliahmadipour, ## Privacy and explainability: the effects of data protection on Shapley values, ## Technologies 10:6 (2022) 125 ## https://doi.org/10.3390/technologies10060125 ## ## The software computes the Shapley values for machine learning models, ## we use our own implementation to build a game, and we use our own ## implementation to compute the Shapley value for a game. ## ## Main functions: ## ## Computation of results: ## meILShapleyMeansXp (shapleyX, meanShapleyX, xp, y, testX, testY, modelRegression, listColumnNames) ## shapley-based-divergence(original,masked) ## testMeILMeanShapley (x, y, testX, testY, modelRegression, listColumnNames, masking, loParameters) ## results for masking using several parameters ## Plotting: ## plotLoResults (lox, mmethod, loLoParams, which=[0,1,2,3], myFileName="", whichCurves=[], wLeg=[], pltYLim=[]) ## ## To execute this file we need to run these other files: ## exec(open("prog.vectors.matrices.web.txt").read()) ## exec(open("prog.choquet.web.txt").read()) ## exec(open("prog.sdc.web.txt").read()) import random import numpy as np import numpy.linalg as lina import matplotlib.pyplot as plt import statistics import math import sklearn import time import pandas as pd from scipy.stats import spearmanr ## ------------------------------------------------------------------------- ## Build games, compute Shapley values, and interactions ## ------------------------------------------------------------------------- def meBuildGame (myModel, nInputs, meanInput, meanOutput, realInput): """ Example: meBuildGame (meModelExample, 4, [2,2,2,2], meModelExample([2,2,2,2]), [1,2,3,4]) ciShapley(meBuildGame (meModelExample, 4, [2,2,2,2], meModelExample([2,2,2,2]), [1,2,3,4]),4) meModelExample([2,2,2,2]) + sum(ciShapley(meBuildGame (meModelExample, 4, [2,2,2,2], meModelExample([2,2,2,2]), [1,2,3,4]),4)) == meModelExample([1,2,3,4]) """ game = [0]*(2**nInputs) mySetA = firstSet(nInputs) set2IntA = 0 while notLastSet(mySetA): mySetA = nextSet(mySetA, nInputs) set2IntA = set2IntA+1 mySetInputs = [x[1] if x[0]==0 else x[2] for x in zip(mySetA, meanInput, realInput)] game[set2IntA] = myModel(mySetInputs) - meanOutput return(game) def meComputeAllShapley(myModel, nInputs, meanInput, meanOutput, data): allShapley = [] for nExample in range(0,len(data)): allShapley.append(ciShapley (meBuildGame (myModel, nInputs, meanInput, meanOutput, data[nExample]),nInputs)) return(allShapley) def meComputeAll2Interactions(myModel, nInputs, meanInput, meanOutput, data): allShapley = [] for nExample in range(0,len(data)): allShapley.append(ci2Interactions (meBuildGame (myModel, nInputs, meanInput, meanOutput, data[nExample]),nInputs)) return(allShapley) ## ----------------------------------------- ## Auxiliary functions for building / applying models ## ----------------------------------------- ## Function: ## Simple example of a "model" to be used in testing ## ML models will have this structure: given an input list return a number def meModelExample (x): """ Example: meModelExample([1,2,3,4]) """ return (vectorProduct(x,[0.1,0.2,0.3,0.4])) ## Function: ## A function so that we can build the game from models learnt using sklearn, def meMyModelPredict (listColumnNames, model): """ Function: # Given a model of sklearn, # and list of column names (as in a dataframe for sklearn) # return a function that given a list computes its output # This function is to compute Shapley values for models in sklearn """ def myModel (myInputs): oneRowDataFrame = pd.DataFrame(np.array([myInputs]), columns = listColumnNames) return (model.predict(oneRowDataFrame)[0]) return(myModel) ## Function: ## A function so that we can build the game from models learnt using sklearn, ## when we are using non-linear regression using polynomials def meMyPolyModelPredict (poly, model): """ Function: # Given polynomial features # and a linear model based on these features # return a function that given a list corresponding to the # standard input (not to the polynomial features), # it computes its output. # This function is to be used to compute the game and the Shapley values """ def myModel (myInputs): return (model.predict(poly.fit_transform([myInputs]))[0]) return(myModel) ## ------------------------------------------------ ## Functions to prepare and perform the experiments ## ------------------------------------------------ ## Function: ## Given a training set (trX, trY), compute Shapley for a subset (subsetX) given a sklearn model. ## Input: ## subsetX: list of records for which we compute the Shapley ## listColumnNames: names of the columns/attributes (inputs) for applying the model ## also2Interactions: should we also compute interactions (not only Shapley) ## Return: ## a list of all Shapley (a vector), or a list of all interactions (2D array) def meTestAllShapleySklearn (trX, trY, subsetX, sklearnModel, listColumnNames, also2Interactions=False): model = sklearnModel() model.fit(trX, trY) #explainer = shap.Explainer(model.predict, pd.DataFrame(np.array(trX), columns = listColumnNames)) #shap_values = explainer(pd.DataFrame(np.array(subsetX), columns = listColumnNames)) meanInput = matrixColumnMeans(trX) # return (meComputeAllShapley(meMyModelPredict(listColumnNames, model), len(listColumnNames), meanInput, mean(trY), np.array(subsetX)), shap_values) if also2Interactions: return (meComputeAllShapley(meMyModelPredict(listColumnNames, model), len(listColumnNames), meanInput, mean(trY), np.array(subsetX)), meComputeAll2Interactions(meMyModelPredict(listColumnNames, model), len(listColumnNames), meanInput, mean(trY), np.array(subsetX))) return meComputeAllShapley(meMyModelPredict(listColumnNames, model), len(listColumnNames), meanInput, mean(trY), np.array(subsetX)) ## Function: ## Given a training set (trX, trY), compute Shapley for a subset (subsetX) given a sklearn model. ## In this case, polynomial features up to degree 2 are used def meTestAllShapleyPolyMod (trX, trY, subsetX, sklearnModel, listColumnNames, also2Interactions=False): # Data is extended to a given degree poly = sklearn.preprocessing.PolynomialFeatures(degree=2) trX_ = poly.fit_transform(trX) # generate the regression object model = sklearnModel() # sklearn.linear_model.LinearRegression() #preform the actual regression model.fit(trX_, trY) meanInput = matrixColumnMeans(trX) if also2Interactions: return (meComputeAllShapley(meMyPolyModelPredict(poly,model), len(listColumnNames), meanInput, mean(trY), np.array(subsetX)), meComputeAll2Interactions(meMyPolyModelPredict(poly,model), len(listColumnNames), meanInput, mean(trY), np.array(subsetX))) return meComputeAllShapley(meMyPolyModelPredict(poly,model), len(listColumnNames), meanInput, mean(trY), np.array(subsetX)) ## Function: ## Building training and test datasets. ## Input: ## data: input attributes, as list of records ## target: output variable, as list of values ## nPoints: n records for training def meMyTrainTest (data, target, nPoints): ## data and target selected = np.random.choice(range(0,len(data)),nPoints,replace=False) trainX, testX = matrixSelection(data,selected) # trainX, testX = matrixSelection(data,selected) trainY, testY = matrixSelection(target,selected) #X = np.matrix(data)[selected] #y = np.array(target)[np.random.choice(range(0,len(data)),nPoints,replace=False)] return trainX, trainY, testX, testY ## Compare this implementation with Shap. ## Function: ## similar to meTestAllShapleySklearn, but also returning ## shapley values computed from Shap package, to compare both results ## Use the following to compare: ## trX, trY, teX, teY = meMyTrainTest(california_housing.data.values.tolist(), california_housing.target.values.tolist(), 2) ## myResult = meTestAllShapleySklearnCompareOurShap (trX, trY, matrixSelection(teX,[0,1,2])[0], sklearn.linear_model.LinearRegression, california_housing.feature_names) ## Example: ## Compare, first computed here and the second computed using Shap package ## myResult[0][0] and myResult[1][0] ## myResult[0][1] and myResult[1][1] ## myResult[0][2] and myResult[1][2] def meTestAllShapleySklearnCompareOurShap (trX, trY, subsetX, sklearnModel, listColumnNames): model = sklearnModel() model.fit(trX, trY) explainer = shap.Explainer(model.predict, pd.DataFrame(np.array(trX), columns = listColumnNames)) shap_values = explainer(pd.DataFrame(np.array(subsetX), columns = listColumnNames)) meanInput = matrixColumnMeans(trX) return (meComputeAllShapley(meMyModelPredict(listColumnNames, model), len(listColumnNames), meanInput, mean(trY), np.array(subsetX)), shap_values.values) ## ----------------------------------------------------------------------------------------------- ## Main functions to compute Shapley values for the data and the regression model --------------- ## ----------------------------------------------------------------------------------------------- def meILShapleyMeansXp (shapleyX, meanShapleyX, xp, y, testX, testY, modelRegression, listColumnNames): """ Function: (1) Compares mean shap values for x and xp, for a given model and a testX set of records (2) Compares shap values for x and xp, and computes mean (3) Returns mean shap Here, the mean shap of x is assumed to be known: meanShapleyX It uses the norm and the spearman rank correlation coefficient. """ allShapley_p = meTestAllShapleySklearn (xp, y, testX, modelRegression, listColumnNames) meanShapley_p = matrixColumnMeansAbs(allShapley_p) shapleyMean = mean(list(map(lambda shapley1, shapley2: fNorm(shapley1, shapley2), shapleyX, allShapley_p))) shapleyMeanR = mean(list(map(lambda shapley1, shapley2: spearmanr(shapley1, shapley2)[0], shapleyX, allShapley_p))) # print(meanShapley_p) return (fNorm (meanShapleyX, meanShapley_p),spearmanr(meanShapleyX, meanShapley_p)[0], shapleyMean, shapleyMeanR, meanShapley_p) def testMeILMeanShapley (x, y, testX, testY, modelRegression, listColumnNames, masking, loParameters): allShapley_x = meTestAllShapleySklearn (x, y, testX, modelRegression, listColumnNames) meanShapley_x = matrixColumnMeansAbs(allShapley_x) print("Mean Shap original") print(meanShapley_x) SDCerror = [(0,0,0,0,meanShapley_x)] for k in loParameters: print (k) SDCerror.append(meILShapleyMeansXp (allShapley_x, meanShapley_x, masking(x,k), y, testX, testY, modelRegression, listColumnNames)) return(SDCerror) ## test = testMeILMeanShapley (trDX, trDY, teDX, teDY, sklearn.linear_model.LinearRegression,diabetes.feature_names, mdav, [2,3]) def plotLoResults (lox, mmethod, loLoParams, which=[0,1,2,3], myFileName="", whichCurves=[], wLeg=[], pltYLim=[]): plt.close() plt.show() # plt.clf() plt.ion() if wLeg == []: wLeg = [""]*len(lox) if whichCurves == []: whichCurves = list(range(0,len(lox))) if (1 in which) or (3 in which): plt.ylim([-1,1]) if (pltYLim!=[]): plt.ylim(pltYLim) #print(wLeg) #print(wLeg[1]) #print("Curves="+str(whichCurves)) for i in whichCurves: #print("Here:") #print(i) if 0 in which: plt.plot(loLoParams[i],(list(map(lambda x:x[0], lox[i]))), label=wLeg[i]+"dm") # "d(m(S),m(S'))") if 1 in which: plt.plot(loLoParams[i],(list(map(lambda x:x[1], lox[i]))), label=wLeg[i]+"Rm") if 2 in which: plt.plot(loLoParams[i],(list(map(lambda x:x[2], lox[i]))), label=wLeg[i]+"md") # (d(s,s))") if 3 in which: plt.plot(loLoParams[i],(list(map(lambda x:x[3], lox[i]))), label=wLeg[i]+"mR") # (R(s,s))") plt.legend() plt.xlabel('parameters') plt.ylabel('distances') plt.title(mmethod) if myFileName!="": plt.savefig(myFileName, format='eps') return(0) ## ## Our implementation vs Shap implementation ## def meATestFromShap(): """ Function: This is to reproduce an example from: https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html with my shap calculations """ # # a classic housing price dataset X,y = shap.datasets.california(n_points=1000) # X100 = shap.utils.sample(X, 100) # 100 instances for use as the background distribution # # a simple linear model model = sklearn.linear_model.LinearRegression() model.fit(X, y) # print("Model coefficients:\n") for i in range(X.shape[1]): print(X.columns[i], "=", model.coef_[i].round(5)) # # compute the SHAP values for the linear model explainer = shap.Explainer(model.predict, X100) shap_values = explainer(X) # sample_ind = 20 shap.partial_dependence_plot( "MedInc", model.predict, X100, model_expected_value=True, feature_expected_value=True, ice=False, shap_values=shap_values[sample_ind:sample_ind+1,:] ) # print("SHAP uses the following example: sample_ind = 20, with prediction 2.84553.") print(" Observe: prediction(sample_ind=20)="+ str(model.predict(pd.DataFrame(np.array([list(X.iloc[sample_ind])]), columns = list(X.columns))))) print("SHAP has E(f(x))=2.215, my average of output is 2.0723. Observe: "+str(y.mean())+", prediction(mean)="+ str(model.predict(pd.DataFrame(np.array([list(X.mean())]), columns = list(X.columns))))) # # model.predict(pd.DataFrame(np.array([list(X.iloc[sample_ind])]), columns = list(X.columns))) ### array([2.84553121]) # model.predict(pd.DataFrame(np.array([list(X.mean())]), columns = list(X.columns))) ### array([2.07236148]) # y.mean() ### 2.07236148 print("My shap values for this instance:"+ str(ciShapley (meBuildGame (meMyModelPredict(list(X.columns), model), 8, list(X.mean()), 2.215, list(X.iloc[sample_ind])),8))) ## [1.0610125371096228, -0.2360828191399687, -0.13854656707724222, -0.10003378218727547, 0.052154055383100764, -0.02189353187644981, 0.6328362370928394, -0.6189149174119997] print("Their shap values for this instance:"+ str(shap_values[sample_ind].values)) ## array([0.92038342,-0.2210084 ,-0.11332664,-0.04333158,0.0743623 ,0.00746375,0.53112064, -0.52487211]) print("Prediction goal:"+str(model.predict(pd.DataFrame(np.array([list(X.iloc[sample_ind])]), columns = list(X.columns))))) # array([2.84553121]) print("This is correct in my case (mean value + my shapley):"+ str(2.215 + sum(ciShapley (meBuildGame (meMyModelPredict(list(X.columns), model), 8, list(X.mean()), 2.215, list(X.iloc[sample_ind])),8)))) ## 2.8455312118926273 print("This is also correct in their case (mean value + their shapley):"+ str(2.215 + sum(shap_values[sample_ind].values))) ## 2.8457913791810583 def meAnotherExampleNonLinear (): import xgboost X,y = shap.datasets.california(n_points=1000) X100 = shap.utils.sample(X, 100) # 100 instances for use as the background distribution sample_ind = 20 # a classic adult census dataset price dataset X_adult,y_adult = shap.datasets.adult() # a simple linear logistic model model_adult = sklearn.linear_model.LogisticRegression(max_iter=10000) model_adult.fit(X_adult, y_adult) # compute the SHAP values for the linear model background_adult = shap.maskers.Independent(X_adult, max_samples=100) explainer = shap.Explainer(model_adult_proba, background_adult) shap_values_adult = explainer(X_adult[:1000]) ## END OF PRELIMINARIES ========================== model_xgb = xgboost.XGBRegressor(n_estimators=100, max_depth=2).fit(X, y) # # explain the GAM model with SHAP explainer_xgb = shap.Explainer(model_xgb, X100) shap_values_xgb = explainer_xgb(X) # # make a standard partial dependence plot with a single SHAP value overlaid fig,ax = shap.partial_dependence_plot( "MedInc", model_xgb.predict, X100, model_expected_value=True, feature_expected_value=True, show=False, ice=False, shap_values=shap_values_xgb[sample_ind:sample_ind+1,:] ) # # train XGBoost model model = xgboost.XGBClassifier(n_estimators=100, max_depth=2).fit(X_adult, y_adult*1, eval_metric="logloss") # # compute SHAP values explainer = shap.Explainer(model, background_adult) shap_values = explainer(X_adult) # # set a display version of the data to use for plotting (has string values) shap_values.display_data = shap.datasets.adult(display=True)[0].values shap.plots.bar(shap_values) return(0) ### ### The last two functions are to test SHAP and compare SHAP and this implementation ### # import shap ### a classic adult census dataset price dataset #X_adult,y_adult = shap.datasets.adult() # ### a simple linear logistic model #model_adult = sklearn.linear_model.LogisticRegression(max_iter=10000) #model_adult.fit(X_adult, y_adult) # #def model_adult_proba(x): # return model_adult.predict_proba(x)[:,1] # #def model_adult_log_odds(x): # p = model_adult.predict_log_proba(x) # return p[:,1] - p[:,0] # # ### EXAMPLE: Comparison # # meATestFromShap() ### ### EXAMPLE: testing SHAP ### # meAnotherExampleNonLinear ()