## Software for masking + explainability
## (c) Vicenç Torra
## Current version: 20221201
##
## This is the code for the paper
##   V. Torra, A systematic construction of non-i.i.d. data sets from a single data set
##   Knowl Inf Syst (2022). https://doi.org/10.1007/s10115-022-01785-3
##
## MAIN FUNCTIONS:
##   create_solution_weights (numInstancesClass, classesPerDevice, nCopies, vecWeights=None)
##   create_assignment (y, mtDevClas2Records)
##

import random
import time
from itertools import combinations

import numpy as np
import numpy.linalg as lina
from scipy.optimize import linprog

## $ pip install cvxopt
# cvxopt is only needed by create_solution_weights and scikit-learn only by
# exampleIrisDataSet; guarding the imports keeps every other helper usable
# when one of these optional packages is not installed.
try:
    import cvxopt as opt
except ImportError:
    opt = None

try:
    import sklearn as sk
    import sklearn.datasets
except ImportError:
    sk = None


def subsets_Size_NElem(s, nElems):
    """Return every 0/1 indicator vector of length s with exactly nElems ones.

    A set is represented as a list of s floats (0.0 or 1.0). The vectors are
    returned in ascending order of the binary number they spell.

    Example: subsets_Size_NElem(4, 2)
    Output: [[0.0, 0.0, 1.0, 1.0], [0.0, 1.0, 0.0, 1.0], [0.0, 1.0, 1.0, 0.0],
             [1.0, 0.0, 0.0, 1.0], [1.0, 0.0, 1.0, 0.0], [1.0, 1.0, 0.0, 0.0]]
    """
    if nElems < 0 or nElems > s:
        return []
    res = []
    # Index tuples in reversed lexicographic order correspond exactly to the
    # bit strings in ascending binary order, so the result order is the same
    # as enumerating all 2**s strings and filtering, without the 2**s work.
    for ones in reversed(list(combinations(range(s), nElems))):
        vec = [0.0] * s
        for pos in ones:
            vec[pos] = 1.0
        res.append(vec)
    return res


def multLL(lista, listb):
    """Multiply two lists elementwise (truncating to the shorter one).

    Example: multLL([1,2,3],[4,5,6]) ==> [4, 10, 18]
    """
    return [a * b for a, b in zip(lista, listb)]


def create_lhs_eqCol(nClass, nVars, nDev, vecVars):
    """Left-hand side of the column (per-class) constraints with unit weights.

    Same as create_lhs_eqCol_weights with every device weight equal to 1.0.
    """
    return create_lhs_eqCol_weights(nClass, nVars, nDev, vecVars, [1.0] * nDev)


def create_lhs_eqCol_weights(nClass, nVars, nDev, vecVars, vecWeights):
    """Left-hand side of the column (per-class) constraints, weighted by device.

    Variables are laid out device by device: the variable of device j and
    class i sits at position i + j*nClass. Row i of the result holds
    vecWeights[j] at the position of (device j, class i) and is then masked
    elementwise by vecVars, so variables that are switched off (0 in vecVars)
    do not take part in the constraint.

    Returns a list of nClass rows, each of length nVars.
    """
    res = []
    for i in range(nClass):
        constrCol = [0.0] * nVars
        for j in range(nDev):
            constrCol[i + j * nClass] = vecWeights[j]
        res.append(multLL(constrCol, vecVars))
    return res


def create_lhs_eqRow(nClass, nVars, nDev, vecVars):
    """Left-hand side of the row (per-device) constraints.

    Row i selects the nClass consecutive variables of device i (positions
    i*nClass .. i*nClass + nClass - 1), masked elementwise by vecVars.

    Returns a list of nDev rows, each of length nVars.
    """
    res = []
    for i in range(nDev):
        constrRow = [0.0] * nVars
        for j in range(nClass):
            constrRow[i * nClass + j] = 1.0
        res.append(multLL(constrRow, vecVars))
    return res


def matrMN(m, n):
    """Return an m x n matrix of zeros with -1.0 on the main diagonal.

    Example: matrMN(2,3) ==> array([[-1.0,0.0,0.0],[0.0,-1.0,0.0]])
    """
    mt = np.zeros((m, n))
    # Only min(m, n) diagonal entries exist in a rectangular matrix.
    for i in range(min(m, n)):
        mt[i][i] = -1.0
    return mt


def sumColsGetCol(mt):
    """Sum across the columns of each row; returns one total per row.

    Matrix given as list of rows.
    """
    # Materialised as a list: a bare map() is a one-shot iterator in Python 3.
    return list(map(sum, mt))


def sumRowsGetRow(mt):
    """Sum across the rows of each column; returns one total per column.

    Matrix given as list of rows.
    """
    return sumColsGetCol(np.transpose(mt))


## printMat([[1,2,3],[4,4,5]])
def printMat(mt):
    """Print a matrix (sequence of rows), one bracketed row per line."""
    for row in mt:
        printRow(row)


def printRow(rw):
    """Print one row as '[' + elements (each followed by two spaces) + ']'."""
    print("[", end='')
    for value in rw:
        print(value, ' ', end='')
    print("]")


## From list of variables to list of equations
## [0, 1, 1, 1, 0, 1, 1, 1, 0]
## probZeroLeft = makeEquationsForZeros(np.array([0, 1, 1, 1, 0, 1, 1, 1, 0]))
## probZeroRight = [0]*len(probZeroLeft)
def makeEquationsForZeros(lwZeros):
    """Build one unit-vector equation row for every zero entry of lwZeros.

    For each position where lwZeros is 0, the result contains a row of
    len(lwZeros) integers that is 1 at that position and 0 elsewhere.
    """
    # Coerce to an array: comparing a plain list with 0 yields a single
    # False, which would silently produce no equations at all.
    flags = np.asarray(lwZeros)
    nVars = len(flags)
    equations = []
    for pos in np.where(flags == 0)[0]:
        row = [0] * nVars
        row[pos] = 1
        equations.append(row)
    return equations


## MAIN FUNCTION
def create_solution_weights(numInstancesClass, classesPerDevice, nCopies, vecWeights=None):
    """Compute how many records of each class go to each device.

    Input:
      (1) numInstancesClass: list with the number of instances in each class
      (2) classesPerDevice: integer, number of classes in each device
          (the same for all devices)
      (3) nCopies: integer, number of copies of devices with the same classes
      (4) vecWeights: weight of each device, optional
          (if None or empty, all devices get the same weight 1/nDev)

    Output: list with one row per device and one entry per class; entry
      (i, j) is p_ij * weight_i * totalNRecords, where p_ij is the solved
      probability of class j in device i. The column constraints make the
      entries of column j add up to numInstancesClass[j] (up to solver
      tolerance).

    One device is created for every subset of classesPerDevice classes, and
    this family of devices is repeated nCopies times. The probabilities are
    obtained with a quadratic program that pulls them towards a random vector
    drawn from np.random, so results vary between calls unless np.random is
    seeded.

    Raises ImportError if cvxopt is not installed and ValueError if the
    arguments describe no device at all.

    Example: create_solution_weights ([50, 50, 50], 2, 2)
    """
    if opt is None:
        raise ImportError("create_solution_weights requires cvxopt (pip install cvxopt)")

    totalNRecords = sum(numInstancesClass)      # number of records
    nClass = len(numInstancesClass)             # number of classes
    oneSetofDevices = subsets_Size_NElem(nClass, classesPerDevice)
    nDev = nCopies * len(oneSetofDevices)       # number of devices
    nVars = nClass * nDev                       # number of variables
    if nDev == 0:
        raise ValueError("no device can be built from the given classesPerDevice/nCopies")

    # Proportion of records in each class: right-hand side of the column
    # constraints.
    sumCols = [count / totalNRecords for count in numInstancesClass]
    # 0/1 mask over the variables: 1 where the device holds the class.
    vecVars = list(np.array(oneSetofDevices * nCopies).flat)
    if vecWeights is None or len(vecWeights) == 0:
        vecWeights = [1 / nDev] * nDev

    #
    ### EQUALITY CONSTRAINTS
    #
    ### Columns: for every class, the weighted probabilities over the devices
    ### add up to the proportion of that class.
    lhs_eqCol = create_lhs_eqCol_weights(nClass, nVars, nDev, vecVars, vecWeights)
    rhs_eqCol = sumCols
    ### Rows: for every device, the class probabilities add up to one.
    lhs_eqRow = create_lhs_eqRow(nClass, nVars, nDev, vecVars)
    rhs_eqRow = [1.0] * nDev
    ### Variables of classes a device does not hold are forced to zero.
    lhs_probZero = makeEquationsForZeros(np.array(vecVars))
    rhs_probZero = [0] * len(lhs_probZero)

    # All equality constraints. Probabilities equal to zero go before the
    # others because the LAST equality may be deleted below.
    lp_lhs_eq = lhs_probZero + lhs_eqCol + lhs_eqRow
    lp_rhs_eq = rhs_probZero + rhs_eqCol + rhs_eqRow

    rankAllMat = lina.matrix_rank(lp_lhs_eq)
    newRank = lina.matrix_rank(lp_lhs_eq[0:len(lp_lhs_eq)-1])
    print("Rank matrix lp_lhs_eq, dim (and rank if full rank) = "+str(len(lp_lhs_eq)))
    print(str(rankAllMat))
    print("Rank of matrix lp_lhs_eq without the last row=")
    print("newRank"+str(newRank))

    # Solving a linear dependency:
    # one row of the matrix is linearly dependent on the others (adding all
    # row constraints and subtracting all column constraints gives zero).
    # Dropping the last row is tried; it is only applied when the remaining
    # matrix then has full row rank.
    if (len(lp_lhs_eq) != rankAllMat) and (newRank == (len(lp_lhs_eq)-1)):
        print("CORRECT RANK")
        lp_lhs_eq = lp_lhs_eq[0:len(lp_lhs_eq)-1]
        lp_rhs_eq = lp_rhs_eq[0:len(lp_rhs_eq)-1]

    mA = opt.matrix(np.array(lp_lhs_eq, dtype=float))
    mb = opt.matrix([float(v) for v in lp_rhs_eq], (len(lp_rhs_eq), 1))

    #
    # The objective function: Q = Id, L = -2 * alpha with alpha random.
    # NOTE(review): cvxopt's qp minimises (1/2) x'Qx + L'x, so with these
    # values the minimiser is pulled towards 2*alpha rather than alpha --
    # confirm this is intended. Left unchanged to keep results reproducible.
    #
    mQ = opt.matrix(np.identity(nVars))
    alphaRandomVector = np.random.uniform(0, 1, nVars)
    mL = opt.matrix(-1*2*alphaRandomVector, (nVars, 1))
    print("Alpha Random Vector")
    print(alphaRandomVector)

    #
    # The inequality constraints: p_ij >= 0.0, written as G x <= h
    # with G = -Id and h = (0, ..., 0).
    #
    mG = opt.matrix((np.identity(nVars))*-1)
    mh = opt.matrix([0.0]*(nVars), (nVars, 1))

    #
    # Solving the problem
    #
    sol = opt.solvers.qp(mQ, mL, G=mG, h=mh, A=mA, b=mb)

    # Mask again with vecVars so switched-off variables are exactly zero.
    vPDev = multLL(sol['x'], vecVars)                     ## Probabilities Devices
    mtPDev = np.array(vPDev).reshape((nDev, nClass))
    # Scale probabilities to record counts: p_ij * weight_i * total.
    mtNRecDev = [[p * w * totalNRecords for p in row]
                 for row, w in zip(mtPDev, vecWeights)]
    return mtNRecDev


def create_assignment(y, mtDevClas2Records):
    '''Function: Assign devices to records.

    Input: (1) y: array (or list) of output classes, one per record,
           (2) mtDevClas2Records: output of function create_solution_weights
               (one row per device, one column per class).
    Output: list with one device number per record. Devices are numbered
            1..nDev.

    The k-th class in the order of list(set(y)) is matched with column k of
    mtDevClas2Records, so the class counts given to create_solution_weights
    must follow that same order. For every class, each of its records draws
    a device at random (np.random) with probability proportional to that
    class's column.

    Raises ValueError when a class has no positive total over the devices.
    '''
    # Coerce to an array: with a plain list, "y == className" is a single
    # False and no record would ever be assigned.
    y = np.asarray(y)
    nDev = len(mtDevClas2Records)
    record2Dev = [0] * len(y)
    classNames = list(set(y))       # built once; fixes the class -> column order
    for classPos, className in enumerate(classNames):
        vDev2Records = [row[classPos] for row in mtDevClas2Records]
        total = sum(vDev2Records)
        if not total > 0:
            raise ValueError("class "+str(className)+" has no records assigned to any device")
        vDev2Probs = [x / total for x in vDev2Records]
        recordsInClass = np.where(y == className)[0]
        vDevices = np.random.choice(range(1, nDev+1), size=len(recordsInClass),
                                    replace=True, p=vDev2Probs)
        for indexRecord, device in zip(recordsInClass, vDevices):
            record2Dev[indexRecord] = device
    return record2Dev


##
## Examples
##
## The random draws come from np.random, so seed that generator
## (random.seed has no effect on the results):
## np.random.seed(12)
## sol50Iris = create_solution_weights ([50, 50, 50], 2, 2)
## sol50Iris = create_solution_weights ([50, 50, 50], 2, 2, [3/10,2/10,2/10,1/10,1/10,1/10])


## This example shows how to build a function that assigns each record
## of the data set Iris to a device.
## It returns a list with a value for each record in 1..6 which corresponds to a device
def exampleIrisDataSet():
    """Assign every record of the Iris data set to one of six devices.

    Uses two classes per device and two copies of each device. Prints the
    number of records that received a valid device and returns the list of
    device numbers (one per record).

    Raises ImportError if scikit-learn is not installed.
    """
    if sk is None:
        raise ImportError("exampleIrisDataSet requires scikit-learn")
    classesPerDevice = 2
    nCopies = 2
    iris = sk.datasets.load_iris()
    y = iris.target
    # Counts follow the order of set(y), which create_assignment also uses.
    nRecordsClass = [np.count_nonzero(y == e) for e in set(y)]
    mtDevClas2Records = create_solution_weights(nRecordsClass, classesPerDevice, nCopies)
    rec2dev = create_assignment(y, mtDevClas2Records)
    # Double check that we have assigned values to all records, considering all devices
    nDev = len(mtDevClas2Records)
    ass = sum(1 for device in rec2dev if 1 <= device <= nDev)
    print("assigned="+str(ass))
    return rec2dev


## Experiments:
def timeMNIST(n, copies):
    """Time create_solution_weights on ten classes of 6000 records each.

    n is the number of classes per device and copies the number of copies of
    each device. Prints and returns the elapsed time in seconds.
    """
    startN = time.perf_counter()
    create_solution_weights([6000] * 10, n, copies)
    elapsed = time.perf_counter() - startN
    print(elapsed)
    return elapsed

# timeMNIST(4,1)   # 6.064868927001953
# timeMNIST(5,1)   # 27.855716943740845
# timeMNIST(6,1)   # 15.693092346191406
# timeMNIST(7,1)   # 5.713701248168945
# timeMNIST(8,1)   # 1.0110714435577393

# r411 = timeMNIST(4,1)
# r412 = timeMNIST(4,1)
# r413 = timeMNIST(4,1)
# r414 = timeMNIST(4,1)
# r415 = timeMNIST(4,1)
# r41 = [r411,r412,r413,r414,r415]
# r41
## [6.663037538528442, 21.23730993270874, 22.804277896881104, 28.104968309402466, 27.207292795181274]