## Software for masking + explainability
## (c) Vicenç Torra
## Current version: 20221201
##
## This is the code for the paper
##   V. Torra, A systematic construction of non-i.i.d. data sets from a single data set
##      Knowl Inf Syst (2022). https://doi.org/10.1007/s10115-022-01785-3
##
## MAIN FUNCTIONS: 
##    create_solution_weights (numInstancesClass, classesPerDevice, nCopies, vecWeights=[])
##    create_assignment (y, mtDevClas2Records)
## 
import random
import numpy as np
import numpy.linalg as lina
from scipy.optimize import linprog
## $ pip install cvxopt
## from cvxopt import matrix, solvers
## import cvxopt.matrix as co.matrix
import cvxopt as opt

def subsets_Size_NElem(s, nElems):
     """
     Enumerate the indicator vectors of all subsets with nElems elements
     taken from a universe of s elements.

     Each subset is a list of s floats (1.0 = element present, 0.0 = absent).
     The vectors are produced in increasing order of the binary number they
     spell, because every integer in [0, 2**s) is scanned and only those whose
     bits add up to nElems are kept.

     Example: subsets_Size_NElem(4, 2)
     Output:
     [[0.0, 0.0, 1.0, 1.0], [0.0, 1.0, 0.0, 1.0], [0.0, 1.0, 1.0, 0.0],
     [1.0, 0.0, 0.0, 1.0], [1.0, 0.0, 1.0, 0.0], [1.0, 1.0, 0.0, 0.0]]
     """
     # Zero-padded binary format of width s, e.g. '04b' for s == 4.
     bitFormat = '0' + str(s) + 'b'
     indicators = ([float(bit) for bit in format(code, bitFormat)]
                   for code in range(1 << s))
     return [indicator for indicator in indicators if sum(indicator) == nElems]

### Multiply two lists elementwise
###     [a*b for a,b in zip(lista,listb)]
###     map(lambda x,y:x*y,lista,listb)
### Example: multLL([1,2,3],[4,5,6])
def multLL(lista, listb):
  """Return the elementwise product of two lists.

  The result is as long as the shorter of the two inputs.
  """
  return list(map(lambda a, b: a*b, lista, listb))

### Create constraints for the columns, left side
def create_lhs_eqCol (nClass, nVars, nDev, vecVars):
   """Build the left-hand sides of the per-class (column) equalities.

   Variables are stored device by device, so the variable of class i in
   device j sits at position i + j*nClass. For each class one row of length
   nVars is created with 1.0 at the positions of that class in every device;
   the row is then multiplied elementwise by vecVars, which zeroes the
   entries of variables that are masked out (0 in vecVars).

   Returns a list with nClass rows.
   """
   rows = []
   for classIdx in range(nClass):
      indicator = [0.0]*nVars
      for devIdx in range(nDev):
         indicator[classIdx + devIdx*nClass] = 1.0
      rows.append(multLL(indicator, vecVars))
   return rows

### Create constraints for the columns, left side (each device's term is scaled by its weight)
def create_lhs_eqCol_weights (nClass, nVars, nDev, vecVars, vecWeights):
   """Build the left-hand sides of the weighted per-class (column) equalities.

   Variables are stored device by device, so the variable of class i in
   device j sits at position i + j*nClass. For each class one row of length
   nVars is created holding vecWeights[j] at the position of that class in
   device j; the row is then multiplied elementwise by vecVars, which zeroes
   the entries of variables that are masked out (0 in vecVars).

   Returns a list with nClass rows.
   """
   rows = []
   for classIdx in range(nClass):
      weighted = [0.0]*nVars
      for devIdx in range(nDev):
         weighted[classIdx + devIdx*nClass] = vecWeights[devIdx]
      rows.append(multLL(weighted, vecVars))
   return rows

### Create constraints for the rows, left side
def create_lhs_eqRow (nClass, nVars, nDev, vecVars):
   """Build the left-hand sides of the per-device (row) equalities.

   Variables are stored device by device, so device i owns the nClass
   consecutive positions starting at i*nClass. For each device one row of
   length nVars is created with 1.0 at those positions; the row is then
   multiplied elementwise by vecVars, which zeroes the entries of variables
   that are masked out (0 in vecVars).

   Returns a list with nDev rows.
   """
   rows = []
   for devIdx in range(nDev):
      indicator = [0.0]*nVars
      firstPos = devIdx*nClass
      for classIdx in range(nClass):
         indicator[firstPos + classIdx] = 1.0
      rows.append(multLL(indicator, vecVars))
   return rows

def matrMN (m, n):
     """
     Return an m-by-n array of zeros with -1.0 on its main diagonal.

     Only the first min(m, n) diagonal positions exist, so a matrix with more
     rows than columns simply has all-zero trailing rows.

     Example: matrMN(2,3) ==> array([[-1.0,0.0,0.0],[0.0,-1.0,0.0]])
     """
     mt = np.zeros((m,n))
     # Stop at the shorter side so that m > n does not index past the last column.
     diag = np.arange(min(m,n))
     mt[diag, diag] = -1.0
     return(mt)

def sumColsGetCol (mt):
     """Sum every row of a matrix given as a list of rows.

     Adding up the columns of each row leaves a single column of totals,
     which is returned as a list with one entry per row (a real list, so it
     can be indexed, measured with len() and iterated more than once).
     """
     return [sum(row) for row in mt]

def sumRowsGetRow (mt):
     """Sum every column of a matrix given as a list of rows.

     Adding up the rows leaves a single row of totals, which is returned as
     a list with one entry per column (a real list, so it can be indexed,
     measured with len() and iterated more than once).
     """
     # Transposing turns the columns into rows, which are then summed one by one.
     return [sum(col) for col in np.transpose(mt)]

## printMat([[1,2,3],[4,4,5]])
def printMat (mt):
     """Print a matrix (sequence of rows), one row per line, using printRow."""
     for row in mt:
          printRow(row)

def printRow (rw):
     """Print one row on a single line, enclosed in square brackets.

     Every element is followed by two blanks, e.g. [1  2  3  ].
     """
     cells = "".join(str(item) + "  " for item in rw)
     print("[" + cells + "]")

## From list of variables to list of equations
## [0, 1, 1, 1, 0, 1, 1, 1, 0]
## probZeroLeft  = makeEquationsForZeros(np.array([0, 1, 1, 1, 0, 1, 1, 1, 0]))
## probZeroRight = [0]*len(probZeroLeft)
def makeEquationsForZeros (lwZeros):
     """Build one unit-vector equation row for every zero entry of lwZeros.

     For each position where lwZeros is 0, the result contains a list of
     len(lwZeros) integers that is 1 at that position and 0 elsewhere.
     Rows appear in increasing order of position.

     lwZeros may be a NumPy array or a plain list/tuple; it is converted
     with np.asarray so that the comparison with 0 is done elementwise.
     """
     flags = np.asarray(lwZeros)
     nVars = len(flags)
     equations = []
     for pos in np.where(flags==0)[0]:
          row = [0]*nVars
          row[pos] = 1
          equations.append(row)
     return equations

## MAIN FUNCTION
def create_solution_weights (numInstancesClass, classesPerDevice, nCopies, vecWeights=[]):
    """Compute how many records of each class go to each device.

    Input:
    (1) numInstancesClass: list with the number of instances in each class
    (2) classesPerDevice: integer, number of classes in each device (same for all)
    (3) nCopies: integer, number of copies of devices with the same classes
    (4) vecWeights: weight of each device, optional. If empty or None, all
        devices get the same weight 1/nDev. Otherwise one weight per device
        is read (nDev = nCopies * number of class subsets of size
        classesPerDevice). The default list is never mutated.

    Method: the devices are all subsets of classesPerDevice classes, repeated
    nCopies times. There is one variable p[j][i] per (device j, class i),
    stored device by device. A quadratic program is solved with cvxopt under
    the constraints
       * p[j][i] = 0 when device j does not hold class i,
       * sum_j vecWeights[j]*p[j][i] = numInstancesClass[i]/total (per class),
       * sum_i p[j][i] = 1 (per device),
       * p[j][i] >= 0,
    and an objective built from a vector drawn with np.random.uniform, so
    that different calls give different feasible solutions.

    Output: list with one row per device; entry [j][i] is
    p[j][i] * vecWeights[j] * total, i.e. the (not rounded) number of records
    of class i for device j.

    Side effects: prints the rank diagnostics of the equality matrix and the
    random vector.

    Example: create_solution_weights ([50, 50, 50], 2, 2)
    """
    totalNRecords = sum(numInstancesClass)     # number of records
    nClass = len(numInstancesClass)            # number of classes
    oneSetofDevices = subsets_Size_NElem(nClass, classesPerDevice)
    lol = oneSetofDevices*nCopies
    nDev = nCopies * len(oneSetofDevices)      # number of devices
    nVars = nClass * nDev                      # number of variables
    #
    # Fraction of the records that belongs to each class
    sumCols = list(map(lambda x:x*1/totalNRecords,numInstancesClass))
    #
    # Mask of the variables: 1.0 if the device holds the class, 0.0 otherwise
    vecVars = list(np.array(lol).flat)
    # A length test (instead of ==[]) also works for None, tuples and NumPy arrays
    if vecWeights is None or len(vecWeights)==0:
        vecWeights=[1/nDev]*nDev
    #
    ### EQUALITY CONSTRAINTS
    #
    ### Constraints for the columns, left side
    lhs_eqCol = create_lhs_eqCol_weights (nClass, nVars, nDev, vecVars, vecWeights)
    ### Constraints for the columns, right side. This is given
    rhs_eqCol = sumCols  ## numInstancesClass
    #
    ### Constraints for the rows, left side
    lhs_eqRow = create_lhs_eqRow (nClass, nVars, nDev, vecVars)
    ### Constraints for the rows, right side
    rhs_eqRow = [1.0]*nDev
    #
    # Equations to assign probabilities to zero (i.e., unnecessary variables)
    lhs_probZero  = makeEquationsForZeros(np.array(vecVars))
    rhs_probZero = [0]*len(lhs_probZero)
    #
    # All equality constraints.
    # Probabilities equal to zero, before the others as we may delete an equality later.
    #
    lp_lhs_eq = lhs_probZero + lhs_eqCol + lhs_eqRow
    lp_rhs_eq = rhs_probZero + rhs_eqCol + rhs_eqRow
    rankAllMat = lina.matrix_rank(lp_lhs_eq)
    newRank = lina.matrix_rank(lp_lhs_eq[0:len(lp_lhs_eq)-1])
    print("Rank matrix lp_lhs_eq, dim (and rank if full rank) = "+str(len(lp_lhs_eq)))
    print(str(rankAllMat))
    print("Rank of matrix lp_lhs_eq without the last row=")
    print("newRank"+str(newRank))
    #
    # Solving a linear dependency:
    #
    ## One of the rows of the matrix is linearly dependent on the others
    ##   (for the two examples, adding all horizontal constraints and
    ##    subtracting all vertical constraints gives zero).
    ## The last row is removed when this alone makes the matrix full rank,
    ##   which it does for the examples; otherwise the system is left as is.
    if (len(lp_lhs_eq) != rankAllMat) and (newRank == (len(lp_lhs_eq)-1)):
        print("CORRECT RANK")
        lp_lhs_eq = lp_lhs_eq[0:len(lp_lhs_eq)-1]
        lp_rhs_eq = lp_rhs_eq[0:len(lp_rhs_eq)-1]
    # The number of equalities is taken from the (possibly trimmed) right-hand side
    nConstraints = len(lp_rhs_eq)
    #
    mA=opt.matrix(np.array(lp_lhs_eq))
    mb=opt.matrix(lp_rhs_eq, (nConstraints, 1))
    #
    # The objective function, passed to the solver as quadratic term Q = Id
    #   and linear term L = -2*alpha, with alpha uniform in [0,1)
    # NOTE(review): cvxopt.solvers.qp minimizes (1/2) x'Qx + L'x, so with these
    #   values the minimizer is the feasible point closest to 2*alpha, not to
    #   alpha -- confirm that this is the intended target.
    #
    mQ = opt.matrix(np.identity(nVars))
    alphaRandomVector = np.random.uniform(0,1,nVars)
    mL = opt.matrix(-1*2*alphaRandomVector,(nVars,1))
    #
    print("Alpha Random Vector")
    print(alphaRandomVector)
    #
    # The inequality constraints:     p_ij >= 0.0,  written as  G x <= h
    # G = -Id
    mG = opt.matrix((np.identity(nVars))*-1)
    # h = (0, ..., 0)
    mh = opt.matrix([0.0]*(nVars),(nVars,1))
    #
    # Solving the problem
    #
    sol = opt.solvers.qp(mQ, mL, G=mG, h=mh, A=mA, b=mb)
    # Multiply by the mask so that the excluded variables are exactly zero
    vPDev = multLL(sol['x'], vecVars) ## Probabilities Devices
    mtPDev = np.array(vPDev).reshape((nDev,nClass))
    ## From probabilities to number of records: p_ij * weight_j * total
    mtNRecDev = list(map(lambda l,e: list(map(lambda el: el*e*totalNRecords, l)),
                         mtPDev, vecWeights))
    return(mtNRecDev)


def create_assignment (y, mtDevClas2Records):
     '''Function: Assign devices to records.

     Input:
     (1) y: output class of each record (NumPy array or plain list).
     (2) mtDevClas2Records: output of function create_solution_weights, i.e.
         one row per device with one value per class. Column k is used for
         the k-th class in the iteration order of set(y).
     Output: list with one device number per record. Devices are numbered
     1..nDev; the device of each record is drawn with np.random.choice with
     probability proportional to the value of that device for the record's
     class.'''
     nDev = len(mtDevClas2Records)
     # An array is needed for the elementwise comparison below; a plain list
     # compared with == would yield a single False and match no record.
     yArr = np.asarray(y)
     record2Dev = [0]*len(yArr)
     deviceIds = range(1,nDev+1)
     for classPos, className in enumerate(set(y)):
          # The values come from a numerical solver, so an entry that should
          # be zero can be marginally negative; np.random.choice rejects
          # negative probabilities, hence they are clamped to zero.
          vDev2Records = [max(row[classPos], 0.0) for row in mtDevClas2Records]
          total = sum(vDev2Records)
          vDev2Probs = [nRec/total for nRec in vDev2Records]
          recordsInClass = np.where(yArr==className)[0]
          vDevices = np.random.choice(deviceIds,
                                      size=len(recordsInClass),replace=True,p=vDev2Probs)
          for indexRecord, device in zip(recordsInClass, vDevices):
               record2Dev[indexRecord]=device
     return(record2Dev)

##
## Examples 
## 
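## np.random.seed(12)   # the random draws in this file come from np.random, so seed NumPy (random.seed has no effect on them)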
## sol50Iris = create_solution_weights ([50, 50, 50], 2, 2)
## sol50Iris = create_solution_weights ([50, 50, 50], 2, 2, [3/10,2/10,2/10,1/10,1/10,1/10])

import sklearn as sk
import sklearn.datasets

## This example shows how to build a function that assigns each record
##   of the data set Iris to a device.
## It returns a list with a value for each record in 1..6 which corresponds to a device
def exampleIrisDataSet ():
     """Assign every record of the Iris data set to a device.

     Uses 2 classes per device and 2 copies of each device. Prints how many
     records received one of the valid device numbers and returns the list
     with the device of each record.
     """
     iris = sk.datasets.load_iris()
     y = iris.target
     # Number of records of each class, in the iteration order of set(y)
     nRecordsClass = [np.count_nonzero(y==label) for label in set(y)]
     mtDevClas2Records = create_solution_weights (nRecordsClass, 2, 2)
     rec2dev = create_assignment (y, mtDevClas2Records)
     # Double check that we have assigned values to all records, considering all devices
     deviceIds = range(1, len(mtDevClas2Records)+1)
     ass = sum(1 for dev in deviceIds for assigned in rec2dev if assigned==dev)
     print ("assigned="+str(ass))
     return rec2dev

## Experiments:

import time

def timeMNIST (n,copies):
     """Time create_solution_weights on 10 classes of 6000 records each.

     n is the number of classes per device and copies the number of copies
     of each device. The elapsed wall-clock time in seconds is printed and
     returned; the solution itself is discarded.
     """
     classSizes = [6000]*10
     startN = time.time()
     create_solution_weights (classSizes, n, copies)
     elapsed = time.time() - startN
     print(elapsed)
     return(elapsed)

# timeMNIST(4,1) # 6.064868927001953
# timeMNIST(5,1) # 27.855716943740845
# timeMNIST(6,1) # 15.693092346191406
# timeMNIST(7,1) # 5.713701248168945
# timeMNIST(8,1) # 1.0110714435577393

# r411 = timeMNIST(4,1)
# r412 = timeMNIST(4,1)
# r413 = timeMNIST(4,1)
# r414 = timeMNIST(4,1)
# r415 = timeMNIST(4,1)
# r41 = [r411,r412,r413,r414,r415]
# r41
## [6.663037538528442, 21.23730993270874, 22.804277896881104, 28.104968309402466, 27.207292795181274]