/*
 * Decompiled with CFR 0.152.
 */
package org.apache.sysml.runtime.instructions.gpu.context;

import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.LongAdder;
import jcuda.Pointer;
import jcuda.jcusparse.cusparseHandle;
import jcuda.jcusparse.cusparseMatDescr;
import jcuda.runtime.JCuda;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.instructions.gpu.context.CSRPointer;
import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysml.runtime.instructions.gpu.context.ShadowBuffer;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.SparseBlock;
import org.apache.sysml.runtime.matrix.data.SparseBlockCOO;
import org.apache.sysml.runtime.matrix.data.SparseBlockCSR;
import org.apache.sysml.runtime.matrix.data.SparseBlockMCSR;
import org.apache.sysml.utils.GPUStatistics;

/**
 * Bookkeeping object for the GPU-side data that belongs to a {@link MatrixObject}.
 *
 * <p>It tracks a dense device pointer and/or a CSR sparse pointer, a dirty flag, read/write
 * lock state, a timestamp that is updated according to the configured eviction policy, and a
 * {@link ShadowBuffer}.
 */
public class GPUObject {
    /** Class logger. */
    private static final Log LOG = LogFactory.getLog(GPUObject.class.getName());
    /** GPU context used for allocating and freeing device memory. */
    private final GPUContext gpuContext;
    /** Dense device pointer; {@code null} when no dense data is set. */
    Pointer jcudaDenseMatrixPtr = null;
    /** CSR sparse device pointer; {@code null} when no sparse data is set. */
    private CSRPointer jcudaSparseMatrixPtr = null;
    /** Set when the device data is modified; cleared after a full device-to-host copy. */
    protected boolean dirty = false;
    /** Number of read locks currently held. */
    protected LongAdder readLocks = new LongAdder();
    /** Whether the write lock is currently held. */
    protected boolean writeLock = false;
    /** Eviction bookkeeping value; its meaning depends on the configured eviction policy. */
    AtomicLong timestamp = new AtomicLong();
    /** Whether the most recently set device pointer was the sparse one. */
    protected boolean isSparse = false;
    /** Matrix object this GPU object belongs to. */
    MatrixObject mat = null;
    /** Shadow buffer created for this object in the constructor. */
    final ShadowBuffer shadowBuffer;

    /**
     * Returns the dense device pointer.
     *
     * <p>When no dense pointer is set, the shadow buffer reports buffered data and no sparse
     * pointer is set, the shadow buffer is first asked to move its contents to the device.
     *
     * @return the current value of the dense pointer field, which may still be {@code null}
     */
    public Pointer getDensePointer() {
        if (jcudaDenseMatrixPtr != null) {
            return jcudaDenseMatrixPtr;
        }
        if (shadowBuffer.isBuffered() && getJcudaSparseMatrixPtr() == null) {
            shadowBuffer.moveToDevice();
        }
        return jcudaDenseMatrixPtr;
    }

    /**
     * Tells whether the dense pointer field is unset, without triggering any shadow-buffer move.
     *
     * @return {@code true} if the dense pointer field is {@code null}
     */
    public boolean isDensePointerNull() {
        return null == jcudaDenseMatrixPtr;
    }

    /**
     * Drops the reference to the dense pointer and clears the shadow pointer.
     *
     * <p>This only resets references; it does not free the device memory itself.
     */
    public void clearDensePointer() {
        jcudaDenseMatrixPtr = null;
        shadowBuffer.clearShadowPointer();
    }

    /**
     * Installs a dense device pointer and marks this object as dense.
     *
     * <p>Any sparse pointer that is still set is deallocated and dropped afterwards.
     *
     * @param densePtr dense device pointer to install
     * @throws DMLRuntimeException if a dense pointer is already set (it would otherwise leak)
     */
    public void setDensePointer(Pointer densePtr) {
        boolean alreadySet = !isDensePointerNull();
        if (alreadySet) {
            throw new DMLRuntimeException("jcudaDenseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU");
        }
        jcudaDenseMatrixPtr = densePtr;
        isSparse = false;
        if (LOG.isDebugEnabled()) {
            LOG.debug("Setting dense pointer of size " + getGPUContext().getMemoryManager().getSizeAllocatedGPUPointer(densePtr));
        }
        // A dense and a sparse representation must not coexist: release the stale sparse one.
        CSRPointer staleSparse = getJcudaSparseMatrixPtr();
        if (staleSparse != null) {
            staleSparse.deallocate();
            jcudaSparseMatrixPtr = null;
        }
    }

    /**
     * Creates a copy of this object with freshly allocated device memory.
     *
     * <p>The dirty flag, timestamp value and sparse flag are copied; the copy starts without
     * any locks. A dense pointer is duplicated with a device-side memcpy, a sparse pointer via
     * {@link CSRPointer#clone(int)}.
     *
     * @return the new {@code GPUObject}
     * @throws RuntimeException wrapping any {@link DMLRuntimeException} raised while copying,
     *         including the one raised when the row count does not fit into an {@code int}
     */
    @Override
    public Object clone() {
        GPUObject copy = new GPUObject(gpuContext, mat);
        copy.dirty = dirty;
        copy.readLocks.reset();
        copy.writeLock = false;
        copy.timestamp = new AtomicLong(timestamp.get());
        copy.isSparse = isSparse;
        try {
            if (!isDensePointerNull()) {
                long size = mat.getNumRows() * mat.getNumColumns() * (long) LibMatrixCUDA.sizeOfDataType;
                copy.setDensePointer(allocate(size));
                // 3 is cudaMemcpyDeviceToDevice in the CUDA runtime's cudaMemcpyKind enum.
                JCuda.cudaMemcpy(copy.getDensePointer(), getDensePointer(), size, 3);
            }
            if (getJcudaSparseMatrixPtr() != null) {
                // Range-checked conversion instead of a silent narrowing cast.
                int rows = toIntExact(mat.getNumRows());
                copy.jcudaSparseMatrixPtr = jcudaSparseMatrixPtr.clone(rows);
            }
        } catch (DMLRuntimeException e) {
            throw new RuntimeException(e);
        }
        return copy;
    }

    /**
     * Allocates device memory through this object's GPU context, passing {@code null} as the
     * first argument of {@code GPUContext.allocate}.
     *
     * @param size number of bytes to allocate
     * @return pointer to the allocated memory
     */
    private Pointer allocate(long size) {
        GPUContext ctx = getGPUContext();
        return ctx.allocate(null, size);
    }

    /**
     * Frees device memory through this object's GPU context, using the global
     * {@code DMLScript.EAGER_CUDA_FREE} setting as the eager flag.
     *
     * @param toFree pointer to release
     * @throws DMLRuntimeException if the context fails to free the pointer
     */
    private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException {
        GPUContext ctx = getGPUContext();
        ctx.cudaFreeHelper(null, toFree, DMLScript.EAGER_CUDA_FREE);
    }

    /**
     * @return the GPU context this object was created with
     */
    GPUContext getGPUContext() {
        return gpuContext;
    }

    /**
     * Transposes a dense device matrix into newly allocated device memory using
     * {@code cublasgeam} with alpha = one and beta = zero.
     *
     * <p>The input pointer is left untouched; the caller owns (and must free) the result.
     *
     * @param gCtx     GPU context used for allocation and for the cuBLAS handle
     * @param densePtr dense input on the device
     * @param m        row count passed to geam
     * @param n        column count passed to geam
     * @param lda      leading dimension of the input (also passed as the leading dimension of the unused B operand)
     * @param ldc      leading dimension of the output
     * @return pointer to the newly allocated output of {@code m * n} elements
     */
    public static Pointer transpose(GPUContext gCtx, Pointer densePtr, int m, int n, int lda, int ldc) {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : transpose of block of size [" + m + "," + n + "], GPUContext=" + gCtx);
        }
        Pointer one = LibMatrixCUDA.one();
        Pointer zero = LibMatrixCUDA.zero();
        long outputBytes = (long)m * getDatatypeSizeOf(n);
        Pointer transposed = gCtx.allocate(null, outputBytes);
        // The two literal 1s are the transa/transb arguments (presumably CUBLAS_OP_T - confirm);
        // with beta = zero the B operand is only a placeholder.
        LibMatrixCUDA.cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), 1, 1, m, n, one, densePtr, lda, zero, new Pointer(), lda, transposed, ldc);
        return transposed;
    }

    /**
     * Converts a dense device matrix into a newly allocated CSR matrix on the device.
     *
     * <p>The non-zero counts are first computed with {@code cusparsennz}; the total is copied
     * to the host to size the CSR allocation, and {@code cusparsedense2csr} then fills it.
     * The two temporary device buffers are released even when a step fails.
     *
     * @param gCtx            GPU context used for allocation and freeing
     * @param cusparseHandle2 cuSPARSE handle
     * @param densePtr        dense input on the device (column-major, per the method name)
     * @param rows            number of rows; also used as the leading dimension
     * @param cols            number of columns
     * @return the newly allocated CSR pointer
     * @throws DMLRuntimeException if the total non-zero count was not written by cuSPARSE
     */
    public static CSRPointer columnMajorDenseToRowMajorSparse(GPUContext gCtx, cusparseHandle cusparseHandle2, Pointer densePtr, int rows, int cols) {
        cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor();
        Pointer nnzPerRowPtr = null;
        Pointer nnzTotalDevHostPtr = null;
        try {
            nnzPerRowPtr = gCtx.allocate(null, getIntSizeOf(rows));
            nnzTotalDevHostPtr = gCtx.allocate(null, getIntSizeOf(1L));
            LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle2, 0, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, nnzTotalDevHostPtr);
            // -1 is a sentinel: it survives only if the copy below did not deliver a count.
            int[] nnzC = new int[]{-1};
            // 2 is cudaMemcpyDeviceToHost in the CUDA runtime's cudaMemcpyKind enum.
            JCuda.cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1L), 2);
            if (nnzC[0] == -1) {
                throw new DMLRuntimeException("cusparseDnnz did not calculate the correct number of nnz from the sparse-matrix vector mulitply on the GPU");
            }
            if (LOG.isTraceEnabled()) {
                LOG.trace("GPU : col-major dense size[" + rows + "," + cols + "] to row-major sparse of with nnz = " + nnzC[0] + ", GPUContext=" + gCtx);
            }
            CSRPointer C = CSRPointer.allocateEmpty(gCtx, nnzC[0], rows);
            LibMatrixCUDA.cudaSupportFunctions.cusparsedense2csr(cusparseHandle2, rows, cols, matDescr, densePtr, rows, nnzPerRowPtr, C.val, C.rowPtr, C.colInd);
            return C;
        } finally {
            // Release the scratch buffers on success and on failure alike.
            if (nnzPerRowPtr != null) {
                gCtx.cudaFreeHelper(null, nnzPerRowPtr, DMLScript.EAGER_CUDA_FREE);
            }
            if (nnzTotalDevHostPtr != null) {
                gCtx.cudaFreeHelper(null, nnzTotalDevHostPtr, DMLScript.EAGER_CUDA_FREE);
            }
        }
    }

    /**
     * Alias for {@link #getJcudaSparseMatrixPtr()}.
     *
     * @return the sparse CSR pointer, or {@code null} if none is set
     */
    public CSRPointer getSparseMatrixCudaPointer() {
        CSRPointer sparsePtr = getJcudaSparseMatrixPtr();
        return sparsePtr;
    }

    /**
     * Installs a sparse CSR pointer and marks this object as sparse.
     *
     * <p>If a dense pointer is set and the shadow buffer holds no data, the dense memory is
     * freed and the dense pointer cleared.
     *
     * @param sparseMatrixPtr CSR pointer to install
     * @throws DMLRuntimeException if a sparse pointer is already set (it would otherwise leak)
     */
    public void setSparseMatrixCudaPointer(CSRPointer sparseMatrixPtr) {
        if (jcudaSparseMatrixPtr != null) {
            throw new DMLRuntimeException("jcudaSparseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU");
        }
        jcudaSparseMatrixPtr = sparseMatrixPtr;
        isSparse = true;
        boolean denseIsSet = !isDensePointerNull();
        if (denseIsSet && !shadowBuffer.isBuffered()) {
            cudaFreeHelper(getDensePointer());
            clearDensePointer();
        }
    }

    /**
     * Converts the dense device data of this object into CSR format.
     *
     * <p>The dense data is first transposed to column-major and then converted with cuSPARSE;
     * the result is installed as the sparse pointer. Timing and count statistics are recorded
     * when {@code DMLScript.STATISTICS} is enabled.
     *
     * @throws DMLRuntimeException if cuSPARSE is not initialized, if no dense data is
     *         available, or if a dimension does not fit into an {@code int}
     */
    public void denseToSparse() {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : dense -> sparse on " + this + ", GPUContext=" + getGPUContext());
        }
        long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0L;
        cusparseHandle handle = getGPUContext().getCusparseHandle();
        if (handle == null) {
            throw new DMLRuntimeException("Expected cusparse to be initialized");
        }
        int rows = toIntExact(mat.getNumRows());
        int cols = toIntExact(mat.getNumColumns());
        boolean denseMissing = isDensePointerNull() && !shadowBuffer.isBuffered();
        if (denseMissing || !isAllocated()) {
            throw new DMLRuntimeException("Expected allocated dense matrix before denseToSparse() call");
        }
        denseRowMajorToColumnMajor();
        CSRPointer csr = columnMajorDenseToRowMajorSparse(getGPUContext(), handle, getDensePointer(), rows, cols);
        setSparseMatrixCudaPointer(csr);
        if (DMLScript.STATISTICS) {
            GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
            GPUStatistics.cudaDenseToSparseCount.add(1L);
        }
    }

    /**
     * Replaces the dense device data with its transpose, turning a row-major layout into a
     * column-major one. The previous dense buffer is freed.
     *
     * @throws DMLRuntimeException if no data is allocated or a dimension does not fit into an
     *         {@code int}
     */
    public void denseRowMajorToColumnMajor() {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : dense Ptr row-major -> col-major on " + this + ", GPUContext=" + getGPUContext());
        }
        int numRows = toIntExact(mat.getNumRows());
        int numCols = toIntExact(mat.getNumColumns());
        if (!isAllocated()) {
            throw new DMLRuntimeException("Error in converting row major to column major : data is not allocated");
        }
        // m = rows, n = cols, lda = cols, ldc = rows
        Pointer transposed = transpose(getGPUContext(), getDensePointer(), numRows, numCols, numCols, numRows);
        cudaFreeHelper(getDensePointer());
        clearDensePointer();
        setDensePointer(transposed);
    }

    /**
     * Replaces the dense device data with its transpose, turning a column-major layout into a
     * row-major one. The previous dense buffer is freed.
     *
     * @throws DMLRuntimeException if no data is allocated or a dimension does not fit into an
     *         {@code int}
     */
    public void denseColumnMajorToRowMajor() {
        if (LOG.isTraceEnabled()) {
            // The message previously claimed the opposite direction (copy-paste slip).
            LOG.trace("GPU : dense Ptr col-major -> row-major on " + this + ", GPUContext=" + this.getGPUContext());
        }
        // Note the swapped roles compared to denseRowMajorToColumnMajor(): n = rows, m = cols.
        int n = GPUObject.toIntExact(this.mat.getNumRows());
        int m = GPUObject.toIntExact(this.mat.getNumColumns());
        int lda = n;
        int ldc = m;
        if (!this.isAllocated()) {
            throw new DMLRuntimeException("Error in converting column major to row major : data is not allocated");
        }
        Pointer tmp = GPUObject.transpose(this.getGPUContext(), this.getDensePointer(), m, n, lda, ldc);
        this.cudaFreeHelper(this.getDensePointer());
        this.clearDensePointer();
        this.setDensePointer(tmp);
    }

    /**
     * Converts the sparse device data to dense format without attributing the time to a
     * specific instruction.
     */
    public void sparseToDense() {
        sparseToDense((String)null);
    }

    /**
     * Converts the sparse device data to dense row-major format.
     *
     * <p>The CSR data is first expanded to a column-major dense matrix and then transposed
     * to row-major. Statistics are recorded according to the {@code DMLScript} flags.
     *
     * @param instructionName instruction to attribute fine-grained timing to; may be {@code null}
     * @throws DMLRuntimeException if no sparse data is allocated
     */
    public void sparseToDense(String instructionName) {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : sparse -> dense on " + this + ", GPUContext=" + getGPUContext());
        }
        long begin = DMLScript.STATISTICS ? System.nanoTime() : 0L;
        if (getJcudaSparseMatrixPtr() == null || !isAllocated()) {
            throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");
        }
        sparseToColumnMajorDense();
        denseColumnMajorToRowMajor();
        // Both timestamps stay 0 when statistics are disabled, so the elapsed time is 0 then.
        long finish = DMLScript.STATISTICS ? System.nanoTime() : 0L;
        long elapsed = finish - begin;
        if (instructionName != null && DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instructionName, "s2d", elapsed);
        }
        if (DMLScript.STATISTICS) {
            GPUStatistics.cudaSparseToDenseTime.add(elapsed);
            GPUStatistics.cudaSparseToDenseCount.add(1L);
        }
    }

    /**
     * Expands the CSR device data into a column-major dense matrix and installs it as the
     * dense pointer (which in turn releases the sparse pointer).
     *
     * @throws DMLRuntimeException if no sparse data is allocated, cuSPARSE is not initialized,
     *         or a dimension does not fit into an {@code int}
     */
    public void sparseToColumnMajorDense() {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : sparse -> col-major dense on " + this + ", GPUContext=" + getGPUContext());
        }
        boolean sparseMissing = getJcudaSparseMatrixPtr() == null;
        if (sparseMissing || !isAllocated()) {
            throw new DMLRuntimeException("Expected allocated sparse matrix before sparseToDense() call");
        }
        cusparseHandle handle = getGPUContext().getCusparseHandle();
        if (handle == null) {
            throw new DMLRuntimeException("Expected cusparse to be initialized");
        }
        int numRows = toIntExact(mat.getNumRows());
        int numCols = toIntExact(mat.getNumColumns());
        Pointer dense = getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(handle, null, numRows, numCols, null);
        setDensePointer(dense);
    }

    /**
     * Creates a GPU object without any device data and with a fresh shadow buffer.
     *
     * @param gCtx GPU context to allocate from
     * @param mat2 matrix object this GPU object belongs to
     */
    GPUObject(GPUContext gCtx, MatrixObject mat2) {
        gpuContext = gCtx;
        mat = mat2;
        shadowBuffer = new ShadowBuffer(this);
    }

    /**
     * @return the sparse flag, which is set by the pointer setters and by
     *         {@link #acquireDeviceModifySparse()}
     */
    public boolean isSparse() {
        return isSparse;
    }

    /**
     * Computes the number of bytes needed for the given number of matrix values, based on
     * {@code LibMatrixCUDA.sizeOfDataType}.
     *
     * @param numElems number of values
     * @return size in bytes
     */
    private static long getDatatypeSizeOf(long numElems) {
        long bytesPerValue = (long)LibMatrixCUDA.sizeOfDataType;
        return numElems * bytesPerValue;
    }

    /**
     * Computes the number of bytes needed for the given number of 4-byte integers.
     *
     * @param numElems number of integers
     * @return size in bytes
     */
    private static long getIntSizeOf(long numElems) {
        return (long)Integer.BYTES * numElems;
    }

    /**
     * Tells whether any representation of the data exists: in the shadow buffer, as a dense
     * pointer, or as a sparse pointer.
     *
     * @return {@code true} if at least one of the three is present
     */
    public boolean isAllocated() {
        if (shadowBuffer.isBuffered()) {
            return true;
        }
        if (!isDensePointerNull()) {
            return true;
        }
        return getJcudaSparseMatrixPtr() != null;
    }

    /**
     * Allocates an empty CSR matrix (zero non-zeros) with this matrix's row count and installs
     * it as the sparse pointer.
     */
    public void allocateSparseAndEmpty() {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : allocate sparse and empty block on " + this + ", GPUContext=" + getGPUContext());
        }
        CSRPointer empty = CSRPointer.allocateEmpty(getGPUContext(), 0L, mat.getNumRows());
        setSparseMatrixCudaPointer(empty);
    }

    /**
     * Allocates a dense matrix on the device and, for a non-zero {@code v}, launches the
     * {@code fill} kernel to set every cell to {@code v}.
     *
     * <p>No kernel is launched for {@code v == 0.0}.
     * NOTE(review): this presumably relies on the allocator returning zeroed memory - confirm.
     *
     * @param v value to fill the matrix with
     * @throws DMLRuntimeException if the cell count does not fit into an {@code int}
     */
    public void allocateAndFillDense(double v) {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : allocate and fill dense with value " + v + " on " + this + ", GPUContext=" + getGPUContext());
        }
        long numRows = mat.getNumRows();
        long numCols = mat.getNumColumns();
        int cellCount = toIntExact(numRows * numCols);
        setDensePointer(allocate(getDatatypeSizeOf(cellCount)));
        if (v == 0.0) {
            return;
        }
        getGPUContext().getKernels().launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(cellCount), getDensePointer(), v, cellCount);
    }

    /**
     * Tells whether this object is allocated, is in sparse format according to
     * {@code LibMatrixCUDA.isInSparseFormat}, and has zero non-zeros in its CSR pointer.
     *
     * @return {@code true} only if all three conditions hold
     */
    public boolean isSparseAndEmpty() {
        return isAllocated()
            && LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)
            && getJcudaSparseMatrixPtr().nnz == 0L;
    }

    /**
     * Returns the number of non-zeros of the device data.
     *
     * <p>For sparse data the count stored in the CSR pointer is returned. For dense data the
     * count is recomputed with {@code cusparsennz} when {@code recomputeDenseNNZ} is set;
     * the temporary device buffers used for that are released even when the computation fails.
     *
     * @param instName          instruction name used for allocation and fine-grained statistics
     * @param recomputeDenseNNZ whether to compute the count for dense data
     * @return the non-zero count, or {@code -1} for dense data when recomputation is disabled
     * @throws DMLRuntimeException if the object is not allocated, cuSPARSE is not initialized,
     *         a dimension does not fit into an {@code int}, or the count was not produced
     */
    public long getNnz(String instName, boolean recomputeDenseNNZ) {
        if (!this.isAllocated()) {
            throw new DMLRuntimeException("Expected the GPU object to be allocated");
        }
        if (LibMatrixCUDA.isInSparseFormat(this.getGPUContext(), this.mat)) {
            return this.getJcudaSparseMatrixPtr().nnz;
        }
        if (!recomputeDenseNNZ) {
            return -1L;
        }
        long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0L;
        GPUContext gCtx = this.getGPUContext();
        cusparseHandle cusparseHandle2 = gCtx.getCusparseHandle();
        cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor();
        if (cusparseHandle2 == null) {
            throw new DMLRuntimeException("Expected cusparse to be initialized");
        }
        int rows = GPUObject.toIntExact(this.mat.getNumRows());
        int cols = GPUObject.toIntExact(this.mat.getNumColumns());
        Pointer nnzPerRowPtr = null;
        Pointer nnzTotalDevHostPtr = null;
        // -1 is a sentinel: it survives only if the copy below did not deliver a count.
        int[] nnzC = new int[]{-1};
        try {
            nnzPerRowPtr = gCtx.allocate(instName, GPUObject.getIntSizeOf(rows));
            nnzTotalDevHostPtr = gCtx.allocate(instName, GPUObject.getIntSizeOf(1L));
            LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle2, 0, rows, cols, matDescr, this.getDensePointer(), rows, nnzPerRowPtr, nnzTotalDevHostPtr);
            // 2 is cudaMemcpyDeviceToHost in the CUDA runtime's cudaMemcpyKind enum.
            JCuda.cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, GPUObject.getIntSizeOf(1L), 2);
            if (nnzC[0] == -1) {
                throw new DMLRuntimeException("cusparseDnnz did not calculate the correct number of nnz on the GPU");
            }
        } finally {
            // Release the scratch buffers on success and on failure alike.
            if (nnzPerRowPtr != null) {
                gCtx.cudaFreeHelper(instName, nnzPerRowPtr, DMLScript.EAGER_CUDA_FREE);
            }
            if (nnzTotalDevHostPtr != null) {
                gCtx.cudaFreeHelper(instName, nnzTotalDevHostPtr, DMLScript.EAGER_CUDA_FREE);
            }
        }
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(instName, "rnnz", System.nanoTime() - t1);
        }
        return nnzC[0];
    }

    /**
     * Acquires a read lock on the device data, copying it from the host first when nothing is
     * allocated yet.
     *
     * @param opcode instruction name passed on to the host-to-device copy
     * @return {@code true} if a host-to-device copy was performed
     * @throws DMLRuntimeException if the data is still unallocated afterwards or the write
     *         lock is held
     */
    public boolean acquireDeviceRead(String opcode) {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : acquireDeviceRead on " + this);
        }
        final boolean needsTransfer = !isAllocated();
        if (needsTransfer) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("GPU : in acquireDeviceRead, data is not allocated, copying from host, on " + this + ", GPUContext=" + getGPUContext());
            }
            copyFromHostToDevice(opcode);
        }
        addReadLock();
        if (!isAllocated()) {
            throw new DMLRuntimeException("Expected device data to be allocated");
        }
        return needsTransfer;
    }

    /**
     * Prepares the object for a dense modification on the device, allocating a dense block if
     * nothing is allocated yet, and marks the device data dirty.
     *
     * @return {@code true} if a dense block was allocated by this call
     * @throws DMLRuntimeException if the data is still unallocated afterwards
     */
    public boolean acquireDeviceModifyDense() {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : acquireDeviceModifyDense on " + this + ", GPUContext=" + getGPUContext());
        }
        final boolean needsAllocation = !isAllocated();
        if (needsAllocation) {
            mat.setDirty(true);
            if (LOG.isTraceEnabled()) {
                LOG.trace("GPU : data is not allocated, allocating a dense block, on " + this);
            }
            allocateDenseMatrixOnDevice();
        }
        dirty = true;
        if (!isAllocated()) {
            throw new DMLRuntimeException("Expected device data to be allocated");
        }
        return needsAllocation;
    }

    /**
     * Prepares the object for a sparse modification on the device: sets the sparse flag,
     * allocates a sparse block if nothing is allocated yet, and marks the device data dirty.
     *
     * @return {@code true} if a sparse block was allocated by this call
     * @throws DMLRuntimeException if the data is still unallocated afterwards
     */
    public boolean acquireDeviceModifySparse() {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : acquireDeviceModifySparse on " + this + ", GPUContext=" + getGPUContext());
        }
        isSparse = true;
        final boolean needsAllocation = !isAllocated();
        if (needsAllocation) {
            if (LOG.isTraceEnabled()) {
                LOG.trace("GPU : data is not allocated, allocating a sparse block, on " + this);
            }
            mat.setDirty(true);
            allocateSparseMatrixOnDevice();
        }
        dirty = true;
        if (!isAllocated()) {
            throw new DMLRuntimeException("Expected device data to be allocated");
        }
        return needsAllocation;
    }

    /**
     * Makes the host copy current: when device data is allocated and dirty it is copied back
     * to the host (not as an eviction, with eager deletion of the device data).
     *
     * @param instName instruction name passed on to the device-to-host copy
     * @return {@code true} if a device-to-host copy was performed
     */
    public boolean acquireHostRead(String instName) {
        if (LOG.isTraceEnabled()) {
            // The message previously named acquireDeviceModifySparse (copy-paste slip).
            LOG.trace("GPU : acquireHostRead on " + this + ", GPUContext=" + this.getGPUContext());
        }
        if (!this.isAllocated() || !this.dirty) {
            return false;
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : data is dirty on device, copying to host, on " + this + ", GPUContext=" + this.getGPUContext());
        }
        this.copyFromDeviceToHost(instName, false, true);
        return true;
    }

    /**
     * @return {@code true} if the write lock is held or at least one read lock is held
     */
    public boolean isLocked() {
        if (writeLock) {
            return true;
        }
        return readLocks.longValue() > 0L;
    }

    /**
     * Increments the read-lock counter.
     *
     * @throws DMLRuntimeException if the write lock is currently held
     */
    public void addReadLock() {
        if (!writeLock) {
            readLocks.increment();
            return;
        }
        throw new DMLRuntimeException("Attempting to add a read lock when writeLock=" + writeLock);
    }

    /**
     * Takes the write lock.
     *
     * @throws DMLRuntimeException if any read lock or the write lock is already held
     */
    public void addWriteLock() {
        boolean hasReaders = readLocks.longValue() > 0L;
        if (hasReaders) {
            throw new DMLRuntimeException("Attempting to add a write lock when readLocks=" + readLocks.longValue());
        }
        if (writeLock) {
            throw new DMLRuntimeException("Attempting to add a write lock when writeLock=" + writeLock);
        }
        writeLock = true;
    }

    /**
     * Decrements the read-lock counter.
     *
     * <p>The counter is decremented before it is checked, so it stays negative when the
     * exception is thrown.
     *
     * @throws DMLRuntimeException if the counter drops below zero
     */
    public void releaseReadLock() {
        readLocks.decrement();
        boolean underflow = readLocks.longValue() < 0L;
        if (underflow) {
            throw new DMLRuntimeException("Attempting to release a read lock when readLocks=" + readLocks.longValue());
        }
    }

    /**
     * Releases the write lock.
     *
     * @throws DMLRuntimeException if the write lock is not currently held
     */
    public void releaseWriteLock() {
        if (writeLock) {
            writeLock = false;
            return;
        }
        throw new DMLRuntimeException("Internal state error : Attempting to release write lock on a GPUObject, which was already released");
    }

    /**
     * Unconditionally drops all locks: resets the read-lock counter and clears the write lock.
     */
    public void resetReadWriteLock() {
        readLocks.reset();
        writeLock = false;
    }

    /**
     * Updates the timestamp according to {@code DMLScript.GPU_EVICTION_POLICY}:
     * LRU stores the current nano time, MRU its negation, LFU increments the value by one,
     * and MIN_EVICT / ALIGN_MEMORY leave it unchanged.
     *
     * @throws DMLRuntimeException for any other policy
     */
    private void updateReleaseLocks() {
        DMLScript.EvictionPolicy policy = DMLScript.GPU_EVICTION_POLICY;
        switch (policy) {
            case MIN_EVICT:
            case ALIGN_MEMORY:
                return;
            case LFU:
                timestamp.incrementAndGet();
                return;
            case LRU:
                timestamp.set(System.nanoTime());
                return;
            case MRU:
                timestamp.set(-System.nanoTime());
                return;
            default:
                throw new DMLRuntimeException("The eviction policy is not supported:" + policy.name());
        }
    }

    /**
     * Releases one read lock taken for an input and updates the eviction timestamp.
     *
     * @throws DMLRuntimeException if no read lock was held, or if the object is not allocated
     *         (checked after the lock has already been released)
     */
    public void releaseInput() {
        releaseReadLock();
        updateReleaseLocks();
        boolean missing = !isAllocated();
        if (missing) {
            throw new DMLRuntimeException("Attempting to release an input before allocating it");
        }
    }

    /**
     * Releases the write lock taken for an output, updates the eviction timestamp and marks
     * the device data dirty.
     *
     * @throws DMLRuntimeException if the write lock was not held, or if the object is not
     *         allocated (checked after the lock has already been released)
     */
    public void releaseOutput() {
        releaseWriteLock();
        updateReleaseLocks();
        dirty = true;
        boolean missing = !isAllocated();
        if (missing) {
            throw new DMLRuntimeException("Attempting to release an output before allocating it");
        }
    }

    /**
     * Allocates a dense block of {@code rows * cols} values on the device and installs it as
     * the dense pointer.
     *
     * @throws DMLRuntimeException if the object is already allocated or a dimension is not
     *         positive
     */
    void allocateDenseMatrixOnDevice() {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : allocateDenseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext());
        }
        if (isAllocated()) {
            throw new DMLRuntimeException("Internal error - trying to allocated dense matrix to a GPUObject that is already allocated");
        }
        long numRows = mat.getNumRows();
        long numCols = mat.getNumColumns();
        if (numRows <= 0L) {
            throw new DMLRuntimeException("Internal error - invalid number of rows when allocating dense matrix:" + numRows);
        }
        if (numCols <= 0L) {
            throw new DMLRuntimeException("Internal error - invalid number of columns when allocating dense matrix:" + numCols);
        }
        setDensePointer(allocate(getDatatypeSizeOf(numRows * numCols)));
    }

    /**
     * Allocates an empty CSR structure sized for the matrix's row and non-zero counts and
     * installs it as the sparse pointer.
     *
     * @throws DMLRuntimeException if the object is already allocated, the row count is not
     *         positive, or the non-zero count is negative
     */
    void allocateSparseMatrixOnDevice() {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : allocateSparseMatrixOnDevice, on " + this + ", GPUContext=" + getGPUContext());
        }
        if (isAllocated()) {
            throw new DMLRuntimeException("Internal error - trying to allocated sparse matrix to a GPUObject that is already allocated");
        }
        long numRows = mat.getNumRows();
        long nonZeros = mat.getNnz();
        if (numRows <= 0L) {
            throw new DMLRuntimeException("Internal error - invalid number of rows when allocating sparse matrix");
        }
        if (nonZeros < 0L) {
            throw new DMLRuntimeException("Internal error - invalid number of non zeroes when allocating a sparse matrix");
        }
        setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(getGPUContext(), nonZeros, numRows));
    }

    /**
     * Computes the device memory size for this matrix from its dimensions and non-zero count:
     * the CSR estimate when the matrix is in sparse format, the dense size otherwise.
     *
     * @return size in bytes
     */
    protected long getSizeOnDevice() {
        long numRows = mat.getNumRows();
        long numCols = mat.getNumColumns();
        long nonZeros = mat.getNnz();
        if (LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) {
            return CSRPointer.estimateSize(nonZeros, numRows);
        }
        return getDatatypeSizeOf(numRows * numCols);
    }

    /**
     * Copies the host matrix block to the device, as CSR when the host block is in sparse
     * format and as a dense array otherwise.
     *
     * <p>The host block is read-acquired for the duration of the copy and is released even
     * when the copy fails. Statistics are recorded according to the {@code DMLScript} flags.
     *
     * @param opcode instruction name used for fine-grained statistics
     * @throws DMLRuntimeException if the host block is in an inconsistent or unsupported state
     */
    void copyFromHostToDevice(String opcode) {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : copyFromHostToDevice, on " + this + ", GPUContext=" + this.getGPUContext());
        }
        long start = DMLScript.STATISTICS ? System.nanoTime() : 0L;
        long acqrTime = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0L;
        MatrixBlock tmp = (MatrixBlock)this.mat.acquireRead();
        try {
            if (DMLScript.FINEGRAINED_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(opcode, tmp.isInSparseFormat() ? "aqrs" : "aqrd", System.nanoTime() - acqrTime);
            }
            if (tmp.isInSparseFormat()) {
                this.copySparseBlockToDevice(tmp, opcode);
            } else {
                this.copyDenseBlockToDevice(tmp, opcode);
            }
        } finally {
            // Always give the read lock on the host block back, also on failure.
            this.mat.release();
        }
        if (DMLScript.STATISTICS) {
            GPUStatistics.cudaToDevTime.add(System.nanoTime() - start);
            GPUStatistics.cudaToDevCount.add(1L);
        }
    }

    /** Allocates a CSR structure on the device and fills it from a sparse host block. */
    private void copySparseBlockToDevice(MatrixBlock tmp, String opcode) {
        if (tmp.getNonZeros() < 0L) {
            tmp.recomputeNonZeros();
        }
        long nnz = tmp.getNonZeros();
        this.mat.getMatrixCharacteristics().setNonZeros(nnz);
        SparseBlock block = tmp.getSparseBlock();
        if (block == null && nnz != 0L) {
            throw new DMLRuntimeException("Expected CP sparse block to be not null.");
        }
        // A missing block with zero non-zeros means there is nothing to transfer.
        SparseBlockCSR csrBlock = block == null ? null : this.toCsrBlock(block);
        this.allocateSparseMatrixOnDevice();
        if (csrBlock == null) {
            return;
        }
        long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0L;
        CSRPointer.copyToDevice(this.getGPUContext(), this.getJcudaSparseMatrixPtr(), tmp.getNumRows(), tmp.getNonZeros(), csrBlock.rowPointers(), csrBlock.indexes(), csrBlock.values());
        if (DMLScript.FINEGRAINED_STATISTICS) {
            GPUStatistics.maintainCPMiscTimes(opcode, "H2D", System.nanoTime() - t1);
        }
    }

    /** Returns the block as CSR, converting from COO or MCSR (with conversion statistics) when needed. */
    private SparseBlockCSR toCsrBlock(SparseBlock block) {
        if (block instanceof SparseBlockCSR) {
            return (SparseBlockCSR)block;
        }
        boolean isCoo = block instanceof SparseBlockCOO;
        if (!isCoo && !(block instanceof SparseBlockMCSR)) {
            throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations");
        }
        long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0L;
        SparseBlockCSR csrBlock;
        if (isCoo) {
            SparseBlockCOO cooBlock = (SparseBlockCOO)block;
            csrBlock = new SparseBlockCSR(GPUObject.toIntExact(this.mat.getNumRows()), cooBlock.rowIndexes(), cooBlock.indexes(), cooBlock.values());
        } else {
            SparseBlockMCSR mcsrBlock = (SparseBlockMCSR)block;
            csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), GPUObject.toIntExact(mcsrBlock.size()));
        }
        if (DMLScript.STATISTICS) {
            GPUStatistics.cudaSparseConversionTime.add(System.nanoTime() - t0);
            GPUStatistics.cudaSparseConversionCount.increment();
        }
        return csrBlock;
    }

    /** Allocates a dense block on the device and fills it from a dense host block (memset for an all-zero block). */
    private void copyDenseBlockToDevice(MatrixBlock tmp, String opcode) {
        double[] data = tmp.getDenseBlockValues();
        if (data == null && tmp.getSparseBlock() != null) {
            throw new DMLRuntimeException("Incorrect sparsity calculation");
        }
        if (data == null && tmp.getNonZeros() != 0L) {
            throw new DMLRuntimeException("MatrixBlock is not allocated");
        }
        this.allocateDenseMatrixOnDevice();
        if (tmp.getNonZeros() == 0L) {
            long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0L;
            JCuda.cudaMemset(this.getDensePointer(), 0, GPUObject.getDatatypeSizeOf(this.mat.getNumRows() * this.mat.getNumColumns()));
            if (DMLScript.FINEGRAINED_STATISTICS) {
                GPUStatistics.maintainCPMiscTimes(opcode, "az", System.nanoTime() - t1);
            }
        } else {
            LibMatrixCUDA.cudaSupportFunctions.hostToDevice(this.getGPUContext(), data, this.getDensePointer(), opcode);
        }
    }

    /**
     * Converts a {@code long} to an {@code int}, refusing values outside the {@code int} range.
     *
     * @param l value to convert
     * @return {@code l} as an {@code int}
     * @throws DMLRuntimeException if {@code l} does not fit into an {@code int}
     */
    public static int toIntExact(long l) {
        int narrowed = (int)l;
        // The narrowing cast is lossless exactly when widening it back yields the input.
        if ((long)narrowed != l) {
            throw new DMLRuntimeException("Cannot be cast to int:" + l);
        }
        return narrowed;
    }

    /**
     * Copies the device data back into the host matrix object.
     *
     * <p>Shadow-buffered data is handled by the shadow buffer (nothing happens for an
     * eviction). Otherwise the dense or CSR device data is copied into a new
     * {@link MatrixBlock} that is installed via {@code acquireModify}; afterwards the dirty
     * flag is cleared. With {@code eagerDelete} the device data is cleared right after the copy.
     *
     * <p>NOTE(review): the shadow-buffer paths and the empty-sparse path return without
     * clearing the dirty flag - confirm this is intended.
     *
     * @param instName    instruction name used for statistics and memory bookkeeping
     * @param isEviction  whether the copy happens as part of an eviction
     * @param eagerDelete whether to clear the device data after copying
     * @throws DMLRuntimeException if the pointer state is inconsistent or a dimension does not
     *         fit into an {@code int}
     */
    protected void copyFromDeviceToHost(String instName, boolean isEviction, boolean eagerDelete) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", GPUContext=" + this.getGPUContext());
        }
        if (this.shadowBuffer.isBuffered()) {
            if (isEviction) {
                return;
            }
            this.shadowBuffer.moveToHost();
            return;
        }
        if (this.shadowBuffer.isEligibleForBuffering(isEviction, eagerDelete)) {
            this.shadowBuffer.moveFromDevice(instName);
            return;
        }
        if (this.isDensePointerNull() && this.getJcudaSparseMatrixPtr() == null) {
            throw new DMLRuntimeException("Cannot copy from device to host as JCuda dense/sparse pointer is not allocated");
        }
        if (!this.isDensePointerNull() && this.getJcudaSparseMatrixPtr() != null) {
            throw new DMLRuntimeException("Invalid state : JCuda dense/sparse pointer are both allocated");
        }
        if (this.getJcudaSparseMatrixPtr() != null && !LibMatrixCUDA.isInSparseFormat(this.getGPUContext(), this.mat)) {
            throw new DMLRuntimeException("Block not in sparse format on host yet the device sparse matrix pointer is not null");
        }
        if (this.getJcudaSparseMatrixPtr() != null && this.isSparseAndEmpty()) {
            // Nothing to transfer: install an empty block with the right dimensions.
            this.mat.acquireModify(new MatrixBlock(GPUObject.toIntExact(this.mat.getNumRows()), GPUObject.toIntExact(this.mat.getNumColumns()), 0L));
            this.mat.release();
            return;
        }
        MatrixBlock tmp = null;
        long start = DMLScript.STATISTICS ? System.nanoTime() : 0L;
        // Remember the format now: an eager delete below nulls the dense pointer.
        boolean wasDense = !this.isDensePointerNull();
        if (wasDense) {
            tmp = new MatrixBlock(GPUObject.toIntExact(this.mat.getNumRows()), GPUObject.toIntExact(this.mat.getNumColumns()), false);
            tmp.allocateDenseBlock();
            LibMatrixCUDA.cudaSupportFunctions.deviceToHost(this.getGPUContext(), this.getDensePointer(), tmp.getDenseBlockValues(), instName, isEviction);
            if (eagerDelete) {
                this.clearData(instName, true);
            }
            tmp.recomputeNonZeros();
        } else {
            int rows = GPUObject.toIntExact(this.mat.getNumRows());
            int cols = GPUObject.toIntExact(this.mat.getNumColumns());
            int nnz = GPUObject.toIntExact(this.getJcudaSparseMatrixPtr().nnz);
            double[] values = new double[nnz];
            LibMatrixCUDA.cudaSupportFunctions.deviceToHost(this.getGPUContext(), this.getJcudaSparseMatrixPtr().val, values, instName, isEviction);
            int[] rowPtr = new int[rows + 1];
            int[] colInd = new int[nnz];
            CSRPointer.copyPtrToHost(this.getJcudaSparseMatrixPtr(), rows, nnz, rowPtr, colInd);
            if (eagerDelete) {
                this.clearData(instName, true);
            }
            SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, colInd, values, nnz);
            tmp = new MatrixBlock(rows, cols, nnz, sparseBlock);
        }
        this.mat.acquireModify(tmp);
        this.mat.release();
        if (DMLScript.STATISTICS && !isEviction) {
            long totalTime = System.nanoTime() - start;
            // One transfer for dense data, three (values, row pointers, column indices) for CSR.
            int count = wasDense ? 1 : 3;
            GPUStatistics.cudaFromDevTime.add(totalTime);
            GPUStatistics.cudaFromDevCount.add(count);
        }
        this.dirty = false;
    }

    /**
     * Frees whatever device data this object holds, clears all pointers and the shadow
     * pointer, drops all locks and removes the object from the memory manager.
     *
     * @param opcode instruction name passed to the free helper for the dense pointer
     * @param eager  eager flag passed on to the free/deallocate calls
     * @throws DMLRuntimeException if freeing device memory fails
     */
    public void clearData(String opcode, boolean eager) throws DMLRuntimeException {
        if (LOG.isTraceEnabled()) {
            LOG.trace("GPU : clearData on " + this + ", GPUContext=" + getGPUContext());
        }
        boolean hasDense = !isDensePointerNull();
        if (hasDense) {
            getGPUContext().cudaFreeHelper(opcode, getDensePointer(), eager);
        }
        CSRPointer sparsePtr = getJcudaSparseMatrixPtr();
        if (sparsePtr != null) {
            sparsePtr.deallocate(eager);
        }
        clearDensePointer();
        shadowBuffer.clearShadowPointer();
        jcudaSparseMatrixPtr = null;
        resetReadWriteLock();
        getGPUContext().getMemoryManager().removeGPUObject(this);
    }

    /**
     * @return the sparse CSR pointer, or {@code null} if none is set
     */
    public CSRPointer getJcudaSparseMatrixPtr() {
        return jcudaSparseMatrixPtr;
    }

    /**
     * @return the dirty flag, which is set when the device data is acquired for modification
     *         or released as an output
     */
    public boolean isDirty() {
        return dirty;
    }

    /**
     * Builds a debug description with the dirty flag, lock state, sparse flag, matrix
     * dimensions and whichever device pointers are set.
     *
     * @return the description, e.g. {@code GPUObject{dirty=false, readLocks=0, ...}}
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("GPUObject{");
        // First field carries no separator; it used to render as "GPUObject{, dirty=...".
        sb.append("dirty=").append(this.dirty);
        sb.append(", readLocks=").append(this.readLocks.longValue());
        sb.append(", writeLock=").append(this.writeLock);
        sb.append(", sparse? ").append(this.isSparse);
        sb.append(", dims=[").append(this.mat.getNumRows()).append(",").append(this.mat.getNumColumns()).append("]");
        if (!this.isDensePointerNull()) {
            // Guarded by the null check, so getDensePointer() cannot trigger a shadow-buffer move.
            sb.append(", densePtr=").append(this.getDensePointer());
        }
        if (this.jcudaSparseMatrixPtr != null) {
            sb.append(", sparsePtr=").append(this.jcudaSparseMatrixPtr);
        }
        sb.append('}');
        return sb.toString();
    }
}

