Actual source code: cudavecimpl.h
 
   petsc-3.7.7 2017-09-25
   
  4: #if defined(__CUDACC__)
  6: #include <petsccuda.h>
  7: #include <petsc/private/vecimpl.h>
  9: #include <cublas_v2.h>
 11: #define WaitForGPU() PetscCUDASynchronize ? cudaThreadSynchronize() : 0
 13: struct Vec_CUDA {
 14:   PetscScalar  *GPUarray;      /* this always holds the GPU data */
 15:   cudaStream_t stream;        /* A stream for doing asynchronous data transfers */
 16:   PetscBool    hostDataRegisteredAsPageLocked;
 17: };
 19: #endif
 21: #include <cuda_runtime.h>
 23: PETSC_INTERN PetscErrorCode VecDotNorm2_SeqCUDA(Vec,Vec,PetscScalar*, PetscScalar*);
 24: PETSC_INTERN PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec,Vec,Vec);
 25: PETSC_INTERN PetscErrorCode VecWAXPY_SeqCUDA(Vec,PetscScalar,Vec,Vec);
 26: PETSC_INTERN PetscErrorCode VecMDot_SeqCUDA(Vec,PetscInt,const Vec[],PetscScalar*);
 27: PETSC_INTERN PetscErrorCode VecSet_SeqCUDA(Vec,PetscScalar);
 28: PETSC_INTERN PetscErrorCode VecMAXPY_SeqCUDA(Vec,PetscInt,const PetscScalar*,Vec*);
 29: PETSC_INTERN PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec,PetscScalar,PetscScalar,PetscScalar,Vec,Vec);
 30: PETSC_INTERN PetscErrorCode VecPointwiseMult_SeqCUDA(Vec,Vec,Vec);
 31: PETSC_INTERN PetscErrorCode VecPlaceArray_SeqCUDA(Vec,const PetscScalar*);
 32: PETSC_INTERN PetscErrorCode VecResetArray_SeqCUDA(Vec);
 33: PETSC_INTERN PetscErrorCode VecReplaceArray_SeqCUDA(Vec,const PetscScalar*);
 34: PETSC_INTERN PetscErrorCode VecDot_SeqCUDA(Vec,Vec,PetscScalar*);
 35: PETSC_INTERN PetscErrorCode VecTDot_SeqCUDA(Vec,Vec,PetscScalar*);
 36: PETSC_INTERN PetscErrorCode VecScale_SeqCUDA(Vec,PetscScalar);
 37: PETSC_EXTERN PetscErrorCode VecCopy_SeqCUDA(Vec,Vec);
 38: PETSC_INTERN PetscErrorCode VecSwap_SeqCUDA(Vec,Vec);
 39: PETSC_INTERN PetscErrorCode VecAXPY_SeqCUDA(Vec,PetscScalar,Vec);
 40: PETSC_INTERN PetscErrorCode VecAXPBY_SeqCUDA(Vec,PetscScalar,PetscScalar,Vec);
 41: PETSC_INTERN PetscErrorCode VecDuplicate_SeqCUDA(Vec,Vec*);
 42: PETSC_INTERN PetscErrorCode VecConjugate_SeqCUDA(Vec xin);
 43: PETSC_INTERN PetscErrorCode VecNorm_SeqCUDA(Vec,NormType,PetscReal*);
 44: PETSC_INTERN PetscErrorCode VecCUDACopyToGPU(Vec);
 45: PETSC_INTERN PetscErrorCode VecCUDAAllocateCheck(Vec);
 46: PETSC_EXTERN PetscErrorCode VecCreate_SeqCUDA(Vec);
 47: PETSC_INTERN PetscErrorCode VecDestroy_SeqCUDA(Vec);
 48: PETSC_INTERN PetscErrorCode VecAYPX_SeqCUDA(Vec,PetscScalar,Vec);
 49: PETSC_INTERN PetscErrorCode VecSetRandom_SeqCUDA(Vec,PetscRandom);
 50: PETSC_INTERN PetscErrorCode VecGetLocalVector_SeqCUDA(Vec,Vec);
 51: PETSC_INTERN PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec,Vec);
 52: PETSC_INTERN PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin);
 53: PETSC_INTERN PetscErrorCode VecSetRandom_SeqCUDA_Private(Vec xin,PetscRandom r);
 54: PETSC_INTERN PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v);
 55: PETSC_INTERN PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin);
 56: PETSC_INTERN PetscErrorCode VecCUDACopyToGPU_Public(Vec);
 57: PETSC_INTERN PetscErrorCode VecCUDAAllocateCheck_Public(Vec);
 58: PETSC_INTERN PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci);
 59: PETSC_INTERN PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci);
 61: PETSC_INTERN PetscErrorCode VecScatterCUDAIndicesCreate_PtoP(PetscInt, PetscInt*,PetscInt, PetscInt*,PetscCUDAIndices*);
 62: PETSC_INTERN PetscErrorCode VecScatterCUDAIndicesCreate_StoS(PetscInt,PetscInt,PetscInt,PetscInt,PetscInt,PetscInt*,PetscInt*,PetscCUDAIndices*);
 63: PETSC_INTERN PetscErrorCode VecScatterCUDAIndicesDestroy(PetscCUDAIndices*);
 64: PETSC_INTERN PetscErrorCode VecScatterCUDA_StoS(Vec,Vec,PetscCUDAIndices,InsertMode,ScatterMode);
 66: typedef enum {VEC_SCATTER_CUDA_STOS, VEC_SCATTER_CUDA_PTOP} VecCUDAScatterType;
 67: typedef enum {VEC_SCATTER_CUDA_GENERAL, VEC_SCATTER_CUDA_STRIDED} VecCUDASequentialScatterMode;
 69: struct  _p_VecScatterCUDAIndices_PtoP {
 70:   PetscInt ns;
 71:   PetscInt sendLowestIndex;
 72:   PetscInt nr;
 73:   PetscInt recvLowestIndex;
 74: };
 76: struct  _p_VecScatterCUDAIndices_StoS {
 77:   /* from indices data */
 78:   PetscInt *fslots;
 79:   PetscInt fromFirst;
 80:   PetscInt fromStep;
 81:   VecCUDASequentialScatterMode fromMode;
 83:   /* to indices data */
 84:   PetscInt *tslots;
 85:   PetscInt toFirst;
 86:   PetscInt toStep;
 87:   VecCUDASequentialScatterMode toMode;
 89:   PetscInt n;
 90:   PetscInt MAX_BLOCKS;
 91:   PetscInt MAX_CORESIDENT_THREADS;
 92:   cudaStream_t stream;
 93: };
 95: struct  _p_PetscCUDAIndices {
 96:   void * scatter;
 97:   VecCUDAScatterType scatterType;
 98: };
100: /* complex single */
101: #if defined(PETSC_USE_COMPLEX)
102: #if defined(PETSC_USE_REAL_SINGLE)
103: #define cublasXaxpy(a,b,c,d,e,f,g) cublasCaxpy((a),(b),(cuComplex*)(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g))
104: #define cublasXscal(a,b,c,d,e)     cublasCscal((a),(b),(cuComplex*)(c),(cuComplex*)(d),(e))
105: #define cublasXdotu(a,b,c,d,e,f,g) cublasCdotu((a),(b),(cuComplex*)(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g))
106: #define cublasXdot(a,b,c,d,e,f,g)  cublasCdotc((a),(b),(cuComplex*)(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g))
107: #define cublasXswap(a,b,c,d,e,f)   cublasCswap((a),(b),(cuComplex*)(c),(d),(cuComplex*)(e),(f))
108: #define cublasXnrm2(a,b,c,d,e)     cublasScnrm2((a),(b),(cuComplex*)(c),(d),(e))
109: #define cublasIXamax(a,b,c,d,e)    cublasIcamax((a),(b),(cuComplex*)(c),(d),(e))
110: #define cublasXasum(a,b,c,d,e)     cublasScasum((a),(b),(cuComplex*)(c),(d),(e))
111: #else /* complex double */
112: #define cublasXaxpy(a,b,c,d,e,f,g) cublasZaxpy((a),(b),(cuDoubleComplex*)(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g))
113: #define cublasXscal(a,b,c,d,e)     cublasZscal((a),(b),(cuDoubleComplex*)(c),(cuDoubleComplex*)(d),(e))
114: #define cublasXdotu(a,b,c,d,e,f,g) cublasZdotu((a),(b),(cuDoubleComplex*)(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g))
115: #define cublasXdot(a,b,c,d,e,f,g)  cublasZdotc((a),(b),(cuDoubleComplex*)(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g))
116: #define cublasXswap(a,b,c,d,e,f)   cublasZswap((a),(b),(cuDoubleComplex*)(c),(d),(cuDoubleComplex*)(e),(f))
117: #define cublasXnrm2(a,b,c,d,e)     cublasDznrm2((a),(b),(cuDoubleComplex*)(c),(d),(e))
118: #define cublasIXamax(a,b,c,d,e)    cublasIzamax((a),(b),(cuDoubleComplex*)(c),(d),(e))
119: #define cublasXasum(a,b,c,d,e)     cublasDzasum((a),(b),(cuDoubleComplex*)(c),(d),(e))
120: #endif
121: #else /* real single */
122: #if defined(PETSC_USE_REAL_SINGLE)
123: #define cublasXaxpy  cublasSaxpy
124: #define cublasXscal  cublasSscal
125: #define cublasXdotu  cublasSdot
126: #define cublasXdot   cublasSdot
127: #define cublasXswap  cublasSswap
128: #define cublasXnrm2  cublasSnrm2
129: #define cublasIXamax cublasIsamax
130: #define cublasXasum  cublasSasum
131: #else /* real double */
132: #define cublasXaxpy  cublasDaxpy
133: #define cublasXscal  cublasDscal
134: #define cublasXdotu  cublasDdot
135: #define cublasXdot   cublasDdot
136: #define cublasXswap  cublasDswap
137: #define cublasXnrm2  cublasDnrm2
138: #define cublasIXamax cublasIdamax
139: #define cublasXasum  cublasDasum
140: #endif
141: #endif
143: #endif