It is good practice to check for CUDA API errors when calling cudaMalloc() and other functions. It also helps to track down intermittent bugs caused by the hardware (running out of memory, etc.). Below is an adapted version of CudaSafeCall that I found on the Internet many weeks ago. Simply remove #define CUDA_ERROR_CHECK in production if the check is not needed.

#include <cstdlib>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>


// Compile-time switch for the error check below. Remove (or comment out) this
// definition to compile the check out; the wrapped CUDA call is still executed.
#define CUDA_ERROR_CHECK

// Wraps a CUDA runtime call so that a failure is reported together with the
// source file and line of the call site.
#define CudaSafeCall(error) __cudaSafeCall(error, __FILE__, __LINE__)

/**
 * Terminates the program if a CUDA runtime call reported an error.
 *
 * When CUDA_ERROR_CHECK is defined and `error` is not cudaSuccess, a message
 * containing the call site and the text from cudaGetErrorString() is written
 * to stderr and the process exits with status -1. When CUDA_ERROR_CHECK is
 * not defined the function does nothing.
 *
 * NOTE(review): identifiers beginning with a double underscore are reserved
 * for the implementation in C++; the name is kept so existing callers of the
 * macro and the function keep working.
 *
 * @param error  status code returned by the wrapped CUDA runtime call
 * @param file   source file of the call site (supplied by CudaSafeCall)
 * @param line   source line of the call site (supplied by CudaSafeCall)
 */
inline void __cudaSafeCall(cudaError_t error, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
    if (error != cudaSuccess) {
        // Diagnostics belong on stderr so they are not mixed into (or lost
        // with) redirected program output.
        std::cerr << "error: CudaSafeCall() failed at " << file
                  << ":" << line
                  << " with \"" << cudaGetErrorString(error) << "\""
                  << std::endl;
        std::exit(-1);
    }
#else
    // Checking is compiled out: silence unused-parameter warnings.
    (void)error;
    (void)file;
    (void)line;
#endif
}

/**
 * Example program: requests a device allocation through CudaSafeCall and
 * releases it again.
 *
 * @param argc  unused
 * @param argv  unused
 * @return 0 when both CUDA calls succeed (CudaSafeCall exits the process
 *         itself on failure while error checking is enabled)
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    // Start from a null pointer so the cudaFree() below is harmless even if
    // the allocation fails while error checking is compiled out.
    float *d_array = NULL;
    const size_t N = 1024;

    // The size argument is a byte count: N*N*N*N is 2^40 bytes (1 TiB) where
    // size_t is 64 bits wide.
    // NOTE(review): presumably chosen to exceed device memory so that the
    // error report is demonstrated -- confirm.
    CudaSafeCall(cudaMalloc((void **)&d_array, N*N*N*N));

    // Release the allocation on the success path instead of leaking it.
    CudaSafeCall(cudaFree(d_array));
    return 0;
}