Dounm · March 18, 2021 11:01
diff --git a/cu b/cu
 #include <stdio.h>

 #define NUM_STREAMS 4
 /*
  * memcpyUsingStreams - copy iBytes bytes from fSrc to fDest, splitting the
  * transfer into NUM_STREAMS chunks that are each issued with cudaMemcpyAsync
  * on their own stream.
  *
  * Every stream is synchronized before the function returns, so the copy is
  * complete (or has failed) when control comes back to the caller. If not all
  * NUM_STREAMS streams can be created, the whole buffer is copied with one
  * blocking cudaMemcpy instead.
  *
  * Parameters:
  *   fDest      - destination buffer
  *   fSrc       - source buffer
  *   iBytes     - size of the transfer in BYTES (not elements)
  *   eDirection - copy direction, forwarded unchanged to the CUDA copy calls
  *
  * Returns:
  *   cudaErrorInvalidValue if iBytes is negative; cudaSuccess if iBytes is 0;
  *   otherwise the first error reported by a copy/synchronize call, else the
  *   first error reported while destroying the streams, else cudaSuccess.
  */
 cudaError_t memcpyUsingStreams (float           *fDest,
                                 float           *fSrc,
                                 int             iBytes,
                                 cudaMemcpyKind  eDirection)
 {
    cudaStream_t    aCuStream[NUM_STREAMS] ;
    int             iCreated = 0 ;
    int             iIndex = 0 ;
    cudaError_t     cuError = cudaSuccess ;
    cudaError_t     cuStatus = cudaSuccess ;
    /* Byte pointers: all offsets below are expressed in bytes, so the
       arithmetic must not be scaled by sizeof(float). */
    char            *pDest = (char *) fDest ;
    char            *pSrc = (char *) fSrc ;
    size_t          uTotal = 0 ;
    size_t          uChunk = 0 ;
    size_t          uOffset = 0 ;
    size_t          uCount = 0 ;

    if (iBytes < 0)
    {
        return cudaErrorInvalidValue ;
    }
    if (iBytes == 0)
    {
        return cudaSuccess ;
    }

    uTotal = (size_t) iBytes ;
    /* Per-stream chunk, rounded down to a whole number of floats so chunk
       boundaries stay element-aligned. The last stream takes the remainder. */
    uChunk = ((uTotal / NUM_STREAMS) / sizeof (float)) * sizeof (float) ;

    /* Create the streams, stopping at the first failure so that only valid
       handles are ever used or destroyed. */
    for (iCreated = 0 ; iCreated < NUM_STREAMS ; iCreated++)
    {
        cuStatus = cudaStreamCreate (&aCuStream[iCreated]) ;
        if (cuStatus != cudaSuccess)
        {
            break ;
        }
    }

    if (iCreated < NUM_STREAMS)
    {
        /* Could not get every stream: fall back to one blocking copy. */
        cuError = cudaMemcpy (fDest, fSrc, uTotal, eDirection) ;
    }
    else
    {
        for (iIndex = 0 ; iIndex < NUM_STREAMS ; iIndex++)
        {
            /* Offset is derived from the index each time; it must not be
               accumulated into itself (index 0 would zero it for good). */
            uOffset = (size_t) iIndex * uChunk ;
            uCount = (iIndex == NUM_STREAMS - 1) ? (uTotal - uOffset) : uChunk ;
            if (uCount == 0)
            {
                continue ;
            }
            cuStatus = cudaMemcpyAsync (pDest + uOffset, pSrc + uOffset, uCount,
                                        eDirection, aCuStream[iIndex]) ;
            if (cuError == cudaSuccess)
            {
                cuError = cuStatus ;
            }
        }

        /* Wait for every chunk before tearing the streams down, so the
           caller can safely use or free the buffers on return. */
        for (iIndex = 0 ; iIndex < NUM_STREAMS ; iIndex++)
        {
            cuStatus = cudaStreamSynchronize (aCuStream[iIndex]) ;
            if (cuError == cudaSuccess)
            {
                cuError = cuStatus ;
            }
        }
    }

    /* Destroy only the streams that were actually created; keep the first
       error seen rather than letting a later success overwrite it. */
    for (iIndex = 0 ; iIndex < iCreated ; iIndex++)
    {
        cuStatus = cudaStreamDestroy (aCuStream[iIndex]) ;
        if (cuError == cudaSuccess)
        {
            cuError = cuStatus ;
        }
    }
    return cuError ;
 }


 /*
  * Demo driver: allocates a pinned host buffer and a device buffer of
  * 256*256*256*16 floats, fills the first 256*256*256 host elements with
  * their own index, and copies the whole buffer host-to-device through
  * memcpyUsingStreams.
  *
  * Returns 0 on success, 1 if any CUDA call reports an error.
  */
 int main()
 {
    float *hdata = NULL ;
    float *ddata = NULL ;
    int i, j, k, index ;
    int status = 0 ;
    cudaError_t err = cudaSuccess ;

    /* Element count (number of floats), and the matching size in bytes.
       The byte size is 2^30, which still fits the int parameter of
       memcpyUsingStreams. */
    const int numElems = 256 * 256 * 256 * 16;
    const size_t numBytes = sizeof (float) * numElems ;

    err = cudaMallocHost ((void **) &hdata, numBytes) ;
    if (err != cudaSuccess)
    {
        fprintf (stderr, "cudaMallocHost failed: %s\n", cudaGetErrorString (err)) ;
        return 1 ;
    }

    err = cudaMalloc ((void **) &ddata, numBytes) ;
    if (err != cudaSuccess)
    {
        fprintf (stderr, "cudaMalloc failed: %s\n", cudaGetErrorString (err)) ;
        cudaFreeHost (hdata) ;
        return 1 ;
    }

    /* NOTE: only the first 256*256*256 elements are initialized; the rest
       of the host buffer is copied with whatever contents it already has. */
    for (i=0 ; i< 256 ; i++)
    {
        for (j=0; j< 256; j++)
        {
            for (k=0; k< 256 ; k++)
            {
                index = (((i * 256) + j) * 256) + k;
                hdata [index] = index ;
            }
        }
    }

    err = memcpyUsingStreams (ddata, hdata, (int) numBytes,  cudaMemcpyHostToDevice) ;
    if (err != cudaSuccess)
    {
        fprintf (stderr, "memcpyUsingStreams failed: %s\n", cudaGetErrorString (err)) ;
        status = 1 ;
    }

    cudaFree (ddata) ;
    cudaFreeHost (hdata) ;

    return status;
 }
	#include <stdio.h>

	#define NUM_STREAMS 4
	/*
	 * memcpyUsingStreams - copy iBytes bytes from fSrc to fDest, splitting the
	 * transfer into NUM_STREAMS chunks that are each issued with cudaMemcpyAsync
	 * on their own stream.
	 *
	 * Every stream is synchronized before the function returns, so the copy is
	 * complete (or has failed) when control comes back to the caller. If not all
	 * NUM_STREAMS streams can be created, the whole buffer is copied with one
	 * blocking cudaMemcpy instead.
	 *
	 * Parameters:
	 *   fDest      - destination buffer
	 *   fSrc       - source buffer
	 *   iBytes     - size of the transfer in BYTES (not elements)
	 *   eDirection - copy direction, forwarded unchanged to the CUDA copy calls
	 *
	 * Returns:
	 *   cudaErrorInvalidValue if iBytes is negative; cudaSuccess if iBytes is 0;
	 *   otherwise the first error reported by a copy/synchronize call, else the
	 *   first error reported while destroying the streams, else cudaSuccess.
	 */
	cudaError_t memcpyUsingStreams (float *fDest,
	                                float *fSrc,
	                                int iBytes,
	                                cudaMemcpyKind eDirection)
	{
	    cudaStream_t aCuStream[NUM_STREAMS] ;
	    int iCreated = 0 ;
	    int iIndex = 0 ;
	    cudaError_t cuError = cudaSuccess ;
	    cudaError_t cuStatus = cudaSuccess ;
	    /* Byte pointers: all offsets below are expressed in bytes, so the
	       arithmetic must not be scaled by sizeof(float). */
	    char *pDest = (char *) fDest ;
	    char *pSrc = (char *) fSrc ;
	    size_t uTotal = 0 ;
	    size_t uChunk = 0 ;
	    size_t uOffset = 0 ;
	    size_t uCount = 0 ;

	    if (iBytes < 0)
	    {
	        return cudaErrorInvalidValue ;
	    }
	    if (iBytes == 0)
	    {
	        return cudaSuccess ;
	    }

	    uTotal = (size_t) iBytes ;
	    /* Per-stream chunk, rounded down to a whole number of floats so chunk
	       boundaries stay element-aligned. The last stream takes the remainder. */
	    uChunk = ((uTotal / NUM_STREAMS) / sizeof (float)) * sizeof (float) ;

	    /* Create the streams, stopping at the first failure so that only valid
	       handles are ever used or destroyed. */
	    for (iCreated = 0 ; iCreated < NUM_STREAMS ; iCreated++)
	    {
	        cuStatus = cudaStreamCreate (&aCuStream[iCreated]) ;
	        if (cuStatus != cudaSuccess)
	        {
	            break ;
	        }
	    }

	    if (iCreated < NUM_STREAMS)
	    {
	        /* Could not get every stream: fall back to one blocking copy. */
	        cuError = cudaMemcpy (fDest, fSrc, uTotal, eDirection) ;
	    }
	    else
	    {
	        for (iIndex = 0 ; iIndex < NUM_STREAMS ; iIndex++)
	        {
	            /* Offset is derived from the index each time; it must not be
	               accumulated into itself (index 0 would zero it for good). */
	            uOffset = (size_t) iIndex * uChunk ;
	            uCount = (iIndex == NUM_STREAMS - 1) ? (uTotal - uOffset) : uChunk ;
	            if (uCount == 0)
	            {
	                continue ;
	            }
	            cuStatus = cudaMemcpyAsync (pDest + uOffset, pSrc + uOffset, uCount,
	                                        eDirection, aCuStream[iIndex]) ;
	            if (cuError == cudaSuccess)
	            {
	                cuError = cuStatus ;
	            }
	        }

	        /* Wait for every chunk before tearing the streams down, so the
	           caller can safely use or free the buffers on return. */
	        for (iIndex = 0 ; iIndex < NUM_STREAMS ; iIndex++)
	        {
	            cuStatus = cudaStreamSynchronize (aCuStream[iIndex]) ;
	            if (cuError == cudaSuccess)
	            {
	                cuError = cuStatus ;
	            }
	        }
	    }

	    /* Destroy only the streams that were actually created; keep the first
	       error seen rather than letting a later success overwrite it. */
	    for (iIndex = 0 ; iIndex < iCreated ; iIndex++)
	    {
	        cuStatus = cudaStreamDestroy (aCuStream[iIndex]) ;
	        if (cuError == cudaSuccess)
	        {
	            cuError = cuStatus ;
	        }
	    }
	    return cuError ;
	}


	/*
	 * Demo driver: allocates a pinned host buffer and a device buffer of
	 * 256*256*256*16 floats, fills the first 256*256*256 host elements with
	 * their own index, and copies the whole buffer host-to-device through
	 * memcpyUsingStreams.
	 *
	 * Returns 0 on success, 1 if any CUDA call reports an error.
	 */
	int main()
	{
	    float *hdata = NULL ;
	    float *ddata = NULL ;
	    int i, j, k, index ;
	    int status = 0 ;
	    cudaError_t err = cudaSuccess ;

	    /* Element count (number of floats), and the matching size in bytes.
	       The byte size is 2^30, which still fits the int parameter of
	       memcpyUsingStreams. */
	    const int numElems = 256 * 256 * 256 * 16;
	    const size_t numBytes = sizeof (float) * numElems ;

	    err = cudaMallocHost ((void **) &hdata, numBytes) ;
	    if (err != cudaSuccess)
	    {
	        fprintf (stderr, "cudaMallocHost failed: %s\n", cudaGetErrorString (err)) ;
	        return 1 ;
	    }

	    err = cudaMalloc ((void **) &ddata, numBytes) ;
	    if (err != cudaSuccess)
	    {
	        fprintf (stderr, "cudaMalloc failed: %s\n", cudaGetErrorString (err)) ;
	        cudaFreeHost (hdata) ;
	        return 1 ;
	    }

	    /* NOTE: only the first 256*256*256 elements are initialized; the rest
	       of the host buffer is copied with whatever contents it already has. */
	    for (i=0 ; i< 256 ; i++)
	    {
	        for (j=0; j< 256; j++)
	        {
	            for (k=0; k< 256 ; k++)
	            {
	                index = (((i * 256) + j) * 256) + k;
	                hdata [index] = index ;
	            }
	        }
	    }

	    err = memcpyUsingStreams (ddata, hdata, (int) numBytes, cudaMemcpyHostToDevice) ;
	    if (err != cudaSuccess)
	    {
	        fprintf (stderr, "memcpyUsingStreams failed: %s\n", cudaGetErrorString (err)) ;
	        status = 1 ;
	    }

	    cudaFree (ddata) ;
	    cudaFreeHost (hdata) ;

	    return status;
	}