Skip to content

Instantly share code, notes, and snippets.

@fac2003
Created July 11, 2016 17:56
Show Gist options
  • Save fac2003/0c58131f97afcad22954deea0f3b29f5 to your computer and use it in GitHub Desktop.
[Stage 4:===> (2 + 30) / 32]CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:2831 code=77(<unknown>) "cudaStreamSynchronize(*stream)"
CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:3921 code=77(<unknown>) "cudaStreamSynchronize(*pStream)"
CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:3921 code=77(<unknown>) "cudaStreamSynchronize(*pStream)"
CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:3947 code=77(<unknown>) "result"
CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:3947 code=77(<unknown>) "result"
CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:3921 code=77(<unknown>) "cudaStreamSynchronize(*pStream)"
CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:3947 code=77(<unknown>) "result"
CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:3921 code=77(<unknown>) "cudaStreamSynchronize(*pStream)"
CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:3947 code=77(<unknown>) "result"
CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:3921 code=77(<unknown>) "cudaStreamSynchronize(*pStream)"
CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:3947 code=77(<unknown>) "result"
CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:2831 code=77(<unknown>) "cudaStreamSynchronize(*stream)"
CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:2831 code=77(<unknown>) "cudaStreamSynchronize(*stream)"
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 16 in stage 4.0 failed 1 times, most recent failure: Lost task 16.0 in stage 4.0 (TID 144, localhost): java.lang.IllegalStateException: MemcpyAsync relocate H2D failed: [36012556288] -> [180810547200]
at org.nd4j.jita.handler.impl.CudaZeroHandler.relocate(CudaZeroHandler.java:366)
at org.nd4j.jita.handler.impl.CudaZeroHandler.getDevicePointer(CudaZeroHandler.java:733)
at org.nd4j.jita.allocator.impl.AtomicAllocator.getPointer(AtomicAllocator.java:256)
at org.nd4j.linalg.jcublas.ops.executioner.JCudaExecutioner.invoke(JCudaExecutioner.java:1001)
at org.nd4j.linalg.jcublas.ops.executioner.JCudaExecutioner.exec(JCudaExecutioner.java:552)
at org.nd4j.linalg.api.ndarray.BaseNDArray.assign(BaseNDArray.java:2943)
at org.nd4j.linalg.factory.BaseNDArrayFactory.ones(BaseNDArrayFactory.java:1247)
at org.nd4j.linalg.factory.BaseNDArrayFactory.ones(BaseNDArrayFactory.java:972)
at org.nd4j.linalg.factory.Nd4j.ones(Nd4j.java:4504)
at org.deeplearning4j.nn.multilayer.MultiLayerNetwork.initMask(MultiLayerNetwork.java:1879)
at org.deeplearning4j.nn.multilayer.MultiLayerNetwork.init(MultiLayerNetwork.java:416)
at org.deeplearning4j.spark.impl.paramavg.ParameterAveragingTrainingWorker.getInitialModel(ParameterAveragingTrainingWorker.java:44)
at org.deeplearning4j.spark.api.worker.ExecuteWorkerFlatMap.call(ExecuteWorkerFlatMap.java:68)
at org.deeplearning4j.spark.api.worker.ExecuteWorkerFlatMap.call(ExecuteWorkerFlatMap.java:26)
at org.apache.spark.api.java.JavaRDDLike$$anonfun$fn$4$1.apply(JavaRDDLike.scala:156)
at org.apache.spark.api.java.JavaRDDLike$$anonfun$fn$4$1.apply(JavaRDDLike.scala:156)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:706)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:706)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:297)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:264)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1280)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1268)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1267)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1267)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1493)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1455)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1444)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1813)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1933)
at org.apache.spark.rdd.RDD$$anonfun$aggregate$1.apply(RDD.scala:1078)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
at org.apache.spark.rdd.RDD.aggregate(RDD.scala:1071)
at org.apache.spark.api.java.JavaRDDLike$class.aggregate(JavaRDDLike.scala:416)
at org.apache.spark.api.java.AbstractJavaRDDLike.aggregate(JavaRDDLike.scala:47)
at org.deeplearning4j.spark.impl.paramavg.ParameterAveragingTrainingMaster.processResults(ParameterAveragingTrainingMaster.java:407)
at org.deeplearning4j.spark.impl.paramavg.ParameterAveragingTrainingMaster.doIteration(ParameterAveragingTrainingMaster.java:339)
at org.deeplearning4j.spark.impl.paramavg.ParameterAveragingTrainingMaster.executeTraining(ParameterAveragingTrainingMaster.java:181)
at org.deeplearning4j.spark.impl.multilayer.SparkDl4jMultiLayer.fit(SparkDl4jMultiLayer.java:201)
at org.deeplearning4j.examples.rnn.GravesLSTMCharModellingExample.main(GravesLSTMCharModellingExample.java:116)
Caused by: java.lang.IllegalStateException: MemcpyAsync relocate H2D failed: [36012556288] -> [180810547200]
at org.nd4j.jita.handler.impl.CudaZeroHandler.relocate(CudaZeroHandler.java:366)
at org.nd4j.jita.handler.impl.CudaZeroHandler.getDevicePointer(CudaZeroHandler.java:733)
at org.nd4j.jita.allocator.impl.AtomicAllocator.getPointer(AtomicAllocator.java:256)
at org.nd4j.linalg.jcublas.ops.executioner.JCudaExecutioner.invoke(JCudaExecutioner.java:1001)
at org.nd4j.linalg.jcublas.ops.executioner.JCudaExecutioner.exec(JCudaExecutioner.java:552)
at org.nd4j.linalg.api.ndarray.BaseNDArray.assign(BaseNDArray.java:2943)
at org.nd4j.linalg.factory.BaseNDArrayFactory.ones(BaseNDArrayFactory.java:1247)
at org.nd4j.linalg.factory.BaseNDArrayFactory.ones(BaseNDArrayFactory.java:972)
at org.nd4j.linalg.factory.Nd4j.ones(Nd4j.java:4504)
at org.deeplearning4j.nn.multilayer.MultiLayerNetwork.initMask(MultiLayerNetwork.java:1879)
at org.deeplearning4j.nn.multilayer.MultiLayerNetwork.init(MultiLayerNetwork.java:416)
at org.deeplearning4j.spark.impl.paramavg.ParameterAveragingTrainingWorker.getInitialModel(ParameterAveragingTrainingWorker.java:44)
at org.deeplearning4j.spark.api.worker.ExecuteWorkerFlatMap.call(ExecuteWorkerFlatMap.java:68)
at org.deeplearning4j.spark.api.worker.ExecuteWorkerFlatMap.call(ExecuteWorkerFlatMap.java:26)
at org.apache.spark.api.java.JavaRDDLike$$anonfun$fn$4$1.apply(JavaRDDLike.scala:156)
at org.apache.spark.api.java.JavaRDDLike$$anonfun$fn$4$1.apply(JavaRDDLike.scala:156)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:706)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:706)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:297)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:264)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:2831 code=77(<unknown>) "cudaStreamSynchronize(*stream)"
CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:3921 code=77(<unknown>) "cudaStreamSynchronize(*pStream)"
CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:3947 code=77(<unknown>) "result"
CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:3921 code=77(<unknown>) "cudaStreamSynchronize(*pStream)"
CUDA error at /skymind/libnd4j/blas/cuda/NativeOps.cu:3947 code=77(<unknown>) "result"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment