package hadoopsters.spark.scala.monitoring.listeners

import org.apache.spark.streaming.kafka010.OffsetRange
import org.apache.spark.streaming.scheduler._
import org.joda.time.DateTime
/**
 * :: ExampleStreamingListener ::
 * A simple StreamingListener that accesses summary statistics across Spark Streaming batches;
 * extends the @DeveloperApi StreamingListener trait.
 *
 * @param exampleArg You can pass whatever you want to a listener!
 */
class ExampleStreamingListener(exampleArg: String) extends StreamingListener {

  // ====================
  // onBatch_ Methods
  // ====================

  /**
   * This method executes when a Spark Streaming batch completes.
   *
   * @param batchCompleted Class having information on the completed batch
   */
  override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = {

    println("I was passed to the listener: " + exampleArg)

    // write performance metrics somewhere
    writeStatsSomewhere(batchCompleted)

    // write offsets (state) somewhere, and numRecords per topic
    processTopicInfo(batchCompleted)
  }

  /**
   * This method executes when a Spark Streaming batch is submitted to the scheduler for execution.
   *
   * @param batchSubmitted Class having information on the submitted batch
   */
  override def onBatchSubmitted(batchSubmitted: StreamingListenerBatchSubmitted): Unit = {
  }

  /**
   * This method executes when a Spark Streaming batch starts.
   *
   * @param batchStarted Class having information on the started batch
   */
  override def onBatchStarted(batchStarted: StreamingListenerBatchStarted): Unit = {
  }

  // ====================
  // onReceiver_ Methods
  // ====================
  // Note: these fire only for receiver-based streams; the receiverless Kafka
  // 0.10 direct stream never triggers them.

  /**
   * This method executes when a Spark Streaming receiver has started.
   *
   * @param receiverStarted Class having information on the receiver (e.g. errors, executor ids, etc)
   */
  override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = {
  }

  /**
   * This method executes when a Spark Streaming receiver encounters an error.
   *
   * @param receiverError Class having information on the receiver (e.g. errors, executor ids, etc)
   */
  override def onReceiverError(receiverError: StreamingListenerReceiverError): Unit = {
  }

  /**
   * This method executes when a Spark Streaming receiver stops working.
   *
   * @param receiverStopped Class having information on the receiver (e.g. errors, executor ids, etc)
   */
  override def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped): Unit = {
  }

  // =======================================================================
  // Convenience Methods (for use in onBatch_ methods)
  // =======================================================================

  /**
   * Pulls and parses the key performance metrics of the Streaming app and logs them somewhere.
   * Processing Time: How many seconds were needed to complete this batch (i.e. duration).
   * Scheduling Delay: How many seconds the start time of this batch was delayed.
   * Num Records: The total number of input records from a live stream consumed this batch.
   *
   * @param batch Class having information on the completed batch
   */
  def writeStatsSomewhere(batch: StreamingListenerBatchCompleted): Unit = {

    // Processing time for this batch, in seconds
    val processingTime = batch.batchInfo.processingDelay.getOrElse(0L) / 1000

    // Scheduling delay for this batch, in seconds (floored at zero)
    val schedulingDelay = batch.batchInfo.schedulingDelay.filter(_ > 0).getOrElse(0L) / 1000

    // Total record count for this batch
    val numRecords = batch.batchInfo.numRecords

    // do something with `processingTime`
    // do something with `schedulingDelay`
    // do something with `numRecords`
    // (e.g. the hypothetical logStats sketch below)
  }
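
  /**
   * A purely illustrative sink for the metrics gathered above; this helper is
   * hypothetical and not part of any Spark API. Swap the println for a DB
   * write, a metrics emit, or whatever "somewhere" means in your deployment.
   */
  def logStats(processingTime: Long, schedulingDelay: Long, numRecords: Long): Unit = {
    println(s"[${new DateTime()}] batch completed: " +
      s"processingTime=${processingTime}s, schedulingDelay=${schedulingDelay}s, numRecords=$numRecords")
  }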

  /**
   * A combination method that will process a topic in a batch.
   *
   * @param batch Class having information on the completed batch
   */
  def processTopicInfo(batch: StreamingListenerBatchCompleted): Unit = {

    // for each stream topic consumed this batch...
    batch.batchInfo.streamIdToInputInfo.foreach(topic => {
      writeTopicOffsetsSomewhere(topic)
      writeTopicCountSomewhere(topic)
    })
  }

  // =======================================================================
  // Topic Methods (designed for use inside of convenience methods)
  // =======================================================================

  /**
   * Takes a topic object and writes the max offset for each partition it contains this batch somewhere.
   *
   * @param topic A topic object within a Batch's StreamIdToInputInfo
   */
  def writeTopicOffsetsSomewhere(topic: (Int, StreamInputInfo)): Unit = {

    // the Kafka 0.10 direct stream publishes its OffsetRanges in the input
    // info metadata under the key "offsets"
    val partitionOffsets = topic._2.metadata("offsets").asInstanceOf[List[OffsetRange]]

    // for every partition's range of offsets
    partitionOffsets.foreach(offsetRange => {

      // the highest offset consumed for this partition (untilOffset is exclusive);
      // write it to the state db as the new starting point for the partition
      val maxOffset = offsetRange.untilOffset - 1

      // do something with `offsetRange.topic`
      // do something with `offsetRange.partition`
      // do something with `offsetRange.count`
      // do something with `maxOffset`
    })
  }

  /**
   * Takes a topic object and writes the number of records for said topic this batch somewhere.
   *
   * @param topic A topic object within a Batch's StreamIdToInputInfo
   */
  def writeTopicCountSomewhere(topic: (Int, StreamInputInfo)): Unit = {

    // store the individual record count for this topic
    val numRecords = topic._2.numRecords

    // store topicName
    val topicName = topic._2.metadata("offsets").asInstanceOf[List[OffsetRange]].head.topic

    // write record count for this topic this batch
    // do something with `topicName` and `numRecords`
  }
}
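
/**
 * A minimal sketch of how this listener might be attached, assuming a driver
 * program that builds its own StreamingContext. The app name, batch interval,
 * and constructor argument are placeholders, not part of the listener itself.
 */
object ExampleStreamingApp {

  import org.apache.spark.SparkConf
  import org.apache.spark.streaming.{Seconds, StreamingContext}

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("listener-example")
    val ssc = new StreamingContext(conf, Seconds(30))

    // register the listener before starting the context
    ssc.addStreamingListener(new ExampleStreamingListener("hello, listener"))

    // ... create the Kafka direct stream and output operations here,
    // or start() will fail with "No output operations registered" ...

    ssc.start()
    ssc.awaitTermination()
  }
}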