@charmby
Forked from atamborrino/main.scala
Created April 25, 2017 14:15
Spark Streaming Kafka at-least-once with manual offset commit in Zookeeper (i.e. not using Spark Streaming checkpoints, which may not be recoverable after code changes)
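The code targets the old direct-stream API from spark-streaming-kafka (Kafka 0.8) on Spark 1.x, plus Curator for ZooKeeper access, Scalactic for the Or/Good/Bad error handling, and Play JSON for serializing offsets. A build.sbt along these lines should be enough to compile it; the versions below are assumptions, and com.samsung.sami.common (Config, Curator, the error types and ErrorHelper) is the application's own library, not a public artifact.

// Hypothetical build.sbt sketch -- versions are assumptions, align them with your cluster.
libraryDependencies ++= Seq(
  "org.apache.spark"   %% "spark-streaming"       % "1.6.3" % "provided",
  "org.apache.spark"   %% "spark-streaming-kafka" % "1.6.3",
  "org.apache.curator"  % "curator-framework"     % "2.11.1",
  "org.scalactic"      %% "scalactic"             % "2.2.6",
  "com.typesafe.play"  %% "play-json"             % "2.4.8"
)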
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder

// Config, ZKError, IrisError, DeserializationError and ErrorHelper referenced below
// come from the application's own common library (same place as Curator).
import com.samsung.sami.common.Curator
import org.apache.curator.framework.CuratorFramework
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.{Logging, SparkConf}
import org.apache.zookeeper.KeeperException.NodeExistsException
import org.scalactic.{Bad, Good, One, Or}

import scala.util.control.NonFatal
object App extends Logging {

  def main(args: Array[String]): Unit = {
    try {
      log.info("Init Iris job")
      Curator.initialize()
      val conf = new Config(Curator.getInstance())

      val sparkConf = makeSparkConf(conf.getMaxRatePerPartition())
      val ssc = new StreamingContext(sparkConf, Seconds(conf.getSparkBatchIntervalInSecond()))

      val kafkaBrokerList = conf.getKafkaBrokerList()
        .getOrElse(throw new RuntimeException("Cannot get KafkaBrokerList from ZK"))
      val kafkaParams = Map(
        "metadata.broker.list" -> kafkaBrokerList
      )
      val topic = conf.getKinesisSubcriptionTopic()
        .getOrElse(throw new RuntimeException("Cannot get KinesisSubcriptionTopic from ZK"))

      val kafkaOffsetManager = new KafkaOffsetsManager(Curator.getInstance())
      kafkaOffsetManager.init().badMap { error =>
        throw new RuntimeException(s"Error while initiating KafkaOffsetManager: $error")
      }
      // Resume from the offsets previously committed in ZK if they can be read,
      // otherwise fall back to the latest offsets of the topic.
      val directKafkaStream = kafkaOffsetManager.getKafkaOffsets() match {
        case Good(offsets) =>
          val offsetMap = offsets.map { case (partition, offset) =>
            TopicAndPartition(topic, partition) -> offset
          }
          log.info(s"Starting from following partition -> offset map: $offsetMap")
          KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
            ssc,
            kafkaParams,
            offsetMap,
            (msg: MessageAndMetadata[String, String]) => msg.key() -> msg.message()
          )
        case Bad(One(error)) =>
          log.warn(s"Error while trying to get initial Kafka offsets: $error. Falling back to consuming from the latest Kafka offsets.")
          KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
            ssc,
            kafkaParams,
            Set(topic)
          )
      }
      // Capture the offset ranges of each batch on the driver so they can be
      // committed to ZK after the batch has been processed (at-least-once semantics).
      var offsetRanges = Array[OffsetRange]()

      directKafkaStream.transform { rdd =>
        offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        rdd
      }
      // do your job here...
      .foreachRDD { rdd =>
        // committing offsets
        val offsetMap = offsetRanges.map(offsetRange => offsetRange.partition -> offsetRange.untilOffset).toMap
        log.debug(s"Committing Kafka offsets $offsetMap")
        kafkaOffsetManager.commitKafkaOffsets(offsetMap) match {
          case Good(_) => log.debug(s"Successful commit of Kafka offsets $offsetMap")
          case Bad(error) => log.error(s"Error while committing Kafka offsets: $error")
        }
      }

      sys.addShutdownHook(shutdown(ssc))

      ssc.start()
      ssc.awaitTermination()
    } catch {
      case failure: Throwable =>
        log.error("Fatal unexpected error/failure in driver main", failure)
        throw failure
    }
  }

  def makeSparkConf(maybeMaxRatePerPartition: Option[Int]): SparkConf = {
    val conf = new SparkConf()
      .setAppName("Iris")
      .set("spark.streaming.backpressure.enabled", "true")
    maybeMaxRatePerPartition.map { maxRate =>
      conf.set("spark.streaming.kafka.maxRatePerPartition", maxRate.toString)
    }.getOrElse(conf)
  }

  def shutdown(ssc: StreamingContext): Unit = {
    log.info("Going to gracefully shutdown job...")
    ssc.stop(stopSparkContext = true, stopGracefully = true)
  }
}
class KafkaOffsetsManager(zkClient: CuratorFramework) {
  import KafkaOffset._

  val zkPath = "/SAMI/iris/kafka/topic-subscription-offsets"

  // NB: retries are handled at the Curator level

  // Create the ZK node holding the offsets if it does not exist yet.
  def init(): Unit Or One[ZKError] = {
    try {
      zkClient.create()
        .creatingParentsIfNeeded()
        .forPath(zkPath)
      Good(())
    } catch {
      case _: NodeExistsException => Good(())
      case NonFatal(err) => Bad(One(ZKError(err)))
    }
  }

  def commitKafkaOffsets(offsets: PartitionsOffset): Unit Or One[ZKError] = {
    try {
      zkClient.setData().forPath(zkPath, serialize(offsets))
      Good(())
    } catch {
      case NonFatal(err) => Bad(One(ZKError(err)))
    }
  }

  def getKafkaOffsets(): PartitionsOffset Or One[IrisError] = {
    try {
      val bytes = zkClient.getData().forPath(zkPath)
      deserialize(bytes)
    } catch {
      case NonFatal(err) => Bad(One(ZKError(err)))
    }
  }
}
object KafkaOffset {
  import play.api.libs.json._

  // Offsets are stored in ZK as a JSON object mapping partition number to offset,
  // e.g. {"0":42,"1":1337}. JSON object keys must be strings, hence the Int <-> String conversions.
  type PartitionsOffset = Map[Int, Long]

  implicit val reads: Reads[PartitionsOffset] =
    JsPath.read[Map[String, Long]].map(offsets => offsets.map(kv => kv._1.toInt -> kv._2))

  implicit val writes: Writes[PartitionsOffset] =
    Writes(offsets => Json.toJson(offsets.map(kv => kv._1.toString -> kv._2)))

  def serialize(offsets: PartitionsOffset): Array[Byte] = {
    Json.toJson(offsets).toString.getBytes("UTF-8")
  }

  def deserialize(bytes: Array[Byte]): PartitionsOffset Or One[DeserializationError] = {
    ErrorHelper.parseJsonBytes[PartitionsOffset](bytes)
  }
}
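Since the committed offsets live at a fixed ZooKeeper path, they can be inspected or reset by hand. For example, from zkCli.sh the stored payload can be read directly; the values below are purely illustrative:

get /SAMI/iris/kafka/topic-subscription-offsets
{"0":42,"1":1337}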