@charmby
Forked from atamborrino/main.scala
Created April 25, 2017 14:15
Spark Streaming Kafka at-least-once with manual offset commit in Zookeeper (i.e. not using Spark Streaming checkpoints, which may not be recoverable after code changes)
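The code targets the old direct-stream API from spark-streaming-kafka (Kafka 0.8) on Spark 1.x, plus Curator for ZooKeeper access, Scalactic for the Or/Good/Bad error handling, and Play JSON for serializing offsets. A build.sbt along these lines should be enough to compile it; the versions below are assumptions, and com.samsung.sami.common (Config, Curator, the error types and ErrorHelper) is the application's own library, not a public artifact.

// Hypothetical build.sbt sketch -- versions are assumptions, align them with your cluster.
libraryDependencies ++= Seq(
  "org.apache.spark"   %% "spark-streaming"       % "1.6.3" % "provided",
  "org.apache.spark"   %% "spark-streaming-kafka" % "1.6.3",
  "org.apache.curator"  % "curator-framework"     % "2.11.1",
  "org.scalactic"      %% "scalactic"             % "2.2.6",
  "com.typesafe.play"  %% "play-json"             % "2.4.8"
)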
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder

// Config, ZKError, IrisError, DeserializationError and ErrorHelper referenced below
// come from the application's own common library (same place as Curator).
import com.samsung.sami.common.Curator
import org.apache.curator.framework.CuratorFramework
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.{Logging, SparkConf}
import org.apache.zookeeper.KeeperException.NodeExistsException
import org.scalactic.{Bad, Good, One, Or}

import scala.util.control.NonFatal
object App extends Logging {

  def main(args: Array[String]): Unit = {
    try {
      log.info("Init Iris job")
      Curator.initialize()
      val conf = new Config(Curator.getInstance())

      val sparkConf = makeSparkConf(conf.getMaxRatePerPartition())
      val ssc = new StreamingContext(sparkConf, Seconds(conf.getSparkBatchIntervalInSecond()))

      val kafkaBrokerList = conf.getKafkaBrokerList()
        .getOrElse(throw new RuntimeException("Cannot get KafkaBrokerList from ZK"))
      val kafkaParams = Map(
        "metadata.broker.list" -> kafkaBrokerList
      )
      val topic = conf.getKinesisSubcriptionTopic()
        .getOrElse(throw new RuntimeException("Cannot get KinesisSubcriptionTopic from ZK"))

      val kafkaOffsetManager = new KafkaOffsetsManager(Curator.getInstance())
      kafkaOffsetManager.init().badMap { error =>
        throw new RuntimeException(s"Error while initiating KafkaOffsetManager: $error")
      }
      // Resume from the offsets previously committed in ZK if they can be read,
      // otherwise fall back to the latest offsets of the topic.
      val directKafkaStream = kafkaOffsetManager.getKafkaOffsets() match {
        case Good(offsets) =>
          val offsetMap = offsets.map { case (partition, offset) =>
            TopicAndPartition(topic, partition) -> offset
          }
          log.info(s"Starting from following partition -> offset map: $offsetMap")
          KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
            ssc,
            kafkaParams,
            offsetMap,
            (msg: MessageAndMetadata[String, String]) => msg.key() -> msg.message()
          )
        case Bad(One(error)) =>
          log.warn(s"Error while trying to get initial Kafka offsets: $error. Falling back to consuming from the latest Kafka offsets.")
          KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
            ssc,
            kafkaParams,
            Set(topic)
          )
      }
      // Capture the offset ranges of each batch on the driver so they can be
      // committed to ZK after the batch has been processed (at-least-once semantics).
      var offsetRanges = Array[OffsetRange]()

      directKafkaStream.transform { rdd =>
        offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        rdd
      }
      // do your job here...
      .foreachRDD { rdd =>
        // committing offsets
        val offsetMap = offsetRanges.map(offsetRange => offsetRange.partition -> offsetRange.untilOffset).toMap
        log.debug(s"Committing Kafka offsets $offsetMap")
        kafkaOffsetManager.commitKafkaOffsets(offsetMap) match {
          case Good(_) => log.debug(s"Successful commit of Kafka offsets $offsetMap")
          case Bad(error) => log.error(s"Error while committing Kafka offsets: $error")
        }
      }

      sys.addShutdownHook(shutdown(ssc))

      ssc.start()
      ssc.awaitTermination()
    } catch {
      case failure: Throwable =>
        log.error("Fatal unexpected error/failure in driver main", failure)
        throw failure
    }
  }

  def makeSparkConf(maybeMaxRatePerPartition: Option[Int]): SparkConf = {
    val conf = new SparkConf()
      .setAppName("Iris")
      .set("spark.streaming.backpressure.enabled", "true")
    maybeMaxRatePerPartition.map { maxRate =>
      conf.set("spark.streaming.kafka.maxRatePerPartition", maxRate.toString)
    }.getOrElse(conf)
  }

  def shutdown(ssc: StreamingContext): Unit = {
    log.info("Going to gracefully shutdown job...")
    ssc.stop(stopSparkContext = true, stopGracefully = true)
  }
}
class KafkaOffsetsManager(zkClient: CuratorFramework) {
  import KafkaOffset._

  val zkPath = "/SAMI/iris/kafka/topic-subscription-offsets"

  // NB: retries are handled at the Curator level

  // Create the ZK node holding the offsets if it does not exist yet.
  def init(): Unit Or One[ZKError] = {
    try {
      zkClient.create()
        .creatingParentsIfNeeded()
        .forPath(zkPath)
      Good(())
    } catch {
      case _: NodeExistsException => Good(())
      case NonFatal(err) => Bad(One(ZKError(err)))
    }
  }

  def commitKafkaOffsets(offsets: PartitionsOffset): Unit Or One[ZKError] = {
    try {
      zkClient.setData().forPath(zkPath, serialize(offsets))
      Good(())
    } catch {
      case NonFatal(err) => Bad(One(ZKError(err)))
    }
  }

  def getKafkaOffsets(): PartitionsOffset Or One[IrisError] = {
    try {
      val bytes = zkClient.getData().forPath(zkPath)
      deserialize(bytes)
    } catch {
      case NonFatal(err) => Bad(One(ZKError(err)))
    }
  }
}
object KafkaOffset {
  import play.api.libs.json._

  // Offsets are stored in ZK as a JSON object mapping partition number to offset,
  // e.g. {"0":42,"1":1337}. JSON object keys must be strings, hence the Int <-> String conversions.
  type PartitionsOffset = Map[Int, Long]

  implicit val reads: Reads[PartitionsOffset] =
    JsPath.read[Map[String, Long]].map(offsets => offsets.map(kv => kv._1.toInt -> kv._2))

  implicit val writes: Writes[PartitionsOffset] =
    Writes(offsets => Json.toJson(offsets.map(kv => kv._1.toString -> kv._2)))

  def serialize(offsets: PartitionsOffset): Array[Byte] = {
    Json.toJson(offsets).toString.getBytes("UTF-8")
  }

  def deserialize(bytes: Array[Byte]): PartitionsOffset Or One[DeserializationError] = {
    ErrorHelper.parseJsonBytes[PartitionsOffset](bytes)
  }
}
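Since the committed offsets live at a fixed ZooKeeper path, they can be inspected or reset by hand. For example, from zkCli.sh the stored payload can be read directly; the values below are purely illustrative:

get /SAMI/iris/kafka/topic-subscription-offsets
{"0":42,"1":1337}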