Skip to content

Instantly share code, notes, and snippets.

@liancheng
Last active January 8, 2018 06:45
Show Gist options
  • Save liancheng/19eac168295a907fbfd5fcb517c09580 to your computer and use it in GitHub Desktop.
Save liancheng/19eac168295a907fbfd5fcb517c09580 to your computer and use it in GitHub Desktop.
Simple Scala DSL for constructing Apache Arrow schemas.
package example
import scala.collection.JavaConverters._
import scala.language.implicitConversions
import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema}
/**
 * Base trait for the DSL's field builders: it keeps a nullability flag and leaves the
 * construction of the actual Arrow [[Field]] to implementations.
 *
 * The flag starts out as `true` (nullable). `!` and `?` change it in place and return the
 * same builder, so the call can be chained.
 */
trait FieldBuilder {
  // Updated in place by `!` and `?`; read by implementations through `isNullable`.
  private var nullable: Boolean = true

  /** Builds the [[Field]] described by this builder, giving it the supplied name. */
  def named(name: String): Field

  /** Marks this builder as non-nullable and returns this builder. */
  def ! : this.type = nullability(false)

  /** Marks this builder as nullable and returns this builder. */
  def ? : this.type = nullability(true)

  /** Current value of the nullability flag. */
  protected def isNullable: Boolean = nullable

  // Shared setter behind `!` and `?`.
  private def nullability(value: Boolean): this.type = {
    nullable = value
    this
  }
}
/**
 * Implicit wrappers and builder classes that make up the DSL. The `dsl` object extends this
 * trait, so importing `dsl._` brings all of them into scope.
 */
trait LowPriorityImplicits {

  /** Adds `~` to `String`: `"name" ~ builder` asks the builder for a field with that name. */
  implicit class FieldDSL(name: String) {
    def ~ (builder: FieldBuilder): Field = builder named name
  }

  /**
   * Wraps an `ArrowType.PrimitiveType` as a [[FieldBuilder]], which is what lets `!` and `?`
   * be called directly on such a type. The resulting field is created with `null` children.
   */
  implicit class PrimitiveFieldBuilder(arrowType: ArrowType.PrimitiveType) extends FieldBuilder {
    override def named(name: String): Field =
      new Field(name, new FieldType(isNullable, arrowType, null), null)
  }

  /**
   * Builder for a list field. Its only child is the field produced by `elementTypeBuilder`,
   * which is built with a `null` name.
   */
  class ListFieldBuilder(elementTypeBuilder: FieldBuilder) extends FieldBuilder {
    override def named(name: String): Field = {
      val listType = new FieldType(isNullable, ArrowType.List.INSTANCE, null)
      val children = Seq(elementTypeBuilder.named(null))
      new Field(name, listType, children.asJava)
    }
  }

  /** Builder for a struct field whose children are the already-built `fields`. */
  class StructFieldBuilder(fields: Seq[Field]) extends FieldBuilder {
    override def named(name: String): Field =
      new Field(name, new FieldType(isNullable, ArrowType.Struct.INSTANCE, null), fields.asJava)
  }
}
/**
 * Entry point of the DSL: `import example.dsl._` exposes the factory methods below together
 * with everything inherited from [[LowPriorityImplicits]].
 */
object dsl extends LowPriorityImplicits {

  /** Creates a `Schema` from the given top-level fields. */
  def ArrowSchema(fields: Field*): Schema = {
    new Schema(fields.asJava)
  }

  /** Creates a struct builder whose children are the given fields. */
  def ArrowStruct(fields: Field*): StructFieldBuilder = {
    new StructFieldBuilder(fields)
  }

  /** Creates a list builder whose element field is produced by `elementType`. */
  def ArrowList(elementType: FieldBuilder): ListFieldBuilder = {
    new ListFieldBuilder(elementType)
  }
}
import scala.collection.JavaConverters._
import example.dsl._
import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema}
// Here we are using the address book example schema used in the Twitter Parquet blog post.
// https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet.html
// Using the Scala DSL
// How the DSL expressions below resolve (all names come from `import example.dsl._`):
//   * `ArrowType.Utf8.INSTANCE.!` -- `!` and `?` are FieldBuilder methods, so the compiler
//     wraps the Arrow type in the implicit class PrimitiveFieldBuilder to call them.
//   * `"name" ~ builder` -- the String is wrapped in the implicit class FieldDSL, whose `~`
//     calls `builder.named("name")` and yields a Field.
//   * `.!` / `.?` are ordinary method selections, so they apply to the builder on their left
//     before the infix `~` names the field.
ArrowSchema(
// A non-nullable string field.
"owner" ~ ArrowType.Utf8.INSTANCE.!,
// A non-nullable list field whose elements are non-nullable strings.
// The inner `.!` applies to the element builder, the outer `.!` to the list builder.
"ownerPhoneNumbers" ~ ArrowList(ArrowType.Utf8.INSTANCE.!).!,
// A non-nullable list field whose elements are non-nullable structs.
"contacts" ~ ArrowList(
// A non-nullable struct type.
ArrowStruct(
// A non-nullable string field.
"name" ~ ArrowType.Utf8.INSTANCE.!,
// A nullable string field.
"phoneNumber" ~ ArrowType.Utf8.INSTANCE.?
// This `.!` marks the struct builder itself as non-nullable.
).!
// This `.!` marks the enclosing list builder as non-nullable.
).!
)
// Using the Apache Arrow Java API in Scala
// Hand-written counterpart of the DSL expression above, spelled out with the Arrow classes
// directly. Each field is created as `new Field(name, fieldType, children)`; `null` is passed
// for the children of leaf fields and as the third FieldType argument, and the Scala
// sequences of child fields are converted with `.asJava`.
new Schema(
Seq(
// A non-nullable string field.
new Field("owner", new FieldType(false, ArrowType.Utf8.INSTANCE, null), null),
// A non-nullable list field whose elements are non-nullable strings.
new Field(
"ownerPhoneNumbers",
new FieldType(false, ArrowType.List.INSTANCE, null),
// The single child is the list's element field; it is given a null name.
Seq(new Field(null, new FieldType(false, ArrowType.Utf8.INSTANCE, null), null)).asJava
),
// A non-nullable list field whose elements are non-nullable structs.
new Field(
"contacts",
new FieldType(false, ArrowType.List.INSTANCE, null),
Seq(
// A non-nullable struct
new Field(
// The element field of the list is given a null name.
null,
new FieldType(false, ArrowType.Struct.INSTANCE, null),
Seq(
// A non-nullable string field.
new Field("name", new FieldType(false, ArrowType.Utf8.INSTANCE, null), null),
// A nullable string field.
// NOTE(review): FieldType.nullable(...) is presumably equivalent to
// new FieldType(true, ..., null) -- confirm against the Arrow API docs.
new Field("phoneNumber", FieldType.nullable(ArrowType.Utf8.INSTANCE), null)
).asJava
)
).asJava
)
).asJava
)
@liancheng
Copy link
Author

Field type metadata is not covered yet but should be easy to add.

@advancedxy
Copy link

Nice writeup.

But I think more comments explaining how the `ArrowType -> FieldBuilder` and `FieldDSL` implicits take effect should be added.

It took me a while to figure it out.

@liancheng
Copy link
Author

@advancedxy Refactored it a little bit and removed the `ArrowType->FieldBuilder` implicit conversion. Hopefully, it's easier to reason about this time.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment