import TensorFlow

// MARK: - Protocols

/// Element of an abstract vector space.
protocol Vector {
  static func * (lhs: Float, rhs: Self) -> Self
  static func + (lhs: Self, rhs: Self) -> Self
  static var zero: Self { get }
}

/// An environment containing all the types and linear operations required for defining
/// derivatives.
protocol Derivative {
  /// The derivative of a `Float`-valued function.
  associatedtype DFloat: Vector

  /// The derivative of a `(Float, Float)`-valued function.
  associatedtype DFloat2: Vector

  /// Returns the derivative of `f(_).0`, given the derivative `t` of `f(_)`.
  static func d0(_ t: DFloat2) -> DFloat

  /// Returns the derivative of `f(_).1`, given the derivative `t` of `f(_)`.
  static func d1(_ t: DFloat2) -> DFloat

  /// Returns the derivative of `(f0(_), f1(_))` given the derivative `t0` of `f0` and the
  /// derivative `t1` of `f1`.
  static func dInit(_ t0: DFloat, _ t1: DFloat) -> DFloat2
}

// It is unfortunate that I have to define `associatedtype D*` for all types that I want to use
// in differentiation.

// MARK: - Derivative functions

// These look almost exactly like JVPs.

// It should be easy to automatically produce them with a forward mode autodiff transform.

// The exciting thing is that we can evaluate them in any differentiation mode we want, by
// defining different `Derivative`s.

extension Derivative {
  static func sum(_ x: Float, _ y: Float, _ tIn: DFloat2) -> (value: Float, tOut: DFloat) {
    (x + y, d0(tIn) + d1(tIn))
  }

  static func product(_ x: Float, _ y: Float, _ tIn: DFloat2) -> (value: Float, tOut: DFloat) {
    (x * y, y * d0(tIn) + x * d1(tIn))
  }

  static func square(_ x: Float, _ tIn: DFloat) -> (value: Float, tOut: DFloat) {
    product(x, x, dInit(tIn, tIn))
  }

  /// x^2 + 3x
  static func example1(_ x: Float, _ tIn: DFloat) -> (value: Float, tOut: DFloat) {
    let (v1, t1) = square(x, tIn)
    let (v2, t2) = product(3, x, dInit(DFloat.zero, tIn))
    return sum(v1, v2, dInit(t1, t2))
  }

  /// x^2 + 10y^2 + xy
  static func example2(_ x: Float, _ y: Float, _ tIn: DFloat2) -> (value: Float, tOut: DFloat) {
    let dx = d0(tIn)
    let dy = d1(tIn)
    let (xSq, dxSq) = square(x, dx)
    let (ySq, dySq) = square(y, dy)
    let (ySq10, dySq10) = product(10, ySq, dInit(DFloat.zero, dySq))
    let (xy, dxy) = product(x, y, dInit(dx, dy))
    let (r1, dr1) = sum(xSq, ySq10, dInit(dxSq, dySq10))
    return sum(r1, xy, dInit(dr1, dxy))
  }
}

// MARK: - Derivative modes

/// Forward mode: a tangent is just the scalar directional derivative along a single seed.
enum ForwardTangent: Derivative {
  struct DFloat: Vector {
    var t: Float
    init(_ t: Float) { self.t = t }
    static func * (lhs: Float, rhs: Self) -> Self { .init(lhs * rhs.t) }
    static func + (lhs: Self, rhs: Self) -> Self { .init(lhs.t + rhs.t) }
    static var zero: Self { .init(0) }
  }

  struct DFloat2: Vector {
    var t0: Float
    var t1: Float
    init(_ t0: Float, _ t1: Float) { self.t0 = t0; self.t1 = t1 }
    static func * (lhs: Float, rhs: Self) -> Self { .init(lhs * rhs.t0, lhs * rhs.t1) }
    static func + (lhs: Self, rhs: Self) -> Self { .init(lhs.t0 + rhs.t0, lhs.t1 + rhs.t1) }
    static var zero: Self { .init(0, 0) }
  }

  static func d0(_ t: DFloat2) -> DFloat { .init(t.t0) }
  static func d1(_ t: DFloat2) -> DFloat { .init(t.t1) }
  static func dInit(_ t0: DFloat, _ t1: DFloat) -> DFloat2 { .init(t0.t, t1.t) }
}

/// Forward mode with tensor-valued tangents: each scalar's tangent is a `[1, n]` row of
/// partial derivatives, so seeding the inputs with an identity matrix computes a full
/// Jacobian in one pass.
enum ForwardJacobian: Derivative {
  struct DFloat: Vector {
    // Shape [1, n], or the zero scalar.
    var t: Tensor<Float>
    init(_ t: Tensor<Float>) {
      precondition(t == Tensor(0) || (t.rank == 2 && t.shape[0] == 1))
      self.t = t
    }
    static func * (lhs: Float, rhs: Self) -> Self { .init(lhs * rhs.t) }
    static func + (lhs: Self, rhs: Self) -> Self { .init(lhs.t + rhs.t) }
    static var zero: Self { .init(Tensor(0)) }
  }

  struct DFloat2: Vector {
    // Shape [2, n], or the zero scalar.
    var t: Tensor<Float>
    init(_ t: Tensor<Float>) {
      precondition(t == Tensor(0) || (t.rank == 2 && t.shape[0] == 2))
      self.t = t
    }
    static func * (lhs: Float, rhs: Self) -> Self { .init(lhs * rhs.t) }
    static func + (lhs: Self, rhs: Self) -> Self { .init(lhs.t + rhs.t) }
    static var zero: Self { .init(Tensor(0)) }
  }

  static func d0(_ t: DFloat2) -> DFloat {
    .init(t.t[0].expandingShape(at: 0))
  }
  static func d1(_ t: DFloat2) -> DFloat {
    .init(t.t[1].expandingShape(at: 0))
  }
  static func dInit(_ t0: DFloat, _ t1: DFloat) -> DFloat2 {
    if t0.t == Tensor(0) && t1.t == Tensor(0) { return .init(Tensor(0)) }
    let row0 = t0.t == Tensor(0) ? Tensor(zerosLike: t1.t) : t0.t
    let row1 = t1.t == Tensor(0) ? Tensor(zerosLike: t0.t) : t1.t
    return .init(Tensor(concatenating: [row0, row1]))
  }
}

/// Reverse mode over any base mode: a derivative is represented as a pullback closure, i.e.
/// the transpose of the linear map, taking a derivative of the output back to a derivative
/// of `In`.
enum Reverse<Base: Derivative, In: Vector>: Derivative {
  struct Pullback<Out: Vector>: Vector {
    var f: (Out) -> In
    init(_ f: @escaping (Out) -> In) { self.f = f }
    static func * (lhs: Float, rhs: Self) -> Self {
      .init({ rhs.f(lhs * $0) })
    }
    static func + (lhs: Self, rhs: Self) -> Self {
      .init({ lhs.f($0) + rhs.f($0) })
    }
    static var zero: Self { .init({ _ in .zero }) }
  }

  typealias DFloat = Pullback<Base.DFloat>
  typealias DFloat2 = Pullback<Base.DFloat2>

  static func d0(_ t: DFloat2) -> DFloat {
    .init({ t.f(Base.dInit($0, .zero)) })
  }
  static func d1(_ t: DFloat2) -> DFloat {
    .init({ t.f(Base.dInit(.zero, $0)) })
  }
  static func dInit(_ t0: DFloat, _ t1: DFloat) -> DFloat2 {
    .init({ t0.f(Base.d0($0)) + t1.f(Base.d1($0)) })
  }
}

// MARK: - Example invocations

// Optimizes to plain float operations.
func example1ForwardDerivative(_ x: Float) -> Float {
  let (_, d) = ForwardTangent.example1(x, .init(1))
  return d.t
}

// Optimizes to:
// - plain float operations that produce the result
// - a bunch of closure allocations and deallocations that are not used in the result (!?)
func example1ReverseDerivative(_ x: Float) -> Float {
  typealias System = Reverse<ForwardTangent, ForwardTangent.DFloat>
  let (_, d) = System.example1(x, .init({ $0 }))
  return d.f(.init(1)).t
}

func example2ReverseDerivative(_ x: Float, _ y: Float) -> ForwardTangent.DFloat2 {
  typealias System = Reverse<ForwardTangent, ForwardTangent.DFloat2>
  let (_, d) = System.example2(x, y, .init({ $0 }))
  return d.f(.init(1))
}

func example2ForwardJacobian(_ x: Float, _ y: Float) -> Tensor<Float> {
  let (_, d) = ForwardJacobian.example2(x, y, .init(eye(rowCount: 2)))
  return d.t
}

func example2ReverseJacobian(_ x: Float, _ y: Float) -> Tensor<Float> {
  typealias System = Reverse<ForwardJacobian, ForwardJacobian.DFloat2>
  let (_, d) = System.example2(x, y, .init({ $0 }))
  return d.f(.init(eye(rowCount: 1))).t
}

print(example1ForwardDerivative(2))
print(example1ReverseDerivative(2))
print(example2ReverseDerivative(1, 5))
print(example2ForwardJacobian(1, 5))
print(example2ReverseJacobian(1, 5))
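
// Expected values, worked by hand: example1 is f(x) = x^2 + 3x, so f'(x) = 2x + 3 and
// f'(2) = 7. example2 is f(x, y) = x^2 + 10y^2 + xy, so the gradient is (2x + y, 20y + x),
// which at (1, 5) is (7, 101); the Jacobian is that gradient as a single row.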

I don't remember why I am reading this. I just remember encountering it and adding it to my "To Read" list.
Do excuse me if I am commenting unwelcomely. I am almost certainly going to say things you already know.
This is very neat. I haven't seen this overall approach before, but I am not that well-versed.
I have seen the approach of computing Jacobians via propagating a matrix before, e.g. in ForwardDiff.jl; it is sometimes called "chunked mode", though I don't know if that is a standard name.
It's legit and should be faster, because a matmul is faster than a loop of matrix-vector multiplications (and once things are big enough, Strassen's algorithm applies).
It further improves cache locality, and makes redundant work more obvious to the compiler for common subexpression elimination, etc.
It pairs nicely with work on sparsity, which computes the Jacobian on a compressed basis (though you can do that via looping also).
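
To make that concrete with the code above (nothing new here, just re-seeding your own `ForwardJacobian`; the `Tensor(shape:scalars:)` unit seeds are my addition): the 2x2 identity seed propagates both Jacobian columns in one pass, while [2, 1] unit seeds recover them one column at a time, which is the usual JVP loop that chunked mode batches.

// Whole 1x2 Jacobian in one chunked pass.
let chunked = ForwardJacobian.example2(1, 5, .init(eye(rowCount: 2))).tOut.t
// Column-at-a-time (plain JVP) mode: seed one unit direction per pass.
let dfdx = ForwardJacobian.example2(1, 5, .init(Tensor(shape: [2, 1], scalars: [1, 0]))).tOut.t
let dfdy = ForwardJacobian.example2(1, 5, .init(Tensor(shape: [2, 1], scalars: [0, 1]))).tOut.t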
In general it makes sense, because one is constructing a functional form of the Jacobian matrix. Basis transforms are applied to matrices via matrix-matrix products, and the pushforward function is the functional form of that linear operator: it can be written as `pushforward(x) = J * x`, which is identical code regardless of whether `x` is a matrix or a vector.
Could this do higher-order derivatives? I think so: it seems you just need `zero`, `+`, and `*` defined, and those are well defined on polynomials.
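Concretely: pushing a single seed column `e_i` through the pushforward gives one column `J * e_i` (one partial derivative), while pushing the stacked basis, i.e. the identity, gives `J * I = J` all at once, which is exactly what seeding with `eye(rowCount: 2)` does in `example2ForwardJacobian` above.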
Full details of that are in chapter 13 of Andreas Griewank and Andrea Walther, Evaluating Derivatives: Principles and Techniques of Algorithmic Differentiation (2nd ed., 2008), but the actual operations are not obscure: addition is termwise addition of coefficients, and multiplication is convolution of coefficients (or just multiply the polynomials by hand and throw away the higher-order terms).
See e.g. TaylorSeries.jl, a Julia package that does higher-order AD like this.
So one can define a `HigherOrderTangent` that does that, and it looks like that would just work with your code? Something like the sketch below.
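
A minimal sketch of that tangent type against the `Vector` protocol above (the coefficient representation and the `convolve` helper are my inventions, so treat this as illustrative; note also that the derivative functions in the gist only ever scale tangents by the primal `Float` value, so genuinely higher-order propagation would additionally need the primal to carry its own coefficients):

// Sketch: a truncated Taylor polynomial in one perturbation, as a `Vector`.
// `coefficients[k]` holds the coefficient of eps^k.
struct HigherOrderTangent: Vector {
  var coefficients: [Float]
  init(_ coefficients: [Float]) { self.coefficients = coefficients }

  // Scaling by a Float scales every coefficient.
  static func * (lhs: Float, rhs: Self) -> Self {
    .init(rhs.coefficients.map { lhs * $0 })
  }

  // Addition is termwise addition of coefficients, padding the shorter with zeros.
  static func + (lhs: Self, rhs: Self) -> Self {
    var result = [Float](repeating: 0, count: max(lhs.coefficients.count, rhs.coefficients.count))
    for (i, c) in lhs.coefficients.enumerated() { result[i] += c }
    for (i, c) in rhs.coefficients.enumerated() { result[i] += c }
    return .init(result)
  }

  static var zero: Self { .init([]) }

  // The polynomial product: convolution of coefficients, truncated at `order`.
  // This is the extra operation beyond what `Vector` requires.
  static func convolve(_ lhs: Self, _ rhs: Self, order: Int) -> Self {
    var result = [Float](repeating: 0, count: order + 1)
    for (i, a) in lhs.coefficients.enumerated() where i <= order {
      for (j, b) in rhs.coefficients.enumerated() where i + j <= order {
        result[i + j] += a * b
      }
    }
    return .init(result)
  }
}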
Everything gets fiddly, though, when extending this to work not on scalars but on vectors or structs.