Skip to content

Instantly share code, notes, and snippets.

@snowzurfer
Last active July 12, 2024 07:39
Show Gist options
  • Save snowzurfer/1e90678d0d23d3295dda9a0cc93b2453 to your computer and use it in GitHub Desktop.
Save snowzurfer/1e90678d0d23d3295dda9a0cc93b2453 to your computer and use it in GitHub Desktop.
3D world points from ARKit depth
import ARKit
import SceneKit
/// Number of sample columns taken across the depth map (evaluates to 128).
let horizontalPoints = 256 / 2
/// Number of sample rows taken down the depth map (evaluates to 96).
let verticalPoints = 192 / 2
/// One debug marker node per sampled depth point; populated by `setup()`.
var depthNodes: [SCNNode] = []
/// Container node that parents every debug marker.
var parentDebugNodes = SCNNode()
/// The AR view; it is dereferenced by `setup()`, so it must be assigned before that runs.
var sceneView: ARSCNView!
// Somewhere during setup
/// Starts world tracking with smoothed scene depth and builds the pool of debug marker nodes.
///
/// Does nothing when the device cannot deliver `.smoothedSceneDepth`, because
/// configuring an unsupported frame semantic raises an exception.
func setup() {
    guard ARWorldTrackingConfiguration.supportsFrameSemantics(.smoothedSceneDepth) else {
        return
    }

    let configuration = ARWorldTrackingConfiguration()
    configuration.frameSemantics = .smoothedSceneDepth
    sceneView.session.run(configuration)
    sceneView.scene.rootNode.addChildNode(parentDebugNodes)

    // Edge length of each marker cube, in scene units.
    let sizeGeomPredictions: CGFloat = 0.005
    // A single geometry (and therefore a single material) is shared by every marker.
    let geom = SCNBox(width: sizeGeomPredictions,
                      height: sizeGeomPredictions,
                      length: sizeGeomPredictions,
                      chamferRadius: 0)
    geom.firstMaterial?.diffuse.contents = UIColor.green

    // One marker per sampled depth point; `session(_:didUpdate:)` indexes this array.
    let markerCount = horizontalPoints * verticalPoints
    depthNodes.reserveCapacity(depthNodes.count + markerCount)
    for _ in 0..<markerCount {
        let node = SCNNode(geometry: geom)
        parentDebugNodes.addChildNode(node)
        depthNodes.append(node)
    }
}
/// Per-frame session callback: samples the smoothed depth map on a regular grid and moves
/// each debug marker to the world-space position of its sample.
///
/// Returns without touching the markers when the frame carries no smoothed depth or the
/// depth buffer cannot be locked for reading.
func session(_ session: ARSession, didUpdate frame: ARFrame) {
    guard let smoothedDepth = frame.smoothedSceneDepth?.depthMap else {
        return
    }
    let capturedImage = frame.capturedImage

    let lockFlags = CVPixelBufferLockFlags.readOnly
    guard CVPixelBufferLockBaseAddress(smoothedDepth, lockFlags) == kCVReturnSuccess else {
        return
    }
    defer {
        CVPixelBufferUnlockBaseAddress(smoothedDepth, lockFlags)
    }

    // The per-plane accessor is documented to return NULL for non-planar buffers,
    // so pick the accessor that matches the buffer's layout instead of force-unwrapping.
    let rawBaseAddress = CVPixelBufferIsPlanar(smoothedDepth)
        ? CVPixelBufferGetBaseAddressOfPlane(smoothedDepth, 0)
        : CVPixelBufferGetBaseAddress(smoothedDepth)
    guard let baseAddress = rawBaseAddress else {
        return
    }
    let depthByteBuffer = baseAddress.assumingMemoryBound(to: Float32.self)

    // The `.size` accessor simply reads the CVPixelBuffer's width and height in pixels.
    //
    // Both buffers have the same aspect ratio (e.g. 192 / 256 = 1440 / 1920 = 0.75),
    // so a single scale factor per axis maps one onto the other.
    let depthMapSize = smoothedDepth.size
    let capturedImageSize = capturedImage.size

    var cameraIntrinsics = frame.camera.intrinsics
    let depthResolution = simd_float2(x: Float(depthMapSize.x), y: Float(depthMapSize.y))
    let scaleRes = simd_float2(x: Float(capturedImageSize.x) / depthResolution.x,
                               y: Float(capturedImageSize.y) / depthResolution.y)
    // Make the camera intrinsics be with respect to the depth map:
    // focal lengths ([0][0], [1][1]) and principal point ([2][0], [2][1]) scale with resolution.
    cameraIntrinsics[0][0] /= scaleRes.x
    cameraIntrinsics[1][1] /= scaleRes.y
    cameraIntrinsics[2][0] /= scaleRes.x
    cameraIntrinsics[2][1] /= scaleRes.y

    // This will be the long size, because of the rotation
    let horizontalStep = Float(depthMapSize.x) / Float(horizontalPoints)
    let halfHorizontalStep = horizontalStep / 2
    // This will be the short size, because of the rotation
    let verticalStep = Float(depthMapSize.y) / Float(verticalPoints)
    let halfVerticalStep = verticalStep / 2

    // This is crucial: you need to always use the view matrix for Landscape Right.
    // It is constant for the whole frame, so invert it once rather than once per sample.
    let viewMatrixInverted = frame.camera.viewMatrix(for: .landscapeRight).inverse

    for h in 0..<horizontalPoints {
        for v in 0..<verticalPoints {
            // Sample at the centre of each grid cell.
            let x = Float(h) * horizontalStep + halfHorizontalStep
            let y = Float(v) * verticalStep + halfVerticalStep
            let depthMapPoint = simd_float2(x, y)

            // Sample depth
            // NOTE(review): sampleDepthRaw uses the pixel width as the row length; confirm the
            // depth map has no row padding (compare with CVPixelBufferGetBytesPerRow).
            let metricDepth = sampleDepthRaw(depthByteBuffer, size: depthMapSize, at: .init(depthMapPoint))

            let wp = worldPoint(depthMapPixelPoint: depthMapPoint,
                                depth: metricDepth,
                                cameraIntrinsics: cameraIntrinsics,
                                viewMatrixInverted: viewMatrixInverted)

            let node = depthNodes[v * horizontalPoints + h]
            node.simdWorldPosition = wp
        }
    }
}
/// Reads one value from a row-major `Float32` buffer.
///
/// - Parameters:
///   - pointer: Start of the buffer.
///   - size: Buffer dimensions; only `size.x` is used, as the number of elements per row.
///   - at: Column (`x`) and row (`y`) of the element to read. No bounds checking is performed.
/// - Returns: The element stored at `at.y * size.x + at.x`.
func sampleDepthRaw(_ pointer: UnsafeMutablePointer<Float32>, size: SIMD2<Int>, at: SIMD2<Int>) -> Float {
    let rowStart = at.y * size.x
    let sample: Float32 = pointer.advanced(by: rowStart + at.x).pointee
    return sample
}
/// Unprojects a depth-map pixel into world space using the inverted intrinsics matrix.
///
/// This also works. Adapted from:
/// https://developer.apple.com/forums/thread/676368
///
/// - Parameters:
///   - depthMapPixelPoint: Pixel coordinate in the depth map.
///   - depth: Depth sampled at that pixel.
///   - cameraIntrinsicsInverted: Inverse of the intrinsics matrix expressed at depth-map resolution.
///   - viewMatrixInverted: Inverse of the camera view matrix.
/// - Returns: The transformed point after dividing by its `w` component.
func worldPoint(depthMapPixelPoint: SIMD2<Float>, depth: Float, cameraIntrinsicsInverted: simd_float3x3, viewMatrixInverted: simd_float4x4) -> SIMD3<Float> {
    let homogeneousPixel = simd_float3(depthMapPixelPoint, 1)
    // Scale the back-projected ray by the negated depth.
    let scaledRay = cameraIntrinsicsInverted * homogeneousPixel * -depth
    // Flip the sign of X before transforming.
    let cameraSpacePoint = simd_float4(-scaledRay.x, scaledRay.y, scaledRay.z, 1)
    let transformed = viewMatrixInverted * cameraSpacePoint
    let normalized = transformed / transformed.w
    return SIMD3<Float>(normalized.x, normalized.y, normalized.z)
}
/// Unprojects a depth-map pixel into world space using the pinhole-camera equations.
///
/// This one is adapted from:
/// http://nicolas.burrus.name/index.php/Research/KinectCalibration
///
/// - Parameters:
///   - depthMapPixelPoint: Pixel coordinate in the depth map.
///   - depth: Depth sampled at that pixel.
///   - cameraIntrinsics: Intrinsics matrix; `[0][0]` / `[1][1]` are read as the focal lengths and
///     `[2][0]` / `[2][1]` as the principal point.
///   - viewMatrixInverted: Inverse of the camera view matrix.
/// - Returns: The x, y and z components of the transformed point.
func worldPoint(depthMapPixelPoint: SIMD2<Float>, depth: Float, cameraIntrinsics: simd_float3x3, viewMatrixInverted: simd_float4x4) -> SIMD3<Float> {
    let focalLength = SIMD2<Float>(cameraIntrinsics.columns.0.x, cameraIntrinsics.columns.1.y)
    let principalPoint = SIMD2<Float>(cameraIntrinsics.columns.2.x, cameraIntrinsics.columns.2.y)

    // (pixel - principalPoint) * depth / focalLength, evaluated per axis.
    let planar = (depthMapPixelPoint - principalPoint) * depth / focalLength

    // Y is UP in camera space, vs it being DOWN in image space.
    let cameraSpacePoint = simd_float4(planar.x, -planar.y, -depth, 1)
    let transformed = viewMatrixInverted * cameraSpacePoint
    return SIMD3<Float>(transformed.x, transformed.y, transformed.z)
}
extension CVPixelBuffer {
    /// The buffer's width (`x`) and height (`y`) in pixels.
    ///
    /// Planar buffers report the dimensions of plane 0. Non-planar buffers report the
    /// whole-buffer dimensions, because the per-plane accessors are documented to
    /// return 0 for them.
    var size: SIMD2<Int> {
        if CVPixelBufferIsPlanar(self) {
            return .init(x: CVPixelBufferGetWidthOfPlane(self, 0),
                         y: CVPixelBufferGetHeightOfPlane(self, 0))
        }
        return .init(x: CVPixelBufferGetWidth(self),
                     y: CVPixelBufferGetHeight(self))
    }
}
@fabio914
Copy link

fabio914 commented Feb 7, 2022

Hi 👋 I noticed a few issues when I was playing with your code.

This line should be replaced with:

         let localPoint = cameraIntrinsicsInverted * simd_float3(depthMapPixelPoint, 1) * -depth

We're also missing an extension on CVPixelBuffer:

extension CVPixelBuffer {

    /// The buffer's width (`x`) and height (`y`) in pixels.
    ///
    /// Planar buffers report the dimensions of plane 0. Non-planar buffers report the
    /// whole-buffer dimensions, because the per-plane accessors are documented to
    /// return 0 for them.
    var size: SIMD2<Int> {
        if CVPixelBufferIsPlanar(self) {
            return .init(x: CVPixelBufferGetWidthOfPlane(self, 0),
                         y: CVPixelBufferGetHeightOfPlane(self, 0))
        }
        return .init(x: CVPixelBufferGetWidth(self),
                     y: CVPixelBufferGetHeight(self))
    }
}

For anyone else trying this code, make sure to run this with this ARKit configuration:

        // World tracking with the `.smoothedSceneDepth` frame semantic enabled;
        // `session(_:didUpdate:)` reads the result through `frame.smoothedSceneDepth`.
        let configuration = ARWorldTrackingConfiguration()
        configuration.frameSemantics = .smoothedSceneDepth
        sceneView.session.run(configuration)

@snowzurfer
Copy link
Author

@fabio914 thanks so much for the feedback and for trying out the code.
You're right, those parts are missing as I didn't intend this to be "ready-to-use".
I'll fix the typos though, and add your suggestions so that it's more complete.

@fabio914
Copy link

fabio914 commented Feb 7, 2022

Btw @snowzurfer, I've managed to build a version with color.

IMG_1899

My version is still not ideal but here's the updated code if you're interested:

   /// Per-frame session callback: samples the smoothed depth map on a regular grid, moves each
   /// debug marker to the world-space position of its sample and tints it with the camera
   /// image's colour at the corresponding pixel.
   ///
   /// Returns without touching the markers when the frame carries no smoothed depth, when a
   /// buffer cannot be locked for reading, or when a plane's base address is unavailable.
   func session(_ session: ARSession, didUpdate frame: ARFrame) {
        guard let smoothedDepth = frame.smoothedSceneDepth?.depthMap else {
            return
        }
        let capturedImage = frame.capturedImage

        let lockFlags = CVPixelBufferLockFlags.readOnly
        guard CVPixelBufferLockBaseAddress(smoothedDepth, lockFlags) == kCVReturnSuccess else {
            return
        }
        defer {
            CVPixelBufferUnlockBaseAddress(smoothedDepth, lockFlags)
        }

        guard CVPixelBufferLockBaseAddress(capturedImage, lockFlags) == kCVReturnSuccess else {
            return
        }
        defer {
            CVPixelBufferUnlockBaseAddress(capturedImage, lockFlags)
        }

        // The per-plane accessor is documented to return NULL for non-planar buffers,
        // so pick the accessor that matches the depth buffer's layout.
        let rawDepthBaseAddress = CVPixelBufferIsPlanar(smoothedDepth)
            ? CVPixelBufferGetBaseAddressOfPlane(smoothedDepth, 0)
            : CVPixelBufferGetBaseAddress(smoothedDepth)
        guard let baseAddress = rawDepthBaseAddress,
              let lumaBaseAddress = CVPixelBufferGetBaseAddressOfPlane(capturedImage, 0),
              let chromaBaseAddress = CVPixelBufferGetBaseAddressOfPlane(capturedImage, 1) else {
            return
        }
        let depthByteBuffer = baseAddress.assumingMemoryBound(to: Float32.self)
        let lumaByteBuffer = lumaBaseAddress.assumingMemoryBound(to: UInt8.self)
        // Each chroma element is read as one 16-bit word that is split into two bytes below.
        let chromaByteBuffer = chromaBaseAddress.assumingMemoryBound(to: UInt16.self)

        // The `size(ofPlane:)` accessor simply reads the plane's width and height in pixels.
        //
        // Both buffers have the same aspect ratio (e.g. 192 / 256 = 1440 / 1920 = 0.75),
        // so a single scale factor per axis maps one onto the other.
        let depthMapSize = smoothedDepth.size(ofPlane: 0)
        let capturedImageSize = capturedImage.size(ofPlane: 0)
        let lumaSize = capturedImageSize
        let chromaSize = capturedImage.size(ofPlane: 1)

        var cameraIntrinsics = frame.camera.intrinsics
        let depthResolution = simd_float2(x: Float(depthMapSize.x), y: Float(depthMapSize.y))
        let scaleRes = simd_float2(x: Float(capturedImageSize.x) / depthResolution.x,
                                   y: Float(capturedImageSize.y) / depthResolution.y)
        // Make the camera intrinsics be with respect to Depth.
        cameraIntrinsics[0][0] /= scaleRes.x
        cameraIntrinsics[1][1] /= scaleRes.y

        cameraIntrinsics[2][0] /= scaleRes.x
        cameraIntrinsics[2][1] /= scaleRes.y

        // This will be the long size, because of the rotation
        let horizontalStep = Float(depthMapSize.x) / Float(horizontalPoints)
        let halfHorizontalStep = horizontalStep / 2
        // This will be the short size, because of the rotation
        let verticalStep = Float(depthMapSize.y) / Float(verticalPoints)
        let halfVerticalStep = verticalStep / 2

        // Scale factors that map a depth-map coordinate onto the luma and chroma planes.
        let depthWidthToLumaWidth = Float(lumaSize.x) / Float(depthMapSize.x)
        let depthHeightToLumaHeight = Float(lumaSize.y) / Float(depthMapSize.y)

        let depthWidthToChromaWidth = Float(chromaSize.x) / Float(depthMapSize.x)
        let depthHeightToChromaHeight = Float(chromaSize.y) / Float(depthMapSize.y)

        // This is crucial: you need to always use the view matrix for Landscape Right.
        // It is constant for the whole frame, so invert it once rather than once per sample.
        let viewMatrixInverted = frame.camera.viewMatrix(for: .landscapeRight).inverse

        for h in 0..<horizontalPoints {
            for v in 0..<verticalPoints {
                // Sample at the centre of each grid cell.
                let x = Float(h) * horizontalStep + halfHorizontalStep
                let y = Float(v) * verticalStep + halfVerticalStep
                let depthMapPoint = simd_float2(x, y)

                // Sample depth
                let metricDepth = sampleDepthRaw(depthByteBuffer, size: depthMapSize, at: .init(depthMapPoint))

                let wp = worldPoint(depthMapPixelPoint: depthMapPoint,
                                    depth: metricDepth,
                                    cameraIntrinsics: cameraIntrinsics,
                                    viewMatrixInverted: viewMatrixInverted)

                // Sample Image
                let lumaPoint = simd_float2(x * depthWidthToLumaWidth, y * depthHeightToLumaHeight)
                let luma = sampleLuma(lumaByteBuffer, size: lumaSize, at: .init(lumaPoint))

                let chromaPoint = simd_float2(x * depthWidthToChromaWidth, y * depthHeightToChromaHeight)
                let chroma = sampleChroma(chromaByteBuffer, size: chromaSize, at: .init(chromaPoint))

                // High byte of the 16-bit word is used as Cr, low byte as Cb.
                let cr = UInt8(chroma >> 8)
                let cb = UInt8(chroma & 0x00FF)

                let node = depthNodes[v * horizontalPoints + h]
                node.simdWorldPosition = wp
                node.geometry?.firstMaterial?.diffuse.contents = UIColor(y: luma, cb: cb, cr: cr)
            }
        }
    }

where the setup() function is also a bit different (so that different nodes can have different materials):

/// Attaches the marker container to the scene and creates one green cube per sampled depth point.
///
/// Every marker gets its own `SCNBox`, so each node's material can be recoloured independently.
func setup() {
    scene.rootNode.addChildNode(parentDebugNodes)

    // Edge length of each marker cube, in scene units.
    let edge: CGFloat = 0.005
    let markerCount = horizontalPoints * verticalPoints

    let markers: [SCNNode] = (0..<markerCount).map { _ in
        let box = SCNBox(width: edge, height: edge, length: edge, chamferRadius: 0)
        box.firstMaterial?.diffuse.contents = UIColor.green
        return SCNNode(geometry: box)
    }

    markers.forEach { parentDebugNodes.addChildNode($0) }
    depthNodes.append(contentsOf: markers)
}

And these are the other auxiliary functions I wrote:

/// Reads one byte from a row-major `UInt8` buffer.
///
/// - Parameters:
///   - pointer: Start of the buffer.
///   - size: Buffer dimensions; only `size.x` is used, as the number of elements per row.
///   - at: Column (`x`) and row (`y`) of the element to read. No bounds checking is performed.
/// - Returns: The element stored at `at.y * size.x + at.x`.
func sampleLuma(_ pointer: UnsafeMutablePointer<UInt8>, size: SIMD2<Int>, at: SIMD2<Int>) -> UInt8 {
    let rowStart = at.y * size.x
    return pointer.advanced(by: rowStart + at.x).pointee
}

/// Reads one 16-bit element from a row-major `UInt16` buffer.
///
/// - Parameters:
///   - pointer: Start of the buffer.
///   - size: Buffer dimensions; only `size.x` is used, as the number of elements per row.
///   - at: Column (`x`) and row (`y`) of the element to read. No bounds checking is performed.
/// - Returns: The element stored at `at.y * size.x + at.x`.
func sampleChroma(_ pointer: UnsafeMutablePointer<UInt16>, size: SIMD2<Int>, at: SIMD2<Int>) -> UInt16 {
    let rowStart = at.y * size.x
    return pointer.advanced(by: rowStart + at.x).pointee
}

and this extension on UIColor to convert from YCbCr to RGB:

extension UIColor {

    /// Red, green and blue luma weights used by the YCbCr conversion
    /// (presumably the BT.601 coefficients — confirm against the capture format).
    private static let encoding: (r: CGFloat, g: CGFloat, b: CGFloat) = (0.299, 0.587, 0.114)

    /// Creates a colour from 8-bit Y, Cb and Cr components.
    ///
    /// `y` is scaled to 0...1 and `cb` / `cr` to -0.5...0.5 before the conversion.
    /// The resulting RGB components are passed on without clamping.
    ///
    /// - Parameters:
    ///   - y: Luma component.
    ///   - cb: Blue-difference chroma component.
    ///   - cr: Red-difference chroma component.
    ///   - alpha: Opacity of the resulting colour; defaults to `1.0`.
    convenience init(y: UInt8, cb: UInt8, cr: UInt8, alpha: CGFloat = 1.0) {
        let weights = UIColor.encoding

        let luma = CGFloat(y) / 255.0
        let blueDifference = CGFloat(cb) / 255.0 - 0.5
        let redDifference = CGFloat(cr) / 255.0 - 0.5

        // Per-channel gains derived from the luma weights.
        let redGain = (1.0 - weights.r) / 0.5
        let blueGain = (1.0 - weights.b) / 0.5
        let greenFromBlue = (weights.b * (1.0 - weights.b)) / (0.5 * weights.g)
        let greenFromRed = (weights.r * (1.0 - weights.r)) / (0.5 * weights.g)

        self.init(red: luma + redDifference * redGain,
                  green: luma - blueDifference * greenFromBlue - redDifference * greenFromRed,
                  blue: luma + blueDifference * blueGain,
                  alpha: alpha)
    }
}

and a different extension on CVPixelBuffer:

extension CVPixelBuffer {

    /// Returns the width (`x`) and height (`y`), in pixels, of the given plane.
    ///
    /// For a non-planar buffer, plane 0 is answered with the whole-buffer dimensions,
    /// because the per-plane accessors are documented to return 0 for non-planar buffers.
    ///
    /// - Parameter plane: Index of the plane to measure; defaults to `0`.
    func size(ofPlane plane: Int = 0) -> SIMD2<Int> {
        if plane == 0 && !CVPixelBufferIsPlanar(self) {
            return .init(x: CVPixelBufferGetWidth(self),
                         y: CVPixelBufferGetHeight(self))
        }
        return .init(x: CVPixelBufferGetWidthOfPlane(self, plane),
                     y: CVPixelBufferGetHeightOfPlane(self, plane))
    }
}

EDIT

I've uploaded my project to this repository.

@snowzurfer
Copy link
Author

It looks great, and thanks for posting the rest of your code!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment