Making CIContext.render(CIImage, CVPixelBuffer) work with AVAssetWriter

I want to use Core Image for processing a bunch of CGImage objects and turning them into a QuickTime movie on macOS. The following code demonstrates what's needed, but the output contains a lot of blank (black) frames:

import AppKit
import AVFoundation
import CoreGraphics
import Foundation
import CoreVideo
import Metal

// Video output url.
let url: URL = try! FileManager.default.url(for: .downloadsDirectory, in: .userDomainMask, appropriateFor: nil, create: false).appendingPathComponent("av.mov")
try? FileManager.default.removeItem(at: url)

// Video frame size, total frame count, frame duration (1/30 s per frame) and frame image.
let frameSize: CGSize = CGSize(width: 2000, height: 1000)
let frameCount: Int = 100
let frameRate: Double = 1 / 30
let frameImage: CGImage

frameImage = NSImage(size: frameSize, flipped: false, drawingHandler: {
    NSColor.red.setFill()
    $0.fill()
    return true
}).cgImage(forProposedRect: nil, context: nil, hints: nil)!

let pixelBufferAttributes: [CFString: Any]
let outputSettings: [String: Any]

pixelBufferAttributes = [
    kCVPixelBufferPixelFormatTypeKey: Int(kCVPixelFormatType_32ARGB),
    kCVPixelBufferWidthKey: Float(frameSize.width),
    kCVPixelBufferHeightKey: Float(frameSize.height),
    kCVPixelBufferMetalCompatibilityKey: true,
    kCVPixelBufferCGImageCompatibilityKey: true,
    kCVPixelBufferCGBitmapContextCompatibilityKey: true,
]

outputSettings = [
    AVVideoCodecKey: AVVideoCodecType.h264,
    AVVideoWidthKey: Int(frameSize.width),
    AVVideoHeightKey: Int(frameSize.height),
]

let writer: AVAssetWriter = try! AVAssetWriter(outputURL: url, fileType: .mov)
let input: AVAssetWriterInput = AVAssetWriterInput(mediaType: .video, outputSettings: outputSettings)
let pixelBufferAdaptor: AVAssetWriterInputPixelBufferAdaptor = AVAssetWriterInputPixelBufferAdaptor(assetWriterInput: input, sourcePixelBufferAttributes: pixelBufferAttributes as [String: Any])

input.expectsMediaDataInRealTime = true

precondition(writer.canAdd(input))
writer.add(input)

precondition(writer.startWriting())
writer.startSession(atSourceTime: CMTime.zero)

let colorSpace: CGColorSpace = CGColorSpace(name: CGColorSpace.sRGB) ?? CGColorSpaceCreateDeviceRGB()
let context = CIContext(mtlDevice: MTLCreateSystemDefaultDevice()!)

Swift.print("Starting the render…")

// Preferred scenario: using Core Image to fill a buffer obtained from the pixel buffer adaptor's pool.
// Shows that CIImage + AVAssetWriterInputPixelBufferAdaptor do not work together.

for frameNumber in 0 ..< frameCount {
    var pixelBuffer: CVPixelBuffer?
    guard let pixelBufferPool: CVPixelBufferPool = pixelBufferAdaptor.pixelBufferPool else { preconditionFailure() }
    precondition(CVPixelBufferPoolCreatePixelBuffer(nil, pixelBufferPool, &pixelBuffer) == kCVReturnSuccess)

    precondition(CVPixelBufferLockBaseAddress(pixelBuffer!, []) == kCVReturnSuccess)
    defer { precondition(CVPixelBufferUnlockBaseAddress(pixelBuffer!, []) == kCVReturnSuccess) }

    let ciImage = CIImage(cgImage: frameImage)
    context.render(ciImage, to: pixelBuffer!)

    // 💥 This fails – the pixel buffer doesn't get filled. AT ALL! Why? How to make it work?
    let bytes = UnsafeBufferPointer(start: CVPixelBufferGetBaseAddress(pixelBuffer!)!.assumingMemoryBound(to: UInt8.self), count: CVPixelBufferGetDataSize(pixelBuffer!))
    precondition(bytes.contains(where: { $0 != 0 }))

    while !input.isReadyForMoreMediaData { Thread.sleep(forTimeInterval: 10 / 1000) }
    precondition(pixelBufferAdaptor.append(pixelBuffer!, withPresentationTime: CMTime(seconds: Double(frameNumber) * frameRate, preferredTimescale: 600)))
}


// Unpreferred scenario: using Core Image to fill a manually created buffer. Proves that CIImage
// can fill a buffer and that this path works.

// for frameNumber in 0 ..< frameCount {
//     var pixelBuffer: CVPixelBuffer?
//     precondition(CVPixelBufferCreate(nil, frameImage.width, frameImage.height, kCVPixelFormatType_32ARGB, pixelBufferAttributes as CFDictionary, &pixelBuffer) == kCVReturnSuccess)
//
//     precondition(CVPixelBufferLockBaseAddress(pixelBuffer!, []) == kCVReturnSuccess)
//     defer { precondition(CVPixelBufferUnlockBaseAddress(pixelBuffer!, []) == kCVReturnSuccess) }
//
//     let ciImage = CIImage(cgImage: frameImage)
//     context.render(ciImage, to: pixelBuffer!)
//
//     // ✅ This passes.
//     let bytes = UnsafeBufferPointer(start: CVPixelBufferGetBaseAddress(pixelBuffer!)!.assumingMemoryBound(to: UInt8.self), count: CVPixelBufferGetDataSize(pixelBuffer!))
//     precondition(bytes.contains(where: { $0 != 0 }))
//
//     while !input.isReadyForMoreMediaData { Thread.sleep(forTimeInterval: 10 / 1000) }
//     precondition(pixelBufferAdaptor.append(pixelBuffer!, withPresentationTime: CMTime(seconds: Double(frameNumber) * frameRate, preferredTimescale: 600)))
// }


// Unpreferred scenario: using Core Graphics to fill a buffer obtained from the pixel buffer adaptor's pool.
// Shows that a buffer from the adaptor's pool can be filled and works.

// for frameNumber in 0 ..< frameCount {
//     var pixelBuffer: CVPixelBuffer?
//     guard let pixelBufferPool: CVPixelBufferPool = pixelBufferAdaptor.pixelBufferPool else { preconditionFailure() }
//     precondition(CVPixelBufferPoolCreatePixelBuffer(nil, pixelBufferPool, &pixelBuffer) == kCVReturnSuccess)
//
//     precondition(CVPixelBufferLockBaseAddress(pixelBuffer!, []) == kCVReturnSuccess)
//     defer { precondition(CVPixelBufferUnlockBaseAddress(pixelBuffer!, []) == kCVReturnSuccess) }
//
//     guard let context: CGContext = CGContext(data: CVPixelBufferGetBaseAddress(pixelBuffer!), width: frameImage.width, height: frameImage.height, bitsPerComponent: 8, bytesPerRow: CVPixelBufferGetBytesPerRow(pixelBuffer!), space: colorSpace, bitmapInfo: CGImageAlphaInfo.premultipliedFirst.rawValue) else { preconditionFailure() }
//     context.clear(CGRect(origin: .zero, size: frameSize))
//     context.draw(frameImage, in: CGRect(origin: .zero, size: frameSize))
//
//     // ✅ This passes.
//     let bytes = UnsafeBufferPointer(start: CVPixelBufferGetBaseAddress(pixelBuffer!)!.assumingMemoryBound(to: UInt8.self), count: CVPixelBufferGetDataSize(pixelBuffer!))
//     precondition(bytes.contains(where: { $0 != 0 }))
//
//     while !input.isReadyForMoreMediaData { Thread.sleep(forTimeInterval: 10 / 1000) }
//     precondition(pixelBufferAdaptor.append(pixelBuffer!, withPresentationTime: CMTime(seconds: Double(frameNumber) * frameRate, preferredTimescale: 600)))
// }

let semaphore = DispatchSemaphore(value: 0)

input.markAsFinished()
writer.endSession(atSourceTime: CMTime(seconds: Double(frameCount) * frameRate, preferredTimescale: 600))
writer.finishWriting(completionHandler: { semaphore.signal() })

semaphore.wait()

Swift.print("Successfully finished rendering to \(url.path)")

The commented-out scenarios above work via CGContext, but I need CIContext in order to make use of the GPU. The problem seems to be with the pixel buffers provided by the AVAssetWriterInputPixelBufferAdaptor's buffer pool. Rendering with CIContext into individually created buffers and appending those to the adaptor works, but is highly inefficient. Rendering with CIContext into buffers provided by the adaptor's pool results in no data being written into the buffer at all – it literally contains all zeroes, as if the two were incompatible. However, drawing the CGImage with CGContext works, as does copying the data manually.

The main observation is that CIContext.render appears to work asynchronously, or that something goes wrong between the buffer being filled and the data being written into the video stream. In other words, there's no data in the buffer by the time it gets flushed. The following points in that direction:

  1. Removing the buffer locking results in almost all frames being written, except for the first few; with that change the above code actually produces correct output, but with my actual image data the behaviour is as described.
  2. Using a different codec, like ProRes 422, results in almost all frames being written correctly, with just a few blanks; with that codec the above code also produces correct output, but larger and more complex images result in skipped frames.

What's wrong with this code and what's the right way to do it?

P.S. Most iOS examples use pretty much the same implementation and seem to work perfectly fine. I found a hint that the behaviour might differ on macOS, but I can't find any official documentation on this.

Pteropod answered 7/5, 2019 at 8:30 Comment(2)
You need an autorelease pool around the main loop of frames there; your current code will just consume all memory until the device crashes (see the sketch after these comments). – Enjoyable
Also, do not use kCVPixelFormatType_32ARGB; you should be using kCVPixelFormatType_32BGRA. – Enjoyable
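
A minimal sketch of what these two comments suggest, reusing the variable names from the question (the lock/verify steps are omitted for brevity): switch the source pixel format to BGRA and drain per-frame temporaries with an autorelease pool.

// In pixelBufferAttributes, prefer BGRA over ARGB:
//     kCVPixelBufferPixelFormatTypeKey: Int(kCVPixelFormatType_32BGRA),

for frameNumber in 0 ..< frameCount {
    autoreleasepool {
        var pixelBuffer: CVPixelBuffer?
        guard let pixelBufferPool = pixelBufferAdaptor.pixelBufferPool else { preconditionFailure() }
        precondition(CVPixelBufferPoolCreatePixelBuffer(nil, pixelBufferPool, &pixelBuffer) == kCVReturnSuccess)

        // Render and append as in the question; the autorelease pool releases the
        // per-iteration temporaries instead of letting them accumulate across all frames.
        context.render(CIImage(cgImage: frameImage), to: pixelBuffer!)

        while !input.isReadyForMoreMediaData { Thread.sleep(forTimeInterval: 10 / 1000) }
        precondition(pixelBufferAdaptor.append(pixelBuffer!, withPresentationTime: CMTime(seconds: Double(frameNumber) * frameRate, preferredTimescale: 600)))
    }
}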

After speaking with Apple Developer Technical Support, it appears that:

Core Image defers the rendering until the client requests access to the frame buffer, i.e. calls CVPixelBufferLockBaseAddress.

So the solution is simply to call CVPixelBufferLockBaseAddress after CIContext.render, as shown below:

for frameNumber in 0 ..< frameCount {
    var pixelBuffer: CVPixelBuffer?
    guard let pixelBufferPool: CVPixelBufferPool = pixelBufferAdaptor.pixelBufferPool else { preconditionFailure() }
    precondition(CVPixelBufferPoolCreatePixelBuffer(nil, pixelBufferPool, &pixelBuffer) == kCVReturnSuccess)

    let ciImage = CIImage(cgImage: frameImage)
    context.render(ciImage, to: pixelBuffer!)

    precondition(CVPixelBufferLockBaseAddress(pixelBuffer!, []) == kCVReturnSuccess)
    defer { precondition(CVPixelBufferUnlockBaseAddress(pixelBuffer!, []) == kCVReturnSuccess) }

    let bytes = UnsafeBufferPointer(start: CVPixelBufferGetBaseAddress(pixelBuffer!)!.assumingMemoryBound(to: UInt8.self), count: CVPixelBufferGetDataSize(pixelBuffer!))
    precondition(bytes.contains(where: { $0 != 0 }))

    while !input.isReadyForMoreMediaData { Thread.sleep(forTimeInterval: 10 / 1000) }
    precondition(pixelBufferAdaptor.append(pixelBuffer!, withPresentationTime: CMTime(seconds: Double(frameNumber) * frameRate, preferredTimescale: 600)))
}
Pteropod answered 28/5, 2019 at 3:34 Comment(3)
But context.render(ciImage, to: pixelBuffer!) still takes too much CPU, especially at 4K resolution and 60 fps; an iPhone 11 can record at most about 33 fps in 4K with context.render(ciImage, to: pixelBuffer!), because the call takes about 30 ms. – Fulgurate
I feel your pain… There might be two separate issues here – CIImage rendering and video encoding. I would test the two separately to find the bottleneck. For my use case, I ended up using ProRes for intermediate encoding (see the sketch after these comments) – my tests showed it has the lowest overhead as it does the least encoding work. I then convert the ProRes video to a more efficient format to avoid 10 GB files. If you make any interesting discoveries or find optimisation tricks, it would be fantastic if you could share! 🙏 – Pteropod
Yeah, I'm trying to figure it out in the following question: #79070414. – Fulgurate
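
For reference, the intermediate ProRes encoding mentioned above only requires a different codec in the output settings; a minimal sketch, assuming the rest of the question's setup stays the same (the names proResOutputSettings and proResInput are illustrative):

// Apple ProRes 422 keeps encoder overhead low; convert to H.264/HEVC in a second pass.
let proResOutputSettings: [String: Any] = [
    AVVideoCodecKey: AVVideoCodecType.proRes422,
    AVVideoWidthKey: Int(frameSize.width),
    AVVideoHeightKey: Int(frameSize.height),
]
let proResInput = AVAssetWriterInput(mediaType: .video, outputSettings: proResOutputSettings)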

For your use case it would be better to use the pull-style APIs of AVAssetWriterInput because you don't need to process any media in real-time (like you would when capturing from a camera).

So rather than pausing the thread when the input isn't ready, just wait for it to pull the next frame, as sketched below. Remember to also set expectsMediaDataInRealTime to false in this case.
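
A minimal sketch of that pull-style flow, reusing the names from the question and assuming the DispatchSemaphore from the end of the question's code is created before this block is installed (the queue label is arbitrary):

input.expectsMediaDataInRealTime = false

let mediaQueue = DispatchQueue(label: "video.writing")
var frameNumber = 0

input.requestMediaDataWhenReady(on: mediaQueue) {
    // AVFoundation invokes this block whenever the input can take more data;
    // keep appending until it says stop or all frames have been written.
    while input.isReadyForMoreMediaData {
        guard frameNumber < frameCount else {
            input.markAsFinished()
            writer.endSession(atSourceTime: CMTime(seconds: Double(frameCount) * frameRate, preferredTimescale: 600))
            writer.finishWriting { semaphore.signal() }
            return
        }

        var pixelBuffer: CVPixelBuffer?
        guard let pool = pixelBufferAdaptor.pixelBufferPool,
              CVPixelBufferPoolCreatePixelBuffer(nil, pool, &pixelBuffer) == kCVReturnSuccess else { return }

        context.render(CIImage(cgImage: frameImage), to: pixelBuffer!)

        let time = CMTime(seconds: Double(frameNumber) * frameRate, preferredTimescale: 600)
        precondition(pixelBufferAdaptor.append(pixelBuffer!, withPresentationTime: time))
        frameNumber += 1
    }
}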

I think the main problem with your current approach is that, when the writer isn't ready yet, you pause the very thread the video processing is happening on.

(By the way: you can create CIImages with a solid color directly (CIImage(color:)); no need to create a CGImage first.)
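
For example, a minimal one-liner (the infinite-extent colour image just needs cropping to the frame size):

let solidRed = CIImage(color: CIColor(red: 1, green: 0, blue: 0)).cropped(to: CGRect(origin: .zero, size: frameSize))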

Ballou answered 7/5, 2019 at 11:51 Comment(7)
Hey Frank! Appreciate the input. 1. I'm actually capturing parts of the screen and specific windows and post-processing the returned CGImage in real time, so that part is necessary. 2. Correct, but the same doesn't apply to CGImage when you uncomment the "working" part, which was left in for convenience in case someone decides to play with the code. By the way, good point on pull vs. push, I had the same thought. While both approaches appear valid, pulling would require a major rewrite, which I'd rather avoid unless there's supporting evidence that it would solve the issue. – Pteropod
You can try the newish CIRenderDestination API: create a CIRenderDestination from the pixel buffer and call let task = context.startTask(toRender: image, to: destination) on the CIContext. You can then explicitly call waitUntilCompleted() on the task object to make sure Core Image has finished processing before you try to write the pixel buffer to the video file (see the sketch after these comments). – Ballou
That looked very promising, but unfortunately it produces the same result… damn! I came across a few logging statements while digging through the Core Image disassembly. Do you know if there are any environment variables to enable verbose logging? – Pteropod
I don't know, but that would be awesome… Have you put a breakpoint at the point where you append the pixel buffers and inspected them with Quick Look? It would be interesting to know whether they are black already. – Ballou
Yes, just tried that plus a few options. CIContext.render produces black pixels and no output – the entire pixel buffer is just zeroes right after the render. Creating a new pixel buffer directly from the CGImage data produces broken frames, as if its data gets partly released and parts of it turn black. Finally, memcpying the CGImage data straight into the buffer works like the CGContext case – no artifacts and slightly faster, but the colour planes are all mixed up. – Pteropod
Have you tried different format types? kCVPixelFormatType_32BGRA for instance? – Ballou
Let us continue this discussion in chat. – Pteropod
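
For reference, a minimal sketch of the CIRenderDestination approach suggested in the comments above, reusing the names from the question (it reportedly did not cure the black frames here, but it does make the completion of the render explicit):

let destination = CIRenderDestination(pixelBuffer: pixelBuffer!)
do {
    // startTask(toRender:to:) kicks off the render; waitUntilCompleted() blocks
    // until Core Image has finished writing into the pixel buffer.
    let task = try context.startTask(toRender: CIImage(cgImage: frameImage), to: destination)
    _ = try task.waitUntilCompleted()
} catch {
    preconditionFailure("Core Image render failed: \(error)")
}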