Libav (ffmpeg) copying decoded video timestamps to encoder

I am writing an application that decodes a single video stream from an input file (any codec, any container), does a bunch of image processing, and encodes the results to an output file (single video stream, Quicktime RLE, MOV). I am using ffmpeg's libav 3.1.5 (Windows build for now, but the application will be cross-platform).

There is a 1:1 correspondence between input and output frames and I want the frame timing in the output to be identical to the input. I am having a really, really hard time accomplishing this. So my general question is: How do I reliably (as in, in all cases of inputs) set the output frame timing identical to the input?

It took me a very long time to slog through the API and get to the point I am at now. I put together a minimal test program to work with:

#include <cstdio>

extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/avutil.h>
#include <libavutil/imgutils.h>
#include <libswscale/swscale.h>
}

using namespace std;


struct DecoderStuff {
    AVFormatContext *formatx;
    int nstream;
    AVCodec *codec;
    AVStream *stream;
    AVCodecContext *codecx;
    AVFrame *rawframe;
    AVFrame *rgbframe;
    SwsContext *swsx;
};


struct EncoderStuff {
    AVFormatContext *formatx;
    AVCodec *codec;
    AVStream *stream;
    AVCodecContext *codecx;
};


template <typename T>
static void dump_timebase (const char *what, const T *o) {
    if (o)
        printf("%s timebase: %d/%d\n", what, o->time_base.num, o->time_base.den);
    else
        printf("%s timebase: null object\n", what);
}


// reads next frame into d.rawframe and d.rgbframe. returns false on error/eof.
static bool read_frame (DecoderStuff &d) {

    AVPacket packet;
    int err = 0, haveframe = 0;

    // read
    while (!haveframe && err >= 0 && ((err = av_read_frame(d.formatx, &packet)) >= 0)) {
       if (packet.stream_index == d.nstream) {
           err = avcodec_decode_video2(d.codecx, d.rawframe, &haveframe, &packet);
       }
       av_packet_unref(&packet);
    }

    // error output
    if (!haveframe && err != AVERROR_EOF) {
        char buf[500];
        av_strerror(err, buf, sizeof(buf) - 1);
        buf[499] = 0;
        printf("read_frame: %s\n", buf);
    }

    // convert to rgb
    if (haveframe) {
        sws_scale(d.swsx, d.rawframe->data, d.rawframe->linesize, 0, d.rawframe->height,
                  d.rgbframe->data, d.rgbframe->linesize);
    }

    return haveframe;

}


// writes an output frame, returns false on error.
static bool write_frame (EncoderStuff &e, AVFrame *inframe) {

    // see notes below about why outframe is allocated fresh here
    AVFrame *outframe = av_frame_alloc();
    outframe->format = inframe->format;
    outframe->width = inframe->width;
    outframe->height = inframe->height;
    av_image_alloc(outframe->data, outframe->linesize, outframe->width, outframe->height,
                   AV_PIX_FMT_RGB24, 1);
    //av_frame_copy(outframe, inframe);
    static int count = 0;
    for (int n = 0; n < outframe->width * outframe->height; ++ n) {
        outframe->data[0][n*3+0] = ((n+count) % 100) ? 0 : 255;
        outframe->data[0][n*3+1] = ((n+count) % 100) ? 0 : 255;
        outframe->data[0][n*3+2] = ((n+count) % 100) ? 0 : 255;
    }
    ++ count;

    AVPacket packet;
    av_init_packet(&packet);
    packet.size = 0;
    packet.data = NULL;

    int err, havepacket = 0;
    if ((err = avcodec_encode_video2(e.codecx, &packet, outframe, &havepacket)) >= 0 && havepacket) {
        packet.stream_index = e.stream->index;
        err = av_interleaved_write_frame(e.formatx, &packet);
    }

    if (err < 0) {
        char buf[500];
        av_strerror(err, buf, sizeof(buf) - 1);
        buf[499] = 0;
        printf("write_frame: %s\n", buf);
    }

    av_packet_unref(&packet);
    av_freep(&outframe->data[0]);
    av_frame_free(&outframe);

    return err >= 0;

}


int main (int argc, char *argv[]) {

    const char *infile = "wildlife.wmv";
    const char *outfile = "test.mov";
    DecoderStuff d = {};
    EncoderStuff e = {};

    av_register_all();

    // decoder
    avformat_open_input(&d.formatx, infile, NULL, NULL);
    avformat_find_stream_info(d.formatx, NULL);
    d.nstream = av_find_best_stream(d.formatx, AVMEDIA_TYPE_VIDEO, -1, -1, &d.codec, 0);
    d.stream = d.formatx->streams[d.nstream];
    d.codecx = avcodec_alloc_context3(d.codec);
    avcodec_parameters_to_context(d.codecx, d.stream->codecpar);
    avcodec_open2(d.codecx, NULL, NULL);
    d.rawframe = av_frame_alloc();
    d.rgbframe = av_frame_alloc();
    d.rgbframe->format = AV_PIX_FMT_RGB24;
    d.rgbframe->width = d.codecx->width;
    d.rgbframe->height = d.codecx->height;
    av_frame_get_buffer(d.rgbframe, 1);
    d.swsx = sws_getContext(d.codecx->width, d.codecx->height, d.codecx->pix_fmt,
                            d.codecx->width, d.codecx->height, AV_PIX_FMT_RGB24,
                            SWS_POINT, NULL, NULL, NULL);
    //av_dump_format(d.formatx, 0, infile, 0);
    dump_timebase("in stream", d.stream);
    dump_timebase("in stream:codec", d.stream->codec); // note: deprecated
    dump_timebase("in codec", d.codecx);

    // encoder
    avformat_alloc_output_context2(&e.formatx, NULL, NULL, outfile);
    e.codec = avcodec_find_encoder(AV_CODEC_ID_QTRLE);
    e.stream = avformat_new_stream(e.formatx, e.codec);
    e.codecx = avcodec_alloc_context3(e.codec);
    e.codecx->bit_rate = 4000000; // arbitrary for qtrle
    e.codecx->width = d.codecx->width;
    e.codecx->height = d.codecx->height;
    e.codecx->gop_size = 30; // 99% sure this is arbitrary for qtrle
    e.codecx->pix_fmt = AV_PIX_FMT_RGB24;
    e.codecx->time_base = d.stream->time_base; // ???
    e.codecx->flags |= (e.formatx->oformat->flags & AVFMT_GLOBALHEADER) ? AV_CODEC_FLAG_GLOBAL_HEADER : 0;
    avcodec_open2(e.codecx, NULL, NULL);
    avcodec_parameters_from_context(e.stream->codecpar, e.codecx); 
    //av_dump_format(e.formatx, 0, outfile, 1);
    dump_timebase("out stream", e.stream);
    dump_timebase("out stream:codec", e.stream->codec); // note: deprecated
    dump_timebase("out codec", e.codecx);

    // open file and write header
    avio_open(&e.formatx->pb, outfile, AVIO_FLAG_WRITE); 
    avformat_write_header(e.formatx, NULL);

    // frames
    while (read_frame(d) && write_frame(e, d.rgbframe))
        ;

    // write trailer and close file
    av_write_trailer(e.formatx);
    avio_closep(&e.formatx->pb); 

}

A few notes about that:

  • Since all of my attempts at frame timing so far have failed, I've removed almost all timing-related stuff from this code to start with a clean slate.
  • Almost all error checking and cleanup omitted for brevity.
  • The reason I allocate a new output frame with a new buffer in write_frame, rather than using inframe directly, is because this is more representative of what my real application is doing. My real app also uses RGB24 internally, hence the conversions here.
  • The reason I generate a weird pattern in outframe, rather than using e.g. av_frame_copy, is because I just wanted a test pattern that compresses well with Quicktime RLE (my test input ends up generating a 1.7GB output file otherwise).
  • The input video I am using, "wildlife.wmv", can be found here. I've hard-coded the filenames.
  • I am aware that avcodec_decode_video2 and avcodec_encode_video2 are deprecated, but don't care. They work fine, I've already struggled too much getting my head around the latest version of the API, ffmpeg changes their API with nearly every release, and I really don't feel like dealing with avcodec_send_* and avcodec_receive_* right now.
  • I think I'm supposed to finish off by passing a NULL frame to avcodec_encode_video2 to flush some buffers or something, but I'm a bit confused about that. Unless somebody feels like explaining it, let's ignore it for now; it's a separate question. The docs are as vague about this point as they are about everything else.
  • My test input file's frame rate is 29.97.

Now, as for my current attempts. The following timing-related fields are present in the above code, with my questions and confusion noted inline. There are a lot of them, because the API is mind-bogglingly convoluted:

  • main: d.stream->time_base: Input video stream time base. For my test input file this is 1/1000.
  • main: d.stream->codec->time_base: Not sure what this is (I never could make sense of why AVStream has an AVCodecContext field when you always use your own new context anyway); the codec field is also deprecated. For my test input file this is 1/1000.
  • main: d.codecx->time_base: Input codec context time-base. For my test input file this is 0/1. Am I supposed to set it?
  • main: e.stream->time_base: Time base of the output stream I create. What do I set this to?
  • main: e.stream->codec->time_base: Time base of the deprecated and mysterious codec field of the output stream I create. Do I set this to anything?
  • main: e.codecx->time_base: Time base of the encoder context I create. What do I set this to?
  • read_frame: packet.dts: Decoding timestamp of packet read.
  • read_frame: packet.pts: Presentation timestamp of packet read.
  • read_frame: packet.duration: Duration of packet read.
  • read_frame: d.rawframe->pts: Presentation timestamp of the raw decoded frame. This is always 0. Why doesn't the decoder set it...?
  • read_frame: d.rgbframe->pts / write_frame: inframe->pts: Presentation timestamp of decoded frame converted to RGB. Not set to anything currently.
  • read_frame: d.rawframe->pkt_*: Fields copied from packet, discovered after reading this post. They are set correctly but I don't know if they are useful.
  • write_frame: outframe->pts: Presentation timestamp of frame being encoded. Should I set this to something?
  • write_frame: outframe->pkt_*: Timing fields from a packet. Should I set these? They seem to be ignored by the encoder.
  • write_frame: packet.dts: Decoding timestamp of packet being encoded. What do I set it to?
  • write_frame: packet.pts: Presentation timestamp of packet being encoded. What do I set it to?
  • write_frame: packet.duration: Duration of packet being encoded. What do I set it to?

I have tried the following, with the described results. Note that inframe is d.rgbframe:

  1.  
    • Init e.stream->time_base = d.stream->time_base
    • Init e.codecx->time_base = d.codecx->time_base
    • Set d.rgbframe->pts = packet.dts in read_frame
    • Set outframe->pts = inframe->pts in write_frame
    • Result: A warning that the encoder time base is not set (since d.codecx->time_base was 0/1), then a segfault.
  2.  
    • Init e.stream->time_base = d.stream->time_base
    • Init e.codecx->time_base = d.stream->time_base
    • Set d.rgbframe->pts = packet.dts in read_frame
    • Set outframe->pts = inframe->pts in write_frame
    • Result: No warnings, but VLC reports the frame rate as 480.048 (no idea where that number came from) and the file plays too fast. Also, the encoder sets all the timing fields in packet to 0, which was not what I expected. (Edit: Turns out this is because av_interleaved_write_frame, unlike av_write_frame, takes ownership of the packet and swaps it with a blank one, and I was printing the values after that call. So they are not ignored.)
  3.  
    • Init e.stream->time_base = d.stream->time_base
    • Init e.codecx->time_base = d.stream->time_base
    • Set d.rgbframe->pts = packet.dts in read_frame
    • Set any of pts/dts/duration in packet in write_frame to anything.
    • Result: Warnings about packet timestamps not set. Encoder seems to reset all packet timing fields to 0, so none of this has any effect.
  4.  
    • Init e.stream->time_base = d.stream->time_base
    • Init e.codecx->time_base = d.stream->time_base
    • I found these fields, pkt_pts, pkt_dts, and pkt_duration in AVFrame after reading this post, so I tried copying those all the way through to outframe.
    • Result: Really had my hopes up, but ended up with same results as attempt 3 (packet timestamp not set warning, incorrect results).

I tried various other hand-wavey permutations of the above and nothing worked. What I want to do is create an output file that plays back with the same timing and frame rate as the input (29.97 constant frame rate in this case).

So how do I do this? Of the zillions of timing-related fields here, what do I set to make the output timing identical to the input? And how do I do it in a way that handles arbitrary video input formats, which may store their timestamps and time bases in different places? I need this to always work.


For reference, here is a table of all the packet and frame timestamps read from the video stream of my test input file, to give a sense of what my test file looks like. None of the input packets' pts values are set, same with the frames' pts, and for some reason the duration of the first 108 frames is 0. VLC plays the file fine and reports the frame rate as 29.9700089:

Jackhammer answered 27/10, 2016 at 2:41 Comment(2)
@halfer Ha, I just came back to add a reward bounty, too. – Jackhammer
Ah no worries Jason, I thought the answer (and the question) was very good. – Inexperienced

I think your issue here is with time bases, which are admittedly a bit confusing at first.

  • d.stream->time_base: Input video stream time base. This is the resolution of timestamps in the input container. Encoded packets returned from av_read_frame have their timestamps in this resolution.
  • d.stream->codec->time_base: Not sure what this is. It is the old API, left in place for compatibility; you are using codec parameters, so ignore it.
  • d.codecx->time_base: Input codec context time base. For my test input file this is 0/1. Am I supposed to set it? This is the resolution of timestamps for the codec (as opposed to the container). The codec assumes its input encoded packets have their timestamps in this resolution, and it also sets the timestamps of its output decoded frames in this resolution.
  • e.stream->time_base: Time base of the output stream I create. Same as on the input side: this is the resolution of timestamps in the output container.
  • e.stream->codec->time_base: Same as on the input side: ignore this one.
  • e.codecx->time_base: Same as on the input side: the resolution of timestamps for the encoder, which assumes the frames you feed it have their pts in this resolution.

So you need to do the following (a code sketch follows this list):

  • open the demuxer. That part works.
  • set the decoder time base to some "sane" value, because the decoder might not do it for you, and 0/1 is bad. Things won't work as they should if the time base of any component is left unset. The easiest option is to just copy the time base from the demuxer.
  • open the decoder. It might change its time base, or it might not.
  • set the encoder time base. Easiest is to copy the time base from the (now opened) decoder, since you are not changing frame rates or anything.
  • open the encoder. It might change its time base.
  • set the muxer time base. Again, easiest is to copy the time base from the encoder.
  • open the muxer. It might change its time base as well.
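As a minimal sketch of that initialization order, using the same deprecated API and the d/e structs from the question (allocation of e.codecx and the rest of the setup as in the question; error checking omitted):

    // decoder: give the codec context a sane time base before opening it
    d.codecx = avcodec_alloc_context3(d.codec);
    avcodec_parameters_to_context(d.codecx, d.stream->codecpar);
    d.codecx->time_base = d.stream->time_base;  // copy from the demuxer
    avcodec_open2(d.codecx, NULL, NULL);        // may adjust time_base

    // encoder: copy the (possibly adjusted) decoder time base forward
    e.codecx->time_base = d.codecx->time_base;
    avcodec_open2(e.codecx, NULL, NULL);        // may adjust time_base again

    // muxer: copy from the opened encoder; note avformat_write_header may
    // still change e.stream->time_base
    e.stream->time_base = e.codecx->time_base;
    avformat_write_header(e.formatx, NULL);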

Now for each frame (sketched in code after this list):

  • read it from the demuxer
  • convert its timestamps from the demuxer time base to the decoder time base. There is av_packet_rescale_ts to help you do that
  • decode the packet
  • set the frame timestamp (pts) to the value returned by av_frame_get_best_effort_timestamp
  • convert the frame timestamp from the decoder time base to the encoder time base. Use av_rescale_q or av_rescale_q_rnd
  • encode the frame
  • convert the resulting packet's timestamps from the encoder time base to the muxer time base. Again, use av_packet_rescale_ts
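A condensed sketch of that per-frame flow, again with the deprecated API. It assumes packet was just returned by av_read_frame, frame is an allocated AVFrame, and opacket is an initialized, empty AVPacket for the encoder output; the haveframe/havepacket checks and error handling are omitted:

    // demuxer -> decoder: rescale packet timestamps
    av_packet_rescale_ts(&packet, d.stream->time_base, d.codecx->time_base);
    avcodec_decode_video2(d.codecx, frame, &haveframe, &packet);

    // pick the most reliable timestamp the decoder could recover
    frame->pts = av_frame_get_best_effort_timestamp(frame);

    // decoder -> encoder: rescale the frame timestamp
    frame->pts = av_rescale_q(frame->pts, d.codecx->time_base, e.codecx->time_base);
    avcodec_encode_video2(e.codecx, &opacket, frame, &havepacket);

    // encoder -> muxer: rescale packet timestamps
    av_packet_rescale_ts(&opacket, e.codecx->time_base, e.stream->time_base);
    opacket.stream_index = e.stream->index;
    av_interleaved_write_frame(e.formatx, &opacket);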

All this rescaling might be overkill; in particular, maybe encoders don't change their time base on open (in which case you wouldn't need to convert the raw frames' pts).


Regarding flushing: frames you pass to the encoder are not necessarily encoded and output right away, so yes, you are supposed to call avcodec_encode_video2 with NULL as the frame to let the encoder know you are done and to make it output all remaining data (which you need to pass through the muxer like all the other packets). In fact, you are supposed to do so repeatedly until it stops producing packets. See the encoding examples in the doc/examples folder inside ffmpeg for samples.
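A sketch of such a drain loop, under the same assumptions as above (the question's e struct, deprecated API, error handling mostly omitted):

    // drain the encoder by feeding it NULL frames until it stops
    // producing packets
    int havepacket = 1;
    while (havepacket) {
        AVPacket packet;
        av_init_packet(&packet);
        packet.data = NULL;
        packet.size = 0;
        if (avcodec_encode_video2(e.codecx, &packet, NULL, &havepacket) < 0)
            break;
        if (havepacket) {
            av_packet_rescale_ts(&packet, e.codecx->time_base, e.stream->time_base);
            packet.stream_index = e.stream->index;
            av_interleaved_write_frame(e.formatx, &packet);
        }
        av_packet_unref(&packet);
    }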

Sharpset answered 27/10, 2016 at 7:6 Comment(4)
Awesome. Ok, so for the encoding side then, I've got it working from this answer: On the encoder side I set e.stream->time_base = d.stream->time_base as an initial sane value, then avformat_write_header may change it if needed. I set e.codecx->time_base to any sane value (I'm using {1,1000}); I didn't realize it was my choice, and that was one big missing piece. Then when encoding I set the packet pts/dts from inframe's pkt_pts and pkt_dts, leave duration unset, and then let av_packet_rescale_ts do the magic. It's working correctly now. Now the remaining problem is... – Jackhammer
... In my test input stream all the input packet dts's are set but the pts's are not. If I directly copy these to the output packets, the encoder gives me a warning that "a packet timestamp is not set, and this will stop working in the future", because pts is unset. So what's the most robust way to generate output pts/dts? I've discovered that something like if (pts == AV_NOPTS_VALUE) pts = dts works for this stream, but is that really the best approach? PS Thanks for the flushing tip. PPS To confirm, I'd set d.codecx->time_base to any sane value prior to avcodec_open2, just in case? – Jackhammer
One thing I forgot to mention is that you need to do frame->pts = av_frame_get_best_effort_timestamp(frame); after decoding the frame. Don't set the output packet's timestamps; the encoder is supposed to do that for you based on its input frames' pts values. Yes, set the decoder time base before avcodec_open2. – Sharpset
This is really an excellent answer, most of which goes over my head. Thanks for posting it; please carry on posting more of the same! I've added +100 for encouragement, if points are your thing. – Inexperienced

So, thanks 100% to Andrey Turkin's amazingly clear and helpful answer, I've got this working properly. I'd like to share the exact things I did:

During initialization, with the understanding that any of these initial time bases may be changed by libav at some point:

  • Initialize the decoder codec context time base to something reasonable immediately after allocating the codec context. I went for sub-millisecond resolution:

    d.codecx->time_base = { 1, 10000 };
    
  • Initialize encoder stream time base immediately after creating the new stream (note: in the QtRLE case, if I leave this {0,0}, it'll be set by the encoder to {0,90000} after writing the header, but I don't know if other situations will be as cooperative, so I initialize it here). At this point it's safe to just copy from the input stream, although I noticed I can also initialize it arbitrarily (e.g. {1,10000}) and it will still work later:

    e.stream->time_base = d.stream->time_base;
    
  • Initialize the encoder codec context time base immediately after allocating it. Same deal as the stream time base as far as copying from the decoder goes:

    e.codecx->time_base = d.codecx->time_base;
    

One of the things I was missing is that I can set these time bases, and libav will obey. There are no constraints; it's up to me, and no matter what I set, the decoded timestamps will be in the time base that I choose. I didn't realize this.

Then while decoding:

  • All I have to do is fill in the decoded frame's pts manually. The pkt_* fields are ignorable:

    d.rawframe->pts = av_frame_get_best_effort_timestamp(d.rawframe);
    
  • And since I'm converting formats I also copy it to the converted frame:

    d.rgbframe->pts = d.rawframe->pts;
    

Then, encoding:

  • Only the frame's pts needs to be set; libav will deal with the packet. So, just prior to encoding the frame:

    outframe->pts = inframe->pts;
    
  • However, I still have to manually convert packet timestamps, which seems strange, but all of this is pretty strange so I guess it's par for the course. The frame timestamp is still in the decoder stream time base, so after encoding the frame but just before writing the packet:

    av_packet_rescale_ts(&packet, d.stream->time_base, e.stream->time_base);
    

And it works like a charm, mostly: I noticed VLC reports the input as 29.97 FPS but the output as 30.03 FPS, which I can't quite figure out. But everything seems to play fine in all the media players I've tested with.

Jackhammer answered 27/10, 2016 at 2:42 Comment(0)
