I'm in the process of writing a hardware accelerated h264 decoder using Media Foundation's Source Reader, but have encountered a problem. I followed this tutorial and supported myself with Windows SDK Media Foundation samples.
My app seems to work fine when hardware acceleration is turned off, but it doesn't provide the performance I need. When I turn the acceleration on by passing a IMFDXGIDeviceManager
to IMFAttributes
used to create the reader, things get complicated.
If I create the ID3D11Device
using a D3D_DRIVER_TYPE_NULL
driver, the app works fine and the frames are processed faster that in the software mode, but judging by the CPU and GPU usage it still does majority of the processing on CPU.
On the other hand, when I create the ID3D11Device
using a D3D_DRIVER_TYPE_HARDWARE
driver and run the app, one of these four things can happen.
I only get an unpredictable number of frames (usually 1-3) before
IMFMediaBuffer::Lock
function returns 0x887a0005 which is described as "The GPU device instance has been suspended. UseGetDeviceRemovedReason
to determine the appropriate action". When I callID3D11Device::GetDeviceRemovedReason
, I get 0x887a0020 which is described as "The driver encountered a problem and was put into the device removed state" which isn't as helpful as I wish it to be.The app crashes in an external dll on
IMFMediaBuffer::Lock
call. It seems that the dll depends on the GPU used. For Intel integrated GPU it's igd10iumd32.dll and for Nvidia mobile GPU it's mfplat.dll. The message for this particular crash is as follows: "Exception thrown at 0x53C6DB8C (mfplat.dll) in decoder_ tester.exe: 0xC0000005: Access violation reading location 0x00000024". The addresses are different between executions and sometimes it involves reading, sometimes writing.The graphics driver stops responding, the system hangs for a short time and then the application crashes like in point 2 or finishes like in point 1.
The app works fine and processes all the frames with hardware acceleration.
Most of the time it's 1 or 2, seldom 3 or 4.
Here's what the CPU/GPU usage is like when processing without throttling in different modes on my machine (Intel Core i5-6500 with HD Graphics 530, Windows 10 Pro).
- NULL - CPU: ~90%, GPU: ~15%
- HARDWARE - CPU: ~15%, GPU: ~60%
- SOFTWARE - CPU: ~40%, GPU: ~7%
I tested the app on three machines. All of them had Intel integrated GPUs (HD 4400, HD 4600, HD 530). One of them also had switchable Nvidia dedicated GPU (GF 840M). It bahaves identically on all of them, the only difference is that it crashes in a different dll when Nvidia's GPU is used.
I have no previous experience with COM or DirectX, but all of this is inconsistent and unpredictable, so it looks like a memory corruption to me. Still, I don't know where I'm making the mistake. Could you please help me find what I'm doing wrong?
The minimal code example I could come up with with is below. I'm using Visual Studio Professional 2015 to compile it as a C++ project. I prepared definitions to enable hardware acceleration and select the hardware driver. Comment them out to change the behavior. Also, the code expects this video file to be present in the project directory.
#include <iostream>
#include <string>
#include <atlbase.h>
#include <d3d11.h>
#include <mfapi.h>
#include <mfidl.h>
#include <mfreadwrite.h>
#include <windows.h>
#pragma comment(lib, "d3d11.lib")
#pragma comment(lib, "mf.lib")
#pragma comment(lib, "mfplat.lib")
#pragma comment(lib, "mfreadwrite.lib")
#pragma comment(lib, "mfuuid.lib")
#define ENABLE_HW_ACCELERATION
#define ENABLE_HW_DRIVER
void handle_result(HRESULT hr)
{
if (SUCCEEDED(hr))
return;
WCHAR message[512];
FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr, hr,
MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), message, ARRAYSIZE(message), nullptr);
printf("%ls", message);
abort();
}
int main(int argc, char** argv)
{
handle_result(CoInitializeEx(nullptr, COINIT_APARTMENTTHREADED | COINIT_DISABLE_OLE1DDE));
handle_result(MFStartup(MF_VERSION));
{
CComPtr<IMFAttributes> attributes;
handle_result(MFCreateAttributes(&attributes, 3));
#if defined(ENABLE_HW_ACCELERATION)
CComPtr<ID3D11Device> device;
D3D_FEATURE_LEVEL levels[] = { D3D_FEATURE_LEVEL_11_1, D3D_FEATURE_LEVEL_11_0 };
#if defined(ENABLE_HW_DRIVER)
handle_result(D3D11CreateDevice(nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, D3D11_CREATE_DEVICE_SINGLETHREADED | D3D11_CREATE_DEVICE_VIDEO_SUPPORT,
levels, ARRAYSIZE(levels), D3D11_SDK_VERSION, &device, nullptr, nullptr));
#else
handle_result(D3D11CreateDevice(nullptr, D3D_DRIVER_TYPE_NULL, nullptr, D3D11_CREATE_DEVICE_SINGLETHREADED,
levels, ARRAYSIZE(levels), D3D11_SDK_VERSION, &device, nullptr, nullptr));
#endif
UINT token;
CComPtr<IMFDXGIDeviceManager> manager;
handle_result(MFCreateDXGIDeviceManager(&token, &manager));
handle_result(manager->ResetDevice(device, token));
handle_result(attributes->SetUnknown(MF_SOURCE_READER_D3D_MANAGER, manager));
handle_result(attributes->SetUINT32(MF_READWRITE_ENABLE_HARDWARE_TRANSFORMS, TRUE));
handle_result(attributes->SetUINT32(MF_SOURCE_READER_ENABLE_ADVANCED_VIDEO_PROCESSING, TRUE));
#else
handle_result(attributes->SetUINT32(MF_SOURCE_READER_ENABLE_VIDEO_PROCESSING, TRUE));
#endif
CComPtr<IMFSourceReader> reader;
handle_result(MFCreateSourceReaderFromURL(L"Rogue One - A Star Wars Story - Trailer.mp4", attributes, &reader));
CComPtr<IMFMediaType> output_type;
handle_result(MFCreateMediaType(&output_type));
handle_result(output_type->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video));
handle_result(output_type->SetGUID(MF_MT_SUBTYPE, MFVideoFormat_RGB32));
handle_result(reader->SetCurrentMediaType(MF_SOURCE_READER_FIRST_VIDEO_STREAM, nullptr, output_type));
unsigned int frame_count{};
std::cout << "Started processing frames" << std::endl;
while (true)
{
CComPtr<IMFSample> sample;
DWORD flags;
handle_result(reader->ReadSample(MF_SOURCE_READER_FIRST_VIDEO_STREAM,
0, nullptr, &flags, nullptr, &sample));
if (flags & MF_SOURCE_READERF_ENDOFSTREAM || sample == nullptr)
break;
std::cout << "Frame " << frame_count++ << std::endl;
CComPtr<IMFMediaBuffer> buffer;
BYTE* data;
handle_result(sample->ConvertToContiguousBuffer(&buffer));
handle_result(buffer->Lock(&data, nullptr, nullptr));
// Use the frame here.
buffer->Unlock();
}
std::cout << "Finished processing frames" << std::endl;
}
MFShutdown();
CoUninitialize();
return 0;
}