/* * H.264/HEVC hardware encoding using nvidia nvenc * Copyright (c) 2016 Timo Rothenpieler * * This file is part of FFmpeg. * * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include "nvidia.h" using namespace rfb; static LogWriter vlog("nvidia"); #define FFNV_LOG_FUNC(logctx, msg, ...) vlog.info((msg), __VA_ARGS__) #define FFNV_DEBUG_LOG_FUNC(logctx, msg, ...) 
#include "dynlink_loader.h"

/* Number of input/output buffer pairs allocated for the encoder session. */
#define NUM_SURF 4

/* One encoder input buffer paired with the bitstream buffer that receives
 * the encoded result for that input. */
typedef struct NvencSurface {
    NV_ENC_INPUT_PTR input_surface;
    int reg_idx;
    int width;
    int height;
    int pitch; /* row pitch reported by the most recent input-buffer lock */

    NV_ENC_OUTPUT_PTR output_surface;
    NV_ENC_BUFFER_FORMAT format;
} NvencSurface;

/* All state of the single, file-global encoder session. */
typedef struct NvencDynLoadFunctions {
    CudaFunctions *cuda_dl;
    NvencFunctions *nvenc_dl;

    void *nvenc_ctx;
    NV_ENCODE_API_FUNCTION_LIST nvenc_funcs;

    NV_ENC_INITIALIZE_PARAMS init_enc_parms;
    NV_ENC_CONFIG enc_cfg;
    CUdevice cu_dev;
    CUcontext cu_ctx;

    NvencSurface surf[NUM_SURF];
    uint8_t cursurf;

    /* Non-zero once nvEncInitializeEncoder has succeeded; lets teardown and
     * nvenc_frame() tell an opened session from a usable one. */
    uint8_t enc_ready;
} NvencDynLoadFunctions;

static NvencDynLoadFunctions nvenc;

/*
 * Scoped CUDA context push/pop.
 *
 * Pushes nvenc.cu_ctx when constructed and pops it again when destroyed, so
 * every return path of the enclosing scope leaves the calling thread's
 * context stack balanced. `pushed` is false when cuCtxPushCurrent failed, in
 * which case nothing is popped.
 *
 * Requires nvenc.cuda_dl to be non-NULL.
 */
struct NvencCtxScope {
    bool pushed;

    NvencCtxScope()
        : pushed(nvenc.cuda_dl->cuCtxPushCurrent(nvenc.cu_ctx) == CUDA_SUCCESS)
    {
    }

    ~NvencCtxScope()
    {
        if (!pushed)
            return;

        CUcontext popped;
        nvenc.cuda_dl->cuCtxPopCurrent(&popped);
    }

private:
    /* Non-copyable: a copy would pop the context twice. */
    NvencCtxScope(const NvencCtxScope &);
    NvencCtxScope &operator=(const NvencCtxScope &);
};

/*
Recommended settings for streaming

Low-Latency High Quality preset
Rate control mode = Two-pass CBR
Very low VBV buffer size (Single frame)
No B Frames
Infinite GOP length
Adaptive Quantization enabled
*/

/*
 * Load the CUDA and NVENC entry points and fill nvenc.nvenc_funcs.
 *
 * Returns 0 on success and a negative value when a library cannot be loaded,
 * the driver's maximum NVENC version is older than the headers this file was
 * built against, or the API instance cannot be created.
 */
static int loadfuncs()
{
    int ret;
    NVENCSTATUS err;
    uint32_t nvenc_max_ver;

    ret = cuda_load_functions(&nvenc.cuda_dl);
    if (ret < 0)
        return ret;

    ret = nvenc_load_functions(&nvenc.nvenc_dl);
    if (ret < 0)
        return ret;

    err = nvenc.nvenc_dl->NvEncodeAPIGetMaxSupportedVersion(&nvenc_max_ver);
    if (err != NV_ENC_SUCCESS)
        return -1;

    /* The version is packed as (major << 4) | minor. */
    vlog.info("Loaded nvenc version %u.%u", nvenc_max_ver >> 4, nvenc_max_ver & 0xf);

    if ((NVENCAPI_MAJOR_VERSION << 4 | NVENCAPI_MINOR_VERSION) > nvenc_max_ver) {
        vlog.error("Your Nvidia driver is too old. Nvenc %u.%u required",
                   NVENCAPI_MAJOR_VERSION, NVENCAPI_MINOR_VERSION);
        return -1;
    }

    nvenc.nvenc_funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER;

    err = nvenc.nvenc_dl->NvEncodeAPICreateInstance(&nvenc.nvenc_funcs);
    if (err != NV_ENC_SUCCESS)
        return -1;

    return 0;
}

/*
 * Query one encoder capability for the configured codec GUID.
 *
 * Returns the capability value, or 0 when the query fails.
 */
static int nvenc_check_cap(NV_ENC_CAPS cap)
{
    NV_ENC_CAPS_PARAM params;
    memset(&params, 0, sizeof(NV_ENC_CAPS_PARAM));

    params.version = NV_ENC_CAPS_PARAM_VER;
    params.capsToQuery = cap;

    int val = 0;
    const NVENCSTATUS err =
        nvenc.nvenc_funcs.nvEncGetEncodeCaps(nvenc.nvenc_ctx,
                                             nvenc.init_enc_parms.encodeGUID,
                                             &params, &val);

    return err == NV_ENC_SUCCESS ? val : 0;
}

/*
 * Initialise CUDA on device 0, create the CUDA context and open the NVENC
 * session on it.
 *
 * Returns 0 on success, -1 on failure. Resources acquired before a failure
 * stay recorded in `nvenc` so that nvidia_unload() can release them.
 */
static int setupdevice()
{
    nvenc.init_enc_parms.encodeGUID = NV_ENC_CODEC_H264_GUID;
    nvenc.init_enc_parms.presetGUID = NV_ENC_PRESET_P7_GUID;

    /* CUresult error codes are positive, so a "< 0" test never fires;
     * compare against CUDA_SUCCESS instead. */
    if (nvenc.cuda_dl->cuInit(0) != CUDA_SUCCESS)
        return -1;

    if (nvenc.cuda_dl->cuDeviceGet(&nvenc.cu_dev, 0) != CUDA_SUCCESS)
        return -1;

    if (nvenc.cuda_dl->cuCtxCreate(&nvenc.cu_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
                                   nvenc.cu_dev) != CUDA_SUCCESS) {
        nvenc.cu_ctx = NULL;
        return -1;
    }

    /* Take the freshly created context off this thread again; it is pushed
     * only around the calls that need it. */
    CUcontext dummy;
    nvenc.cuda_dl->cuCtxPopCurrent(&dummy);

    // cuda stream is NULL to use the default

    // open session
    NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS params;
    memset(&params, 0, sizeof(NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS));

    params.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
    params.apiVersion = NVENCAPI_VERSION;
    params.device = nvenc.cu_ctx;
    params.deviceType = NV_ENC_DEVICE_TYPE_CUDA;

    const NVENCSTATUS err =
        nvenc.nvenc_funcs.nvEncOpenEncodeSessionEx(&params, &nvenc.nvenc_ctx);
    if (err != NV_ENC_SUCCESS)
        return -1;

    // check caps
    const int maxw = nvenc_check_cap(NV_ENC_CAPS_WIDTH_MAX);
    const int maxh = nvenc_check_cap(NV_ENC_CAPS_HEIGHT_MAX);
    const int minw = nvenc_check_cap(NV_ENC_CAPS_WIDTH_MIN);
    const int minh = nvenc_check_cap(NV_ENC_CAPS_HEIGHT_MIN);
    vlog.info("Max enc resolution %ux%u, min %ux%u", maxw, maxh, minw, minh);

    return 0;
}

/*
 * Build the encoder configuration from the preset and initialise the encoder.
 *
 * w, h  - encode width and height (also used as the display aspect ratio)
 * kbps  - target average bitrate; stored as kbps * 1024
 * fps   - frame rate numerator, with a denominator of 1
 *
 * Returns 0 on success, -1 on failure.
 */
static int setupenc(const unsigned w, const unsigned h, const unsigned kbps,
                    const unsigned fps)
{
    NVENCSTATUS err;

    nvenc.enc_cfg.version = NV_ENC_CONFIG_VER;
    nvenc.init_enc_parms.version = NV_ENC_INITIALIZE_PARAMS_VER;
    nvenc.init_enc_parms.darWidth =
        nvenc.init_enc_parms.encodeWidth = w;
    nvenc.init_enc_parms.darHeight =
        nvenc.init_enc_parms.encodeHeight = h;

    nvenc.init_enc_parms.frameRateNum = fps;
    nvenc.init_enc_parms.frameRateDen = 1;

    nvenc.init_enc_parms.encodeConfig = &nvenc.enc_cfg;
    nvenc.init_enc_parms.tuningInfo = NV_ENC_TUNING_INFO_LOW_LATENCY;

    /* Start from the driver's preset configuration, then override below. */
    NV_ENC_PRESET_CONFIG preset_cfg;
    memset(&preset_cfg, 0, sizeof(NV_ENC_PRESET_CONFIG));

    preset_cfg.version = NV_ENC_PRESET_CONFIG_VER;
    preset_cfg.presetCfg.version = NV_ENC_CONFIG_VER;

    err = nvenc.nvenc_funcs.nvEncGetEncodePresetConfigEx(nvenc.nvenc_ctx,
                                                         nvenc.init_enc_parms.encodeGUID,
                                                         nvenc.init_enc_parms.presetGUID,
                                                         nvenc.init_enc_parms.tuningInfo,
                                                         &preset_cfg);
    if (err != NV_ENC_SUCCESS)
        return -1;

    memcpy(&nvenc.enc_cfg, &preset_cfg.presetCfg, sizeof(nvenc.enc_cfg));
    nvenc.enc_cfg.version = NV_ENC_CONFIG_VER;

    nvenc.init_enc_parms.enableEncodeAsync = 0;
    nvenc.init_enc_parms.enablePTD = 1;

    nvenc.enc_cfg.frameIntervalP = 0;
    nvenc.enc_cfg.gopLength = 1;

    // use 4 surfaces

    // setup rate control
    nvenc.enc_cfg.rcParams.multiPass = NV_ENC_TWO_PASS_FULL_RESOLUTION;
    nvenc.enc_cfg.rcParams.averageBitRate = kbps * 1024;
    nvenc.enc_cfg.rcParams.rateControlMode = NV_ENC_PARAMS_RC_CBR;
    nvenc.enc_cfg.rcParams.lowDelayKeyFrameScale = 1;
    nvenc.enc_cfg.rcParams.enableAQ = 1;
    nvenc.enc_cfg.rcParams.aqStrength = 4; // 1 - 15, 0 would be auto

    nvenc.enc_cfg.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;

    // setup_codec_config
    nvenc.enc_cfg.encodeCodecConfig.h264Config.h264VUIParameters.videoFullRangeFlag = 1;
    nvenc.enc_cfg.encodeCodecConfig.h264Config.outputBufferingPeriodSEI = 1;
    nvenc.enc_cfg.encodeCodecConfig.h264Config.adaptiveTransformMode =
        NV_ENC_H264_ADAPTIVE_TRANSFORM_ENABLE;
    nvenc.enc_cfg.encodeCodecConfig.h264Config.fmoMode = NV_ENC_H264_FMO_DISABLE;

    nvenc.enc_cfg.profileGUID = NV_ENC_H264_PROFILE_MAIN_GUID;

    /* The guard pops the context on the failure path as well. */
    NvencCtxScope ctx;
    if (!ctx.pushed)
        return -1;

    err = nvenc.nvenc_funcs.nvEncInitializeEncoder(nvenc.nvenc_ctx,
                                                   &nvenc.init_enc_parms);
    if (err != NV_ENC_SUCCESS)
        return -1;

    // custream?

    nvenc.enc_ready = 1;
    return 0;
}

/*
 * Allocate NUM_SURF ARGB input buffers of w x h plus one bitstream buffer
 * each.
 *
 * Returns 0 on success, -1 on failure. Buffers created before a failure stay
 * recorded in nvenc.surf so that nvidia_unload() can destroy them.
 */
static int setupsurf(const unsigned w, const unsigned h)
{
    NvencCtxScope ctx;
    if (!ctx.pushed)
        return -1;

    for (int i = 0; i < NUM_SURF; i++) {
        NvencSurface &surf = nvenc.surf[i];
        NVENCSTATUS err;

        NV_ENC_CREATE_BITSTREAM_BUFFER allocOut;
        memset(&allocOut, 0, sizeof(NV_ENC_CREATE_BITSTREAM_BUFFER));
        allocOut.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER;

        NV_ENC_CREATE_INPUT_BUFFER allocSurf;
        memset(&allocSurf, 0, sizeof(NV_ENC_CREATE_INPUT_BUFFER));

        surf.format = NV_ENC_BUFFER_FORMAT_ARGB; // doesn't have RGBA!

        allocSurf.version = NV_ENC_CREATE_INPUT_BUFFER_VER;
        allocSurf.width = w;
        allocSurf.height = h;
        allocSurf.bufferFmt = surf.format;

        err = nvenc.nvenc_funcs.nvEncCreateInputBuffer(nvenc.nvenc_ctx, &allocSurf);
        if (err != NV_ENC_SUCCESS)
            return -1;

        surf.input_surface = allocSurf.inputBuffer;
        surf.width = allocSurf.width;
        surf.height = allocSurf.height;

        // output
        err = nvenc.nvenc_funcs.nvEncCreateBitstreamBuffer(nvenc.nvenc_ctx, &allocOut);
        if (err != NV_ENC_SUCCESS)
            return -1;

        surf.output_surface = allocOut.bitstreamBuffer;
    }

    return 0;
}

/*
 * Encode one frame and copy the resulting bitstream to the caller.
 *
 * data   - source pixels, read as `height` consecutive rows of `width * 4`
 *          bytes with no padding between rows
 * pts    - value stored as the picture's input timestamp
 * out    - destination for the encoded bitstream. No capacity is passed in,
 *          so the caller must supply a buffer large enough for any frame.
 * outlen - set to the number of bytes written to `out`
 *
 * Returns 0 on success, -1 on failure. On every return path the CUDA context
 * pushed here has been popped again.
 *
 * Every picture is submitted with the FORCEINTRA and OUTPUT_SPSPPS flags.
 *
 * NOTE(review): nvenc.cursurf is never advanced in this file, so only
 * surface 0 of the NUM_SURF allocated pairs is ever used. Confirm whether
 * rotation was intended.
 */
int nvenc_frame(const uint8_t *data, unsigned pts, uint8_t *out, uint32_t &outlen)
{
    /* Refuse to run before a successful nvidia_init() or with null buffers. */
    if (!nvenc.cuda_dl || !nvenc.enc_ready || !data || !out)
        return -1;

    NVENCSTATUS err;
    NvencSurface &surf = nvenc.surf[nvenc.cursurf];

    NV_ENC_PIC_PARAMS params;
    memset(&params, 0, sizeof(NV_ENC_PIC_PARAMS));
    params.version = NV_ENC_PIC_PARAMS_VER;
    params.encodePicFlags = NV_ENC_PIC_FLAG_FORCEINTRA | NV_ENC_PIC_FLAG_OUTPUT_SPSPPS;

    /* One push for the whole call; the guard pops on every return below. */
    NvencCtxScope ctx;
    if (!ctx.pushed)
        return -1;

    NV_ENC_LOCK_INPUT_BUFFER lockBufferParams;
    memset(&lockBufferParams, 0, sizeof(NV_ENC_LOCK_INPUT_BUFFER));
    lockBufferParams.version = NV_ENC_LOCK_INPUT_BUFFER_VER;
    lockBufferParams.inputBuffer = surf.input_surface;

    err = nvenc.nvenc_funcs.nvEncLockInputBuffer(nvenc.nvenc_ctx, &lockBufferParams);
    if (err != NV_ENC_SUCCESS)
        return -1;

    surf.pitch = lockBufferParams.pitch;
    //vlog.info("pitch %u", lockBufferParams.pitch);

    // copy frame
    /* Source rows are tightly packed; destination rows are `pitch` apart. */
    uint8_t *dst = (uint8_t *) lockBufferParams.bufferDataPtr;
    const unsigned linelen = surf.width * 4;
    for (unsigned y = 0; y < (unsigned) surf.height; y++) {
        memcpy(dst, data, linelen);
        data += linelen;
        dst += lockBufferParams.pitch;
    }

    err = nvenc.nvenc_funcs.nvEncUnlockInputBuffer(nvenc.nvenc_ctx, surf.input_surface);
    if (err != NV_ENC_SUCCESS)
        return -1;

    params.inputBuffer = surf.input_surface;
    params.bufferFmt = surf.format;
    params.inputWidth = surf.width;
    params.inputHeight = surf.height;
    params.inputPitch = surf.pitch;
    params.outputBitstream = surf.output_surface;
    params.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
    params.inputTimeStamp = pts;

    err = nvenc.nvenc_funcs.nvEncEncodePicture(nvenc.nvenc_ctx, &params);
    if (err != NV_ENC_SUCCESS)
        return -1;

    // Get output
    NV_ENC_LOCK_BITSTREAM lock_params;
    memset(&lock_params, 0, sizeof(NV_ENC_LOCK_BITSTREAM));

    lock_params.version = NV_ENC_LOCK_BITSTREAM_VER;
    lock_params.doNotWait = 0;
    lock_params.outputBitstream = surf.output_surface;
    // lock_params.sliceOffsets = slice_offsets; TODO?

    err = nvenc.nvenc_funcs.nvEncLockBitstream(nvenc.nvenc_ctx, &lock_params);
    if (err != NV_ENC_SUCCESS)
        return -1;

    memcpy(out, lock_params.bitstreamBufferPtr, lock_params.bitstreamSizeInBytes);
    outlen = lock_params.bitstreamSizeInBytes;

    err = nvenc.nvenc_funcs.nvEncUnlockBitstream(nvenc.nvenc_ctx, surf.output_surface);
    if (err != NV_ENC_SUCCESS)
        return -1;

    //vlog.info("Pic type %x, idr %x i %x", lock_params.pictureType, NV_ENC_PIC_TYPE_IDR,
    //          NV_ENC_PIC_TYPE_I);

    return 0;
}

/*
 * Tear down the encoder session and release everything nvidia_init()
 * acquired.
 *
 * Safe to call after a partially failed initialisation, when nothing was
 * initialised, or more than once: each step runs only for resources that
 * exist, and the global state is zeroed at the end.
 */
void nvidia_unload()
{
    if (nvenc.cuda_dl && nvenc.cu_ctx) {
        {
            /* Teardown is best effort, so it continues even if the push
             * failed; the guard pops only what it pushed. */
            NvencCtxScope ctx;

            if (nvenc.nvenc_ctx) {
                if (nvenc.enc_ready) {
                    /* Send an end-of-stream picture before destroying buffers. */
                    NV_ENC_PIC_PARAMS params;
                    memset(&params, 0, sizeof(NV_ENC_PIC_PARAMS));
                    params.version = NV_ENC_PIC_PARAMS_VER;
                    params.encodePicFlags = NV_ENC_PIC_FLAG_EOS;

                    nvenc.nvenc_funcs.nvEncEncodePicture(nvenc.nvenc_ctx, &params);
                }

                for (int i = 0; i < NUM_SURF; i++) {
                    if (nvenc.surf[i].input_surface)
                        nvenc.nvenc_funcs.nvEncDestroyInputBuffer(nvenc.nvenc_ctx,
                                                                  nvenc.surf[i].input_surface);
                    if (nvenc.surf[i].output_surface)
                        nvenc.nvenc_funcs.nvEncDestroyBitstreamBuffer(nvenc.nvenc_ctx,
                                                                      nvenc.surf[i].output_surface);
                }

                nvenc.nvenc_funcs.nvEncDestroyEncoder(nvenc.nvenc_ctx);
            }
        } /* context popped here, before it is destroyed */

        nvenc.cuda_dl->cuCtxDestroy(nvenc.cu_ctx);
    }

    if (nvenc.nvenc_dl)
        nvenc_free_functions(&nvenc.nvenc_dl);
    if (nvenc.cuda_dl)
        cuda_free_functions(&nvenc.cuda_dl);

    /* Leave a clean slate so a repeated unload or a later init is safe. */
    memset(&nvenc, 0, sizeof(NvencDynLoadFunctions));
}

/*
int main() {

    unsigned w = 256, h = 256, kbps = 400, fps = 15;

    memset(&nvenc, 0, sizeof(NvencDynLoadFunctions));
    if (loadfuncs() < 0)
        return 1;
    if (setupdevice() < 0)
        return 1;
    if (setupenc(w, h, kbps, fps) < 0)
        return 1;
    if (setupsurf(w, h) < 0)
        return 1;

    unload();

    return 0;
}
*/

/*
 * Bring up the encoder: load the libraries, open the device and session,
 * configure the encoder and allocate the surfaces.
 *
 * w, h  - frame size in pixels
 * kbps  - target bitrate passed to the rate control setup
 * fps   - frame rate
 *
 * Returns 0 on success and 1 on failure. On failure everything acquired so
 * far is released again. A session left over from an earlier successful call
 * is torn down before the new one is created.
 */
int nvidia_init(const unsigned w, const unsigned h, const unsigned kbps,
                const unsigned fps)
{
    /* Releases any previous session and zeroes the global state. */
    nvidia_unload();

    if (loadfuncs() < 0 ||
        setupdevice() < 0 ||
        setupenc(w, h, kbps, fps) < 0 ||
        setupsurf(w, h) < 0) {
        nvidia_unload();
        return 1;
    }

    return 0;
}