You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
454 lines
12 KiB
C++
454 lines
12 KiB
C++
/*
|
|
* H.264/HEVC hardware encoding using nvidia nvenc
|
|
* Copyright (c) 2016 Timo Rothenpieler <timo@rothenpieler.org>
|
|
*
|
|
* This file is part of FFmpeg.
|
|
*
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
#include <rfb/LogWriter.h>
|
|
|
|
#include <dlfcn.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include "nvidia.h"
|
|
|
|
using namespace rfb;
|
|
|
|
static LogWriter vlog("nvidia");
|
|
|
|
#define FFNV_LOG_FUNC(logctx, msg, ...) vlog.info((msg), __VA_ARGS__)
|
|
#define FFNV_DEBUG_LOG_FUNC(logctx, msg, ...)
|
|
|
|
#include "dynlink_loader.h"
|
|
|
|
#define NUM_SURF 4
|
|
|
|
typedef struct NvencSurface
|
|
{
|
|
NV_ENC_INPUT_PTR input_surface;
|
|
int reg_idx;
|
|
int width;
|
|
int height;
|
|
int pitch;
|
|
|
|
NV_ENC_OUTPUT_PTR output_surface;
|
|
NV_ENC_BUFFER_FORMAT format;
|
|
} NvencSurface;
|
|
|
|
typedef struct NvencDynLoadFunctions
|
|
{
|
|
CudaFunctions *cuda_dl;
|
|
NvencFunctions *nvenc_dl;
|
|
|
|
void *nvenc_ctx;
|
|
NV_ENCODE_API_FUNCTION_LIST nvenc_funcs;
|
|
|
|
NV_ENC_INITIALIZE_PARAMS init_enc_parms;
|
|
NV_ENC_CONFIG enc_cfg;
|
|
CUdevice cu_dev;
|
|
CUcontext cu_ctx;
|
|
|
|
NvencSurface surf[NUM_SURF];
|
|
uint8_t cursurf;
|
|
} NvencDynLoadFunctions;
|
|
|
|
static NvencDynLoadFunctions nvenc;
|
|
|
|
/*
|
|
Recommended settings for streaming
|
|
Low-Latency High Quality preset
|
|
Rate control mode = Two-pass CBR
|
|
Very low VBV buffer size (Single frame)
|
|
No B Frames
|
|
Infinite GOP length
|
|
Adaptive Quantization enabled
|
|
*/
|
|
|
|
static int loadfuncs() {
|
|
int ret;
|
|
NVENCSTATUS err;
|
|
uint32_t nvenc_max_ver;
|
|
|
|
ret = cuda_load_functions(&nvenc.cuda_dl);
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
ret = nvenc_load_functions(&nvenc.nvenc_dl);
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
err = nvenc.nvenc_dl->NvEncodeAPIGetMaxSupportedVersion(&nvenc_max_ver);
|
|
if (err != NV_ENC_SUCCESS)
|
|
return -1;
|
|
|
|
vlog.info("Loaded nvenc version %u.%u", nvenc_max_ver >> 4, nvenc_max_ver & 0xf);
|
|
|
|
if ((NVENCAPI_MAJOR_VERSION << 4 | NVENCAPI_MINOR_VERSION) > nvenc_max_ver) {
|
|
vlog.error("Your Nvidia driver is too old. Nvenc %u.%u required",
|
|
NVENCAPI_MAJOR_VERSION, NVENCAPI_MINOR_VERSION);
|
|
return -1;
|
|
}
|
|
|
|
nvenc.nvenc_funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER;
|
|
|
|
err = nvenc.nvenc_dl->NvEncodeAPICreateInstance(&nvenc.nvenc_funcs);
|
|
if (err != NV_ENC_SUCCESS)
|
|
return -1;
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Query a single encoder capability for the currently selected codec
 * (nvenc.init_enc_parms.encodeGUID) on the open encode session.
 *
 * Returns the capability value, or 0 when the query fails.
 */
static int nvenc_check_cap(NV_ENC_CAPS cap) {
    NV_ENC_CAPS_PARAM params;
    memset(&params, 0, sizeof(NV_ENC_CAPS_PARAM));

    params.version = NV_ENC_CAPS_PARAM_VER;
    params.capsToQuery = cap;

    int val = 0;
    const int ret = nvenc.nvenc_funcs.nvEncGetEncodeCaps(nvenc.nvenc_ctx,
                                                         nvenc.init_enc_parms.encodeGUID,
                                                         &params, &val);

    return ret == NV_ENC_SUCCESS ? val : 0;
}
|
|
|
|
static int setupdevice() {
|
|
int ret;
|
|
|
|
nvenc.init_enc_parms.encodeGUID = NV_ENC_CODEC_H264_GUID;
|
|
nvenc.init_enc_parms.presetGUID = NV_ENC_PRESET_P7_GUID;
|
|
|
|
ret = nvenc.cuda_dl->cuInit(0);
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
ret = nvenc.cuda_dl->cuDeviceGet(&nvenc.cu_dev, 0);
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
ret = nvenc.cuda_dl->cuCtxCreate(&nvenc.cu_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
|
|
nvenc.cu_dev);
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
CUcontext dummy;
|
|
nvenc.cuda_dl->cuCtxPopCurrent(&dummy);
|
|
|
|
// cuda stream is NULL to use the default
|
|
|
|
// open session
|
|
NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS params;
|
|
memset(¶ms, 0, sizeof(NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS));
|
|
NVENCSTATUS err;
|
|
|
|
params.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
|
|
params.apiVersion = NVENCAPI_VERSION;
|
|
params.device = nvenc.cu_ctx;
|
|
params.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
|
|
|
|
err = nvenc.nvenc_funcs.nvEncOpenEncodeSessionEx(¶ms, &nvenc.nvenc_ctx);
|
|
if (err != NV_ENC_SUCCESS)
|
|
return -1;
|
|
|
|
// check caps
|
|
const int maxw = nvenc_check_cap(NV_ENC_CAPS_WIDTH_MAX);
|
|
const int maxh = nvenc_check_cap(NV_ENC_CAPS_HEIGHT_MAX);
|
|
const int minw = nvenc_check_cap(NV_ENC_CAPS_WIDTH_MIN);
|
|
const int minh = nvenc_check_cap(NV_ENC_CAPS_HEIGHT_MIN);
|
|
|
|
vlog.info("Max enc resolution %ux%u, min %ux%u", maxw, maxh, minw, minh);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int setupenc(const unsigned w, const unsigned h, const unsigned kbps,
|
|
const unsigned fps) {
|
|
NVENCSTATUS err;
|
|
|
|
nvenc.enc_cfg.version = NV_ENC_CONFIG_VER;
|
|
nvenc.init_enc_parms.version = NV_ENC_INITIALIZE_PARAMS_VER;
|
|
nvenc.init_enc_parms.darWidth =
|
|
nvenc.init_enc_parms.encodeWidth = w;
|
|
nvenc.init_enc_parms.darHeight =
|
|
nvenc.init_enc_parms.encodeHeight = h;
|
|
|
|
nvenc.init_enc_parms.frameRateNum = fps;
|
|
nvenc.init_enc_parms.frameRateDen = 1;
|
|
|
|
nvenc.init_enc_parms.encodeConfig = &nvenc.enc_cfg;
|
|
nvenc.init_enc_parms.tuningInfo = NV_ENC_TUNING_INFO_LOW_LATENCY;
|
|
|
|
NV_ENC_PRESET_CONFIG preset_cfg;
|
|
memset(&preset_cfg, 0, sizeof(NV_ENC_PRESET_CONFIG));
|
|
|
|
preset_cfg.version = NV_ENC_PRESET_CONFIG_VER;
|
|
preset_cfg.presetCfg.version = NV_ENC_CONFIG_VER;
|
|
|
|
err = nvenc.nvenc_funcs.nvEncGetEncodePresetConfigEx(nvenc.nvenc_ctx,
|
|
nvenc.init_enc_parms.encodeGUID,
|
|
nvenc.init_enc_parms.presetGUID,
|
|
nvenc.init_enc_parms.tuningInfo,
|
|
&preset_cfg);
|
|
if (err != NV_ENC_SUCCESS)
|
|
return -1;
|
|
|
|
memcpy(&nvenc.enc_cfg, &preset_cfg.presetCfg, sizeof(nvenc.enc_cfg));
|
|
|
|
nvenc.enc_cfg.version = NV_ENC_CONFIG_VER;
|
|
|
|
nvenc.init_enc_parms.enableEncodeAsync = 0;
|
|
nvenc.init_enc_parms.enablePTD = 1;
|
|
|
|
nvenc.enc_cfg.frameIntervalP = 0;
|
|
nvenc.enc_cfg.gopLength = 1;
|
|
|
|
// use 4 surfaces
|
|
|
|
// setup rate control
|
|
nvenc.enc_cfg.rcParams.multiPass = NV_ENC_TWO_PASS_FULL_RESOLUTION;
|
|
nvenc.enc_cfg.rcParams.averageBitRate = kbps * 1024;
|
|
nvenc.enc_cfg.rcParams.rateControlMode = NV_ENC_PARAMS_RC_CBR;
|
|
nvenc.enc_cfg.rcParams.lowDelayKeyFrameScale = 1;
|
|
|
|
nvenc.enc_cfg.rcParams.enableAQ = 1;
|
|
nvenc.enc_cfg.rcParams.aqStrength = 4; // 1 - 15, 0 would be auto
|
|
|
|
nvenc.enc_cfg.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
|
|
|
|
// setup_codec_config
|
|
nvenc.enc_cfg.encodeCodecConfig.h264Config.h264VUIParameters.videoFullRangeFlag = 1;
|
|
nvenc.enc_cfg.encodeCodecConfig.h264Config.outputBufferingPeriodSEI = 1;
|
|
nvenc.enc_cfg.encodeCodecConfig.h264Config.adaptiveTransformMode = NV_ENC_H264_ADAPTIVE_TRANSFORM_ENABLE;
|
|
nvenc.enc_cfg.encodeCodecConfig.h264Config.fmoMode = NV_ENC_H264_FMO_DISABLE;
|
|
nvenc.enc_cfg.profileGUID = NV_ENC_H264_PROFILE_MAIN_GUID;
|
|
|
|
nvenc.cuda_dl->cuCtxPushCurrent(nvenc.cu_ctx);
|
|
|
|
err = nvenc.nvenc_funcs.nvEncInitializeEncoder(nvenc.nvenc_ctx,
|
|
&nvenc.init_enc_parms);
|
|
if (err != NV_ENC_SUCCESS)
|
|
return -1;
|
|
|
|
// custream?
|
|
|
|
CUcontext dummy;
|
|
nvenc.cuda_dl->cuCtxPopCurrent(&dummy);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int setupsurf(const unsigned w, const unsigned h) {
|
|
|
|
nvenc.cuda_dl->cuCtxPushCurrent(nvenc.cu_ctx);
|
|
|
|
int i;
|
|
for (i = 0; i < NUM_SURF; i++) {
|
|
NVENCSTATUS err;
|
|
NV_ENC_CREATE_BITSTREAM_BUFFER allocOut;
|
|
memset(&allocOut, 0, sizeof(NV_ENC_CREATE_BITSTREAM_BUFFER));
|
|
allocOut.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER;
|
|
|
|
NV_ENC_CREATE_INPUT_BUFFER allocSurf;
|
|
memset(&allocSurf, 0, sizeof(NV_ENC_CREATE_INPUT_BUFFER));
|
|
|
|
nvenc.surf[i].format = NV_ENC_BUFFER_FORMAT_ARGB; // doesn't have RGBA!
|
|
allocSurf.version = NV_ENC_CREATE_INPUT_BUFFER_VER;
|
|
allocSurf.width = w;
|
|
allocSurf.height = h;
|
|
allocSurf.bufferFmt = nvenc.surf[i].format;
|
|
|
|
err = nvenc.nvenc_funcs.nvEncCreateInputBuffer(nvenc.nvenc_ctx, &allocSurf);
|
|
if (err != NV_ENC_SUCCESS)
|
|
return -1;
|
|
|
|
nvenc.surf[i].input_surface = allocSurf.inputBuffer;
|
|
nvenc.surf[i].width = allocSurf.width;
|
|
nvenc.surf[i].height = allocSurf.height;
|
|
|
|
// output
|
|
err = nvenc.nvenc_funcs.nvEncCreateBitstreamBuffer(nvenc.nvenc_ctx, &allocOut);
|
|
if (err != NV_ENC_SUCCESS)
|
|
return -1;
|
|
|
|
nvenc.surf[i].output_surface = allocOut.bitstreamBuffer;
|
|
}
|
|
|
|
CUcontext dummy;
|
|
nvenc.cuda_dl->cuCtxPopCurrent(&dummy);
|
|
|
|
return 0;
|
|
}
|
|
|
|
int nvenc_frame(const uint8_t *data, unsigned pts, uint8_t *out, uint32_t &outlen) {
|
|
NVENCSTATUS err;
|
|
|
|
NV_ENC_PIC_PARAMS params;
|
|
memset(¶ms, 0, sizeof(NV_ENC_PIC_PARAMS));
|
|
params.version = NV_ENC_PIC_PARAMS_VER;
|
|
params.encodePicFlags = NV_ENC_PIC_FLAG_FORCEINTRA | NV_ENC_PIC_FLAG_OUTPUT_SPSPPS;
|
|
|
|
nvenc.cuda_dl->cuCtxPushCurrent(nvenc.cu_ctx);
|
|
|
|
NV_ENC_LOCK_INPUT_BUFFER lockBufferParams;
|
|
memset(&lockBufferParams, 0, sizeof(NV_ENC_LOCK_INPUT_BUFFER));
|
|
lockBufferParams.version = NV_ENC_LOCK_INPUT_BUFFER_VER;
|
|
lockBufferParams.inputBuffer = nvenc.surf[nvenc.cursurf].input_surface;
|
|
|
|
err = nvenc.nvenc_funcs.nvEncLockInputBuffer(nvenc.nvenc_ctx, &lockBufferParams);
|
|
if (err != NV_ENC_SUCCESS)
|
|
return -1;
|
|
|
|
nvenc.surf[nvenc.cursurf].pitch = lockBufferParams.pitch;
|
|
//vlog.info("pitch %u", lockBufferParams.pitch);
|
|
|
|
// copy frame
|
|
unsigned y;
|
|
uint8_t *dst = (uint8_t *) lockBufferParams.bufferDataPtr;
|
|
const unsigned linelen = nvenc.surf[nvenc.cursurf].width * 4;
|
|
for (y = 0; y < (unsigned) nvenc.surf[nvenc.cursurf].height; y++) {
|
|
memcpy(dst, data, linelen);
|
|
data += linelen;
|
|
dst += lockBufferParams.pitch;
|
|
}
|
|
|
|
err = nvenc.nvenc_funcs.nvEncUnlockInputBuffer(nvenc.nvenc_ctx,
|
|
nvenc.surf[nvenc.cursurf].input_surface);
|
|
if (err != NV_ENC_SUCCESS)
|
|
return -1;
|
|
|
|
CUcontext dummy;
|
|
nvenc.cuda_dl->cuCtxPopCurrent(&dummy);
|
|
|
|
params.inputBuffer = nvenc.surf[nvenc.cursurf].input_surface;
|
|
params.bufferFmt = nvenc.surf[nvenc.cursurf].format;
|
|
params.inputWidth = nvenc.surf[nvenc.cursurf].width;
|
|
params.inputHeight = nvenc.surf[nvenc.cursurf].height;
|
|
params.inputPitch = nvenc.surf[nvenc.cursurf].pitch;
|
|
params.outputBitstream = nvenc.surf[nvenc.cursurf].output_surface;
|
|
params.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
|
|
params.inputTimeStamp = pts;
|
|
|
|
nvenc.cuda_dl->cuCtxPushCurrent(nvenc.cu_ctx);
|
|
|
|
err = nvenc.nvenc_funcs.nvEncEncodePicture(nvenc.nvenc_ctx, ¶ms);
|
|
|
|
nvenc.cuda_dl->cuCtxPopCurrent(&dummy);
|
|
|
|
if (err != NV_ENC_SUCCESS)
|
|
return -1;
|
|
|
|
|
|
nvenc.cuda_dl->cuCtxPushCurrent(nvenc.cu_ctx);
|
|
|
|
// Get output
|
|
NV_ENC_LOCK_BITSTREAM lock_params;
|
|
memset(&lock_params, 0, sizeof(NV_ENC_LOCK_BITSTREAM));
|
|
|
|
lock_params.version = NV_ENC_LOCK_BITSTREAM_VER;
|
|
lock_params.doNotWait = 0;
|
|
lock_params.outputBitstream = nvenc.surf[nvenc.cursurf].output_surface;
|
|
// lock_params.sliceOffsets = slice_offsets; TODO?
|
|
|
|
err = nvenc.nvenc_funcs.nvEncLockBitstream(nvenc.nvenc_ctx, &lock_params);
|
|
if (err != NV_ENC_SUCCESS)
|
|
return -1;
|
|
|
|
memcpy(out, lock_params.bitstreamBufferPtr, lock_params.bitstreamSizeInBytes);
|
|
outlen = lock_params.bitstreamSizeInBytes;
|
|
|
|
err = nvenc.nvenc_funcs.nvEncUnlockBitstream(nvenc.nvenc_ctx,
|
|
nvenc.surf[nvenc.cursurf].output_surface);
|
|
if (err != NV_ENC_SUCCESS)
|
|
return -1;
|
|
|
|
nvenc.cuda_dl->cuCtxPopCurrent(&dummy);
|
|
|
|
//vlog.info("Pic type %x, idr %x i %x", lock_params.pictureType, NV_ENC_PIC_TYPE_IDR,
|
|
// NV_ENC_PIC_TYPE_I);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Tear everything down in reverse order of creation: flush the encoder with
 * an end-of-stream picture, destroy the surface pool and the encode session,
 * destroy the CUDA context and release the dynamically loaded function
 * tables.
 *
 * Return codes of the individual calls are not checked; teardown is
 * best-effort.
 */
static void unload() {
    NV_ENC_PIC_PARAMS params;
    memset(&params, 0, sizeof(NV_ENC_PIC_PARAMS));
    params.version = NV_ENC_PIC_PARAMS_VER;
    params.encodePicFlags = NV_ENC_PIC_FLAG_EOS;

    nvenc.cuda_dl->cuCtxPushCurrent(nvenc.cu_ctx);

    // Signal end of stream before the buffers go away.
    nvenc.nvenc_funcs.nvEncEncodePicture(nvenc.nvenc_ctx, &params);

    int i;
    for (i = 0; i < NUM_SURF; i++) {
        nvenc.nvenc_funcs.nvEncDestroyInputBuffer(nvenc.nvenc_ctx,
                                                  nvenc.surf[i].input_surface);
        nvenc.nvenc_funcs.nvEncDestroyBitstreamBuffer(nvenc.nvenc_ctx,
                                                      nvenc.surf[i].output_surface);
    }

    nvenc.nvenc_funcs.nvEncDestroyEncoder(nvenc.nvenc_ctx);

    CUcontext dummy;
    nvenc.cuda_dl->cuCtxPopCurrent(&dummy);

    nvenc.cuda_dl->cuCtxDestroy(nvenc.cu_ctx);

    nvenc_free_functions(&nvenc.nvenc_dl);
    cuda_free_functions(&nvenc.cuda_dl);
}
|
|
/*
|
|
int main() {
|
|
|
|
unsigned w = 256, h = 256, kbps = 400, fps = 15;
|
|
|
|
memset(&nvenc, 0, sizeof(NvencDynLoadFunctions));
|
|
if (loadfuncs() < 0)
|
|
return 1;
|
|
if (setupdevice() < 0)
|
|
return 1;
|
|
if (setupenc(w, h, kbps, fps) < 0)
|
|
return 1;
|
|
if (setupsurf(w, h) < 0)
|
|
return 1;
|
|
|
|
unload();
|
|
|
|
return 0;
|
|
}
|
|
*/
|
|
|
|
int nvidia_init(const unsigned w, const unsigned h, const unsigned kbps,
|
|
const unsigned fps) {
|
|
|
|
memset(&nvenc, 0, sizeof(NvencDynLoadFunctions));
|
|
if (loadfuncs() < 0)
|
|
return 1;
|
|
if (setupdevice() < 0)
|
|
return 1;
|
|
if (setupenc(w, h, kbps, fps) < 0)
|
|
return 1;
|
|
if (setupsurf(w, h) < 0)
|
|
return 1;
|
|
|
|
return 0;
|
|
}
|