/*
 * Repository page header (not part of the original source):
 * KasmVNC/common/rfb/nvidia.cxx
 * 454 lines, 12 KiB, C++
 */

/*
* H.264/HEVC hardware encoding using nvidia nvenc
* Copyright (c) 2016 Timo Rothenpieler <timo@rothenpieler.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <rfb/LogWriter.h>
#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "nvidia.h"
using namespace rfb;
static LogWriter vlog("nvidia");
#define FFNV_LOG_FUNC(logctx, msg, ...) vlog.info((msg), __VA_ARGS__)
#define FFNV_DEBUG_LOG_FUNC(logctx, msg, ...)
#include "dynlink_loader.h"
#define NUM_SURF 4
/*
 * One input buffer / output bitstream buffer pair handed to the encoder.
 * Field order is kept as-is because the owning state is cleared with memset.
 */
struct NvencSurface
{
    // Input buffer handle returned by nvEncCreateInputBuffer.
    NV_ENC_INPUT_PTR input_surface;
    // Not referenced by the code visible in this file section.
    int reg_idx;
    // Dimensions the input buffer was created with.
    int width;
    int height;
    // Row pitch reported by the most recent nvEncLockInputBuffer call.
    int pitch;
    // Bitstream buffer handle returned by nvEncCreateBitstreamBuffer.
    NV_ENC_OUTPUT_PTR output_surface;
    // Pixel format the input buffer was created with.
    NV_ENC_BUFFER_FORMAT format;
};
/*
 * All encoder state used by this file: the dynamically loaded CUDA/NVENC
 * entry points, the CUDA device/context, the encoder session and its
 * configuration, and the pool of surfaces.
 */
struct NvencDynLoadFunctions
{
    // Dynamically loaded library entry points.
    CudaFunctions *cuda_dl;
    NvencFunctions *nvenc_dl;

    // Encoder session handle filled in by nvEncOpenEncodeSessionEx.
    void *nvenc_ctx;
    // Function table filled in by NvEncodeAPICreateInstance.
    NV_ENCODE_API_FUNCTION_LIST nvenc_funcs;

    // Encoder initialisation parameters; encodeConfig points at enc_cfg.
    NV_ENC_INITIALIZE_PARAMS init_enc_parms;
    NV_ENC_CONFIG enc_cfg;

    // CUDA device and the context created on it.
    CUdevice cu_dev;
    CUcontext cu_ctx;

    // Surface pool and the index of the surface used by nvenc_frame.
    NvencSurface surf[NUM_SURF];
    uint8_t cursurf;
};

// The single, file-local encoder state instance.
static NvencDynLoadFunctions nvenc;
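/*
Recommended settings for streaming:
Low-Latency High Quality preset
Rate control mode = Two-pass CBR
Very low VBV buffer size (Single frame)
No B Frames
Infinite GOP length
Adaptive Quantization enabled

Note: the code below does not follow this list exactly. setupenc() sets
gopLength to 1 and does not set a VBV buffer size, and nvenc_frame() forces
an intra picture on every frame.
*/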
static int loadfuncs() {
int ret;
NVENCSTATUS err;
uint32_t nvenc_max_ver;
ret = cuda_load_functions(&nvenc.cuda_dl);
if (ret < 0)
return ret;
ret = nvenc_load_functions(&nvenc.nvenc_dl);
if (ret < 0)
return ret;
err = nvenc.nvenc_dl->NvEncodeAPIGetMaxSupportedVersion(&nvenc_max_ver);
if (err != NV_ENC_SUCCESS)
return -1;
vlog.info("Loaded nvenc version %u.%u", nvenc_max_ver >> 4, nvenc_max_ver & 0xf);
if ((NVENCAPI_MAJOR_VERSION << 4 | NVENCAPI_MINOR_VERSION) > nvenc_max_ver) {
vlog.error("Your Nvidia driver is too old. Nvenc %u.%u required",
NVENCAPI_MAJOR_VERSION, NVENCAPI_MINOR_VERSION);
return -1;
}
nvenc.nvenc_funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER;
err = nvenc.nvenc_dl->NvEncodeAPICreateInstance(&nvenc.nvenc_funcs);
if (err != NV_ENC_SUCCESS)
return -1;
return 0;
}
/*
 * Queries a single encoder capability for the codec GUID stored in
 * nvenc.init_enc_parms.
 *
 * Returns the reported capability value, or 0 when the query fails.
 */
static int nvenc_check_cap(NV_ENC_CAPS cap) {
    int value = 0;
    NV_ENC_CAPS_PARAM query;

    memset(&query, 0, sizeof(query));
    query.version = NV_ENC_CAPS_PARAM_VER;
    query.capsToQuery = cap;

    const int status = nvenc.nvenc_funcs.nvEncGetEncodeCaps(nvenc.nvenc_ctx,
                                                           nvenc.init_enc_parms.encodeGUID,
                                                           &query, &value);
    return status == NV_ENC_SUCCESS ? value : 0;
}
static int setupdevice() {
int ret;
nvenc.init_enc_parms.encodeGUID = NV_ENC_CODEC_H264_GUID;
nvenc.init_enc_parms.presetGUID = NV_ENC_PRESET_P7_GUID;
ret = nvenc.cuda_dl->cuInit(0);
if (ret < 0)
return ret;
ret = nvenc.cuda_dl->cuDeviceGet(&nvenc.cu_dev, 0);
if (ret < 0)
return ret;
ret = nvenc.cuda_dl->cuCtxCreate(&nvenc.cu_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
nvenc.cu_dev);
if (ret < 0)
return ret;
CUcontext dummy;
nvenc.cuda_dl->cuCtxPopCurrent(&dummy);
// cuda stream is NULL to use the default
// open session
NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS params;
memset(&params, 0, sizeof(NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS));
NVENCSTATUS err;
params.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
params.apiVersion = NVENCAPI_VERSION;
params.device = nvenc.cu_ctx;
params.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
err = nvenc.nvenc_funcs.nvEncOpenEncodeSessionEx(&params, &nvenc.nvenc_ctx);
if (err != NV_ENC_SUCCESS)
return -1;
// check caps
const int maxw = nvenc_check_cap(NV_ENC_CAPS_WIDTH_MAX);
const int maxh = nvenc_check_cap(NV_ENC_CAPS_HEIGHT_MAX);
const int minw = nvenc_check_cap(NV_ENC_CAPS_WIDTH_MIN);
const int minh = nvenc_check_cap(NV_ENC_CAPS_HEIGHT_MIN);
vlog.info("Max enc resolution %ux%u, min %ux%u\n", maxw, maxh, minw, minh);
return 0;
}
static int setupenc(const unsigned w, const unsigned h, const unsigned kbps,
const unsigned fps) {
NVENCSTATUS err;
nvenc.enc_cfg.version = NV_ENC_CONFIG_VER;
nvenc.init_enc_parms.version = NV_ENC_INITIALIZE_PARAMS_VER;
nvenc.init_enc_parms.darWidth =
nvenc.init_enc_parms.encodeWidth = w;
nvenc.init_enc_parms.darHeight =
nvenc.init_enc_parms.encodeHeight = h;
nvenc.init_enc_parms.frameRateNum = fps;
nvenc.init_enc_parms.frameRateDen = 1;
nvenc.init_enc_parms.encodeConfig = &nvenc.enc_cfg;
nvenc.init_enc_parms.tuningInfo = NV_ENC_TUNING_INFO_LOW_LATENCY;
NV_ENC_PRESET_CONFIG preset_cfg;
memset(&preset_cfg, 0, sizeof(NV_ENC_PRESET_CONFIG));
preset_cfg.version = NV_ENC_PRESET_CONFIG_VER;
preset_cfg.presetCfg.version = NV_ENC_CONFIG_VER;
err = nvenc.nvenc_funcs.nvEncGetEncodePresetConfigEx(nvenc.nvenc_ctx,
nvenc.init_enc_parms.encodeGUID,
nvenc.init_enc_parms.presetGUID,
nvenc.init_enc_parms.tuningInfo,
&preset_cfg);
if (err != NV_ENC_SUCCESS)
return -1;
memcpy(&nvenc.enc_cfg, &preset_cfg.presetCfg, sizeof(nvenc.enc_cfg));
nvenc.enc_cfg.version = NV_ENC_CONFIG_VER;
nvenc.init_enc_parms.enableEncodeAsync = 0;
nvenc.init_enc_parms.enablePTD = 1;
nvenc.enc_cfg.frameIntervalP = 0;
nvenc.enc_cfg.gopLength = 1;
// use 4 surfaces
// setup rate control
nvenc.enc_cfg.rcParams.multiPass = NV_ENC_TWO_PASS_FULL_RESOLUTION;
nvenc.enc_cfg.rcParams.averageBitRate = kbps * 1024;
nvenc.enc_cfg.rcParams.rateControlMode = NV_ENC_PARAMS_RC_CBR;
nvenc.enc_cfg.rcParams.lowDelayKeyFrameScale = 1;
nvenc.enc_cfg.rcParams.enableAQ = 1;
nvenc.enc_cfg.rcParams.aqStrength = 4; // 1 - 15, 0 would be auto
nvenc.enc_cfg.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
// setup_codec_config
nvenc.enc_cfg.encodeCodecConfig.h264Config.h264VUIParameters.videoFullRangeFlag = 1;
nvenc.enc_cfg.encodeCodecConfig.h264Config.outputBufferingPeriodSEI = 1;
nvenc.enc_cfg.encodeCodecConfig.h264Config.adaptiveTransformMode = NV_ENC_H264_ADAPTIVE_TRANSFORM_ENABLE;
nvenc.enc_cfg.encodeCodecConfig.h264Config.fmoMode = NV_ENC_H264_FMO_DISABLE;
nvenc.enc_cfg.profileGUID = NV_ENC_H264_PROFILE_MAIN_GUID;
nvenc.cuda_dl->cuCtxPushCurrent(nvenc.cu_ctx);
err = nvenc.nvenc_funcs.nvEncInitializeEncoder(nvenc.nvenc_ctx,
&nvenc.init_enc_parms);
if (err != NV_ENC_SUCCESS)
return -1;
// custream?
CUcontext dummy;
nvenc.cuda_dl->cuCtxPopCurrent(&dummy);
return 0;
}
static int setupsurf(const unsigned w, const unsigned h) {
nvenc.cuda_dl->cuCtxPushCurrent(nvenc.cu_ctx);
int i;
for (i = 0; i < NUM_SURF; i++) {
NVENCSTATUS err;
NV_ENC_CREATE_BITSTREAM_BUFFER allocOut;
memset(&allocOut, 0, sizeof(NV_ENC_CREATE_BITSTREAM_BUFFER));
allocOut.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER;
NV_ENC_CREATE_INPUT_BUFFER allocSurf;
memset(&allocSurf, 0, sizeof(NV_ENC_CREATE_INPUT_BUFFER));
nvenc.surf[i].format = NV_ENC_BUFFER_FORMAT_ABGR; // doesn't have RGBA!
allocSurf.version = NV_ENC_CREATE_INPUT_BUFFER_VER;
allocSurf.width = w;
allocSurf.height = h;
allocSurf.bufferFmt = nvenc.surf[i].format;
err = nvenc.nvenc_funcs.nvEncCreateInputBuffer(nvenc.nvenc_ctx, &allocSurf);
if (err != NV_ENC_SUCCESS)
return -1;
nvenc.surf[i].input_surface = allocSurf.inputBuffer;
nvenc.surf[i].width = allocSurf.width;
nvenc.surf[i].height = allocSurf.height;
// output
err = nvenc.nvenc_funcs.nvEncCreateBitstreamBuffer(nvenc.nvenc_ctx, &allocOut);
if (err != NV_ENC_SUCCESS)
return -1;
nvenc.surf[i].output_surface = allocOut.bitstreamBuffer;
}
CUcontext dummy;
nvenc.cuda_dl->cuCtxPopCurrent(&dummy);
return 0;
}
int nvenc_frame(const uint8_t *data, unsigned pts, uint8_t *out, uint32_t &outlen) {
NVENCSTATUS err;
NV_ENC_PIC_PARAMS params;
memset(&params, 0, sizeof(NV_ENC_PIC_PARAMS));
params.version = NV_ENC_PIC_PARAMS_VER;
params.encodePicFlags = NV_ENC_PIC_FLAG_FORCEINTRA | NV_ENC_PIC_FLAG_OUTPUT_SPSPPS;
nvenc.cuda_dl->cuCtxPushCurrent(nvenc.cu_ctx);
NV_ENC_LOCK_INPUT_BUFFER lockBufferParams;
memset(&lockBufferParams, 0, sizeof(NV_ENC_LOCK_INPUT_BUFFER));
lockBufferParams.version = NV_ENC_LOCK_INPUT_BUFFER_VER;
lockBufferParams.inputBuffer = nvenc.surf[nvenc.cursurf].input_surface;
err = nvenc.nvenc_funcs.nvEncLockInputBuffer(nvenc.nvenc_ctx, &lockBufferParams);
if (err != NV_ENC_SUCCESS)
return -1;
nvenc.surf[nvenc.cursurf].pitch = lockBufferParams.pitch;
//vlog.info("pitch %u", lockBufferParams.pitch);
// copy frame
unsigned y;
uint8_t *dst = (uint8_t *) lockBufferParams.bufferDataPtr;
const unsigned linelen = nvenc.surf[nvenc.cursurf].width * 4;
for (y = 0; y < (unsigned) nvenc.surf[nvenc.cursurf].height; y++) {
memcpy(dst, data, linelen);
data += linelen;
dst += lockBufferParams.pitch;
}
err = nvenc.nvenc_funcs.nvEncUnlockInputBuffer(nvenc.nvenc_ctx,
nvenc.surf[nvenc.cursurf].input_surface);
if (err != NV_ENC_SUCCESS)
return -1;
CUcontext dummy;
nvenc.cuda_dl->cuCtxPopCurrent(&dummy);
params.inputBuffer = nvenc.surf[nvenc.cursurf].input_surface;
params.bufferFmt = nvenc.surf[nvenc.cursurf].format;
params.inputWidth = nvenc.surf[nvenc.cursurf].width;
params.inputHeight = nvenc.surf[nvenc.cursurf].height;
params.inputPitch = nvenc.surf[nvenc.cursurf].pitch;
params.outputBitstream = nvenc.surf[nvenc.cursurf].output_surface;
params.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
params.inputTimeStamp = pts;
nvenc.cuda_dl->cuCtxPushCurrent(nvenc.cu_ctx);
err = nvenc.nvenc_funcs.nvEncEncodePicture(nvenc.nvenc_ctx, &params);
nvenc.cuda_dl->cuCtxPopCurrent(&dummy);
if (err != NV_ENC_SUCCESS)
return -1;
nvenc.cuda_dl->cuCtxPushCurrent(nvenc.cu_ctx);
// Get output
NV_ENC_LOCK_BITSTREAM lock_params;
memset(&lock_params, 0, sizeof(NV_ENC_LOCK_BITSTREAM));
lock_params.version = NV_ENC_LOCK_BITSTREAM_VER;
lock_params.doNotWait = 0;
lock_params.outputBitstream = nvenc.surf[nvenc.cursurf].output_surface;
// lock_params.sliceOffsets = slice_offsets; TODO?
err = nvenc.nvenc_funcs.nvEncLockBitstream(nvenc.nvenc_ctx, &lock_params);
if (err != NV_ENC_SUCCESS)
return -1;
memcpy(out, lock_params.bitstreamBufferPtr, lock_params.bitstreamSizeInBytes);
outlen = lock_params.bitstreamSizeInBytes;
err = nvenc.nvenc_funcs.nvEncUnlockBitstream(nvenc.nvenc_ctx,
nvenc.surf[nvenc.cursurf].output_surface);
if (err != NV_ENC_SUCCESS)
return -1;
nvenc.cuda_dl->cuCtxPopCurrent(&dummy);
//vlog.info("Pic type %x, idr %x i %x", lock_params.pictureType, NV_ENC_PIC_TYPE_IDR,
// NV_ENC_PIC_TYPE_I);
return 0;
}
/*
 * Tears down the encoder: sends an end-of-stream picture, destroys the
 * surfaces and the encoder session, destroys the CUDA context and unloads
 * the CUDA/NVENC libraries.
 *
 * Every step is guarded and handles are cleared afterwards, so calling this
 * on partially initialised state, or calling it twice, is harmless.
 */
static void unload() {
    const bool have_ctx = nvenc.cuda_dl && nvenc.cu_ctx;
    CUcontext dummy;
    int i;

    if (have_ctx)
        nvenc.cuda_dl->cuCtxPushCurrent(nvenc.cu_ctx);

    if (nvenc.nvenc_ctx) {
        // Flush the encoder with an end-of-stream picture.
        NV_ENC_PIC_PARAMS params;
        memset(&params, 0, sizeof(params));
        params.version = NV_ENC_PIC_PARAMS_VER;
        params.encodePicFlags = NV_ENC_PIC_FLAG_EOS;
        nvenc.nvenc_funcs.nvEncEncodePicture(nvenc.nvenc_ctx, &params);

        for (i = 0; i < NUM_SURF; i++) {
            if (nvenc.surf[i].input_surface) {
                nvenc.nvenc_funcs.nvEncDestroyInputBuffer(nvenc.nvenc_ctx,
                                                         nvenc.surf[i].input_surface);
                nvenc.surf[i].input_surface = NULL;
            }
            if (nvenc.surf[i].output_surface) {
                nvenc.nvenc_funcs.nvEncDestroyBitstreamBuffer(nvenc.nvenc_ctx,
                                                             nvenc.surf[i].output_surface);
                nvenc.surf[i].output_surface = NULL;
            }
        }

        nvenc.nvenc_funcs.nvEncDestroyEncoder(nvenc.nvenc_ctx);
        nvenc.nvenc_ctx = NULL;
    }

    if (have_ctx) {
        nvenc.cuda_dl->cuCtxPopCurrent(&dummy);
        nvenc.cuda_dl->cuCtxDestroy(nvenc.cu_ctx);
        nvenc.cu_ctx = NULL;
    }

    // Unload the libraries last; nothing above may be called after this.
    if (nvenc.nvenc_dl)
        nvenc_free_functions(&nvenc.nvenc_dl);
    if (nvenc.cuda_dl)
        cuda_free_functions(&nvenc.cuda_dl);
}
/*
int main() {
unsigned w = 256, h = 256, kbps = 400, fps = 15;
memset(&nvenc, 0, sizeof(NvencDynLoadFunctions));
if (loadfuncs() < 0)
return 1;
if (setupdevice() < 0)
return 1;
if (setupenc(w, h, kbps, fps) < 0)
return 1;
if (setupsurf(w, h) < 0)
return 1;
unload();
return 0;
}
*/
int nvidia_init(const unsigned w, const unsigned h, const unsigned kbps,
const unsigned fps) {
memset(&nvenc, 0, sizeof(NvencDynLoadFunctions));
if (loadfuncs() < 0)
return 1;
if (setupdevice() < 0)
return 1;
if (setupenc(w, h, kbps, fps) < 0)
return 1;
if (setupsurf(w, h) < 0)
return 1;
return 0;
}