From 0cb2c0ba9f86c45a451e93548c99a45f1cd4b5c8 Mon Sep 17 00:00:00 2001 From: mmcclaskey Date: Thu, 9 Sep 2021 13:55:33 -0400 Subject: [PATCH] Sse scaling (#52) * Add CPUID functions for runtime dispatch * Add SSE2 scaling --- CMakeLists.txt | 4 + common/rfb/CMakeLists.txt | 22 +++ common/rfb/EncodeManager.cxx | 60 ++++++++ common/rfb/VNCServerST.cxx | 4 + common/rfb/cpuid.cxx | 70 ++++++++++ common/rfb/cpuid.h | 28 ++++ common/rfb/scale_dummy.cxx | 37 +++++ common/rfb/scale_sse2.cxx | 257 +++++++++++++++++++++++++++++++++++ common/rfb/scale_sse2.h | 38 ++++++ kasmweb | 2 +- 10 files changed, 521 insertions(+), 1 deletion(-) create mode 100644 common/rfb/cpuid.cxx create mode 100644 common/rfb/cpuid.h create mode 100644 common/rfb/scale_dummy.cxx create mode 100644 common/rfb/scale_sse2.cxx create mode 100644 common/rfb/scale_sse2.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 179c278..c32884a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,7 @@ include(CheckLibraryExists) include(CheckTypeSize) include(CheckCSourceCompiles) include(CheckCXXSourceCompiles) +include(CheckCXXCompilerFlag) include(CheckCSourceRuns) include(CMakeMacroLibtoolFile) @@ -208,6 +209,9 @@ if(ENABLE_PAM) endif() set(HAVE_PAM ${ENABLE_PAM}) +# Check for SSE2 +check_cxx_compiler_flag(-msse2 COMPILER_SUPPORTS_SSE2) + # Generate config.h and make sure the source finds it configure_file(config.h.in config.h) add_definitions(-DHAVE_CONFIG_H) diff --git a/common/rfb/CMakeLists.txt b/common/rfb/CMakeLists.txt index 1b37dd5..dea5f35 100644 --- a/common/rfb/CMakeLists.txt +++ b/common/rfb/CMakeLists.txt @@ -64,6 +64,7 @@ set(RFB_SOURCES VNCServerST.cxx ZRLEEncoder.cxx ZRLEDecoder.cxx + cpuid.cxx encodings.cxx util.cxx xxhash.c) @@ -97,6 +98,27 @@ if(GNUTLS_FOUND) ) endif() +# SSE2 + +set(SSE2_SOURCES + scale_sse2.cxx) + +set(SCALE_DUMMY_SOURCES + scale_dummy.cxx) + +if(COMPILER_SUPPORTS_SSE2) + set_source_files_properties(${SSE2_SOURCES} PROPERTIES COMPILE_FLAGS ${COMPILE_FLAGS} 
-msse2) + set(RFB_SOURCES + ${RFB_SOURCES} + ${SSE2_SOURCES} + ) +else() + set(RFB_SOURCES + ${RFB_SOURCES} + ${SCALE_DUMMY_SOURCES} + ) +endif() + add_library(rfb STATIC ${RFB_SOURCES}) target_link_libraries(rfb ${RFB_LIBRARIES}) diff --git a/common/rfb/EncodeManager.cxx b/common/rfb/EncodeManager.cxx index 565ab16..30489c7 100644 --- a/common/rfb/EncodeManager.cxx +++ b/common/rfb/EncodeManager.cxx @@ -22,10 +22,12 @@ #include #include +#include #include #include #include #include +#include #include #include #include @@ -972,6 +974,64 @@ PixelBuffer *rfb::progressiveBilinearScale(const PixelBuffer *pb, const uint16_t tgtw, const uint16_t tgth, const float tgtdiff) { + if (supportsSSE2()) { + if (tgtdiff >= 0.5f) { + ManagedPixelBuffer *newpb = new ManagedPixelBuffer(pb->getPF(), tgtw, tgth); + + int oldstride, newstride; + const rdr::U8 *oldpx = pb->getBuffer(pb->getRect(), &oldstride); + rdr::U8 *newpx = newpb->getBufferRW(newpb->getRect(), &newstride); + + SSE2_scale(oldpx, tgtw, tgth, newpx, oldstride, newstride, tgtdiff); + return newpb; + } + + PixelBuffer *newpb; + uint16_t neww, newh, oldw, oldh; + bool del = false; + + do { + oldw = pb->getRect().width(); + oldh = pb->getRect().height(); + neww = oldw / 2; + newh = oldh / 2; + + newpb = new ManagedPixelBuffer(pb->getPF(), neww, newh); + + int oldstride, newstride; + const rdr::U8 *oldpx = pb->getBuffer(pb->getRect(), &oldstride); + rdr::U8 *newpx = ((ManagedPixelBuffer *) newpb)->getBufferRW(newpb->getRect(), + &newstride); + + SSE2_halve(oldpx, neww, newh, newpx, oldstride, newstride); + + if (del) + delete pb; + del = true; + + pb = newpb; + } while (tgtw * 2 < neww); + + // Final, non-halving step + if (tgtw != neww || tgth != newh) { + oldw = pb->getRect().width(); + oldh = pb->getRect().height(); + + newpb = new ManagedPixelBuffer(pb->getPF(), tgtw, tgth); + + int oldstride, newstride; + const rdr::U8 *oldpx = pb->getBuffer(pb->getRect(), &oldstride); + rdr::U8 *newpx = ((ManagedPixelBuffer *) 
newpb)->getBufferRW(newpb->getRect(), + &newstride); + + SSE2_scale(oldpx, tgtw, tgth, newpx, oldstride, newstride, tgtdiff); + if (del) + delete pb; + } + + return newpb; + } // SSE2 + if (tgtdiff >= 0.5f) return bilinearScale(pb, tgtw, tgth, tgtdiff); diff --git a/common/rfb/VNCServerST.cxx b/common/rfb/VNCServerST.cxx index 785ab42..3e87892 100644 --- a/common/rfb/VNCServerST.cxx +++ b/common/rfb/VNCServerST.cxx @@ -53,6 +53,7 @@ #include +#include #include #include #include @@ -134,6 +135,9 @@ VNCServerST::VNCServerST(const char* name_, SDesktop* desktop_) { lastUserInputTime = lastDisconnectTime = time(0); slog.debug("creating single-threaded server %s", name.buf); + slog.info("CPU capability: SSE2 %s, AVX512f %s", + supportsSSE2() ? "yes" : "no", + supportsAVX512f() ? "yes" : "no"); DLPRegion.enabled = DLPRegion.percents = false; diff --git a/common/rfb/cpuid.cxx b/common/rfb/cpuid.cxx new file mode 100644 index 0000000..c89f950 --- /dev/null +++ b/common/rfb/cpuid.cxx @@ -0,0 +1,70 @@ +/* Copyright (C) 2021 Kasm Web + * + * This is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This software is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this software; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. 
+ */
+
+#include <stdint.h>
+
+// Cached EAX, EBX, ECX, EDX of CPUID leaf 1 (cpuid) and leaf 7 subleaf 0 (extcpuid)
+static uint32_t cpuid[4] = { 0 };
+static uint32_t extcpuid[4] = { 0 };
+
+#if defined(__x86_64__) || defined(__i386__)
+// Runs CPUID for leaf/subleaf and stores EAX, EBX, ECX, EDX into out[0..3]
+static void runcpuid(const uint32_t leaf, const uint32_t subleaf, uint32_t out[4]) {
+	__asm__ __volatile__(
+		"cpuid\n\t"
+		: "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
+		: "0"(leaf), "2"(subleaf)
+	);
+}
+#endif
+
+// Fills both caches on first use; a non-zero cpuid[0] marks them as already filled
+static void getcpuid() {
+	if (cpuid[0])
+		return;
+
+#if defined(__x86_64__) || defined(__i386__)
+	uint32_t basic[4]; // leaf 0: EAX holds the highest supported basic leaf
+	runcpuid(0, 0, basic);
+	runcpuid(1, 0, cpuid); // normal feature bits
+	if (basic[0] >= 7) // a leaf above that maximum does not return leaf 7 data
+		runcpuid(7, 0, extcpuid); // ext feature bits
+#endif
+}
+
+namespace rfb {
+
+bool supportsSSE2() { // true when CPUID leaf 1 EDX bit 26 is set; false off x86
+	getcpuid();
+#if defined(__x86_64__) || defined(__i386__)
+	#define bit_SSE2 (1 << 26)
+	return cpuid[3] & bit_SSE2;
+#endif
+	return false;
+}
+
+bool supportsAVX512f() { // true when CPUID leaf 7 EBX bit 16 is set; false off x86
+	getcpuid();
+#if defined(__x86_64__) || defined(__i386__)
+	#define bit_AVX512f (1 << 16)
+	return extcpuid[1] & bit_AVX512f;
+#endif
+	return false;
+}
+
+}; // namespace rfb
diff --git a/common/rfb/cpuid.h b/common/rfb/cpuid.h
new file mode 100644
index 0000000..c84b4a2
--- /dev/null
+++ b/common/rfb/cpuid.h
@@ -0,0 +1,28 @@
+/* Copyright (C) 2021 Kasm Web
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this software; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+ * USA.
+ */
+
+#ifndef __RFB_CPUID_H__
+#define __RFB_CPUID_H__
+
+namespace rfb {
+	// Runtime CPU feature queries based on CPUID; both return false on non-x86 builds
+	bool supportsSSE2();
+	bool supportsAVX512f();
+};
+
+#endif
diff --git a/common/rfb/scale_dummy.cxx b/common/rfb/scale_dummy.cxx
new file mode 100644
index 0000000..b978609
--- /dev/null
+++ b/common/rfb/scale_dummy.cxx
@@ -0,0 +1,37 @@
+/* Copyright (C) 2021 Kasm Web
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this software; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+ * USA.
+ */
+
+#include <stdint.h>
+
+namespace rfb {
+// No-op stand-ins built when the compiler lacks -msse2; they never write to newpx
+void SSE2_halve(const uint8_t *oldpx,
+		const uint16_t tgtw, const uint16_t tgth,
+		uint8_t *newpx,
+		const unsigned oldstride, const unsigned newstride) {
+}
+
+// Handles factors between 0.5 and 1.0 in the SSE2 build; does nothing here
+void SSE2_scale(const uint8_t *oldpx,
+		const uint16_t tgtw, const uint16_t tgth,
+		uint8_t *newpx,
+		const unsigned oldstride, const unsigned newstride,
+		const float tgtdiff) {
+}
+
+}; // namespace rfb
diff --git a/common/rfb/scale_sse2.cxx b/common/rfb/scale_sse2.cxx
new file mode 100644
index 0000000..e4c717b
--- /dev/null
+++ b/common/rfb/scale_sse2.cxx
@@ -0,0 +1,257 @@
+/* Copyright (C) 2021 Kasm Web
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this software; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+ * USA.
+ */
+
+#include <emmintrin.h>
+
+#include <stdint.h>
+
+namespace rfb {
+
+/*
+static void print128(const char msg[], const __m128i v) {
+	union {
+		__m128i v;
+		uint8_t c[16];
+	} u;
+
+	u.v = v;
+
+	printf("%s %02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x\n",
+		msg,
+		u.c[0],
+		u.c[1],
+		u.c[2],
+		u.c[3],
+		u.c[4],
+		u.c[5],
+		u.c[6],
+		u.c[7],
+		u.c[8],
+		u.c[9],
+		u.c[10],
+		u.c[11],
+		u.c[12],
+		u.c[13],
+		u.c[14],
+		u.c[15]);
+}
+*/
+// Halves an image: every output pixel is the average of a 2x2 block of 4-byte source pixels
+void SSE2_halve(const uint8_t *oldpx, // source pixels
+		const uint16_t tgtw, const uint16_t tgth, // output size in pixels; the source is twice that
+		uint8_t *newpx, // destination pixels
+		const unsigned oldstride, const unsigned newstride) { // row pitches, counted in pixels
+	uint16_t x, y;
+	const uint16_t srcw = tgtw * 2, srch = tgth * 2;
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i shift = _mm_set_epi32(0, 0, 0, 2);
+	const __m128i low = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+	const __m128i high = _mm_set_epi32(0xffffffff, 0xffffffff, 0, 0);
+
+	for (y = 0; y < srch; y += 2) {
+		const uint8_t * const row0 = oldpx + oldstride * y * 4;
+		const uint8_t * const row1 = oldpx + oldstride * (y + 1) * 4;
+
+		uint8_t * const dst = newpx + newstride * (y / 2) * 4;
+		// Four source pixels per step; the bound keeps the 16-byte loads and 8-byte store inside the row
+		for (x = 0; x + 4 <= srcw; x += 4) {
+			__m128i lo, hi, a, b, c, d;
+			lo = _mm_loadu_si128((__m128i *) &row0[x * 4]);
+			hi = _mm_loadu_si128((__m128i *) &row1[x * 4]);
+			// Widen the bytes to 16 bits so the four-way sums cannot overflow
+			a = _mm_unpacklo_epi8(lo, zero);
+			b = _mm_unpackhi_epi8(lo, zero);
+			c = _mm_unpacklo_epi8(hi, zero);
+			d = _mm_unpackhi_epi8(hi, zero);
+			// Vertical sums: a holds pixels 0-1, b holds pixels 2-3
+			a = _mm_add_epi16(a, c);
+			b = _mm_add_epi16(b, d);
+			// Pixels 0 + 1, kept in the low half
+			c = _mm_srli_si128(a, 8);
+			a = _mm_and_si128(a, low);
+			a = _mm_add_epi16(a, c);
+			// Pixels 2 + 3, kept in the high half
+			d = _mm_slli_si128(b, 8);
+			b = _mm_and_si128(b, high);
+			b = _mm_add_epi16(b, d);
+
+			a = _mm_add_epi16(a, b);
+			// Divide by four and narrow back to bytes
+			a = _mm_srl_epi16(a, shift);
+			a = _mm_packus_epi16(a, zero);
+
+			_mm_storel_epi64((__m128i *) &dst[(x / 2) * 4], a);
+		}
+
+		for (; x < srcw; x += 2) {
+			// Remainder in C: the last 2x2 block when tgtw is odd
+			uint8_t i;
+			for (i = 0; i < 4; i++) {
+				dst[(x / 2) * 4 + i] =
+					(row0[x * 4 + i] +
+					row0[(x + 1) * 4 + i] +
+					row1[x * 4 + i] +
+					row1[(x + 1) * 4 + i]) / 4;
+			}
+		}
+	}
+}
+
+// Bilinear scale of 4-byte pixels; handles factors between 0.5 and 1.0
+void SSE2_scale(const uint8_t *oldpx, // source pixels
+		const uint16_t tgtw, const uint16_t tgth, // output size in pixels
+		uint8_t *newpx, // destination pixels
+		const unsigned oldstride, const unsigned newstride, // row pitches, counted in pixels
+		const float tgtdiff) { // scale factor; output coordinates are divided by it to find the source
+
+	uint16_t x, y;
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i low = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+	const __m128i high = _mm_set_epi32(0xffffffff, 0xffffffff, 0, 0);
+	const float invdiff = 1 / tgtdiff;
+	// Each output row blends source rows lowy and lowy + 1 with weights that add up to 256
+	for (y = 0; y < tgth; y++) {
+		const float ny = y * invdiff;
+		const uint16_t lowy = ny;
+		const uint16_t highy = lowy + 1;
+		const uint16_t bot = (ny - lowy) * 256;
+		const uint16_t top = 256 - bot;
+		const uint32_t * const row0 = (uint32_t *) (oldpx + oldstride * lowy * 4);
+		const uint32_t * const row1 = (uint32_t *) (oldpx + oldstride * highy * 4);
+		const uint8_t * const brow0 = (uint8_t *) row0;
+		const uint8_t * const brow1 = (uint8_t *) row1;
+		// NOTE(review): highy and highx are never clamped; confirm the source has a readable extra row/column
+		uint8_t * const dst = newpx + newstride * y * 4;
+
+		const __m128i vertmul = _mm_set1_epi16(top);
+		const __m128i vertmul2 = _mm_set1_epi16(bot);
+		// Two output pixels per step; the bound leaves an odd last pixel to the C loop below
+		for (x = 0; x + 1 < tgtw; x += 2) {
+			const float nx[2] = {
+				x * invdiff,
+				(x + 1) * invdiff,
+			};
+			const uint16_t lowx[2] = {
+				(uint16_t) nx[0],
+				(uint16_t) nx[1],
+			};
+			const uint16_t highx[2] = {
+				(uint16_t) (lowx[0] + 1),
+				(uint16_t) (lowx[1] + 1),
+			};
+			const uint16_t right[2] = {
+				(uint16_t) ((nx[0] - lowx[0]) * 256),
+				(uint16_t) ((nx[1] - lowx[1]) * 256),
+			};
+			const uint16_t left[2] = {
+				(uint16_t) (256 - right[0]),
+				(uint16_t) (256 - right[1]),
+			};
+			// Horizontal weights: the low four lanes weigh the left neighbour, the high four the right one
+			const __m128i horzmul = _mm_set_epi16(
+				right[0],
+				right[0],
+				right[0],
+				right[0],
+				left[0],
+				left[0],
+				left[0],
+				left[0]
+			);
+			const __m128i horzmul2 = _mm_set_epi16(
+				right[1],
+				right[1],
+				right[1],
+				right[1],
+				left[1],
+				left[1],
+				left[1],
+				left[1]
+			);
+
+			__m128i lo, hi, a, b, c, d;
+			lo = _mm_setr_epi32(row0[lowx[0]],
+						row0[highx[0]],
+						row0[lowx[1]],
+						row0[highx[1]]);
+			hi = _mm_setr_epi32(row1[lowx[0]],
+						row1[highx[0]],
+						row1[lowx[1]],
+						row1[highx[1]]);
+			// Widen to 16 bits: a/c are the neighbours of pixel x (rows 0/1), b/d those of pixel x + 1
+			a = _mm_unpacklo_epi8(lo, zero);
+			b = _mm_unpackhi_epi8(lo, zero);
+			c = _mm_unpacklo_epi8(hi, zero);
+			d = _mm_unpackhi_epi8(hi, zero);
+			// Vertical blend of the two rows, scaled back down by 256 afterwards
+			a = _mm_mullo_epi16(a, vertmul);
+			b = _mm_mullo_epi16(b, vertmul);
+			c = _mm_mullo_epi16(c, vertmul2);
+			d = _mm_mullo_epi16(d, vertmul2);
+
+			a = _mm_add_epi16(a, c);
+			a = _mm_srli_epi16(a, 8);
+			b = _mm_add_epi16(b, d);
+			b = _mm_srli_epi16(b, 8);
+			// Horizontal blend: weigh both neighbours, then add the two halves together
+			a = _mm_mullo_epi16(a, horzmul);
+			b = _mm_mullo_epi16(b, horzmul2);
+
+			lo = _mm_srli_si128(a, 8);
+			a = _mm_and_si128(a, low);
+			a = _mm_add_epi16(a, lo);
+
+			hi = _mm_slli_si128(b, 8);
+			b = _mm_and_si128(b, high);
+			b = _mm_add_epi16(b, hi);
+
+			a = _mm_add_epi16(a, b);
+			a = _mm_srli_epi16(a, 8);
+
+			a = _mm_packus_epi16(a, zero);
+
+			_mm_storel_epi64((__m128i *) &dst[x * 4], a);
+		}
+
+		for (; x < tgtw; x++) {
+			// Remainder in C: the last pixel when tgtw is odd
+			const float nx = x * invdiff;
+			const uint16_t lowx = nx;
+			const uint16_t highx = lowx + 1;
+			const uint16_t right = (nx - lowx) * 256;
+			const uint16_t left = 256 - right;
+
+			uint8_t i;
+			uint32_t val, val2;
+			for (i = 0; i < 4; i++) {
+				val = brow0[lowx * 4 + i] * left;
+				val += brow0[highx * 4 + i] * right;
+				val >>= 8;
+
+				val2 = brow1[lowx * 4 + i] * left;
+				val2 += brow1[highx * 4 + i] * right;
+				val2 >>= 8;
+
+				dst[x * 4 + i] =
+					(val * top + val2 * bot) >> 8;
+			}
+		}
+	}
+}
+
+}; // namespace rfb
diff --git a/common/rfb/scale_sse2.h b/common/rfb/scale_sse2.h
new file mode 100644
index 0000000..e60357d
--- /dev/null
+++ b/common/rfb/scale_sse2.h
@@ -0,0 +1,38 @@
+/* Copyright (C) 2021 Kasm Web
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this software; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+ * USA.
+ */
+
+#ifndef __RFB_SCALE_SSE2_H__
+#define __RFB_SCALE_SSE2_H__
+
+#include <stdint.h>
+
+namespace rfb {
+	// Averages 2x2 blocks of 4-byte pixels into a tgtw x tgth image; strides are in pixels
+	void SSE2_halve(const uint8_t *oldpx,
+			const uint16_t tgtw, const uint16_t tgth,
+			uint8_t *newpx,
+			const unsigned oldstride, const unsigned newstride);
+	// Bilinear scale to tgtw x tgth for factors (tgtdiff) between 0.5 and 1.0; strides are in pixels
+	void SSE2_scale(const uint8_t *oldpx,
+			const uint16_t tgtw, const uint16_t tgth,
+			uint8_t *newpx,
+			const unsigned oldstride, const unsigned newstride,
+			const float tgtdiff);
+};
+
+#endif
diff --git a/kasmweb b/kasmweb
index ba40cac..e0bb9f6 160000
--- a/kasmweb
+++ b/kasmweb
@@ -1 +1 @@
-Subproject commit ba40cacce068fa35fc706c41605db14c04348170
+Subproject commit e0bb9f6bcf945da6cb10fd0eb48b63b48bf09bb8