diff --git a/CMakeLists.txt b/CMakeLists.txt
index 179c278..c32884a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,6 +16,7 @@ include(CheckLibraryExists)
 include(CheckTypeSize)
 include(CheckCSourceCompiles)
 include(CheckCXXSourceCompiles)
+include(CheckCXXCompilerFlag)
 include(CheckCSourceRuns)
 
 include(CMakeMacroLibtoolFile)
@@ -208,6 +209,9 @@ if(ENABLE_PAM)
 endif()
 set(HAVE_PAM ${ENABLE_PAM})
 
+# Check for SSE2
+check_cxx_compiler_flag(-msse2 COMPILER_SUPPORTS_SSE2)
+
 # Generate config.h and make sure the source finds it
 configure_file(config.h.in config.h)
 add_definitions(-DHAVE_CONFIG_H)
diff --git a/common/rfb/CMakeLists.txt b/common/rfb/CMakeLists.txt
index 5e1944f..56d0da6 100644
--- a/common/rfb/CMakeLists.txt
+++ b/common/rfb/CMakeLists.txt
@@ -99,6 +99,32 @@ endif()
 
 add_library(rfb STATIC ${RFB_SOURCES})
 
+# SSE2
+
+set(SSE2_SOURCES
+  scale_sse2.cxx)
+
+set(SCALE_DUMMY_SOURCES
+  scale_dummy.cxx)
+
+# Build the SSE2 kernels as their own library so that -msse2 is applied to
+# that code only; without compiler support, link the no-op stubs instead.
+if(COMPILER_SUPPORTS_SSE2)
+  add_library(scale_sse2 STATIC ${SSE2_SOURCES})
+  set(RFB_LIBRARIES
+    ${RFB_LIBRARIES}
+    scale_sse2
+  )
+  # Quoted: COMPILE_FLAGS takes one value, so the flags must stay one argument
+  set_target_properties(scale_sse2 PROPERTIES COMPILE_FLAGS "${COMPILE_FLAGS} -msse2")
+else()
+  add_library(scale_dummy STATIC ${SCALE_DUMMY_SOURCES})
+  set(RFB_LIBRARIES
+    ${RFB_LIBRARIES}
+    scale_dummy
+  )
+endif()
+
 target_link_libraries(rfb ${RFB_LIBRARIES})
 
 if(UNIX)
diff --git a/common/rfb/EncodeManager.cxx b/common/rfb/EncodeManager.cxx
index 8785190..b6d7210 100644
--- a/common/rfb/EncodeManager.cxx
+++ b/common/rfb/EncodeManager.cxx
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include <rfb/scale_sse2.h>
 #include
 #include
 #include
@@ -973,6 +974,71 @@ PixelBuffer *progressiveBilinearScale(const PixelBuffer *pb,
                                       const uint16_t tgtw, const uint16_t tgth,
                                       const float tgtdiff) {
 
+  // SSE2 fast path; when SSE2 is unavailable the code below runs instead
+  if (supportsSSE2()) {
+    if (tgtdiff >= 0.5f) {
+      // Within SSE2_scale()'s 0.5-1.0 range, so a single pass is enough
+      ManagedPixelBuffer *newpb = new ManagedPixelBuffer(pb->getPF(), tgtw, tgth);
+
+      int oldstride, newstride;
+      const rdr::U8 *oldpx = pb->getBuffer(pb->getRect(), &oldstride);
+      rdr::U8 *newpx = newpb->getBufferRW(newpb->getRect(), &newstride);
+
+      SSE2_scale(oldpx, tgtw, tgth, newpx, oldstride, newstride, tgtdiff);
+      return newpb;
+    }
+
+    PixelBuffer *newpb;
+    uint16_t neww, newh, oldw, oldh;
+    bool del = false;
+
+    // Halve until the remaining factor is at least 0.5
+    do {
+      oldw = pb->getRect().width();
+      oldh = pb->getRect().height();
+      neww = oldw / 2;
+      newh = oldh / 2;
+
+      newpb = new ManagedPixelBuffer(pb->getPF(), neww, newh);
+
+      int oldstride, newstride;
+      const rdr::U8 *oldpx = pb->getBuffer(pb->getRect(), &oldstride);
+      rdr::U8 *newpx = ((ManagedPixelBuffer *) newpb)->getBufferRW(newpb->getRect(),
+                                                                   &newstride);
+
+      SSE2_halve(oldpx, neww, newh, newpx, oldstride, newstride);
+
+      // Free intermediate buffers, but not the caller's original
+      if (del)
+        delete pb;
+      del = true;
+
+      pb = newpb;
+    } while (tgtw * 2 < neww);
+
+    // Final, non-halving step
+    if (tgtw != neww || tgth != newh) {
+      oldw = pb->getRect().width();
+
+      newpb = new ManagedPixelBuffer(pb->getPF(), tgtw, tgth);
+
+      int oldstride, newstride;
+      const rdr::U8 *oldpx = pb->getBuffer(pb->getRect(), &oldstride);
+      rdr::U8 *newpx = ((ManagedPixelBuffer *) newpb)->getBufferRW(newpb->getRect(),
+                                                                   &newstride);
+
+      // The factor must be relative to the halved buffer being read here;
+      // tgtdiff is relative to the original image and would make
+      // SSE2_scale() sample far outside this smaller buffer.
+      SSE2_scale(oldpx, tgtw, tgth, newpx, oldstride, newstride,
+                 (float) tgtw / oldw);
+      if (del)
+        delete pb;
+    }
+
+    return newpb;
+  } // SSE2
+
   if (tgtdiff >= 0.5f)
     return bilinearScale(pb, tgtw, tgth, tgtdiff);
 
diff --git a/common/rfb/scale_dummy.cxx b/common/rfb/scale_dummy.cxx
new file mode 100644
index 0000000..b978609
--- /dev/null
+++ b/common/rfb/scale_dummy.cxx
@@ -0,0 +1,40 @@
+/* Copyright (C) 2021 Kasm Web
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this software; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+ * USA.
+ */
+
+#include <rfb/scale_sse2.h>
+
+namespace rfb {
+
+// No-op stand-ins for the kernels in scale_sse2.cxx, built when the compiler
+// does not accept -msse2. They exist only so that callers still link; neither
+// function reads oldpx or writes newpx.
+
+void SSE2_halve(const uint8_t *oldpx,
+                const uint16_t tgtw, const uint16_t tgth,
+                uint8_t *newpx,
+                const unsigned oldstride, const unsigned newstride) {
+}
+
+void SSE2_scale(const uint8_t *oldpx,
+                const uint16_t tgtw, const uint16_t tgth,
+                uint8_t *newpx,
+                const unsigned oldstride, const unsigned newstride,
+                const float tgtdiff) {
+}
+
+}; // namespace rfb
diff --git a/common/rfb/scale_sse2.cxx b/common/rfb/scale_sse2.cxx
new file mode 100644
index 0000000..e4c717b
--- /dev/null
+++ b/common/rfb/scale_sse2.cxx
@@ -0,0 +1,288 @@
+/* Copyright (C) 2021 Kasm Web
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this software; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+ * USA.
+ */
+
+#include <emmintrin.h>
+
+#include <rfb/scale_sse2.h>
+
+namespace rfb {
+
+/*
+static void print128(const char msg[], const __m128i v) {
+  union {
+    __m128i v;
+    uint8_t c[16];
+  } u;
+
+  u.v = v;
+
+  printf("%s %02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x\n",
+    msg,
+    u.c[0],
+    u.c[1],
+    u.c[2],
+    u.c[3],
+    u.c[4],
+    u.c[5],
+    u.c[6],
+    u.c[7],
+    u.c[8],
+    u.c[9],
+    u.c[10],
+    u.c[11],
+    u.c[12],
+    u.c[13],
+    u.c[14],
+    u.c[15]);
+}
+*/
+
+// Downscales a 32-bit-per-pixel image to half size in both dimensions by
+// averaging each 2x2 block of source pixels, channel by channel.
+//
+// oldpx/newpx point to the source and destination pixels, tgtw/tgth are the
+// destination size in pixels, and both strides are counted in pixels. The
+// source is read as 2*tgtw by 2*tgth pixels.
+void SSE2_halve(const uint8_t *oldpx,
+                const uint16_t tgtw, const uint16_t tgth,
+                uint8_t *newpx,
+                const unsigned oldstride, const unsigned newstride) {
+  uint16_t x, y;
+  const uint16_t srcw = tgtw * 2, srch = tgth * 2;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i shift = _mm_set_epi32(0, 0, 0, 2);
+  const __m128i low = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+  const __m128i high = _mm_set_epi32(0xffffffff, 0xffffffff, 0, 0);
+
+  for (y = 0; y < srch; y += 2) {
+    const uint8_t * const row0 = oldpx + oldstride * y * 4;
+    const uint8_t * const row1 = oldpx + oldstride * (y + 1) * 4;
+
+    uint8_t * const dst = newpx + newstride * (y / 2) * 4;
+
+    // Four source pixels per row in, two destination pixels out. Only taken
+    // while a full group of four remains, so that the 16-byte loads and the
+    // 8-byte store stay within the row.
+    for (x = 0; x + 3 < srcw; x += 4) {
+      __m128i lo, hi, a, b, c, d;
+      lo = _mm_loadu_si128((__m128i *) &row0[x * 4]);
+      hi = _mm_loadu_si128((__m128i *) &row1[x * 4]);
+
+      // Widen the bytes to 16 bits so that the sums cannot overflow
+      a = _mm_unpacklo_epi8(lo, zero);
+      b = _mm_unpackhi_epi8(lo, zero);
+      c = _mm_unpacklo_epi8(hi, zero);
+      d = _mm_unpackhi_epi8(hi, zero);
+
+      // Vertical sums
+      a = _mm_add_epi16(a, c);
+      b = _mm_add_epi16(b, d);
+
+      // Horizontal sums: pixels 0+1 into the low half, 2+3 into the high half
+      c = _mm_srli_si128(a, 8);
+      a = _mm_and_si128(a, low);
+      a = _mm_add_epi16(a, c);
+
+      d = _mm_slli_si128(b, 8);
+      b = _mm_and_si128(b, high);
+      b = _mm_add_epi16(b, d);
+
+      a = _mm_add_epi16(a, b);
+
+      // Divide the four-pixel sums by 4 and narrow back to bytes
+      a = _mm_srl_epi16(a, shift);
+      a = _mm_packus_epi16(a, zero);
+
+      _mm_storel_epi64((__m128i *) &dst[(x / 2) * 4], a);
+    }
+
+    for (; x < srcw; x += 2) {
+      // Remainder in C
+      uint8_t i;
+      for (i = 0; i < 4; i++) {
+        dst[(x / 2) * 4 + i] =
+          (row0[x * 4 + i] +
+          row0[(x + 1) * 4 + i] +
+          row1[x * 4 + i] +
+          row1[(x + 1) * 4 + i]) / 4;
+      }
+    }
+  }
+}
+
+// Bilinearly downscales a 32-bit-per-pixel image by the factor tgtdiff.
+// Handles factors between 0.5 and 1.0
+//
+// oldpx/newpx point to the source and destination pixels, tgtw/tgth are the
+// destination size in pixels, and both strides are counted in pixels.
+// tgtdiff is destination size divided by source size and must describe the
+// buffer that oldpx actually points to.
+//
+// NOTE(review): the source size is not passed in, so the lowx+1/lowy+1
+// neighbour reads cannot be clamped here; at the right and bottom edges they
+// may land one pixel/row past the sampled area - confirm callers' buffers
+// tolerate that.
+void SSE2_scale(const uint8_t *oldpx,
+                const uint16_t tgtw, const uint16_t tgth,
+                uint8_t *newpx,
+                const unsigned oldstride, const unsigned newstride,
+                const float tgtdiff) {
+
+  uint16_t x, y;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i low = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+  const __m128i high = _mm_set_epi32(0xffffffff, 0xffffffff, 0, 0);
+  const float invdiff = 1 / tgtdiff;
+
+  for (y = 0; y < tgth; y++) {
+    const float ny = y * invdiff;
+    const uint16_t lowy = ny;
+    const uint16_t highy = lowy + 1;
+    // Vertical weights in 8.8 fixed point; top + bot == 256
+    const uint16_t bot = (ny - lowy) * 256;
+    const uint16_t top = 256 - bot;
+    const uint32_t * const row0 = (uint32_t *) (oldpx + oldstride * lowy * 4);
+    const uint32_t * const row1 = (uint32_t *) (oldpx + oldstride * highy * 4);
+    const uint8_t * const brow0 = (uint8_t *) row0;
+    const uint8_t * const brow1 = (uint8_t *) row1;
+
+    uint8_t * const dst = newpx + newstride * y * 4;
+
+    const __m128i vertmul = _mm_set1_epi16(top);
+    const __m128i vertmul2 = _mm_set1_epi16(bot);
+
+    // Two destination pixels per iteration. Only taken while a full pair
+    // remains, so that the 8-byte store stays within the row.
+    for (x = 0; x + 1 < tgtw; x += 2) {
+      const float nx[2] = {
+        x * invdiff,
+        (x + 1) * invdiff,
+      };
+      const uint16_t lowx[2] = {
+        (uint16_t) nx[0],
+        (uint16_t) nx[1],
+      };
+      const uint16_t highx[2] = {
+        (uint16_t) (lowx[0] + 1),
+        (uint16_t) (lowx[1] + 1),
+      };
+      const uint16_t right[2] = {
+        (uint16_t) ((nx[0] - lowx[0]) * 256),
+        (uint16_t) ((nx[1] - lowx[1]) * 256),
+      };
+      const uint16_t left[2] = {
+        (uint16_t) (256 - right[0]),
+        (uint16_t) (256 - right[1]),
+      };
+
+      // Horizontal weights: left in the low four lanes, right in the high four
+      const __m128i horzmul = _mm_set_epi16(
+        right[0],
+        right[0],
+        right[0],
+        right[0],
+        left[0],
+        left[0],
+        left[0],
+        left[0]
+      );
+      const __m128i horzmul2 = _mm_set_epi16(
+        right[1],
+        right[1],
+        right[1],
+        right[1],
+        left[1],
+        left[1],
+        left[1],
+        left[1]
+      );
+
+      __m128i lo, hi, a, b, c, d;
+      // Gather the 2x2 neighbourhood of each of the two destination pixels
+      lo = _mm_setr_epi32(row0[lowx[0]],
+                          row0[highx[0]],
+                          row0[lowx[1]],
+                          row0[highx[1]]);
+      hi = _mm_setr_epi32(row1[lowx[0]],
+                          row1[highx[0]],
+                          row1[lowx[1]],
+                          row1[highx[1]]);
+
+      a = _mm_unpacklo_epi8(lo, zero);
+      b = _mm_unpackhi_epi8(lo, zero);
+      c = _mm_unpacklo_epi8(hi, zero);
+      d = _mm_unpackhi_epi8(hi, zero);
+
+      // Blend the two rows
+      a = _mm_mullo_epi16(a, vertmul);
+      b = _mm_mullo_epi16(b, vertmul);
+      c = _mm_mullo_epi16(c, vertmul2);
+      d = _mm_mullo_epi16(d, vertmul2);
+
+      a = _mm_add_epi16(a, c);
+      a = _mm_srli_epi16(a, 8);
+      b = _mm_add_epi16(b, d);
+      b = _mm_srli_epi16(b, 8);
+
+      // Weight the left/right samples, then fold each pair into one pixel
+      a = _mm_mullo_epi16(a, horzmul);
+      b = _mm_mullo_epi16(b, horzmul2);
+
+      lo = _mm_srli_si128(a, 8);
+      a = _mm_and_si128(a, low);
+      a = _mm_add_epi16(a, lo);
+
+      hi = _mm_slli_si128(b, 8);
+      b = _mm_and_si128(b, high);
+      b = _mm_add_epi16(b, hi);
+
+      a = _mm_add_epi16(a, b);
+      a = _mm_srli_epi16(a, 8);
+
+      a = _mm_packus_epi16(a, zero);
+
+      _mm_storel_epi64((__m128i *) &dst[x * 4], a);
+    }
+
+    for (; x < tgtw; x++) {
+      // Remainder in C
+      const float nx = x * invdiff;
+      const uint16_t lowx = nx;
+      const uint16_t highx = lowx + 1;
+      const uint16_t right = (nx - lowx) * 256;
+      const uint16_t left = 256 - right;
+
+      uint8_t i;
+      uint32_t val, val2;
+      for (i = 0; i < 4; i++) {
+        val = brow0[lowx * 4 + i] * left;
+        val += brow0[highx * 4 + i] * right;
+        val >>= 8;
+
+        val2 = brow1[lowx * 4 + i] * left;
+        val2 += brow1[highx * 4 + i] * right;
+        val2 >>= 8;
+
+        dst[x * 4 + i] =
+          (val * top + val2 * bot) >> 8;
+      }
+    }
+  }
+}
+
+}; // namespace rfb
diff --git a/common/rfb/scale_sse2.h b/common/rfb/scale_sse2.h
new file mode 100644
index 0000000..e60357d
--- /dev/null
+++ b/common/rfb/scale_sse2.h
@@ -0,0 +1,42 @@
+/* Copyright (C) 2021 Kasm Web
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this software; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+ * USA.
+ */
+
+#ifndef __RFB_SCALE_SSE2_H__
+#define __RFB_SCALE_SSE2_H__
+
+#include <stdint.h>
+
+namespace rfb {
+
+  // Halves a 32-bit-per-pixel image in both dimensions. tgtw/tgth are the
+  // destination size; strides are counted in pixels.
+  void SSE2_halve(const uint8_t *oldpx,
+                  const uint16_t tgtw, const uint16_t tgth,
+                  uint8_t *newpx,
+                  const unsigned oldstride, const unsigned newstride);
+
+  // Scales a 32-bit-per-pixel image down by tgtdiff (destination size divided
+  // by source size, between 0.5 and 1.0). Strides are counted in pixels.
+  void SSE2_scale(const uint8_t *oldpx,
+                  const uint16_t tgtw, const uint16_t tgth,
+                  uint8_t *newpx,
+                  const unsigned oldstride, const unsigned newstride,
+                  const float tgtdiff);
+};
+
+#endif