Sse scaling (#52)

* Add CPUID functions for runtime dispatch * Add SSE2 scaling
4 years ago · 0cb2c0ba9f
parent dc21d5f97c
commit 0cb2c0ba9f
10 changed files with 521 additions and 1 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -16,6 +16,7 @@ include(CheckLibraryExists)
 include(CheckTypeSize)
 include(CheckCSourceCompiles)
 include(CheckCXXSourceCompiles)
+include(CheckCXXCompilerFlag)
 include(CheckCSourceRuns)

 include(CMakeMacroLibtoolFile)
@ -208,6 +209,9 @@ if(ENABLE_PAM)
 endif()
 set(HAVE_PAM ${ENABLE_PAM})

+# Check for SSE2
+check_cxx_compiler_flag(-msse2 COMPILER_SUPPORTS_SSE2)
+
 # Generate config.h and make sure the source finds it
 configure_file(config.h.in config.h)
 add_definitions(-DHAVE_CONFIG_H)
--- a/common/rfb/CMakeLists.txt
+++ b/common/rfb/CMakeLists.txt
@ -64,6 +64,7 @@ set(RFB_SOURCES
  VNCServerST.cxx
  ZRLEEncoder.cxx
  ZRLEDecoder.cxx
+  cpuid.cxx
  encodings.cxx
  util.cxx
  xxhash.c)
@ -97,6 +98,27 @@ if(GNUTLS_FOUND)
  )
 endif()

+# SSE2
+#
+# scale_sse2.cxx uses SSE2 intrinsics, so only that file is compiled with
+# -msse2. When the compiler rejects the flag, scale_dummy.cxx is built
+# instead; it defines the same functions with empty bodies so the library
+# still links.
+
+set(SSE2_SOURCES
+  scale_sse2.cxx)
+
+set(SCALE_DUMMY_SOURCES
+  scale_dummy.cxx)
+
+if(COMPILER_SUPPORTS_SSE2)
+  # PROPERTIES expects key/value pairs, so the flags must be one quoted value.
+  # Unquoted, a non-empty ${COMPILE_FLAGS} would split into extra arguments
+  # and make the call invalid.
+  set_source_files_properties(${SSE2_SOURCES} PROPERTIES
+    COMPILE_FLAGS "${COMPILE_FLAGS} -msse2")
+  list(APPEND RFB_SOURCES ${SSE2_SOURCES})
+else()
+  list(APPEND RFB_SOURCES ${SCALE_DUMMY_SOURCES})
+endif()
+
 add_library(rfb STATIC ${RFB_SOURCES})

 target_link_libraries(rfb ${RFB_LIBRARIES})
--- a/common/rfb/EncodeManager.cxx
+++ b/common/rfb/EncodeManager.cxx
@ -22,10 +22,12 @@
 #include <omp.h>
 #include <stdlib.h>

+#include <rfb/cpuid.h>
 #include <rfb/EncCache.h>
 #include <rfb/EncodeManager.h>
 #include <rfb/Encoder.h>
 #include <rfb/Palette.h>
+#include <rfb/scale_sse2.h>
 #include <rfb/SConnection.h>
 #include <rfb/ServerCore.h>
 #include <rfb/SMsgWriter.h>
@ -972,6 +974,64 @@ PixelBuffer *rfb::progressiveBilinearScale(const PixelBuffer *pb,
                                 const uint16_t tgtw, const uint16_t tgth,
                                 const float tgtdiff)
 {
+  // SSE2 path, selected at runtime from the CPUID feature bits
+  if (supportsSSE2()) {
+    if (tgtdiff >= 0.5f) {
+      // Factors of 0.5 and above are done in a single SSE2_scale pass
+      ManagedPixelBuffer *newpb = new ManagedPixelBuffer(pb->getPF(), tgtw, tgth);
+
+      int oldstride, newstride;
+      const rdr::U8 *oldpx = pb->getBuffer(pb->getRect(), &oldstride);
+      rdr::U8 *newpx = newpb->getBufferRW(newpb->getRect(), &newstride);
+
+      SSE2_scale(oldpx, tgtw, tgth, newpx, oldstride, newstride, tgtdiff);
+      return newpb;
+    }
+
+    // Smaller factors: halve repeatedly until the buffer is at most twice
+    // the target width, then finish with one SSE2_scale pass.
+    PixelBuffer *newpb;
+    uint16_t neww, newh, oldw, oldh;
+    // The caller's buffer is never deleted here, only the intermediates
+    bool del = false;
+
+    do {
+      oldw = pb->getRect().width();
+      oldh = pb->getRect().height();
+      neww = oldw / 2;
+      newh = oldh / 2;
+
+      newpb = new ManagedPixelBuffer(pb->getPF(), neww, newh);
+
+      int oldstride, newstride;
+      const rdr::U8 *oldpx = pb->getBuffer(pb->getRect(), &oldstride);
+      rdr::U8 *newpx = ((ManagedPixelBuffer *) newpb)->getBufferRW(newpb->getRect(),
+                                                                   &newstride);
+
+      SSE2_halve(oldpx, neww, newh, newpx, oldstride, newstride);
+
+      if (del)
+        delete pb;
+      del = true;
+
+      pb = newpb;
+    } while (tgtw * 2 < neww);
+
+    // Final, non-halving step
+    if (tgtw != neww || tgth != newh) {
+      oldw = pb->getRect().width();
+
+      newpb = new ManagedPixelBuffer(pb->getPF(), tgtw, tgth);
+
+      int oldstride, newstride;
+      const rdr::U8 *oldpx = pb->getBuffer(pb->getRect(), &oldstride);
+      rdr::U8 *newpx = ((ManagedPixelBuffer *) newpb)->getBufferRW(newpb->getRect(),
+                                                                   &newstride);
+
+      // The source is now the halved buffer, so the factor must be relative
+      // to its width. Passing the overall tgtdiff (below 0.5 here) would map
+      // target pixels to coordinates outside this buffer.
+      SSE2_scale(oldpx, tgtw, tgth, newpx, oldstride, newstride,
+                 tgtw / (float) oldw);
+      if (del)
+        delete pb;
+    }
+
+    return newpb;
+  } // SSE2
+
  if (tgtdiff >= 0.5f)
    return bilinearScale(pb, tgtw, tgth, tgtdiff);

--- a/common/rfb/VNCServerST.cxx
+++ b/common/rfb/VNCServerST.cxx
@ -53,6 +53,7 @@

 #include <network/GetAPI.h>

+#include <rfb/cpuid.h>
 #include <rfb/ComparingUpdateTracker.h>
 #include <rfb/KeyRemapper.h>
 #include <rfb/ListConnInfo.h>
@ -134,6 +135,9 @@ VNCServerST::VNCServerST(const char* name_, SDesktop* desktop_)
 {
  lastUserInputTime = lastDisconnectTime = time(0);
  slog.debug("creating single-threaded server %s", name.buf);
+  slog.info("CPU capability: SSE2 %s, AVX512f %s",
+            supportsSSE2() ? "yes" : "no",
+            supportsAVX512f() ? "yes" : "no");

  DLPRegion.enabled = DLPRegion.percents = false;

--- a/common/rfb/cpuid.cxx
+++ b/common/rfb/cpuid.cxx
@ -0,0 +1,70 @@
+/* Copyright (C) 2021 Kasm Web
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this software; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
+ * USA.
+ */
+
+#include <stdint.h>
+
+// Register values (EAX, EBX, ECX, EDX in that order) captured from CPUID
+// leaf 1. Element 0 doubles as the "already queried" marker.
+static uint32_t cpuid[4] = { 0 };
+// Register values captured from CPUID leaf 7, sub-leaf 0. Stays zeroed when
+// the CPU reports that leaf as unavailable.
+static uint32_t extcpuid[4] = { 0 };
+
+// Runs CPUID and stores the results in the two arrays above. Later calls
+// return early once cpuid[0] is nonzero. On non-x86 builds nothing is
+// executed and both arrays keep their zero initializers.
+static void getcpuid() {
+	if (cpuid[0])
+		return;
+
+#if defined(__x86_64__) || defined(__i386__)
+	uint32_t eax, ecx = 0;
+	uint32_t basic[4];
+
+	eax = 0; // highest supported basic leaf, returned in EAX
+
+	__asm__ __volatile__(
+		"cpuid\n\t"
+		: "=a"(basic[0]), "=b"(basic[1]), "=c"(basic[2]), "=d"(basic[3])
+		: "0"(eax), "2"(ecx)
+	);
+
+	const uint32_t maxleaf = basic[0];
+	if (maxleaf < 1)
+		return;
+
+	eax = 1; // normal feature bits
+	ecx = 0;
+
+	__asm__ __volatile__(
+		"cpuid\n\t"
+		: "=a"(cpuid[0]), "=b"(cpuid[1]), "=c"(cpuid[2]), "=d"(cpuid[3])
+		: "0"(eax), "2"(ecx)
+	);
+
+	// Requesting a leaf above the maximum does not fail, it yields data
+	// belonging to another leaf. Skip it so extcpuid stays all-zero.
+	if (maxleaf < 7)
+		return;
+
+	eax = 7; // ext feature bits
+	ecx = 0;
+
+	__asm__ __volatile__(
+		"cpuid\n\t"
+		: "=a"(extcpuid[0]), "=b"(extcpuid[1]), "=c"(extcpuid[2]), "=d"(extcpuid[3])
+		: "0"(eax), "2"(ecx)
+	);
+#endif
+}
+
+namespace rfb {
+
+// Reports whether CPUID leaf 1 has the SSE2 flag (EDX bit 26) set.
+// Always false on builds that are not x86 / x86-64.
+bool supportsSSE2() {
+#if defined(__x86_64__) || defined(__i386__)
+	const uint32_t sse2_mask = 1 << 26;
+
+	getcpuid();
+	return (cpuid[3] & sse2_mask) != 0;
+#else
+	getcpuid();
+	return false;
+#endif
+}
+
+// Reports whether CPUID leaf 7 has the AVX512F flag (EBX bit 16) set.
+// Only the CPU feature bit is looked at. Always false on builds that are
+// not x86 / x86-64.
+bool supportsAVX512f() {
+#if defined(__x86_64__) || defined(__i386__)
+	const uint32_t avx512f_mask = 1 << 16;
+
+	getcpuid();
+	return (extcpuid[1] & avx512f_mask) != 0;
+#else
+	getcpuid();
+	return false;
+#endif
+}
+
+} // namespace rfb
--- a/common/rfb/cpuid.h
+++ b/common/rfb/cpuid.h
@ -0,0 +1,28 @@
+/* Copyright (C) 2021 Kasm Web
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this software; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
+ * USA.
+ */
+
+#ifndef __RFB_CPUID_H__
+#define __RFB_CPUID_H__
+
+namespace rfb {
+
+// Runtime CPU feature queries, used to pick optimized code paths.
+// Both report false on builds that are not x86 / x86-64.
+
+bool supportsSSE2();
+
+bool supportsAVX512f();
+
+} // namespace rfb
+
+#endif
--- a/common/rfb/scale_dummy.cxx
+++ b/common/rfb/scale_dummy.cxx
@ -0,0 +1,37 @@
+/* Copyright (C) 2021 Kasm Web
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this software; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
+ * USA.
+ */
+
+#include <rfb/scale_sse2.h>
+
+namespace rfb {
+
+// Stand-ins for the functions in scale_sse2.cxx, built when the compiler
+// cannot target SSE2. They exist only so the symbols resolve at link time;
+// neither one reads or writes its buffers.
+
+void SSE2_halve(const uint8_t *oldpx,
+		const uint16_t tgtw, const uint16_t tgth,
+		uint8_t *newpx,
+		const unsigned oldstride, const unsigned newstride)
+{
+	// Intentionally empty
+}
+
+// Handles factors between 0.5 and 1.0
+void SSE2_scale(const uint8_t *oldpx,
+		const uint16_t tgtw, const uint16_t tgth,
+		uint8_t *newpx,
+		const unsigned oldstride, const unsigned newstride,
+		const float tgtdiff)
+{
+	// Intentionally empty
+}
+
+} // namespace rfb
--- a/common/rfb/scale_sse2.cxx
+++ b/common/rfb/scale_sse2.cxx
@ -0,0 +1,257 @@
+/* Copyright (C) 2021 Kasm Web
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this software; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
+ * USA.
+ */
+
+#include <emmintrin.h>
+
+#include <rfb/scale_sse2.h>
+
+namespace rfb {
+
+/*
+static void print128(const char msg[], const __m128i v) {
+	union {
+		__m128i v;
+		uint8_t c[16];
+	} u;
+
+	u.v = v;
+
+	printf("%s %02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x\n",
+		msg,
+		u.c[0],
+		u.c[1],
+		u.c[2],
+		u.c[3],
+		u.c[4],
+		u.c[5],
+		u.c[6],
+		u.c[7],
+		u.c[8],
+		u.c[9],
+		u.c[10],
+		u.c[11],
+		u.c[12],
+		u.c[13],
+		u.c[14],
+		u.c[15]);
+}
+*/
+
+// Shrinks a 4-bytes-per-pixel image to half its width and height. Each
+// output pixel is the per-channel average of a 2x2 block of input pixels.
+//
+// oldpx, newpx:         first pixel of the source / destination image
+// tgtw, tgth:           destination size in pixels; the source area read is
+//                       2*tgtw by 2*tgth pixels
+// oldstride, newstride: row pitch of each image, counted in pixels
+void SSE2_halve(const uint8_t *oldpx,
+			const uint16_t tgtw, const uint16_t tgth,
+			uint8_t *newpx,
+			const unsigned oldstride, const unsigned newstride) {
+	// Wider than uint16_t so that doubling the target size, or stepping a
+	// counter past the end of a row, cannot wrap around
+	unsigned x, y;
+	const unsigned srcw = tgtw * 2u, srch = tgth * 2u;
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i shift = _mm_set_epi32(0, 0, 0, 2);
+	// Masks selecting the lower / upper 64 bits of a vector
+	const __m128i low = _mm_set_epi32(0, 0, -1, -1);
+	const __m128i high = _mm_set_epi32(-1, -1, 0, 0);
+
+	for (y = 0; y < srch; y += 2) {
+		const uint8_t * const row0 = oldpx + oldstride * y * 4;
+		const uint8_t * const row1 = oldpx + oldstride * (y + 1) * 4;
+
+		uint8_t * const dst = newpx + newstride * (y / 2) * 4;
+
+		// Four source pixels per row in, two destination pixels out. Only
+		// run while a full group of four is left: a partial group would read
+		// past the source row and store one pixel past the destination row.
+		for (x = 0; x + 4 <= srcw; x += 4) {
+			__m128i lo, hi, a, b, c, d;
+			lo = _mm_loadu_si128((const __m128i *) &row0[x * 4]);
+			hi = _mm_loadu_si128((const __m128i *) &row1[x * 4]);
+
+			// Widen the channels to 16 bits so the sums cannot overflow
+			a = _mm_unpacklo_epi8(lo, zero);
+			b = _mm_unpackhi_epi8(lo, zero);
+			c = _mm_unpacklo_epi8(hi, zero);
+			d = _mm_unpackhi_epi8(hi, zero);
+
+			// Vertical sums: a = pixels 0,1 and b = pixels 2,3
+			a = _mm_add_epi16(a, c);
+			b = _mm_add_epi16(b, d);
+
+			// Horizontal sum of pixels 0+1, kept in the lower half
+			c = _mm_srli_si128(a, 8);
+			a = _mm_and_si128(a, low);
+			a = _mm_add_epi16(a, c);
+
+			// Horizontal sum of pixels 2+3, kept in the upper half
+			d = _mm_slli_si128(b, 8);
+			b = _mm_and_si128(b, high);
+			b = _mm_add_epi16(b, d);
+
+			a = _mm_add_epi16(a, b);
+
+			// Divide the four-sample sums by 4 and narrow back to bytes
+			a = _mm_srl_epi16(a, shift);
+			a = _mm_packus_epi16(a, zero);
+
+			_mm_storel_epi64((__m128i *) &dst[(x / 2) * 4], a);
+		}
+
+		for (; x < srcw; x += 2) {
+			// Remainder in C: the last output pixel of an odd-width row
+			unsigned i;
+			for (i = 0; i < 4; i++) {
+				dst[(x / 2) * 4 + i] =
+					(row0[x * 4 + i] +
+					row0[(x + 1) * 4 + i] +
+					row1[x * 4 + i] +
+					row1[(x + 1) * 4 + i]) / 4;
+			}
+		}
+	}
+}
+
+// Bilinear downscale of a 4-bytes-per-pixel image.
+// Handles factors between 0.5 and 1.0
+//
+// oldpx, newpx:         first pixel of the source / destination image
+// tgtw, tgth:           destination size in pixels
+// oldstride, newstride: row pitch of each image, counted in pixels
+// tgtdiff:              destination size divided by source size
+//
+// Every output pixel blends the source pixel at (lowx, lowy) with its right
+// and lower neighbours, using 8-bit fixed-point weights.
+// NOTE(review): the source size is not passed in, so the +1 neighbours are
+// not clamped here; the caller has to guarantee they are readable.
+void SSE2_scale(const uint8_t *oldpx,
+		const uint16_t tgtw, const uint16_t tgth,
+		uint8_t *newpx,
+		const unsigned oldstride, const unsigned newstride,
+		const float tgtdiff) {
+
+	// Wider than uint16_t so stepping a counter past the row end cannot wrap
+	unsigned x, y;
+	const unsigned width = tgtw, height = tgth;
+	const __m128i zero = _mm_setzero_si128();
+	// Masks selecting the lower / upper 64 bits of a vector
+	const __m128i low = _mm_set_epi32(0, 0, -1, -1);
+	const __m128i high = _mm_set_epi32(-1, -1, 0, 0);
+	const float invdiff = 1 / tgtdiff;
+
+	for (y = 0; y < height; y++) {
+		const float ny = y * invdiff;
+		const uint16_t lowy = ny;
+		const uint16_t highy = lowy + 1;
+		// Vertical weights; top + bot == 256
+		const uint16_t bot = (ny - lowy) * 256;
+		const uint16_t top = 256 - bot;
+		const uint32_t * const row0 = (const uint32_t *) (oldpx + oldstride * lowy * 4);
+		const uint32_t * const row1 = (const uint32_t *) (oldpx + oldstride * highy * 4);
+		const uint8_t * const brow0 = (const uint8_t *) row0;
+		const uint8_t * const brow1 = (const uint8_t *) row1;
+
+		uint8_t * const dst = newpx + newstride * y * 4;
+
+		const __m128i vertmul = _mm_set1_epi16(top);
+		const __m128i vertmul2 = _mm_set1_epi16(bot);
+
+		// Two output pixels per iteration. Only run while a full pair is
+		// left: a lone trailing pixel would store past the destination row
+		// and sample the source for a column that does not exist.
+		for (x = 0; x + 2 <= width; x += 2) {
+			const float nx[2] = {
+				x * invdiff,
+				(x + 1) * invdiff,
+			};
+			const uint16_t lowx[2] =  {
+				(uint16_t) nx[0],
+				(uint16_t) nx[1],
+			};
+			const uint16_t highx[2] = {
+				(uint16_t) (lowx[0] + 1),
+				(uint16_t) (lowx[1] + 1),
+			};
+			// Horizontal weights; left + right == 256
+			const uint16_t right[2] = {
+				(uint16_t) ((nx[0] - lowx[0]) * 256),
+				(uint16_t) ((nx[1] - lowx[1]) * 256),
+			};
+			const uint16_t left[2] = {
+				(uint16_t) (256 - right[0]),
+				(uint16_t) (256 - right[1]),
+			};
+
+			// Lower four lanes weight the left pixel, upper four the right
+			const __m128i horzmul = _mm_set_epi16(
+				right[0],
+				right[0],
+				right[0],
+				right[0],
+				left[0],
+				left[0],
+				left[0],
+				left[0]
+			);
+			const __m128i horzmul2 = _mm_set_epi16(
+				right[1],
+				right[1],
+				right[1],
+				right[1],
+				left[1],
+				left[1],
+				left[1],
+				left[1]
+			);
+
+			__m128i lo, hi, a, b, c, d;
+			lo = _mm_setr_epi32(row0[lowx[0]],
+						row0[highx[0]],
+						row0[lowx[1]],
+						row0[highx[1]]);
+			hi = _mm_setr_epi32(row1[lowx[0]],
+						row1[highx[0]],
+						row1[lowx[1]],
+						row1[highx[1]]);
+
+			// Widen to 16 bits: a/c hold output pixel 0's samples, b/d pixel 1's
+			a = _mm_unpacklo_epi8(lo, zero);
+			b = _mm_unpackhi_epi8(lo, zero);
+			c = _mm_unpacklo_epi8(hi, zero);
+			d = _mm_unpackhi_epi8(hi, zero);
+
+			// Vertical blend first. The largest value is 255 * 256, which
+			// still fits an unsigned 16-bit lane.
+			a = _mm_mullo_epi16(a, vertmul);
+			b = _mm_mullo_epi16(b, vertmul);
+			c = _mm_mullo_epi16(c, vertmul2);
+			d = _mm_mullo_epi16(d, vertmul2);
+
+			a = _mm_add_epi16(a, c);
+			a = _mm_srli_epi16(a, 8);
+			b = _mm_add_epi16(b, d);
+			b = _mm_srli_epi16(b, 8);
+
+			// Horizontal blend: weight both neighbours, then add the halves
+			a = _mm_mullo_epi16(a, horzmul);
+			b = _mm_mullo_epi16(b, horzmul2);
+
+			// Output pixel 0 ends up in the lower half
+			lo = _mm_srli_si128(a, 8);
+			a = _mm_and_si128(a, low);
+			a = _mm_add_epi16(a, lo);
+
+			// Output pixel 1 ends up in the upper half
+			hi = _mm_slli_si128(b, 8);
+			b = _mm_and_si128(b, high);
+			b = _mm_add_epi16(b, hi);
+
+			a = _mm_add_epi16(a, b);
+			a = _mm_srli_epi16(a, 8);
+
+			a = _mm_packus_epi16(a, zero);
+
+			_mm_storel_epi64((__m128i *) &dst[x * 4], a);
+		}
+
+		for (; x < width; x++) {
+			// Remainder in C: the last pixel of an odd-width row. Blends
+			// vertically first, then horizontally, with the same
+			// intermediate shifts as the vector code, so the rounding is
+			// identical.
+			const float nx = x * invdiff;
+			const uint16_t lowx = nx;
+			const uint16_t highx = lowx + 1;
+			const uint16_t right = (nx - lowx) * 256;
+			const uint16_t left = 256 - right;
+
+			unsigned i;
+			uint32_t val, val2;
+			for (i = 0; i < 4; i++) {
+				val = brow0[lowx * 4 + i] * top;
+				val += brow1[lowx * 4 + i] * bot;
+				val >>= 8;
+
+				val2 = brow0[highx * 4 + i] * top;
+				val2 += brow1[highx * 4 + i] * bot;
+				val2 >>= 8;
+
+				dst[x * 4 + i] =
+					(val * left + val2 * right) >> 8;
+			}
+		}
+	}
+}
+
+}; // namespace rfb
--- a/common/rfb/scale_sse2.h
+++ b/common/rfb/scale_sse2.h
@ -0,0 +1,38 @@
+/* Copyright (C) 2021 Kasm Web
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this software; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
+ * USA.
+ */
+
+#ifndef __RFB_SCALE_SSE2_H__
+#define __RFB_SCALE_SSE2_H__
+
+#include <stdint.h>
+
+namespace rfb {
+
+// Scaling routines for images with 4 bytes per pixel. Sizes and strides are
+// given in pixels; tgtw/tgth describe the destination image.
+
+// Halves the image in both dimensions.
+void SSE2_halve(const uint8_t *oldpx,
+		const uint16_t tgtw, const uint16_t tgth,
+		uint8_t *newpx,
+		const unsigned oldstride, const unsigned newstride);
+
+// Scales by tgtdiff; handles factors between 0.5 and 1.0.
+void SSE2_scale(const uint8_t *oldpx,
+		const uint16_t tgtw, const uint16_t tgth,
+		uint8_t *newpx,
+		const unsigned oldstride, const unsigned newstride,
+		const float tgtdiff);
+
+} // namespace rfb
+
+#endif
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit ba40cacce068fa35fc706c41605db14c04348170
+Subproject commit e0bb9f6bcf945da6cb10fd0eb48b63b48bf09bb8