parent
							
								
									dc21d5f97c
								
							
						
					
					
						commit
						0cb2c0ba9f
					
				| @ -0,0 +1,70 @@ | ||||
| /* Copyright (C) 2021 Kasm Web
 | ||||
|  * | ||||
|  * This is free software; you can redistribute it and/or modify | ||||
|  * it under the terms of the GNU General Public License as published by | ||||
|  * the Free Software Foundation; either version 2 of the License, or | ||||
|  * (at your option) any later version. | ||||
|  * | ||||
|  * This software is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|  * GNU General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU General Public License | ||||
|  * along with this software; if not, write to the Free Software | ||||
|  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, | ||||
|  * USA. | ||||
|  */ | ||||
| 
 | ||||
| #include <stdint.h> | ||||
| 
 | ||||
// Cached CPUID register values, filled in lazily by getcpuid().
// Each array holds EAX, EBX, ECX, EDX in that order.
static uint32_t cpuid[4] = { 0 };    // leaf 1: normal feature bits
static uint32_t extcpuid[4] = { 0 }; // leaf 7, subleaf 0: ext feature bits

#if defined(__x86_64__) || defined(__i386__)
// Runs the CPUID instruction for the given leaf/subleaf and stores
// EAX, EBX, ECX, EDX into out[0..3].
static void runcpuid(uint32_t leaf, uint32_t subleaf, uint32_t out[4]) {
	__asm__ __volatile__(
		"cpuid\n\t"
		: "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
		: "0"(leaf), "2"(subleaf)
	);
}
#endif

// Queries the CPU feature bits once and caches them in cpuid/extcpuid.
// On non-x86 builds this does nothing and both arrays stay zeroed.
static void getcpuid() {
	// cpuid[0] doubles as the "already queried" flag
	if (cpuid[0])
		return;

#if defined(__x86_64__) || defined(__i386__)
	uint32_t basic[4];

	// Leaf 0 reports the highest supported basic leaf in EAX. Asking for
	// a leaf above it is not guaranteed to return zeros (Intel CPUs
	// answer with the data of the highest basic leaf), so check first.
	runcpuid(0, 0, basic);
	const uint32_t maxleaf = basic[0];

	if (maxleaf < 1)
		return;
	runcpuid(1, 0, cpuid); // normal feature bits

	if (maxleaf >= 7)
		runcpuid(7, 0, extcpuid); // ext feature bits
#endif
}
| 
 | ||||
| namespace rfb { | ||||
| 
 | ||||
| bool supportsSSE2() { | ||||
| 	getcpuid(); | ||||
| #if defined(__x86_64__) || defined(__i386__) | ||||
| 	#define bit_SSE2        (1 << 26) | ||||
| 	return cpuid[3] & bit_SSE2; | ||||
| #endif | ||||
| 	return false; | ||||
| } | ||||
| 
 | ||||
| bool supportsAVX512f() { | ||||
| 	getcpuid(); | ||||
| #if defined(__x86_64__) || defined(__i386__) | ||||
| 	#define bit_AVX512f        (1 << 16) | ||||
| 	return extcpuid[1] & bit_AVX512f; | ||||
| #endif | ||||
| 	return false; | ||||
| } | ||||
| 
 | ||||
| }; // namespace rfb
 | ||||
| @ -0,0 +1,28 @@ | ||||
| /* Copyright (C) 2021 Kasm Web
 | ||||
|  * | ||||
|  * This is free software; you can redistribute it and/or modify | ||||
|  * it under the terms of the GNU General Public License as published by | ||||
|  * the Free Software Foundation; either version 2 of the License, or | ||||
|  * (at your option) any later version. | ||||
|  * | ||||
|  * This software is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|  * GNU General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU General Public License | ||||
|  * along with this software; if not, write to the Free Software | ||||
|  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, | ||||
|  * USA. | ||||
|  */ | ||||
| 
 | ||||
| #ifndef __RFB_CPUID_H__ | ||||
| #define __RFB_CPUID_H__ | ||||
| 
 | ||||
| namespace rfb { | ||||
| 
 | ||||
// Reports SSE2 support as read from CPUID leaf 1 (EDX bit 26) on
// x86/x86-64 builds; returns false on other architectures.
| 	bool supportsSSE2(); | ||||
// Reports AVX-512 Foundation support, derived from CPUID leaf 7
// (EBX bit 16) on x86/x86-64 builds; returns false on other architectures.
| 	bool supportsAVX512f(); | ||||
| }; | ||||
| 
 | ||||
| #endif | ||||
| @ -0,0 +1,37 @@ | ||||
| /* Copyright (C) 2021 Kasm Web
 | ||||
|  * | ||||
|  * This is free software; you can redistribute it and/or modify | ||||
|  * it under the terms of the GNU General Public License as published by | ||||
|  * the Free Software Foundation; either version 2 of the License, or | ||||
|  * (at your option) any later version. | ||||
|  * | ||||
|  * This software is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|  * GNU General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU General Public License | ||||
|  * along with this software; if not, write to the Free Software | ||||
|  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, | ||||
|  * USA. | ||||
|  */ | ||||
| 
 | ||||
| #include <rfb/scale_sse2.h> | ||||
| 
 | ||||
| namespace rfb { | ||||
| 
 | ||||
// Empty stand-in for SSE2_halve: the body does nothing, so newpx is
// left untouched and every argument is ignored.
// NOTE(review): presumably built instead of the SSE2 implementation on
// targets without SSE2 - confirm against the build system.
| void SSE2_halve(const uint8_t *oldpx, | ||||
| 			const uint16_t tgtw, const uint16_t tgth, | ||||
| 			uint8_t *newpx, | ||||
| 			const unsigned oldstride, const unsigned newstride) { | ||||
| } | ||||
| 
 | ||||
| // Handles factors between 0.5 and 1.0
 | ||||
// Empty stand-in for SSE2_scale: the body does nothing, so newpx is
// left untouched and every argument (including tgtdiff) is ignored.
// NOTE(review): presumably built instead of the SSE2 implementation on
// targets without SSE2 - confirm against the build system.
| void SSE2_scale(const uint8_t *oldpx, | ||||
| 		const uint16_t tgtw, const uint16_t tgth, | ||||
| 		uint8_t *newpx, | ||||
| 		const unsigned oldstride, const unsigned newstride, | ||||
| 		const float tgtdiff) { | ||||
| } | ||||
| 
 | ||||
| }; // namespace rfb
 | ||||
| @ -0,0 +1,257 @@ | ||||
| /* Copyright (C) 2021 Kasm Web
 | ||||
|  * | ||||
|  * This is free software; you can redistribute it and/or modify | ||||
|  * it under the terms of the GNU General Public License as published by | ||||
|  * the Free Software Foundation; either version 2 of the License, or | ||||
|  * (at your option) any later version. | ||||
|  * | ||||
|  * This software is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|  * GNU General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU General Public License | ||||
|  * along with this software; if not, write to the Free Software | ||||
|  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, | ||||
|  * USA. | ||||
|  */ | ||||
| 
 | ||||
| #include <emmintrin.h> | ||||
| 
 | ||||
| #include <rfb/scale_sse2.h> | ||||
| 
 | ||||
| namespace rfb { | ||||
| 
 | ||||
| /*
 | ||||
| static void print128(const char msg[], const __m128i v) { | ||||
| 	union { | ||||
| 		__m128i v; | ||||
| 		uint8_t c[16]; | ||||
| 	} u; | ||||
| 
 | ||||
| 	u.v = v; | ||||
| 
 | ||||
| 	printf("%s %02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x,%02x\n", | ||||
| 		msg, | ||||
| 		u.c[0], | ||||
| 		u.c[1], | ||||
| 		u.c[2], | ||||
| 		u.c[3], | ||||
| 		u.c[4], | ||||
| 		u.c[5], | ||||
| 		u.c[6], | ||||
| 		u.c[7], | ||||
| 		u.c[8], | ||||
| 		u.c[9], | ||||
| 		u.c[10], | ||||
| 		u.c[11], | ||||
| 		u.c[12], | ||||
| 		u.c[13], | ||||
| 		u.c[14], | ||||
| 		u.c[15]); | ||||
| } | ||||
| */ | ||||
| 
 | ||||
// Downscales a 4-bytes-per-pixel image to half size with SSE2.
// Each target pixel is the per-channel average of a 2x2 block of source
// pixels (sum of four values, divided by 4, rounded down).
//
// oldpx     source pixels; (tgtw * 2) x (tgth * 2) pixels are read
// tgtw/tgth target width and height in pixels
// newpx     target pixels; exactly tgtw x tgth pixels are written
// oldstride source row pitch in pixels (multiplied by 4 to get bytes)
// newstride target row pitch in pixels (multiplied by 4 to get bytes)
void SSE2_halve(const uint8_t *oldpx,
			const uint16_t tgtw, const uint16_t tgth,
			uint8_t *newpx,
			const unsigned oldstride, const unsigned newstride) {
	// Plain unsigned: doubling a uint16_t size >= 32768 would wrap
	const unsigned srcw = tgtw * 2u, srch = tgth * 2u;
	unsigned x, y;
	const __m128i zero = _mm_setzero_si128();
	const __m128i low = _mm_set_epi32(0, 0, -1, -1);  // keeps the low 64 bits
	const __m128i high = _mm_set_epi32(-1, -1, 0, 0); // keeps the high 64 bits

	for (y = 0; y < srch; y += 2) {
		const uint8_t * const row0 = oldpx + oldstride * y * 4;
		const uint8_t * const row1 = oldpx + oldstride * (y + 1) * 4;

		uint8_t * const dst = newpx + newstride * (y / 2) * 4;

		// Four source pixels (two target pixels) per iteration. Only run
		// while a full group of four remains, so no load or store ever
		// goes past the end of the row; the tail is handled below.
		for (x = 0; x + 4 <= srcw; x += 4) {
			__m128i lo, hi, a, b, c, d;
			lo = _mm_loadu_si128((const __m128i *) &row0[x * 4]);
			hi = _mm_loadu_si128((const __m128i *) &row1[x * 4]);

			// Widen the bytes to 16 bits so the sums cannot overflow
			a = _mm_unpacklo_epi8(lo, zero); // row0 pixels 0, 1
			b = _mm_unpackhi_epi8(lo, zero); // row0 pixels 2, 3
			c = _mm_unpacklo_epi8(hi, zero); // row1 pixels 0, 1
			d = _mm_unpackhi_epi8(hi, zero); // row1 pixels 2, 3

			// Vertical sums
			a = _mm_add_epi16(a, c);
			b = _mm_add_epi16(b, d);

			// Horizontal sum of pixels 0 + 1 into the low half of a
			c = _mm_srli_si128(a, 8);
			a = _mm_and_si128(a, low);
			a = _mm_add_epi16(a, c);

			// Horizontal sum of pixels 2 + 3 into the high half of b
			d = _mm_slli_si128(b, 8);
			b = _mm_and_si128(b, high);
			b = _mm_add_epi16(b, d);

			a = _mm_add_epi16(a, b);

			// Divide by four and narrow back to bytes
			a = _mm_srli_epi16(a, 2);
			a = _mm_packus_epi16(a, zero);

			_mm_storel_epi64((__m128i *) &dst[(x / 2) * 4], a);
		}

		for (; x < srcw; x += 2) {
			// Remainder in C (one target pixel when tgtw is odd)
			uint8_t i;
			for (i = 0; i < 4; i++) {
				dst[(x / 2) * 4 + i] =
					(row0[x * 4 + i] +
					row0[(x + 1) * 4 + i] +
					row1[x * 4 + i] +
					row1[(x + 1) * 4 + i]) / 4;
			}
		}
	}
}
| 
 | ||||
| // Handles factors between 0.5 and 1.0
 | ||||
// Bilinear downscale of a 4-bytes-per-pixel image with SSE2, using
// 8-bit fixed-point weights (0..256).
//
// oldpx     source pixels, read as 32-bit words (so 4-byte aligned)
// tgtw/tgth target width and height in pixels
// newpx     target pixels; exactly tgtw x tgth pixels are written
// oldstride source row pitch in pixels (multiplied by 4 to get bytes)
// newstride target row pitch in pixels (multiplied by 4 to get bytes)
// tgtdiff   scale factor; a target coordinate is divided by it to get
//           the source coordinate
//
// Every sample reads source row lowy + 1 and column lowx + 1 even when
// that neighbour's weight is zero, so the source must be readable one
// row and one column past the last sampled position.
void SSE2_scale(const uint8_t *oldpx,
		const uint16_t tgtw, const uint16_t tgth,
		uint8_t *newpx,
		const unsigned oldstride, const unsigned newstride,
		const float tgtdiff) {

	// Plain unsigned counters: a uint16_t one stepping by 2 wraps to 0
	// before reaching tgtw == 65535 and the loop would never end
	unsigned x, y;
	const __m128i zero = _mm_setzero_si128();
	const __m128i low = _mm_set_epi32(0, 0, -1, -1);  // keeps the low 64 bits
	const __m128i high = _mm_set_epi32(-1, -1, 0, 0); // keeps the high 64 bits
	const float invdiff = 1 / tgtdiff;

	for (y = 0; y < tgth; y++) {
		// Source row pair and their vertical weights (top + bot == 256)
		const float ny = y * invdiff;
		const uint16_t lowy = ny;
		const uint16_t highy = lowy + 1;
		const uint16_t bot = (ny - lowy) * 256;
		const uint16_t top = 256 - bot;
		const uint32_t * const row0 = (const uint32_t *) (oldpx + oldstride * lowy * 4);
		const uint32_t * const row1 = (const uint32_t *) (oldpx + oldstride * highy * 4);
		const uint8_t * const brow0 = (const uint8_t *) row0;
		const uint8_t * const brow1 = (const uint8_t *) row1;

		uint8_t * const dst = newpx + newstride * y * 4;

		const __m128i vertmul = _mm_set1_epi16(top);
		const __m128i vertmul2 = _mm_set1_epi16(bot);

		// Two target pixels per iteration. Only run while a full pair
		// remains, so the 8-byte store never goes past the end of the
		// row; an odd last pixel is handled below.
		for (x = 0; x + 1 < tgtw; x += 2) {
			const float nx[2] = {
				x * invdiff,
				(x + 1) * invdiff,
			};
			const uint16_t lowx[2] =  {
				(uint16_t) nx[0],
				(uint16_t) nx[1],
			};
			const uint16_t highx[2] = {
				(uint16_t) (lowx[0] + 1),
				(uint16_t) (lowx[1] + 1),
			};
			// Horizontal weights (left + right == 256)
			const uint16_t right[2] = {
				(uint16_t) ((nx[0] - lowx[0]) * 256),
				(uint16_t) ((nx[1] - lowx[1]) * 256),
			};
			const uint16_t left[2] = {
				(uint16_t) (256 - right[0]),
				(uint16_t) (256 - right[1]),
			};

			// _mm_set_epi16 takes lanes high to low: the low four lanes
			// weight the left pixel, the high four the right pixel
			const __m128i horzmul = _mm_set_epi16(
				right[0],
				right[0],
				right[0],
				right[0],
				left[0],
				left[0],
				left[0],
				left[0]
			);
			const __m128i horzmul2 = _mm_set_epi16(
				right[1],
				right[1],
				right[1],
				right[1],
				left[1],
				left[1],
				left[1],
				left[1]
			);

			// Gather left/right neighbours of both target pixels
			__m128i lo, hi, a, b, c, d;
			lo = _mm_setr_epi32(row0[lowx[0]],
						row0[highx[0]],
						row0[lowx[1]],
						row0[highx[1]]);
			hi = _mm_setr_epi32(row1[lowx[0]],
						row1[highx[0]],
						row1[lowx[1]],
						row1[highx[1]]);

			// Widen the bytes to 16 bits
			a = _mm_unpacklo_epi8(lo, zero);
			b = _mm_unpackhi_epi8(lo, zero);
			c = _mm_unpacklo_epi8(hi, zero);
			d = _mm_unpackhi_epi8(hi, zero);

			// Vertical blend: (row0 * top + row1 * bot) >> 8
			a = _mm_mullo_epi16(a, vertmul);
			b = _mm_mullo_epi16(b, vertmul);
			c = _mm_mullo_epi16(c, vertmul2);
			d = _mm_mullo_epi16(d, vertmul2);

			a = _mm_add_epi16(a, c);
			a = _mm_srli_epi16(a, 8);
			b = _mm_add_epi16(b, d);
			b = _mm_srli_epi16(b, 8);

			// Horizontal blend: weight, then fold the left/right halves
			a = _mm_mullo_epi16(a, horzmul);
			b = _mm_mullo_epi16(b, horzmul2);

			// First target pixel ends up in the low half of a
			lo = _mm_srli_si128(a, 8);
			a = _mm_and_si128(a, low);
			a = _mm_add_epi16(a, lo);

			// Second target pixel ends up in the high half of b
			hi = _mm_slli_si128(b, 8);
			b = _mm_and_si128(b, high);
			b = _mm_add_epi16(b, hi);

			a = _mm_add_epi16(a, b);
			a = _mm_srli_epi16(a, 8);

			a = _mm_packus_epi16(a, zero);

			_mm_storel_epi64((__m128i *) &dst[x * 4], a);
		}

		for (; x < tgtw; x++) {
			// Remainder in C (the last pixel when tgtw is odd)
			const float nx = x * invdiff;
			const uint16_t lowx = nx;
			const uint16_t highx = lowx + 1;
			const uint16_t right = (nx - lowx) * 256;
			const uint16_t left = 256 - right;

			uint8_t i;
			uint32_t val, val2;
			for (i = 0; i < 4; i++) {
				val = brow0[lowx * 4 + i] * left;
				val += brow0[highx * 4 + i] * right;
				val >>= 8;

				val2 = brow1[lowx * 4 + i] * left;
				val2 += brow1[highx * 4 + i] * right;
				val2 >>= 8;

				dst[x * 4 + i] =
					(val * top + val2 * bot) >> 8;
			}
		}
	}
}
| 
 | ||||
| }; // namespace rfb
 | ||||
| @ -0,0 +1,38 @@ | ||||
| /* Copyright (C) 2021 Kasm Web
 | ||||
|  * | ||||
|  * This is free software; you can redistribute it and/or modify | ||||
|  * it under the terms of the GNU General Public License as published by | ||||
|  * the Free Software Foundation; either version 2 of the License, or | ||||
|  * (at your option) any later version. | ||||
|  * | ||||
|  * This software is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|  * GNU General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU General Public License | ||||
|  * along with this software; if not, write to the Free Software | ||||
|  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, | ||||
|  * USA. | ||||
|  */ | ||||
| 
 | ||||
| #ifndef __RFB_SCALE_SSE2_H__ | ||||
| #define __RFB_SCALE_SSE2_H__ | ||||
| 
 | ||||
| #include <stdint.h> | ||||
| 
 | ||||
| namespace rfb { | ||||
| 
 | ||||
// Halves a 4-bytes-per-pixel image: each target pixel is the per-channel
// average of a 2x2 block of source pixels.
// tgtw/tgth are the target size in pixels; oldstride/newstride are the
// source/target row pitches in pixels.
| 	void SSE2_halve(const uint8_t *oldpx, | ||||
| 			const uint16_t tgtw, const uint16_t tgth, | ||||
| 			uint8_t *newpx, | ||||
| 			const unsigned oldstride, const unsigned newstride); | ||||
| 
 | ||||
// Bilinearly scales a 4-bytes-per-pixel image; intended for factors
// between 0.5 and 1.0.
// tgtw/tgth are the target size in pixels; oldstride/newstride are the
// source/target row pitches in pixels; tgtdiff is the scale factor
// (target coordinates are divided by it to find the source position).
| 	void SSE2_scale(const uint8_t *oldpx, | ||||
| 			const uint16_t tgtw, const uint16_t tgth, | ||||
| 			uint8_t *newpx, | ||||
| 			const unsigned oldstride, const unsigned newstride, | ||||
| 			const float tgtdiff); | ||||
| }; | ||||
| 
 | ||||
| #endif | ||||
| @ -1 +1 @@ | ||||
| Subproject commit ba40cacce068fa35fc706c41605db14c04348170 | ||||
| Subproject commit e0bb9f6bcf945da6cb10fd0eb48b63b48bf09bb8 | ||||
					Loading…
					
					
				
		Reference in New Issue
	
	 mmcclaskey
						mmcclaskey