diff src/filters/hq_shared32.cpp @ 27:b970226568d2

brought in filters package
author Robert McIntyre <rlm@mit.edu>
date Sun, 04 Mar 2012 20:32:31 -0600
parents f9f4f1b99eed
children
line wrap: on
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/filters/hq_shared32.cpp	Sun Mar 04 20:32:31 2012 -0600
     1.3 @@ -0,0 +1,389 @@
     1.4 +#include "../Port.h"
     1.5 +#include "hq_shared32.h"
     1.6 +#include "interp.h"
     1.7 +
     1.8 +const unsigned __int64 reg_blank = 0x0000000000000000; // all-zero quadword; zero operand for punpcklbw/packuswb in the MMX paths of Interp3 and Interp4
     1.9 +const unsigned __int64 const7	 = 0x0000000700070007; // 7 in each of the three low 16-bit lanes (top lane is 0); pmullw multiplier in the MMX paths of Interp3 and Interp4
    1.10 +const unsigned __int64 treshold	 = 0x0000000000300706; // not referenced in the visible part of this file; bytes 0x30/0x07/0x06 equal the limits loaded by Diff's MMX code. NOTE(review): name is a misspelling of "threshold"
    1.11 +
    1.12 +void Interp1(unsigned char *pc, unsigned int c1, unsigned int c2)
    1.13 +{
    1.14 +	//*((int*)pc) = (c1*3+c2)/4;
    1.15 +
    1.16 +#ifdef MMX
    1.17 +	__asm
    1.18 +	{
    1.19 +		mov eax, pc
    1.20 +		movd mm1, c1
    1.21 +		movd mm2, c2
    1.22 +		movq mm0, mm1
    1.23 +		pslld mm0, 2
    1.24 +		psubd mm0, mm1
    1.25 +		paddd mm0, mm2
    1.26 +		psrld mm0, 2
    1.27 +		movd    [eax], mm0
    1.28 +		    EMMS
    1.29 +	}
    1.30 +#else
    1.31 +	__asm
    1.32 +	{
    1.33 +		mov eax, pc
    1.34 +		mov edx, c1
    1.35 +		shl edx, 2
    1.36 +		add edx, c2
    1.37 +		sub edx, c1
    1.38 +		shr edx, 2
    1.39 +		mov        [eax], edx
    1.40 +	}
    1.41 +#endif
    1.42 +}
    1.43 +
    1.44 +void Interp2(unsigned char *pc, unsigned int c1, unsigned int c2, unsigned int c3)
    1.45 +{
    1.46 +	//*((int*)pc) = (c1*2+c2+c3)/4;
    1.47 +
    1.48 +#ifdef MMX
    1.49 +	__asm
    1.50 +	{
    1.51 +		mov eax, pc
    1.52 +		movd mm0, c1
    1.53 +		movd mm1, c2
    1.54 +		movd mm2, c3
    1.55 +		pslld mm0, 1
    1.56 +		paddd mm0, mm1
    1.57 +		paddd mm0, mm2
    1.58 +		psrad mm0, 2
    1.59 +		movd [eax], mm0
    1.60 +		    EMMS
    1.61 +	}
    1.62 +#else
    1.63 +	__asm
    1.64 +	{
    1.65 +		mov eax, pc
    1.66 +		mov edx, c1
    1.67 +		shl edx, 1
    1.68 +		add edx, c2
    1.69 +		add edx, c3
    1.70 +		shr edx, 2
    1.71 +		mov        [eax], edx
    1.72 +	}
    1.73 +#endif
    1.74 +}
    1.75 +
    1.76 +void Interp3(unsigned char *pc, unsigned int c1, unsigned int c2)
    1.77 +{
    1.78 +	//*((int*)pc) = (c1*7+c2)/8;
    1.79 +	//*((int*)pc) = ((((c1 & 0x00FF00)*7 + (c2 & 0x00FF00) ) & 0x0007F800) +
    1.80 +	//	            (((c1 & 0xFF00FF)*7 + (c2 & 0xFF00FF) ) & 0x07F807F8)) >> 3;
    1.81 +
    1.82 +#ifdef MMX
    1.83 +	__asm
    1.84 +	{
    1.85 +		mov eax, pc
    1.86 +		movd mm1, c1
    1.87 +		movd mm2, c2
    1.88 +		punpcklbw mm1, reg_blank
    1.89 +		punpcklbw mm2, reg_blank
    1.90 +		pmullw mm1, const7
    1.91 +		paddw mm1, mm2
    1.92 +		psrlw mm1, 3
    1.93 +		packuswb mm1, reg_blank
    1.94 +		    movd       [eax], mm1
    1.95 +		    EMMS
    1.96 +	}
    1.97 +#else
    1.98 +	__asm
    1.99 +	{
   1.100 +		mov eax, c1
   1.101 +		mov ebx, c2
   1.102 +		mov ecx, eax
   1.103 +		shl ecx, 3
   1.104 +		sub ecx, eax
   1.105 +		add ecx, ebx
   1.106 +		shr ecx, 3
   1.107 +		mov eax, pc
   1.108 +		    mov     [eax], ecx
   1.109 +	}
   1.110 +#endif
   1.111 +}
   1.112 +
   1.113 +void Interp4(unsigned char *pc, unsigned int c1, unsigned int c2, unsigned int c3)
   1.114 +{
   1.115 +	//*((int*)pc) = (c1*2+(c2+c3)*7)/16;
   1.116 +	//*((int*)pc) = ((((c1 & 0x00FF00)*2 + ((c2 & 0x00FF00) + (c3 & 0x00FF00))*7 ) & 0x000FF000) +
   1.117 +	//              (((c1 & 0xFF00FF)*2 + ((c2 & 0xFF00FF) + (c3 & 0xFF00FF))*7 ) & 0x0FF00FF0)) >> 4;
   1.118 +
   1.119 +#ifdef MMX
   1.120 +	__asm
   1.121 +	{
   1.122 +		mov eax, pc
   1.123 +		movd mm1, c1
   1.124 +		movd mm2, c2
   1.125 +		movd mm3, c3
   1.126 +		punpcklbw mm1, reg_blank
   1.127 +		punpcklbw mm2, reg_blank
   1.128 +		punpcklbw mm3, reg_blank
   1.129 +		psllw mm1, 1
   1.130 +		paddw mm2, mm3
   1.131 +		pmullw mm2, const7
   1.132 +		paddw mm1, mm2
   1.133 +		psrlw mm1, 4
   1.134 +		packuswb mm1, reg_blank
   1.135 +		    movd       [eax], mm1
   1.136 +		    EMMS
   1.137 +	}
   1.138 +#else
   1.139 +
   1.140 +	__asm
   1.141 +	{
   1.142 +		mov eax, [c1]
   1.143 +		and     eax, 0FF00h
   1.144 +		shl eax, 1
   1.145 +		mov ecx, [c2]
   1.146 +		and     ecx, 0FF00h
   1.147 +		mov edx, [c3]
   1.148 +		and     edx, 0FF00h
   1.149 +		add ecx, edx
   1.150 +		imul ecx, ecx, 7
   1.151 +		add eax, ecx
   1.152 +		and     eax, 0FF000h
   1.153 +
   1.154 +		mov ebx, [c1]
   1.155 +		and     ebx, 0FF00FFh
   1.156 +		shl ebx, 1
   1.157 +		mov ecx, [c2]
   1.158 +		and     ecx, 0FF00FFh
   1.159 +		mov edx, [c3]
   1.160 +		and     edx, 0FF00FFh
   1.161 +		add ecx, edx
   1.162 +		imul ecx, ecx, 7
   1.163 +		add ebx, ecx
   1.164 +		and     ebx, 0FF00FF0h
   1.165 +
   1.166 +		add eax, ebx
   1.167 +		shr eax, 4
   1.168 +
   1.169 +		mov ebx, pc
   1.170 +		    mov     [ebx], eax
   1.171 +	}
   1.172 +#endif
   1.173 +}
   1.174 +
   1.175 +void Interp5(unsigned char *pc, unsigned int c1, unsigned int c2)
   1.176 +{
   1.177 +	//*((int*)pc) = (c1+c2)/2;
   1.178 +
   1.179 +#ifdef MMX
   1.180 +	__asm
   1.181 +	{
   1.182 +		mov eax, pc
   1.183 +		movd mm0, c1
   1.184 +		movd mm1, c2
   1.185 +		paddd mm0, mm1
   1.186 +		psrad mm0, 1
   1.187 +		movd    [eax], mm0
   1.188 +		    EMMS
   1.189 +	}
   1.190 +#else
   1.191 +	__asm
   1.192 +	{
   1.193 +		mov eax, pc
   1.194 +		mov edx, c1
   1.195 +		add edx, c2
   1.196 +		shr edx, 1
   1.197 +		mov        [eax], edx
   1.198 +	}
   1.199 +#endif
   1.200 +}
   1.201 +
   1.202 +void Interp1_16(unsigned char *pc, unsigned short c1, unsigned short c2)
   1.203 +{
   1.204 +	*((unsigned short *)pc) = interp_16_31(c1, c2);
   1.205 +	//*((int*)pc) = (c1*3+c2)/4;
   1.206 +}
   1.207 +
   1.208 +void Interp2_16(unsigned char *pc, unsigned short c1, unsigned short c2, unsigned short c3)
   1.209 +{
   1.210 +	*((unsigned short *)pc) = interp_16_211(c1, c2, c3);
   1.211 +	//*((int*)pc) = (c1*2+c2+c3)/4;
   1.212 +}
   1.213 +
   1.214 +void Interp3_16(unsigned char *pc, unsigned short c1, unsigned short c2)
   1.215 +{
   1.216 +	*((unsigned short *)pc) = interp_16_71(c1, c2);
   1.217 +//	*((unsigned short*)pc) = (c1*7+c2)/8;
   1.218 +//	*((unsigned short*)pc) = ((((c1 & 0x00FF00)*7 + (c2 & 0x00FF00) ) & 0x0007F800) +
   1.219 +//		            (((c1 & 0xFF00FF)*7 + (c2 & 0xFF00FF) ) & 0x07F807F8)) >> 3;
   1.220 +}
   1.221 +
   1.222 +void Interp4_16(unsigned char *pc, unsigned short c1, unsigned short c2, unsigned short c3)
   1.223 +{
   1.224 +	*((unsigned short *)pc) = interp_16_772(c2, c3, c1);
   1.225 +//	*((unsigned short*)pc) = (c1*2+(c2+c3)*7)/16;
   1.226 +//	*((unsigned short*)pc) = ((((c1 & 0x00FF00)*2 + ((c2 & 0x00FF00) + (c3 & 0x00FF00))*7 ) & 0x000FF000) +
   1.227 +//	              (((c1 & 0xFF00FF)*2 + ((c2 & 0xFF00FF) + (c3 & 0xFF00FF))*7 ) & 0x0FF00FF0)) >> 4;
   1.228 +}
   1.229 +
   1.230 +void Interp5_16(unsigned char *pc, unsigned short c1, unsigned short c2)
   1.231 +{
   1.232 +	*((unsigned short *)pc) = interp_16_11(c1, c2);
   1.233 +}
   1.234 +
   1.235 +bool Diff(unsigned int c1, unsigned int c2)
   1.236 +{
   1.237 +	unsigned int
   1.238 +	    YUV1 = RGBtoYUV(c1),
   1.239 +	    YUV2 = RGBtoYUV(c2);
   1.240 +
   1.241 +	if (YUV1 == YUV2) return false;  // Save some processing power
   1.242 +
   1.243 +#ifdef MMX
   1.244 +	unsigned int retval;
   1.245 +	__asm
   1.246 +	{
   1.247 +		mov eax, 0x7FFFFFFF
   1.248 +		movd mm7, eax; mm7 = ABS_MASK = 0x7FFFFFFF
   1.249 +
   1.250 +		; Copy source colors in first reg
   1.251 +		movd mm0, YUV1
   1.252 +		movd mm1, YUV2
   1.253 +
   1.254 +		mov eax, 0x00FF0000
   1.255 +		movd mm6, eax; mm6 = Ymask = 0x00FF0000
   1.256 +
   1.257 +		; Calculate color Y difference
   1.258 +		movq mm2, mm0
   1.259 +		movq mm3, mm1
   1.260 +		pand mm2, mm6
   1.261 +		pand mm3, mm6
   1.262 +		psubd mm2, mm3
   1.263 +		pand mm2, mm7
   1.264 +
   1.265 +		mov eax, 0x0000FF00
   1.266 +		movd mm6, eax; mm6 = Umask = 0x0000FF00
   1.267 +
   1.268 +		; Calculate color U difference
   1.269 +		movq mm3, mm0
   1.270 +		movq mm4, mm1
   1.271 +		pand mm3, mm6
   1.272 +		pand mm4, mm6
   1.273 +		psubd mm3, mm4
   1.274 +		pand mm3, mm7
   1.275 +
   1.276 +		mov eax, 0x000000FF
   1.277 +		movd mm6, eax; mm6 = Vmask = 0x000000FF
   1.278 +
   1.279 +		; Calculate color V difference
   1.280 +		movq mm4, mm0
   1.281 +		movq mm5, mm1
   1.282 +		pand mm4, mm6
   1.283 +		pand mm5, mm6
   1.284 +		psubd mm4, mm5
   1.285 +		pand mm4, mm7
   1.286 +
   1.287 +		mov eax, 0x00300000
   1.288 +		movd mm5, eax; mm5 = trY = 0x00300000
   1.289 +		                           mov eax, 0x00000700
   1.290 +		movd mm6, eax; mm6 = trU = 0x00000700
   1.291 +		                           mov eax, 0x00000006
   1.292 +		movd mm7, eax; mm7 = trV = 0x00000006
   1.293 +
   1.294 +		; Compare the results
   1.295 +		pcmpgtd mm2, trY
   1.296 +		pcmpgtd mm3, trU
   1.297 +		pcmpgtd mm4, trV
   1.298 +		por mm2, mm3
   1.299 +		por mm2, mm4
   1.300 +
   1.301 +		movd retval, mm2
   1.302 +
   1.303 +		        EMMS
   1.304 +	}
   1.305 +	return (retval != 0);
   1.306 +#else
   1.307 +	return
   1.308 +	    (abs32((YUV1 & Ymask) - (YUV2 & Ymask)) > trY) ||
   1.309 +	    (abs32((YUV1 & Umask) - (YUV2 & Umask)) > trU) ||
   1.310 +	    (abs32((YUV1 & Vmask) - (YUV2 & Vmask)) > trV);
   1.311 +#endif
   1.312 +}
   1.313 +
   1.314 +unsigned int RGBtoYUV(unsigned int c)
   1.315 +{   // Division through 3 slows down the emulation about 10% !!!
   1.316 +#ifdef MMX
   1.317 +	unsigned int retval;
   1.318 +	__asm
   1.319 +	{
   1.320 +		movd mm0, c
   1.321 +		movq mm1, mm0
   1.322 +		movq mm2, mm0; mm0 = mm1 = mm2 = c
   1.323 +
   1.324 +		                                 mov eax, 0x000000FF
   1.325 +		movd mm5, eax; mm5 = REDMASK = 0x000000FF
   1.326 +		                               mov eax, 0x0000FF00
   1.327 +		movd mm6, eax; mm6 = GREENMASK = 0x0000FF00
   1.328 +		                                 mov eax, 0x00FF0000
   1.329 +		movd mm7, eax; mm7 = BLUEMASK = 0x00FF0000
   1.330 +
   1.331 +		                                pand mm0, mm5
   1.332 +		pand mm1, mm6
   1.333 +		pand mm2, mm7; mm0 = R mm1 = G mm2 = B
   1.334 +
   1.335 +		                                     movq mm3, mm0
   1.336 +		paddd mm3, mm1
   1.337 +		paddd mm3, mm2
   1.338 +		;       psrld mm3, 2; mm3 = Y
   1.339 +		;       pslld mm3, 16
   1.340 +		pslld mm3, 14; mm3 = Y << 16
   1.341 +
   1.342 +		                     mov eax, 512
   1.343 +		movd mm7, eax; mm7 = 128 << 2 = 512
   1.344 +
   1.345 +		                                movq mm4, mm0
   1.346 +		psubd mm4, mm2
   1.347 +		;       psrld mm4, 2
   1.348 +		;       paddd mm4, mm7; mm4 = U
   1.349 +		;       pslld mm4, 8; mm4 = U << 8
   1.350 +		                            paddd mm4, mm7
   1.351 +		pslld mm4, 6
   1.352 +
   1.353 +		mov eax, 128
   1.354 +		movd mm7, eax; mm7 = 128
   1.355 +
   1.356 +		                     movq mm5, mm1
   1.357 +		pslld mm5, 1
   1.358 +		psubd mm5, mm0
   1.359 +		psubd mm5, mm2
   1.360 +		psrld mm5, 3
   1.361 +		paddd mm5, mm7; mm5 = V
   1.362 +
   1.363 +		                      paddd mm5, mm4
   1.364 +		paddd mm5, mm3
   1.365 +
   1.366 +		movd retval, mm5
   1.367 +
   1.368 +		EMMS
   1.369 +	}
   1.370 +	return retval;
   1.371 +#else
   1.372 +	unsigned char r, g, b, Y, u, v;
   1.373 +	r = (c & 0x000000FF);
   1.374 +	g = (c & 0x0000FF00) >> 8;
   1.375 +	b = (c & 0x00FF0000) >> 16;
   1.376 +	Y = (r + g + b) >> 2;
   1.377 +	u = 128 + ((r - b) >> 2);
   1.378 +	v = 128 + ((-r + 2 * g - b) >> 3);
   1.379 +	return (Y << 16) + (u << 8) + v;
   1.380 +
   1.381 +	// Extremely High Quality Code
   1.382 +	//unsigned char r, g, b;
   1.383 +	//r = c & 0xFF;
   1.384 +	//g = (c >> 8) & 0xFF;
   1.385 +	//b = (c >> 16) & 0xFF;
   1.386 +	//unsigned char y, u, v;
   1.387 +	//y = (0.256788 * r  +  0.504129 * g  +  0.097906 * b) + 16;
   1.388 +	//u = (-0.148223 * r  -  0.290993 * g  +  0.439216 * b) + 128;
   1.389 +	//v = (0.439216 * r  -  0.367788 * g  -  0.071427 * b) + 128;
   1.390 +	//return (y << 16) + (u << 8) + v;
   1.391 +#endif
   1.392 +}
   1.393 \ No newline at end of file