annotate src/filters/hq_shared32.cpp @ 308:de172acc5a03

moved the memory manipulation functions out of world.practice and into a separate location, gb.mem-utils, to avoid cyclic load dependency. will adjust the dependent files shortly.
author Dylan Holmes <ocsenave@gmail.com>
date Sat, 31 Mar 2012 04:25:49 -0500
parents f9f4f1b99eed
children
rev   line source
rlm@1 1 #include "../Port.h"
rlm@1 2 #include "hq_shared32.h"
rlm@1 3 #include "interp.h"
rlm@1 4
rlm@1 5 const unsigned __int64 reg_blank = 0x0000000000000000;
rlm@1 6 const unsigned __int64 const7 = 0x0000000700070007;
rlm@1 7 const unsigned __int64 treshold = 0x0000000000300706;
rlm@1 8
// Blend two packed 32-bit pixels with weights 3:1 and store the result at pc.
//
//   *pc = ((c1 * 3 + c2) mod 2^32) >> 2
//
// pc  destination; four bytes are written.
// c1  pixel with weight 3.
// c2  pixel with weight 1.
//
// The arithmetic is done on the whole 32-bit word, not per byte, so the two
// low bits of each byte sum are shifted into the byte below it. Both code
// paths behave the same way in this respect.
// NOTE(review): the C path stores through an unsigned int pointer and so
// assumes pc is suitably aligned for that -- confirm against callers.
void Interp1(unsigned char *pc, unsigned int c1, unsigned int c2)
{
#ifdef MMX
    __asm
    {
        mov   eax, pc
        movd  mm1, c1
        movd  mm2, c2
        movq  mm0, mm1
        pslld mm0, 2        // mm0 = c1 * 4
        psubd mm0, mm1      // mm0 = c1 * 3
        paddd mm0, mm2      // mm0 = c1 * 3 + c2
        psrld mm0, 2        // logical shift: divide by 4
        movd  [eax], mm0
        EMMS
    }
#else
    // Plain C replaces the former x86 inline assembly; unsigned arithmetic
    // wraps modulo 2^32 exactly like the shl/add/sub/shr sequence did.
    *((unsigned int *)pc) = (c1 * 3 + c2) >> 2;
#endif
}
rlm@1 40
// Blend three packed 32-bit pixels with weights 2:1:1 and store the result
// at pc.
//
//   *pc = ((c1 * 2 + c2 + c3) mod 2^32) >> 2
//
// pc      destination; four bytes are written.
// c1      pixel with weight 2.
// c2, c3  pixels with weight 1 each.
//
// The arithmetic is done on the whole 32-bit word, not per byte. The shift
// is logical in both code paths: the MMX path previously used an arithmetic
// shift (psrad), which sign-extended the result whenever bit 31 of the sum
// was set and so disagreed with the non-MMX path.
// NOTE(review): the C path stores through an unsigned int pointer and so
// assumes pc is suitably aligned for that -- confirm against callers.
void Interp2(unsigned char *pc, unsigned int c1, unsigned int c2, unsigned int c3)
{
#ifdef MMX
    __asm
    {
        mov   eax, pc
        movd  mm0, c1
        movd  mm1, c2
        movd  mm2, c3
        pslld mm0, 1        // mm0 = c1 * 2
        paddd mm0, mm1
        paddd mm0, mm2      // mm0 = c1 * 2 + c2 + c3
        psrld mm0, 2        // logical shift, matching the C path
        movd  [eax], mm0
        EMMS
    }
#else
    // Plain C replaces the former x86 inline assembly (shl/add/add/shr).
    *((unsigned int *)pc) = ((c1 << 1) + c2 + c3) >> 2;
#endif
}
rlm@1 72
// Blend two packed 32-bit pixels with weights 7:1, channel by channel, and
// store the result at pc.
//
// For each of the three low bytes: out = (c1_byte * 7 + c2_byte) >> 3.
//
// pc  destination; four bytes are written.
// c1  pixel with weight 7.
// c2  pixel with weight 1.
//
// The non-MMX path used to compute (c1 * 7 + c2) >> 3 on the whole word,
// which let the low bits of one channel spill into the channel below and
// disagreed with the MMX path. It now uses the masked per-channel formula.
//
// NOTE(review): the top byte differs between the paths. The C path writes 0
// there; in the MMX path the top word of const7 is 0, so that byte comes out
// as (top byte of c2) >> 3. Confirm no caller depends on the top byte.
// NOTE(review): the C path stores through an unsigned int pointer and so
// assumes pc is suitably aligned for that -- confirm against callers.
void Interp3(unsigned char *pc, unsigned int c1, unsigned int c2)
{
#ifdef MMX
    __asm
    {
        mov       eax, pc
        movd      mm1, c1
        movd      mm2, c2
        punpcklbw mm1, reg_blank    // widen each byte to a 16-bit word
        punpcklbw mm2, reg_blank
        pmullw    mm1, const7       // c1 * 7 in the three low words
        paddw     mm1, mm2
        psrlw     mm1, 3            // divide by 8
        packuswb  mm1, reg_blank    // narrow back to bytes
        movd      [eax], mm1
        EMMS
    }
#else
    // Middle byte on its own, outer two bytes together; the gaps between the
    // lanes absorb the carries (a lane sum is at most 255 * 8 = 0x7F8).
    const unsigned int mid   = ((c1 & 0x00FF00) * 7 + (c2 & 0x00FF00)) & 0x0007F800;
    const unsigned int outer = ((c1 & 0xFF00FF) * 7 + (c2 & 0xFF00FF)) & 0x07F807F8;

    *((unsigned int *)pc) = (mid + outer) >> 3;
#endif
}
rlm@1 109
// Blend three packed 32-bit pixels with weights 2:7:7, channel by channel,
// and store the result at pc.
//
// For each of the three low bytes:
//   out = (c1_byte * 2 + (c2_byte + c3_byte) * 7) >> 4
//
// pc      destination; four bytes are written.
// c1      pixel with weight 2.
// c2, c3  pixels with weight 7 each.
//
// NOTE(review): the top byte differs between the paths. The C path writes 0
// there (as the former assembly fallback did); in the MMX path the top word
// of const7 is 0, so that byte comes out as (top byte of c1) >> 3. Confirm
// no caller depends on the top byte.
// NOTE(review): the C path stores through an unsigned int pointer and so
// assumes pc is suitably aligned for that -- confirm against callers.
void Interp4(unsigned char *pc, unsigned int c1, unsigned int c2, unsigned int c3)
{
#ifdef MMX
    __asm
    {
        mov       eax, pc
        movd      mm1, c1
        movd      mm2, c2
        movd      mm3, c3
        punpcklbw mm1, reg_blank    // widen each byte to a 16-bit word
        punpcklbw mm2, reg_blank
        punpcklbw mm3, reg_blank
        psllw     mm1, 1            // c1 * 2
        paddw     mm2, mm3          // c2 + c3
        pmullw    mm2, const7       // (c2 + c3) * 7 in the three low words
        paddw     mm1, mm2
        psrlw     mm1, 4            // divide by 16
        packuswb  mm1, reg_blank    // narrow back to bytes
        movd      [eax], mm1
        EMMS
    }
#else
    // Plain C replaces the former x86 inline assembly and computes the same
    // masked formula. Middle byte on its own, outer two bytes together; a
    // lane sum is at most 255 * 16 = 0xFF0, so lanes never collide.
    const unsigned int mid =
        (((c1 & 0x00FF00) << 1) + ((c2 & 0x00FF00) + (c3 & 0x00FF00)) * 7) & 0x000FF000;
    const unsigned int outer =
        (((c1 & 0xFF00FF) << 1) + ((c2 & 0xFF00FF) + (c3 & 0xFF00FF)) * 7) & 0x0FF00FF0;

    *((unsigned int *)pc) = (mid + outer) >> 4;
#endif
}
rlm@1 171
// Average two packed 32-bit pixels and store the result at pc.
//
//   *pc = ((c1 + c2) mod 2^32) >> 1
//
// pc      destination; four bytes are written.
// c1, c2  pixels to average.
//
// The arithmetic is done on the whole 32-bit word, not per byte. The shift
// is logical in both code paths: the MMX path previously used an arithmetic
// shift (psrad), which sign-extended the result whenever bit 31 of the sum
// was set and so disagreed with the non-MMX path.
// NOTE(review): the C path stores through an unsigned int pointer and so
// assumes pc is suitably aligned for that -- confirm against callers.
void Interp5(unsigned char *pc, unsigned int c1, unsigned int c2)
{
#ifdef MMX
    __asm
    {
        mov   eax, pc
        movd  mm0, c1
        movd  mm1, c2
        paddd mm0, mm1      // mm0 = c1 + c2
        psrld mm0, 1        // logical shift, matching the C path
        movd  [eax], mm0
        EMMS
    }
#else
    // Plain C replaces the former x86 inline assembly (add/shr).
    *((unsigned int *)pc) = (c1 + c2) >> 1;
#endif
}
rlm@1 198
rlm@1 199 void Interp1_16(unsigned char *pc, unsigned short c1, unsigned short c2)
rlm@1 200 {
rlm@1 201 *((unsigned short *)pc) = interp_16_31(c1, c2);
rlm@1 202 //*((int*)pc) = (c1*3+c2)/4;
rlm@1 203 }
rlm@1 204
rlm@1 205 void Interp2_16(unsigned char *pc, unsigned short c1, unsigned short c2, unsigned short c3)
rlm@1 206 {
rlm@1 207 *((unsigned short *)pc) = interp_16_211(c1, c2, c3);
rlm@1 208 //*((int*)pc) = (c1*2+c2+c3)/4;
rlm@1 209 }
rlm@1 210
rlm@1 211 void Interp3_16(unsigned char *pc, unsigned short c1, unsigned short c2)
rlm@1 212 {
rlm@1 213 *((unsigned short *)pc) = interp_16_71(c1, c2);
rlm@1 214 // *((unsigned short*)pc) = (c1*7+c2)/8;
rlm@1 215 // *((unsigned short*)pc) = ((((c1 & 0x00FF00)*7 + (c2 & 0x00FF00) ) & 0x0007F800) +
rlm@1 216 // (((c1 & 0xFF00FF)*7 + (c2 & 0xFF00FF) ) & 0x07F807F8)) >> 3;
rlm@1 217 }
rlm@1 218
rlm@1 219 void Interp4_16(unsigned char *pc, unsigned short c1, unsigned short c2, unsigned short c3)
rlm@1 220 {
rlm@1 221 *((unsigned short *)pc) = interp_16_772(c2, c3, c1);
rlm@1 222 // *((unsigned short*)pc) = (c1*2+(c2+c3)*7)/16;
rlm@1 223 // *((unsigned short*)pc) = ((((c1 & 0x00FF00)*2 + ((c2 & 0x00FF00) + (c3 & 0x00FF00))*7 ) & 0x000FF000) +
rlm@1 224 // (((c1 & 0xFF00FF)*2 + ((c2 & 0xFF00FF) + (c3 & 0xFF00FF))*7 ) & 0x0FF00FF0)) >> 4;
rlm@1 225 }
rlm@1 226
rlm@1 227 void Interp5_16(unsigned char *pc, unsigned short c1, unsigned short c2)
rlm@1 228 {
rlm@1 229 *((unsigned short *)pc) = interp_16_11(c1, c2);
rlm@1 230 }
rlm@1 231
rlm@1 232 bool Diff(unsigned int c1, unsigned int c2)
rlm@1 233 {
rlm@1 234 unsigned int
rlm@1 235 YUV1 = RGBtoYUV(c1),
rlm@1 236 YUV2 = RGBtoYUV(c2);
rlm@1 237
rlm@1 238 if (YUV1 == YUV2) return false; // Save some processing power
rlm@1 239
rlm@1 240 #ifdef MMX
rlm@1 241 unsigned int retval;
rlm@1 242 __asm
rlm@1 243 {
rlm@1 244 mov eax, 0x7FFFFFFF
rlm@1 245 movd mm7, eax; mm7 = ABS_MASK = 0x7FFFFFFF
rlm@1 246
rlm@1 247 ; Copy source colors in first reg
rlm@1 248 movd mm0, YUV1
rlm@1 249 movd mm1, YUV2
rlm@1 250
rlm@1 251 mov eax, 0x00FF0000
rlm@1 252 movd mm6, eax; mm6 = Ymask = 0x00FF0000
rlm@1 253
rlm@1 254 ; Calculate color Y difference
rlm@1 255 movq mm2, mm0
rlm@1 256 movq mm3, mm1
rlm@1 257 pand mm2, mm6
rlm@1 258 pand mm3, mm6
rlm@1 259 psubd mm2, mm3
rlm@1 260 pand mm2, mm7
rlm@1 261
rlm@1 262 mov eax, 0x0000FF00
rlm@1 263 movd mm6, eax; mm6 = Umask = 0x0000FF00
rlm@1 264
rlm@1 265 ; Calculate color U difference
rlm@1 266 movq mm3, mm0
rlm@1 267 movq mm4, mm1
rlm@1 268 pand mm3, mm6
rlm@1 269 pand mm4, mm6
rlm@1 270 psubd mm3, mm4
rlm@1 271 pand mm3, mm7
rlm@1 272
rlm@1 273 mov eax, 0x000000FF
rlm@1 274 movd mm6, eax; mm6 = Vmask = 0x000000FF
rlm@1 275
rlm@1 276 ; Calculate color V difference
rlm@1 277 movq mm4, mm0
rlm@1 278 movq mm5, mm1
rlm@1 279 pand mm4, mm6
rlm@1 280 pand mm5, mm6
rlm@1 281 psubd mm4, mm5
rlm@1 282 pand mm4, mm7
rlm@1 283
rlm@1 284 mov eax, 0x00300000
rlm@1 285 movd mm5, eax; mm5 = trY = 0x00300000
rlm@1 286 mov eax, 0x00000700
rlm@1 287 movd mm6, eax; mm6 = trU = 0x00000700
rlm@1 288 mov eax, 0x00000006
rlm@1 289 movd mm7, eax; mm7 = trV = 0x00000006
rlm@1 290
rlm@1 291 ; Compare the results
rlm@1 292 pcmpgtd mm2, trY
rlm@1 293 pcmpgtd mm3, trU
rlm@1 294 pcmpgtd mm4, trV
rlm@1 295 por mm2, mm3
rlm@1 296 por mm2, mm4
rlm@1 297
rlm@1 298 movd retval, mm2
rlm@1 299
rlm@1 300 EMMS
rlm@1 301 }
rlm@1 302 return (retval != 0);
rlm@1 303 #else
rlm@1 304 return
rlm@1 305 (abs32((YUV1 & Ymask) - (YUV2 & Ymask)) > trY) ||
rlm@1 306 (abs32((YUV1 & Umask) - (YUV2 & Umask)) > trU) ||
rlm@1 307 (abs32((YUV1 & Vmask) - (YUV2 & Vmask)) > trV);
rlm@1 308 #endif
rlm@1 309 }
rlm@1 310
// Convert a packed pixel to the approximate YUV encoding used by Diff.
//
// With r = bits 0-7, g = bits 8-15 and b = bits 16-23 of c (the top byte is
// ignored):
//   Y = (r + g + b) >> 2                  (range 0..191)
//   u = 128 + floor((r - b) / 4)          (range 64..191)
//   v = 128 + floor((2*g - r - b) / 8)    (range 64..191)
// The result is (Y << 16) + (u << 8) + v.
//
// Y uses a shift by 2 instead of a division by 3 because, per the original
// author, dividing by 3 slowed the emulation down by about 10%.
//
// The MMX path previously left G and B in their original bit positions,
// used a logical shift on the signed v term and let low bits leak between
// the Y/U/V fields, so it did not produce the value described above. It now
// computes exactly the same result as the C path. The C path avoids
// right-shifting negative values (implementation-defined) by adding a bias
// before the shift: 128 + floor(x / 4) == 64 + ((x + 256) >> 2), and
// 128 + floor(x / 8) == 64 + ((x + 512) >> 3).
unsigned int RGBtoYUV(unsigned int c)
{
#ifdef MMX
    unsigned int retval;
    __asm
    {
        movd  mm0, c
        movq  mm1, mm0
        movq  mm2, mm0          ; mm0 = mm1 = mm2 = c

        mov   eax, 0x000000FF
        movd  mm7, eax          ; mm7 = byte mask

        psrld mm1, 8
        psrld mm2, 16
        pand  mm0, mm7
        pand  mm1, mm7
        pand  mm2, mm7          ; mm0 = R  mm1 = G  mm2 = B

        movq  mm3, mm0
        paddd mm3, mm1
        paddd mm3, mm2
        psrld mm3, 2            ; mm3 = Y
        pslld mm3, 16           ; mm3 = Y << 16

        mov   eax, 128
        movd  mm7, eax          ; mm7 = 128

        movq  mm4, mm0
        psubd mm4, mm2
        psrad mm4, 2            ; arithmetic shift: R - B may be negative
        paddd mm4, mm7          ; mm4 = U
        pslld mm4, 8            ; mm4 = U << 8

        movq  mm5, mm1
        pslld mm5, 1
        psubd mm5, mm0
        psubd mm5, mm2
        psrad mm5, 3            ; arithmetic shift: 2G - R - B may be negative
        paddd mm5, mm7          ; mm5 = V

        paddd mm5, mm4
        paddd mm5, mm3

        movd  retval, mm5

        EMMS
    }
    return retval;
#else
    const int r = (int)(c & 0xFF);
    const int g = (int)((c >> 8) & 0xFF);
    const int b = (int)((c >> 16) & 0xFF);

    const unsigned int Y = (unsigned int)(r + g + b) >> 2;
    // Biased so the shifted operand is never negative (see header comment).
    const unsigned int u = 64u + ((unsigned int)(r - b + 256) >> 2);
    const unsigned int v = 64u + ((unsigned int)(2 * g - r - b + 512) >> 3);

    return (Y << 16) + (u << 8) + v;

    // Extremely High Quality Code
    //unsigned char r, g, b;
    //r = c & 0xFF;
    //g = (c >> 8) & 0xFF;
    //b = (c >> 16) & 0xFF;
    //unsigned char y, u, v;
    //y = (0.256788 * r + 0.504129 * g + 0.097906 * b) + 16;
    //u = (-0.148223 * r - 0.290993 * g + 0.439216 * b) + 128;
    //v = (0.439216 * r - 0.367788 * g - 0.071427 * b) + 128;
    //return (y << 16) + (u << 8) + v;
#endif
}