diff src/filters/2xSaImmx.asm @ 27:b970226568d2

brought in filters package
author Robert McIntyre <rlm@mit.edu>
date Sun, 04 Mar 2012 20:32:31 -0600
parents f9f4f1b99eed
children
line wrap: on
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/filters/2xSaImmx.asm	Sun Mar 04 20:32:31 2012 -0600
     1.3 @@ -0,0 +1,2109 @@
     1.4 +;/*---------------------------------------------------------------------*
     1.5 +; * The following (piece of) code, (part of) the 2xSaI engine,          *
     1.6 +; * copyright (c) 1999 - 2001 by Derek Liauw Kie Fa.                    *
     1.7 +; * Non-Commercial use of this software is allowed and is encouraged,   *
     1.8 +; * provided that appropriate credit be given.                          *
     1.9 +; * You may freely modify this code, but I request                      *
    1.10 +; * that any improvements to the engine be submitted to me, so          *
    1.11 +; * that I can implement these improvements in newer versions of        *
    1.12 +; * the software.                                                       *
    1.13 +; * If you need more information, have any comments or suggestions,     *
    1.14 +; * you can e-mail me. My e-mail: derek-liauw@usa.net.                  *
    1.15 +; *---------------------------------------------------------------------*/
    1.16 +
    1.17 +;----------------------
    1.18 +; 2xSaI version 0.59 WIP, soon to become version 0.60
    1.19 +;----------------------
    1.20 +
    1.21 +;%define FAR_POINTER
    1.22 +; (uncomment the define above to assemble the FAR_POINTER paths: fs selector load and [fs:edx] stores)
    1.23 +
    1.24 +
    1.25 +          BITS 32                       ; assemble as 32-bit code
    1.26 +%ifdef __DJGPP__
    1.27 +          GLOBAL __2xSaILine            ; DJGPP builds export each name with one extra leading underscore
    1.28 +          GLOBAL __2xSaISuperEagleLine
    1.29 +                  GLOBAL __2xSaISuper2xSaILine
    1.30 +          GLOBAL _Init_2xSaIMMX
    1.31 +%else
    1.32 +          GLOBAL _2xSaILine             ; exported symbol names for all other builds
    1.33 +          GLOBAL _2xSaISuperEagleLine
    1.34 +                  GLOBAL _2xSaISuper2xSaILine
    1.35 +          GLOBAL Init_2xSaIMMX
    1.36 +%endif
    1.37 +          SECTION .text ALIGN = 32      ; code section, 32-byte alignment requested
    1.38 +
    1.39 +%ifdef FAR_POINTER
    1.40 +;EXTERN_C void _2xSaILine (uint8 *srcPtr, uint8 *deltaPtr, uint32 srcPitch, uint32 width,
    1.41 +;                        uint8 *dstPtr, uint32 dstPitch, uint16 dstSegment);
    1.42 +%else
    1.43 +;EXTERN_C void _2xSaILine (uint8 *srcPtr, uint8 *deltaPtr, uint32 srcPitch, uint32 width,
    1.44 +;                        uint8 *dstPtr, uint32 dstPitch);
    1.45 +%endif
    1.46 +
    1.47 +srcPtr        equ 8                     ; stack-argument offsets from ebp (after push ebp / mov ebp, esp); 8 = first argument
    1.48 +deltaPtr      equ 12                    ; pointer to the delta (previous-frame) buffer; advanced in place by the loop
    1.49 +srcPitch      equ 16                    ; byte distance between source rows (also used to step delta rows)
    1.50 +width         equ 20                    ; number of pixels to process
    1.51 +dstOffset     equ 24                    ; destination pointer
    1.52 +dstPitch      equ 28                    ; byte distance between destination rows
    1.53 +dstSegment    equ 32                    ; destination selector, only read when FAR_POINTER is defined
    1.54 +
    1.55 +; Byte offsets of neighbouring pixels relative to a row base pointer.
    1.56 +; Adjacent names are 2 bytes apart (one 16-bit word per pixel).
    1.57 +; colorB* are used with [eax] (the row above srcPtr).
    1.58 +colorB0   equ -2
    1.59 +colorB1   equ 0
    1.60 +colorB2   equ 2
    1.61 +colorB3   equ 4
    1.62 +; color7..color9 are not referenced in the part of the file shown here.
    1.63 +color7   equ -2
    1.64 +color8   equ 0
    1.65 +color9   equ 2
    1.66 +; color4..colorS2 are used with [eax+ebx] (the srcPtr row).
    1.67 +color4   equ -2
    1.68 +color5   equ 0
    1.69 +color6   equ 2
    1.70 +colorS2   equ 4
    1.71 +; color1..colorS1 are used with [eax+ebx+ebx] (one row below srcPtr).
    1.72 +color1   equ -2
    1.73 +color2   equ 0
    1.74 +color3   equ 2
    1.75 +colorS1   equ 4
    1.76 +; colorA* are used with eax+3*ebx (two rows below srcPtr).
    1.77 +colorA0   equ -2
    1.78 +colorA1   equ 0
    1.79 +colorA2   equ 2
    1.80 +colorA3   equ 4
    1.81 +
    1.82 +; Super2xSaI line scaler. Each loop pass reads a 4-pixel (8-byte) group from four source rows
    1.83 +; and stores 8 output pixels to the row at edx plus 8 to the row at edx+dstPitch.
    1.84 +; Arguments are read from the stack through ebp using the srcPtr..dstSegment offsets.
    1.85 +%ifdef __DJGPP__
    1.86 +__2xSaISuper2xSaILine:
    1.87 +%else
    1.88 +_2xSaISuper2xSaILine:
    1.89 +%endif
    1.90 +; Build an ebp frame and save every general register (popad restores them before ret)
    1.91 +         push ebp
    1.92 +         mov ebp, esp
    1.93 +         pushad
    1.94 +
    1.95 +; Prepare the destination
    1.96 +%ifdef FAR_POINTER
    1.97 +         ; Set the selector used by the [fs:edx] stores
    1.98 +         mov eax, [ebp+dstSegment]
    1.99 +         mov fs, ax
   1.100 +%endif
   1.101 +         mov edx, [ebp+dstOffset]         ; edx = destination write pointer (first output row)
   1.102 +; Prepare the source
   1.103 +         ; eax starts at srcPtr, the color4/color5/color6 row
   1.104 +         mov eax, [ebp+srcPtr]                          ;eax = srcPtr
   1.105 +         mov ebx, [ebp+srcPitch]                        ;ebx contains the source pitch
   1.106 +         mov ecx, [ebp+width]                           ;ecx contains the number of pixels to process
   1.107 +         ; step back one row so that eax addresses the colorB row
   1.108 +         sub eax, ebx                                           ;eax points to B1 which is the base
   1.109 +
   1.110 +; Main Loop
   1.111 +.Loop:   push ecx                       ; keep the remaining pixel count; ecx is reused as scratch below
   1.112 +
   1.113 +         ;-----Check Delta------------------
   1.114 +         mov ecx, [ebp+deltaPtr]        ; ecx = current position in the delta buffer (rows stepped with ebx)
   1.115 +
   1.116 +
   1.117 +                ;load the qwords at offsets -2 and +4 of each of the four source rows
   1.118 +         movq mm0, [eax+colorB0]
   1.119 +         movq mm1, [eax+colorB3]
   1.120 +         movq mm2, [eax+ebx+color4]
   1.121 +         movq mm3, [eax+ebx+colorS2]
   1.122 +         movq mm4, [eax+ebx+ebx+color1]
   1.123 +         movq mm5, [eax+ebx+ebx+colorS1]
   1.124 +         push eax                       ; temporarily advance eax one row to reach eax+3*ebx (colorA row)
   1.125 +         add eax, ebx
   1.126 +         movq mm6, [eax+ebx+ebx+colorA0]
   1.127 +         movq mm7, [eax+ebx+ebx+colorA3]
   1.128 +         pop eax
   1.129 +
   1.130 +                ;compare word-by-word with the delta buffer (delta offset = source offset + 2)
   1.131 +         pcmpeqw mm0, [ecx+2+colorB0]
   1.132 +         pcmpeqw mm1, [ecx+2+colorB3]
   1.133 +         pcmpeqw mm2, [ecx+ebx+2+color4]
   1.134 +         pcmpeqw mm3, [ecx+ebx+2+colorS2]
   1.135 +         pcmpeqw mm4, [ecx+ebx+ebx+2+color1]
   1.136 +         pcmpeqw mm5, [ecx+ebx+ebx+2+colorS1]
   1.137 +         add ecx, ebx                   ; same one-row advance trick for the fourth delta row
   1.138 +         pcmpeqw mm6, [ecx+ebx+ebx+2+colorA0]
   1.139 +         pcmpeqw mm7, [ecx+ebx+ebx+2+colorA3]
   1.140 +         sub ecx, ebx
   1.141 +
   1.142 +
   1.143 +                ;AND the eight compare masks: a lane stays FFFF only if every sample matched
   1.144 +         pand mm0, mm1
   1.145 +         pand mm2, mm3
   1.146 +         pand mm4, mm5
   1.147 +         pand mm6, mm7
   1.148 +         pand mm0, mm2
   1.149 +         pand mm4, mm6
   1.150 +         pxor mm7, mm7
   1.151 +         pand mm0, mm4
   1.152 +         movq mm6, [eax+colorB0]                ; reload the colorB-row qword for the delta update
   1.153 +         pcmpeqw mm7, mm0                       ;did any compare give us a zero ? (lane = FFFF where something differed)
   1.154 +
   1.155 +         movq [ecx+2+colorB0], mm6              ; write the colorB-row qword into the delta buffer
   1.156 +
   1.157 +         packsswb mm7, mm7                      ; pack the four word masks into the low dword
   1.158 +         movd ecx, mm7
   1.159 +         test ecx, ecx                          ; zero means no lane differed from the delta buffer
   1.160 +         jz near .SKIP_PROCESS          ;no, so we can skip
   1.161 +
   1.162 +         ;End Delta
   1.163 +
   1.164 +         ;--- I56Pixel = INTERPOLATE(color5, color6): (a&colorMask)>>1 + (b&colorMask)>>1 + (a&b&lowPixelMask) ---
   1.165 +         movq mm0, [eax+ebx+color5]
   1.166 +         movq mm1, [eax+ebx+color6]
   1.167 +         movq mm2, mm0
   1.168 +         movq mm3, mm1
   1.169 +         movq mm4, mm0                  ; mm4 keeps color5 for the 5,5,5,6 blend below
   1.170 +         movq mm5, mm1                  ; mm5 keeps color6 for the 6,6,6,5 blend below
   1.171 +
   1.172 +         pand mm0, [colorMask]
   1.173 +         pand mm1, [colorMask]
   1.174 +
   1.175 +         psrlw mm0, 1                   ; halve each masked operand
   1.176 +         psrlw mm1, 1
   1.177 +
   1.178 +         pand mm3, [lowPixelMask]
   1.179 +         paddw mm0, mm1
   1.180 +
   1.181 +         pand mm3, mm2                  ; mm3 = a & b & lowPixelMask
   1.182 +         paddw mm0, mm3                ;mm0 contains the interpolated values
   1.183 +         movq [I56Pixel], mm0
   1.184 +         movq mm7, mm0                  ; mm7 = I56, reused by the next two blends
   1.185 +
   1.186 +         ;--- I5556Pixel = INTERPOLATE(I56, color5) ---
   1.187 +         movq mm0, mm7
   1.188 +         movq mm1, mm4  ;5,5,5,6
   1.189 +         movq mm2, mm0
   1.190 +         movq mm3, mm1
   1.191 +
   1.192 +         pand mm0, [colorMask]
   1.193 +         pand mm1, [colorMask]
   1.194 +
   1.195 +         psrlw mm0, 1
   1.196 +         psrlw mm1, 1
   1.197 +
   1.198 +         pand mm3, [lowPixelMask]
   1.199 +         paddw mm0, mm1
   1.200 +
   1.201 +         pand mm3, mm2
   1.202 +         paddw mm0, mm3                ;mm0 contains the interpolated values
   1.203 +         movq [I5556Pixel], mm0
   1.204 +         ;--- I5666Pixel = INTERPOLATE(I56, color6) ---
   1.205 +
   1.206 +         movq mm0, mm7
   1.207 +         movq mm1, mm5  ;6,6,6,5
   1.208 +         movq mm2, mm0
   1.209 +         movq mm3, mm1
   1.210 +
   1.211 +         pand mm0, [colorMask]
   1.212 +         pand mm1, [colorMask]
   1.213 +
   1.214 +         psrlw mm0, 1
   1.215 +         psrlw mm1, 1
   1.216 +
   1.217 +         pand mm3, [lowPixelMask]
   1.218 +         paddw mm0, mm1
   1.219 +
   1.220 +         pand mm3, mm2
   1.221 +         paddw mm0, mm3
   1.222 +         movq [I5666Pixel], mm0
   1.223 +
   1.224 +         ;-------------------------
   1.225 +         ;--- I23Pixel = INTERPOLATE(color2, color3) ---
   1.226 +         movq mm0, [eax+ebx+ebx+color2]
   1.227 +         movq mm1, [eax+ebx+ebx+color3]
   1.228 +         movq mm2, mm0
   1.229 +         movq mm3, mm1
   1.230 +         movq mm4, mm0                  ; mm4 keeps color2 for the 2,2,2,3 blend below
   1.231 +         movq mm5, mm1                  ; mm5 keeps color3 for the 3,3,3,2 blend below
   1.232 +
   1.233 +         pand mm0, [colorMask]
   1.234 +         pand mm1, [colorMask]
   1.235 +
   1.236 +         psrlw mm0, 1
   1.237 +         psrlw mm1, 1
   1.238 +
   1.239 +         pand mm3, [lowPixelMask]
   1.240 +         paddw mm0, mm1
   1.241 +
   1.242 +         pand mm3, mm2
   1.243 +         paddw mm0, mm3
   1.244 +         movq [I23Pixel], mm0
   1.245 +         movq mm7, mm0                  ; mm7 = I23, reused by the next two blends
   1.246 +
   1.247 +         ;--- I2223Pixel = INTERPOLATE(I23, color2) ---
   1.248 +         movq mm0, mm7
   1.249 +         movq mm1, mm4  ;2,2,2,3
   1.250 +         movq mm2, mm0
   1.251 +         movq mm3, mm1
   1.252 +
   1.253 +         pand mm0, [colorMask]
   1.254 +         pand mm1, [colorMask]
   1.255 +
   1.256 +         psrlw mm0, 1
   1.257 +         psrlw mm1, 1
   1.258 +
   1.259 +         pand mm3, [lowPixelMask]
   1.260 +         paddw mm0, mm1
   1.261 +
   1.262 +         pand mm3, mm2
   1.263 +         paddw mm0, mm3
   1.264 +         movq [I2223Pixel], mm0
   1.265 +
   1.266 +         ;--- I2333Pixel = INTERPOLATE(I23, color3) ---
   1.267 +         movq mm0, mm7
   1.268 +         movq mm1, mm5  ;3,3,3,2
   1.269 +         movq mm2, mm0
   1.270 +         movq mm3, mm1
   1.271 +
   1.272 +         pand mm0, [colorMask]
   1.273 +         pand mm1, [colorMask]
   1.274 +
   1.275 +         psrlw mm0, 1
   1.276 +         psrlw mm1, 1
   1.277 +
   1.278 +         pand mm3, [lowPixelMask]
   1.279 +         paddw mm0, mm1
   1.280 +
   1.281 +         pand mm3, mm2
   1.282 +         paddw mm0, mm3
   1.283 +         movq [I2333Pixel], mm0
   1.284 +
   1.285 +
   1.286 +         ;--------------------
   1.287 +;////////////////////////////////
   1.288 +; Decide which "branch" to take: build per-lane masks Mask26 and Mask35
   1.289 +;--------------------------------
   1.290 +         movq mm0, [eax+ebx+color5]
   1.291 +         movq mm1, [eax+ebx+color6]
   1.292 +         movq mm6, mm0
   1.293 +         movq mm7, mm1
   1.294 +         pcmpeqw mm0, [eax+ebx+ebx+color3]      ; mm0 = (color5 == color3)
   1.295 +         pcmpeqw mm1, [eax+ebx+ebx+color2]      ; mm1 = (color6 == color2)
   1.296 +         pcmpeqw mm6, mm7                       ; mm6 = (color5 == color6)
   1.297 +
   1.298 +         movq mm2, mm0
   1.299 +         movq mm3, mm0
   1.300 +
   1.301 +         pand mm0, mm1       ;color5 == color3 && color6 == color2 (both diagonals match)
   1.302 +         pxor mm7, mm7
   1.303 +
   1.304 +         pcmpeqw mm2, mm7    ; mm2 = (color5 != color3)
   1.305 +         pand mm6, mm0       ; mm6 = all four centre pixels equal
   1.306 +         pand mm2, mm1       ;color5 != color3 && color6 == color2
   1.307 +
   1.308 +         pcmpeqw mm1, mm7    ; mm1 = (color6 != color2)
   1.309 +
   1.310 +         pand mm1, mm3       ;color5 == color3 && color6 != color2
   1.311 +         pxor mm0, mm6       ; mm0 = both diagonals match but color5 != color6 (needs the guess below)
   1.312 +         por mm1, mm6        ; lanes with all four equal are folded into Mask35
   1.313 +         movq mm7, mm0
   1.314 +         movq [Mask26], mm2
   1.315 +         packsswb mm7, mm7
   1.316 +         movq [Mask35], mm1
   1.317 +
   1.318 +         movd ecx, mm7
   1.319 +         test ecx, ecx       ; no undecided lane -> nothing to guess
   1.320 +         jz near .SKIP_GUESS
   1.321 +
   1.322 +;--- Guess: score the undecided lanes in mm7 using four neighbour pairs (colorA..colorO offsets are defined outside this view) ---
   1.323 +         movq mm6, mm0
   1.324 +         movq mm4, [eax+ebx+colorA]
   1.325 +         movq mm5, [eax+ebx+colorB]
   1.326 +         pxor mm7, mm7       ; mm7 = running score, starts at 0
   1.327 +         pand mm6, [ONE]     ; mm6 = undecided mask ANDed with ONE (presumably 1 per word - defined elsewhere)
   1.328 +
   1.329 +         movq mm0, [eax+colorE]
   1.330 +         movq mm1, [eax+ebx+colorG]
   1.331 +         movq mm2, mm0
   1.332 +         movq mm3, mm1
   1.333 +         pcmpeqw mm0, mm4
   1.334 +         pcmpeqw mm1, mm4
   1.335 +         pcmpeqw mm2, mm5
   1.336 +         pcmpeqw mm3, mm5
   1.337 +         pand mm0, mm6
   1.338 +         pand mm1, mm6
   1.339 +         pand mm2, mm6
   1.340 +         pand mm3, mm6
   1.341 +         paddw mm0, mm1      ; mm0 = how many of the pair equal mm4 (per undecided lane)
   1.342 +         paddw mm2, mm3      ; mm2 = how many of the pair equal mm5
   1.343 +
   1.344 +         pxor mm3, mm3
   1.345 +         pcmpgtw mm0, mm6    ; true only where both neighbours matched mm4
   1.346 +         pcmpgtw mm2, mm6    ; true only where both neighbours matched mm5
   1.347 +         pcmpeqw mm0, mm3    ; invert
   1.348 +         pcmpeqw mm2, mm3    ; invert
   1.349 +         pand mm0, mm6
   1.350 +         pand mm2, mm6
   1.351 +         paddw mm7, mm0      ; score += 1 unless both neighbours equal mm4
   1.352 +         psubw mm7, mm2      ; score -= 1 unless both neighbours equal mm5
   1.353 +; second neighbour pair, same scoring
   1.354 +         movq mm0, [eax+colorF]
   1.355 +         movq mm1, [eax+ebx+colorK]
   1.356 +         movq mm2, mm0
   1.357 +         movq mm3, mm1
   1.358 +         pcmpeqw mm0, mm4
   1.359 +         pcmpeqw mm1, mm4
   1.360 +         pcmpeqw mm2, mm5
   1.361 +         pcmpeqw mm3, mm5
   1.362 +         pand mm0, mm6
   1.363 +         pand mm1, mm6
   1.364 +         pand mm2, mm6
   1.365 +         pand mm3, mm6
   1.366 +         paddw mm0, mm1
   1.367 +         paddw mm2, mm3
   1.368 +
   1.369 +         pxor mm3, mm3
   1.370 +         pcmpgtw mm0, mm6
   1.371 +         pcmpgtw mm2, mm6
   1.372 +         pcmpeqw mm0, mm3
   1.373 +         pcmpeqw mm2, mm3
   1.374 +         pand mm0, mm6
   1.375 +         pand mm2, mm6
   1.376 +         paddw mm7, mm0
   1.377 +         psubw mm7, mm2
   1.378 +; third neighbour pair; eax is advanced one row while the lower rows are read
   1.379 +         push eax
   1.380 +         add eax, ebx
   1.381 +         movq mm0, [eax+ebx+colorH]
   1.382 +         movq mm1, [eax+ebx+ebx+colorN]
   1.383 +         movq mm2, mm0
   1.384 +         movq mm3, mm1
   1.385 +         pcmpeqw mm0, mm4
   1.386 +         pcmpeqw mm1, mm4
   1.387 +         pcmpeqw mm2, mm5
   1.388 +         pcmpeqw mm3, mm5
   1.389 +         pand mm0, mm6
   1.390 +         pand mm1, mm6
   1.391 +         pand mm2, mm6
   1.392 +         pand mm3, mm6
   1.393 +         paddw mm0, mm1
   1.394 +         paddw mm2, mm3
   1.395 +
   1.396 +         pxor mm3, mm3
   1.397 +         pcmpgtw mm0, mm6
   1.398 +         pcmpgtw mm2, mm6
   1.399 +         pcmpeqw mm0, mm3
   1.400 +         pcmpeqw mm2, mm3
   1.401 +         pand mm0, mm6
   1.402 +         pand mm2, mm6
   1.403 +         paddw mm7, mm0
   1.404 +         psubw mm7, mm2
   1.405 +; fourth neighbour pair (eax still advanced by one row)
   1.406 +         movq mm0, [eax+ebx+colorL]
   1.407 +         movq mm1, [eax+ebx+ebx+colorO]
   1.408 +         movq mm2, mm0
   1.409 +         movq mm3, mm1
   1.410 +         pcmpeqw mm0, mm4
   1.411 +         pcmpeqw mm1, mm4
   1.412 +         pcmpeqw mm2, mm5
   1.413 +         pcmpeqw mm3, mm5
   1.414 +         pand mm0, mm6
   1.415 +         pand mm1, mm6
   1.416 +         pand mm2, mm6
   1.417 +         pand mm3, mm6
   1.418 +         paddw mm0, mm1
   1.419 +         paddw mm2, mm3
   1.420 +
   1.421 +         pxor mm3, mm3
   1.422 +         pcmpgtw mm0, mm6
   1.423 +         pcmpgtw mm2, mm6
   1.424 +         pcmpeqw mm0, mm3
   1.425 +         pcmpeqw mm2, mm3
   1.426 +         pand mm0, mm6
   1.427 +         pand mm2, mm6
   1.428 +         paddw mm7, mm0
   1.429 +         psubw mm7, mm2
   1.430 +; turn the sign of the score into mask bits
   1.431 +         pop eax
   1.432 +         movq mm1, mm7
   1.433 +         pxor mm0, mm0
   1.434 +         pcmpgtw mm7, mm0    ; mm7 = (score > 0)
   1.435 +         pcmpgtw mm0, mm1    ; mm0 = (score < 0)
   1.436 +
   1.437 +         por mm7, [Mask35]   ; positive score joins Mask35
   1.438 +         por mm0, [Mask26]   ; negative score joins Mask26
   1.439 +         movq [Mask35], mm7
   1.440 +         movq [Mask26], mm0
   1.441 +
   1.442 +.SKIP_GUESS:
   1.443 +
   1.444 +         ;Start the ASSEMBLY !!!        eh... compose all the results together to form the final image...
   1.445 +
   1.446 +                 ; mm0 = INTERPOLATE(color5, color2), shared by final1a and final2a
   1.447 +         movq mm0, [eax+ebx+color5]
   1.448 +         movq mm1, [eax+ebx+ebx+color2]
   1.449 +         movq mm2, mm0
   1.450 +         movq mm3, mm1
   1.451 +         movq mm4, mm0
   1.452 +         movq mm5, mm1
   1.453 +
   1.454 +         pand mm0, [colorMask]
   1.455 +         pand mm1, [colorMask]
   1.456 +
   1.457 +         psrlw mm0, 1
   1.458 +         psrlw mm1, 1
   1.459 +
   1.460 +         pand mm3, [lowPixelMask]
   1.461 +         paddw mm0, mm1
   1.462 +
   1.463 +         pand mm3, mm2
   1.464 +         paddw mm0, mm3                ;mm0 contains the interpolated values
   1.465 +                 ;---------------------------
   1.466 +
   1.467 +
   1.468 +; The %ifdef below is never defined; it only holds the reference C logic for product1a/product2a.
   1.469 +%ifdef dfhsdfhsdahdsfhdsfh
   1.470 +
   1.471 +                if (color5 == color3 && color2 != color6 && color4 == color5 && color5 != colorA2)
   1.472 +                   product2a = INTERPOLATE (color2, color5);
   1.473 +                else
   1.474 +                if (color5 == color1 && color6 == color5 && color4 != color2 && color5 != colorA0)
   1.475 +                   product2a = INTERPOLATE(color2, color5);
   1.476 +                else
   1.477 +                   product2a = color2;
   1.478 +
   1.479 +                if (color2 == color6 && color5 != color3 && color1 == color2 && color2 != colorB2)
   1.480 +                   product1a = INTERPOLATE (color2, color5);
   1.481 +                else
   1.482 +                if (color4 == color2 && color3 == color2 && color1 != color5 && color2 != colorB0)
   1.483 +                   product1a = INTERPOLATE(color2, color5);
   1.484 +                else
   1.485 +                   product1a = color5;
   1.486 +
   1.487 +%endif
   1.488 +
   1.489 +; final1a, first condition: Mask26 && color1 == color2 && color2 != colorB2
   1.490 +                 movq mm7, [Mask26]
   1.491 +                 movq mm6, [eax+colorB2]
   1.492 +                 movq mm5, [eax+ebx+ebx+color2]
   1.493 +                 movq mm4, [eax+ebx+ebx+color1]
   1.494 +                 pcmpeqw mm4, mm5
   1.495 +                 pcmpeqw mm6, mm5
   1.496 +                 pxor mm5, mm5
   1.497 +                 pand mm7, mm4
   1.498 +                 pcmpeqw mm6, mm5               ; invert: colorB2 != color2
   1.499 +                 pand mm7, mm6
   1.500 +
   1.501 +
   1.502 +; final1a, second condition: color4 == color2 && color3 == color2 && color1 != color5 && color2 != colorB0
   1.503 +                 movq mm6, [eax+ebx+ebx+color3]
   1.504 +                 movq mm5, [eax+ebx+ebx+color2]
   1.505 +                 movq mm4, [eax+ebx+ebx+color1]
   1.506 +                 movq mm2, [eax+ebx+color5]
   1.507 +                 movq mm1, [eax+ebx+color4]
   1.508 +                 movq mm3, [eax+colorB0]
   1.509 +
   1.510 +                 pcmpeqw mm2, mm4
   1.511 +                 pcmpeqw mm6, mm5
   1.512 +                 pcmpeqw mm1, mm5
   1.513 +                 pcmpeqw mm3, mm5
   1.514 +                 pxor mm5, mm5
   1.515 +                 pcmpeqw mm2, mm5               ; invert: color1 != color5
   1.516 +                 pcmpeqw mm3, mm5               ; invert: colorB0 != color2
   1.517 +                 pand mm6, mm1
   1.518 +                 pand mm2, mm3
   1.519 +                 pand mm6, mm2
   1.520 +                 por mm7, mm6                   ; mm7 = lanes that take the interpolated value
   1.521 +
   1.522 +                 
   1.523 +                 movq mm6, mm7
   1.524 +                 pcmpeqw mm6, mm5               ; mm6 = the remaining lanes
   1.525 +                 pand mm7, mm0                  ; selected lanes take INTERPOLATE(color5, color2)
   1.526 +
   1.527 +                 movq mm1, [eax+ebx+color5]
   1.528 +                 pand mm6, mm1                  ; the other lanes take color5
   1.529 +                 por mm7, mm6
   1.530 +                 movq [final1a], mm7                    ;finished  1a
   1.531 +
   1.532 +
   1.533 +         
   1.534 +             ;--- final2a, first condition: Mask35 && color4 == color5 && color5 != colorA2 ---
   1.535 +
   1.536 +                 movq mm7, [Mask35]
   1.537 +                 push eax
   1.538 +                 add eax, ebx
   1.539 +                 movq mm6, [eax+ebx+ebx+colorA2]
   1.540 +                 pop eax
   1.541 +                 movq mm5, [eax+ebx+color5]
   1.542 +                 movq mm4, [eax+ebx+color4]
   1.543 +                 pcmpeqw mm4, mm5
   1.544 +                 pcmpeqw mm6, mm5
   1.545 +                 pxor mm5, mm5
   1.546 +                 pand mm7, mm4
   1.547 +                 pcmpeqw mm6, mm5               ; invert: colorA2 != color5
   1.548 +                 pand mm7, mm6
   1.549 +
   1.550 +
   1.551 +; final2a, second condition: color5 == color1 && color6 == color5 && color4 != color2 && color5 != colorA0
   1.552 +                 movq mm6, [eax+ebx+color6]
   1.553 +                 movq mm5, [eax+ebx+color5]
   1.554 +                 movq mm4, [eax+ebx+color4]
   1.555 +                 movq mm2, [eax+ebx+ebx+color2]
   1.556 +                 movq mm1, [eax+ebx+ebx+color1]
   1.557 +                 push eax
   1.558 +                 add eax, ebx
   1.559 +                 movq mm3, [eax+ebx+ebx+colorA0]
   1.560 +                 pop eax
   1.561 +
   1.562 +                 pcmpeqw mm2, mm4
   1.563 +                 pcmpeqw mm6, mm5
   1.564 +                 pcmpeqw mm1, mm5
   1.565 +                 pcmpeqw mm3, mm5
   1.566 +                 pxor mm5, mm5
   1.567 +                 pcmpeqw mm2, mm5               ; invert: color4 != color2
   1.568 +                 pcmpeqw mm3, mm5               ; invert: colorA0 != color5
   1.569 +                 pand mm6, mm1
   1.570 +                 pand mm2, mm3
   1.571 +                 pand mm6, mm2
   1.572 +                 por mm7, mm6                   ; mm7 = lanes that take the interpolated value
   1.573 +
   1.574 +                 
   1.575 +                 movq mm6, mm7
   1.576 +                 pcmpeqw mm6, mm5               ; mm6 = the remaining lanes
   1.577 +                 pand mm7, mm0                  ; selected lanes take INTERPOLATE(color5, color2)
   1.578 +
   1.579 +                 movq mm1, [eax+ebx+ebx+color2]
   1.580 +                 pand mm6, mm1                  ; the other lanes take color2
   1.581 +                 por mm7, mm6
   1.582 +                 movq [final2a], mm7                    ;finished  2a
   1.583 +
   1.584 +
   1.585 +                 ;--------------------------------------------
   1.586 + 
   1.587 +; The %ifdef below is never defined; it only holds the reference C logic for product2b/product1b.
   1.588 +%ifdef dfhsdfhsdahdsfhdsfh
   1.589 +                   if (color6 == color3 && color3 == colorA1 && color2 != colorA2 && color3 != colorA0)
   1.590 +                      product2b = Q_INTERPOLATE (color3, color3, color3, color2);
   1.591 +                   else
   1.592 +                   if (color5 == color2 && color2 == colorA2 && colorA1 != color3 && color2 != colorA3)
   1.593 +                      product2b = Q_INTERPOLATE (color2, color2, color2, color3);
   1.594 +                   else
   1.595 +                      product2b = INTERPOLATE (color2, color3);
   1.596 +
   1.597 +                   if (color6 == color3 && color6 == colorB1 && color5 != colorB2 && color6 != colorB0)
   1.598 +                      product1b = Q_INTERPOLATE (color6, color6, color6, color5);
   1.599 +                   else
   1.600 +                   if (color5 == color2 && color5 == colorB2 && colorB1 != color6 && color5 != colorB3)
   1.601 +                      product1b = Q_INTERPOLATE (color6, color5, color5, color5);
   1.602 +                   else
   1.603 +                      product1b = INTERPOLATE (color5, color6);
   1.604 +%endif
   1.605 +; final2b, first condition -> mm0 (selects I2333Pixel)
   1.606 +                 push eax
   1.607 +                 add eax, ebx
   1.608 +                 pxor mm7, mm7
   1.609 +                 movq mm0, [eax+ebx+ebx+colorA0]
   1.610 +                 movq mm1, [eax+ebx+ebx+colorA1]
   1.611 +                 movq mm2, [eax+ebx+ebx+colorA2]
   1.612 +                 movq mm3, [eax+ebx+ebx+colorA3]
   1.613 +                 pop eax
   1.614 +                 movq mm4, [eax+ebx+ebx+color2]
   1.615 +                 movq mm5, [eax+ebx+ebx+color3]
   1.616 +                 movq mm6, [eax+ebx+color6]
   1.617 +
   1.618 +                 pcmpeqw mm6, mm5               ; color6 == color3
   1.619 +                 pcmpeqw mm1, mm5               ; colorA1 == color3
   1.620 +                 pcmpeqw mm4, mm2               ; color2 == colorA2
   1.621 +                 pcmpeqw mm0, mm5               ; colorA0 == color3
   1.622 +                 pcmpeqw mm4, mm7               ; invert: color2 != colorA2
   1.623 +                 pcmpeqw mm0, mm7               ; invert: colorA0 != color3
   1.624 +                 pand mm0, mm4
   1.625 +                 pand mm6, mm1
   1.626 +                 pand mm0, mm6
   1.627 +
   1.628 +; final2b, second condition -> mm1 (selects I2223Pixel); mm2/mm3 still hold colorA2/colorA3
   1.629 +                 push eax
   1.630 +                 add eax, ebx
   1.631 +                 movq mm1, [eax+ebx+ebx+colorA1]
   1.632 +                 pop eax
   1.633 +                 movq mm4, [eax+ebx+ebx+color2]
   1.634 +                 movq mm5, [eax+ebx+color5]
   1.635 +                 movq mm6, [eax+ebx+ebx+color3]
   1.636 +
   1.637 +                 pcmpeqw mm5, mm4               ; color5 == color2
   1.638 +                 pcmpeqw mm2, mm4               ; colorA2 == color2
   1.639 +                 pcmpeqw mm1, mm6               ; colorA1 == color3
   1.640 +                 pcmpeqw mm3, mm4               ; colorA3 == color2
   1.641 +                 pcmpeqw mm1, mm7               ; invert: colorA1 != color3
   1.642 +                 pcmpeqw mm3, mm7               ; invert: colorA3 != color2
   1.643 +                 pand mm2, mm5
   1.644 +                 pand mm1, mm3
   1.645 +                 pand mm1, mm2
   1.646 +
   1.647 +; lanes matched by either condition are removed from the Mask35/Mask26 copies
   1.648 +                 movq mm7, mm0
   1.649 +                 por mm7, mm1
   1.650 +
   1.651 +                 movq mm4, [Mask35]
   1.652 +                 movq mm3, [Mask26]
   1.653 +                 
   1.654 +                 movq mm6, mm4
   1.655 +                 pand mm6, mm7
   1.656 +                 pxor mm4, mm6                  ; mm4 = Mask35 without the condition lanes
   1.657 +
   1.658 +                 movq mm6, mm3
   1.659 +                 pand mm6, mm7
   1.660 +                 pxor mm3, mm6                  ; mm3 = Mask26 without the condition lanes
   1.661 +
   1.662 +                 movq mm2, mm0
   1.663 +                 movq mm7, [I2333Pixel]
   1.664 +                 movq mm6, [I2223Pixel]
   1.665 +                 movq mm5, [I23Pixel]
   1.666 +
   1.667 +; mm2 collects every lane already claimed; its complement takes I23Pixel
   1.668 +                 por mm2, mm4
   1.669 +                 pand mm4, [eax+ebx+ebx+color3] ; Mask35 lanes take color3
   1.670 +                 por mm2, mm3
   1.671 +                 pand mm3, [eax+ebx+ebx+color2] ; Mask26 lanes take color2
   1.672 +                 por mm2, mm1
   1.673 +                 pand mm0, mm7                  ; first-condition lanes take I2333Pixel
   1.674 +                 pand mm1, mm6                  ; second-condition lanes take I2223Pixel
   1.675 +                 pxor mm7, mm7
   1.676 +                 pcmpeqw mm2, mm7               ; invert: lanes claimed by nothing above
   1.677 +                 por mm0, mm1
   1.678 +                 por mm3, mm4
   1.679 +                 pand mm2, mm5                  ; unclaimed lanes take I23Pixel
   1.680 +                 por mm0, mm3
   1.681 +                 por mm0, mm2
   1.682 +                 movq [final2b], mm0
   1.683 +
   1.684 +                 ;-----------------------------------
   1.685 +                 
   1.686 +; final1b, first condition -> mm0 (selects I5666Pixel)
   1.687 +                 pxor mm7, mm7
   1.688 +                 movq mm0, [eax+colorB0]
   1.689 +                 movq mm1, [eax+colorB1]
   1.690 +                 movq mm2, [eax+colorB2]
   1.691 +                 movq mm3, [eax+colorB3]
   1.692 +                 movq mm4, [eax+ebx+color5]
   1.693 +                 movq mm5, [eax+ebx+color6]
   1.694 +                 movq mm6, [eax+ebx+ebx+color3]
   1.695 +
   1.696 +                 pcmpeqw mm6, mm5               ; color3 == color6
   1.697 +                 pcmpeqw mm1, mm5               ; colorB1 == color6
   1.698 +                 pcmpeqw mm4, mm2               ; color5 == colorB2
   1.699 +                 pcmpeqw mm0, mm5               ; colorB0 == color6
   1.700 +                 pcmpeqw mm4, mm7               ; invert: color5 != colorB2
   1.701 +                 pcmpeqw mm0, mm7               ; invert: colorB0 != color6
   1.702 +                 pand mm0, mm4
   1.703 +                 pand mm6, mm1
   1.704 +                 pand mm0, mm6
   1.705 +; final1b, second condition -> mm1 (selects I5556Pixel); mm2/mm3 still hold colorB2/colorB3
   1.706 +                 movq mm1, [eax+colorB1]
   1.707 +                 movq mm4, [eax+ebx+color5]
   1.708 +                 movq mm5, [eax+ebx+ebx+color2]
   1.709 +                 movq mm6, [eax+ebx+color6]
   1.710 +
   1.711 +                 pcmpeqw mm5, mm4               ; color2 == color5
   1.712 +                 pcmpeqw mm2, mm4               ; colorB2 == color5
   1.713 +                 pcmpeqw mm1, mm6               ; colorB1 == color6
   1.714 +                 pcmpeqw mm3, mm4               ; colorB3 == color5
   1.715 +                 pcmpeqw mm1, mm7               ; invert: colorB1 != color6
   1.716 +                 pcmpeqw mm3, mm7               ; invert: colorB3 != color5
   1.717 +                 pand mm2, mm5
   1.718 +                 pand mm1, mm3
   1.719 +                 pand mm1, mm2
   1.720 +
   1.721 +; same mask composition as for final2b, using the row-5/6 colours and blends
   1.722 +                 movq mm7, mm0
   1.723 +                 por mm7, mm1
   1.724 +
   1.725 +                 movq mm4, [Mask35]
   1.726 +                 movq mm3, [Mask26]
   1.727 +                 
   1.728 +                 movq mm6, mm4
   1.729 +                 pand mm6, mm7
   1.730 +                 pxor mm4, mm6                  ; mm4 = Mask35 without the condition lanes
   1.731 +
   1.732 +                 movq mm6, mm3
   1.733 +                 pand mm6, mm7
   1.734 +                 pxor mm3, mm6                  ; mm3 = Mask26 without the condition lanes
   1.735 +
   1.736 +                 movq mm2, mm0
   1.737 +                 movq mm7, [I5666Pixel]
   1.738 +                 movq mm6, [I5556Pixel]
   1.739 +                 movq mm5, [I56Pixel]
   1.740 +
   1.741 +
   1.742 +                 por mm2, mm4
   1.743 +                 pand mm4, [eax+ebx+color5]     ; Mask35 lanes take color5
   1.744 +                 por mm2, mm3
   1.745 +                 pand mm3, [eax+ebx+color6]     ; Mask26 lanes take color6
   1.746 +                 por mm2, mm1
   1.747 +                 pand mm0, mm7                  ; first-condition lanes take I5666Pixel
   1.748 +                 pand mm1, mm6                  ; second-condition lanes take I5556Pixel
   1.749 +                 pxor mm7, mm7
   1.750 +                 pcmpeqw mm2, mm7               ; invert: lanes claimed by nothing above
   1.751 +                 por mm0, mm1
   1.752 +                 por mm3, mm4
   1.753 +                 pand mm2, mm5                  ; unclaimed lanes take I56Pixel
   1.754 +                 por mm0, mm3
   1.755 +                 por mm0, mm2
   1.756 +                 movq [final1b], mm0
   1.757 +                 
   1.758 +          ;--- interleave the results and store both output rows ---
   1.759 +
   1.760 +                 movq mm0, [final1a]
   1.761 +                 movq mm4, [final2a]
   1.762 +                 movq mm2, [final1b]
   1.763 +                 movq mm6, [final2b]
   1.764 +
   1.765 +
   1.766 +                 movq mm1, mm0
   1.767 +                 movq mm5, mm4
   1.768 +
   1.769 +
   1.770 +         punpcklwd mm0, mm2             ; first output row: 1a,1b,1a,1b (low two lanes)
   1.771 +         punpckhwd mm1, mm2             ; first output row: high two lanes
   1.772 +
   1.773 +         punpcklwd mm4, mm6             ; second output row: 2a,2b,2a,2b (low two lanes)
   1.774 +         punpckhwd mm5, mm6             ; second output row: high two lanes
   1.775 +
   1.776 +
   1.777 +%ifdef FAR_POINTER
   1.778 +         movq [fs:edx], mm0
   1.779 +         movq [fs:edx+8], mm1
   1.780 +         push edx
   1.781 +         add edx, [ebp+dstPitch]        ; step to the second output row
   1.782 +         movq [fs:edx], mm4
   1.783 +         movq [fs:edx+8], mm5
   1.784 +         pop edx
   1.785 +%else
   1.786 +         movq [edx], mm0
   1.787 +         movq [edx+8], mm1
   1.788 +         push edx
   1.789 +         add edx, [ebp+dstPitch]        ; step to the second output row
   1.790 +         movq [edx], mm4
   1.791 +         movq [edx+8], mm5
   1.792 +         pop edx
   1.793 +%endif
   1.794 +.SKIP_PROCESS:                          ; reached directly when the delta check found no change (nothing is stored)
   1.795 +         mov ecx, [ebp+deltaPtr]
   1.796 +         add ecx, 8                     ; advance the delta pointer by 4 pixels ...
   1.797 +         mov [ebp+deltaPtr], ecx        ; ... and keep it in the argument slot for the next pass
   1.798 +         add edx, 16                    ; destination advances 8 pixels (16 bytes)
   1.799 +         add eax, 8                     ; source advances 4 pixels (8 bytes)
   1.800 +
   1.801 +         pop ecx                        ; restore the remaining pixel count pushed at .Loop
   1.802 +         sub ecx, 4
   1.803 +         cmp ecx, 0
   1.804 +         jg  near .Loop                 ; loop while pixels remain (signed compare)
   1.805 +
   1.806 +; Restore registers and the caller's frame; emms clears the MMX state before returning
   1.807 +         popad
   1.808 +         mov esp, ebp
   1.809 +         pop ebp
   1.810 +         emms
   1.811 +         ret
   1.812 +
   1.813 +
   1.814 +;-------------------------------------------------------------------------
   1.815 +;-------------------------------------------------------------------------
   1.816 +;-------------------------------------------------------------------------
   1.817 +;-------------------------------------------------------------------------
   1.818 +;-------------------------------------------------------------------------
   1.819 +;-------------------------------------------------------------------------
   1.820 +;-------------------------------------------------------------------------
   1.821 +; _2xSaISuperEagleLine (__2xSaISuperEagleLine under DJGPP): MMX routine that scales one line of 16-bit pixels to two output lines of twice the width.
   1.822 +; Each loop pass handles 4 source pixels (8 bytes): it reads four consecutive source rows starting one pitch before srcPtr and, unless the delta check finds nothing changed, writes 8 pixels (16 bytes) at dst and at dst+dstPitch.
   1.823 +; Arguments are read from [ebp+srcPtr], [ebp+srcPitch], [ebp+deltaPtr], [ebp+width], [ebp+dstOffset], [ebp+dstPitch] (plus [ebp+dstSegment] with FAR_POINTER); those offsets and the colorXX byte offsets are defined elsewhere in the file.
   1.824 +%ifdef __DJGPP__
   1.825 +__2xSaISuperEagleLine:
   1.826 +%else
   1.827 +_2xSaISuperEagleLine:
   1.828 +%endif
   1.829 +; Store some stuff: establish ebp as the base for the argument accesses and save all general registers
   1.830 +         push ebp
   1.831 +         mov ebp, esp
   1.832 +         pushad                            ; matched by the popad in the epilogue
   1.833 +
   1.834 +; Prepare the destination
   1.835 +%ifdef FAR_POINTER
   1.836 +         ; Set the selector
   1.837 +         mov eax, [ebp+dstSegment]
   1.838 +         mov fs, ax                        ; destination stores below go through fs in this build
   1.839 +%endif
   1.840 +         mov edx, [ebp+dstOffset]         ; edx points to the screen
   1.841 +; Prepare the source
   1.842 +         ; eax points to colorA
   1.843 +         mov eax, [ebp+srcPtr]
   1.844 +         mov ebx, [ebp+srcPitch]           ; ebx = row pitch in bytes; also used to step through the delta buffer rows
   1.845 +         mov ecx, [ebp+width]              ; ecx = loop counter, decremented by 4 per pass
   1.846 +         ; eax now points to colorB1
   1.847 +         sub eax, ebx                      ; back up one row: rows are then eax, eax+ebx, eax+2*ebx, eax+3*ebx
   1.848 +
   1.849 +; Main Loop
   1.850 +.Loop:   push ecx                          ; save the counter; ecx is reused as scratch until the pop near the loop end
   1.851 +
   1.852 +         ;-----Check Delta------------------
   1.853 +         mov ecx, [ebp+deltaPtr]           ; ecx = current position in the delta buffer
   1.854 +; Load eight 4-pixel groups, two from each of the four source rows
   1.855 +         movq mm0, [eax+colorB0]
   1.856 +         movq mm1, [eax+colorB3]
   1.857 +         movq mm2, [eax+ebx+color4]
   1.858 +         movq mm3, [eax+ebx+colorS2]
   1.859 +         movq mm4, [eax+ebx+ebx+color1]
   1.860 +         movq mm5, [eax+ebx+ebx+colorS1]
   1.861 +         push eax                          ; eax is advanced one row temporarily to reach the fourth row
   1.862 +         add eax, ebx
   1.863 +         movq mm6, [eax+ebx+ebx+colorA0]
   1.864 +         movq mm7, [eax+ebx+ebx+colorA3]
   1.865 +         pop eax
   1.866 +; Compare each group word-by-word with the delta buffer at the same row/column offset plus 2 bytes
   1.867 +         pcmpeqw mm0, [ecx+2+colorB0]
   1.868 +         pcmpeqw mm1, [ecx+2+colorB3]
   1.869 +         pcmpeqw mm2, [ecx+ebx+2+color4]
   1.870 +         pcmpeqw mm3, [ecx+ebx+2+colorS2]
   1.871 +         pcmpeqw mm4, [ecx+ebx+ebx+2+color1]
   1.872 +         pcmpeqw mm5, [ecx+ebx+ebx+2+colorS1]
   1.873 +         add ecx, ebx
   1.874 +         pcmpeqw mm6, [ecx+ebx+ebx+2+colorA0]
   1.875 +         pcmpeqw mm7, [ecx+ebx+ebx+2+colorA3]
   1.876 +         sub ecx, ebx                      ; undo the temporary row advance
   1.877 +
   1.878 +; AND the eight results: a word of mm0 stays FFFF only if every comparison matched in that lane
   1.879 +         pand mm0, mm1
   1.880 +         pand mm2, mm3
   1.881 +         pand mm4, mm5
   1.882 +         pand mm6, mm7
   1.883 +         pand mm0, mm2
   1.884 +         pand mm4, mm6
   1.885 +         pxor mm7, mm7
   1.886 +         pand mm0, mm4
   1.887 +         movq mm6, [eax+colorB0]
   1.888 +         pcmpeqw mm7, mm0                  ; mm7 = FFFF in lanes where at least one comparison differed
   1.889 +
   1.890 +         movq [ecx+2+colorB0], mm6         ; copy the first-row colorB0 group into the delta buffer (the only delta store in this routine)
   1.891 +
   1.892 +         packsswb mm7, mm7                 ; squeeze the four word flags into four bytes
   1.893 +         movd ecx, mm7                     ; ecx = the four lane flags
   1.894 +         test ecx, ecx
   1.895 +         jz near .SKIP_PROCESS             ; nothing differed in any lane: leave the destination untouched
   1.896 +
   1.897 +         ;End Delta
   1.898 +
   1.899 +         ;---------------------------------
   1.900 +         movq mm0, [eax+ebx+color5]
   1.901 +         movq mm1, [eax+ebx+color6]
   1.902 +         movq mm2, mm0
   1.903 +         movq mm3, mm1
   1.904 +         movq mm4, mm0                     ; mm4 keeps color5 for the product1a blend below
   1.905 +         movq mm5, mm1                     ; mm5 keeps color6 for the product1b blend below
   1.906 +; Blend pattern used throughout: ((a & colorMask) >> 1) + ((b & colorMask) >> 1) + (a & b & lowPixelMask); the masks are defined elsewhere (presumably they guard the per-channel low bits -- confirm)
   1.907 +         pand mm0, [colorMask]
   1.908 +         pand mm1, [colorMask]
   1.909 +
   1.910 +         psrlw mm0, 1
   1.911 +         psrlw mm1, 1
   1.912 +
   1.913 +         pand mm3, [lowPixelMask]
   1.914 +         paddw mm0, mm1
   1.915 +
   1.916 +         pand mm3, mm2
   1.917 +         paddw mm0, mm3                ;mm0 contains the interpolated values
   1.918 +         movq [I56Pixel], mm0              ; I56Pixel = blend(color5, color6)
   1.919 +         movq mm7, mm0
   1.920 +
   1.921 +         ;-------------------
   1.922 +         movq mm0, mm7
   1.923 +         movq mm1, mm4  ;5,5,5,6
   1.924 +         movq mm2, mm0
   1.925 +         movq mm3, mm1
   1.926 +
   1.927 +         pand mm0, [colorMask]
   1.928 +         pand mm1, [colorMask]
   1.929 +
   1.930 +         psrlw mm0, 1
   1.931 +         psrlw mm1, 1
   1.932 +
   1.933 +         pand mm3, [lowPixelMask]
   1.934 +         paddw mm0, mm1
   1.935 +
   1.936 +         pand mm3, mm2
   1.937 +         paddw mm0, mm3                ;mm0 contains the interpolated values
   1.938 +         movq [product1a], mm0             ; product1a = blend(I56Pixel, color5)
   1.939 +         ;--------------------
   1.940 +
   1.941 +         movq mm0, mm7
   1.942 +         movq mm1, mm5  ;6,6,6,5
   1.943 +         movq mm2, mm0
   1.944 +         movq mm3, mm1
   1.945 +
   1.946 +         pand mm0, [colorMask]
   1.947 +         pand mm1, [colorMask]
   1.948 +
   1.949 +         psrlw mm0, 1
   1.950 +         psrlw mm1, 1
   1.951 +
   1.952 +         pand mm3, [lowPixelMask]
   1.953 +         paddw mm0, mm1
   1.954 +
   1.955 +         pand mm3, mm2
   1.956 +         paddw mm0, mm3
   1.957 +         movq [product1b], mm0             ; product1b = blend(I56Pixel, color6)
   1.958 +
   1.959 +         ;-------------------------
   1.960 +         ;-------------------------
   1.961 +         movq mm0, [eax+ebx+ebx+color2]
   1.962 +         movq mm1, [eax+ebx+ebx+color3]
   1.963 +         movq mm2, mm0
   1.964 +         movq mm3, mm1
   1.965 +         movq mm4, mm0                     ; mm4 keeps color2 for the product2a blend below
   1.966 +         movq mm5, mm1                     ; mm5 keeps color3 for the product2b blend below
   1.967 +
   1.968 +         pand mm0, [colorMask]
   1.969 +         pand mm1, [colorMask]
   1.970 +
   1.971 +         psrlw mm0, 1
   1.972 +         psrlw mm1, 1
   1.973 +
   1.974 +         pand mm3, [lowPixelMask]
   1.975 +         paddw mm0, mm1
   1.976 +
   1.977 +         pand mm3, mm2
   1.978 +         paddw mm0, mm3
   1.979 +         movq [I23Pixel], mm0              ; I23Pixel = blend(color2, color3)
   1.980 +         movq mm7, mm0
   1.981 +
   1.982 +         ;---------------------
   1.983 +         movq mm0, mm7
   1.984 +         movq mm1, mm4  ;2,2,2,3
   1.985 +         movq mm2, mm0
   1.986 +         movq mm3, mm1
   1.987 +
   1.988 +         pand mm0, [colorMask]
   1.989 +         pand mm1, [colorMask]
   1.990 +
   1.991 +         psrlw mm0, 1
   1.992 +         psrlw mm1, 1
   1.993 +
   1.994 +         pand mm3, [lowPixelMask]
   1.995 +         paddw mm0, mm1
   1.996 +
   1.997 +         pand mm3, mm2
   1.998 +         paddw mm0, mm3
   1.999 +         movq [product2a], mm0             ; product2a = blend(I23Pixel, color2)
  1.1000 +
  1.1001 +         ;----------------------
  1.1002 +         movq mm0, mm7
  1.1003 +         movq mm1, mm5  ;3,3,3,2
  1.1004 +         movq mm2, mm0
  1.1005 +         movq mm3, mm1
  1.1006 +
  1.1007 +         pand mm0, [colorMask]
  1.1008 +         pand mm1, [colorMask]
  1.1009 +
  1.1010 +         psrlw mm0, 1
  1.1011 +         psrlw mm1, 1
  1.1012 +
  1.1013 +         pand mm3, [lowPixelMask]
  1.1014 +         paddw mm0, mm1
  1.1015 +
  1.1016 +         pand mm3, mm2
  1.1017 +         paddw mm0, mm3
  1.1018 +         movq [product2b], mm0             ; product2b = blend(I23Pixel, color3)
  1.1019 +
  1.1020 +
  1.1021 +         ;////////////////////////////////
  1.1022 +         ; Decide which "branch" to take
  1.1023 +         ;--------------------------------
  1.1024 +         movq mm4, [eax+ebx+color5]        ; mm4..mm7 = color5, color6, color3, color2; they stay live until the "guess" test below
  1.1025 +         movq mm5, [eax+ebx+color6]
  1.1026 +         movq mm6, [eax+ebx+ebx+color3]
  1.1027 +         movq mm7, [eax+ebx+ebx+color2]
  1.1028 +
  1.1029 +         pxor mm3, mm3
  1.1030 +         movq mm0, mm4
  1.1031 +         movq mm1, mm5
  1.1032 +
  1.1033 +         pcmpeqw mm0, mm6
  1.1034 +         pcmpeqw mm1, mm7
  1.1035 +         pcmpeqw mm1, mm3                  ; comparing a mask with zero inverts it: mm1 = (color6 != color2)
  1.1036 +         pand mm0, mm1
  1.1037 +         movq [Mask35], mm0                ; Mask35 = (color5 == color3) && (color6 != color2)
  1.1038 +
  1.1039 +         movq mm0, [eax+ebx+ebx+colorS1]
  1.1040 +         movq mm1, [eax+ebx+color4]
  1.1041 +         push eax
  1.1042 +         add eax, ebx
  1.1043 +         movq mm2, [eax+ebx+ebx+colorA2]   ; fourth row, reached through the temporarily advanced eax
  1.1044 +         pop eax
  1.1045 +         movq mm3, [eax+colorB1]
  1.1046 +         pcmpeqw mm0, mm4
  1.1047 +         pcmpeqw mm1, mm4
  1.1048 +         pcmpeqw mm2, mm4
  1.1049 +         pcmpeqw mm3, mm4
  1.1050 +         pand mm0, mm1
  1.1051 +         pand mm2, mm3
  1.1052 +         por mm0, mm2
  1.1053 +         pand mm0, [Mask35]
  1.1054 +         movq [Mask35b], mm0               ; Mask35b = Mask35 && ((colorS1 == color5 && color4 == color5) || (colorA2 == color5 && colorB1 == color5))
  1.1055 +
  1.1056 +         ;-----------
  1.1057 +         pxor mm3, mm3
  1.1058 +         movq mm0, mm4
  1.1059 +         movq mm1, mm5
  1.1060 +
  1.1061 +         pcmpeqw mm0, mm6
  1.1062 +         pcmpeqw mm1, mm7
  1.1063 +         pcmpeqw mm0, mm3                  ; mm0 = (color5 != color3)
  1.1064 +         pand mm0, mm1
  1.1065 +         movq [Mask26], mm0                ; Mask26 = (color5 != color3) && (color6 == color2)
  1.1066 +
  1.1067 +         movq mm0, [eax+ebx+ebx+color1]
  1.1068 +         movq mm1, [eax+ebx+colorS2]
  1.1069 +         push eax
  1.1070 +         add eax, ebx
  1.1071 +         movq mm2, [eax+ebx+ebx+colorA1]   ; fourth row again
  1.1072 +         pop eax
  1.1073 +         movq mm3, [eax+colorB2]
  1.1074 +         pcmpeqw mm0, mm5
  1.1075 +         pcmpeqw mm1, mm5
  1.1076 +         pcmpeqw mm2, mm5
  1.1077 +         pcmpeqw mm3, mm5
  1.1078 +         pand mm0, mm1
  1.1079 +         pand mm2, mm3
  1.1080 +         por mm0, mm2
  1.1081 +         pand mm0, [Mask26]
  1.1082 +         movq [Mask26b], mm0               ; Mask26b = Mask26 && ((color1 == color6 && colorS2 == color6) || (colorA1 == color6 && colorB2 == color6))
  1.1083 +
  1.1084 +         ;--------------------
  1.1085 +         movq mm0, mm4
  1.1086 +         movq mm1, mm5
  1.1087 +         movq mm2, mm0
  1.1088 +
  1.1089 +         pcmpeqw mm2, mm1
  1.1090 +         pcmpeqw mm0, mm6
  1.1091 +         pcmpeqw mm1, mm7
  1.1092 +         pand mm0, mm1
  1.1093 +         pand mm2, mm0
  1.1094 +         pxor mm0, mm2                     ; mm0 = (color5 == color3) && (color6 == color2) && (color5 != color6): lanes that need the neighbour vote
  1.1095 +         movq mm7, mm0
  1.1096 +
  1.1097 +         ;------------------
  1.1098 +         packsswb mm7, mm7
  1.1099 +         movd ecx, mm7
  1.1100 +         test ecx, ecx
  1.1101 +         jz near .SKIP_GUESS               ; no lane needs the vote
  1.1102 +
  1.1103 +;---------------------------------------------
  1.1104 +; Map of the pixels:                    I|E F|J
  1.1105 +;                                       G|A B|K
  1.1106 +;                                       H|C D|L
  1.1107 +;                                       M|N O|P
  1.1108 +         movq mm6, mm0
  1.1109 +         movq mm4, [eax+ebx+color5]
  1.1110 +         movq mm5, [eax+ebx+color6]
  1.1111 +         pxor mm7, mm7                     ; mm7 = per-lane vote accumulator, starts at 0
  1.1112 +         pand mm6, [ONE]                   ; mm6 = ONE in voting lanes, 0 elsewhere (ONE is defined elsewhere; presumably 1 per word -- confirm)
  1.1113 +; Vote 1 of 4, neighbour pair colorB1 / color4. The same sequence repeats for the three other pairs below.
  1.1114 +         movq mm0, [eax+colorB1]
  1.1115 +         movq mm1, [eax+ebx+color4]
  1.1116 +         movq mm2, mm0
  1.1117 +         movq mm3, mm1
  1.1118 +         pcmpeqw mm0, mm4
  1.1119 +         pcmpeqw mm1, mm4
  1.1120 +         pcmpeqw mm2, mm5
  1.1121 +         pcmpeqw mm3, mm5
  1.1122 +         pand mm0, mm6
  1.1123 +         pand mm1, mm6
  1.1124 +         pand mm2, mm6
  1.1125 +         pand mm3, mm6
  1.1126 +         paddw mm0, mm1                    ; mm0 = how many of the pair equal color5 (voting lanes only)
  1.1127 +         paddw mm2, mm3                    ; mm2 = how many of the pair equal color6 (voting lanes only)
  1.1128 +
  1.1129 +         pxor mm3, mm3
  1.1130 +         pcmpgtw mm0, mm6                  ; set where the color5 count exceeds mm6
  1.1131 +         pcmpgtw mm2, mm6                  ; set where the color6 count exceeds mm6
  1.1132 +         pcmpeqw mm0, mm3                  ; invert: set where the count does NOT exceed mm6
  1.1133 +         pcmpeqw mm2, mm3
  1.1134 +         pand mm0, mm6
  1.1135 +         pand mm2, mm6
  1.1136 +         paddw mm7, mm0                    ; add the color5-side term to the vote
  1.1137 +         psubw mm7, mm2                    ; subtract the color6-side term from the vote
  1.1138 +; Vote 2 of 4, neighbour pair colorB2 / colorS2
  1.1139 +         movq mm0, [eax+colorB2]
  1.1140 +         movq mm1, [eax+ebx+colorS2]
  1.1141 +         movq mm2, mm0
  1.1142 +         movq mm3, mm1
  1.1143 +         pcmpeqw mm0, mm4
  1.1144 +         pcmpeqw mm1, mm4
  1.1145 +         pcmpeqw mm2, mm5
  1.1146 +         pcmpeqw mm3, mm5
  1.1147 +         pand mm0, mm6
  1.1148 +         pand mm1, mm6
  1.1149 +         pand mm2, mm6
  1.1150 +         pand mm3, mm6
  1.1151 +         paddw mm0, mm1
  1.1152 +         paddw mm2, mm3
  1.1153 +
  1.1154 +         pxor mm3, mm3
  1.1155 +         pcmpgtw mm0, mm6
  1.1156 +         pcmpgtw mm2, mm6
  1.1157 +         pcmpeqw mm0, mm3
  1.1158 +         pcmpeqw mm2, mm3
  1.1159 +         pand mm0, mm6
  1.1160 +         pand mm2, mm6
  1.1161 +         paddw mm7, mm0
  1.1162 +         psubw mm7, mm2
  1.1163 +; Vote 3 of 4, neighbour pair color1 / colorA1 (eax is advanced one row until the pop below)
  1.1164 +         push eax
  1.1165 +         add eax, ebx
  1.1166 +         movq mm0, [eax+ebx+color1]
  1.1167 +         movq mm1, [eax+ebx+ebx+colorA1]
  1.1168 +         movq mm2, mm0
  1.1169 +         movq mm3, mm1
  1.1170 +         pcmpeqw mm0, mm4
  1.1171 +         pcmpeqw mm1, mm4
  1.1172 +         pcmpeqw mm2, mm5
  1.1173 +         pcmpeqw mm3, mm5
  1.1174 +         pand mm0, mm6
  1.1175 +         pand mm1, mm6
  1.1176 +         pand mm2, mm6
  1.1177 +         pand mm3, mm6
  1.1178 +         paddw mm0, mm1
  1.1179 +         paddw mm2, mm3
  1.1180 +
  1.1181 +         pxor mm3, mm3
  1.1182 +         pcmpgtw mm0, mm6
  1.1183 +         pcmpgtw mm2, mm6
  1.1184 +         pcmpeqw mm0, mm3
  1.1185 +         pcmpeqw mm2, mm3
  1.1186 +         pand mm0, mm6
  1.1187 +         pand mm2, mm6
  1.1188 +         paddw mm7, mm0
  1.1189 +         psubw mm7, mm2
  1.1190 +; Vote 4 of 4, neighbour pair colorS1 / colorA2 (eax still advanced one row)
  1.1191 +         movq mm0, [eax+ebx+colorS1]
  1.1192 +         movq mm1, [eax+ebx+ebx+colorA2]
  1.1193 +         movq mm2, mm0
  1.1194 +         movq mm3, mm1
  1.1195 +         pcmpeqw mm0, mm4
  1.1196 +         pcmpeqw mm1, mm4
  1.1197 +         pcmpeqw mm2, mm5
  1.1198 +         pcmpeqw mm3, mm5
  1.1199 +         pand mm0, mm6
  1.1200 +         pand mm1, mm6
  1.1201 +         pand mm2, mm6
  1.1202 +         pand mm3, mm6
  1.1203 +         paddw mm0, mm1
  1.1204 +         paddw mm2, mm3
  1.1205 +
  1.1206 +         pxor mm3, mm3
  1.1207 +         pcmpgtw mm0, mm6
  1.1208 +         pcmpgtw mm2, mm6
  1.1209 +         pcmpeqw mm0, mm3
  1.1210 +         pcmpeqw mm2, mm3
  1.1211 +         pand mm0, mm6
  1.1212 +         pand mm2, mm6
  1.1213 +         paddw mm7, mm0
  1.1214 +         psubw mm7, mm2
  1.1215 +
  1.1216 +         pop eax                           ; eax back to the first of the four rows
  1.1217 +         movq mm1, mm7
  1.1218 +         pxor mm0, mm0
  1.1219 +         pcmpgtw mm7, mm0                  ; mm7 = lanes whose vote total is positive
  1.1220 +         pcmpgtw mm0, mm1                  ; mm0 = lanes whose vote total is negative
  1.1221 +
  1.1222 +         por mm7, [Mask35]                 ; positive votes are merged into Mask35
  1.1223 +         por mm0, [Mask26]                 ; negative votes are merged into Mask26
  1.1224 +         movq [Mask35], mm7
  1.1225 +         movq [Mask26], mm0
  1.1226 +
  1.1227 +.SKIP_GUESS:
  1.1228 +         ;Start the ASSEMBLY !!!
  1.1229 +
  1.1230 +         movq mm4, [Mask35]
  1.1231 +         movq mm5, [Mask26]
  1.1232 +         movq mm6, [Mask35b]
  1.1233 +         movq mm7, [Mask26b]
  1.1234 +
  1.1235 +         movq mm0, [eax+ebx+color5]
  1.1236 +         movq mm1, [eax+ebx+color6]
  1.1237 +         movq mm2, [eax+ebx+ebx+color2]
  1.1238 +         movq mm3, [eax+ebx+ebx+color3]
  1.1239 +         pcmpeqw mm0, mm2
  1.1240 +         pcmpeqw mm1, mm3
  1.1241 +         movq mm2, mm4
  1.1242 +         movq mm3, mm5
  1.1243 +         por mm0, mm1                      ; mm0 = (color5 == color2) || (color6 == color3)
  1.1244 +         por mm2, mm3
  1.1245 +         pand mm2, mm0
  1.1246 +         pxor mm0, mm2                     ; ...restricted to lanes in neither Mask35 nor Mask26
  1.1247 +         movq mm3, mm0                     ; mm3 = "copy the source pixel unchanged" mask
  1.1248 +
  1.1249 +         movq mm2, mm0
  1.1250 +         pxor mm0, mm0
  1.1251 +         por mm2, mm4
  1.1252 +         pxor mm4, mm6                     ; mm4 = Mask35 with the Mask35b lanes removed
  1.1253 +         por mm2, mm5
  1.1254 +         pxor mm5, mm7                     ; mm5 = Mask26 with the Mask26b lanes removed
  1.1255 +         pcmpeqw mm2, mm0                  ; mm2 = lanes covered by none of mm3 / Mask35 / Mask26
  1.1256 +         ;----------------
  1.1257 +; final1a = color5 where (mm3|mm4|mm6), I56Pixel where mm5, product1b where mm7, product1a where mm2
  1.1258 +         movq mm0, [eax+ebx+color5]
  1.1259 +         movq mm1, mm3
  1.1260 +         por mm1, mm4
  1.1261 +         por mm1, mm6
  1.1262 +         pand mm0, mm1
  1.1263 +         movq mm1, mm5
  1.1264 +         pand mm1, [I56Pixel]
  1.1265 +         por mm0, mm1
  1.1266 +         movq mm1, mm7
  1.1267 +         pand mm1, [product1b]
  1.1268 +         por mm0, mm1
  1.1269 +         movq mm1, mm2
  1.1270 +         pand mm1, [product1a]
  1.1271 +         por mm0, mm1
  1.1272 +         movq [final1a], mm0
  1.1273 +; final1b = color6 where (mm3|mm5|mm7), I56Pixel where mm4, product1a where mm6, product1b where mm2
  1.1274 +         movq mm0, [eax+ebx+color6]
  1.1275 +         movq mm1, mm3
  1.1276 +         por mm1, mm5
  1.1277 +         por mm1, mm7
  1.1278 +         pand mm0, mm1
  1.1279 +         movq mm1, mm4
  1.1280 +         pand mm1, [I56Pixel]
  1.1281 +         por mm0, mm1
  1.1282 +         movq mm1, mm6
  1.1283 +         pand mm1, [product1a]
  1.1284 +         por mm0, mm1
  1.1285 +         movq mm1, mm2
  1.1286 +         pand mm1, [product1b]
  1.1287 +         por mm0, mm1
  1.1288 +         movq [final1b], mm0
  1.1289 +; final2a = color2 where (mm3|mm5|mm7), I23Pixel where mm4, product2b where mm6, product2a where mm2
  1.1290 +         movq mm0, [eax+ebx+ebx+color2]
  1.1291 +         movq mm1, mm3
  1.1292 +         por mm1, mm5
  1.1293 +         por mm1, mm7
  1.1294 +         pand mm0, mm1
  1.1295 +         movq mm1, mm4
  1.1296 +         pand mm1, [I23Pixel]
  1.1297 +         por mm0, mm1
  1.1298 +         movq mm1, mm6
  1.1299 +         pand mm1, [product2b]
  1.1300 +         por mm0, mm1
  1.1301 +         movq mm1, mm2
  1.1302 +         pand mm1, [product2a]
  1.1303 +         por mm0, mm1
  1.1304 +         movq [final2a], mm0
  1.1305 +; final2b = color3 where (mm3|mm4|mm6), I23Pixel where mm5, product2a where mm7, product2b where mm2
  1.1306 +         movq mm0, [eax+ebx+ebx+color3]
  1.1307 +         movq mm1, mm3
  1.1308 +         por mm1, mm4
  1.1309 +         por mm1, mm6
  1.1310 +         pand mm0, mm1
  1.1311 +         movq mm1, mm5
  1.1312 +         pand mm1, [I23Pixel]
  1.1313 +         por mm0, mm1
  1.1314 +         movq mm1, mm7
  1.1315 +         pand mm1, [product2a]
  1.1316 +         por mm0, mm1
  1.1317 +         movq mm1, mm2
  1.1318 +         pand mm1, [product2b]
  1.1319 +         por mm0, mm1
  1.1320 +         movq [final2b], mm0
  1.1321 +
  1.1322 +; Interleave the a/b results word by word so each output row alternates a0,b0,a1,b1,...
  1.1323 +         movq mm0, [final1a]
  1.1324 +         movq mm2, [final1b]
  1.1325 +         movq mm1, mm0
  1.1326 +         movq mm4, [final2a]
  1.1327 +         movq mm6, [final2b]
  1.1328 +         movq mm5, mm4
  1.1329 +         punpcklwd mm0, mm2                ; first output row, pixels 0-3
  1.1330 +         punpckhwd mm1, mm2                ; first output row, pixels 4-7
  1.1331 +         punpcklwd mm4, mm6                ; second output row, pixels 0-3
  1.1332 +         punpckhwd mm5, mm6                ; second output row, pixels 4-7
  1.1333 +
  1.1334 +
  1.1335 +
  1.1336 +; Store 16 bytes at the destination and 16 bytes one dstPitch further on; edx itself is preserved by the push/pop
  1.1337 +%ifdef FAR_POINTER
  1.1338 +         movq [fs:edx], mm0
  1.1339 +         movq [fs:edx+8], mm1
  1.1340 +         push edx
  1.1341 +         add edx, [ebp+dstPitch]
  1.1342 +         movq [fs:edx], mm4
  1.1343 +         movq [fs:edx+8], mm5
  1.1344 +         pop edx
  1.1345 +%else
  1.1346 +         movq [edx], mm0
  1.1347 +         movq [edx+8], mm1
  1.1348 +         push edx
  1.1349 +         add edx, [ebp+dstPitch]
  1.1350 +         movq [edx], mm4
  1.1351 +         movq [edx+8], mm5
  1.1352 +         pop edx
  1.1353 +%endif
  1.1354 +.SKIP_PROCESS:
  1.1355 +         mov ecx, [ebp+deltaPtr]
  1.1356 +         add ecx, 8
  1.1357 +         mov [ebp+deltaPtr], ecx           ; the advanced delta pointer is written back to its [ebp+deltaPtr] slot
  1.1358 +         add edx, 16                       ; destination advances 8 output pixels
  1.1359 +         add eax, 8                        ; source advances 4 pixels
  1.1360 +
  1.1361 +         pop ecx                           ; recover the counter pushed at .Loop
  1.1362 +         sub ecx, 4
  1.1363 +         cmp ecx, 0
  1.1364 +         jg  near .Loop                    ; signed test: loop while the remaining count is positive
  1.1365 +
  1.1366 +; Restore some stuff
  1.1367 +         popad
  1.1368 +         mov esp, ebp
  1.1369 +         pop ebp
  1.1370 +         emms                              ; leave MMX state so the FPU is usable again after return
  1.1371 +         ret
  1.1372 +
  1.1373 +
  1.1374 +;-------------------------------------------------------------------------
  1.1375 +;-------------------------------------------------------------------------
  1.1376 +;-------------------------------------------------------------------------
  1.1377 +;-------------------------------------------------------------------------
  1.1378 +;-------------------------------------------------------------------------
  1.1379 +;-------------------------------------------------------------------------
  1.1380 +;-------------------------------------------------------------------------
  1.1381 +
  1.1382 +
  1.1383 +;This is version 0.50 -- byte offsets (2 bytes per 16-bit pixel) of the 4x4 neighbourhood columns used by the 2xSaILine routine: map rows I E F J / G A B K / H C D L / M N O P
  1.1384 +colorI   equ -2                            ; first row: one pixel to the left
  1.1385 +colorE   equ 0                             ; first row: same column
  1.1386 +colorF   equ 2                             ; first row: one pixel to the right
  1.1387 +colorJ   equ 4                             ; first row: two pixels to the right
  1.1388 +; Second row (addressed as [eax+ebx+...] by the routine below)
  1.1389 +colorG   equ -2
  1.1390 +colorA   equ 0
  1.1391 +colorB   equ 2
  1.1392 +colorK   equ 4
  1.1393 +; Third row (addressed as [eax+ebx+ebx+...])
  1.1394 +colorH   equ -2
  1.1395 +colorC   equ 0
  1.1396 +colorD   equ 2
  1.1397 +colorL   equ 4
  1.1398 +; Fourth row (reached after temporarily adding ebx to eax)
  1.1399 +colorM   equ -2
  1.1400 +colorN   equ 0
  1.1401 +colorO   equ 2
  1.1402 +colorP   equ 4
  1.1403 +
  1.1404 +%ifdef __DJGPP__
  1.1405 +__2xSaILine:
  1.1406 +%else
  1.1407 +_2xSaILine:
  1.1408 +%endif
  1.1409 +; Store some stuff
  1.1410 +         push ebp
  1.1411 +         mov ebp, esp
  1.1412 +         pushad
  1.1413 +
  1.1414 +; Prepare the destination
  1.1415 +%ifdef FAR_POINTER
  1.1416 +         ; Set the selector
  1.1417 +         mov eax, [ebp+dstSegment]
  1.1418 +         mov fs, ax
  1.1419 +%endif
  1.1420 +         mov edx, [ebp+dstOffset]         ; edx points to the screen
  1.1421 +; Prepare the source
  1.1422 +         ; eax points to colorA
  1.1423 +         mov eax, [ebp+srcPtr]
  1.1424 +         mov ebx, [ebp+srcPitch]
  1.1425 +         mov ecx, [ebp+width]
  1.1426 +         ; eax now points to colorE
  1.1427 +         sub eax, ebx
  1.1428 +
  1.1429 +
  1.1430 +; Main Loop
  1.1431 +.Loop:   push ecx
  1.1432 +
  1.1433 +         ;-----Check Delta------------------
  1.1434 +         mov ecx, [ebp+deltaPtr]
  1.1435 +
  1.1436 +         movq mm0, [eax+colorI]
  1.1437 +         movq mm1, [eax+colorJ]
  1.1438 +         movq mm2, [eax+ebx+colorG]
  1.1439 +         movq mm3, [eax+ebx+colorK]
  1.1440 +         movq mm4, [eax+ebx+ebx+colorH]
  1.1441 +         movq mm5, [eax+ebx+ebx+colorL]
  1.1442 +         push eax
  1.1443 +         add eax, ebx
  1.1444 +         movq mm6, [eax+ebx+ebx+colorM]
  1.1445 +         movq mm7, [eax+ebx+ebx+colorP]
  1.1446 +         pop eax
  1.1447 +
  1.1448 +         pcmpeqw mm0, [ecx+2+colorI]
  1.1449 +         pcmpeqw mm1, [ecx+2+colorK]
  1.1450 +         pcmpeqw mm2, [ecx+ebx+2+colorG]
  1.1451 +         pcmpeqw mm3, [ecx+ebx+2+colorK]
  1.1452 +         pcmpeqw mm4, [ecx+ebx+ebx+2+colorH]
  1.1453 +         pcmpeqw mm5, [ecx+ebx+ebx+2+colorL]
  1.1454 +         add ecx, ebx
  1.1455 +         pcmpeqw mm6, [ecx+ebx+ebx+2+colorM]
  1.1456 +         pcmpeqw mm7, [ecx+ebx+ebx+2+colorP]
  1.1457 +         sub ecx, ebx
  1.1458 +
  1.1459 +
  1.1460 +         pand mm0, mm1
  1.1461 +         pand mm2, mm3
  1.1462 +         pand mm4, mm5
  1.1463 +         pand mm6, mm7
  1.1464 +         pand mm0, mm2
  1.1465 +         pand mm4, mm6
  1.1466 +         pxor mm7, mm7
  1.1467 +         pand mm0, mm4
  1.1468 +         movq mm6, [eax+colorI]
  1.1469 +         pcmpeqw mm7, mm0
  1.1470 +
  1.1471 +         movq [ecx+2+colorI], mm6
  1.1472 +
  1.1473 +         packsswb mm7, mm7
  1.1474 +         movd ecx, mm7
  1.1475 +         test ecx, ecx
  1.1476 +         jz near .SKIP_PROCESS
  1.1477 +
  1.1478 +         ;End Delta
  1.1479 +
  1.1480 +         ;---------------------------------
  1.1481 +
  1.1482 +
  1.1483 +;1
  1.1484 +         ;if ((colorA == colorD) && (colorB != colorC) && (colorA == colorE) && (colorB == colorL)
  1.1485 +         movq mm0, [eax+ebx+colorA]        ;mm0 and mm1 contain colorA
  1.1486 +         movq mm2, [eax+ebx+colorB]        ;mm2 and mm3 contain colorB
  1.1487 +
  1.1488 +         movq mm1, mm0
  1.1489 +         movq mm3, mm2
  1.1490 +
  1.1491 +         pcmpeqw mm0, [eax+ebx+ebx+colorD]
  1.1492 +         pcmpeqw mm1, [eax+colorE]
  1.1493 +         pcmpeqw mm2, [eax+ebx+ebx+colorL]
  1.1494 +         pcmpeqw mm3, [eax+ebx+ebx+colorC]
  1.1495 +
  1.1496 +         pand mm0, mm1
  1.1497 +         pxor mm1, mm1
  1.1498 +         pand mm0, mm2
  1.1499 +         pcmpeqw mm3, mm1
  1.1500 +         pand mm0, mm3                 ;result in mm0
  1.1501 +
  1.1502 +         ;if ((colorA == colorC) && (colorB != colorE) && (colorA == colorF) && (colorB == colorJ)
  1.1503 +         movq mm4, [eax+ebx+colorA]        ;mm4 and mm5 contain colorA
  1.1504 +         movq mm6, [eax+ebx+colorB]        ;mm6 and mm7 contain colorB
  1.1505 +         movq mm5, mm4
  1.1506 +         movq mm7, mm6
  1.1507 +
  1.1508 +         pcmpeqw mm4, [eax+ebx+ebx+colorC]
  1.1509 +         pcmpeqw mm5, [eax+colorF]
  1.1510 +         pcmpeqw mm6, [eax+colorJ]
  1.1511 +         pcmpeqw mm7, [eax+colorE]
  1.1512 +
  1.1513 +         pand mm4, mm5
  1.1514 +         pxor mm5, mm5
  1.1515 +         pand mm4, mm6
  1.1516 +         pcmpeqw mm7, mm5
  1.1517 +         pand mm4, mm7                 ;result in mm4
  1.1518 +
  1.1519 +         por mm0, mm4                  ;combine the masks
  1.1520 +         movq [Mask1], mm0
  1.1521 +
  1.1522 +         ;--------------------------------------------
  1.1523 +
  1.1524 +;2
  1.1525 +         ;if ((colorB == colorC) && (colorA != colorD) && (colorB == colorF) && (colorA == colorH)
  1.1526 +         movq mm0, [eax+ebx+colorB]        ;mm0 and mm1 contain colorB
  1.1527 +         movq mm2, [eax+ebx+colorA]        ;mm2 and mm3 contain colorA
  1.1528 +         movq mm1, mm0
  1.1529 +         movq mm3, mm2
  1.1530 +
  1.1531 +         pcmpeqw mm0, [eax+ebx+ebx+colorC]
  1.1532 +         pcmpeqw mm1, [eax+colorF]
  1.1533 +         pcmpeqw mm2, [eax+ebx+ebx+colorH]
  1.1534 +         pcmpeqw mm3, [eax+ebx+ebx+colorD]
  1.1535 +
  1.1536 +         pand mm0, mm1
  1.1537 +         pxor mm1, mm1
  1.1538 +         pand mm0, mm2
  1.1539 +         pcmpeqw mm3, mm1
  1.1540 +         pand mm0, mm3                 ;result in mm0
  1.1541 +
  1.1542 +         ;if ((colorB == colorE) && (colorB == colorD) && (colorA != colorF) && (colorA == colorI)
  1.1543 +         movq mm4, [eax+ebx+colorB]        ;mm4 and mm5 contain colorB
  1.1544 +         movq mm6, [eax+ebx+colorA]        ;mm6 and mm7 contain colorA
  1.1545 +         movq mm5, mm4
  1.1546 +         movq mm7, mm6
  1.1547 +
  1.1548 +         pcmpeqw mm4, [eax+ebx+ebx+colorD]
  1.1549 +         pcmpeqw mm5, [eax+colorE]
  1.1550 +         pcmpeqw mm6, [eax+colorI]
  1.1551 +         pcmpeqw mm7, [eax+colorF]
  1.1552 +
  1.1553 +         pand mm4, mm5
  1.1554 +         pxor mm5, mm5
  1.1555 +         pand mm4, mm6
  1.1556 +         pcmpeqw mm7, mm5
  1.1557 +         pand mm4, mm7                 ;result in mm4
  1.1558 +
  1.1559 +         por mm0, mm4                  ;combine the masks
  1.1560 +         movq [Mask2], mm0
  1.1561 +
  1.1562 +
  1.1563 +;interpolate colorA and colorB
  1.1564 +         movq mm0, [eax+ebx+colorA]
  1.1565 +         movq mm1, [eax+ebx+colorB]
  1.1566 +
  1.1567 +         movq mm2, mm0
  1.1568 +         movq mm3, mm1
  1.1569 +
  1.1570 +         pand mm0, [colorMask]
  1.1571 +         pand mm1, [colorMask]
  1.1572 +
  1.1573 +         psrlw mm0, 1
  1.1574 +         psrlw mm1, 1
  1.1575 +
  1.1576 +         pand mm3, [lowPixelMask]
  1.1577 +         paddw mm0, mm1
  1.1578 +
  1.1579 +         pand mm3, mm2
  1.1580 +         paddw mm0, mm3                ;mm0 contains the interpolated values
  1.1581 +
  1.1582 +         ;assemble the pixels
  1.1583 +         movq mm1, [eax+ebx+colorA]
  1.1584 +         movq mm2, [eax+ebx+colorB]
  1.1585 +
  1.1586 +         movq mm3, [Mask1]
  1.1587 +         movq mm5, mm1
  1.1588 +         movq mm4, [Mask2]
  1.1589 +         movq mm6, mm1
  1.1590 +
  1.1591 +         pand mm1, mm3
  1.1592 +         por mm3, mm4
  1.1593 +         pxor mm7, mm7
  1.1594 +         pand mm2, mm4
  1.1595 +
  1.1596 +         pcmpeqw mm3, mm7
  1.1597 +         por mm1, mm2
  1.1598 +         pand mm0, mm3
  1.1599 +
  1.1600 +         por mm0, mm1
  1.1601 +
  1.1602 +         punpcklwd mm5, mm0
  1.1603 +         punpckhwd mm6, mm0
  1.1604 +
  1.1605 +%ifdef FAR_POINTER
  1.1606 +         movq [fs:edx], mm5
  1.1607 +         movq [fs:edx+8], mm6
  1.1608 +%else
  1.1609 +         movq [edx], mm5
  1.1610 +         movq [edx+8], mm6
  1.1611 +%endif
  1.1612 +
  1.1613 +;------------------------------------------------
  1.1614 +;        Create the Nextline
  1.1615 +;------------------------------------------------
  1.1616 +;3       ;if ((colorA == colorD) && (colorB != colorC) && (colorA == colorG) && (colorC == colorO)
  1.1617 +         movq mm0, [eax+ebx+colorA]        ;mm0 and mm1 contain colorA
  1.1618 +         movq mm2, [eax+ebx+ebx+colorC]        ;mm2 and mm3 contain colorC
  1.1619 +         movq mm1, mm0
  1.1620 +         movq mm3, mm2
  1.1621 +
  1.1622 +         push eax
  1.1623 +         add eax, ebx
  1.1624 +         pcmpeqw mm0, [eax+ebx+colorD]
  1.1625 +         pcmpeqw mm1, [eax+colorG]
  1.1626 +         pcmpeqw mm2, [eax+ebx+ebx+colorO]
  1.1627 +         pcmpeqw mm3, [eax+colorB]
  1.1628 +         pop eax
  1.1629 +
  1.1630 +         pand mm0, mm1
  1.1631 +         pxor mm1, mm1
  1.1632 +         pand mm0, mm2
  1.1633 +         pcmpeqw mm3, mm1
  1.1634 +         pand mm0, mm3                 ;result in mm0
  1.1635 +
  1.1636 +         ;if ((colorA == colorB) && (colorG != colorC) && (colorA == colorH) && (colorC == colorM)
  1.1637 +         movq mm4, [eax+ebx+colorA]        ;mm4 and mm5 contain colorA
  1.1638 +         movq mm6, [eax+ebx+ebx+colorC]        ;mm6 and mm7 contain colorC
  1.1639 +         movq mm5, mm4
  1.1640 +         movq mm7, mm6
  1.1641 +
  1.1642 +         push eax
  1.1643 +         add eax, ebx
  1.1644 +         pcmpeqw mm4, [eax+ebx+colorH]
  1.1645 +         pcmpeqw mm5, [eax+colorB]
  1.1646 +         pcmpeqw mm6, [eax+ebx+ebx+colorM]
  1.1647 +         pcmpeqw mm7, [eax+colorG]
  1.1648 +         pop eax
  1.1649 +
  1.1650 +         pand mm4, mm5
  1.1651 +         pxor mm5, mm5
  1.1652 +         pand mm4, mm6
  1.1653 +         pcmpeqw mm7, mm5
  1.1654 +         pand mm4, mm7                 ;result in mm4
  1.1655 +
  1.1656 +         por mm0, mm4                  ;combine the masks
  1.1657 +         movq [Mask1], mm0
  1.1658 +         ;--------------------------------------------
  1.1659 +
  1.1660 +;4
  1.1661 +         ;if ((colorB == colorC) && (colorA != colorD) && (colorC == colorH) && (colorA == colorF)
  1.1662 +         movq mm0, [eax+ebx+ebx+colorC]        ;mm0 and mm1 contain colorC
  1.1663 +         movq mm2, [eax+ebx+colorA]        ;mm2 and mm3 contain colorA
  1.1664 +         movq mm1, mm0
  1.1665 +         movq mm3, mm2
  1.1666 +
  1.1667 +         pcmpeqw mm0, [eax+ebx+colorB]
  1.1668 +         pcmpeqw mm1, [eax+ebx+ebx+colorH]
  1.1669 +         pcmpeqw mm2, [eax+colorF]
  1.1670 +         pcmpeqw mm3, [eax+ebx+ebx+colorD]
  1.1671 +
  1.1672 +         pand mm0, mm1
  1.1673 +         pxor mm1, mm1
  1.1674 +         pand mm0, mm2
  1.1675 +         pcmpeqw mm3, mm1
  1.1676 +         pand mm0, mm3                 ;result in mm0
  1.1677 +
  1.1678 +         ;if ((colorC == colorG) && (colorC == colorD) && (colorA != colorH) && (colorA == colorI)
  1.1679 +         movq mm4, [eax+ebx+ebx+colorC]        ;mm4 and mm5 contain colorC
  1.1680 +         movq mm6, [eax+ebx+colorA]        ;mm6 and mm7 contain colorA
  1.1681 +         movq mm5, mm4
  1.1682 +         movq mm7, mm6
  1.1683 +
  1.1684 +         pcmpeqw mm4, [eax+ebx+ebx+colorD]
  1.1685 +         pcmpeqw mm5, [eax+ebx+colorG]
  1.1686 +         pcmpeqw mm6, [eax+colorI]
  1.1687 +         pcmpeqw mm7, [eax+ebx+ebx+colorH]
  1.1688 +
  1.1689 +         pand mm4, mm5
  1.1690 +         pxor mm5, mm5
  1.1691 +         pand mm4, mm6
  1.1692 +         pcmpeqw mm7, mm5
  1.1693 +         pand mm4, mm7                 ;result in mm4
  1.1694 +
  1.1695 +         por mm0, mm4                  ;combine the masks
  1.1696 +         movq [Mask2], mm0
  1.1697 +         ;----------------------------------------------
  1.1698 +
  1.1699 +;interpolate colorA and colorC
  1.1700 +         movq mm0, [eax+ebx+colorA]
  1.1701 +         movq mm1, [eax+ebx+ebx+colorC]
  1.1702 +
  1.1703 +         movq mm2, mm0
  1.1704 +         movq mm3, mm1
  1.1705 +
  1.1706 +         pand mm0, [colorMask]
  1.1707 +         pand mm1, [colorMask]
  1.1708 +
  1.1709 +         psrlw mm0, 1
  1.1710 +         psrlw mm1, 1
  1.1711 +
  1.1712 +         pand mm3, [lowPixelMask]
  1.1713 +         paddw mm0, mm1
  1.1714 +
  1.1715 +         pand mm3, mm2
  1.1716 +         paddw mm0, mm3                ;mm0 contains the interpolated values
  1.1717 +         ;-------------
  1.1718 +
  1.1719 +         ;assemble the pixels
  1.1720 +         movq mm1, [eax+ebx+colorA]
  1.1721 +         movq mm2, [eax+ebx+ebx+colorC]
  1.1722 +
  1.1723 +         movq mm3, [Mask1]
  1.1724 +         movq mm4, [Mask2]
  1.1725 +
  1.1726 +         pand mm1, mm3
  1.1727 +         pand mm2, mm4
  1.1728 +
  1.1729 +         por mm3, mm4
  1.1730 +         pxor mm7, mm7
  1.1731 +         por mm1, mm2
  1.1732 +
  1.1733 +         pcmpeqw mm3, mm7
  1.1734 +         pand mm0, mm3
  1.1735 +         por mm0, mm1
  1.1736 +         movq [ACPixel], mm0
  1.1737 +
  1.1738 +;////////////////////////////////
  1.1739 +; Decide which "branch" to take
  1.1740 +;--------------------------------
  1.1741 +         movq mm0, [eax+ebx+colorA]
  1.1742 +         movq mm1, [eax+ebx+colorB]
  1.1743 +         movq mm6, mm0
  1.1744 +         movq mm7, mm1
  1.1745 +         pcmpeqw mm0, [eax+ebx+ebx+colorD]
  1.1746 +         pcmpeqw mm1, [eax+ebx+ebx+colorC]
  1.1747 +         pcmpeqw mm6, mm7
  1.1748 +
  1.1749 +         movq mm2, mm0
  1.1750 +         movq mm3, mm0
  1.1751 +
  1.1752 +         pand mm0, mm1       ;colorA == colorD && colorB == colorC
  1.1753 +         pxor mm7, mm7
  1.1754 +
  1.1755 +         pcmpeqw mm2, mm7
  1.1756 +         pand mm6, mm0
  1.1757 +         pand mm2, mm1       ;colorA != colorD && colorB == colorC
  1.1758 +
  1.1759 +         pcmpeqw mm1, mm7
  1.1760 +
  1.1761 +         pand mm1, mm3       ;colorA == colorD && colorB != colorC
  1.1762 +         pxor mm0, mm6
  1.1763 +         por mm1, mm6
  1.1764 +         movq mm7, mm0
  1.1765 +         movq [Mask2], mm2
  1.1766 +         packsswb mm7, mm7
  1.1767 +         movq [Mask1], mm1
  1.1768 +
  1.1769 +         movd ecx, mm7
  1.1770 +         test ecx, ecx
  1.1771 +         jz near .SKIP_GUESS
  1.1772 +
  1.1773 +;---------------------------------------------
  1.1774 +; Map of the pixels:                    I|E F|J
  1.1775 +;                                       G|A B|K
  1.1776 +;                                       H|C D|L
  1.1777 +;                                       M|N O|P
  1.1778 +         movq mm6, mm0
  1.1779 +         movq mm4, [eax+ebx+colorA]
  1.1780 +         movq mm5, [eax+ebx+colorB]
  1.1781 +         pxor mm7, mm7
  1.1782 +         pand mm6, [ONE]
  1.1783 +
  1.1784 +         movq mm0, [eax+colorE]
  1.1785 +         movq mm1, [eax+ebx+colorG]
  1.1786 +         movq mm2, mm0
  1.1787 +         movq mm3, mm1
  1.1788 +         pcmpeqw mm0, mm4
  1.1789 +         pcmpeqw mm1, mm4
  1.1790 +         pcmpeqw mm2, mm5
  1.1791 +         pcmpeqw mm3, mm5
  1.1792 +         pand mm0, mm6
  1.1793 +         pand mm1, mm6
  1.1794 +         pand mm2, mm6
  1.1795 +         pand mm3, mm6
  1.1796 +         paddw mm0, mm1
  1.1797 +         paddw mm2, mm3
  1.1798 +
  1.1799 +         pxor mm3, mm3
  1.1800 +         pcmpgtw mm0, mm6
  1.1801 +         pcmpgtw mm2, mm6
  1.1802 +         pcmpeqw mm0, mm3
  1.1803 +         pcmpeqw mm2, mm3
  1.1804 +         pand mm0, mm6
  1.1805 +         pand mm2, mm6
  1.1806 +         paddw mm7, mm0
  1.1807 +         psubw mm7, mm2
  1.1808 +
  1.1809 +         movq mm0, [eax+colorF]
  1.1810 +         movq mm1, [eax+ebx+colorK]
  1.1811 +         movq mm2, mm0
  1.1812 +         movq mm3, mm1
  1.1813 +         pcmpeqw mm0, mm4
  1.1814 +         pcmpeqw mm1, mm4
  1.1815 +         pcmpeqw mm2, mm5
  1.1816 +         pcmpeqw mm3, mm5
  1.1817 +         pand mm0, mm6
  1.1818 +         pand mm1, mm6
  1.1819 +         pand mm2, mm6
  1.1820 +         pand mm3, mm6
  1.1821 +         paddw mm0, mm1
  1.1822 +         paddw mm2, mm3
  1.1823 +
  1.1824 +         pxor mm3, mm3
  1.1825 +         pcmpgtw mm0, mm6
  1.1826 +         pcmpgtw mm2, mm6
  1.1827 +         pcmpeqw mm0, mm3
  1.1828 +         pcmpeqw mm2, mm3
  1.1829 +         pand mm0, mm6
  1.1830 +         pand mm2, mm6
  1.1831 +         paddw mm7, mm0
  1.1832 +         psubw mm7, mm2
  1.1833 +
  1.1834 +         push eax
  1.1835 +         add eax, ebx
  1.1836 +         movq mm0, [eax+ebx+colorH]
  1.1837 +         movq mm1, [eax+ebx+ebx+colorN]
  1.1838 +         movq mm2, mm0
  1.1839 +         movq mm3, mm1
  1.1840 +         pcmpeqw mm0, mm4
  1.1841 +         pcmpeqw mm1, mm4
  1.1842 +         pcmpeqw mm2, mm5
  1.1843 +         pcmpeqw mm3, mm5
  1.1844 +         pand mm0, mm6
  1.1845 +         pand mm1, mm6
  1.1846 +         pand mm2, mm6
  1.1847 +         pand mm3, mm6
  1.1848 +         paddw mm0, mm1
  1.1849 +         paddw mm2, mm3
  1.1850 +
  1.1851 +         pxor mm3, mm3
  1.1852 +         pcmpgtw mm0, mm6
  1.1853 +         pcmpgtw mm2, mm6
  1.1854 +         pcmpeqw mm0, mm3
  1.1855 +         pcmpeqw mm2, mm3
  1.1856 +         pand mm0, mm6
  1.1857 +         pand mm2, mm6
  1.1858 +         paddw mm7, mm0
  1.1859 +         psubw mm7, mm2
  1.1860 +
  1.1861 +         movq mm0, [eax+ebx+colorL]
  1.1862 +         movq mm1, [eax+ebx+ebx+colorO]
  1.1863 +         movq mm2, mm0
  1.1864 +         movq mm3, mm1
  1.1865 +         pcmpeqw mm0, mm4
  1.1866 +         pcmpeqw mm1, mm4
  1.1867 +         pcmpeqw mm2, mm5
  1.1868 +         pcmpeqw mm3, mm5
  1.1869 +         pand mm0, mm6
  1.1870 +         pand mm1, mm6
  1.1871 +         pand mm2, mm6
  1.1872 +         pand mm3, mm6
  1.1873 +         paddw mm0, mm1
  1.1874 +         paddw mm2, mm3
  1.1875 +
  1.1876 +         pxor mm3, mm3
  1.1877 +         pcmpgtw mm0, mm6
  1.1878 +         pcmpgtw mm2, mm6
  1.1879 +         pcmpeqw mm0, mm3
  1.1880 +         pcmpeqw mm2, mm3
  1.1881 +         pand mm0, mm6
  1.1882 +         pand mm2, mm6
  1.1883 +         paddw mm7, mm0
  1.1884 +         psubw mm7, mm2
  1.1885 +
  1.1886 +         pop eax
  1.1887 +         movq mm1, mm7
  1.1888 +         pxor mm0, mm0
  1.1889 +         pcmpgtw mm7, mm0
  1.1890 +         pcmpgtw mm0, mm1
  1.1891 +
  1.1892 +         por mm7, [Mask1]
  1.1893 +         por mm0, [Mask2]
  1.1894 +         movq [Mask1], mm7
  1.1895 +         movq [Mask2], mm0
  1.1896 +
  1.1897 +.SKIP_GUESS:
  1.1898 +         ;----------------------------
  1.1899 +         ;interpolate A, B, C and D
  1.1900 +         movq mm0, [eax+ebx+colorA]
  1.1901 +         movq mm1, [eax+ebx+colorB]
  1.1902 +         movq mm4, mm0
  1.1903 +         movq mm2, [eax+ebx+ebx+colorC]
  1.1904 +         movq mm5, mm1
  1.1905 +         movq mm3, [qcolorMask]
  1.1906 +         movq mm6, mm2
  1.1907 +         movq mm7, [qlowpixelMask]
  1.1908 +
  1.1909 +         pand mm0, mm3
  1.1910 +         pand mm1, mm3
  1.1911 +         pand mm2, mm3
  1.1912 +         pand mm3, [eax+ebx+ebx+colorD]
  1.1913 +
  1.1914 +         psrlw mm0, 2
  1.1915 +         pand mm4, mm7
  1.1916 +         psrlw mm1, 2
  1.1917 +         pand mm5, mm7
  1.1918 +         psrlw mm2, 2
  1.1919 +         pand mm6, mm7
  1.1920 +         psrlw mm3, 2
  1.1921 +         pand mm7, [eax+ebx+ebx+colorD]
  1.1922 +
  1.1923 +         paddw mm0, mm1
  1.1924 +         paddw mm2, mm3
  1.1925 +
  1.1926 +         paddw mm4, mm5
  1.1927 +         paddw mm6, mm7
  1.1928 +
  1.1929 +         paddw mm4, mm6
  1.1930 +         paddw mm0, mm2
  1.1931 +         psrlw mm4, 2
  1.1932 +         pand mm4, [qlowpixelMask]
  1.1933 +         paddw mm0, mm4      ;mm0 contains the interpolated value of A, B, C and D
  1.1934 +
  1.1935 +;\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
  1.1936 +         ;assemble the pixels
  1.1937 +         movq mm1, [Mask1]
  1.1938 +         movq mm2, [Mask2]
  1.1939 +         movq mm4, [eax+ebx+colorA]
  1.1940 +         movq mm5, [eax+ebx+colorB]
  1.1941 +         pand mm4, mm1
  1.1942 +         pand mm5, mm2
  1.1943 +
  1.1944 +         pxor mm7, mm7
  1.1945 +         por mm1, mm2
  1.1946 +         por mm4, mm5
  1.1947 +         pcmpeqw mm1, mm7
  1.1948 +         pand mm0, mm1
  1.1949 +         por mm4, mm0        ;mm4 contains the diagonal pixels
  1.1950 +
  1.1951 +         movq mm0, [ACPixel]
  1.1952 +         movq mm1, mm0
  1.1953 +         punpcklwd mm0, mm4
  1.1954 +         punpckhwd mm1, mm4
  1.1955 +
  1.1956 +         push edx
  1.1957 +         add edx, [ebp+dstPitch]
  1.1958 +
  1.1959 +%ifdef FAR_POINTER
  1.1960 +         movq [fs:edx], mm0
  1.1961 +         movq [fs:edx+8], mm1
  1.1962 +%else
  1.1963 +         movq [edx], mm0
  1.1964 +         movq [edx+8], mm1
  1.1965 +%endif
  1.1966 +         pop edx
  1.1967 +
  1.1968 +.SKIP_PROCESS:
  1.1969 +         mov ecx, [ebp+deltaPtr]
  1.1970 +         add ecx, 8
  1.1971 +         mov [ebp+deltaPtr], ecx
  1.1972 +         add edx, 16
  1.1973 +         add eax, 8
  1.1974 +
  1.1975 +         pop ecx
  1.1976 +         sub ecx, 4
  1.1977 +         cmp ecx, 0
  1.1978 +         jg  near .Loop
  1.1979 +
  1.1980 +; Restore some stuff
  1.1981 +         popad
  1.1982 +         mov esp, ebp
  1.1983 +         pop ebp
  1.1984 +         emms
  1.1985 +         ret
  1.1986 +
  1.1987 +;-------------------------------------------------------------------------
  1.1988 +;-------------------------------------------------------------------------
  1.1989 +;-------------------------------------------------------------------------
  1.1990 +;-------------------------------------------------------------------------
  1.1991 +;-------------------------------------------------------------------------
  1.1992 +;-------------------------------------------------------------------------
  1.1993 +;-------------------------------------------------------------------------
  1.1994 +
  1.1995 +%ifdef __DJGPP__
  1.1996 +_Init_2xSaIMMX:
  1.1997 +%else
  1.1998 +Init_2xSaIMMX:
  1.1999 +%endif
  1.2000 +;-------------------------------------------------------------------------
  1.2001 +; Pick the colour masks that match the 16-bit pixel layout in use.
  1.2002 +;   In : first stack argument (read at [ebp+8]) = PixelFormat, 555 or 565
  1.2003 +;   Out: eax = 0 after the masks were written, 1 for any other value
  1.2004 +;   Each of colorMask, lowPixelMask, qcolorMask and qlowpixelMask gets
  1.2005 +;   the same 32-bit pattern in both of its dwords. edx is preserved.
  1.2006 +;-------------------------------------------------------------------------
  1.2007 +         push ebp
  1.2008 +         mov ebp, esp
  1.2009 +         push edx                 ;edx carries the patterns below
  1.2010 +
  1.2011 +;CPUID-based MMX detection, left disabled (original note: "doesn't work")
  1.2012 +;        mov eax,1
  1.2013 +;        cpuid
  1.2014 +;        test edx, 0x00800000     ;test bit 23
  1.2015 +;        jz end2 ;bit not set => no MMX detected
  1.2016 +
  1.2017 +         mov eax, [ebp+8]         ;PixelFormat
  1.2018 +         cmp eax, 555
  1.2019 +         je Bits555
  1.2020 +         cmp eax, 565
  1.2021 +         je Bits565
  1.2022 +
  1.2023 +end2:                             ;neither 555 nor 565: report failure
  1.2024 +         mov eax, 1
  1.2025 +         jmp end3
  1.2026 +
  1.2027 +Bits555:
  1.2028 +;5-5-5 layout: bit 15 is clear in every pattern
  1.2029 +         mov edx, 0x7BDE7BDE      ;each field minus its lowest bit
  1.2030 +         mov [colorMask], edx
  1.2031 +         mov [colorMask+4], edx
  1.2032 +         mov edx, 0x04210421      ;lowest bit of each field
  1.2033 +         mov [lowPixelMask], edx
  1.2034 +         mov [lowPixelMask+4], edx
  1.2035 +         mov edx, 0x739C739C      ;each field minus its two lowest bits
  1.2036 +         mov [qcolorMask], edx
  1.2037 +         mov [qcolorMask+4], edx
  1.2038 +         mov edx, 0x0C630C63      ;two lowest bits of each field
  1.2039 +         mov [qlowpixelMask], edx
  1.2040 +         mov [qlowpixelMask+4], edx
  1.2041 +         mov eax, 0
  1.2042 +         jmp end3
  1.2043 +
  1.2044 +Bits565:
  1.2045 +;5-6-5 layout: same four roles, middle field is six bits wide
  1.2046 +         mov edx, 0xF7DEF7DE      ;each field minus its lowest bit
  1.2047 +         mov [colorMask], edx
  1.2048 +         mov [colorMask+4], edx
  1.2049 +         mov edx, 0x08210821      ;lowest bit of each field
  1.2050 +         mov [lowPixelMask], edx
  1.2051 +         mov [lowPixelMask+4], edx
  1.2052 +         mov edx, 0xE79CE79C      ;each field minus its two lowest bits
  1.2053 +         mov [qcolorMask], edx
  1.2054 +         mov [qcolorMask+4], edx
  1.2055 +         mov edx, 0x18631863      ;two lowest bits of each field
  1.2056 +         mov [qlowpixelMask], edx
  1.2057 +         mov [qlowpixelMask+4], edx
  1.2058 +         mov eax, 0               ;success; execution continues into end3
  1.2059 +end3:
  1.2060 +         pop edx
  1.2061 +         leave                    ;same as mov esp, ebp / pop ebp
  1.2062 +         ret
  1.2063 +
  1.2064 +
  1.2065 +;-------------------------------------------------------------------------
  1.2066 +;-------------------------------------------------------------------------
  1.2067 +;-------------------------------------------------------------------------
  1.2068 +;-------------------------------------------------------------------------
  1.2069 +;-------------------------------------------------------------------------
  1.2070 +;-------------------------------------------------------------------------
  1.2071 +;-------------------------------------------------------------------------
  1.2072 +
  1.2073 +        SECTION .data ALIGN = 32
  1.2074 +;8-byte constants: one 32-bit pattern stored twice (four 16-bit words)
  1.2075 +colorMask:     times 2 dd 0xF7DEF7DE   ;565 default; rewritten by Init_2xSaIMMX
  1.2076 +lowPixelMask:  times 2 dd 0x08210821   ;565 default; rewritten by Init_2xSaIMMX
  1.2077 +
  1.2078 +qcolorMask:    times 2 dd 0xE79CE79C   ;565 default; rewritten by Init_2xSaIMMX
  1.2079 +qlowpixelMask: times 2 dd 0x18631863   ;565 default; rewritten by Init_2xSaIMMX
  1.2080 +;the three masks below are not touched by Init_2xSaIMMX
  1.2081 +darkenMask:    times 2 dd 0xC718C718   ;per word: bits 15-14, 10-8 and 4-3
  1.2082 +GreenMask:     times 2 dd 0x07E007E0   ;per word: bits 10-5
  1.2083 +RedBlueMask:   times 2 dd 0xF81FF81F   ;per word: bits 15-11 and 4-0
  1.2084 +
  1.2085 +FALSE:         times 2 dd 0x00000000   ;all bits clear
  1.2086 +TRUE:          times 2 dd 0xffffffff   ;all bits set
  1.2087 +ONE:           times 2 dd 0x00010001   ;the value 1 in every 16-bit word
  1.2088 +
  1.2089 +
  1.2090 +        SECTION .bss ALIGN = 32
  1.2091 +ACPixel       resq 1      ;assembled pixels held until they are interleaved for output
  1.2092 +Mask1         resq 1      ;per-word selection mask kept between filter steps
  1.2093 +Mask2         resq 1      ;second per-word selection mask
  1.2094 +;8-byte scratch slots; presumably used by the other line routines - verify
  1.2095 +I56Pixel      resq 1
  1.2096 +I23Pixel      resq 1
  1.2097 +I5556Pixel    resq 1
  1.2098 +I2223Pixel    resq 1
  1.2099 +I5666Pixel    resq 1
  1.2100 +I2333Pixel    resq 1
  1.2101 +Mask26        resq 1
  1.2102 +Mask35        resq 1
  1.2103 +Mask26b       resq 1
  1.2104 +Mask35b       resq 1
  1.2105 +product1a     resq 1
  1.2106 +product1b     resq 1
  1.2107 +product2a     resq 1
  1.2108 +product2b     resq 1
  1.2109 +final1a       resq 1
  1.2110 +final1b       resq 1
  1.2111 +final2a       resq 1
  1.2112 +final2b       resq 1
  1.2112 +final2b       resb 8