Mercurial > vba-clojure
diff src/filters/admame.cpp @ 27:b970226568d2
brought in filters package
author | Robert McIntyre <rlm@mit.edu> |
---|---|
date | Sun, 04 Mar 2012 20:32:31 -0600 |
parents | f9f4f1b99eed |
children |
line wrap: on
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/src/filters/admame.cpp Sun Mar 04 20:32:31 2012 -0600 1.3 @@ -0,0 +1,1036 @@ 1.4 +/* 1.5 + * This file is part of the Advance project. 1.6 + * 1.7 + * Copyright (C) 1999-2002 Andrea Mazzoleni 1.8 + * 1.9 + * This program is free software; you can redistribute it and/or modify 1.10 + * it under the terms of the GNU General Public License as published by 1.11 + * the Free Software Foundation; either version 2 of the License, or 1.12 + * (at your option) any later version. 1.13 + * 1.14 + * This program is distributed in the hope that it will be useful, 1.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 1.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 1.17 + * GNU General Public License for more details. 1.18 + * 1.19 + * You should have received a copy of the GNU General Public License 1.20 + * along with this program; if not, write to the Free Software 1.21 + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 1.22 + * 1.23 + * In addition, as a special exception, Andrea Mazzoleni 1.24 + * gives permission to link the code of this program with 1.25 + * the MAME library (or with modified versions of MAME that use the 1.26 + * same license as MAME), and distribute linked combinations including 1.27 + * the two. You must obey the GNU General Public License in all 1.28 + * respects for all of the code used other than MAME. If you modify 1.29 + * this file, you may extend this exception to your version of the 1.30 + * file, but you are not obligated to do so. If you do not wish to 1.31 + * do so, delete this exception statement from your version. 1.32 + */ 1.33 + 1.34 +/* 1.35 + * Alternatively at the previous license terms, you are allowed to use this 1.36 + * code in your program with these conditions: 1.37 + * - the program is not used in commercial activities. 1.38 + * - the whole source code of the program is released with the binary. 1.39 + */ 1.40 + 1.41 +#include "../Port.h" 1.42 + 1.43 +#ifdef MMX 1.44 +extern "C" bool cpu_mmx; 1.45 +#endif 1.46 + 1.47 +static void internal_scale2x_16_def(u16 *dst, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count) 1.48 +{ 1.49 + /* first pixel */ 1.50 + dst[0] = src1[0]; 1.51 + if (src1[1] == src0[0] && src2[0] != src0[0]) 1.52 + dst[1] = src0[0]; 1.53 + else 1.54 + dst[1] = src1[0]; 1.55 + ++src0; 1.56 + ++src1; 1.57 + ++src2; 1.58 + dst += 2; 1.59 + 1.60 + /* central pixels */ 1.61 + count -= 2; 1.62 + while (count) 1.63 + { 1.64 + if (src0[0] != src2[0] && src1[-1] != src1[1]) 1.65 + { 1.66 + dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0]; 1.67 + dst[1] = src1[1] == src0[0] ? src0[0] : src1[0]; 1.68 + } 1.69 + else 1.70 + { 1.71 + dst[0] = src1[0]; 1.72 + dst[1] = src1[0]; 1.73 + } 1.74 + 1.75 + ++src0; 1.76 + ++src1; 1.77 + ++src2; 1.78 + dst += 2; 1.79 + --count; 1.80 + } 1.81 + 1.82 + /* last pixel */ 1.83 + if (src1[-1] == src0[0] && src2[0] != src0[0]) 1.84 + dst[0] = src0[0]; 1.85 + else 1.86 + dst[0] = src1[0]; 1.87 + dst[1] = src1[0]; 1.88 +} 1.89 + 1.90 +static void internal_scale2x_32_def(u32 *dst, 1.91 + const u32 *src0, 1.92 + const u32 *src1, 1.93 + const u32 *src2, 1.94 + unsigned count) 1.95 +{ 1.96 + /* first pixel */ 1.97 + dst[0] = src1[0]; 1.98 + if (src1[1] == src0[0] && src2[0] != src0[0]) 1.99 + dst[1] = src0[0]; 1.100 + else 1.101 + dst[1] = src1[0]; 1.102 + ++src0; 1.103 + ++src1; 1.104 + ++src2; 1.105 + dst += 2; 1.106 + 1.107 + /* central pixels */ 1.108 + count -= 2; 1.109 + while (count) 1.110 + { 1.111 + if (src0[0] != src2[0] && src1[-1] != src1[1]) 1.112 + { 1.113 + dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0]; 1.114 + dst[1] = src1[1] == src0[0] ? src0[0] : src1[0]; 1.115 + } 1.116 + else 1.117 + { 1.118 + dst[0] = src1[0]; 1.119 + dst[1] = src1[0]; 1.120 + } 1.121 + 1.122 + ++src0; 1.123 + ++src1; 1.124 + ++src2; 1.125 + dst += 2; 1.126 + --count; 1.127 + } 1.128 + 1.129 + /* last pixel */ 1.130 + if (src1[-1] == src0[0] && src2[0] != src0[0]) 1.131 + dst[0] = src0[0]; 1.132 + else 1.133 + dst[0] = src1[0]; 1.134 + dst[1] = src1[0]; 1.135 +} 1.136 + 1.137 +#ifdef MMX 1.138 +static void internal_scale2x_16_mmx_single(u16 *dst, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count) 1.139 +{ 1.140 + /* always do the first and last run */ 1.141 + count -= 2 * 4; 1.142 + 1.143 +#ifdef __GNUC__ 1.144 + __asm__ __volatile__ ( 1.145 + /* first run */ 1.146 + /* set the current, current_pre, current_next registers */ 1.147 + "movq 0(%1), %%mm0\n" 1.148 + "movq 0(%1),%%mm7\n" 1.149 + "movq 8(%1),%%mm1\n" 1.150 + "psllq $48,%%mm0\n" 1.151 + "psllq $48,%%mm1\n" 1.152 + "psrlq $48, %%mm0\n" 1.153 + "movq %%mm7,%%mm2\n" 1.154 + "movq %%mm7,%%mm3\n" 1.155 + "psllq $16,%%mm2\n" 1.156 + "psrlq $16,%%mm3\n" 1.157 + "por %%mm2,%%mm0\n" 1.158 + "por %%mm3,%%mm1\n" 1.159 + 1.160 + /* current_upper */ 1.161 + "movq (%0),%%mm6\n" 1.162 + 1.163 + /* compute the upper-left pixel for dst on %%mm2 */ 1.164 + /* compute the upper-right pixel for dst on %%mm4 */ 1.165 + "movq %%mm0,%%mm2\n" 1.166 + "movq %%mm1,%%mm4\n" 1.167 + "movq %%mm0,%%mm3\n" 1.168 + "movq %%mm1,%%mm5\n" 1.169 + "pcmpeqw %%mm6,%%mm2\n" 1.170 + "pcmpeqw %%mm6,%%mm4\n" 1.171 + "pcmpeqw (%2),%%mm3\n" 1.172 + "pcmpeqw (%2),%%mm5\n" 1.173 + "pandn %%mm2,%%mm3\n" 1.174 + "pandn %%mm4,%%mm5\n" 1.175 + "movq %%mm0,%%mm2\n" 1.176 + "movq %%mm1,%%mm4\n" 1.177 + "pcmpeqw %%mm1,%%mm2\n" 1.178 + "pcmpeqw %%mm0,%%mm4\n" 1.179 + "pandn %%mm3,%%mm2\n" 1.180 + "pandn %%mm5,%%mm4\n" 1.181 + "movq %%mm2,%%mm3\n" 1.182 + "movq %%mm4,%%mm5\n" 1.183 + "pand %%mm6,%%mm2\n" 1.184 + "pand %%mm6,%%mm4\n" 1.185 + "pandn %%mm7,%%mm3\n" 1.186 + "pandn %%mm7,%%mm5\n" 1.187 + "por %%mm3,%%mm2\n" 1.188 + "por %%mm5,%%mm4\n" 1.189 + 1.190 + /* set *dst */ 1.191 + "movq %%mm2,%%mm3\n" 1.192 + "punpcklwd %%mm4,%%mm2\n" 1.193 + "punpckhwd %%mm4,%%mm3\n" 1.194 + "movq %%mm2,(%3)\n" 1.195 + "movq %%mm3,8(%3)\n" 1.196 + 1.197 + /* next */ 1.198 + "addl $8,%0\n" 1.199 + "addl $8,%1\n" 1.200 + "addl $8,%2\n" 1.201 + "addl $16,%3\n" 1.202 + 1.203 + /* central runs */ 1.204 + "shrl $2,%4\n" 1.205 + "jz 1f\n" 1.206 + 1.207 + "0:\n" 1.208 + 1.209 + /* set the current, current_pre, current_next registers */ 1.210 + "movq -8(%1),%%mm0\n" 1.211 + "movq (%1),%%mm7\n" 1.212 + "movq 8(%1),%%mm1\n" 1.213 + "psrlq $48,%%mm0\n" 1.214 + "psllq $48,%%mm1\n" 1.215 + "movq %%mm7,%%mm2\n" 1.216 + "movq %%mm7,%%mm3\n" 1.217 + "psllq $16,%%mm2\n" 1.218 + "psrlq $16,%%mm3\n" 1.219 + "por %%mm2,%%mm0\n" 1.220 + "por %%mm3,%%mm1\n" 1.221 + 1.222 + /* current_upper */ 1.223 + "movq (%0),%%mm6\n" 1.224 + 1.225 + /* compute the upper-left pixel for dst on %%mm2 */ 1.226 + /* compute the upper-right pixel for dst on %%mm4 */ 1.227 + "movq %%mm0,%%mm2\n" 1.228 + "movq %%mm1,%%mm4\n" 1.229 + "movq %%mm0,%%mm3\n" 1.230 + "movq %%mm1,%%mm5\n" 1.231 + "pcmpeqw %%mm6,%%mm2\n" 1.232 + "pcmpeqw %%mm6,%%mm4\n" 1.233 + "pcmpeqw (%2),%%mm3\n" 1.234 + "pcmpeqw (%2),%%mm5\n" 1.235 + "pandn %%mm2,%%mm3\n" 1.236 + "pandn %%mm4,%%mm5\n" 1.237 + "movq %%mm0,%%mm2\n" 1.238 + "movq %%mm1,%%mm4\n" 1.239 + "pcmpeqw %%mm1,%%mm2\n" 1.240 + "pcmpeqw %%mm0,%%mm4\n" 1.241 + "pandn %%mm3,%%mm2\n" 1.242 + "pandn %%mm5,%%mm4\n" 1.243 + "movq %%mm2,%%mm3\n" 1.244 + "movq %%mm4,%%mm5\n" 1.245 + "pand %%mm6,%%mm2\n" 1.246 + "pand %%mm6,%%mm4\n" 1.247 + "pandn %%mm7,%%mm3\n" 1.248 + "pandn %%mm7,%%mm5\n" 1.249 + "por %%mm3,%%mm2\n" 1.250 + "por %%mm5,%%mm4\n" 1.251 + 1.252 + /* set *dst */ 1.253 + "movq %%mm2,%%mm3\n" 1.254 + "punpcklwd %%mm4,%%mm2\n" 1.255 + "punpckhwd %%mm4,%%mm3\n" 1.256 + "movq %%mm2,(%3)\n" 1.257 + "movq %%mm3,8(%3)\n" 1.258 + 1.259 + /* next */ 1.260 + "addl $8,%0\n" 1.261 + "addl $8,%1\n" 1.262 + "addl $8,%2\n" 1.263 + "addl $16,%3\n" 1.264 + 1.265 + "decl %4\n" 1.266 + "jnz 0b\n" 1.267 + "1:\n" 1.268 + 1.269 + /* final run */ 1.270 + /* set the current, current_pre, current_next registers */ 1.271 + "movq (%1),%%mm1\n" 1.272 + "movq (%1),%%mm7\n" 1.273 + "movq -8(%1),%%mm0\n" 1.274 + "psrlq $48,%%mm1\n" 1.275 + "psrlq $48,%%mm0\n" 1.276 + "psllq $48,%%mm1\n" 1.277 + "movq %%mm7,%%mm2\n" 1.278 + "movq %%mm7,%%mm3\n" 1.279 + "psllq $16,%%mm2\n" 1.280 + "psrlq $16,%%mm3\n" 1.281 + "por %%mm2,%%mm0\n" 1.282 + "por %%mm3,%%mm1\n" 1.283 + 1.284 + /* current_upper */ 1.285 + "movq (%0),%%mm6\n" 1.286 + 1.287 + /* compute the upper-left pixel for dst on %%mm2 */ 1.288 + /* compute the upper-right pixel for dst on %%mm4 */ 1.289 + "movq %%mm0,%%mm2\n" 1.290 + "movq %%mm1,%%mm4\n" 1.291 + "movq %%mm0,%%mm3\n" 1.292 + "movq %%mm1,%%mm5\n" 1.293 + "pcmpeqw %%mm6,%%mm2\n" 1.294 + "pcmpeqw %%mm6,%%mm4\n" 1.295 + "pcmpeqw (%2),%%mm3\n" 1.296 + "pcmpeqw (%2),%%mm5\n" 1.297 + "pandn %%mm2,%%mm3\n" 1.298 + "pandn %%mm4,%%mm5\n" 1.299 + "movq %%mm0,%%mm2\n" 1.300 + "movq %%mm1,%%mm4\n" 1.301 + "pcmpeqw %%mm1,%%mm2\n" 1.302 + "pcmpeqw %%mm0,%%mm4\n" 1.303 + "pandn %%mm3,%%mm2\n" 1.304 + "pandn %%mm5,%%mm4\n" 1.305 + "movq %%mm2,%%mm3\n" 1.306 + "movq %%mm4,%%mm5\n" 1.307 + "pand %%mm6,%%mm2\n" 1.308 + "pand %%mm6,%%mm4\n" 1.309 + "pandn %%mm7,%%mm3\n" 1.310 + "pandn %%mm7,%%mm5\n" 1.311 + "por %%mm3,%%mm2\n" 1.312 + "por %%mm5,%%mm4\n" 1.313 + 1.314 + /* set *dst */ 1.315 + "movq %%mm2,%%mm3\n" 1.316 + "punpcklwd %%mm4,%%mm2\n" 1.317 + "punpckhwd %%mm4,%%mm3\n" 1.318 + "movq %%mm2,(%3)\n" 1.319 + "movq %%mm3,8(%3)\n" 1.320 + "emms\n" 1.321 + 1.322 + : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count) 1.323 + : 1.324 + : "cc" 1.325 + ); 1.326 +#else 1.327 + __asm { 1.328 + mov eax, src0; 1.329 + mov ebx, src1; 1.330 + mov ecx, src2; 1.331 + mov edx, dst; 1.332 + mov esi, count; 1.333 + 1.334 + /* first run */ 1.335 + /* set the current, current_pre, current_next registers */ 1.336 + movq mm0, qword ptr [ebx]; 1.337 + movq mm7, qword ptr [ebx]; 1.338 + movq mm1, qword ptr [ebx + 8]; 1.339 + psllq mm0, 48; 1.340 + psllq mm1, 48; 1.341 + psrlq mm0, 48; 1.342 + movq mm2, mm7; 1.343 + movq mm3, mm7; 1.344 + psllq mm2, 16; 1.345 + psrlq mm3, 16; 1.346 + por mm0, mm2; 1.347 + por mm1, mm3; 1.348 + 1.349 + /* current_upper */ 1.350 + movq mm6, qword ptr [eax]; 1.351 + 1.352 + /* compute the upper-left pixel for dst on %%mm2 */ 1.353 + /* compute the upper-right pixel for dst on %%mm4 */ 1.354 + movq mm2, mm0; 1.355 + movq mm4, mm1; 1.356 + movq mm3, mm0; 1.357 + movq mm5, mm1; 1.358 + pcmpeqw mm2, mm6; 1.359 + pcmpeqw mm4, mm6; 1.360 + pcmpeqw mm3, qword ptr [ecx]; 1.361 + pcmpeqw mm5, qword ptr [ecx]; 1.362 + pandn mm3, mm2; 1.363 + pandn mm5, mm4; 1.364 + movq mm2, mm0; 1.365 + movq mm4, mm1; 1.366 + pcmpeqw mm2, mm1; 1.367 + pcmpeqw mm4, mm0; 1.368 + pandn mm2, mm3; 1.369 + pandn mm4, mm5; 1.370 + movq mm3, mm2; 1.371 + movq mm5, mm4; 1.372 + pand mm2, mm6; 1.373 + pand mm4, mm6; 1.374 + pandn mm3, mm7; 1.375 + pandn mm5, mm7; 1.376 + por mm2, mm3; 1.377 + por mm4, mm5; 1.378 + 1.379 + /* set *dst0 */ 1.380 + movq mm3, mm2; 1.381 + punpcklwd mm2, mm4; 1.382 + punpckhwd mm3, mm4; 1.383 + movq qword ptr [edx], mm2; 1.384 + movq qword ptr [edx + 8], mm3; 1.385 + 1.386 + /* next */ 1.387 + add eax, 8; 1.388 + add ebx, 8; 1.389 + add ecx, 8; 1.390 + add edx, 16; 1.391 + 1.392 + /* central runs */ 1.393 + shr esi, 2; 1.394 + jz label1; 1.395 + align 4; 1.396 +label0: 1.397 + 1.398 + /* set the current, current_pre, current_next registers */ 1.399 + movq mm0, qword ptr [ebx - 8]; 1.400 + movq mm7, qword ptr [ebx]; 1.401 + movq mm1, qword ptr [ebx + 8]; 1.402 + psrlq mm0, 48; 1.403 + psllq mm1, 48; 1.404 + movq mm2, mm7; 1.405 + movq mm3, mm7; 1.406 + psllq mm2, 16; 1.407 + psrlq mm3, 16; 1.408 + por mm0, mm2; 1.409 + por mm1, mm3; 1.410 + 1.411 + /* current_upper */ 1.412 + movq mm6, qword ptr [eax]; 1.413 + 1.414 + /* compute the upper-left pixel for dst on %%mm2 */ 1.415 + /* compute the upper-right pixel for dst on %%mm4 */ 1.416 + movq mm2, mm0; 1.417 + movq mm4, mm1; 1.418 + movq mm3, mm0; 1.419 + movq mm5, mm1; 1.420 + pcmpeqw mm2, mm6; 1.421 + pcmpeqw mm4, mm6; 1.422 + pcmpeqw mm3, qword ptr [ecx]; 1.423 + pcmpeqw mm5, qword ptr [ecx]; 1.424 + pandn mm3, mm2; 1.425 + pandn mm5, mm4; 1.426 + movq mm2, mm0; 1.427 + movq mm4, mm1; 1.428 + pcmpeqw mm2, mm1; 1.429 + pcmpeqw mm4, mm0; 1.430 + pandn mm2, mm3; 1.431 + pandn mm4, mm5; 1.432 + movq mm3, mm2; 1.433 + movq mm5, mm4; 1.434 + pand mm2, mm6; 1.435 + pand mm4, mm6; 1.436 + pandn mm3, mm7; 1.437 + pandn mm5, mm7; 1.438 + por mm2, mm3; 1.439 + por mm4, mm5; 1.440 + 1.441 + /* set *dst */ 1.442 + movq mm3, mm2; 1.443 + punpcklwd mm2, mm4; 1.444 + punpckhwd mm3, mm4; 1.445 + movq qword ptr [edx], mm2; 1.446 + movq qword ptr [edx + 8], mm3; 1.447 + 1.448 + /* next */ 1.449 + add eax, 8; 1.450 + add ebx, 8; 1.451 + add ecx, 8; 1.452 + add edx, 16; 1.453 + 1.454 + dec esi; 1.455 + jnz label0; 1.456 +label1: 1.457 + 1.458 + /* final run */ 1.459 + /* set the current, current_pre, current_next registers */ 1.460 + movq mm1, qword ptr [ebx]; 1.461 + movq mm7, qword ptr [ebx]; 1.462 + movq mm0, qword ptr [ebx - 8]; 1.463 + psrlq mm1, 48; 1.464 + psrlq mm0, 48; 1.465 + psllq mm1, 48; 1.466 + movq mm2, mm7; 1.467 + movq mm3, mm7; 1.468 + psllq mm2, 16; 1.469 + psrlq mm3, 16; 1.470 + por mm0, mm2; 1.471 + por mm1, mm3; 1.472 + 1.473 + /* current_upper */ 1.474 + movq mm6, qword ptr [eax]; 1.475 + 1.476 + /* compute the upper-left pixel for dst on %%mm2 */ 1.477 + /* compute the upper-right pixel for dst on %%mm4 */ 1.478 + movq mm2, mm0; 1.479 + movq mm4, mm1; 1.480 + movq mm3, mm0; 1.481 + movq mm5, mm1; 1.482 + pcmpeqw mm2, mm6; 1.483 + pcmpeqw mm4, mm6; 1.484 + pcmpeqw mm3, qword ptr [ecx]; 1.485 + pcmpeqw mm5, qword ptr [ecx]; 1.486 + pandn mm3, mm2; 1.487 + pandn mm5, mm4; 1.488 + movq mm2, mm0; 1.489 + movq mm4, mm1; 1.490 + pcmpeqw mm2, mm1; 1.491 + pcmpeqw mm4, mm0; 1.492 + pandn mm2, mm3; 1.493 + pandn mm4, mm5; 1.494 + movq mm3, mm2; 1.495 + movq mm5, mm4; 1.496 + pand mm2, mm6; 1.497 + pand mm4, mm6; 1.498 + pandn mm3, mm7; 1.499 + pandn mm5, mm7; 1.500 + por mm2, mm3; 1.501 + por mm4, mm5; 1.502 + 1.503 + /* set *dst */ 1.504 + movq mm3, mm2; 1.505 + punpcklwd mm2, mm4; 1.506 + punpckhwd mm3, mm4; 1.507 + movq qword ptr [edx], mm2; 1.508 + movq qword ptr [edx + 8], mm3; 1.509 + 1.510 + mov src0, eax; 1.511 + mov src1, ebx; 1.512 + mov src2, ecx; 1.513 + mov dst, edx; 1.514 + mov count, esi; 1.515 + 1.516 + emms; 1.517 + } 1.518 +#endif 1.519 +} 1.520 + 1.521 +static void internal_scale2x_32_mmx_single(u32 *dst, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count) 1.522 +{ 1.523 + /* always do the first and last run */ 1.524 + count -= 2 * 2; 1.525 + 1.526 +#ifdef __GNUC__ 1.527 + __asm__ __volatile__ ( 1.528 + /* first run */ 1.529 + /* set the current, current_pre, current_next registers */ 1.530 + "movq 0(%1),%%mm0\n" 1.531 + "movq 0(%1),%%mm7\n" 1.532 + "movq 8(%1),%%mm1\n" 1.533 + "psllq $32,%%mm0\n" 1.534 + "psllq $32,%%mm1\n" 1.535 + "psrlq $32,%%mm0\n" 1.536 + "movq %%mm7,%%mm2\n" 1.537 + "movq %%mm7,%%mm3\n" 1.538 + "psllq $32,%%mm2\n" 1.539 + "psrlq $32,%%mm3\n" 1.540 + "por %%mm2,%%mm0\n" 1.541 + "por %%mm3,%%mm1\n" 1.542 + 1.543 + /* current_upper */ 1.544 + "movq (%0),%%mm6\n" 1.545 + 1.546 + /* compute the upper-left pixel for dst on %%mm2 */ 1.547 + /* compute the upper-right pixel for dst on %%mm4 */ 1.548 + "movq %%mm0,%%mm2\n" 1.549 + "movq %%mm1,%%mm4\n" 1.550 + "movq %%mm0,%%mm3\n" 1.551 + "movq %%mm1,%%mm5\n" 1.552 + "pcmpeqd %%mm6,%%mm2\n" 1.553 + "pcmpeqd %%mm6,%%mm4\n" 1.554 + "pcmpeqd (%2),%%mm3\n" 1.555 + "pcmpeqd (%2),%%mm5\n" 1.556 + "pandn %%mm2,%%mm3\n" 1.557 + "pandn %%mm4,%%mm5\n" 1.558 + "movq %%mm0,%%mm2\n" 1.559 + "movq %%mm1,%%mm4\n" 1.560 + "pcmpeqd %%mm1,%%mm2\n" 1.561 + "pcmpeqd %%mm0,%%mm4\n" 1.562 + "pandn %%mm3,%%mm2\n" 1.563 + "pandn %%mm5,%%mm4\n" 1.564 + "movq %%mm2,%%mm3\n" 1.565 + "movq %%mm4,%%mm5\n" 1.566 + "pand %%mm6,%%mm2\n" 1.567 + "pand %%mm6,%%mm4\n" 1.568 + "pandn %%mm7,%%mm3\n" 1.569 + "pandn %%mm7,%%mm5\n" 1.570 + "por %%mm3,%%mm2\n" 1.571 + "por %%mm5,%%mm4\n" 1.572 + 1.573 + /* set *dst */ 1.574 + "movq %%mm2,%%mm3\n" 1.575 + "punpckldq %%mm4,%%mm2\n" 1.576 + "punpckhdq %%mm4,%%mm3\n" 1.577 + "movq %%mm2,(%3)\n" 1.578 + "movq %%mm3, 8(%3)\n" 1.579 + 1.580 + /* next */ 1.581 + "addl $8,%0\n" 1.582 + "addl $8,%1\n" 1.583 + "addl $8,%2\n" 1.584 + "addl $16,%3\n" 1.585 + 1.586 + /* central runs */ 1.587 + "shrl $1,%4\n" 1.588 + "jz 1f\n" 1.589 + 1.590 + "0:\n" 1.591 + 1.592 + /* set the current, current_pre, current_next registers */ 1.593 + "movq -8(%1),%%mm0\n" 1.594 + "movq (%1),%%mm7\n" 1.595 + "movq 8(%1),%%mm1\n" 1.596 + "psrlq $32,%%mm0\n" 1.597 + "psllq $32,%%mm1\n" 1.598 + "movq %%mm7,%%mm2\n" 1.599 + "movq %%mm7,%%mm3\n" 1.600 + "psllq $32,%%mm2\n" 1.601 + "psrlq $32,%%mm3\n" 1.602 + "por %%mm2,%%mm0\n" 1.603 + "por %%mm3,%%mm1\n" 1.604 + 1.605 + /* current_upper */ 1.606 + "movq (%0),%%mm6\n" 1.607 + 1.608 + /* compute the upper-left pixel for dst on %%mm2 */ 1.609 + /* compute the upper-right pixel for dst on %%mm4 */ 1.610 + "movq %%mm0,%%mm2\n" 1.611 + "movq %%mm1,%%mm4\n" 1.612 + "movq %%mm0,%%mm3\n" 1.613 + "movq %%mm1,%%mm5\n" 1.614 + "pcmpeqd %%mm6,%%mm2\n" 1.615 + "pcmpeqd %%mm6,%%mm4\n" 1.616 + "pcmpeqd (%2),%%mm3\n" 1.617 + "pcmpeqd (%2),%%mm5\n" 1.618 + "pandn %%mm2,%%mm3\n" 1.619 + "pandn %%mm4,%%mm5\n" 1.620 + "movq %%mm0,%%mm2\n" 1.621 + "movq %%mm1,%%mm4\n" 1.622 + "pcmpeqd %%mm1,%%mm2\n" 1.623 + "pcmpeqd %%mm0,%%mm4\n" 1.624 + "pandn %%mm3,%%mm2\n" 1.625 + "pandn %%mm5,%%mm4\n" 1.626 + "movq %%mm2,%%mm3\n" 1.627 + "movq %%mm4,%%mm5\n" 1.628 + "pand %%mm6,%%mm2\n" 1.629 + "pand %%mm6,%%mm4\n" 1.630 + "pandn %%mm7,%%mm3\n" 1.631 + "pandn %%mm7,%%mm5\n" 1.632 + "por %%mm3,%%mm2\n" 1.633 + "por %%mm5,%%mm4\n" 1.634 + 1.635 + /* set *dst */ 1.636 + "movq %%mm2,%%mm3\n" 1.637 + "punpckldq %%mm4,%%mm2\n" 1.638 + "punpckhdq %%mm4,%%mm3\n" 1.639 + "movq %%mm2,(%3)\n" 1.640 + "movq %%mm3,8(%3)\n" 1.641 + 1.642 + /* next */ 1.643 + "addl $8,%0\n" 1.644 + "addl $8,%1\n" 1.645 + "addl $8,%2\n" 1.646 + "addl $16,%3\n" 1.647 + 1.648 + "decl %4\n" 1.649 + "jnz 0b\n" 1.650 + "1:\n" 1.651 + 1.652 + /* final run */ 1.653 + /* set the current, current_pre, current_next registers */ 1.654 + "movq (%1),%%mm1\n" 1.655 + "movq (%1),%%mm7\n" 1.656 + "movq -8(%1), %%mm0\n" 1.657 + "psrlq $32,%%mm1\n" 1.658 + "psrlq $32,%%mm0\n" 1.659 + "psllq $32,%%mm1\n" 1.660 + "movq %%mm7,%%mm2\n" 1.661 + "movq %%mm7,%%mm3\n" 1.662 + "psllq $32,%%mm2\n" 1.663 + "psrlq $32,%%mm3\n" 1.664 + "por %%mm2,%%mm0\n" 1.665 + "por %%mm3,%%mm1\n" 1.666 + 1.667 + /* current_upper */ 1.668 + "movq (%0),%%mm6\n" 1.669 + 1.670 + /* compute the upper-left pixel for dst on %%mm2 */ 1.671 + /* compute the upper-right pixel for dst on %%mm4 */ 1.672 + "movq %%mm0,%%mm2\n" 1.673 + "movq %%mm1,%%mm4\n" 1.674 + "movq %%mm0,%%mm3\n" 1.675 + "movq %%mm1,%%mm5\n" 1.676 + "pcmpeqd %%mm6,%%mm2\n" 1.677 + "pcmpeqd %%mm6,%%mm4\n" 1.678 + "pcmpeqd (%2),%%mm3\n" 1.679 + "pcmpeqd (%2),%%mm5\n" 1.680 + "pandn %%mm2,%%mm3\n" 1.681 + "pandn %%mm4,%%mm5\n" 1.682 + "movq %%mm0,%%mm2\n" 1.683 + "movq %%mm1,%%mm4\n" 1.684 + "pcmpeqd %%mm1,%%mm2\n" 1.685 + "pcmpeqd %%mm0,%%mm4\n" 1.686 + "pandn %%mm3,%%mm2\n" 1.687 + "pandn %%mm5,%%mm4\n" 1.688 + "movq %%mm2,%%mm3\n" 1.689 + "movq %%mm4,%%mm5\n" 1.690 + "pand %%mm6,%%mm2\n" 1.691 + "pand %%mm6,%%mm4\n" 1.692 + "pandn %%mm7,%%mm3\n" 1.693 + "pandn %%mm7,%%mm5\n" 1.694 + "por %%mm3,%%mm2\n" 1.695 + "por %%mm5,%%mm4\n" 1.696 + 1.697 + /* set *dst */ 1.698 + "movq %%mm2,%%mm3\n" 1.699 + "punpckldq %%mm4,%%mm2\n" 1.700 + "punpckhdq %%mm4,%%mm3\n" 1.701 + "movq %%mm2,(%3)\n" 1.702 + "movq %%mm3,8(%3)\n" 1.703 + "emms\n" 1.704 + 1.705 + : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count) 1.706 + : 1.707 + : "cc" 1.708 + ); 1.709 +#else 1.710 + __asm { 1.711 + mov eax, src0; 1.712 + mov ebx, src1; 1.713 + mov ecx, src2; 1.714 + mov edx, dst; 1.715 + mov esi, count; 1.716 + 1.717 + /* first run */ 1.718 + /* set the current, current_pre, current_next registers */ 1.719 + movq mm0, qword ptr [ebx]; 1.720 + movq mm7, qword ptr [ebx]; 1.721 + movq mm1, qword ptr [ebx + 8]; 1.722 + psllq mm0, 32; 1.723 + psllq mm1, 32; 1.724 + psrlq mm0, 32; 1.725 + movq mm2, mm7; 1.726 + movq mm3, mm7; 1.727 + psllq mm2, 32; 1.728 + psrlq mm3, 32; 1.729 + por mm0, mm2; 1.730 + por mm1, mm3; 1.731 + 1.732 + /* current_upper */ 1.733 + movq mm6, qword ptr [eax]; 1.734 + 1.735 + /* compute the upper-left pixel for dst on %%mm2 */ 1.736 + /* compute the upper-right pixel for dst on %%mm4 */ 1.737 + movq mm2, mm0; 1.738 + movq mm4, mm1; 1.739 + movq mm3, mm0; 1.740 + movq mm5, mm1; 1.741 + pcmpeqd mm2, mm6; 1.742 + pcmpeqd mm4, mm6; 1.743 + pcmpeqd mm3, qword ptr [ecx]; 1.744 + pcmpeqd mm5, qword ptr [ecx]; 1.745 + pandn mm3, mm2; 1.746 + pandn mm5, mm4; 1.747 + movq mm2, mm0; 1.748 + movq mm4, mm1; 1.749 + pcmpeqd mm2, mm1; 1.750 + pcmpeqd mm4, mm0; 1.751 + pandn mm2, mm3; 1.752 + pandn mm4, mm5; 1.753 + movq mm3, mm2; 1.754 + movq mm5, mm4; 1.755 + pand mm2, mm6; 1.756 + pand mm4, mm6; 1.757 + pandn mm3, mm7; 1.758 + pandn mm5, mm7; 1.759 + por mm2, mm3; 1.760 + por mm4, mm5; 1.761 + 1.762 + /* set *dst */ 1.763 + movq mm3, mm2; 1.764 + punpckldq mm2, mm4; 1.765 + punpckhdq mm3, mm4; 1.766 + movq qword ptr [edx], mm2; 1.767 + movq qword ptr [edx + 8], mm3; 1.768 + 1.769 + /* next */ 1.770 + add eax, 8; 1.771 + add ebx, 8; 1.772 + add ecx, 8; 1.773 + add edx, 16; 1.774 + 1.775 + /* central runs */ 1.776 + shr esi, 1; 1.777 + jz label1; 1.778 +label0: 1.779 + 1.780 + /* set the current, current_pre, current_next registers */ 1.781 + movq mm0, qword ptr [ebx - 8]; 1.782 + movq mm7, qword ptr [ebx]; 1.783 + movq mm1, qword ptr [ebx + 8]; 1.784 + psrlq mm0, 32; 1.785 + psllq mm1, 32; 1.786 + movq mm2, mm7; 1.787 + movq mm3, mm7; 1.788 + psllq mm2, 32; 1.789 + psrlq mm3, 32; 1.790 + por mm0, mm2; 1.791 + por mm1, mm3; 1.792 + 1.793 + /* current_upper */ 1.794 + movq mm6, qword ptr[eax]; 1.795 + 1.796 + /* compute the upper-left pixel for dst on %%mm2 */ 1.797 + /* compute the upper-right pixel for dst on %%mm4 */ 1.798 + movq mm2, mm0; 1.799 + movq mm4, mm1; 1.800 + movq mm3, mm0; 1.801 + movq mm5, mm1; 1.802 + pcmpeqd mm2, mm6; 1.803 + pcmpeqd mm4, mm6; 1.804 + pcmpeqd mm3, qword ptr[ecx]; 1.805 + pcmpeqd mm5, qword ptr[ecx]; 1.806 + pandn mm3, mm2; 1.807 + pandn mm5, mm4; 1.808 + movq mm2, mm0; 1.809 + movq mm4, mm1; 1.810 + pcmpeqd mm2, mm1; 1.811 + pcmpeqd mm4, mm0; 1.812 + pandn mm2, mm3; 1.813 + pandn mm4, mm5; 1.814 + movq mm3, mm2; 1.815 + movq mm5, mm4; 1.816 + pand mm2, mm6; 1.817 + pand mm4, mm6; 1.818 + pandn mm3, mm7; 1.819 + pandn mm5, mm7; 1.820 + por mm2, mm3; 1.821 + por mm4, mm5; 1.822 + 1.823 + /* set *dst */ 1.824 + movq mm3, mm2; 1.825 + punpckldq mm2, mm4; 1.826 + punpckhdq mm3, mm4; 1.827 + movq qword ptr [edx], mm2; 1.828 + movq qword ptr [edx + 8], mm3; 1.829 + 1.830 + /* next */ 1.831 + add eax, 8; 1.832 + add ebx, 8; 1.833 + add ecx, 8; 1.834 + add edx, 16; 1.835 + 1.836 + dec esi; 1.837 + jnz label0; 1.838 +label1: 1.839 + 1.840 + /* final run */ 1.841 + /* set the current, current_pre, current_next registers */ 1.842 + movq mm1, qword ptr [ebx]; 1.843 + movq mm7, qword ptr [ebx]; 1.844 + movq mm0, qword ptr [ebx - 8]; 1.845 + psrlq mm1, 32; 1.846 + psrlq mm0, 32; 1.847 + psllq mm1, 32; 1.848 + movq mm2, mm7; 1.849 + movq mm3, mm7; 1.850 + psllq mm2, 32; 1.851 + psrlq mm3, 32; 1.852 + por mm0, mm2; 1.853 + por mm1, mm3; 1.854 + 1.855 + /* current_upper */ 1.856 + movq mm6, qword ptr [eax]; 1.857 + 1.858 + /* compute the upper-left pixel for dst on %%mm2 */ 1.859 + /* compute the upper-right pixel for dst on %%mm4 */ 1.860 + movq mm2, mm0; 1.861 + movq mm4, mm1; 1.862 + movq mm3, mm0; 1.863 + movq mm5, mm1; 1.864 + pcmpeqd mm2, mm6; 1.865 + pcmpeqd mm4, mm6; 1.866 + pcmpeqd mm3, qword ptr [ecx]; 1.867 + pcmpeqd mm5, qword ptr [ecx]; 1.868 + pandn mm3, mm2; 1.869 + pandn mm5, mm4; 1.870 + movq mm2, mm0; 1.871 + movq mm4, mm1; 1.872 + pcmpeqd mm2, mm1; 1.873 + pcmpeqd mm4, mm0; 1.874 + pandn mm2, mm3; 1.875 + pandn mm4, mm5; 1.876 + movq mm3, mm2; 1.877 + movq mm5, mm4; 1.878 + pand mm2, mm6; 1.879 + pand mm4, mm6; 1.880 + pandn mm3, mm7; 1.881 + pandn mm5, mm7; 1.882 + por mm2, mm3; 1.883 + por mm4, mm5; 1.884 + 1.885 + /* set *dst */ 1.886 + movq mm3, mm2; 1.887 + punpckldq mm2, mm4; 1.888 + punpckhdq mm3, mm4; 1.889 + movq qword ptr [edx], mm2; 1.890 + movq qword ptr [edx + 8], mm3; 1.891 + 1.892 + mov src0, eax; 1.893 + mov src1, ebx; 1.894 + mov src2, ecx; 1.895 + mov dst, edx; 1.896 + mov count, esi; 1.897 + 1.898 + emms; 1.899 + } 1.900 +#endif 1.901 +} 1.902 + 1.903 +static void internal_scale2x_16_mmx(u16 *dst0, u16 *dst1, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count) 1.904 +{ 1.905 + // assert( count >= 2*4 ); 1.906 + internal_scale2x_16_mmx_single(dst0, src0, src1, src2, count); 1.907 + internal_scale2x_16_mmx_single(dst1, src2, src1, src0, count); 1.908 +} 1.909 + 1.910 +static void internal_scale2x_32_mmx(u32 *dst0, u32 *dst1, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count) 1.911 +{ 1.912 + // assert( count >= 2*2 ); 1.913 + internal_scale2x_32_mmx_single(dst0, src0, src1, src2, count); 1.914 + internal_scale2x_32_mmx_single(dst1, src2, src1, src0, count); 1.915 +} 1.916 + 1.917 +#endif 1.918 + 1.919 +void AdMame2x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */, 1.920 + u8 *dstPtr, u32 dstPitch, int width, int height) 1.921 +{ 1.922 + u16 *dst0 = (u16 *)dstPtr; 1.923 + u16 *dst1 = dst0 + (dstPitch >> 1); 1.924 + 1.925 + u16 *src0 = (u16 *)srcPtr; 1.926 + u16 *src1 = src0 + (srcPitch >> 1); 1.927 + u16 *src2 = src1 + (srcPitch >> 1); 1.928 +#ifdef MMX 1.929 + if (cpu_mmx) 1.930 + { 1.931 + internal_scale2x_16_mmx(dst0, dst1, src0, src0, src1, width); 1.932 + 1.933 + int count = height; 1.934 + 1.935 + count -= 2; 1.936 + while (count) 1.937 + { 1.938 + dst0 += dstPitch; 1.939 + dst1 += dstPitch; 1.940 + internal_scale2x_16_mmx(dst0, dst1, src0, src1, src2, width); 1.941 + src0 = src1; 1.942 + src1 = src2; 1.943 + src2 += srcPitch >> 1; 1.944 + --count; 1.945 + } 1.946 + dst0 += dstPitch; 1.947 + dst1 += dstPitch; 1.948 + internal_scale2x_16_mmx(dst0, dst1, src0, src1, src1, width); 1.949 + } 1.950 + else 1.951 + { 1.952 +#endif 1.953 + internal_scale2x_16_def(dst0, src0, src0, src1, width); 1.954 + internal_scale2x_16_def(dst1, src1, src0, src0, width); 1.955 + 1.956 + int count = height; 1.957 + 1.958 + count -= 2; 1.959 + while (count) 1.960 + { 1.961 + dst0 += dstPitch; 1.962 + dst1 += dstPitch; 1.963 + internal_scale2x_16_def(dst0, src0, src1, src2, width); 1.964 + internal_scale2x_16_def(dst1, src2, src1, src0, width); 1.965 + src0 = src1; 1.966 + src1 = src2; 1.967 + src2 += srcPitch >> 1; 1.968 + --count; 1.969 + } 1.970 + dst0 += dstPitch; 1.971 + dst1 += dstPitch; 1.972 + internal_scale2x_16_def(dst0, src0, src1, src1, width); 1.973 + internal_scale2x_16_def(dst1, src1, src1, src0, width); 1.974 +#ifdef MMX 1.975 +} 1.976 + 1.977 +#endif 1.978 +} 1.979 + 1.980 +void AdMame2x32(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */, 1.981 + u8 *dstPtr, u32 dstPitch, int width, int height) 1.982 +{ 1.983 + u32 *dst0 = (u32 *)dstPtr; 1.984 + u32 *dst1 = dst0 + (dstPitch >> 2); 1.985 + 1.986 + u32 *src0 = (u32 *)srcPtr; 1.987 + u32 *src1 = src0 + (srcPitch >> 2); 1.988 + u32 *src2 = src1 + (srcPitch >> 2); 1.989 +#ifdef MMX 1.990 + if (cpu_mmx) 1.991 + { 1.992 + internal_scale2x_32_mmx(dst0, dst1, src0, src0, src1, width); 1.993 + 1.994 + int count = height; 1.995 + 1.996 + count -= 2; 1.997 + while (count) 1.998 + { 1.999 + dst0 += dstPitch >> 1; 1.1000 + dst1 += dstPitch >> 1; 1.1001 + internal_scale2x_32_mmx(dst0, dst1, src0, src1, src2, width); 1.1002 + src0 = src1; 1.1003 + src1 = src2; 1.1004 + src2 += srcPitch >> 2; 1.1005 + --count; 1.1006 + } 1.1007 + dst0 += dstPitch >> 1; 1.1008 + dst1 += dstPitch >> 1; 1.1009 + internal_scale2x_32_mmx(dst0, dst1, src0, src1, src1, width); 1.1010 + } 1.1011 + else 1.1012 + { 1.1013 +#endif 1.1014 + internal_scale2x_32_def(dst0, src0, src0, src1, width); 1.1015 + internal_scale2x_32_def(dst1, src1, src0, src0, width); 1.1016 + 1.1017 + int count = height; 1.1018 + 1.1019 + count -= 2; 1.1020 + while (count) 1.1021 + { 1.1022 + dst0 += dstPitch >> 1; 1.1023 + dst1 += dstPitch >> 1; 1.1024 + internal_scale2x_32_def(dst0, src0, src1, src2, width); 1.1025 + internal_scale2x_32_def(dst1, src2, src1, src0, width); 1.1026 + src0 = src1; 1.1027 + src1 = src2; 1.1028 + src2 += srcPitch >> 2; 1.1029 + --count; 1.1030 + } 1.1031 + dst0 += dstPitch >> 1; 1.1032 + dst1 += dstPitch >> 1; 1.1033 + internal_scale2x_32_def(dst0, src0, src1, src1, width); 1.1034 + internal_scale2x_32_def(dst1, src1, src1, src0, width); 1.1035 +#ifdef MMX 1.1036 +} 1.1037 + 1.1038 +#endif 1.1039 +}