Mercurial > vba-clojure
view src/filters/admame.cpp @ 135:eb6ba88088d3
Wrote a more efficient input-number-assembly program; 91 oc -> 60 oc.
author | Dylan Holmes <ocsenave@gmail.com> |
---|---|
date | Sun, 18 Mar 2012 05:13:19 -0500 |
parents | f9f4f1b99eed |
children |
line wrap: on
line source
1 /*2 * This file is part of the Advance project.3 *4 * Copyright (C) 1999-2002 Andrea Mazzoleni5 *6 * This program is free software; you can redistribute it and/or modify7 * it under the terms of the GNU General Public License as published by8 * the Free Software Foundation; either version 2 of the License, or9 * (at your option) any later version.10 *11 * This program is distributed in the hope that it will be useful,12 * but WITHOUT ANY WARRANTY; without even the implied warranty of13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the14 * GNU General Public License for more details.15 *16 * You should have received a copy of the GNU General Public License17 * along with this program; if not, write to the Free Software18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.19 *20 * In addition, as a special exception, Andrea Mazzoleni21 * gives permission to link the code of this program with22 * the MAME library (or with modified versions of MAME that use the23 * same license as MAME), and distribute linked combinations including24 * the two. You must obey the GNU General Public License in all25 * respects for all of the code used other than MAME. If you modify26 * this file, you may extend this exception to your version of the27 * file, but you are not obligated to do so. If you do not wish to28 * do so, delete this exception statement from your version.29 */31 /*32 * Alternatively at the previous license terms, you are allowed to use this33 * code in your program with these conditions:34 * - the program is not used in commercial activities.35 * - the whole source code of the program is released with the binary.36 */38 #include "../Port.h"40 #ifdef MMX41 extern "C" bool cpu_mmx;42 #endif44 static void internal_scale2x_16_def(u16 *dst, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)45 {46 /* first pixel */47 dst[0] = src1[0];48 if (src1[1] == src0[0] && src2[0] != src0[0])49 dst[1] = src0[0];50 else51 dst[1] = src1[0];52 ++src0;53 ++src1;54 ++src2;55 dst += 2;57 /* central pixels */58 count -= 2;59 while (count)60 {61 if (src0[0] != src2[0] && src1[-1] != src1[1])62 {63 dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0];64 dst[1] = src1[1] == src0[0] ? src0[0] : src1[0];65 }66 else67 {68 dst[0] = src1[0];69 dst[1] = src1[0];70 }72 ++src0;73 ++src1;74 ++src2;75 dst += 2;76 --count;77 }79 /* last pixel */80 if (src1[-1] == src0[0] && src2[0] != src0[0])81 dst[0] = src0[0];82 else83 dst[0] = src1[0];84 dst[1] = src1[0];85 }87 static void internal_scale2x_32_def(u32 *dst,88 const u32 *src0,89 const u32 *src1,90 const u32 *src2,91 unsigned count)92 {93 /* first pixel */94 dst[0] = src1[0];95 if (src1[1] == src0[0] && src2[0] != src0[0])96 dst[1] = src0[0];97 else98 dst[1] = src1[0];99 ++src0;100 ++src1;101 ++src2;102 dst += 2;104 /* central pixels */105 count -= 2;106 while (count)107 {108 if (src0[0] != src2[0] && src1[-1] != src1[1])109 {110 dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0];111 dst[1] = src1[1] == src0[0] ? src0[0] : src1[0];112 }113 else114 {115 dst[0] = src1[0];116 dst[1] = src1[0];117 }119 ++src0;120 ++src1;121 ++src2;122 dst += 2;123 --count;124 }126 /* last pixel */127 if (src1[-1] == src0[0] && src2[0] != src0[0])128 dst[0] = src0[0];129 else130 dst[0] = src1[0];131 dst[1] = src1[0];132 }134 #ifdef MMX135 static void internal_scale2x_16_mmx_single(u16 *dst, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)136 {137 /* always do the first and last run */138 count -= 2 * 4;140 #ifdef __GNUC__141 __asm__ __volatile__ (142 /* first run */143 /* set the current, current_pre, current_next registers */144 "movq 0(%1), %%mm0\n"145 "movq 0(%1),%%mm7\n"146 "movq 8(%1),%%mm1\n"147 "psllq $48,%%mm0\n"148 "psllq $48,%%mm1\n"149 "psrlq $48, %%mm0\n"150 "movq %%mm7,%%mm2\n"151 "movq %%mm7,%%mm3\n"152 "psllq $16,%%mm2\n"153 "psrlq $16,%%mm3\n"154 "por %%mm2,%%mm0\n"155 "por %%mm3,%%mm1\n"157 /* current_upper */158 "movq (%0),%%mm6\n"160 /* compute the upper-left pixel for dst on %%mm2 */161 /* compute the upper-right pixel for dst on %%mm4 */162 "movq %%mm0,%%mm2\n"163 "movq %%mm1,%%mm4\n"164 "movq %%mm0,%%mm3\n"165 "movq %%mm1,%%mm5\n"166 "pcmpeqw %%mm6,%%mm2\n"167 "pcmpeqw %%mm6,%%mm4\n"168 "pcmpeqw (%2),%%mm3\n"169 "pcmpeqw (%2),%%mm5\n"170 "pandn %%mm2,%%mm3\n"171 "pandn %%mm4,%%mm5\n"172 "movq %%mm0,%%mm2\n"173 "movq %%mm1,%%mm4\n"174 "pcmpeqw %%mm1,%%mm2\n"175 "pcmpeqw %%mm0,%%mm4\n"176 "pandn %%mm3,%%mm2\n"177 "pandn %%mm5,%%mm4\n"178 "movq %%mm2,%%mm3\n"179 "movq %%mm4,%%mm5\n"180 "pand %%mm6,%%mm2\n"181 "pand %%mm6,%%mm4\n"182 "pandn %%mm7,%%mm3\n"183 "pandn %%mm7,%%mm5\n"184 "por %%mm3,%%mm2\n"185 "por %%mm5,%%mm4\n"187 /* set *dst */188 "movq %%mm2,%%mm3\n"189 "punpcklwd %%mm4,%%mm2\n"190 "punpckhwd %%mm4,%%mm3\n"191 "movq %%mm2,(%3)\n"192 "movq %%mm3,8(%3)\n"194 /* next */195 "addl $8,%0\n"196 "addl $8,%1\n"197 "addl $8,%2\n"198 "addl $16,%3\n"200 /* central runs */201 "shrl $2,%4\n"202 "jz 1f\n"204 "0:\n"206 /* set the current, current_pre, current_next registers */207 "movq -8(%1),%%mm0\n"208 "movq (%1),%%mm7\n"209 "movq 8(%1),%%mm1\n"210 "psrlq $48,%%mm0\n"211 "psllq $48,%%mm1\n"212 "movq %%mm7,%%mm2\n"213 "movq %%mm7,%%mm3\n"214 "psllq $16,%%mm2\n"215 "psrlq $16,%%mm3\n"216 "por %%mm2,%%mm0\n"217 "por %%mm3,%%mm1\n"219 /* current_upper */220 "movq (%0),%%mm6\n"222 /* compute the upper-left pixel for dst on %%mm2 */223 /* compute the upper-right pixel for dst on %%mm4 */224 "movq %%mm0,%%mm2\n"225 "movq %%mm1,%%mm4\n"226 "movq %%mm0,%%mm3\n"227 "movq %%mm1,%%mm5\n"228 "pcmpeqw %%mm6,%%mm2\n"229 "pcmpeqw %%mm6,%%mm4\n"230 "pcmpeqw (%2),%%mm3\n"231 "pcmpeqw (%2),%%mm5\n"232 "pandn %%mm2,%%mm3\n"233 "pandn %%mm4,%%mm5\n"234 "movq %%mm0,%%mm2\n"235 "movq %%mm1,%%mm4\n"236 "pcmpeqw %%mm1,%%mm2\n"237 "pcmpeqw %%mm0,%%mm4\n"238 "pandn %%mm3,%%mm2\n"239 "pandn %%mm5,%%mm4\n"240 "movq %%mm2,%%mm3\n"241 "movq %%mm4,%%mm5\n"242 "pand %%mm6,%%mm2\n"243 "pand %%mm6,%%mm4\n"244 "pandn %%mm7,%%mm3\n"245 "pandn %%mm7,%%mm5\n"246 "por %%mm3,%%mm2\n"247 "por %%mm5,%%mm4\n"249 /* set *dst */250 "movq %%mm2,%%mm3\n"251 "punpcklwd %%mm4,%%mm2\n"252 "punpckhwd %%mm4,%%mm3\n"253 "movq %%mm2,(%3)\n"254 "movq %%mm3,8(%3)\n"256 /* next */257 "addl $8,%0\n"258 "addl $8,%1\n"259 "addl $8,%2\n"260 "addl $16,%3\n"262 "decl %4\n"263 "jnz 0b\n"264 "1:\n"266 /* final run */267 /* set the current, current_pre, current_next registers */268 "movq (%1),%%mm1\n"269 "movq (%1),%%mm7\n"270 "movq -8(%1),%%mm0\n"271 "psrlq $48,%%mm1\n"272 "psrlq $48,%%mm0\n"273 "psllq $48,%%mm1\n"274 "movq %%mm7,%%mm2\n"275 "movq %%mm7,%%mm3\n"276 "psllq $16,%%mm2\n"277 "psrlq $16,%%mm3\n"278 "por %%mm2,%%mm0\n"279 "por %%mm3,%%mm1\n"281 /* current_upper */282 "movq (%0),%%mm6\n"284 /* compute the upper-left pixel for dst on %%mm2 */285 /* compute the upper-right pixel for dst on %%mm4 */286 "movq %%mm0,%%mm2\n"287 "movq %%mm1,%%mm4\n"288 "movq %%mm0,%%mm3\n"289 "movq %%mm1,%%mm5\n"290 "pcmpeqw %%mm6,%%mm2\n"291 "pcmpeqw %%mm6,%%mm4\n"292 "pcmpeqw (%2),%%mm3\n"293 "pcmpeqw (%2),%%mm5\n"294 "pandn %%mm2,%%mm3\n"295 "pandn %%mm4,%%mm5\n"296 "movq %%mm0,%%mm2\n"297 "movq %%mm1,%%mm4\n"298 "pcmpeqw %%mm1,%%mm2\n"299 "pcmpeqw %%mm0,%%mm4\n"300 "pandn %%mm3,%%mm2\n"301 "pandn %%mm5,%%mm4\n"302 "movq %%mm2,%%mm3\n"303 "movq %%mm4,%%mm5\n"304 "pand %%mm6,%%mm2\n"305 "pand %%mm6,%%mm4\n"306 "pandn %%mm7,%%mm3\n"307 "pandn %%mm7,%%mm5\n"308 "por %%mm3,%%mm2\n"309 "por %%mm5,%%mm4\n"311 /* set *dst */312 "movq %%mm2,%%mm3\n"313 "punpcklwd %%mm4,%%mm2\n"314 "punpckhwd %%mm4,%%mm3\n"315 "movq %%mm2,(%3)\n"316 "movq %%mm3,8(%3)\n"317 "emms\n"319 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)320 :321 : "cc"322 );323 #else324 __asm {325 mov eax, src0;326 mov ebx, src1;327 mov ecx, src2;328 mov edx, dst;329 mov esi, count;331 /* first run */332 /* set the current, current_pre, current_next registers */333 movq mm0, qword ptr [ebx];334 movq mm7, qword ptr [ebx];335 movq mm1, qword ptr [ebx + 8];336 psllq mm0, 48;337 psllq mm1, 48;338 psrlq mm0, 48;339 movq mm2, mm7;340 movq mm3, mm7;341 psllq mm2, 16;342 psrlq mm3, 16;343 por mm0, mm2;344 por mm1, mm3;346 /* current_upper */347 movq mm6, qword ptr [eax];349 /* compute the upper-left pixel for dst on %%mm2 */350 /* compute the upper-right pixel for dst on %%mm4 */351 movq mm2, mm0;352 movq mm4, mm1;353 movq mm3, mm0;354 movq mm5, mm1;355 pcmpeqw mm2, mm6;356 pcmpeqw mm4, mm6;357 pcmpeqw mm3, qword ptr [ecx];358 pcmpeqw mm5, qword ptr [ecx];359 pandn mm3, mm2;360 pandn mm5, mm4;361 movq mm2, mm0;362 movq mm4, mm1;363 pcmpeqw mm2, mm1;364 pcmpeqw mm4, mm0;365 pandn mm2, mm3;366 pandn mm4, mm5;367 movq mm3, mm2;368 movq mm5, mm4;369 pand mm2, mm6;370 pand mm4, mm6;371 pandn mm3, mm7;372 pandn mm5, mm7;373 por mm2, mm3;374 por mm4, mm5;376 /* set *dst0 */377 movq mm3, mm2;378 punpcklwd mm2, mm4;379 punpckhwd mm3, mm4;380 movq qword ptr [edx], mm2;381 movq qword ptr [edx + 8], mm3;383 /* next */384 add eax, 8;385 add ebx, 8;386 add ecx, 8;387 add edx, 16;389 /* central runs */390 shr esi, 2;391 jz label1;392 align 4;393 label0:395 /* set the current, current_pre, current_next registers */396 movq mm0, qword ptr [ebx - 8];397 movq mm7, qword ptr [ebx];398 movq mm1, qword ptr [ebx + 8];399 psrlq mm0, 48;400 psllq mm1, 48;401 movq mm2, mm7;402 movq mm3, mm7;403 psllq mm2, 16;404 psrlq mm3, 16;405 por mm0, mm2;406 por mm1, mm3;408 /* current_upper */409 movq mm6, qword ptr [eax];411 /* compute the upper-left pixel for dst on %%mm2 */412 /* compute the upper-right pixel for dst on %%mm4 */413 movq mm2, mm0;414 movq mm4, mm1;415 movq mm3, mm0;416 movq mm5, mm1;417 pcmpeqw mm2, mm6;418 pcmpeqw mm4, mm6;419 pcmpeqw mm3, qword ptr [ecx];420 pcmpeqw mm5, qword ptr [ecx];421 pandn mm3, mm2;422 pandn mm5, mm4;423 movq mm2, mm0;424 movq mm4, mm1;425 pcmpeqw mm2, mm1;426 pcmpeqw mm4, mm0;427 pandn mm2, mm3;428 pandn mm4, mm5;429 movq mm3, mm2;430 movq mm5, mm4;431 pand mm2, mm6;432 pand mm4, mm6;433 pandn mm3, mm7;434 pandn mm5, mm7;435 por mm2, mm3;436 por mm4, mm5;438 /* set *dst */439 movq mm3, mm2;440 punpcklwd mm2, mm4;441 punpckhwd mm3, mm4;442 movq qword ptr [edx], mm2;443 movq qword ptr [edx + 8], mm3;445 /* next */446 add eax, 8;447 add ebx, 8;448 add ecx, 8;449 add edx, 16;451 dec esi;452 jnz label0;453 label1:455 /* final run */456 /* set the current, current_pre, current_next registers */457 movq mm1, qword ptr [ebx];458 movq mm7, qword ptr [ebx];459 movq mm0, qword ptr [ebx - 8];460 psrlq mm1, 48;461 psrlq mm0, 48;462 psllq mm1, 48;463 movq mm2, mm7;464 movq mm3, mm7;465 psllq mm2, 16;466 psrlq mm3, 16;467 por mm0, mm2;468 por mm1, mm3;470 /* current_upper */471 movq mm6, qword ptr [eax];473 /* compute the upper-left pixel for dst on %%mm2 */474 /* compute the upper-right pixel for dst on %%mm4 */475 movq mm2, mm0;476 movq mm4, mm1;477 movq mm3, mm0;478 movq mm5, mm1;479 pcmpeqw mm2, mm6;480 pcmpeqw mm4, mm6;481 pcmpeqw mm3, qword ptr [ecx];482 pcmpeqw mm5, qword ptr [ecx];483 pandn mm3, mm2;484 pandn mm5, mm4;485 movq mm2, mm0;486 movq mm4, mm1;487 pcmpeqw mm2, mm1;488 pcmpeqw mm4, mm0;489 pandn mm2, mm3;490 pandn mm4, mm5;491 movq mm3, mm2;492 movq mm5, mm4;493 pand mm2, mm6;494 pand mm4, mm6;495 pandn mm3, mm7;496 pandn mm5, mm7;497 por mm2, mm3;498 por mm4, mm5;500 /* set *dst */501 movq mm3, mm2;502 punpcklwd mm2, mm4;503 punpckhwd mm3, mm4;504 movq qword ptr [edx], mm2;505 movq qword ptr [edx + 8], mm3;507 mov src0, eax;508 mov src1, ebx;509 mov src2, ecx;510 mov dst, edx;511 mov count, esi;513 emms;514 }515 #endif516 }518 static void internal_scale2x_32_mmx_single(u32 *dst, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count)519 {520 /* always do the first and last run */521 count -= 2 * 2;523 #ifdef __GNUC__524 __asm__ __volatile__ (525 /* first run */526 /* set the current, current_pre, current_next registers */527 "movq 0(%1),%%mm0\n"528 "movq 0(%1),%%mm7\n"529 "movq 8(%1),%%mm1\n"530 "psllq $32,%%mm0\n"531 "psllq $32,%%mm1\n"532 "psrlq $32,%%mm0\n"533 "movq %%mm7,%%mm2\n"534 "movq %%mm7,%%mm3\n"535 "psllq $32,%%mm2\n"536 "psrlq $32,%%mm3\n"537 "por %%mm2,%%mm0\n"538 "por %%mm3,%%mm1\n"540 /* current_upper */541 "movq (%0),%%mm6\n"543 /* compute the upper-left pixel for dst on %%mm2 */544 /* compute the upper-right pixel for dst on %%mm4 */545 "movq %%mm0,%%mm2\n"546 "movq %%mm1,%%mm4\n"547 "movq %%mm0,%%mm3\n"548 "movq %%mm1,%%mm5\n"549 "pcmpeqd %%mm6,%%mm2\n"550 "pcmpeqd %%mm6,%%mm4\n"551 "pcmpeqd (%2),%%mm3\n"552 "pcmpeqd (%2),%%mm5\n"553 "pandn %%mm2,%%mm3\n"554 "pandn %%mm4,%%mm5\n"555 "movq %%mm0,%%mm2\n"556 "movq %%mm1,%%mm4\n"557 "pcmpeqd %%mm1,%%mm2\n"558 "pcmpeqd %%mm0,%%mm4\n"559 "pandn %%mm3,%%mm2\n"560 "pandn %%mm5,%%mm4\n"561 "movq %%mm2,%%mm3\n"562 "movq %%mm4,%%mm5\n"563 "pand %%mm6,%%mm2\n"564 "pand %%mm6,%%mm4\n"565 "pandn %%mm7,%%mm3\n"566 "pandn %%mm7,%%mm5\n"567 "por %%mm3,%%mm2\n"568 "por %%mm5,%%mm4\n"570 /* set *dst */571 "movq %%mm2,%%mm3\n"572 "punpckldq %%mm4,%%mm2\n"573 "punpckhdq %%mm4,%%mm3\n"574 "movq %%mm2,(%3)\n"575 "movq %%mm3, 8(%3)\n"577 /* next */578 "addl $8,%0\n"579 "addl $8,%1\n"580 "addl $8,%2\n"581 "addl $16,%3\n"583 /* central runs */584 "shrl $1,%4\n"585 "jz 1f\n"587 "0:\n"589 /* set the current, current_pre, current_next registers */590 "movq -8(%1),%%mm0\n"591 "movq (%1),%%mm7\n"592 "movq 8(%1),%%mm1\n"593 "psrlq $32,%%mm0\n"594 "psllq $32,%%mm1\n"595 "movq %%mm7,%%mm2\n"596 "movq %%mm7,%%mm3\n"597 "psllq $32,%%mm2\n"598 "psrlq $32,%%mm3\n"599 "por %%mm2,%%mm0\n"600 "por %%mm3,%%mm1\n"602 /* current_upper */603 "movq (%0),%%mm6\n"605 /* compute the upper-left pixel for dst on %%mm2 */606 /* compute the upper-right pixel for dst on %%mm4 */607 "movq %%mm0,%%mm2\n"608 "movq %%mm1,%%mm4\n"609 "movq %%mm0,%%mm3\n"610 "movq %%mm1,%%mm5\n"611 "pcmpeqd %%mm6,%%mm2\n"612 "pcmpeqd %%mm6,%%mm4\n"613 "pcmpeqd (%2),%%mm3\n"614 "pcmpeqd (%2),%%mm5\n"615 "pandn %%mm2,%%mm3\n"616 "pandn %%mm4,%%mm5\n"617 "movq %%mm0,%%mm2\n"618 "movq %%mm1,%%mm4\n"619 "pcmpeqd %%mm1,%%mm2\n"620 "pcmpeqd %%mm0,%%mm4\n"621 "pandn %%mm3,%%mm2\n"622 "pandn %%mm5,%%mm4\n"623 "movq %%mm2,%%mm3\n"624 "movq %%mm4,%%mm5\n"625 "pand %%mm6,%%mm2\n"626 "pand %%mm6,%%mm4\n"627 "pandn %%mm7,%%mm3\n"628 "pandn %%mm7,%%mm5\n"629 "por %%mm3,%%mm2\n"630 "por %%mm5,%%mm4\n"632 /* set *dst */633 "movq %%mm2,%%mm3\n"634 "punpckldq %%mm4,%%mm2\n"635 "punpckhdq %%mm4,%%mm3\n"636 "movq %%mm2,(%3)\n"637 "movq %%mm3,8(%3)\n"639 /* next */640 "addl $8,%0\n"641 "addl $8,%1\n"642 "addl $8,%2\n"643 "addl $16,%3\n"645 "decl %4\n"646 "jnz 0b\n"647 "1:\n"649 /* final run */650 /* set the current, current_pre, current_next registers */651 "movq (%1),%%mm1\n"652 "movq (%1),%%mm7\n"653 "movq -8(%1), %%mm0\n"654 "psrlq $32,%%mm1\n"655 "psrlq $32,%%mm0\n"656 "psllq $32,%%mm1\n"657 "movq %%mm7,%%mm2\n"658 "movq %%mm7,%%mm3\n"659 "psllq $32,%%mm2\n"660 "psrlq $32,%%mm3\n"661 "por %%mm2,%%mm0\n"662 "por %%mm3,%%mm1\n"664 /* current_upper */665 "movq (%0),%%mm6\n"667 /* compute the upper-left pixel for dst on %%mm2 */668 /* compute the upper-right pixel for dst on %%mm4 */669 "movq %%mm0,%%mm2\n"670 "movq %%mm1,%%mm4\n"671 "movq %%mm0,%%mm3\n"672 "movq %%mm1,%%mm5\n"673 "pcmpeqd %%mm6,%%mm2\n"674 "pcmpeqd %%mm6,%%mm4\n"675 "pcmpeqd (%2),%%mm3\n"676 "pcmpeqd (%2),%%mm5\n"677 "pandn %%mm2,%%mm3\n"678 "pandn %%mm4,%%mm5\n"679 "movq %%mm0,%%mm2\n"680 "movq %%mm1,%%mm4\n"681 "pcmpeqd %%mm1,%%mm2\n"682 "pcmpeqd %%mm0,%%mm4\n"683 "pandn %%mm3,%%mm2\n"684 "pandn %%mm5,%%mm4\n"685 "movq %%mm2,%%mm3\n"686 "movq %%mm4,%%mm5\n"687 "pand %%mm6,%%mm2\n"688 "pand %%mm6,%%mm4\n"689 "pandn %%mm7,%%mm3\n"690 "pandn %%mm7,%%mm5\n"691 "por %%mm3,%%mm2\n"692 "por %%mm5,%%mm4\n"694 /* set *dst */695 "movq %%mm2,%%mm3\n"696 "punpckldq %%mm4,%%mm2\n"697 "punpckhdq %%mm4,%%mm3\n"698 "movq %%mm2,(%3)\n"699 "movq %%mm3,8(%3)\n"700 "emms\n"702 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)703 :704 : "cc"705 );706 #else707 __asm {708 mov eax, src0;709 mov ebx, src1;710 mov ecx, src2;711 mov edx, dst;712 mov esi, count;714 /* first run */715 /* set the current, current_pre, current_next registers */716 movq mm0, qword ptr [ebx];717 movq mm7, qword ptr [ebx];718 movq mm1, qword ptr [ebx + 8];719 psllq mm0, 32;720 psllq mm1, 32;721 psrlq mm0, 32;722 movq mm2, mm7;723 movq mm3, mm7;724 psllq mm2, 32;725 psrlq mm3, 32;726 por mm0, mm2;727 por mm1, mm3;729 /* current_upper */730 movq mm6, qword ptr [eax];732 /* compute the upper-left pixel for dst on %%mm2 */733 /* compute the upper-right pixel for dst on %%mm4 */734 movq mm2, mm0;735 movq mm4, mm1;736 movq mm3, mm0;737 movq mm5, mm1;738 pcmpeqd mm2, mm6;739 pcmpeqd mm4, mm6;740 pcmpeqd mm3, qword ptr [ecx];741 pcmpeqd mm5, qword ptr [ecx];742 pandn mm3, mm2;743 pandn mm5, mm4;744 movq mm2, mm0;745 movq mm4, mm1;746 pcmpeqd mm2, mm1;747 pcmpeqd mm4, mm0;748 pandn mm2, mm3;749 pandn mm4, mm5;750 movq mm3, mm2;751 movq mm5, mm4;752 pand mm2, mm6;753 pand mm4, mm6;754 pandn mm3, mm7;755 pandn mm5, mm7;756 por mm2, mm3;757 por mm4, mm5;759 /* set *dst */760 movq mm3, mm2;761 punpckldq mm2, mm4;762 punpckhdq mm3, mm4;763 movq qword ptr [edx], mm2;764 movq qword ptr [edx + 8], mm3;766 /* next */767 add eax, 8;768 add ebx, 8;769 add ecx, 8;770 add edx, 16;772 /* central runs */773 shr esi, 1;774 jz label1;775 label0:777 /* set the current, current_pre, current_next registers */778 movq mm0, qword ptr [ebx - 8];779 movq mm7, qword ptr [ebx];780 movq mm1, qword ptr [ebx + 8];781 psrlq mm0, 32;782 psllq mm1, 32;783 movq mm2, mm7;784 movq mm3, mm7;785 psllq mm2, 32;786 psrlq mm3, 32;787 por mm0, mm2;788 por mm1, mm3;790 /* current_upper */791 movq mm6, qword ptr[eax];793 /* compute the upper-left pixel for dst on %%mm2 */794 /* compute the upper-right pixel for dst on %%mm4 */795 movq mm2, mm0;796 movq mm4, mm1;797 movq mm3, mm0;798 movq mm5, mm1;799 pcmpeqd mm2, mm6;800 pcmpeqd mm4, mm6;801 pcmpeqd mm3, qword ptr[ecx];802 pcmpeqd mm5, qword ptr[ecx];803 pandn mm3, mm2;804 pandn mm5, mm4;805 movq mm2, mm0;806 movq mm4, mm1;807 pcmpeqd mm2, mm1;808 pcmpeqd mm4, mm0;809 pandn mm2, mm3;810 pandn mm4, mm5;811 movq mm3, mm2;812 movq mm5, mm4;813 pand mm2, mm6;814 pand mm4, mm6;815 pandn mm3, mm7;816 pandn mm5, mm7;817 por mm2, mm3;818 por mm4, mm5;820 /* set *dst */821 movq mm3, mm2;822 punpckldq mm2, mm4;823 punpckhdq mm3, mm4;824 movq qword ptr [edx], mm2;825 movq qword ptr [edx + 8], mm3;827 /* next */828 add eax, 8;829 add ebx, 8;830 add ecx, 8;831 add edx, 16;833 dec esi;834 jnz label0;835 label1:837 /* final run */838 /* set the current, current_pre, current_next registers */839 movq mm1, qword ptr [ebx];840 movq mm7, qword ptr [ebx];841 movq mm0, qword ptr [ebx - 8];842 psrlq mm1, 32;843 psrlq mm0, 32;844 psllq mm1, 32;845 movq mm2, mm7;846 movq mm3, mm7;847 psllq mm2, 32;848 psrlq mm3, 32;849 por mm0, mm2;850 por mm1, mm3;852 /* current_upper */853 movq mm6, qword ptr [eax];855 /* compute the upper-left pixel for dst on %%mm2 */856 /* compute the upper-right pixel for dst on %%mm4 */857 movq mm2, mm0;858 movq mm4, mm1;859 movq mm3, mm0;860 movq mm5, mm1;861 pcmpeqd mm2, mm6;862 pcmpeqd mm4, mm6;863 pcmpeqd mm3, qword ptr [ecx];864 pcmpeqd mm5, qword ptr [ecx];865 pandn mm3, mm2;866 pandn mm5, mm4;867 movq mm2, mm0;868 movq mm4, mm1;869 pcmpeqd mm2, mm1;870 pcmpeqd mm4, mm0;871 pandn mm2, mm3;872 pandn mm4, mm5;873 movq mm3, mm2;874 movq mm5, mm4;875 pand mm2, mm6;876 pand mm4, mm6;877 pandn mm3, mm7;878 pandn mm5, mm7;879 por mm2, mm3;880 por mm4, mm5;882 /* set *dst */883 movq mm3, mm2;884 punpckldq mm2, mm4;885 punpckhdq mm3, mm4;886 movq qword ptr [edx], mm2;887 movq qword ptr [edx + 8], mm3;889 mov src0, eax;890 mov src1, ebx;891 mov src2, ecx;892 mov dst, edx;893 mov count, esi;895 emms;896 }897 #endif898 }900 static void internal_scale2x_16_mmx(u16 *dst0, u16 *dst1, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)901 {902 // assert( count >= 2*4 );903 internal_scale2x_16_mmx_single(dst0, src0, src1, src2, count);904 internal_scale2x_16_mmx_single(dst1, src2, src1, src0, count);905 }907 static void internal_scale2x_32_mmx(u32 *dst0, u32 *dst1, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count)908 {909 // assert( count >= 2*2 );910 internal_scale2x_32_mmx_single(dst0, src0, src1, src2, count);911 internal_scale2x_32_mmx_single(dst1, src2, src1, src0, count);912 }914 #endif916 void AdMame2x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,917 u8 *dstPtr, u32 dstPitch, int width, int height)918 {919 u16 *dst0 = (u16 *)dstPtr;920 u16 *dst1 = dst0 + (dstPitch >> 1);922 u16 *src0 = (u16 *)srcPtr;923 u16 *src1 = src0 + (srcPitch >> 1);924 u16 *src2 = src1 + (srcPitch >> 1);925 #ifdef MMX926 if (cpu_mmx)927 {928 internal_scale2x_16_mmx(dst0, dst1, src0, src0, src1, width);930 int count = height;932 count -= 2;933 while (count)934 {935 dst0 += dstPitch;936 dst1 += dstPitch;937 internal_scale2x_16_mmx(dst0, dst1, src0, src1, src2, width);938 src0 = src1;939 src1 = src2;940 src2 += srcPitch >> 1;941 --count;942 }943 dst0 += dstPitch;944 dst1 += dstPitch;945 internal_scale2x_16_mmx(dst0, dst1, src0, src1, src1, width);946 }947 else948 {949 #endif950 internal_scale2x_16_def(dst0, src0, src0, src1, width);951 internal_scale2x_16_def(dst1, src1, src0, src0, width);953 int count = height;955 count -= 2;956 while (count)957 {958 dst0 += dstPitch;959 dst1 += dstPitch;960 internal_scale2x_16_def(dst0, src0, src1, src2, width);961 internal_scale2x_16_def(dst1, src2, src1, src0, width);962 src0 = src1;963 src1 = src2;964 src2 += srcPitch >> 1;965 --count;966 }967 dst0 += dstPitch;968 dst1 += dstPitch;969 internal_scale2x_16_def(dst0, src0, src1, src1, width);970 internal_scale2x_16_def(dst1, src1, src1, src0, width);971 #ifdef MMX972 }974 #endif975 }977 void AdMame2x32(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,978 u8 *dstPtr, u32 dstPitch, int width, int height)979 {980 u32 *dst0 = (u32 *)dstPtr;981 u32 *dst1 = dst0 + (dstPitch >> 2);983 u32 *src0 = (u32 *)srcPtr;984 u32 *src1 = src0 + (srcPitch >> 2);985 u32 *src2 = src1 + (srcPitch >> 2);986 #ifdef MMX987 if (cpu_mmx)988 {989 internal_scale2x_32_mmx(dst0, dst1, src0, src0, src1, width);991 int count = height;993 count -= 2;994 while (count)995 {996 dst0 += dstPitch >> 1;997 dst1 += dstPitch >> 1;998 internal_scale2x_32_mmx(dst0, dst1, src0, src1, src2, width);999 src0 = src1;1000 src1 = src2;1001 src2 += srcPitch >> 2;1002 --count;1003 }1004 dst0 += dstPitch >> 1;1005 dst1 += dstPitch >> 1;1006 internal_scale2x_32_mmx(dst0, dst1, src0, src1, src1, width);1007 }1008 else1009 {1010 #endif1011 internal_scale2x_32_def(dst0, src0, src0, src1, width);1012 internal_scale2x_32_def(dst1, src1, src0, src0, width);1014 int count = height;1016 count -= 2;1017 while (count)1018 {1019 dst0 += dstPitch >> 1;1020 dst1 += dstPitch >> 1;1021 internal_scale2x_32_def(dst0, src0, src1, src2, width);1022 internal_scale2x_32_def(dst1, src2, src1, src0, width);1023 src0 = src1;1024 src1 = src2;1025 src2 += srcPitch >> 2;1026 --count;1027 }1028 dst0 += dstPitch >> 1;1029 dst1 += dstPitch >> 1;1030 internal_scale2x_32_def(dst0, src0, src1, src1, width);1031 internal_scale2x_32_def(dst1, src1, src1, src0, width);1032 #ifdef MMX1033 }1035 #endif1036 }