rlm@1: /* rlm@1: * This file is part of the Advance project. rlm@1: * rlm@1: * Copyright (C) 1999-2002 Andrea Mazzoleni rlm@1: * rlm@1: * This program is free software; you can redistribute it and/or modify rlm@1: * it under the terms of the GNU General Public License as published by rlm@1: * the Free Software Foundation; either version 2 of the License, or rlm@1: * (at your option) any later version. rlm@1: * rlm@1: * This program is distributed in the hope that it will be useful, rlm@1: * but WITHOUT ANY WARRANTY; without even the implied warranty of rlm@1: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the rlm@1: * GNU General Public License for more details. rlm@1: * rlm@1: * You should have received a copy of the GNU General Public License rlm@1: * along with this program; if not, write to the Free Software rlm@1: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. rlm@1: * rlm@1: * In addition, as a special exception, Andrea Mazzoleni rlm@1: * gives permission to link the code of this program with rlm@1: * the MAME library (or with modified versions of MAME that use the rlm@1: * same license as MAME), and distribute linked combinations including rlm@1: * the two. You must obey the GNU General Public License in all rlm@1: * respects for all of the code used other than MAME. If you modify rlm@1: * this file, you may extend this exception to your version of the rlm@1: * file, but you are not obligated to do so. If you do not wish to rlm@1: * do so, delete this exception statement from your version. rlm@1: */ rlm@1: rlm@1: /* rlm@1: * Alternatively at the previous license terms, you are allowed to use this rlm@1: * code in your program with these conditions: rlm@1: * - the program is not used in commercial activities. rlm@1: * - the whole source code of the program is released with the binary. rlm@1: */ rlm@1: rlm@1: #include "../Port.h" rlm@1: rlm@1: #ifdef MMX rlm@1: extern "C" bool cpu_mmx; rlm@1: #endif rlm@1: rlm@1: static void internal_scale2x_16_def(u16 *dst, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count) rlm@1: { rlm@1: /* first pixel */ rlm@1: dst[0] = src1[0]; rlm@1: if (src1[1] == src0[0] && src2[0] != src0[0]) rlm@1: dst[1] = src0[0]; rlm@1: else rlm@1: dst[1] = src1[0]; rlm@1: ++src0; rlm@1: ++src1; rlm@1: ++src2; rlm@1: dst += 2; rlm@1: rlm@1: /* central pixels */ rlm@1: count -= 2; rlm@1: while (count) rlm@1: { rlm@1: if (src0[0] != src2[0] && src1[-1] != src1[1]) rlm@1: { rlm@1: dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0]; rlm@1: dst[1] = src1[1] == src0[0] ? src0[0] : src1[0]; rlm@1: } rlm@1: else rlm@1: { rlm@1: dst[0] = src1[0]; rlm@1: dst[1] = src1[0]; rlm@1: } rlm@1: rlm@1: ++src0; rlm@1: ++src1; rlm@1: ++src2; rlm@1: dst += 2; rlm@1: --count; rlm@1: } rlm@1: rlm@1: /* last pixel */ rlm@1: if (src1[-1] == src0[0] && src2[0] != src0[0]) rlm@1: dst[0] = src0[0]; rlm@1: else rlm@1: dst[0] = src1[0]; rlm@1: dst[1] = src1[0]; rlm@1: } rlm@1: rlm@1: static void internal_scale2x_32_def(u32 *dst, rlm@1: const u32 *src0, rlm@1: const u32 *src1, rlm@1: const u32 *src2, rlm@1: unsigned count) rlm@1: { rlm@1: /* first pixel */ rlm@1: dst[0] = src1[0]; rlm@1: if (src1[1] == src0[0] && src2[0] != src0[0]) rlm@1: dst[1] = src0[0]; rlm@1: else rlm@1: dst[1] = src1[0]; rlm@1: ++src0; rlm@1: ++src1; rlm@1: ++src2; rlm@1: dst += 2; rlm@1: rlm@1: /* central pixels */ rlm@1: count -= 2; rlm@1: while (count) rlm@1: { rlm@1: if (src0[0] != src2[0] && src1[-1] != src1[1]) rlm@1: { rlm@1: dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0]; rlm@1: dst[1] = src1[1] == src0[0] ? src0[0] : src1[0]; rlm@1: } rlm@1: else rlm@1: { rlm@1: dst[0] = src1[0]; rlm@1: dst[1] = src1[0]; rlm@1: } rlm@1: rlm@1: ++src0; rlm@1: ++src1; rlm@1: ++src2; rlm@1: dst += 2; rlm@1: --count; rlm@1: } rlm@1: rlm@1: /* last pixel */ rlm@1: if (src1[-1] == src0[0] && src2[0] != src0[0]) rlm@1: dst[0] = src0[0]; rlm@1: else rlm@1: dst[0] = src1[0]; rlm@1: dst[1] = src1[0]; rlm@1: } rlm@1: rlm@1: #ifdef MMX rlm@1: static void internal_scale2x_16_mmx_single(u16 *dst, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count) rlm@1: { rlm@1: /* always do the first and last run */ rlm@1: count -= 2 * 4; rlm@1: rlm@1: #ifdef __GNUC__ rlm@1: __asm__ __volatile__ ( rlm@1: /* first run */ rlm@1: /* set the current, current_pre, current_next registers */ rlm@1: "movq 0(%1), %%mm0\n" rlm@1: "movq 0(%1),%%mm7\n" rlm@1: "movq 8(%1),%%mm1\n" rlm@1: "psllq $48,%%mm0\n" rlm@1: "psllq $48,%%mm1\n" rlm@1: "psrlq $48, %%mm0\n" rlm@1: "movq %%mm7,%%mm2\n" rlm@1: "movq %%mm7,%%mm3\n" rlm@1: "psllq $16,%%mm2\n" rlm@1: "psrlq $16,%%mm3\n" rlm@1: "por %%mm2,%%mm0\n" rlm@1: "por %%mm3,%%mm1\n" rlm@1: rlm@1: /* current_upper */ rlm@1: "movq (%0),%%mm6\n" rlm@1: rlm@1: /* compute the upper-left pixel for dst on %%mm2 */ rlm@1: /* compute the upper-right pixel for dst on %%mm4 */ rlm@1: "movq %%mm0,%%mm2\n" rlm@1: "movq %%mm1,%%mm4\n" rlm@1: "movq %%mm0,%%mm3\n" rlm@1: "movq %%mm1,%%mm5\n" rlm@1: "pcmpeqw %%mm6,%%mm2\n" rlm@1: "pcmpeqw %%mm6,%%mm4\n" rlm@1: "pcmpeqw (%2),%%mm3\n" rlm@1: "pcmpeqw (%2),%%mm5\n" rlm@1: "pandn %%mm2,%%mm3\n" rlm@1: "pandn %%mm4,%%mm5\n" rlm@1: "movq %%mm0,%%mm2\n" rlm@1: "movq %%mm1,%%mm4\n" rlm@1: "pcmpeqw %%mm1,%%mm2\n" rlm@1: "pcmpeqw %%mm0,%%mm4\n" rlm@1: "pandn %%mm3,%%mm2\n" rlm@1: "pandn %%mm5,%%mm4\n" rlm@1: "movq %%mm2,%%mm3\n" rlm@1: "movq %%mm4,%%mm5\n" rlm@1: "pand %%mm6,%%mm2\n" rlm@1: "pand %%mm6,%%mm4\n" rlm@1: "pandn %%mm7,%%mm3\n" rlm@1: "pandn %%mm7,%%mm5\n" rlm@1: "por %%mm3,%%mm2\n" rlm@1: "por %%mm5,%%mm4\n" rlm@1: rlm@1: /* set *dst */ rlm@1: "movq %%mm2,%%mm3\n" rlm@1: "punpcklwd %%mm4,%%mm2\n" rlm@1: "punpckhwd %%mm4,%%mm3\n" rlm@1: "movq %%mm2,(%3)\n" rlm@1: "movq %%mm3,8(%3)\n" rlm@1: rlm@1: /* next */ rlm@1: "addl $8,%0\n" rlm@1: "addl $8,%1\n" rlm@1: "addl $8,%2\n" rlm@1: "addl $16,%3\n" rlm@1: rlm@1: /* central runs */ rlm@1: "shrl $2,%4\n" rlm@1: "jz 1f\n" rlm@1: rlm@1: "0:\n" rlm@1: rlm@1: /* set the current, current_pre, current_next registers */ rlm@1: "movq -8(%1),%%mm0\n" rlm@1: "movq (%1),%%mm7\n" rlm@1: "movq 8(%1),%%mm1\n" rlm@1: "psrlq $48,%%mm0\n" rlm@1: "psllq $48,%%mm1\n" rlm@1: "movq %%mm7,%%mm2\n" rlm@1: "movq %%mm7,%%mm3\n" rlm@1: "psllq $16,%%mm2\n" rlm@1: "psrlq $16,%%mm3\n" rlm@1: "por %%mm2,%%mm0\n" rlm@1: "por %%mm3,%%mm1\n" rlm@1: rlm@1: /* current_upper */ rlm@1: "movq (%0),%%mm6\n" rlm@1: rlm@1: /* compute the upper-left pixel for dst on %%mm2 */ rlm@1: /* compute the upper-right pixel for dst on %%mm4 */ rlm@1: "movq %%mm0,%%mm2\n" rlm@1: "movq %%mm1,%%mm4\n" rlm@1: "movq %%mm0,%%mm3\n" rlm@1: "movq %%mm1,%%mm5\n" rlm@1: "pcmpeqw %%mm6,%%mm2\n" rlm@1: "pcmpeqw %%mm6,%%mm4\n" rlm@1: "pcmpeqw (%2),%%mm3\n" rlm@1: "pcmpeqw (%2),%%mm5\n" rlm@1: "pandn %%mm2,%%mm3\n" rlm@1: "pandn %%mm4,%%mm5\n" rlm@1: "movq %%mm0,%%mm2\n" rlm@1: "movq %%mm1,%%mm4\n" rlm@1: "pcmpeqw %%mm1,%%mm2\n" rlm@1: "pcmpeqw %%mm0,%%mm4\n" rlm@1: "pandn %%mm3,%%mm2\n" rlm@1: "pandn %%mm5,%%mm4\n" rlm@1: "movq %%mm2,%%mm3\n" rlm@1: "movq %%mm4,%%mm5\n" rlm@1: "pand %%mm6,%%mm2\n" rlm@1: "pand %%mm6,%%mm4\n" rlm@1: "pandn %%mm7,%%mm3\n" rlm@1: "pandn %%mm7,%%mm5\n" rlm@1: "por %%mm3,%%mm2\n" rlm@1: "por %%mm5,%%mm4\n" rlm@1: rlm@1: /* set *dst */ rlm@1: "movq %%mm2,%%mm3\n" rlm@1: "punpcklwd %%mm4,%%mm2\n" rlm@1: "punpckhwd %%mm4,%%mm3\n" rlm@1: "movq %%mm2,(%3)\n" rlm@1: "movq %%mm3,8(%3)\n" rlm@1: rlm@1: /* next */ rlm@1: "addl $8,%0\n" rlm@1: "addl $8,%1\n" rlm@1: "addl $8,%2\n" rlm@1: "addl $16,%3\n" rlm@1: rlm@1: "decl %4\n" rlm@1: "jnz 0b\n" rlm@1: "1:\n" rlm@1: rlm@1: /* final run */ rlm@1: /* set the current, current_pre, current_next registers */ rlm@1: "movq (%1),%%mm1\n" rlm@1: "movq (%1),%%mm7\n" rlm@1: "movq -8(%1),%%mm0\n" rlm@1: "psrlq $48,%%mm1\n" rlm@1: "psrlq $48,%%mm0\n" rlm@1: "psllq $48,%%mm1\n" rlm@1: "movq %%mm7,%%mm2\n" rlm@1: "movq %%mm7,%%mm3\n" rlm@1: "psllq $16,%%mm2\n" rlm@1: "psrlq $16,%%mm3\n" rlm@1: "por %%mm2,%%mm0\n" rlm@1: "por %%mm3,%%mm1\n" rlm@1: rlm@1: /* current_upper */ rlm@1: "movq (%0),%%mm6\n" rlm@1: rlm@1: /* compute the upper-left pixel for dst on %%mm2 */ rlm@1: /* compute the upper-right pixel for dst on %%mm4 */ rlm@1: "movq %%mm0,%%mm2\n" rlm@1: "movq %%mm1,%%mm4\n" rlm@1: "movq %%mm0,%%mm3\n" rlm@1: "movq %%mm1,%%mm5\n" rlm@1: "pcmpeqw %%mm6,%%mm2\n" rlm@1: "pcmpeqw %%mm6,%%mm4\n" rlm@1: "pcmpeqw (%2),%%mm3\n" rlm@1: "pcmpeqw (%2),%%mm5\n" rlm@1: "pandn %%mm2,%%mm3\n" rlm@1: "pandn %%mm4,%%mm5\n" rlm@1: "movq %%mm0,%%mm2\n" rlm@1: "movq %%mm1,%%mm4\n" rlm@1: "pcmpeqw %%mm1,%%mm2\n" rlm@1: "pcmpeqw %%mm0,%%mm4\n" rlm@1: "pandn %%mm3,%%mm2\n" rlm@1: "pandn %%mm5,%%mm4\n" rlm@1: "movq %%mm2,%%mm3\n" rlm@1: "movq %%mm4,%%mm5\n" rlm@1: "pand %%mm6,%%mm2\n" rlm@1: "pand %%mm6,%%mm4\n" rlm@1: "pandn %%mm7,%%mm3\n" rlm@1: "pandn %%mm7,%%mm5\n" rlm@1: "por %%mm3,%%mm2\n" rlm@1: "por %%mm5,%%mm4\n" rlm@1: rlm@1: /* set *dst */ rlm@1: "movq %%mm2,%%mm3\n" rlm@1: "punpcklwd %%mm4,%%mm2\n" rlm@1: "punpckhwd %%mm4,%%mm3\n" rlm@1: "movq %%mm2,(%3)\n" rlm@1: "movq %%mm3,8(%3)\n" rlm@1: "emms\n" rlm@1: rlm@1: : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count) rlm@1: : rlm@1: : "cc" rlm@1: ); rlm@1: #else rlm@1: __asm { rlm@1: mov eax, src0; rlm@1: mov ebx, src1; rlm@1: mov ecx, src2; rlm@1: mov edx, dst; rlm@1: mov esi, count; rlm@1: rlm@1: /* first run */ rlm@1: /* set the current, current_pre, current_next registers */ rlm@1: movq mm0, qword ptr [ebx]; rlm@1: movq mm7, qword ptr [ebx]; rlm@1: movq mm1, qword ptr [ebx + 8]; rlm@1: psllq mm0, 48; rlm@1: psllq mm1, 48; rlm@1: psrlq mm0, 48; rlm@1: movq mm2, mm7; rlm@1: movq mm3, mm7; rlm@1: psllq mm2, 16; rlm@1: psrlq mm3, 16; rlm@1: por mm0, mm2; rlm@1: por mm1, mm3; rlm@1: rlm@1: /* current_upper */ rlm@1: movq mm6, qword ptr [eax]; rlm@1: rlm@1: /* compute the upper-left pixel for dst on %%mm2 */ rlm@1: /* compute the upper-right pixel for dst on %%mm4 */ rlm@1: movq mm2, mm0; rlm@1: movq mm4, mm1; rlm@1: movq mm3, mm0; rlm@1: movq mm5, mm1; rlm@1: pcmpeqw mm2, mm6; rlm@1: pcmpeqw mm4, mm6; rlm@1: pcmpeqw mm3, qword ptr [ecx]; rlm@1: pcmpeqw mm5, qword ptr [ecx]; rlm@1: pandn mm3, mm2; rlm@1: pandn mm5, mm4; rlm@1: movq mm2, mm0; rlm@1: movq mm4, mm1; rlm@1: pcmpeqw mm2, mm1; rlm@1: pcmpeqw mm4, mm0; rlm@1: pandn mm2, mm3; rlm@1: pandn mm4, mm5; rlm@1: movq mm3, mm2; rlm@1: movq mm5, mm4; rlm@1: pand mm2, mm6; rlm@1: pand mm4, mm6; rlm@1: pandn mm3, mm7; rlm@1: pandn mm5, mm7; rlm@1: por mm2, mm3; rlm@1: por mm4, mm5; rlm@1: rlm@1: /* set *dst0 */ rlm@1: movq mm3, mm2; rlm@1: punpcklwd mm2, mm4; rlm@1: punpckhwd mm3, mm4; rlm@1: movq qword ptr [edx], mm2; rlm@1: movq qword ptr [edx + 8], mm3; rlm@1: rlm@1: /* next */ rlm@1: add eax, 8; rlm@1: add ebx, 8; rlm@1: add ecx, 8; rlm@1: add edx, 16; rlm@1: rlm@1: /* central runs */ rlm@1: shr esi, 2; rlm@1: jz label1; rlm@1: align 4; rlm@1: label0: rlm@1: rlm@1: /* set the current, current_pre, current_next registers */ rlm@1: movq mm0, qword ptr [ebx - 8]; rlm@1: movq mm7, qword ptr [ebx]; rlm@1: movq mm1, qword ptr [ebx + 8]; rlm@1: psrlq mm0, 48; rlm@1: psllq mm1, 48; rlm@1: movq mm2, mm7; rlm@1: movq mm3, mm7; rlm@1: psllq mm2, 16; rlm@1: psrlq mm3, 16; rlm@1: por mm0, mm2; rlm@1: por mm1, mm3; rlm@1: rlm@1: /* current_upper */ rlm@1: movq mm6, qword ptr [eax]; rlm@1: rlm@1: /* compute the upper-left pixel for dst on %%mm2 */ rlm@1: /* compute the upper-right pixel for dst on %%mm4 */ rlm@1: movq mm2, mm0; rlm@1: movq mm4, mm1; rlm@1: movq mm3, mm0; rlm@1: movq mm5, mm1; rlm@1: pcmpeqw mm2, mm6; rlm@1: pcmpeqw mm4, mm6; rlm@1: pcmpeqw mm3, qword ptr [ecx]; rlm@1: pcmpeqw mm5, qword ptr [ecx]; rlm@1: pandn mm3, mm2; rlm@1: pandn mm5, mm4; rlm@1: movq mm2, mm0; rlm@1: movq mm4, mm1; rlm@1: pcmpeqw mm2, mm1; rlm@1: pcmpeqw mm4, mm0; rlm@1: pandn mm2, mm3; rlm@1: pandn mm4, mm5; rlm@1: movq mm3, mm2; rlm@1: movq mm5, mm4; rlm@1: pand mm2, mm6; rlm@1: pand mm4, mm6; rlm@1: pandn mm3, mm7; rlm@1: pandn mm5, mm7; rlm@1: por mm2, mm3; rlm@1: por mm4, mm5; rlm@1: rlm@1: /* set *dst */ rlm@1: movq mm3, mm2; rlm@1: punpcklwd mm2, mm4; rlm@1: punpckhwd mm3, mm4; rlm@1: movq qword ptr [edx], mm2; rlm@1: movq qword ptr [edx + 8], mm3; rlm@1: rlm@1: /* next */ rlm@1: add eax, 8; rlm@1: add ebx, 8; rlm@1: add ecx, 8; rlm@1: add edx, 16; rlm@1: rlm@1: dec esi; rlm@1: jnz label0; rlm@1: label1: rlm@1: rlm@1: /* final run */ rlm@1: /* set the current, current_pre, current_next registers */ rlm@1: movq mm1, qword ptr [ebx]; rlm@1: movq mm7, qword ptr [ebx]; rlm@1: movq mm0, qword ptr [ebx - 8]; rlm@1: psrlq mm1, 48; rlm@1: psrlq mm0, 48; rlm@1: psllq mm1, 48; rlm@1: movq mm2, mm7; rlm@1: movq mm3, mm7; rlm@1: psllq mm2, 16; rlm@1: psrlq mm3, 16; rlm@1: por mm0, mm2; rlm@1: por mm1, mm3; rlm@1: rlm@1: /* current_upper */ rlm@1: movq mm6, qword ptr [eax]; rlm@1: rlm@1: /* compute the upper-left pixel for dst on %%mm2 */ rlm@1: /* compute the upper-right pixel for dst on %%mm4 */ rlm@1: movq mm2, mm0; rlm@1: movq mm4, mm1; rlm@1: movq mm3, mm0; rlm@1: movq mm5, mm1; rlm@1: pcmpeqw mm2, mm6; rlm@1: pcmpeqw mm4, mm6; rlm@1: pcmpeqw mm3, qword ptr [ecx]; rlm@1: pcmpeqw mm5, qword ptr [ecx]; rlm@1: pandn mm3, mm2; rlm@1: pandn mm5, mm4; rlm@1: movq mm2, mm0; rlm@1: movq mm4, mm1; rlm@1: pcmpeqw mm2, mm1; rlm@1: pcmpeqw mm4, mm0; rlm@1: pandn mm2, mm3; rlm@1: pandn mm4, mm5; rlm@1: movq mm3, mm2; rlm@1: movq mm5, mm4; rlm@1: pand mm2, mm6; rlm@1: pand mm4, mm6; rlm@1: pandn mm3, mm7; rlm@1: pandn mm5, mm7; rlm@1: por mm2, mm3; rlm@1: por mm4, mm5; rlm@1: rlm@1: /* set *dst */ rlm@1: movq mm3, mm2; rlm@1: punpcklwd mm2, mm4; rlm@1: punpckhwd mm3, mm4; rlm@1: movq qword ptr [edx], mm2; rlm@1: movq qword ptr [edx + 8], mm3; rlm@1: rlm@1: mov src0, eax; rlm@1: mov src1, ebx; rlm@1: mov src2, ecx; rlm@1: mov dst, edx; rlm@1: mov count, esi; rlm@1: rlm@1: emms; rlm@1: } rlm@1: #endif rlm@1: } rlm@1: rlm@1: static void internal_scale2x_32_mmx_single(u32 *dst, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count) rlm@1: { rlm@1: /* always do the first and last run */ rlm@1: count -= 2 * 2; rlm@1: rlm@1: #ifdef __GNUC__ rlm@1: __asm__ __volatile__ ( rlm@1: /* first run */ rlm@1: /* set the current, current_pre, current_next registers */ rlm@1: "movq 0(%1),%%mm0\n" rlm@1: "movq 0(%1),%%mm7\n" rlm@1: "movq 8(%1),%%mm1\n" rlm@1: "psllq $32,%%mm0\n" rlm@1: "psllq $32,%%mm1\n" rlm@1: "psrlq $32,%%mm0\n" rlm@1: "movq %%mm7,%%mm2\n" rlm@1: "movq %%mm7,%%mm3\n" rlm@1: "psllq $32,%%mm2\n" rlm@1: "psrlq $32,%%mm3\n" rlm@1: "por %%mm2,%%mm0\n" rlm@1: "por %%mm3,%%mm1\n" rlm@1: rlm@1: /* current_upper */ rlm@1: "movq (%0),%%mm6\n" rlm@1: rlm@1: /* compute the upper-left pixel for dst on %%mm2 */ rlm@1: /* compute the upper-right pixel for dst on %%mm4 */ rlm@1: "movq %%mm0,%%mm2\n" rlm@1: "movq %%mm1,%%mm4\n" rlm@1: "movq %%mm0,%%mm3\n" rlm@1: "movq %%mm1,%%mm5\n" rlm@1: "pcmpeqd %%mm6,%%mm2\n" rlm@1: "pcmpeqd %%mm6,%%mm4\n" rlm@1: "pcmpeqd (%2),%%mm3\n" rlm@1: "pcmpeqd (%2),%%mm5\n" rlm@1: "pandn %%mm2,%%mm3\n" rlm@1: "pandn %%mm4,%%mm5\n" rlm@1: "movq %%mm0,%%mm2\n" rlm@1: "movq %%mm1,%%mm4\n" rlm@1: "pcmpeqd %%mm1,%%mm2\n" rlm@1: "pcmpeqd %%mm0,%%mm4\n" rlm@1: "pandn %%mm3,%%mm2\n" rlm@1: "pandn %%mm5,%%mm4\n" rlm@1: "movq %%mm2,%%mm3\n" rlm@1: "movq %%mm4,%%mm5\n" rlm@1: "pand %%mm6,%%mm2\n" rlm@1: "pand %%mm6,%%mm4\n" rlm@1: "pandn %%mm7,%%mm3\n" rlm@1: "pandn %%mm7,%%mm5\n" rlm@1: "por %%mm3,%%mm2\n" rlm@1: "por %%mm5,%%mm4\n" rlm@1: rlm@1: /* set *dst */ rlm@1: "movq %%mm2,%%mm3\n" rlm@1: "punpckldq %%mm4,%%mm2\n" rlm@1: "punpckhdq %%mm4,%%mm3\n" rlm@1: "movq %%mm2,(%3)\n" rlm@1: "movq %%mm3, 8(%3)\n" rlm@1: rlm@1: /* next */ rlm@1: "addl $8,%0\n" rlm@1: "addl $8,%1\n" rlm@1: "addl $8,%2\n" rlm@1: "addl $16,%3\n" rlm@1: rlm@1: /* central runs */ rlm@1: "shrl $1,%4\n" rlm@1: "jz 1f\n" rlm@1: rlm@1: "0:\n" rlm@1: rlm@1: /* set the current, current_pre, current_next registers */ rlm@1: "movq -8(%1),%%mm0\n" rlm@1: "movq (%1),%%mm7\n" rlm@1: "movq 8(%1),%%mm1\n" rlm@1: "psrlq $32,%%mm0\n" rlm@1: "psllq $32,%%mm1\n" rlm@1: "movq %%mm7,%%mm2\n" rlm@1: "movq %%mm7,%%mm3\n" rlm@1: "psllq $32,%%mm2\n" rlm@1: "psrlq $32,%%mm3\n" rlm@1: "por %%mm2,%%mm0\n" rlm@1: "por %%mm3,%%mm1\n" rlm@1: rlm@1: /* current_upper */ rlm@1: "movq (%0),%%mm6\n" rlm@1: rlm@1: /* compute the upper-left pixel for dst on %%mm2 */ rlm@1: /* compute the upper-right pixel for dst on %%mm4 */ rlm@1: "movq %%mm0,%%mm2\n" rlm@1: "movq %%mm1,%%mm4\n" rlm@1: "movq %%mm0,%%mm3\n" rlm@1: "movq %%mm1,%%mm5\n" rlm@1: "pcmpeqd %%mm6,%%mm2\n" rlm@1: "pcmpeqd %%mm6,%%mm4\n" rlm@1: "pcmpeqd (%2),%%mm3\n" rlm@1: "pcmpeqd (%2),%%mm5\n" rlm@1: "pandn %%mm2,%%mm3\n" rlm@1: "pandn %%mm4,%%mm5\n" rlm@1: "movq %%mm0,%%mm2\n" rlm@1: "movq %%mm1,%%mm4\n" rlm@1: "pcmpeqd %%mm1,%%mm2\n" rlm@1: "pcmpeqd %%mm0,%%mm4\n" rlm@1: "pandn %%mm3,%%mm2\n" rlm@1: "pandn %%mm5,%%mm4\n" rlm@1: "movq %%mm2,%%mm3\n" rlm@1: "movq %%mm4,%%mm5\n" rlm@1: "pand %%mm6,%%mm2\n" rlm@1: "pand %%mm6,%%mm4\n" rlm@1: "pandn %%mm7,%%mm3\n" rlm@1: "pandn %%mm7,%%mm5\n" rlm@1: "por %%mm3,%%mm2\n" rlm@1: "por %%mm5,%%mm4\n" rlm@1: rlm@1: /* set *dst */ rlm@1: "movq %%mm2,%%mm3\n" rlm@1: "punpckldq %%mm4,%%mm2\n" rlm@1: "punpckhdq %%mm4,%%mm3\n" rlm@1: "movq %%mm2,(%3)\n" rlm@1: "movq %%mm3,8(%3)\n" rlm@1: rlm@1: /* next */ rlm@1: "addl $8,%0\n" rlm@1: "addl $8,%1\n" rlm@1: "addl $8,%2\n" rlm@1: "addl $16,%3\n" rlm@1: rlm@1: "decl %4\n" rlm@1: "jnz 0b\n" rlm@1: "1:\n" rlm@1: rlm@1: /* final run */ rlm@1: /* set the current, current_pre, current_next registers */ rlm@1: "movq (%1),%%mm1\n" rlm@1: "movq (%1),%%mm7\n" rlm@1: "movq -8(%1), %%mm0\n" rlm@1: "psrlq $32,%%mm1\n" rlm@1: "psrlq $32,%%mm0\n" rlm@1: "psllq $32,%%mm1\n" rlm@1: "movq %%mm7,%%mm2\n" rlm@1: "movq %%mm7,%%mm3\n" rlm@1: "psllq $32,%%mm2\n" rlm@1: "psrlq $32,%%mm3\n" rlm@1: "por %%mm2,%%mm0\n" rlm@1: "por %%mm3,%%mm1\n" rlm@1: rlm@1: /* current_upper */ rlm@1: "movq (%0),%%mm6\n" rlm@1: rlm@1: /* compute the upper-left pixel for dst on %%mm2 */ rlm@1: /* compute the upper-right pixel for dst on %%mm4 */ rlm@1: "movq %%mm0,%%mm2\n" rlm@1: "movq %%mm1,%%mm4\n" rlm@1: "movq %%mm0,%%mm3\n" rlm@1: "movq %%mm1,%%mm5\n" rlm@1: "pcmpeqd %%mm6,%%mm2\n" rlm@1: "pcmpeqd %%mm6,%%mm4\n" rlm@1: "pcmpeqd (%2),%%mm3\n" rlm@1: "pcmpeqd (%2),%%mm5\n" rlm@1: "pandn %%mm2,%%mm3\n" rlm@1: "pandn %%mm4,%%mm5\n" rlm@1: "movq %%mm0,%%mm2\n" rlm@1: "movq %%mm1,%%mm4\n" rlm@1: "pcmpeqd %%mm1,%%mm2\n" rlm@1: "pcmpeqd %%mm0,%%mm4\n" rlm@1: "pandn %%mm3,%%mm2\n" rlm@1: "pandn %%mm5,%%mm4\n" rlm@1: "movq %%mm2,%%mm3\n" rlm@1: "movq %%mm4,%%mm5\n" rlm@1: "pand %%mm6,%%mm2\n" rlm@1: "pand %%mm6,%%mm4\n" rlm@1: "pandn %%mm7,%%mm3\n" rlm@1: "pandn %%mm7,%%mm5\n" rlm@1: "por %%mm3,%%mm2\n" rlm@1: "por %%mm5,%%mm4\n" rlm@1: rlm@1: /* set *dst */ rlm@1: "movq %%mm2,%%mm3\n" rlm@1: "punpckldq %%mm4,%%mm2\n" rlm@1: "punpckhdq %%mm4,%%mm3\n" rlm@1: "movq %%mm2,(%3)\n" rlm@1: "movq %%mm3,8(%3)\n" rlm@1: "emms\n" rlm@1: rlm@1: : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count) rlm@1: : rlm@1: : "cc" rlm@1: ); rlm@1: #else rlm@1: __asm { rlm@1: mov eax, src0; rlm@1: mov ebx, src1; rlm@1: mov ecx, src2; rlm@1: mov edx, dst; rlm@1: mov esi, count; rlm@1: rlm@1: /* first run */ rlm@1: /* set the current, current_pre, current_next registers */ rlm@1: movq mm0, qword ptr [ebx]; rlm@1: movq mm7, qword ptr [ebx]; rlm@1: movq mm1, qword ptr [ebx + 8]; rlm@1: psllq mm0, 32; rlm@1: psllq mm1, 32; rlm@1: psrlq mm0, 32; rlm@1: movq mm2, mm7; rlm@1: movq mm3, mm7; rlm@1: psllq mm2, 32; rlm@1: psrlq mm3, 32; rlm@1: por mm0, mm2; rlm@1: por mm1, mm3; rlm@1: rlm@1: /* current_upper */ rlm@1: movq mm6, qword ptr [eax]; rlm@1: rlm@1: /* compute the upper-left pixel for dst on %%mm2 */ rlm@1: /* compute the upper-right pixel for dst on %%mm4 */ rlm@1: movq mm2, mm0; rlm@1: movq mm4, mm1; rlm@1: movq mm3, mm0; rlm@1: movq mm5, mm1; rlm@1: pcmpeqd mm2, mm6; rlm@1: pcmpeqd mm4, mm6; rlm@1: pcmpeqd mm3, qword ptr [ecx]; rlm@1: pcmpeqd mm5, qword ptr [ecx]; rlm@1: pandn mm3, mm2; rlm@1: pandn mm5, mm4; rlm@1: movq mm2, mm0; rlm@1: movq mm4, mm1; rlm@1: pcmpeqd mm2, mm1; rlm@1: pcmpeqd mm4, mm0; rlm@1: pandn mm2, mm3; rlm@1: pandn mm4, mm5; rlm@1: movq mm3, mm2; rlm@1: movq mm5, mm4; rlm@1: pand mm2, mm6; rlm@1: pand mm4, mm6; rlm@1: pandn mm3, mm7; rlm@1: pandn mm5, mm7; rlm@1: por mm2, mm3; rlm@1: por mm4, mm5; rlm@1: rlm@1: /* set *dst */ rlm@1: movq mm3, mm2; rlm@1: punpckldq mm2, mm4; rlm@1: punpckhdq mm3, mm4; rlm@1: movq qword ptr [edx], mm2; rlm@1: movq qword ptr [edx + 8], mm3; rlm@1: rlm@1: /* next */ rlm@1: add eax, 8; rlm@1: add ebx, 8; rlm@1: add ecx, 8; rlm@1: add edx, 16; rlm@1: rlm@1: /* central runs */ rlm@1: shr esi, 1; rlm@1: jz label1; rlm@1: label0: rlm@1: rlm@1: /* set the current, current_pre, current_next registers */ rlm@1: movq mm0, qword ptr [ebx - 8]; rlm@1: movq mm7, qword ptr [ebx]; rlm@1: movq mm1, qword ptr [ebx + 8]; rlm@1: psrlq mm0, 32; rlm@1: psllq mm1, 32; rlm@1: movq mm2, mm7; rlm@1: movq mm3, mm7; rlm@1: psllq mm2, 32; rlm@1: psrlq mm3, 32; rlm@1: por mm0, mm2; rlm@1: por mm1, mm3; rlm@1: rlm@1: /* current_upper */ rlm@1: movq mm6, qword ptr[eax]; rlm@1: rlm@1: /* compute the upper-left pixel for dst on %%mm2 */ rlm@1: /* compute the upper-right pixel for dst on %%mm4 */ rlm@1: movq mm2, mm0; rlm@1: movq mm4, mm1; rlm@1: movq mm3, mm0; rlm@1: movq mm5, mm1; rlm@1: pcmpeqd mm2, mm6; rlm@1: pcmpeqd mm4, mm6; rlm@1: pcmpeqd mm3, qword ptr[ecx]; rlm@1: pcmpeqd mm5, qword ptr[ecx]; rlm@1: pandn mm3, mm2; rlm@1: pandn mm5, mm4; rlm@1: movq mm2, mm0; rlm@1: movq mm4, mm1; rlm@1: pcmpeqd mm2, mm1; rlm@1: pcmpeqd mm4, mm0; rlm@1: pandn mm2, mm3; rlm@1: pandn mm4, mm5; rlm@1: movq mm3, mm2; rlm@1: movq mm5, mm4; rlm@1: pand mm2, mm6; rlm@1: pand mm4, mm6; rlm@1: pandn mm3, mm7; rlm@1: pandn mm5, mm7; rlm@1: por mm2, mm3; rlm@1: por mm4, mm5; rlm@1: rlm@1: /* set *dst */ rlm@1: movq mm3, mm2; rlm@1: punpckldq mm2, mm4; rlm@1: punpckhdq mm3, mm4; rlm@1: movq qword ptr [edx], mm2; rlm@1: movq qword ptr [edx + 8], mm3; rlm@1: rlm@1: /* next */ rlm@1: add eax, 8; rlm@1: add ebx, 8; rlm@1: add ecx, 8; rlm@1: add edx, 16; rlm@1: rlm@1: dec esi; rlm@1: jnz label0; rlm@1: label1: rlm@1: rlm@1: /* final run */ rlm@1: /* set the current, current_pre, current_next registers */ rlm@1: movq mm1, qword ptr [ebx]; rlm@1: movq mm7, qword ptr [ebx]; rlm@1: movq mm0, qword ptr [ebx - 8]; rlm@1: psrlq mm1, 32; rlm@1: psrlq mm0, 32; rlm@1: psllq mm1, 32; rlm@1: movq mm2, mm7; rlm@1: movq mm3, mm7; rlm@1: psllq mm2, 32; rlm@1: psrlq mm3, 32; rlm@1: por mm0, mm2; rlm@1: por mm1, mm3; rlm@1: rlm@1: /* current_upper */ rlm@1: movq mm6, qword ptr [eax]; rlm@1: rlm@1: /* compute the upper-left pixel for dst on %%mm2 */ rlm@1: /* compute the upper-right pixel for dst on %%mm4 */ rlm@1: movq mm2, mm0; rlm@1: movq mm4, mm1; rlm@1: movq mm3, mm0; rlm@1: movq mm5, mm1; rlm@1: pcmpeqd mm2, mm6; rlm@1: pcmpeqd mm4, mm6; rlm@1: pcmpeqd mm3, qword ptr [ecx]; rlm@1: pcmpeqd mm5, qword ptr [ecx]; rlm@1: pandn mm3, mm2; rlm@1: pandn mm5, mm4; rlm@1: movq mm2, mm0; rlm@1: movq mm4, mm1; rlm@1: pcmpeqd mm2, mm1; rlm@1: pcmpeqd mm4, mm0; rlm@1: pandn mm2, mm3; rlm@1: pandn mm4, mm5; rlm@1: movq mm3, mm2; rlm@1: movq mm5, mm4; rlm@1: pand mm2, mm6; rlm@1: pand mm4, mm6; rlm@1: pandn mm3, mm7; rlm@1: pandn mm5, mm7; rlm@1: por mm2, mm3; rlm@1: por mm4, mm5; rlm@1: rlm@1: /* set *dst */ rlm@1: movq mm3, mm2; rlm@1: punpckldq mm2, mm4; rlm@1: punpckhdq mm3, mm4; rlm@1: movq qword ptr [edx], mm2; rlm@1: movq qword ptr [edx + 8], mm3; rlm@1: rlm@1: mov src0, eax; rlm@1: mov src1, ebx; rlm@1: mov src2, ecx; rlm@1: mov dst, edx; rlm@1: mov count, esi; rlm@1: rlm@1: emms; rlm@1: } rlm@1: #endif rlm@1: } rlm@1: rlm@1: static void internal_scale2x_16_mmx(u16 *dst0, u16 *dst1, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count) rlm@1: { rlm@1: // assert( count >= 2*4 ); rlm@1: internal_scale2x_16_mmx_single(dst0, src0, src1, src2, count); rlm@1: internal_scale2x_16_mmx_single(dst1, src2, src1, src0, count); rlm@1: } rlm@1: rlm@1: static void internal_scale2x_32_mmx(u32 *dst0, u32 *dst1, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count) rlm@1: { rlm@1: // assert( count >= 2*2 ); rlm@1: internal_scale2x_32_mmx_single(dst0, src0, src1, src2, count); rlm@1: internal_scale2x_32_mmx_single(dst1, src2, src1, src0, count); rlm@1: } rlm@1: rlm@1: #endif rlm@1: rlm@1: void AdMame2x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */, rlm@1: u8 *dstPtr, u32 dstPitch, int width, int height) rlm@1: { rlm@1: u16 *dst0 = (u16 *)dstPtr; rlm@1: u16 *dst1 = dst0 + (dstPitch >> 1); rlm@1: rlm@1: u16 *src0 = (u16 *)srcPtr; rlm@1: u16 *src1 = src0 + (srcPitch >> 1); rlm@1: u16 *src2 = src1 + (srcPitch >> 1); rlm@1: #ifdef MMX rlm@1: if (cpu_mmx) rlm@1: { rlm@1: internal_scale2x_16_mmx(dst0, dst1, src0, src0, src1, width); rlm@1: rlm@1: int count = height; rlm@1: rlm@1: count -= 2; rlm@1: while (count) rlm@1: { rlm@1: dst0 += dstPitch; rlm@1: dst1 += dstPitch; rlm@1: internal_scale2x_16_mmx(dst0, dst1, src0, src1, src2, width); rlm@1: src0 = src1; rlm@1: src1 = src2; rlm@1: src2 += srcPitch >> 1; rlm@1: --count; rlm@1: } rlm@1: dst0 += dstPitch; rlm@1: dst1 += dstPitch; rlm@1: internal_scale2x_16_mmx(dst0, dst1, src0, src1, src1, width); rlm@1: } rlm@1: else rlm@1: { rlm@1: #endif rlm@1: internal_scale2x_16_def(dst0, src0, src0, src1, width); rlm@1: internal_scale2x_16_def(dst1, src1, src0, src0, width); rlm@1: rlm@1: int count = height; rlm@1: rlm@1: count -= 2; rlm@1: while (count) rlm@1: { rlm@1: dst0 += dstPitch; rlm@1: dst1 += dstPitch; rlm@1: internal_scale2x_16_def(dst0, src0, src1, src2, width); rlm@1: internal_scale2x_16_def(dst1, src2, src1, src0, width); rlm@1: src0 = src1; rlm@1: src1 = src2; rlm@1: src2 += srcPitch >> 1; rlm@1: --count; rlm@1: } rlm@1: dst0 += dstPitch; rlm@1: dst1 += dstPitch; rlm@1: internal_scale2x_16_def(dst0, src0, src1, src1, width); rlm@1: internal_scale2x_16_def(dst1, src1, src1, src0, width); rlm@1: #ifdef MMX rlm@1: } rlm@1: rlm@1: #endif rlm@1: } rlm@1: rlm@1: void AdMame2x32(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */, rlm@1: u8 *dstPtr, u32 dstPitch, int width, int height) rlm@1: { rlm@1: u32 *dst0 = (u32 *)dstPtr; rlm@1: u32 *dst1 = dst0 + (dstPitch >> 2); rlm@1: rlm@1: u32 *src0 = (u32 *)srcPtr; rlm@1: u32 *src1 = src0 + (srcPitch >> 2); rlm@1: u32 *src2 = src1 + (srcPitch >> 2); rlm@1: #ifdef MMX rlm@1: if (cpu_mmx) rlm@1: { rlm@1: internal_scale2x_32_mmx(dst0, dst1, src0, src0, src1, width); rlm@1: rlm@1: int count = height; rlm@1: rlm@1: count -= 2; rlm@1: while (count) rlm@1: { rlm@1: dst0 += dstPitch >> 1; rlm@1: dst1 += dstPitch >> 1; rlm@1: internal_scale2x_32_mmx(dst0, dst1, src0, src1, src2, width); rlm@1: src0 = src1; rlm@1: src1 = src2; rlm@1: src2 += srcPitch >> 2; rlm@1: --count; rlm@1: } rlm@1: dst0 += dstPitch >> 1; rlm@1: dst1 += dstPitch >> 1; rlm@1: internal_scale2x_32_mmx(dst0, dst1, src0, src1, src1, width); rlm@1: } rlm@1: else rlm@1: { rlm@1: #endif rlm@1: internal_scale2x_32_def(dst0, src0, src0, src1, width); rlm@1: internal_scale2x_32_def(dst1, src1, src0, src0, width); rlm@1: rlm@1: int count = height; rlm@1: rlm@1: count -= 2; rlm@1: while (count) rlm@1: { rlm@1: dst0 += dstPitch >> 1; rlm@1: dst1 += dstPitch >> 1; rlm@1: internal_scale2x_32_def(dst0, src0, src1, src2, width); rlm@1: internal_scale2x_32_def(dst1, src2, src1, src0, width); rlm@1: src0 = src1; rlm@1: src1 = src2; rlm@1: src2 += srcPitch >> 2; rlm@1: --count; rlm@1: } rlm@1: dst0 += dstPitch >> 1; rlm@1: dst1 += dstPitch >> 1; rlm@1: internal_scale2x_32_def(dst0, src0, src1, src1, width); rlm@1: internal_scale2x_32_def(dst1, src1, src1, src0, width); rlm@1: #ifdef MMX rlm@1: } rlm@1: rlm@1: #endif rlm@1: }