rlm@1: /*
rlm@1:  * This file is part of the Advance project.
rlm@1:  *
rlm@1:  * Copyright (C) 1999-2002 Andrea Mazzoleni
rlm@1:  *
rlm@1:  * This program is free software; you can redistribute it and/or modify
rlm@1:  * it under the terms of the GNU General Public License as published by
rlm@1:  * the Free Software Foundation; either version 2 of the License, or
rlm@1:  * (at your option) any later version.
rlm@1:  *
rlm@1:  * This program is distributed in the hope that it will be useful,
rlm@1:  * but WITHOUT ANY WARRANTY; without even the implied warranty of
rlm@1:  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
rlm@1:  * GNU General Public License for more details.
rlm@1:  *
rlm@1:  * You should have received a copy of the GNU General Public License
rlm@1:  * along with this program; if not, write to the Free Software
rlm@1:  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
rlm@1:  *
rlm@1:  * In addition, as a special exception, Andrea Mazzoleni
rlm@1:  * gives permission to link the code of this program with
rlm@1:  * the MAME library (or with modified versions of MAME that use the
rlm@1:  * same license as MAME), and distribute linked combinations including
rlm@1:  * the two.  You must obey the GNU General Public License in all
rlm@1:  * respects for all of the code used other than MAME.  If you modify
rlm@1:  * this file, you may extend this exception to your version of the
rlm@1:  * file, but you are not obligated to do so.  If you do not wish to
rlm@1:  * do so, delete this exception statement from your version.
rlm@1:  */
rlm@1: 
rlm@1: /*
rlm@1:  * Alternatively at the previous license terms, you are allowed to use this
rlm@1:  * code in your program with these conditions:
rlm@1:  * - the program is not used in commercial activities.
rlm@1:  * - the whole source code of the program is released with the binary.
rlm@1:  */
rlm@1: 
rlm@1: #include "../Port.h"
rlm@1: 
rlm@1: #ifdef MMX
rlm@1: /* Defined outside this file with C linkage.  AdMame2x and AdMame2x32 read it
rlm@1:  * and, when it is true, use the MMX row kernels instead of the plain C ones. */
rlm@1: extern "C" bool cpu_mmx;
rlm@1: #endif
rlm@1: 
rlm@1: /*
rlm@1:  * Plain C scale2x row kernel for 16-bit pixels.
rlm@1:  *
rlm@1:  * Expands `count` pixels of the row src1 into 2 * count pixels at dst (one
rlm@1:  * output row).  src0 and src2 are the rows used as upper and lower
rlm@1:  * neighbours; callers swap them to obtain the second output row.
rlm@1:  *
rlm@1:  * Every source pixel yields a left and a right output pixel.  Each of them
rlm@1:  * takes the upper pixel when the upper pixel equals the horizontal neighbour
rlm@1:  * on that side, provided upper != lower and left != right; otherwise it
rlm@1:  * keeps the source pixel.  At the two row ends the outer output pixel is
rlm@1:  * always the source pixel.
rlm@1:  *
rlm@1:  * Rows shorter than two pixels have no horizontal neighbour: a single pixel
rlm@1:  * is simply doubled and an empty row writes nothing.  (Without this guard
rlm@1:  * the unsigned `count - 2` would wrap around and run far out of bounds.)
rlm@1:  */
rlm@1: static void internal_scale2x_16_def(u16 *dst, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)
rlm@1: {
rlm@1: 	if (count == 0)
rlm@1: 		return;
rlm@1: 
rlm@1: 	if (count == 1)
rlm@1: 	{
rlm@1: 		dst[0] = src1[0];
rlm@1: 		dst[1] = src1[0];
rlm@1: 		return;
rlm@1: 	}
rlm@1: 
rlm@1: 	const unsigned last = count - 1;
rlm@1: 
rlm@1: 	/* first pixel: no left neighbour, only the right half can change */
rlm@1: 	dst[0] = src1[0];
rlm@1: 	dst[1] = (src1[1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];
rlm@1: 
rlm@1: 	/* central pixels */
rlm@1: 	for (unsigned x = 1; x < last; ++x)
rlm@1: 	{
rlm@1: 		const u16 upper  = src0[x];
rlm@1: 		const u16 centre = src1[x];
rlm@1: 		const u16 left   = src1[x - 1];
rlm@1: 		const u16 right  = src1[x + 1];
rlm@1: 
rlm@1: 		u16 outLeft  = centre;
rlm@1: 		u16 outRight = centre;
rlm@1: 
rlm@1: 		if (upper != src2[x] && left != right)
rlm@1: 		{
rlm@1: 			if (left == upper)
rlm@1: 				outLeft = upper;
rlm@1: 			if (right == upper)
rlm@1: 				outRight = upper;
rlm@1: 		}
rlm@1: 
rlm@1: 		dst[2 * x]     = outLeft;
rlm@1: 		dst[2 * x + 1] = outRight;
rlm@1: 	}
rlm@1: 
rlm@1: 	/* last pixel: no right neighbour, only the left half can change */
rlm@1: 	dst[2 * last] = (src1[last - 1] == src0[last] && src2[last] != src0[last])
rlm@1: 	                ? src0[last] : src1[last];
rlm@1: 	dst[2 * last + 1] = src1[last];
rlm@1: }
rlm@1: 
rlm@1: /*
rlm@1:  * Plain C scale2x row kernel for 32-bit pixels.
rlm@1:  *
rlm@1:  * Expands `count` pixels of the row src1 into 2 * count pixels at dst (one
rlm@1:  * output row).  src0 and src2 are the rows used as upper and lower
rlm@1:  * neighbours; callers swap them to obtain the second output row.
rlm@1:  *
rlm@1:  * Every source pixel yields a left and a right output pixel.  Each of them
rlm@1:  * takes the upper pixel when the upper pixel equals the horizontal neighbour
rlm@1:  * on that side, provided upper != lower and left != right; otherwise it
rlm@1:  * keeps the source pixel.  At the two row ends the outer output pixel is
rlm@1:  * always the source pixel.
rlm@1:  *
rlm@1:  * Rows shorter than two pixels have no horizontal neighbour: a single pixel
rlm@1:  * is simply doubled and an empty row writes nothing.  (Without this guard
rlm@1:  * the unsigned `count - 2` would wrap around and run far out of bounds.)
rlm@1:  */
rlm@1: static void internal_scale2x_32_def(u32 *dst,
rlm@1:                                     const u32 *src0,
rlm@1:                                     const u32 *src1,
rlm@1:                                     const u32 *src2,
rlm@1:                                     unsigned count)
rlm@1: {
rlm@1: 	if (count == 0)
rlm@1: 		return;
rlm@1: 
rlm@1: 	if (count == 1)
rlm@1: 	{
rlm@1: 		dst[0] = src1[0];
rlm@1: 		dst[1] = src1[0];
rlm@1: 		return;
rlm@1: 	}
rlm@1: 
rlm@1: 	const unsigned last = count - 1;
rlm@1: 
rlm@1: 	/* first pixel: no left neighbour, only the right half can change */
rlm@1: 	dst[0] = src1[0];
rlm@1: 	dst[1] = (src1[1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];
rlm@1: 
rlm@1: 	/* central pixels */
rlm@1: 	for (unsigned x = 1; x < last; ++x)
rlm@1: 	{
rlm@1: 		const u32 upper  = src0[x];
rlm@1: 		const u32 centre = src1[x];
rlm@1: 		const u32 left   = src1[x - 1];
rlm@1: 		const u32 right  = src1[x + 1];
rlm@1: 
rlm@1: 		u32 outLeft  = centre;
rlm@1: 		u32 outRight = centre;
rlm@1: 
rlm@1: 		if (upper != src2[x] && left != right)
rlm@1: 		{
rlm@1: 			if (left == upper)
rlm@1: 				outLeft = upper;
rlm@1: 			if (right == upper)
rlm@1: 				outRight = upper;
rlm@1: 		}
rlm@1: 
rlm@1: 		dst[2 * x]     = outLeft;
rlm@1: 		dst[2 * x + 1] = outRight;
rlm@1: 	}
rlm@1: 
rlm@1: 	/* last pixel: no right neighbour, only the left half can change */
rlm@1: 	dst[2 * last] = (src1[last - 1] == src0[last] && src2[last] != src0[last])
rlm@1: 	                ? src0[last] : src1[last];
rlm@1: 	dst[2 * last + 1] = src1[last];
rlm@1: }
rlm@1: 
rlm@1: #ifdef MMX
rlm@1: static void internal_scale2x_16_mmx_single(u16 *dst, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)
rlm@1: {
rlm@1: 	/* always do the first and last run */
rlm@1: 	count -= 2 * 4;
rlm@1: 
rlm@1: #ifdef __GNUC__
rlm@1: 	__asm__ __volatile__ (
rlm@1: 	    /* first run */
rlm@1: 	    /* set the current, current_pre, current_next registers */
rlm@1: 	    "movq 0(%1), %%mm0\n"
rlm@1: 	    "movq 0(%1),%%mm7\n"
rlm@1: 	    "movq 8(%1),%%mm1\n"
rlm@1: 	    "psllq $48,%%mm0\n"
rlm@1: 	    "psllq $48,%%mm1\n"
rlm@1: 	    "psrlq $48, %%mm0\n"
rlm@1: 	    "movq %%mm7,%%mm2\n"
rlm@1: 	    "movq %%mm7,%%mm3\n"
rlm@1: 	    "psllq $16,%%mm2\n"
rlm@1: 	    "psrlq $16,%%mm3\n"
rlm@1: 	    "por %%mm2,%%mm0\n"
rlm@1: 	    "por %%mm3,%%mm1\n"
rlm@1: 
rlm@1: 	    /* current_upper */
rlm@1: 	    "movq (%0),%%mm6\n"
rlm@1: 
rlm@1: 	    /* compute the upper-left pixel for dst on %%mm2 */
rlm@1: 	    /* compute the upper-right pixel for dst on %%mm4 */
rlm@1: 	    "movq %%mm0,%%mm2\n"
rlm@1: 	    "movq %%mm1,%%mm4\n"
rlm@1: 	    "movq %%mm0,%%mm3\n"
rlm@1: 	    "movq %%mm1,%%mm5\n"
rlm@1: 	    "pcmpeqw %%mm6,%%mm2\n"
rlm@1: 	    "pcmpeqw %%mm6,%%mm4\n"
rlm@1: 	    "pcmpeqw (%2),%%mm3\n"
rlm@1: 	    "pcmpeqw (%2),%%mm5\n"
rlm@1: 	    "pandn %%mm2,%%mm3\n"
rlm@1: 	    "pandn %%mm4,%%mm5\n"
rlm@1: 	    "movq %%mm0,%%mm2\n"
rlm@1: 	    "movq %%mm1,%%mm4\n"
rlm@1: 	    "pcmpeqw %%mm1,%%mm2\n"
rlm@1: 	    "pcmpeqw %%mm0,%%mm4\n"
rlm@1: 	    "pandn %%mm3,%%mm2\n"
rlm@1: 	    "pandn %%mm5,%%mm4\n"
rlm@1: 	    "movq %%mm2,%%mm3\n"
rlm@1: 	    "movq %%mm4,%%mm5\n"
rlm@1: 	    "pand %%mm6,%%mm2\n"
rlm@1: 	    "pand %%mm6,%%mm4\n"
rlm@1: 	    "pandn %%mm7,%%mm3\n"
rlm@1: 	    "pandn %%mm7,%%mm5\n"
rlm@1: 	    "por %%mm3,%%mm2\n"
rlm@1: 	    "por %%mm5,%%mm4\n"
rlm@1: 
rlm@1: 	    /* set *dst */
rlm@1: 	    "movq %%mm2,%%mm3\n"
rlm@1: 	    "punpcklwd %%mm4,%%mm2\n"
rlm@1: 	    "punpckhwd %%mm4,%%mm3\n"
rlm@1: 	    "movq %%mm2,(%3)\n"
rlm@1: 	    "movq %%mm3,8(%3)\n"
rlm@1: 
rlm@1: 	    /* next */
rlm@1: 	    "addl $8,%0\n"
rlm@1: 	    "addl $8,%1\n"
rlm@1: 	    "addl $8,%2\n"
rlm@1: 	    "addl $16,%3\n"
rlm@1: 
rlm@1: 	    /* central runs */
rlm@1: 	    "shrl $2,%4\n"
rlm@1: 	    "jz 1f\n"
rlm@1: 
rlm@1: 	    "0:\n"
rlm@1: 
rlm@1: 	    /* set the current, current_pre, current_next registers */
rlm@1: 	    "movq -8(%1),%%mm0\n"
rlm@1: 	    "movq (%1),%%mm7\n"
rlm@1: 	    "movq 8(%1),%%mm1\n"
rlm@1: 	    "psrlq $48,%%mm0\n"
rlm@1: 	    "psllq $48,%%mm1\n"
rlm@1: 	    "movq %%mm7,%%mm2\n"
rlm@1: 	    "movq %%mm7,%%mm3\n"
rlm@1: 	    "psllq $16,%%mm2\n"
rlm@1: 	    "psrlq $16,%%mm3\n"
rlm@1: 	    "por %%mm2,%%mm0\n"
rlm@1: 	    "por %%mm3,%%mm1\n"
rlm@1: 
rlm@1: 	    /* current_upper */
rlm@1: 	    "movq (%0),%%mm6\n"
rlm@1: 
rlm@1: 	    /* compute the upper-left pixel for dst on %%mm2 */
rlm@1: 	    /* compute the upper-right pixel for dst on %%mm4 */
rlm@1: 	    "movq %%mm0,%%mm2\n"
rlm@1: 	    "movq %%mm1,%%mm4\n"
rlm@1: 	    "movq %%mm0,%%mm3\n"
rlm@1: 	    "movq %%mm1,%%mm5\n"
rlm@1: 	    "pcmpeqw %%mm6,%%mm2\n"
rlm@1: 	    "pcmpeqw %%mm6,%%mm4\n"
rlm@1: 	    "pcmpeqw (%2),%%mm3\n"
rlm@1: 	    "pcmpeqw (%2),%%mm5\n"
rlm@1: 	    "pandn %%mm2,%%mm3\n"
rlm@1: 	    "pandn %%mm4,%%mm5\n"
rlm@1: 	    "movq %%mm0,%%mm2\n"
rlm@1: 	    "movq %%mm1,%%mm4\n"
rlm@1: 	    "pcmpeqw %%mm1,%%mm2\n"
rlm@1: 	    "pcmpeqw %%mm0,%%mm4\n"
rlm@1: 	    "pandn %%mm3,%%mm2\n"
rlm@1: 	    "pandn %%mm5,%%mm4\n"
rlm@1: 	    "movq %%mm2,%%mm3\n"
rlm@1: 	    "movq %%mm4,%%mm5\n"
rlm@1: 	    "pand %%mm6,%%mm2\n"
rlm@1: 	    "pand %%mm6,%%mm4\n"
rlm@1: 	    "pandn %%mm7,%%mm3\n"
rlm@1: 	    "pandn %%mm7,%%mm5\n"
rlm@1: 	    "por %%mm3,%%mm2\n"
rlm@1: 	    "por %%mm5,%%mm4\n"
rlm@1: 
rlm@1: 	    /* set *dst */
rlm@1: 	    "movq %%mm2,%%mm3\n"
rlm@1: 	    "punpcklwd %%mm4,%%mm2\n"
rlm@1: 	    "punpckhwd %%mm4,%%mm3\n"
rlm@1: 	    "movq %%mm2,(%3)\n"
rlm@1: 	    "movq %%mm3,8(%3)\n"
rlm@1: 
rlm@1: 	    /* next */
rlm@1: 	    "addl $8,%0\n"
rlm@1: 	    "addl $8,%1\n"
rlm@1: 	    "addl $8,%2\n"
rlm@1: 	    "addl $16,%3\n"
rlm@1: 
rlm@1: 	    "decl %4\n"
rlm@1: 	    "jnz 0b\n"
rlm@1: 	    "1:\n"
rlm@1: 
rlm@1: 	    /* final run */
rlm@1: 	    /* set the current, current_pre, current_next registers */
rlm@1: 	    "movq (%1),%%mm1\n"
rlm@1: 	    "movq (%1),%%mm7\n"
rlm@1: 	    "movq -8(%1),%%mm0\n"
rlm@1: 	    "psrlq $48,%%mm1\n"
rlm@1: 	    "psrlq $48,%%mm0\n"
rlm@1: 	    "psllq $48,%%mm1\n"
rlm@1: 	    "movq %%mm7,%%mm2\n"
rlm@1: 	    "movq %%mm7,%%mm3\n"
rlm@1: 	    "psllq $16,%%mm2\n"
rlm@1: 	    "psrlq $16,%%mm3\n"
rlm@1: 	    "por %%mm2,%%mm0\n"
rlm@1: 	    "por %%mm3,%%mm1\n"
rlm@1: 
rlm@1: 	    /* current_upper */
rlm@1: 	    "movq (%0),%%mm6\n"
rlm@1: 
rlm@1: 	    /* compute the upper-left pixel for dst on %%mm2 */
rlm@1: 	    /* compute the upper-right pixel for dst on %%mm4 */
rlm@1: 	    "movq %%mm0,%%mm2\n"
rlm@1: 	    "movq %%mm1,%%mm4\n"
rlm@1: 	    "movq %%mm0,%%mm3\n"
rlm@1: 	    "movq %%mm1,%%mm5\n"
rlm@1: 	    "pcmpeqw %%mm6,%%mm2\n"
rlm@1: 	    "pcmpeqw %%mm6,%%mm4\n"
rlm@1: 	    "pcmpeqw (%2),%%mm3\n"
rlm@1: 	    "pcmpeqw (%2),%%mm5\n"
rlm@1: 	    "pandn %%mm2,%%mm3\n"
rlm@1: 	    "pandn %%mm4,%%mm5\n"
rlm@1: 	    "movq %%mm0,%%mm2\n"
rlm@1: 	    "movq %%mm1,%%mm4\n"
rlm@1: 	    "pcmpeqw %%mm1,%%mm2\n"
rlm@1: 	    "pcmpeqw %%mm0,%%mm4\n"
rlm@1: 	    "pandn %%mm3,%%mm2\n"
rlm@1: 	    "pandn %%mm5,%%mm4\n"
rlm@1: 	    "movq %%mm2,%%mm3\n"
rlm@1: 	    "movq %%mm4,%%mm5\n"
rlm@1: 	    "pand %%mm6,%%mm2\n"
rlm@1: 	    "pand %%mm6,%%mm4\n"
rlm@1: 	    "pandn %%mm7,%%mm3\n"
rlm@1: 	    "pandn %%mm7,%%mm5\n"
rlm@1: 	    "por %%mm3,%%mm2\n"
rlm@1: 	    "por %%mm5,%%mm4\n"
rlm@1: 
rlm@1: 	    /* set *dst */
rlm@1: 	    "movq %%mm2,%%mm3\n"
rlm@1: 	    "punpcklwd %%mm4,%%mm2\n"
rlm@1: 	    "punpckhwd %%mm4,%%mm3\n"
rlm@1: 	    "movq %%mm2,(%3)\n"
rlm@1: 	    "movq %%mm3,8(%3)\n"
rlm@1: 	    "emms\n"
rlm@1: 
rlm@1: 		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
rlm@1: 		:
rlm@1: 		: "cc"
rlm@1: 	    );
rlm@1: #else
rlm@1: 	__asm {
rlm@1: 		mov eax, src0;
rlm@1: 		mov ebx, src1;
rlm@1: 		mov ecx, src2;
rlm@1: 		mov edx, dst;
rlm@1: 		mov esi, count;
rlm@1: 
rlm@1: 		/* first run */
rlm@1: 		/* set the current, current_pre, current_next registers */
rlm@1: 		movq  mm0, qword ptr [ebx];
rlm@1: 		movq  mm7, qword ptr [ebx];
rlm@1: 		movq  mm1, qword ptr [ebx + 8];
rlm@1: 		psllq mm0, 48;
rlm@1: 		psllq mm1, 48;
rlm@1: 		psrlq mm0, 48;
rlm@1: 		movq  mm2, mm7;
rlm@1: 		movq  mm3, mm7;
rlm@1: 		psllq mm2, 16;
rlm@1: 		psrlq mm3, 16;
rlm@1: 		por	  mm0, mm2;
rlm@1: 		por	  mm1, mm3;
rlm@1: 
rlm@1: 		/* current_upper */
rlm@1: 		movq mm6, qword ptr [eax];
rlm@1: 
rlm@1: 		/* compute the upper-left pixel for dst on %%mm2 */
rlm@1: 		/* compute the upper-right pixel for dst on %%mm4 */
rlm@1: 		movq	mm2, mm0;
rlm@1: 		movq	mm4, mm1;
rlm@1: 		movq	mm3, mm0;
rlm@1: 		movq	mm5, mm1;
rlm@1: 		pcmpeqw mm2, mm6;
rlm@1: 		pcmpeqw mm4, mm6;
rlm@1: 		pcmpeqw mm3, qword ptr [ecx];
rlm@1: 		pcmpeqw mm5, qword ptr [ecx];
rlm@1: 		pandn	mm3, mm2;
rlm@1: 		pandn	mm5, mm4;
rlm@1: 		movq	mm2, mm0;
rlm@1: 		movq	mm4, mm1;
rlm@1: 		pcmpeqw mm2, mm1;
rlm@1: 		pcmpeqw mm4, mm0;
rlm@1: 		pandn	mm2, mm3;
rlm@1: 		pandn	mm4, mm5;
rlm@1: 		movq	mm3, mm2;
rlm@1: 		movq	mm5, mm4;
rlm@1: 		pand	mm2, mm6;
rlm@1: 		pand	mm4, mm6;
rlm@1: 		pandn	mm3, mm7;
rlm@1: 		pandn	mm5, mm7;
rlm@1: 		por		mm2, mm3;
rlm@1: 		por		mm4, mm5;
rlm@1: 
rlm@1: 		/* set *dst0 */
rlm@1: 		movq	   mm3, mm2;
rlm@1: 		punpcklwd  mm2, mm4;
rlm@1: 		punpckhwd  mm3, mm4;
rlm@1: 		movq qword ptr [edx], mm2;
rlm@1: 		movq qword ptr [edx + 8], mm3;
rlm@1: 
rlm@1: 		/* next */
rlm@1: 		add eax, 8;
rlm@1: 		add ebx, 8;
rlm@1: 		add ecx, 8;
rlm@1: 		add edx, 16;
rlm@1: 
rlm@1: 		/* central runs */
rlm@1: 		shr esi, 2;
rlm@1: 		jz	label1;
rlm@1: 		align 4;
rlm@1: label0:
rlm@1: 
rlm@1: 		/* set the current, current_pre, current_next registers */
rlm@1: 		movq mm0, qword ptr [ebx - 8];
rlm@1: 		movq  mm7, qword ptr [ebx];
rlm@1: 		movq  mm1, qword ptr [ebx + 8];
rlm@1: 		psrlq mm0, 48;
rlm@1: 		psllq mm1, 48;
rlm@1: 		movq  mm2, mm7;
rlm@1: 		movq  mm3, mm7;
rlm@1: 		psllq mm2, 16;
rlm@1: 		psrlq mm3, 16;
rlm@1: 		por	  mm0, mm2;
rlm@1: 		por	  mm1, mm3;
rlm@1: 
rlm@1: 		/* current_upper */
rlm@1: 		movq mm6, qword ptr [eax];
rlm@1: 
rlm@1: 		/* compute the upper-left pixel for dst on %%mm2 */
rlm@1: 		/* compute the upper-right pixel for dst on %%mm4 */
rlm@1: 		movq	mm2, mm0;
rlm@1: 		movq	mm4, mm1;
rlm@1: 		movq	mm3, mm0;
rlm@1: 		movq	mm5, mm1;
rlm@1: 		pcmpeqw mm2, mm6;
rlm@1: 		pcmpeqw mm4, mm6;
rlm@1: 		pcmpeqw mm3, qword ptr [ecx];
rlm@1: 		pcmpeqw mm5, qword ptr [ecx];
rlm@1: 		pandn	mm3, mm2;
rlm@1: 		pandn	mm5, mm4;
rlm@1: 		movq	mm2, mm0;
rlm@1: 		movq	mm4, mm1;
rlm@1: 		pcmpeqw mm2, mm1;
rlm@1: 		pcmpeqw mm4, mm0;
rlm@1: 		pandn	mm2, mm3;
rlm@1: 		pandn	mm4, mm5;
rlm@1: 		movq	mm3, mm2;
rlm@1: 		movq	mm5, mm4;
rlm@1: 		pand	mm2, mm6;
rlm@1: 		pand	mm4, mm6;
rlm@1: 		pandn	mm3, mm7;
rlm@1: 		pandn	mm5, mm7;
rlm@1: 		por		mm2, mm3;
rlm@1: 		por		mm4, mm5;
rlm@1: 
rlm@1: 		/* set *dst */
rlm@1: 		movq	   mm3, mm2;
rlm@1: 		punpcklwd  mm2, mm4;
rlm@1: 		punpckhwd  mm3, mm4;
rlm@1: 		movq qword ptr [edx], mm2;
rlm@1: 		movq qword ptr [edx + 8], mm3;
rlm@1: 
rlm@1: 		/* next */
rlm@1: 		add eax, 8;
rlm@1: 		add ebx, 8;
rlm@1: 		add ecx, 8;
rlm@1: 		add edx, 16;
rlm@1: 
rlm@1: 		dec esi;
rlm@1: 		jnz label0;
rlm@1: label1:
rlm@1: 
rlm@1: 		/* final run */
rlm@1: 		/* set the current, current_pre, current_next registers */
rlm@1: 		movq mm1, qword ptr [ebx];
rlm@1: 		movq  mm7, qword ptr [ebx];
rlm@1: 		movq  mm0, qword ptr [ebx - 8];
rlm@1: 		psrlq mm1, 48;
rlm@1: 		psrlq mm0, 48;
rlm@1: 		psllq mm1, 48;
rlm@1: 		movq  mm2, mm7;
rlm@1: 		movq  mm3, mm7;
rlm@1: 		psllq mm2, 16;
rlm@1: 		psrlq mm3, 16;
rlm@1: 		por	  mm0, mm2;
rlm@1: 		por	  mm1, mm3;
rlm@1: 
rlm@1: 		/* current_upper */
rlm@1: 		movq mm6, qword ptr [eax];
rlm@1: 
rlm@1: 		/* compute the upper-left pixel for dst on %%mm2 */
rlm@1: 		/* compute the upper-right pixel for dst on %%mm4 */
rlm@1: 		movq	mm2, mm0;
rlm@1: 		movq	mm4, mm1;
rlm@1: 		movq	mm3, mm0;
rlm@1: 		movq	mm5, mm1;
rlm@1: 		pcmpeqw mm2, mm6;
rlm@1: 		pcmpeqw mm4, mm6;
rlm@1: 		pcmpeqw mm3, qword ptr [ecx];
rlm@1: 		pcmpeqw mm5, qword ptr [ecx];
rlm@1: 		pandn	mm3, mm2;
rlm@1: 		pandn	mm5, mm4;
rlm@1: 		movq	mm2, mm0;
rlm@1: 		movq	mm4, mm1;
rlm@1: 		pcmpeqw mm2, mm1;
rlm@1: 		pcmpeqw mm4, mm0;
rlm@1: 		pandn	mm2, mm3;
rlm@1: 		pandn	mm4, mm5;
rlm@1: 		movq	mm3, mm2;
rlm@1: 		movq	mm5, mm4;
rlm@1: 		pand	mm2, mm6;
rlm@1: 		pand	mm4, mm6;
rlm@1: 		pandn	mm3, mm7;
rlm@1: 		pandn	mm5, mm7;
rlm@1: 		por		mm2, mm3;
rlm@1: 		por		mm4, mm5;
rlm@1: 
rlm@1: 		/* set *dst */
rlm@1: 		movq	   mm3, mm2;
rlm@1: 		punpcklwd  mm2, mm4;
rlm@1: 		punpckhwd  mm3, mm4;
rlm@1: 		movq qword ptr [edx], mm2;
rlm@1: 		movq qword ptr [edx + 8], mm3;
rlm@1: 
rlm@1: 		mov src0, eax;
rlm@1: 		mov src1, ebx;
rlm@1: 		mov src2, ecx;
rlm@1: 		mov dst, edx;
rlm@1: 		mov count, esi;
rlm@1: 
rlm@1: 		emms;
rlm@1: 	}
rlm@1: #endif
rlm@1: }
rlm@1: 
rlm@1: static void internal_scale2x_32_mmx_single(u32 *dst, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count)
rlm@1: {
rlm@1: 	/* always do the first and last run */
rlm@1: 	count -= 2 * 2;
rlm@1: 
rlm@1: #ifdef __GNUC__
rlm@1: 	__asm__ __volatile__ (
rlm@1: 	    /* first run */
rlm@1: 	    /* set the current, current_pre, current_next registers */
rlm@1: 	    "movq 0(%1),%%mm0\n"
rlm@1: 	    "movq 0(%1),%%mm7\n"
rlm@1: 	    "movq 8(%1),%%mm1\n"
rlm@1: 	    "psllq $32,%%mm0\n"
rlm@1: 	    "psllq $32,%%mm1\n"
rlm@1: 	    "psrlq $32,%%mm0\n"
rlm@1: 	    "movq %%mm7,%%mm2\n"
rlm@1: 	    "movq %%mm7,%%mm3\n"
rlm@1: 	    "psllq $32,%%mm2\n"
rlm@1: 	    "psrlq $32,%%mm3\n"
rlm@1: 	    "por %%mm2,%%mm0\n"
rlm@1: 	    "por %%mm3,%%mm1\n"
rlm@1: 
rlm@1: 	    /* current_upper */
rlm@1: 	    "movq (%0),%%mm6\n"
rlm@1: 
rlm@1: 	    /* compute the upper-left pixel for dst on %%mm2 */
rlm@1: 	    /* compute the upper-right pixel for dst on %%mm4 */
rlm@1: 	    "movq %%mm0,%%mm2\n"
rlm@1: 	    "movq %%mm1,%%mm4\n"
rlm@1: 	    "movq %%mm0,%%mm3\n"
rlm@1: 	    "movq %%mm1,%%mm5\n"
rlm@1: 	    "pcmpeqd %%mm6,%%mm2\n"
rlm@1: 	    "pcmpeqd %%mm6,%%mm4\n"
rlm@1: 	    "pcmpeqd (%2),%%mm3\n"
rlm@1: 	    "pcmpeqd (%2),%%mm5\n"
rlm@1: 	    "pandn %%mm2,%%mm3\n"
rlm@1: 	    "pandn %%mm4,%%mm5\n"
rlm@1: 	    "movq %%mm0,%%mm2\n"
rlm@1: 	    "movq %%mm1,%%mm4\n"
rlm@1: 	    "pcmpeqd %%mm1,%%mm2\n"
rlm@1: 	    "pcmpeqd %%mm0,%%mm4\n"
rlm@1: 	    "pandn %%mm3,%%mm2\n"
rlm@1: 	    "pandn %%mm5,%%mm4\n"
rlm@1: 	    "movq %%mm2,%%mm3\n"
rlm@1: 	    "movq %%mm4,%%mm5\n"
rlm@1: 	    "pand %%mm6,%%mm2\n"
rlm@1: 	    "pand %%mm6,%%mm4\n"
rlm@1: 	    "pandn %%mm7,%%mm3\n"
rlm@1: 	    "pandn %%mm7,%%mm5\n"
rlm@1: 	    "por %%mm3,%%mm2\n"
rlm@1: 	    "por %%mm5,%%mm4\n"
rlm@1: 
rlm@1: 	    /* set *dst */
rlm@1: 	    "movq %%mm2,%%mm3\n"
rlm@1: 	    "punpckldq %%mm4,%%mm2\n"
rlm@1: 	    "punpckhdq %%mm4,%%mm3\n"
rlm@1: 	    "movq %%mm2,(%3)\n"
rlm@1: 	    "movq %%mm3, 8(%3)\n"
rlm@1: 
rlm@1: 	    /* next */
rlm@1: 	    "addl $8,%0\n"
rlm@1: 	    "addl $8,%1\n"
rlm@1: 	    "addl $8,%2\n"
rlm@1: 	    "addl $16,%3\n"
rlm@1: 
rlm@1: 	    /* central runs */
rlm@1: 	    "shrl $1,%4\n"
rlm@1: 	    "jz 1f\n"
rlm@1: 
rlm@1: 	    "0:\n"
rlm@1: 
rlm@1: 	    /* set the current, current_pre, current_next registers */
rlm@1: 	    "movq -8(%1),%%mm0\n"
rlm@1: 	    "movq (%1),%%mm7\n"
rlm@1: 	    "movq 8(%1),%%mm1\n"
rlm@1: 	    "psrlq $32,%%mm0\n"
rlm@1: 	    "psllq $32,%%mm1\n"
rlm@1: 	    "movq %%mm7,%%mm2\n"
rlm@1: 	    "movq %%mm7,%%mm3\n"
rlm@1: 	    "psllq $32,%%mm2\n"
rlm@1: 	    "psrlq $32,%%mm3\n"
rlm@1: 	    "por %%mm2,%%mm0\n"
rlm@1: 	    "por %%mm3,%%mm1\n"
rlm@1: 
rlm@1: 	    /* current_upper */
rlm@1: 	    "movq (%0),%%mm6\n"
rlm@1: 
rlm@1: 	    /* compute the upper-left pixel for dst on %%mm2 */
rlm@1: 	    /* compute the upper-right pixel for dst on %%mm4 */
rlm@1: 	    "movq %%mm0,%%mm2\n"
rlm@1: 	    "movq %%mm1,%%mm4\n"
rlm@1: 	    "movq %%mm0,%%mm3\n"
rlm@1: 	    "movq %%mm1,%%mm5\n"
rlm@1: 	    "pcmpeqd %%mm6,%%mm2\n"
rlm@1: 	    "pcmpeqd %%mm6,%%mm4\n"
rlm@1: 	    "pcmpeqd (%2),%%mm3\n"
rlm@1: 	    "pcmpeqd (%2),%%mm5\n"
rlm@1: 	    "pandn %%mm2,%%mm3\n"
rlm@1: 	    "pandn %%mm4,%%mm5\n"
rlm@1: 	    "movq %%mm0,%%mm2\n"
rlm@1: 	    "movq %%mm1,%%mm4\n"
rlm@1: 	    "pcmpeqd %%mm1,%%mm2\n"
rlm@1: 	    "pcmpeqd %%mm0,%%mm4\n"
rlm@1: 	    "pandn %%mm3,%%mm2\n"
rlm@1: 	    "pandn %%mm5,%%mm4\n"
rlm@1: 	    "movq %%mm2,%%mm3\n"
rlm@1: 	    "movq %%mm4,%%mm5\n"
rlm@1: 	    "pand %%mm6,%%mm2\n"
rlm@1: 	    "pand %%mm6,%%mm4\n"
rlm@1: 	    "pandn %%mm7,%%mm3\n"
rlm@1: 	    "pandn %%mm7,%%mm5\n"
rlm@1: 	    "por %%mm3,%%mm2\n"
rlm@1: 	    "por %%mm5,%%mm4\n"
rlm@1: 
rlm@1: 	    /* set *dst */
rlm@1: 	    "movq %%mm2,%%mm3\n"
rlm@1: 	    "punpckldq %%mm4,%%mm2\n"
rlm@1: 	    "punpckhdq %%mm4,%%mm3\n"
rlm@1: 	    "movq %%mm2,(%3)\n"
rlm@1: 	    "movq %%mm3,8(%3)\n"
rlm@1: 
rlm@1: 	    /* next */
rlm@1: 	    "addl $8,%0\n"
rlm@1: 	    "addl $8,%1\n"
rlm@1: 	    "addl $8,%2\n"
rlm@1: 	    "addl $16,%3\n"
rlm@1: 
rlm@1: 	    "decl %4\n"
rlm@1: 	    "jnz 0b\n"
rlm@1: 	    "1:\n"
rlm@1: 
rlm@1: 	    /* final run */
rlm@1: 	    /* set the current, current_pre, current_next registers */
rlm@1: 	    "movq (%1),%%mm1\n"
rlm@1: 	    "movq (%1),%%mm7\n"
rlm@1: 	    "movq -8(%1), %%mm0\n"
rlm@1: 	    "psrlq $32,%%mm1\n"
rlm@1: 	    "psrlq $32,%%mm0\n"
rlm@1: 	    "psllq $32,%%mm1\n"
rlm@1: 	    "movq %%mm7,%%mm2\n"
rlm@1: 	    "movq %%mm7,%%mm3\n"
rlm@1: 	    "psllq $32,%%mm2\n"
rlm@1: 	    "psrlq $32,%%mm3\n"
rlm@1: 	    "por %%mm2,%%mm0\n"
rlm@1: 	    "por %%mm3,%%mm1\n"
rlm@1: 
rlm@1: 	    /* current_upper */
rlm@1: 	    "movq (%0),%%mm6\n"
rlm@1: 
rlm@1: 	    /* compute the upper-left pixel for dst on %%mm2 */
rlm@1: 	    /* compute the upper-right pixel for dst on %%mm4 */
rlm@1: 	    "movq %%mm0,%%mm2\n"
rlm@1: 	    "movq %%mm1,%%mm4\n"
rlm@1: 	    "movq %%mm0,%%mm3\n"
rlm@1: 	    "movq %%mm1,%%mm5\n"
rlm@1: 	    "pcmpeqd %%mm6,%%mm2\n"
rlm@1: 	    "pcmpeqd %%mm6,%%mm4\n"
rlm@1: 	    "pcmpeqd (%2),%%mm3\n"
rlm@1: 	    "pcmpeqd (%2),%%mm5\n"
rlm@1: 	    "pandn %%mm2,%%mm3\n"
rlm@1: 	    "pandn %%mm4,%%mm5\n"
rlm@1: 	    "movq %%mm0,%%mm2\n"
rlm@1: 	    "movq %%mm1,%%mm4\n"
rlm@1: 	    "pcmpeqd %%mm1,%%mm2\n"
rlm@1: 	    "pcmpeqd %%mm0,%%mm4\n"
rlm@1: 	    "pandn %%mm3,%%mm2\n"
rlm@1: 	    "pandn %%mm5,%%mm4\n"
rlm@1: 	    "movq %%mm2,%%mm3\n"
rlm@1: 	    "movq %%mm4,%%mm5\n"
rlm@1: 	    "pand %%mm6,%%mm2\n"
rlm@1: 	    "pand %%mm6,%%mm4\n"
rlm@1: 	    "pandn %%mm7,%%mm3\n"
rlm@1: 	    "pandn %%mm7,%%mm5\n"
rlm@1: 	    "por %%mm3,%%mm2\n"
rlm@1: 	    "por %%mm5,%%mm4\n"
rlm@1: 
rlm@1: 	    /* set *dst */
rlm@1: 	    "movq %%mm2,%%mm3\n"
rlm@1: 	    "punpckldq %%mm4,%%mm2\n"
rlm@1: 	    "punpckhdq %%mm4,%%mm3\n"
rlm@1: 	    "movq %%mm2,(%3)\n"
rlm@1: 	    "movq %%mm3,8(%3)\n"
rlm@1: 	    "emms\n"
rlm@1: 
rlm@1: 		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
rlm@1: 		:
rlm@1: 		: "cc"
rlm@1: 	    );
rlm@1: #else
rlm@1: 	__asm {
rlm@1: 		mov eax, src0;
rlm@1: 		mov ebx, src1;
rlm@1: 		mov ecx, src2;
rlm@1: 		mov edx, dst;
rlm@1: 		mov esi, count;
rlm@1: 
rlm@1: 		/* first run */
rlm@1: 		/* set the current, current_pre, current_next registers */
rlm@1: 		movq  mm0, qword ptr [ebx];
rlm@1: 		movq  mm7, qword ptr [ebx];
rlm@1: 		movq  mm1, qword ptr [ebx + 8];
rlm@1: 		psllq mm0, 32;
rlm@1: 		psllq mm1, 32;
rlm@1: 		psrlq mm0, 32;
rlm@1: 		movq  mm2, mm7;
rlm@1: 		movq  mm3, mm7;
rlm@1: 		psllq mm2, 32;
rlm@1: 		psrlq mm3, 32;
rlm@1: 		por	  mm0, mm2;
rlm@1: 		por	  mm1, mm3;
rlm@1: 
rlm@1: 		/* current_upper */
rlm@1: 		movq mm6, qword ptr [eax];
rlm@1: 
rlm@1: 		/* compute the upper-left pixel for dst on %%mm2 */
rlm@1: 		/* compute the upper-right pixel for dst on %%mm4 */
rlm@1: 		movq	mm2, mm0;
rlm@1: 		movq	mm4, mm1;
rlm@1: 		movq	mm3, mm0;
rlm@1: 		movq	mm5, mm1;
rlm@1: 		pcmpeqd mm2, mm6;
rlm@1: 		pcmpeqd mm4, mm6;
rlm@1: 		pcmpeqd mm3, qword ptr [ecx];
rlm@1: 		pcmpeqd mm5, qword ptr [ecx];
rlm@1: 		pandn	mm3, mm2;
rlm@1: 		pandn	mm5, mm4;
rlm@1: 		movq	mm2, mm0;
rlm@1: 		movq	mm4, mm1;
rlm@1: 		pcmpeqd mm2, mm1;
rlm@1: 		pcmpeqd mm4, mm0;
rlm@1: 		pandn	mm2, mm3;
rlm@1: 		pandn	mm4, mm5;
rlm@1: 		movq	mm3, mm2;
rlm@1: 		movq	mm5, mm4;
rlm@1: 		pand	mm2, mm6;
rlm@1: 		pand	mm4, mm6;
rlm@1: 		pandn	mm3, mm7;
rlm@1: 		pandn	mm5, mm7;
rlm@1: 		por		mm2, mm3;
rlm@1: 		por		mm4, mm5;
rlm@1: 
rlm@1: 		/* set *dst */
rlm@1: 		movq	   mm3, mm2;
rlm@1: 		punpckldq  mm2, mm4;
rlm@1: 		punpckhdq  mm3, mm4;
rlm@1: 		movq qword ptr [edx], mm2;
rlm@1: 		movq qword ptr [edx + 8], mm3;
rlm@1: 
rlm@1: 		/* next */
rlm@1: 		add eax, 8;
rlm@1: 		add ebx, 8;
rlm@1: 		add ecx, 8;
rlm@1: 		add edx, 16;
rlm@1: 
rlm@1: 		/* central runs */
rlm@1: 		shr esi, 1;
rlm@1: 		jz	label1;
rlm@1: label0:
rlm@1: 
rlm@1: 		/* set the current, current_pre, current_next registers */
rlm@1: 		movq mm0, qword ptr [ebx - 8];
rlm@1: 		movq  mm7, qword ptr [ebx];
rlm@1: 		movq  mm1, qword ptr [ebx + 8];
rlm@1: 		psrlq mm0, 32;
rlm@1: 		psllq mm1, 32;
rlm@1: 		movq  mm2, mm7;
rlm@1: 		movq  mm3, mm7;
rlm@1: 		psllq mm2, 32;
rlm@1: 		psrlq mm3, 32;
rlm@1: 		por	  mm0, mm2;
rlm@1: 		por	  mm1, mm3;
rlm@1: 
rlm@1: 		/* current_upper */
rlm@1: 		movq mm6, qword ptr[eax];
rlm@1: 
rlm@1: 		/* compute the upper-left pixel for dst on %%mm2 */
rlm@1: 		/* compute the upper-right pixel for dst on %%mm4 */
rlm@1: 		movq	mm2, mm0;
rlm@1: 		movq	mm4, mm1;
rlm@1: 		movq	mm3, mm0;
rlm@1: 		movq	mm5, mm1;
rlm@1: 		pcmpeqd mm2, mm6;
rlm@1: 		pcmpeqd mm4, mm6;
rlm@1: 		pcmpeqd mm3, qword ptr[ecx];
rlm@1: 		pcmpeqd mm5, qword ptr[ecx];
rlm@1: 		pandn	mm3, mm2;
rlm@1: 		pandn	mm5, mm4;
rlm@1: 		movq	mm2, mm0;
rlm@1: 		movq	mm4, mm1;
rlm@1: 		pcmpeqd mm2, mm1;
rlm@1: 		pcmpeqd mm4, mm0;
rlm@1: 		pandn	mm2, mm3;
rlm@1: 		pandn	mm4, mm5;
rlm@1: 		movq	mm3, mm2;
rlm@1: 		movq	mm5, mm4;
rlm@1: 		pand	mm2, mm6;
rlm@1: 		pand	mm4, mm6;
rlm@1: 		pandn	mm3, mm7;
rlm@1: 		pandn	mm5, mm7;
rlm@1: 		por		mm2, mm3;
rlm@1: 		por		mm4, mm5;
rlm@1: 
rlm@1: 		/* set *dst */
rlm@1: 		movq	   mm3, mm2;
rlm@1: 		punpckldq  mm2, mm4;
rlm@1: 		punpckhdq  mm3, mm4;
rlm@1: 		movq qword ptr [edx], mm2;
rlm@1: 		movq qword ptr [edx + 8], mm3;
rlm@1: 
rlm@1: 		/* next */
rlm@1: 		add eax, 8;
rlm@1: 		add ebx, 8;
rlm@1: 		add ecx, 8;
rlm@1: 		add edx, 16;
rlm@1: 
rlm@1: 		dec esi;
rlm@1: 		jnz label0;
rlm@1: label1:
rlm@1: 
rlm@1: 		/* final run */
rlm@1: 		/* set the current, current_pre, current_next registers */
rlm@1: 		movq mm1, qword ptr [ebx];
rlm@1: 		movq  mm7, qword ptr [ebx];
rlm@1: 		movq  mm0, qword ptr [ebx - 8];
rlm@1: 		psrlq mm1, 32;
rlm@1: 		psrlq mm0, 32;
rlm@1: 		psllq mm1, 32;
rlm@1: 		movq  mm2, mm7;
rlm@1: 		movq  mm3, mm7;
rlm@1: 		psllq mm2, 32;
rlm@1: 		psrlq mm3, 32;
rlm@1: 		por	  mm0, mm2;
rlm@1: 		por	  mm1, mm3;
rlm@1: 
rlm@1: 		/* current_upper */
rlm@1: 		movq mm6, qword ptr [eax];
rlm@1: 
rlm@1: 		/* compute the upper-left pixel for dst on %%mm2 */
rlm@1: 		/* compute the upper-right pixel for dst on %%mm4 */
rlm@1: 		movq	mm2, mm0;
rlm@1: 		movq	mm4, mm1;
rlm@1: 		movq	mm3, mm0;
rlm@1: 		movq	mm5, mm1;
rlm@1: 		pcmpeqd mm2, mm6;
rlm@1: 		pcmpeqd mm4, mm6;
rlm@1: 		pcmpeqd mm3, qword ptr [ecx];
rlm@1: 		pcmpeqd mm5, qword ptr [ecx];
rlm@1: 		pandn	mm3, mm2;
rlm@1: 		pandn	mm5, mm4;
rlm@1: 		movq	mm2, mm0;
rlm@1: 		movq	mm4, mm1;
rlm@1: 		pcmpeqd mm2, mm1;
rlm@1: 		pcmpeqd mm4, mm0;
rlm@1: 		pandn	mm2, mm3;
rlm@1: 		pandn	mm4, mm5;
rlm@1: 		movq	mm3, mm2;
rlm@1: 		movq	mm5, mm4;
rlm@1: 		pand	mm2, mm6;
rlm@1: 		pand	mm4, mm6;
rlm@1: 		pandn	mm3, mm7;
rlm@1: 		pandn	mm5, mm7;
rlm@1: 		por		mm2, mm3;
rlm@1: 		por		mm4, mm5;
rlm@1: 
rlm@1: 		/* set *dst */
rlm@1: 		movq	   mm3, mm2;
rlm@1: 		punpckldq  mm2, mm4;
rlm@1: 		punpckhdq  mm3, mm4;
rlm@1: 		movq qword ptr [edx], mm2;
rlm@1: 		movq qword ptr [edx + 8], mm3;
rlm@1: 
rlm@1: 		mov src0, eax;
rlm@1: 		mov src1, ebx;
rlm@1: 		mov src2, ecx;
rlm@1: 		mov dst, edx;
rlm@1: 		mov count, esi;
rlm@1: 
rlm@1: 		emms;
rlm@1: 	}
rlm@1: #endif
rlm@1: }
rlm@1: 
rlm@1: /*
rlm@1:  * Scale one 16-bit source row (src1) into the two output rows dst0 and dst1
rlm@1:  * with the MMX kernel.  src0 and src2 are the neighbouring source rows.
rlm@1:  * Precondition (not checked): count >= 2*4.
rlm@1:  */
rlm@1: static void internal_scale2x_16_mmx(u16 *dst0, u16 *dst1,
rlm@1:                                     const u16 *src0, const u16 *src1, const u16 *src2,
rlm@1:                                     unsigned count)
rlm@1: {
rlm@1: 	/* first output row: neighbour rows in their given order */
rlm@1: 	internal_scale2x_16_mmx_single(dst0, src0, src1, src2, count);
rlm@1: 
rlm@1: 	/* second output row: same kernel with the two neighbour rows exchanged */
rlm@1: 	internal_scale2x_16_mmx_single(dst1, src2, src1, src0, count);
rlm@1: }
rlm@1: 
rlm@1: /*
rlm@1:  * Scale one 32-bit source row (src1) into the two output rows dst0 and dst1
rlm@1:  * with the MMX kernel.  src0 and src2 are the neighbouring source rows.
rlm@1:  * Precondition (not checked): count >= 2*2.
rlm@1:  */
rlm@1: static void internal_scale2x_32_mmx(u32 *dst0, u32 *dst1,
rlm@1:                                     const u32 *src0, const u32 *src1, const u32 *src2,
rlm@1:                                     unsigned count)
rlm@1: {
rlm@1: 	/* first output row: neighbour rows in their given order */
rlm@1: 	internal_scale2x_32_mmx_single(dst0, src0, src1, src2, count);
rlm@1: 
rlm@1: 	/* second output row: same kernel with the two neighbour rows exchanged */
rlm@1: 	internal_scale2x_32_mmx_single(dst1, src2, src1, src0, count);
rlm@1: }
rlm@1: 
rlm@1: #endif
rlm@1: 
rlm@1: /*
rlm@1:  * Scale a 16-bit-per-pixel image to twice its width and height with scale2x.
rlm@1:  *
rlm@1:  * srcPtr   - first source row
rlm@1:  * srcPitch - distance between source rows (halved to step u16 pointers)
rlm@1:  * (unnamed)- delta buffer argument, not used by this filter
rlm@1:  * dstPtr   - first destination row; 2 * height rows of 2 * width pixels
rlm@1:  *            are written
rlm@1:  * dstPitch - distance between destination rows (halved to step u16 pointers)
rlm@1:  * width    - source pixels per row
rlm@1:  * height   - source rows
rlm@1:  *
rlm@1:  * Each source row produces two destination rows.  For the top and bottom
rlm@1:  * source rows the missing neighbour row is replaced by the row itself.
rlm@1:  *
rlm@1:  * Images narrower than two pixels or without rows are left untouched; the
rlm@1:  * previous code wrapped its row counter for height < 2 and ran out of
rlm@1:  * bounds.  The MMX kernel is only used when width suits it (at least 8 and a
rlm@1:  * multiple of 4); other widths use the C kernel so every pixel is written.
rlm@1:  */
rlm@1: void AdMame2x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
rlm@1:               u8 *dstPtr, u32 dstPitch, int width, int height)
rlm@1: {
rlm@1: 	/* the row kernels need a horizontal neighbour, and there must be a row */
rlm@1: 	if (width < 2 || height < 1)
rlm@1: 		return;
rlm@1: 
rlm@1: 	const u32 srcStride = srcPitch >> 1; /* source row step in u16 units */
rlm@1: 
rlm@1: #ifdef MMX
rlm@1: 	/* 4 pixels per MMX run, first and last run unconditional */
rlm@1: 	const bool useMmx = cpu_mmx && width >= 2 * 4 && (width & 3) == 0;
rlm@1: #endif
rlm@1: 
rlm@1: 	u16 *dst0 = (u16 *)dstPtr;
rlm@1: 	u16 *dst1 = dst0 + (dstPitch >> 1);
rlm@1: 	const u16 *cur = (const u16 *)srcPtr;
rlm@1: 
rlm@1: 	for (int y = 0; y < height; ++y)
rlm@1: 	{
rlm@1: 		/* at the image border the current row stands in for the missing one */
rlm@1: 		const u16 *up   = (y > 0) ? cur - srcStride : cur;
rlm@1: 		const u16 *down = (y + 1 < height) ? cur + srcStride : cur;
rlm@1: 
rlm@1: #ifdef MMX
rlm@1: 		if (useMmx)
rlm@1: 		{
rlm@1: 			internal_scale2x_16_mmx(dst0, dst1, up, cur, down, width);
rlm@1: 		}
rlm@1: 		else
rlm@1: #endif
rlm@1: 		{
rlm@1: 			internal_scale2x_16_def(dst0, up, cur, down, width);
rlm@1: 			internal_scale2x_16_def(dst1, down, cur, up, width);
rlm@1: 		}
rlm@1: 
rlm@1: 		if (y + 1 < height)
rlm@1: 		{
rlm@1: 			cur = down;
rlm@1: 			/* dstPitch u16 elements == two destination rows */
rlm@1: 			dst0 += dstPitch;
rlm@1: 			dst1 += dstPitch;
rlm@1: 		}
rlm@1: 	}
rlm@1: }
rlm@1: 
rlm@1: /*
rlm@1:  * Scale a 32-bit-per-pixel image to twice its width and height with scale2x.
rlm@1:  *
rlm@1:  * srcPtr   - first source row
rlm@1:  * srcPitch - distance between source rows (divided by 4 to step u32 pointers)
rlm@1:  * (unnamed)- delta buffer argument, not used by this filter
rlm@1:  * dstPtr   - first destination row; 2 * height rows of 2 * width pixels
rlm@1:  *            are written
rlm@1:  * dstPitch - distance between destination rows (divided by 4 to step u32
rlm@1:  *            pointers)
rlm@1:  * width    - source pixels per row
rlm@1:  * height   - source rows
rlm@1:  *
rlm@1:  * Each source row produces two destination rows.  For the top and bottom
rlm@1:  * source rows the missing neighbour row is replaced by the row itself.
rlm@1:  *
rlm@1:  * Images narrower than two pixels or without rows are left untouched; the
rlm@1:  * previous code wrapped its row counter for height < 2 and ran out of
rlm@1:  * bounds.  The MMX kernel is only used when width suits it (at least 4 and
rlm@1:  * even); other widths use the C kernel so every pixel is written.
rlm@1:  */
rlm@1: void AdMame2x32(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
rlm@1:                 u8 *dstPtr, u32 dstPitch, int width, int height)
rlm@1: {
rlm@1: 	/* the row kernels need a horizontal neighbour, and there must be a row */
rlm@1: 	if (width < 2 || height < 1)
rlm@1: 		return;
rlm@1: 
rlm@1: 	const u32 srcStride = srcPitch >> 2; /* source row step in u32 units */
rlm@1: 
rlm@1: #ifdef MMX
rlm@1: 	/* 2 pixels per MMX run, first and last run unconditional */
rlm@1: 	const bool useMmx = cpu_mmx && width >= 2 * 2 && (width & 1) == 0;
rlm@1: #endif
rlm@1: 
rlm@1: 	u32 *dst0 = (u32 *)dstPtr;
rlm@1: 	u32 *dst1 = dst0 + (dstPitch >> 2);
rlm@1: 	const u32 *cur = (const u32 *)srcPtr;
rlm@1: 
rlm@1: 	for (int y = 0; y < height; ++y)
rlm@1: 	{
rlm@1: 		/* at the image border the current row stands in for the missing one */
rlm@1: 		const u32 *up   = (y > 0) ? cur - srcStride : cur;
rlm@1: 		const u32 *down = (y + 1 < height) ? cur + srcStride : cur;
rlm@1: 
rlm@1: #ifdef MMX
rlm@1: 		if (useMmx)
rlm@1: 		{
rlm@1: 			internal_scale2x_32_mmx(dst0, dst1, up, cur, down, width);
rlm@1: 		}
rlm@1: 		else
rlm@1: #endif
rlm@1: 		{
rlm@1: 			internal_scale2x_32_def(dst0, up, cur, down, width);
rlm@1: 			internal_scale2x_32_def(dst1, down, cur, up, width);
rlm@1: 		}
rlm@1: 
rlm@1: 		if (y + 1 < height)
rlm@1: 		{
rlm@1: 			cur = down;
rlm@1: 			/* dstPitch / 2 u32 elements == two destination rows */
rlm@1: 			dst0 += dstPitch >> 1;
rlm@1: 			dst1 += dstPitch >> 1;
rlm@1: 		}
rlm@1: 	}
rlm@1: }