annotate src/filters/admame.cpp @ 247:22f58fa47c3c

starting work on unoptimized bootstrap script.
author Robert McIntyre <rlm@mit.edu>
date Mon, 26 Mar 2012 03:08:54 -0500
parents f9f4f1b99eed
children
rev   line source
rlm@1 1 /*
rlm@1 2 * This file is part of the Advance project.
rlm@1 3 *
rlm@1 4 * Copyright (C) 1999-2002 Andrea Mazzoleni
rlm@1 5 *
rlm@1 6 * This program is free software; you can redistribute it and/or modify
rlm@1 7 * it under the terms of the GNU General Public License as published by
rlm@1 8 * the Free Software Foundation; either version 2 of the License, or
rlm@1 9 * (at your option) any later version.
rlm@1 10 *
rlm@1 11 * This program is distributed in the hope that it will be useful,
rlm@1 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
rlm@1 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
rlm@1 14 * GNU General Public License for more details.
rlm@1 15 *
rlm@1 16 * You should have received a copy of the GNU General Public License
rlm@1 17 * along with this program; if not, write to the Free Software
rlm@1 18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
rlm@1 19 *
rlm@1 20 * In addition, as a special exception, Andrea Mazzoleni
rlm@1 21 * gives permission to link the code of this program with
rlm@1 22 * the MAME library (or with modified versions of MAME that use the
rlm@1 23 * same license as MAME), and distribute linked combinations including
rlm@1 24 * the two. You must obey the GNU General Public License in all
rlm@1 25 * respects for all of the code used other than MAME. If you modify
rlm@1 26 * this file, you may extend this exception to your version of the
rlm@1 27 * file, but you are not obligated to do so. If you do not wish to
rlm@1 28 * do so, delete this exception statement from your version.
rlm@1 29 */
rlm@1 30
rlm@1 31 /*
rlm@1 32 * Alternatively at the previous license terms, you are allowed to use this
rlm@1 33 * code in your program with these conditions:
rlm@1 34 * - the program is not used in commercial activities.
rlm@1 35 * - the whole source code of the program is released with the binary.
rlm@1 36 */
rlm@1 37
rlm@1 38 #include "../Port.h"
rlm@1 39
rlm@1 40 #ifdef MMX
rlm@1 41 extern "C" bool cpu_mmx;
rlm@1 42 #endif
rlm@1 43
/*
 * Portable Scale2x row kernel for 16-bit pixels.
 *
 * Writes one double-width output row (2 * count pixels) to dst.
 *
 *   dst   - output row, must have room for 2 * count pixels
 *   src0  - neighbouring source row treated as the "upper" row
 *   src1  - source row being scaled
 *   src2  - neighbouring source row on the opposite side ("lower")
 *   count - number of pixels in each source row
 *
 * Every source pixel produces a left and a right output pixel. An output
 * pixel takes the upper neighbour's value when the horizontal neighbour on
 * its own side equals the upper neighbour (and the upper/lower and left/right
 * neighbours differ); otherwise it repeats the source pixel. Calling the
 * kernel a second time with src0 and src2 swapped yields the second output
 * row for the same source row.
 *
 * Rows of fewer than two pixels have no horizontal neighbours: a single
 * pixel is simply duplicated and an empty row writes nothing.
 */
static void internal_scale2x_16_def(u16 *dst, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)
{
  /* The edge handling below needs two pixels; without this guard the
     unsigned `count - 2` would wrap and the loop would overrun the rows. */
  if (count < 2) {
    if (count == 1) {
      dst[0] = src1[0];
      dst[1] = src1[0];
    }
    return;
  }

  /* first pixel: there is no left neighbour, so the left output is a copy */
  dst[0] = src1[0];
  dst[1] = (src1[1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];
  ++src0;
  ++src1;
  ++src2;
  dst += 2;

  /* central pixels: both horizontal neighbours exist */
  for (unsigned remaining = count - 2; remaining != 0; --remaining) {
    const u16 upper = src0[0];
    const u16 current = src1[0];
    const u16 left = src1[-1];
    const u16 right = src1[1];

    if (upper != src2[0] && left != right) {
      dst[0] = (left == upper) ? upper : current;
      dst[1] = (right == upper) ? upper : current;
    } else {
      dst[0] = current;
      dst[1] = current;
    }

    ++src0;
    ++src1;
    ++src2;
    dst += 2;
  }

  /* last pixel: there is no right neighbour, so the right output is a copy */
  dst[0] = (src1[-1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];
  dst[1] = src1[0];
}
rlm@1 86
/*
 * Portable Scale2x row kernel for 32-bit pixels.
 *
 * Writes one double-width output row (2 * count pixels) to dst.
 *
 *   dst   - output row, must have room for 2 * count pixels
 *   src0  - neighbouring source row treated as the "upper" row
 *   src1  - source row being scaled
 *   src2  - neighbouring source row on the opposite side ("lower")
 *   count - number of pixels in each source row
 *
 * Every source pixel produces a left and a right output pixel. An output
 * pixel takes the upper neighbour's value when the horizontal neighbour on
 * its own side equals the upper neighbour (and the upper/lower and left/right
 * neighbours differ); otherwise it repeats the source pixel. Calling the
 * kernel a second time with src0 and src2 swapped yields the second output
 * row for the same source row.
 *
 * Rows of fewer than two pixels have no horizontal neighbours: a single
 * pixel is simply duplicated and an empty row writes nothing.
 */
static void internal_scale2x_32_def(u32 *dst,
                                    const u32 *src0,
                                    const u32 *src1,
                                    const u32 *src2,
                                    unsigned count)
{
  /* The edge handling below needs two pixels; without this guard the
     unsigned `count - 2` would wrap and the loop would overrun the rows. */
  if (count < 2) {
    if (count == 1) {
      dst[0] = src1[0];
      dst[1] = src1[0];
    }
    return;
  }

  /* first pixel: there is no left neighbour, so the left output is a copy */
  dst[0] = src1[0];
  dst[1] = (src1[1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];
  ++src0;
  ++src1;
  ++src2;
  dst += 2;

  /* central pixels: both horizontal neighbours exist */
  for (unsigned remaining = count - 2; remaining != 0; --remaining) {
    const u32 upper = src0[0];
    const u32 current = src1[0];
    const u32 left = src1[-1];
    const u32 right = src1[1];

    if (upper != src2[0] && left != right) {
      dst[0] = (left == upper) ? upper : current;
      dst[1] = (right == upper) ? upper : current;
    } else {
      dst[0] = current;
      dst[1] = current;
    }

    ++src0;
    ++src1;
    ++src2;
    dst += 2;
  }

  /* last pixel: there is no right neighbour, so the right output is a copy */
  dst[0] = (src1[-1] == src0[0] && src2[0] != src0[0]) ? src0[0] : src1[0];
  dst[1] = src1[0];
}
rlm@1 133
rlm@1 134 #ifdef MMX
/*
 * MMX Scale2x row kernel for 16-bit pixels: writes one double-width output
 * row to dst.
 *
 *   dst   - output row (16 bytes are stored per 8 bytes of source)
 *   src0  - neighbouring row loaded as "current_upper"
 *   src1  - row being scaled ("current")
 *   src2  - neighbouring row on the opposite side
 *   count - row length in pixels
 *
 * Pixels are handled in runs of four (one 8-byte movq of src1 per run): a
 * first run, zero or more central runs, and a final run. count is reduced by
 * 2 * 4 up front because the first and final runs always execute; the
 * remainder is shifted right by 2 to give the number of central runs.
 *
 * Per run the code builds three vectors: the current pixels (mm7), the
 * pixels shifted so each lane holds its left neighbour (mm0, "current_pre")
 * and its right neighbour (mm1, "current_next"). In the first run the left
 * neighbour of the first pixel is the pixel itself; in the final run the
 * right neighbour of the last pixel is the pixel itself. An output pixel is
 * the upper pixel where the neighbour on its side equals the upper pixel,
 * differs from the src2 pixel and differs from the opposite neighbour;
 * otherwise it is the current pixel. Left and right results are interleaved
 * with punpcklwd/punpckhwd before being stored.
 *
 * NOTE(review): count is unsigned, so a count below 8 wraps in the
 * subtraction below, and a count that is not a multiple of 4 loses its
 * remainder in the shift - callers presumably guarantee both; confirm.
 * NOTE(review): the GNU variant uses 32-bit addl/shrl/decl on its operands
 * and lists only "cc" as clobbered (no "memory", no MMX registers) - looks
 * like it is meant for 32-bit x86 builds only; confirm.
 */
rlm@1 135 static void internal_scale2x_16_mmx_single(u16 *dst, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)
rlm@1 136 {
rlm@1 137 /* always do the first and last run */
rlm@1 138 count -= 2 * 4;
rlm@1 139
/* GNU inline-asm variant (AT&T syntax).
   Operands: %0 = src0, %1 = src1, %2 = src2, %3 = dst, %4 = count. */
rlm@1 140 #ifdef __GNUC__
rlm@1 141 __asm__ __volatile__ (
rlm@1 142 /* first run */
rlm@1 143 /* set the current, current_pre, current_next registers */
rlm@1 144 "movq 0(%1), %%mm0\n"
rlm@1 145 "movq 0(%1),%%mm7\n"
rlm@1 146 "movq 8(%1),%%mm1\n"
rlm@1 147 "psllq $48,%%mm0\n"
rlm@1 148 "psllq $48,%%mm1\n"
rlm@1 149 "psrlq $48, %%mm0\n"
rlm@1 150 "movq %%mm7,%%mm2\n"
rlm@1 151 "movq %%mm7,%%mm3\n"
rlm@1 152 "psllq $16,%%mm2\n"
rlm@1 153 "psrlq $16,%%mm3\n"
rlm@1 154 "por %%mm2,%%mm0\n"
rlm@1 155 "por %%mm3,%%mm1\n"
rlm@1 156
rlm@1 157 /* current_upper */
rlm@1 158 "movq (%0),%%mm6\n"
rlm@1 159
rlm@1 160 /* compute the upper-left pixel for dst on %%mm2 */
rlm@1 161 /* compute the upper-right pixel for dst on %%mm4 */
rlm@1 162 "movq %%mm0,%%mm2\n"
rlm@1 163 "movq %%mm1,%%mm4\n"
rlm@1 164 "movq %%mm0,%%mm3\n"
rlm@1 165 "movq %%mm1,%%mm5\n"
rlm@1 166 "pcmpeqw %%mm6,%%mm2\n"
rlm@1 167 "pcmpeqw %%mm6,%%mm4\n"
rlm@1 168 "pcmpeqw (%2),%%mm3\n"
rlm@1 169 "pcmpeqw (%2),%%mm5\n"
rlm@1 170 "pandn %%mm2,%%mm3\n"
rlm@1 171 "pandn %%mm4,%%mm5\n"
rlm@1 172 "movq %%mm0,%%mm2\n"
rlm@1 173 "movq %%mm1,%%mm4\n"
rlm@1 174 "pcmpeqw %%mm1,%%mm2\n"
rlm@1 175 "pcmpeqw %%mm0,%%mm4\n"
rlm@1 176 "pandn %%mm3,%%mm2\n"
rlm@1 177 "pandn %%mm5,%%mm4\n"
rlm@1 178 "movq %%mm2,%%mm3\n"
rlm@1 179 "movq %%mm4,%%mm5\n"
rlm@1 180 "pand %%mm6,%%mm2\n"
rlm@1 181 "pand %%mm6,%%mm4\n"
rlm@1 182 "pandn %%mm7,%%mm3\n"
rlm@1 183 "pandn %%mm7,%%mm5\n"
rlm@1 184 "por %%mm3,%%mm2\n"
rlm@1 185 "por %%mm5,%%mm4\n"
rlm@1 186
rlm@1 187 /* set *dst */
rlm@1 188 "movq %%mm2,%%mm3\n"
rlm@1 189 "punpcklwd %%mm4,%%mm2\n"
rlm@1 190 "punpckhwd %%mm4,%%mm3\n"
rlm@1 191 "movq %%mm2,(%3)\n"
rlm@1 192 "movq %%mm3,8(%3)\n"
rlm@1 193
rlm@1 194 /* next */
rlm@1 195 "addl $8,%0\n"
rlm@1 196 "addl $8,%1\n"
rlm@1 197 "addl $8,%2\n"
rlm@1 198 "addl $16,%3\n"
rlm@1 199
rlm@1 200 /* central runs */
rlm@1 201 "shrl $2,%4\n"
rlm@1 202 "jz 1f\n"
rlm@1 203
rlm@1 204 "0:\n"
rlm@1 205
rlm@1 206 /* set the current, current_pre, current_next registers */
rlm@1 207 "movq -8(%1),%%mm0\n"
rlm@1 208 "movq (%1),%%mm7\n"
rlm@1 209 "movq 8(%1),%%mm1\n"
rlm@1 210 "psrlq $48,%%mm0\n"
rlm@1 211 "psllq $48,%%mm1\n"
rlm@1 212 "movq %%mm7,%%mm2\n"
rlm@1 213 "movq %%mm7,%%mm3\n"
rlm@1 214 "psllq $16,%%mm2\n"
rlm@1 215 "psrlq $16,%%mm3\n"
rlm@1 216 "por %%mm2,%%mm0\n"
rlm@1 217 "por %%mm3,%%mm1\n"
rlm@1 218
rlm@1 219 /* current_upper */
rlm@1 220 "movq (%0),%%mm6\n"
rlm@1 221
rlm@1 222 /* compute the upper-left pixel for dst on %%mm2 */
rlm@1 223 /* compute the upper-right pixel for dst on %%mm4 */
rlm@1 224 "movq %%mm0,%%mm2\n"
rlm@1 225 "movq %%mm1,%%mm4\n"
rlm@1 226 "movq %%mm0,%%mm3\n"
rlm@1 227 "movq %%mm1,%%mm5\n"
rlm@1 228 "pcmpeqw %%mm6,%%mm2\n"
rlm@1 229 "pcmpeqw %%mm6,%%mm4\n"
rlm@1 230 "pcmpeqw (%2),%%mm3\n"
rlm@1 231 "pcmpeqw (%2),%%mm5\n"
rlm@1 232 "pandn %%mm2,%%mm3\n"
rlm@1 233 "pandn %%mm4,%%mm5\n"
rlm@1 234 "movq %%mm0,%%mm2\n"
rlm@1 235 "movq %%mm1,%%mm4\n"
rlm@1 236 "pcmpeqw %%mm1,%%mm2\n"
rlm@1 237 "pcmpeqw %%mm0,%%mm4\n"
rlm@1 238 "pandn %%mm3,%%mm2\n"
rlm@1 239 "pandn %%mm5,%%mm4\n"
rlm@1 240 "movq %%mm2,%%mm3\n"
rlm@1 241 "movq %%mm4,%%mm5\n"
rlm@1 242 "pand %%mm6,%%mm2\n"
rlm@1 243 "pand %%mm6,%%mm4\n"
rlm@1 244 "pandn %%mm7,%%mm3\n"
rlm@1 245 "pandn %%mm7,%%mm5\n"
rlm@1 246 "por %%mm3,%%mm2\n"
rlm@1 247 "por %%mm5,%%mm4\n"
rlm@1 248
rlm@1 249 /* set *dst */
rlm@1 250 "movq %%mm2,%%mm3\n"
rlm@1 251 "punpcklwd %%mm4,%%mm2\n"
rlm@1 252 "punpckhwd %%mm4,%%mm3\n"
rlm@1 253 "movq %%mm2,(%3)\n"
rlm@1 254 "movq %%mm3,8(%3)\n"
rlm@1 255
rlm@1 256 /* next */
rlm@1 257 "addl $8,%0\n"
rlm@1 258 "addl $8,%1\n"
rlm@1 259 "addl $8,%2\n"
rlm@1 260 "addl $16,%3\n"
rlm@1 261
rlm@1 262 "decl %4\n"
rlm@1 263 "jnz 0b\n"
rlm@1 264 "1:\n"
rlm@1 265
rlm@1 266 /* final run */
rlm@1 267 /* set the current, current_pre, current_next registers */
rlm@1 268 "movq (%1),%%mm1\n"
rlm@1 269 "movq (%1),%%mm7\n"
rlm@1 270 "movq -8(%1),%%mm0\n"
rlm@1 271 "psrlq $48,%%mm1\n"
rlm@1 272 "psrlq $48,%%mm0\n"
rlm@1 273 "psllq $48,%%mm1\n"
rlm@1 274 "movq %%mm7,%%mm2\n"
rlm@1 275 "movq %%mm7,%%mm3\n"
rlm@1 276 "psllq $16,%%mm2\n"
rlm@1 277 "psrlq $16,%%mm3\n"
rlm@1 278 "por %%mm2,%%mm0\n"
rlm@1 279 "por %%mm3,%%mm1\n"
rlm@1 280
rlm@1 281 /* current_upper */
rlm@1 282 "movq (%0),%%mm6\n"
rlm@1 283
rlm@1 284 /* compute the upper-left pixel for dst on %%mm2 */
rlm@1 285 /* compute the upper-right pixel for dst on %%mm4 */
rlm@1 286 "movq %%mm0,%%mm2\n"
rlm@1 287 "movq %%mm1,%%mm4\n"
rlm@1 288 "movq %%mm0,%%mm3\n"
rlm@1 289 "movq %%mm1,%%mm5\n"
rlm@1 290 "pcmpeqw %%mm6,%%mm2\n"
rlm@1 291 "pcmpeqw %%mm6,%%mm4\n"
rlm@1 292 "pcmpeqw (%2),%%mm3\n"
rlm@1 293 "pcmpeqw (%2),%%mm5\n"
rlm@1 294 "pandn %%mm2,%%mm3\n"
rlm@1 295 "pandn %%mm4,%%mm5\n"
rlm@1 296 "movq %%mm0,%%mm2\n"
rlm@1 297 "movq %%mm1,%%mm4\n"
rlm@1 298 "pcmpeqw %%mm1,%%mm2\n"
rlm@1 299 "pcmpeqw %%mm0,%%mm4\n"
rlm@1 300 "pandn %%mm3,%%mm2\n"
rlm@1 301 "pandn %%mm5,%%mm4\n"
rlm@1 302 "movq %%mm2,%%mm3\n"
rlm@1 303 "movq %%mm4,%%mm5\n"
rlm@1 304 "pand %%mm6,%%mm2\n"
rlm@1 305 "pand %%mm6,%%mm4\n"
rlm@1 306 "pandn %%mm7,%%mm3\n"
rlm@1 307 "pandn %%mm7,%%mm5\n"
rlm@1 308 "por %%mm3,%%mm2\n"
rlm@1 309 "por %%mm5,%%mm4\n"
rlm@1 310
rlm@1 311 /* set *dst */
rlm@1 312 "movq %%mm2,%%mm3\n"
rlm@1 313 "punpcklwd %%mm4,%%mm2\n"
rlm@1 314 "punpckhwd %%mm4,%%mm3\n"
rlm@1 315 "movq %%mm2,(%3)\n"
rlm@1 316 "movq %%mm3,8(%3)\n"
rlm@1 317 "emms\n"
rlm@1 318
rlm@1 319 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
rlm@1 320 :
rlm@1 321 : "cc"
rlm@1 322 );
rlm@1 323 #else
/* Intel-syntax __asm variant for non-GNU compilers.
   Registers: eax = src0, ebx = src1, ecx = src2, edx = dst, esi = count. */
rlm@1 324 __asm {
rlm@1 325 mov eax, src0;
rlm@1 326 mov ebx, src1;
rlm@1 327 mov ecx, src2;
rlm@1 328 mov edx, dst;
rlm@1 329 mov esi, count;
rlm@1 330
rlm@1 331 /* first run */
rlm@1 332 /* set the current, current_pre, current_next registers */
rlm@1 333 movq mm0, qword ptr [ebx];
rlm@1 334 movq mm7, qword ptr [ebx];
rlm@1 335 movq mm1, qword ptr [ebx + 8];
rlm@1 336 psllq mm0, 48;
rlm@1 337 psllq mm1, 48;
rlm@1 338 psrlq mm0, 48;
rlm@1 339 movq mm2, mm7;
rlm@1 340 movq mm3, mm7;
rlm@1 341 psllq mm2, 16;
rlm@1 342 psrlq mm3, 16;
rlm@1 343 por mm0, mm2;
rlm@1 344 por mm1, mm3;
rlm@1 345
rlm@1 346 /* current_upper */
rlm@1 347 movq mm6, qword ptr [eax];
rlm@1 348
rlm@1 349 /* compute the upper-left pixel for dst on %%mm2 */
rlm@1 350 /* compute the upper-right pixel for dst on %%mm4 */
rlm@1 351 movq mm2, mm0;
rlm@1 352 movq mm4, mm1;
rlm@1 353 movq mm3, mm0;
rlm@1 354 movq mm5, mm1;
rlm@1 355 pcmpeqw mm2, mm6;
rlm@1 356 pcmpeqw mm4, mm6;
rlm@1 357 pcmpeqw mm3, qword ptr [ecx];
rlm@1 358 pcmpeqw mm5, qword ptr [ecx];
rlm@1 359 pandn mm3, mm2;
rlm@1 360 pandn mm5, mm4;
rlm@1 361 movq mm2, mm0;
rlm@1 362 movq mm4, mm1;
rlm@1 363 pcmpeqw mm2, mm1;
rlm@1 364 pcmpeqw mm4, mm0;
rlm@1 365 pandn mm2, mm3;
rlm@1 366 pandn mm4, mm5;
rlm@1 367 movq mm3, mm2;
rlm@1 368 movq mm5, mm4;
rlm@1 369 pand mm2, mm6;
rlm@1 370 pand mm4, mm6;
rlm@1 371 pandn mm3, mm7;
rlm@1 372 pandn mm5, mm7;
rlm@1 373 por mm2, mm3;
rlm@1 374 por mm4, mm5;
rlm@1 375
rlm@1 376 /* set *dst */
rlm@1 377 movq mm3, mm2;
rlm@1 378 punpcklwd mm2, mm4;
rlm@1 379 punpckhwd mm3, mm4;
rlm@1 380 movq qword ptr [edx], mm2;
rlm@1 381 movq qword ptr [edx + 8], mm3;
rlm@1 382
rlm@1 383 /* next */
rlm@1 384 add eax, 8;
rlm@1 385 add ebx, 8;
rlm@1 386 add ecx, 8;
rlm@1 387 add edx, 16;
rlm@1 388
rlm@1 389 /* central runs */
rlm@1 390 shr esi, 2;
rlm@1 391 jz label1;
rlm@1 392 align 4;
rlm@1 393 label0:
rlm@1 394
rlm@1 395 /* set the current, current_pre, current_next registers */
rlm@1 396 movq mm0, qword ptr [ebx - 8];
rlm@1 397 movq mm7, qword ptr [ebx];
rlm@1 398 movq mm1, qword ptr [ebx + 8];
rlm@1 399 psrlq mm0, 48;
rlm@1 400 psllq mm1, 48;
rlm@1 401 movq mm2, mm7;
rlm@1 402 movq mm3, mm7;
rlm@1 403 psllq mm2, 16;
rlm@1 404 psrlq mm3, 16;
rlm@1 405 por mm0, mm2;
rlm@1 406 por mm1, mm3;
rlm@1 407
rlm@1 408 /* current_upper */
rlm@1 409 movq mm6, qword ptr [eax];
rlm@1 410
rlm@1 411 /* compute the upper-left pixel for dst on %%mm2 */
rlm@1 412 /* compute the upper-right pixel for dst on %%mm4 */
rlm@1 413 movq mm2, mm0;
rlm@1 414 movq mm4, mm1;
rlm@1 415 movq mm3, mm0;
rlm@1 416 movq mm5, mm1;
rlm@1 417 pcmpeqw mm2, mm6;
rlm@1 418 pcmpeqw mm4, mm6;
rlm@1 419 pcmpeqw mm3, qword ptr [ecx];
rlm@1 420 pcmpeqw mm5, qword ptr [ecx];
rlm@1 421 pandn mm3, mm2;
rlm@1 422 pandn mm5, mm4;
rlm@1 423 movq mm2, mm0;
rlm@1 424 movq mm4, mm1;
rlm@1 425 pcmpeqw mm2, mm1;
rlm@1 426 pcmpeqw mm4, mm0;
rlm@1 427 pandn mm2, mm3;
rlm@1 428 pandn mm4, mm5;
rlm@1 429 movq mm3, mm2;
rlm@1 430 movq mm5, mm4;
rlm@1 431 pand mm2, mm6;
rlm@1 432 pand mm4, mm6;
rlm@1 433 pandn mm3, mm7;
rlm@1 434 pandn mm5, mm7;
rlm@1 435 por mm2, mm3;
rlm@1 436 por mm4, mm5;
rlm@1 437
rlm@1 438 /* set *dst */
rlm@1 439 movq mm3, mm2;
rlm@1 440 punpcklwd mm2, mm4;
rlm@1 441 punpckhwd mm3, mm4;
rlm@1 442 movq qword ptr [edx], mm2;
rlm@1 443 movq qword ptr [edx + 8], mm3;
rlm@1 444
rlm@1 445 /* next */
rlm@1 446 add eax, 8;
rlm@1 447 add ebx, 8;
rlm@1 448 add ecx, 8;
rlm@1 449 add edx, 16;
rlm@1 450
rlm@1 451 dec esi;
rlm@1 452 jnz label0;
rlm@1 453 label1:
rlm@1 454
rlm@1 455 /* final run */
rlm@1 456 /* set the current, current_pre, current_next registers */
rlm@1 457 movq mm1, qword ptr [ebx];
rlm@1 458 movq mm7, qword ptr [ebx];
rlm@1 459 movq mm0, qword ptr [ebx - 8];
rlm@1 460 psrlq mm1, 48;
rlm@1 461 psrlq mm0, 48;
rlm@1 462 psllq mm1, 48;
rlm@1 463 movq mm2, mm7;
rlm@1 464 movq mm3, mm7;
rlm@1 465 psllq mm2, 16;
rlm@1 466 psrlq mm3, 16;
rlm@1 467 por mm0, mm2;
rlm@1 468 por mm1, mm3;
rlm@1 469
rlm@1 470 /* current_upper */
rlm@1 471 movq mm6, qword ptr [eax];
rlm@1 472
rlm@1 473 /* compute the upper-left pixel for dst on %%mm2 */
rlm@1 474 /* compute the upper-right pixel for dst on %%mm4 */
rlm@1 475 movq mm2, mm0;
rlm@1 476 movq mm4, mm1;
rlm@1 477 movq mm3, mm0;
rlm@1 478 movq mm5, mm1;
rlm@1 479 pcmpeqw mm2, mm6;
rlm@1 480 pcmpeqw mm4, mm6;
rlm@1 481 pcmpeqw mm3, qword ptr [ecx];
rlm@1 482 pcmpeqw mm5, qword ptr [ecx];
rlm@1 483 pandn mm3, mm2;
rlm@1 484 pandn mm5, mm4;
rlm@1 485 movq mm2, mm0;
rlm@1 486 movq mm4, mm1;
rlm@1 487 pcmpeqw mm2, mm1;
rlm@1 488 pcmpeqw mm4, mm0;
rlm@1 489 pandn mm2, mm3;
rlm@1 490 pandn mm4, mm5;
rlm@1 491 movq mm3, mm2;
rlm@1 492 movq mm5, mm4;
rlm@1 493 pand mm2, mm6;
rlm@1 494 pand mm4, mm6;
rlm@1 495 pandn mm3, mm7;
rlm@1 496 pandn mm5, mm7;
rlm@1 497 por mm2, mm3;
rlm@1 498 por mm4, mm5;
rlm@1 499
rlm@1 500 /* set *dst */
rlm@1 501 movq mm3, mm2;
rlm@1 502 punpcklwd mm2, mm4;
rlm@1 503 punpckhwd mm3, mm4;
rlm@1 504 movq qword ptr [edx], mm2;
rlm@1 505 movq qword ptr [edx + 8], mm3;
rlm@1 506
rlm@1 507 mov src0, eax;
rlm@1 508 mov src1, ebx;
rlm@1 509 mov src2, ecx;
rlm@1 510 mov dst, edx;
rlm@1 511 mov count, esi;
rlm@1 512
rlm@1 513 emms;
rlm@1 514 }
rlm@1 515 #endif
rlm@1 516 }
rlm@1 517
/*
 * MMX Scale2x row kernel for 32-bit pixels: writes one double-width output
 * row to dst.
 *
 *   dst   - output row (16 bytes are stored per 8 bytes of source)
 *   src0  - neighbouring row loaded as "current_upper"
 *   src1  - row being scaled ("current")
 *   src2  - neighbouring row on the opposite side
 *   count - row length in pixels
 *
 * Pixels are handled in runs of two (one 8-byte movq of src1 per run): a
 * first run, zero or more central runs, and a final run. count is reduced by
 * 2 * 2 up front because the first and final runs always execute; the
 * remainder is shifted right by 1 to give the number of central runs.
 *
 * Per run the code builds three vectors: the current pixels (mm7), the
 * pixels shifted so each lane holds its left neighbour (mm0, "current_pre")
 * and its right neighbour (mm1, "current_next"). In the first run the left
 * neighbour of the first pixel is the pixel itself; in the final run the
 * right neighbour of the last pixel is the pixel itself. An output pixel is
 * the upper pixel where the neighbour on its side equals the upper pixel,
 * differs from the src2 pixel and differs from the opposite neighbour;
 * otherwise it is the current pixel. Left and right results are interleaved
 * with punpckldq/punpckhdq before being stored.
 *
 * NOTE(review): count is unsigned, so a count below 4 wraps in the
 * subtraction below, and an odd count loses its remainder in the shift -
 * callers presumably guarantee both; confirm.
 * NOTE(review): the GNU variant uses 32-bit addl/shrl/decl on its operands
 * and lists only "cc" as clobbered (no "memory", no MMX registers) - looks
 * like it is meant for 32-bit x86 builds only; confirm.
 */
rlm@1 518 static void internal_scale2x_32_mmx_single(u32 *dst, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count)
rlm@1 519 {
rlm@1 520 /* always do the first and last run */
rlm@1 521 count -= 2 * 2;
rlm@1 522
/* GNU inline-asm variant (AT&T syntax).
   Operands: %0 = src0, %1 = src1, %2 = src2, %3 = dst, %4 = count. */
rlm@1 523 #ifdef __GNUC__
rlm@1 524 __asm__ __volatile__ (
rlm@1 525 /* first run */
rlm@1 526 /* set the current, current_pre, current_next registers */
rlm@1 527 "movq 0(%1),%%mm0\n"
rlm@1 528 "movq 0(%1),%%mm7\n"
rlm@1 529 "movq 8(%1),%%mm1\n"
rlm@1 530 "psllq $32,%%mm0\n"
rlm@1 531 "psllq $32,%%mm1\n"
rlm@1 532 "psrlq $32,%%mm0\n"
rlm@1 533 "movq %%mm7,%%mm2\n"
rlm@1 534 "movq %%mm7,%%mm3\n"
rlm@1 535 "psllq $32,%%mm2\n"
rlm@1 536 "psrlq $32,%%mm3\n"
rlm@1 537 "por %%mm2,%%mm0\n"
rlm@1 538 "por %%mm3,%%mm1\n"
rlm@1 539
rlm@1 540 /* current_upper */
rlm@1 541 "movq (%0),%%mm6\n"
rlm@1 542
rlm@1 543 /* compute the upper-left pixel for dst on %%mm2 */
rlm@1 544 /* compute the upper-right pixel for dst on %%mm4 */
rlm@1 545 "movq %%mm0,%%mm2\n"
rlm@1 546 "movq %%mm1,%%mm4\n"
rlm@1 547 "movq %%mm0,%%mm3\n"
rlm@1 548 "movq %%mm1,%%mm5\n"
rlm@1 549 "pcmpeqd %%mm6,%%mm2\n"
rlm@1 550 "pcmpeqd %%mm6,%%mm4\n"
rlm@1 551 "pcmpeqd (%2),%%mm3\n"
rlm@1 552 "pcmpeqd (%2),%%mm5\n"
rlm@1 553 "pandn %%mm2,%%mm3\n"
rlm@1 554 "pandn %%mm4,%%mm5\n"
rlm@1 555 "movq %%mm0,%%mm2\n"
rlm@1 556 "movq %%mm1,%%mm4\n"
rlm@1 557 "pcmpeqd %%mm1,%%mm2\n"
rlm@1 558 "pcmpeqd %%mm0,%%mm4\n"
rlm@1 559 "pandn %%mm3,%%mm2\n"
rlm@1 560 "pandn %%mm5,%%mm4\n"
rlm@1 561 "movq %%mm2,%%mm3\n"
rlm@1 562 "movq %%mm4,%%mm5\n"
rlm@1 563 "pand %%mm6,%%mm2\n"
rlm@1 564 "pand %%mm6,%%mm4\n"
rlm@1 565 "pandn %%mm7,%%mm3\n"
rlm@1 566 "pandn %%mm7,%%mm5\n"
rlm@1 567 "por %%mm3,%%mm2\n"
rlm@1 568 "por %%mm5,%%mm4\n"
rlm@1 569
rlm@1 570 /* set *dst */
rlm@1 571 "movq %%mm2,%%mm3\n"
rlm@1 572 "punpckldq %%mm4,%%mm2\n"
rlm@1 573 "punpckhdq %%mm4,%%mm3\n"
rlm@1 574 "movq %%mm2,(%3)\n"
rlm@1 575 "movq %%mm3, 8(%3)\n"
rlm@1 576
rlm@1 577 /* next */
rlm@1 578 "addl $8,%0\n"
rlm@1 579 "addl $8,%1\n"
rlm@1 580 "addl $8,%2\n"
rlm@1 581 "addl $16,%3\n"
rlm@1 582
rlm@1 583 /* central runs */
rlm@1 584 "shrl $1,%4\n"
rlm@1 585 "jz 1f\n"
rlm@1 586
rlm@1 587 "0:\n"
rlm@1 588
rlm@1 589 /* set the current, current_pre, current_next registers */
rlm@1 590 "movq -8(%1),%%mm0\n"
rlm@1 591 "movq (%1),%%mm7\n"
rlm@1 592 "movq 8(%1),%%mm1\n"
rlm@1 593 "psrlq $32,%%mm0\n"
rlm@1 594 "psllq $32,%%mm1\n"
rlm@1 595 "movq %%mm7,%%mm2\n"
rlm@1 596 "movq %%mm7,%%mm3\n"
rlm@1 597 "psllq $32,%%mm2\n"
rlm@1 598 "psrlq $32,%%mm3\n"
rlm@1 599 "por %%mm2,%%mm0\n"
rlm@1 600 "por %%mm3,%%mm1\n"
rlm@1 601
rlm@1 602 /* current_upper */
rlm@1 603 "movq (%0),%%mm6\n"
rlm@1 604
rlm@1 605 /* compute the upper-left pixel for dst on %%mm2 */
rlm@1 606 /* compute the upper-right pixel for dst on %%mm4 */
rlm@1 607 "movq %%mm0,%%mm2\n"
rlm@1 608 "movq %%mm1,%%mm4\n"
rlm@1 609 "movq %%mm0,%%mm3\n"
rlm@1 610 "movq %%mm1,%%mm5\n"
rlm@1 611 "pcmpeqd %%mm6,%%mm2\n"
rlm@1 612 "pcmpeqd %%mm6,%%mm4\n"
rlm@1 613 "pcmpeqd (%2),%%mm3\n"
rlm@1 614 "pcmpeqd (%2),%%mm5\n"
rlm@1 615 "pandn %%mm2,%%mm3\n"
rlm@1 616 "pandn %%mm4,%%mm5\n"
rlm@1 617 "movq %%mm0,%%mm2\n"
rlm@1 618 "movq %%mm1,%%mm4\n"
rlm@1 619 "pcmpeqd %%mm1,%%mm2\n"
rlm@1 620 "pcmpeqd %%mm0,%%mm4\n"
rlm@1 621 "pandn %%mm3,%%mm2\n"
rlm@1 622 "pandn %%mm5,%%mm4\n"
rlm@1 623 "movq %%mm2,%%mm3\n"
rlm@1 624 "movq %%mm4,%%mm5\n"
rlm@1 625 "pand %%mm6,%%mm2\n"
rlm@1 626 "pand %%mm6,%%mm4\n"
rlm@1 627 "pandn %%mm7,%%mm3\n"
rlm@1 628 "pandn %%mm7,%%mm5\n"
rlm@1 629 "por %%mm3,%%mm2\n"
rlm@1 630 "por %%mm5,%%mm4\n"
rlm@1 631
rlm@1 632 /* set *dst */
rlm@1 633 "movq %%mm2,%%mm3\n"
rlm@1 634 "punpckldq %%mm4,%%mm2\n"
rlm@1 635 "punpckhdq %%mm4,%%mm3\n"
rlm@1 636 "movq %%mm2,(%3)\n"
rlm@1 637 "movq %%mm3,8(%3)\n"
rlm@1 638
rlm@1 639 /* next */
rlm@1 640 "addl $8,%0\n"
rlm@1 641 "addl $8,%1\n"
rlm@1 642 "addl $8,%2\n"
rlm@1 643 "addl $16,%3\n"
rlm@1 644
rlm@1 645 "decl %4\n"
rlm@1 646 "jnz 0b\n"
rlm@1 647 "1:\n"
rlm@1 648
rlm@1 649 /* final run */
rlm@1 650 /* set the current, current_pre, current_next registers */
rlm@1 651 "movq (%1),%%mm1\n"
rlm@1 652 "movq (%1),%%mm7\n"
rlm@1 653 "movq -8(%1), %%mm0\n"
rlm@1 654 "psrlq $32,%%mm1\n"
rlm@1 655 "psrlq $32,%%mm0\n"
rlm@1 656 "psllq $32,%%mm1\n"
rlm@1 657 "movq %%mm7,%%mm2\n"
rlm@1 658 "movq %%mm7,%%mm3\n"
rlm@1 659 "psllq $32,%%mm2\n"
rlm@1 660 "psrlq $32,%%mm3\n"
rlm@1 661 "por %%mm2,%%mm0\n"
rlm@1 662 "por %%mm3,%%mm1\n"
rlm@1 663
rlm@1 664 /* current_upper */
rlm@1 665 "movq (%0),%%mm6\n"
rlm@1 666
rlm@1 667 /* compute the upper-left pixel for dst on %%mm2 */
rlm@1 668 /* compute the upper-right pixel for dst on %%mm4 */
rlm@1 669 "movq %%mm0,%%mm2\n"
rlm@1 670 "movq %%mm1,%%mm4\n"
rlm@1 671 "movq %%mm0,%%mm3\n"
rlm@1 672 "movq %%mm1,%%mm5\n"
rlm@1 673 "pcmpeqd %%mm6,%%mm2\n"
rlm@1 674 "pcmpeqd %%mm6,%%mm4\n"
rlm@1 675 "pcmpeqd (%2),%%mm3\n"
rlm@1 676 "pcmpeqd (%2),%%mm5\n"
rlm@1 677 "pandn %%mm2,%%mm3\n"
rlm@1 678 "pandn %%mm4,%%mm5\n"
rlm@1 679 "movq %%mm0,%%mm2\n"
rlm@1 680 "movq %%mm1,%%mm4\n"
rlm@1 681 "pcmpeqd %%mm1,%%mm2\n"
rlm@1 682 "pcmpeqd %%mm0,%%mm4\n"
rlm@1 683 "pandn %%mm3,%%mm2\n"
rlm@1 684 "pandn %%mm5,%%mm4\n"
rlm@1 685 "movq %%mm2,%%mm3\n"
rlm@1 686 "movq %%mm4,%%mm5\n"
rlm@1 687 "pand %%mm6,%%mm2\n"
rlm@1 688 "pand %%mm6,%%mm4\n"
rlm@1 689 "pandn %%mm7,%%mm3\n"
rlm@1 690 "pandn %%mm7,%%mm5\n"
rlm@1 691 "por %%mm3,%%mm2\n"
rlm@1 692 "por %%mm5,%%mm4\n"
rlm@1 693
rlm@1 694 /* set *dst */
rlm@1 695 "movq %%mm2,%%mm3\n"
rlm@1 696 "punpckldq %%mm4,%%mm2\n"
rlm@1 697 "punpckhdq %%mm4,%%mm3\n"
rlm@1 698 "movq %%mm2,(%3)\n"
rlm@1 699 "movq %%mm3,8(%3)\n"
rlm@1 700 "emms\n"
rlm@1 701
rlm@1 702 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
rlm@1 703 :
rlm@1 704 : "cc"
rlm@1 705 );
rlm@1 706 #else
/* Intel-syntax __asm variant for non-GNU compilers.
   Registers: eax = src0, ebx = src1, ecx = src2, edx = dst, esi = count. */
rlm@1 707 __asm {
rlm@1 708 mov eax, src0;
rlm@1 709 mov ebx, src1;
rlm@1 710 mov ecx, src2;
rlm@1 711 mov edx, dst;
rlm@1 712 mov esi, count;
rlm@1 713
rlm@1 714 /* first run */
rlm@1 715 /* set the current, current_pre, current_next registers */
rlm@1 716 movq mm0, qword ptr [ebx];
rlm@1 717 movq mm7, qword ptr [ebx];
rlm@1 718 movq mm1, qword ptr [ebx + 8];
rlm@1 719 psllq mm0, 32;
rlm@1 720 psllq mm1, 32;
rlm@1 721 psrlq mm0, 32;
rlm@1 722 movq mm2, mm7;
rlm@1 723 movq mm3, mm7;
rlm@1 724 psllq mm2, 32;
rlm@1 725 psrlq mm3, 32;
rlm@1 726 por mm0, mm2;
rlm@1 727 por mm1, mm3;
rlm@1 728
rlm@1 729 /* current_upper */
rlm@1 730 movq mm6, qword ptr [eax];
rlm@1 731
rlm@1 732 /* compute the upper-left pixel for dst on %%mm2 */
rlm@1 733 /* compute the upper-right pixel for dst on %%mm4 */
rlm@1 734 movq mm2, mm0;
rlm@1 735 movq mm4, mm1;
rlm@1 736 movq mm3, mm0;
rlm@1 737 movq mm5, mm1;
rlm@1 738 pcmpeqd mm2, mm6;
rlm@1 739 pcmpeqd mm4, mm6;
rlm@1 740 pcmpeqd mm3, qword ptr [ecx];
rlm@1 741 pcmpeqd mm5, qword ptr [ecx];
rlm@1 742 pandn mm3, mm2;
rlm@1 743 pandn mm5, mm4;
rlm@1 744 movq mm2, mm0;
rlm@1 745 movq mm4, mm1;
rlm@1 746 pcmpeqd mm2, mm1;
rlm@1 747 pcmpeqd mm4, mm0;
rlm@1 748 pandn mm2, mm3;
rlm@1 749 pandn mm4, mm5;
rlm@1 750 movq mm3, mm2;
rlm@1 751 movq mm5, mm4;
rlm@1 752 pand mm2, mm6;
rlm@1 753 pand mm4, mm6;
rlm@1 754 pandn mm3, mm7;
rlm@1 755 pandn mm5, mm7;
rlm@1 756 por mm2, mm3;
rlm@1 757 por mm4, mm5;
rlm@1 758
rlm@1 759 /* set *dst */
rlm@1 760 movq mm3, mm2;
rlm@1 761 punpckldq mm2, mm4;
rlm@1 762 punpckhdq mm3, mm4;
rlm@1 763 movq qword ptr [edx], mm2;
rlm@1 764 movq qword ptr [edx + 8], mm3;
rlm@1 765
rlm@1 766 /* next */
rlm@1 767 add eax, 8;
rlm@1 768 add ebx, 8;
rlm@1 769 add ecx, 8;
rlm@1 770 add edx, 16;
rlm@1 771
rlm@1 772 /* central runs */
rlm@1 773 shr esi, 1;
rlm@1 774 jz label1;
rlm@1 775 label0:
rlm@1 776
rlm@1 777 /* set the current, current_pre, current_next registers */
rlm@1 778 movq mm0, qword ptr [ebx - 8];
rlm@1 779 movq mm7, qword ptr [ebx];
rlm@1 780 movq mm1, qword ptr [ebx + 8];
rlm@1 781 psrlq mm0, 32;
rlm@1 782 psllq mm1, 32;
rlm@1 783 movq mm2, mm7;
rlm@1 784 movq mm3, mm7;
rlm@1 785 psllq mm2, 32;
rlm@1 786 psrlq mm3, 32;
rlm@1 787 por mm0, mm2;
rlm@1 788 por mm1, mm3;
rlm@1 789
rlm@1 790 /* current_upper */
rlm@1 791 movq mm6, qword ptr[eax];
rlm@1 792
rlm@1 793 /* compute the upper-left pixel for dst on %%mm2 */
rlm@1 794 /* compute the upper-right pixel for dst on %%mm4 */
rlm@1 795 movq mm2, mm0;
rlm@1 796 movq mm4, mm1;
rlm@1 797 movq mm3, mm0;
rlm@1 798 movq mm5, mm1;
rlm@1 799 pcmpeqd mm2, mm6;
rlm@1 800 pcmpeqd mm4, mm6;
rlm@1 801 pcmpeqd mm3, qword ptr[ecx];
rlm@1 802 pcmpeqd mm5, qword ptr[ecx];
rlm@1 803 pandn mm3, mm2;
rlm@1 804 pandn mm5, mm4;
rlm@1 805 movq mm2, mm0;
rlm@1 806 movq mm4, mm1;
rlm@1 807 pcmpeqd mm2, mm1;
rlm@1 808 pcmpeqd mm4, mm0;
rlm@1 809 pandn mm2, mm3;
rlm@1 810 pandn mm4, mm5;
rlm@1 811 movq mm3, mm2;
rlm@1 812 movq mm5, mm4;
rlm@1 813 pand mm2, mm6;
rlm@1 814 pand mm4, mm6;
rlm@1 815 pandn mm3, mm7;
rlm@1 816 pandn mm5, mm7;
rlm@1 817 por mm2, mm3;
rlm@1 818 por mm4, mm5;
rlm@1 819
rlm@1 820 /* set *dst */
rlm@1 821 movq mm3, mm2;
rlm@1 822 punpckldq mm2, mm4;
rlm@1 823 punpckhdq mm3, mm4;
rlm@1 824 movq qword ptr [edx], mm2;
rlm@1 825 movq qword ptr [edx + 8], mm3;
rlm@1 826
rlm@1 827 /* next */
rlm@1 828 add eax, 8;
rlm@1 829 add ebx, 8;
rlm@1 830 add ecx, 8;
rlm@1 831 add edx, 16;
rlm@1 832
rlm@1 833 dec esi;
rlm@1 834 jnz label0;
rlm@1 835 label1:
rlm@1 836
rlm@1 837 /* final run */
rlm@1 838 /* set the current, current_pre, current_next registers */
rlm@1 839 movq mm1, qword ptr [ebx];
rlm@1 840 movq mm7, qword ptr [ebx];
rlm@1 841 movq mm0, qword ptr [ebx - 8];
rlm@1 842 psrlq mm1, 32;
rlm@1 843 psrlq mm0, 32;
rlm@1 844 psllq mm1, 32;
rlm@1 845 movq mm2, mm7;
rlm@1 846 movq mm3, mm7;
rlm@1 847 psllq mm2, 32;
rlm@1 848 psrlq mm3, 32;
rlm@1 849 por mm0, mm2;
rlm@1 850 por mm1, mm3;
rlm@1 851
rlm@1 852 /* current_upper */
rlm@1 853 movq mm6, qword ptr [eax];
rlm@1 854
rlm@1 855 /* compute the upper-left pixel for dst on %%mm2 */
rlm@1 856 /* compute the upper-right pixel for dst on %%mm4 */
rlm@1 857 movq mm2, mm0;
rlm@1 858 movq mm4, mm1;
rlm@1 859 movq mm3, mm0;
rlm@1 860 movq mm5, mm1;
rlm@1 861 pcmpeqd mm2, mm6;
rlm@1 862 pcmpeqd mm4, mm6;
rlm@1 863 pcmpeqd mm3, qword ptr [ecx];
rlm@1 864 pcmpeqd mm5, qword ptr [ecx];
rlm@1 865 pandn mm3, mm2;
rlm@1 866 pandn mm5, mm4;
rlm@1 867 movq mm2, mm0;
rlm@1 868 movq mm4, mm1;
rlm@1 869 pcmpeqd mm2, mm1;
rlm@1 870 pcmpeqd mm4, mm0;
rlm@1 871 pandn mm2, mm3;
rlm@1 872 pandn mm4, mm5;
rlm@1 873 movq mm3, mm2;
rlm@1 874 movq mm5, mm4;
rlm@1 875 pand mm2, mm6;
rlm@1 876 pand mm4, mm6;
rlm@1 877 pandn mm3, mm7;
rlm@1 878 pandn mm5, mm7;
rlm@1 879 por mm2, mm3;
rlm@1 880 por mm4, mm5;
rlm@1 881
rlm@1 882 /* set *dst */
rlm@1 883 movq mm3, mm2;
rlm@1 884 punpckldq mm2, mm4;
rlm@1 885 punpckhdq mm3, mm4;
rlm@1 886 movq qword ptr [edx], mm2;
rlm@1 887 movq qword ptr [edx + 8], mm3;
rlm@1 888
rlm@1 889 mov src0, eax;
rlm@1 890 mov src1, ebx;
rlm@1 891 mov src2, ecx;
rlm@1 892 mov dst, edx;
rlm@1 893 mov count, esi;
rlm@1 894
rlm@1 895 emms;
rlm@1 896 }
rlm@1 897 #endif
rlm@1 898 }
rlm@1 899
rlm@1 900 static void internal_scale2x_16_mmx(u16 *dst0, u16 *dst1, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)
rlm@1 901 {
rlm@1 902 // assert( count >= 2*4 );
rlm@1 903 internal_scale2x_16_mmx_single(dst0, src0, src1, src2, count);
rlm@1 904 internal_scale2x_16_mmx_single(dst1, src2, src1, src0, count);
rlm@1 905 }
rlm@1 906
rlm@1 907 static void internal_scale2x_32_mmx(u32 *dst0, u32 *dst1, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count)
rlm@1 908 {
rlm@1 909 // assert( count >= 2*2 );
rlm@1 910 internal_scale2x_32_mmx_single(dst0, src0, src1, src2, count);
rlm@1 911 internal_scale2x_32_mmx_single(dst1, src2, src1, src0, count);
rlm@1 912 }
rlm@1 913
rlm@1 914 #endif
rlm@1 915
rlm@1 916 void AdMame2x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
rlm@1 917 u8 *dstPtr, u32 dstPitch, int width, int height)
rlm@1 918 {
rlm@1 919 u16 *dst0 = (u16 *)dstPtr;
rlm@1 920 u16 *dst1 = dst0 + (dstPitch >> 1);
rlm@1 921
rlm@1 922 u16 *src0 = (u16 *)srcPtr;
rlm@1 923 u16 *src1 = src0 + (srcPitch >> 1);
rlm@1 924 u16 *src2 = src1 + (srcPitch >> 1);
rlm@1 925 #ifdef MMX
rlm@1 926 if (cpu_mmx)
rlm@1 927 {
rlm@1 928 internal_scale2x_16_mmx(dst0, dst1, src0, src0, src1, width);
rlm@1 929
rlm@1 930 int count = height;
rlm@1 931
rlm@1 932 count -= 2;
rlm@1 933 while (count)
rlm@1 934 {
rlm@1 935 dst0 += dstPitch;
rlm@1 936 dst1 += dstPitch;
rlm@1 937 internal_scale2x_16_mmx(dst0, dst1, src0, src1, src2, width);
rlm@1 938 src0 = src1;
rlm@1 939 src1 = src2;
rlm@1 940 src2 += srcPitch >> 1;
rlm@1 941 --count;
rlm@1 942 }
rlm@1 943 dst0 += dstPitch;
rlm@1 944 dst1 += dstPitch;
rlm@1 945 internal_scale2x_16_mmx(dst0, dst1, src0, src1, src1, width);
rlm@1 946 }
rlm@1 947 else
rlm@1 948 {
rlm@1 949 #endif
rlm@1 950 internal_scale2x_16_def(dst0, src0, src0, src1, width);
rlm@1 951 internal_scale2x_16_def(dst1, src1, src0, src0, width);
rlm@1 952
rlm@1 953 int count = height;
rlm@1 954
rlm@1 955 count -= 2;
rlm@1 956 while (count)
rlm@1 957 {
rlm@1 958 dst0 += dstPitch;
rlm@1 959 dst1 += dstPitch;
rlm@1 960 internal_scale2x_16_def(dst0, src0, src1, src2, width);
rlm@1 961 internal_scale2x_16_def(dst1, src2, src1, src0, width);
rlm@1 962 src0 = src1;
rlm@1 963 src1 = src2;
rlm@1 964 src2 += srcPitch >> 1;
rlm@1 965 --count;
rlm@1 966 }
rlm@1 967 dst0 += dstPitch;
rlm@1 968 dst1 += dstPitch;
rlm@1 969 internal_scale2x_16_def(dst0, src0, src1, src1, width);
rlm@1 970 internal_scale2x_16_def(dst1, src1, src1, src0, width);
rlm@1 971 #ifdef MMX
rlm@1 972 }
rlm@1 973
rlm@1 974 #endif
rlm@1 975 }
rlm@1 976
rlm@1 977 void AdMame2x32(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
rlm@1 978 u8 *dstPtr, u32 dstPitch, int width, int height)
rlm@1 979 {
rlm@1 980 u32 *dst0 = (u32 *)dstPtr;
rlm@1 981 u32 *dst1 = dst0 + (dstPitch >> 2);
rlm@1 982
rlm@1 983 u32 *src0 = (u32 *)srcPtr;
rlm@1 984 u32 *src1 = src0 + (srcPitch >> 2);
rlm@1 985 u32 *src2 = src1 + (srcPitch >> 2);
rlm@1 986 #ifdef MMX
rlm@1 987 if (cpu_mmx)
rlm@1 988 {
rlm@1 989 internal_scale2x_32_mmx(dst0, dst1, src0, src0, src1, width);
rlm@1 990
rlm@1 991 int count = height;
rlm@1 992
rlm@1 993 count -= 2;
rlm@1 994 while (count)
rlm@1 995 {
rlm@1 996 dst0 += dstPitch >> 1;
rlm@1 997 dst1 += dstPitch >> 1;
rlm@1 998 internal_scale2x_32_mmx(dst0, dst1, src0, src1, src2, width);
rlm@1 999 src0 = src1;
rlm@1 1000 src1 = src2;
rlm@1 1001 src2 += srcPitch >> 2;
rlm@1 1002 --count;
rlm@1 1003 }
rlm@1 1004 dst0 += dstPitch >> 1;
rlm@1 1005 dst1 += dstPitch >> 1;
rlm@1 1006 internal_scale2x_32_mmx(dst0, dst1, src0, src1, src1, width);
rlm@1 1007 }
rlm@1 1008 else
rlm@1 1009 {
rlm@1 1010 #endif
rlm@1 1011 internal_scale2x_32_def(dst0, src0, src0, src1, width);
rlm@1 1012 internal_scale2x_32_def(dst1, src1, src0, src0, width);
rlm@1 1013
rlm@1 1014 int count = height;
rlm@1 1015
rlm@1 1016 count -= 2;
rlm@1 1017 while (count)
rlm@1 1018 {
rlm@1 1019 dst0 += dstPitch >> 1;
rlm@1 1020 dst1 += dstPitch >> 1;
rlm@1 1021 internal_scale2x_32_def(dst0, src0, src1, src2, width);
rlm@1 1022 internal_scale2x_32_def(dst1, src2, src1, src0, width);
rlm@1 1023 src0 = src1;
rlm@1 1024 src1 = src2;
rlm@1 1025 src2 += srcPitch >> 2;
rlm@1 1026 --count;
rlm@1 1027 }
rlm@1 1028 dst0 += dstPitch >> 1;
rlm@1 1029 dst1 += dstPitch >> 1;
rlm@1 1030 internal_scale2x_32_def(dst0, src0, src1, src1, width);
rlm@1 1031 internal_scale2x_32_def(dst1, src1, src1, src0, width);
rlm@1 1032 #ifdef MMX
rlm@1 1033 }
rlm@1 1034
rlm@1 1035 #endif
rlm@1 1036 }