view src/filters/admame.cpp @ 175:5d9a7a0ca09a

beginning test of latest assembly code. 240->70.
author Dylan Holmes <ocsenave@gmail.com>
date Wed, 21 Mar 2012 18:17:37 -0500
parents f9f4f1b99eed
children
line wrap: on
line source
1 /*
2 * This file is part of the Advance project.
3 *
4 * Copyright (C) 1999-2002 Andrea Mazzoleni
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * In addition, as a special exception, Andrea Mazzoleni
21 * gives permission to link the code of this program with
22 * the MAME library (or with modified versions of MAME that use the
23 * same license as MAME), and distribute linked combinations including
24 * the two. You must obey the GNU General Public License in all
25 * respects for all of the code used other than MAME. If you modify
26 * this file, you may extend this exception to your version of the
27 * file, but you are not obligated to do so. If you do not wish to
28 * do so, delete this exception statement from your version.
29 */
31 /*
32 * Alternatively at the previous license terms, you are allowed to use this
33 * code in your program with these conditions:
34 * - the program is not used in commercial activities.
35 * - the whole source code of the program is released with the binary.
36 */
38 #include "../Port.h"
40 #ifdef MMX
41 extern "C" bool cpu_mmx;
42 #endif
/*
 * Portable C kernel of the Scale2x filter for one row of 16-bit pixels.
 *
 * Produces ONE destination row (2 * count pixels) from three source rows.
 * To obtain the second destination row the caller invokes this function
 * again with src0 and src2 swapped.
 *
 *   dst   - destination row, receives 2 * count pixels
 *   src0  - row on the "upper" side of the row being scaled
 *   src1  - row being scaled
 *   src2  - row on the "lower" side of the row being scaled
 *   count - number of pixels in each source row
 *
 * For every source pixel the two output pixels default to the source pixel;
 * an output pixel takes the value of the upper neighbour only when the
 * upper neighbour equals the horizontal neighbour on that side, differs
 * from the lower neighbour, and the left and right neighbours differ.
 *
 * Rows shorter than two pixels are handled explicitly: count is unsigned,
 * so the "count -= 2" below would otherwise wrap around and run far past
 * the end of every buffer.
 */
static void internal_scale2x_16_def(u16 *dst, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)
{
    /* empty row: nothing to read, nothing to write */
    if (count == 0)
        return;

    /* single pixel: there is no horizontal neighbour, just double it */
    if (count == 1)
    {
        dst[0] = src1[0];
        dst[1] = src1[0];
        return;
    }

    /* first pixel: it has no left neighbour, so only the right output
     * pixel can be replaced by the upper neighbour */
    dst[0] = src1[0];
    if (src1[1] == src0[0] && src2[0] != src0[0])
        dst[1] = src0[0];
    else
        dst[1] = src1[0];
    ++src0;
    ++src1;
    ++src2;
    dst += 2;

    /* central pixels: both horizontal neighbours exist (count >= 2 here,
     * so the subtraction cannot wrap) */
    for (count -= 2; count != 0; --count)
    {
        if (src0[0] != src2[0] && src1[-1] != src1[1])
        {
            dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
            dst[1] = src1[1] == src0[0] ? src0[0] : src1[0];
        }
        else
        {
            dst[0] = src1[0];
            dst[1] = src1[0];
        }

        ++src0;
        ++src1;
        ++src2;
        dst += 2;
    }

    /* last pixel: it has no right neighbour, so only the left output
     * pixel can be replaced by the upper neighbour */
    if (src1[-1] == src0[0] && src2[0] != src0[0])
        dst[0] = src0[0];
    else
        dst[0] = src1[0];
    dst[1] = src1[0];
}
/*
 * Portable C kernel of the Scale2x filter for one row of 32-bit pixels.
 *
 * Produces ONE destination row (2 * count pixels) from three source rows.
 * To obtain the second destination row the caller invokes this function
 * again with src0 and src2 swapped.
 *
 *   dst   - destination row, receives 2 * count pixels
 *   src0  - row on the "upper" side of the row being scaled
 *   src1  - row being scaled
 *   src2  - row on the "lower" side of the row being scaled
 *   count - number of pixels in each source row
 *
 * For every source pixel the two output pixels default to the source pixel;
 * an output pixel takes the value of the upper neighbour only when the
 * upper neighbour equals the horizontal neighbour on that side, differs
 * from the lower neighbour, and the left and right neighbours differ.
 *
 * Rows shorter than two pixels are handled explicitly: count is unsigned,
 * so the "count -= 2" below would otherwise wrap around and run far past
 * the end of every buffer.
 */
static void internal_scale2x_32_def(u32 *dst,
                                    const u32 *src0,
                                    const u32 *src1,
                                    const u32 *src2,
                                    unsigned count)
{
    /* empty row: nothing to read, nothing to write */
    if (count == 0)
        return;

    /* single pixel: there is no horizontal neighbour, just double it */
    if (count == 1)
    {
        dst[0] = src1[0];
        dst[1] = src1[0];
        return;
    }

    /* first pixel: it has no left neighbour, so only the right output
     * pixel can be replaced by the upper neighbour */
    dst[0] = src1[0];
    if (src1[1] == src0[0] && src2[0] != src0[0])
        dst[1] = src0[0];
    else
        dst[1] = src1[0];
    ++src0;
    ++src1;
    ++src2;
    dst += 2;

    /* central pixels: both horizontal neighbours exist (count >= 2 here,
     * so the subtraction cannot wrap) */
    for (count -= 2; count != 0; --count)
    {
        if (src0[0] != src2[0] && src1[-1] != src1[1])
        {
            dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
            dst[1] = src1[1] == src0[0] ? src0[0] : src1[0];
        }
        else
        {
            dst[0] = src1[0];
            dst[1] = src1[0];
        }

        ++src0;
        ++src1;
        ++src2;
        dst += 2;
    }

    /* last pixel: it has no right neighbour, so only the left output
     * pixel can be replaced by the upper neighbour */
    if (src1[-1] == src0[0] && src2[0] != src0[0])
        dst[0] = src0[0];
    else
        dst[0] = src1[0];
    dst[1] = src1[0];
}
134 #ifdef MMX
135 static void internal_scale2x_16_mmx_single(u16 *dst, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)
136 {
137 /* always do the first and last run */
138 count -= 2 * 4;
140 #ifdef __GNUC__
141 __asm__ __volatile__ (
142 /* first run */
143 /* set the current, current_pre, current_next registers */
144 "movq 0(%1), %%mm0\n"
145 "movq 0(%1),%%mm7\n"
146 "movq 8(%1),%%mm1\n"
147 "psllq $48,%%mm0\n"
148 "psllq $48,%%mm1\n"
149 "psrlq $48, %%mm0\n"
150 "movq %%mm7,%%mm2\n"
151 "movq %%mm7,%%mm3\n"
152 "psllq $16,%%mm2\n"
153 "psrlq $16,%%mm3\n"
154 "por %%mm2,%%mm0\n"
155 "por %%mm3,%%mm1\n"
157 /* current_upper */
158 "movq (%0),%%mm6\n"
160 /* compute the upper-left pixel for dst on %%mm2 */
161 /* compute the upper-right pixel for dst on %%mm4 */
162 "movq %%mm0,%%mm2\n"
163 "movq %%mm1,%%mm4\n"
164 "movq %%mm0,%%mm3\n"
165 "movq %%mm1,%%mm5\n"
166 "pcmpeqw %%mm6,%%mm2\n"
167 "pcmpeqw %%mm6,%%mm4\n"
168 "pcmpeqw (%2),%%mm3\n"
169 "pcmpeqw (%2),%%mm5\n"
170 "pandn %%mm2,%%mm3\n"
171 "pandn %%mm4,%%mm5\n"
172 "movq %%mm0,%%mm2\n"
173 "movq %%mm1,%%mm4\n"
174 "pcmpeqw %%mm1,%%mm2\n"
175 "pcmpeqw %%mm0,%%mm4\n"
176 "pandn %%mm3,%%mm2\n"
177 "pandn %%mm5,%%mm4\n"
178 "movq %%mm2,%%mm3\n"
179 "movq %%mm4,%%mm5\n"
180 "pand %%mm6,%%mm2\n"
181 "pand %%mm6,%%mm4\n"
182 "pandn %%mm7,%%mm3\n"
183 "pandn %%mm7,%%mm5\n"
184 "por %%mm3,%%mm2\n"
185 "por %%mm5,%%mm4\n"
187 /* set *dst */
188 "movq %%mm2,%%mm3\n"
189 "punpcklwd %%mm4,%%mm2\n"
190 "punpckhwd %%mm4,%%mm3\n"
191 "movq %%mm2,(%3)\n"
192 "movq %%mm3,8(%3)\n"
194 /* next */
195 "addl $8,%0\n"
196 "addl $8,%1\n"
197 "addl $8,%2\n"
198 "addl $16,%3\n"
200 /* central runs */
201 "shrl $2,%4\n"
202 "jz 1f\n"
204 "0:\n"
206 /* set the current, current_pre, current_next registers */
207 "movq -8(%1),%%mm0\n"
208 "movq (%1),%%mm7\n"
209 "movq 8(%1),%%mm1\n"
210 "psrlq $48,%%mm0\n"
211 "psllq $48,%%mm1\n"
212 "movq %%mm7,%%mm2\n"
213 "movq %%mm7,%%mm3\n"
214 "psllq $16,%%mm2\n"
215 "psrlq $16,%%mm3\n"
216 "por %%mm2,%%mm0\n"
217 "por %%mm3,%%mm1\n"
219 /* current_upper */
220 "movq (%0),%%mm6\n"
222 /* compute the upper-left pixel for dst on %%mm2 */
223 /* compute the upper-right pixel for dst on %%mm4 */
224 "movq %%mm0,%%mm2\n"
225 "movq %%mm1,%%mm4\n"
226 "movq %%mm0,%%mm3\n"
227 "movq %%mm1,%%mm5\n"
228 "pcmpeqw %%mm6,%%mm2\n"
229 "pcmpeqw %%mm6,%%mm4\n"
230 "pcmpeqw (%2),%%mm3\n"
231 "pcmpeqw (%2),%%mm5\n"
232 "pandn %%mm2,%%mm3\n"
233 "pandn %%mm4,%%mm5\n"
234 "movq %%mm0,%%mm2\n"
235 "movq %%mm1,%%mm4\n"
236 "pcmpeqw %%mm1,%%mm2\n"
237 "pcmpeqw %%mm0,%%mm4\n"
238 "pandn %%mm3,%%mm2\n"
239 "pandn %%mm5,%%mm4\n"
240 "movq %%mm2,%%mm3\n"
241 "movq %%mm4,%%mm5\n"
242 "pand %%mm6,%%mm2\n"
243 "pand %%mm6,%%mm4\n"
244 "pandn %%mm7,%%mm3\n"
245 "pandn %%mm7,%%mm5\n"
246 "por %%mm3,%%mm2\n"
247 "por %%mm5,%%mm4\n"
249 /* set *dst */
250 "movq %%mm2,%%mm3\n"
251 "punpcklwd %%mm4,%%mm2\n"
252 "punpckhwd %%mm4,%%mm3\n"
253 "movq %%mm2,(%3)\n"
254 "movq %%mm3,8(%3)\n"
256 /* next */
257 "addl $8,%0\n"
258 "addl $8,%1\n"
259 "addl $8,%2\n"
260 "addl $16,%3\n"
262 "decl %4\n"
263 "jnz 0b\n"
264 "1:\n"
266 /* final run */
267 /* set the current, current_pre, current_next registers */
268 "movq (%1),%%mm1\n"
269 "movq (%1),%%mm7\n"
270 "movq -8(%1),%%mm0\n"
271 "psrlq $48,%%mm1\n"
272 "psrlq $48,%%mm0\n"
273 "psllq $48,%%mm1\n"
274 "movq %%mm7,%%mm2\n"
275 "movq %%mm7,%%mm3\n"
276 "psllq $16,%%mm2\n"
277 "psrlq $16,%%mm3\n"
278 "por %%mm2,%%mm0\n"
279 "por %%mm3,%%mm1\n"
281 /* current_upper */
282 "movq (%0),%%mm6\n"
284 /* compute the upper-left pixel for dst on %%mm2 */
285 /* compute the upper-right pixel for dst on %%mm4 */
286 "movq %%mm0,%%mm2\n"
287 "movq %%mm1,%%mm4\n"
288 "movq %%mm0,%%mm3\n"
289 "movq %%mm1,%%mm5\n"
290 "pcmpeqw %%mm6,%%mm2\n"
291 "pcmpeqw %%mm6,%%mm4\n"
292 "pcmpeqw (%2),%%mm3\n"
293 "pcmpeqw (%2),%%mm5\n"
294 "pandn %%mm2,%%mm3\n"
295 "pandn %%mm4,%%mm5\n"
296 "movq %%mm0,%%mm2\n"
297 "movq %%mm1,%%mm4\n"
298 "pcmpeqw %%mm1,%%mm2\n"
299 "pcmpeqw %%mm0,%%mm4\n"
300 "pandn %%mm3,%%mm2\n"
301 "pandn %%mm5,%%mm4\n"
302 "movq %%mm2,%%mm3\n"
303 "movq %%mm4,%%mm5\n"
304 "pand %%mm6,%%mm2\n"
305 "pand %%mm6,%%mm4\n"
306 "pandn %%mm7,%%mm3\n"
307 "pandn %%mm7,%%mm5\n"
308 "por %%mm3,%%mm2\n"
309 "por %%mm5,%%mm4\n"
311 /* set *dst */
312 "movq %%mm2,%%mm3\n"
313 "punpcklwd %%mm4,%%mm2\n"
314 "punpckhwd %%mm4,%%mm3\n"
315 "movq %%mm2,(%3)\n"
316 "movq %%mm3,8(%3)\n"
317 "emms\n"
319 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
320 :
321 : "cc"
322 );
323 #else
324 __asm {
325 mov eax, src0;
326 mov ebx, src1;
327 mov ecx, src2;
328 mov edx, dst;
329 mov esi, count;
331 /* first run */
332 /* set the current, current_pre, current_next registers */
333 movq mm0, qword ptr [ebx];
334 movq mm7, qword ptr [ebx];
335 movq mm1, qword ptr [ebx + 8];
336 psllq mm0, 48;
337 psllq mm1, 48;
338 psrlq mm0, 48;
339 movq mm2, mm7;
340 movq mm3, mm7;
341 psllq mm2, 16;
342 psrlq mm3, 16;
343 por mm0, mm2;
344 por mm1, mm3;
346 /* current_upper */
347 movq mm6, qword ptr [eax];
349 /* compute the upper-left pixel for dst on %%mm2 */
350 /* compute the upper-right pixel for dst on %%mm4 */
351 movq mm2, mm0;
352 movq mm4, mm1;
353 movq mm3, mm0;
354 movq mm5, mm1;
355 pcmpeqw mm2, mm6;
356 pcmpeqw mm4, mm6;
357 pcmpeqw mm3, qword ptr [ecx];
358 pcmpeqw mm5, qword ptr [ecx];
359 pandn mm3, mm2;
360 pandn mm5, mm4;
361 movq mm2, mm0;
362 movq mm4, mm1;
363 pcmpeqw mm2, mm1;
364 pcmpeqw mm4, mm0;
365 pandn mm2, mm3;
366 pandn mm4, mm5;
367 movq mm3, mm2;
368 movq mm5, mm4;
369 pand mm2, mm6;
370 pand mm4, mm6;
371 pandn mm3, mm7;
372 pandn mm5, mm7;
373 por mm2, mm3;
374 por mm4, mm5;
376 /* set *dst0 */
377 movq mm3, mm2;
378 punpcklwd mm2, mm4;
379 punpckhwd mm3, mm4;
380 movq qword ptr [edx], mm2;
381 movq qword ptr [edx + 8], mm3;
383 /* next */
384 add eax, 8;
385 add ebx, 8;
386 add ecx, 8;
387 add edx, 16;
389 /* central runs */
390 shr esi, 2;
391 jz label1;
392 align 4;
393 label0:
395 /* set the current, current_pre, current_next registers */
396 movq mm0, qword ptr [ebx - 8];
397 movq mm7, qword ptr [ebx];
398 movq mm1, qword ptr [ebx + 8];
399 psrlq mm0, 48;
400 psllq mm1, 48;
401 movq mm2, mm7;
402 movq mm3, mm7;
403 psllq mm2, 16;
404 psrlq mm3, 16;
405 por mm0, mm2;
406 por mm1, mm3;
408 /* current_upper */
409 movq mm6, qword ptr [eax];
411 /* compute the upper-left pixel for dst on %%mm2 */
412 /* compute the upper-right pixel for dst on %%mm4 */
413 movq mm2, mm0;
414 movq mm4, mm1;
415 movq mm3, mm0;
416 movq mm5, mm1;
417 pcmpeqw mm2, mm6;
418 pcmpeqw mm4, mm6;
419 pcmpeqw mm3, qword ptr [ecx];
420 pcmpeqw mm5, qword ptr [ecx];
421 pandn mm3, mm2;
422 pandn mm5, mm4;
423 movq mm2, mm0;
424 movq mm4, mm1;
425 pcmpeqw mm2, mm1;
426 pcmpeqw mm4, mm0;
427 pandn mm2, mm3;
428 pandn mm4, mm5;
429 movq mm3, mm2;
430 movq mm5, mm4;
431 pand mm2, mm6;
432 pand mm4, mm6;
433 pandn mm3, mm7;
434 pandn mm5, mm7;
435 por mm2, mm3;
436 por mm4, mm5;
438 /* set *dst */
439 movq mm3, mm2;
440 punpcklwd mm2, mm4;
441 punpckhwd mm3, mm4;
442 movq qword ptr [edx], mm2;
443 movq qword ptr [edx + 8], mm3;
445 /* next */
446 add eax, 8;
447 add ebx, 8;
448 add ecx, 8;
449 add edx, 16;
451 dec esi;
452 jnz label0;
453 label1:
455 /* final run */
456 /* set the current, current_pre, current_next registers */
457 movq mm1, qword ptr [ebx];
458 movq mm7, qword ptr [ebx];
459 movq mm0, qword ptr [ebx - 8];
460 psrlq mm1, 48;
461 psrlq mm0, 48;
462 psllq mm1, 48;
463 movq mm2, mm7;
464 movq mm3, mm7;
465 psllq mm2, 16;
466 psrlq mm3, 16;
467 por mm0, mm2;
468 por mm1, mm3;
470 /* current_upper */
471 movq mm6, qword ptr [eax];
473 /* compute the upper-left pixel for dst on %%mm2 */
474 /* compute the upper-right pixel for dst on %%mm4 */
475 movq mm2, mm0;
476 movq mm4, mm1;
477 movq mm3, mm0;
478 movq mm5, mm1;
479 pcmpeqw mm2, mm6;
480 pcmpeqw mm4, mm6;
481 pcmpeqw mm3, qword ptr [ecx];
482 pcmpeqw mm5, qword ptr [ecx];
483 pandn mm3, mm2;
484 pandn mm5, mm4;
485 movq mm2, mm0;
486 movq mm4, mm1;
487 pcmpeqw mm2, mm1;
488 pcmpeqw mm4, mm0;
489 pandn mm2, mm3;
490 pandn mm4, mm5;
491 movq mm3, mm2;
492 movq mm5, mm4;
493 pand mm2, mm6;
494 pand mm4, mm6;
495 pandn mm3, mm7;
496 pandn mm5, mm7;
497 por mm2, mm3;
498 por mm4, mm5;
500 /* set *dst */
501 movq mm3, mm2;
502 punpcklwd mm2, mm4;
503 punpckhwd mm3, mm4;
504 movq qword ptr [edx], mm2;
505 movq qword ptr [edx + 8], mm3;
507 mov src0, eax;
508 mov src1, ebx;
509 mov src2, ecx;
510 mov dst, edx;
511 mov count, esi;
513 emms;
514 }
515 #endif
516 }
518 static void internal_scale2x_32_mmx_single(u32 *dst, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count)
519 {
520 /* always do the first and last run */
521 count -= 2 * 2;
523 #ifdef __GNUC__
524 __asm__ __volatile__ (
525 /* first run */
526 /* set the current, current_pre, current_next registers */
527 "movq 0(%1),%%mm0\n"
528 "movq 0(%1),%%mm7\n"
529 "movq 8(%1),%%mm1\n"
530 "psllq $32,%%mm0\n"
531 "psllq $32,%%mm1\n"
532 "psrlq $32,%%mm0\n"
533 "movq %%mm7,%%mm2\n"
534 "movq %%mm7,%%mm3\n"
535 "psllq $32,%%mm2\n"
536 "psrlq $32,%%mm3\n"
537 "por %%mm2,%%mm0\n"
538 "por %%mm3,%%mm1\n"
540 /* current_upper */
541 "movq (%0),%%mm6\n"
543 /* compute the upper-left pixel for dst on %%mm2 */
544 /* compute the upper-right pixel for dst on %%mm4 */
545 "movq %%mm0,%%mm2\n"
546 "movq %%mm1,%%mm4\n"
547 "movq %%mm0,%%mm3\n"
548 "movq %%mm1,%%mm5\n"
549 "pcmpeqd %%mm6,%%mm2\n"
550 "pcmpeqd %%mm6,%%mm4\n"
551 "pcmpeqd (%2),%%mm3\n"
552 "pcmpeqd (%2),%%mm5\n"
553 "pandn %%mm2,%%mm3\n"
554 "pandn %%mm4,%%mm5\n"
555 "movq %%mm0,%%mm2\n"
556 "movq %%mm1,%%mm4\n"
557 "pcmpeqd %%mm1,%%mm2\n"
558 "pcmpeqd %%mm0,%%mm4\n"
559 "pandn %%mm3,%%mm2\n"
560 "pandn %%mm5,%%mm4\n"
561 "movq %%mm2,%%mm3\n"
562 "movq %%mm4,%%mm5\n"
563 "pand %%mm6,%%mm2\n"
564 "pand %%mm6,%%mm4\n"
565 "pandn %%mm7,%%mm3\n"
566 "pandn %%mm7,%%mm5\n"
567 "por %%mm3,%%mm2\n"
568 "por %%mm5,%%mm4\n"
570 /* set *dst */
571 "movq %%mm2,%%mm3\n"
572 "punpckldq %%mm4,%%mm2\n"
573 "punpckhdq %%mm4,%%mm3\n"
574 "movq %%mm2,(%3)\n"
575 "movq %%mm3, 8(%3)\n"
577 /* next */
578 "addl $8,%0\n"
579 "addl $8,%1\n"
580 "addl $8,%2\n"
581 "addl $16,%3\n"
583 /* central runs */
584 "shrl $1,%4\n"
585 "jz 1f\n"
587 "0:\n"
589 /* set the current, current_pre, current_next registers */
590 "movq -8(%1),%%mm0\n"
591 "movq (%1),%%mm7\n"
592 "movq 8(%1),%%mm1\n"
593 "psrlq $32,%%mm0\n"
594 "psllq $32,%%mm1\n"
595 "movq %%mm7,%%mm2\n"
596 "movq %%mm7,%%mm3\n"
597 "psllq $32,%%mm2\n"
598 "psrlq $32,%%mm3\n"
599 "por %%mm2,%%mm0\n"
600 "por %%mm3,%%mm1\n"
602 /* current_upper */
603 "movq (%0),%%mm6\n"
605 /* compute the upper-left pixel for dst on %%mm2 */
606 /* compute the upper-right pixel for dst on %%mm4 */
607 "movq %%mm0,%%mm2\n"
608 "movq %%mm1,%%mm4\n"
609 "movq %%mm0,%%mm3\n"
610 "movq %%mm1,%%mm5\n"
611 "pcmpeqd %%mm6,%%mm2\n"
612 "pcmpeqd %%mm6,%%mm4\n"
613 "pcmpeqd (%2),%%mm3\n"
614 "pcmpeqd (%2),%%mm5\n"
615 "pandn %%mm2,%%mm3\n"
616 "pandn %%mm4,%%mm5\n"
617 "movq %%mm0,%%mm2\n"
618 "movq %%mm1,%%mm4\n"
619 "pcmpeqd %%mm1,%%mm2\n"
620 "pcmpeqd %%mm0,%%mm4\n"
621 "pandn %%mm3,%%mm2\n"
622 "pandn %%mm5,%%mm4\n"
623 "movq %%mm2,%%mm3\n"
624 "movq %%mm4,%%mm5\n"
625 "pand %%mm6,%%mm2\n"
626 "pand %%mm6,%%mm4\n"
627 "pandn %%mm7,%%mm3\n"
628 "pandn %%mm7,%%mm5\n"
629 "por %%mm3,%%mm2\n"
630 "por %%mm5,%%mm4\n"
632 /* set *dst */
633 "movq %%mm2,%%mm3\n"
634 "punpckldq %%mm4,%%mm2\n"
635 "punpckhdq %%mm4,%%mm3\n"
636 "movq %%mm2,(%3)\n"
637 "movq %%mm3,8(%3)\n"
639 /* next */
640 "addl $8,%0\n"
641 "addl $8,%1\n"
642 "addl $8,%2\n"
643 "addl $16,%3\n"
645 "decl %4\n"
646 "jnz 0b\n"
647 "1:\n"
649 /* final run */
650 /* set the current, current_pre, current_next registers */
651 "movq (%1),%%mm1\n"
652 "movq (%1),%%mm7\n"
653 "movq -8(%1), %%mm0\n"
654 "psrlq $32,%%mm1\n"
655 "psrlq $32,%%mm0\n"
656 "psllq $32,%%mm1\n"
657 "movq %%mm7,%%mm2\n"
658 "movq %%mm7,%%mm3\n"
659 "psllq $32,%%mm2\n"
660 "psrlq $32,%%mm3\n"
661 "por %%mm2,%%mm0\n"
662 "por %%mm3,%%mm1\n"
664 /* current_upper */
665 "movq (%0),%%mm6\n"
667 /* compute the upper-left pixel for dst on %%mm2 */
668 /* compute the upper-right pixel for dst on %%mm4 */
669 "movq %%mm0,%%mm2\n"
670 "movq %%mm1,%%mm4\n"
671 "movq %%mm0,%%mm3\n"
672 "movq %%mm1,%%mm5\n"
673 "pcmpeqd %%mm6,%%mm2\n"
674 "pcmpeqd %%mm6,%%mm4\n"
675 "pcmpeqd (%2),%%mm3\n"
676 "pcmpeqd (%2),%%mm5\n"
677 "pandn %%mm2,%%mm3\n"
678 "pandn %%mm4,%%mm5\n"
679 "movq %%mm0,%%mm2\n"
680 "movq %%mm1,%%mm4\n"
681 "pcmpeqd %%mm1,%%mm2\n"
682 "pcmpeqd %%mm0,%%mm4\n"
683 "pandn %%mm3,%%mm2\n"
684 "pandn %%mm5,%%mm4\n"
685 "movq %%mm2,%%mm3\n"
686 "movq %%mm4,%%mm5\n"
687 "pand %%mm6,%%mm2\n"
688 "pand %%mm6,%%mm4\n"
689 "pandn %%mm7,%%mm3\n"
690 "pandn %%mm7,%%mm5\n"
691 "por %%mm3,%%mm2\n"
692 "por %%mm5,%%mm4\n"
694 /* set *dst */
695 "movq %%mm2,%%mm3\n"
696 "punpckldq %%mm4,%%mm2\n"
697 "punpckhdq %%mm4,%%mm3\n"
698 "movq %%mm2,(%3)\n"
699 "movq %%mm3,8(%3)\n"
700 "emms\n"
702 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
703 :
704 : "cc"
705 );
706 #else
707 __asm {
708 mov eax, src0;
709 mov ebx, src1;
710 mov ecx, src2;
711 mov edx, dst;
712 mov esi, count;
714 /* first run */
715 /* set the current, current_pre, current_next registers */
716 movq mm0, qword ptr [ebx];
717 movq mm7, qword ptr [ebx];
718 movq mm1, qword ptr [ebx + 8];
719 psllq mm0, 32;
720 psllq mm1, 32;
721 psrlq mm0, 32;
722 movq mm2, mm7;
723 movq mm3, mm7;
724 psllq mm2, 32;
725 psrlq mm3, 32;
726 por mm0, mm2;
727 por mm1, mm3;
729 /* current_upper */
730 movq mm6, qword ptr [eax];
732 /* compute the upper-left pixel for dst on %%mm2 */
733 /* compute the upper-right pixel for dst on %%mm4 */
734 movq mm2, mm0;
735 movq mm4, mm1;
736 movq mm3, mm0;
737 movq mm5, mm1;
738 pcmpeqd mm2, mm6;
739 pcmpeqd mm4, mm6;
740 pcmpeqd mm3, qword ptr [ecx];
741 pcmpeqd mm5, qword ptr [ecx];
742 pandn mm3, mm2;
743 pandn mm5, mm4;
744 movq mm2, mm0;
745 movq mm4, mm1;
746 pcmpeqd mm2, mm1;
747 pcmpeqd mm4, mm0;
748 pandn mm2, mm3;
749 pandn mm4, mm5;
750 movq mm3, mm2;
751 movq mm5, mm4;
752 pand mm2, mm6;
753 pand mm4, mm6;
754 pandn mm3, mm7;
755 pandn mm5, mm7;
756 por mm2, mm3;
757 por mm4, mm5;
759 /* set *dst */
760 movq mm3, mm2;
761 punpckldq mm2, mm4;
762 punpckhdq mm3, mm4;
763 movq qword ptr [edx], mm2;
764 movq qword ptr [edx + 8], mm3;
766 /* next */
767 add eax, 8;
768 add ebx, 8;
769 add ecx, 8;
770 add edx, 16;
772 /* central runs */
773 shr esi, 1;
774 jz label1;
775 label0:
777 /* set the current, current_pre, current_next registers */
778 movq mm0, qword ptr [ebx - 8];
779 movq mm7, qword ptr [ebx];
780 movq mm1, qword ptr [ebx + 8];
781 psrlq mm0, 32;
782 psllq mm1, 32;
783 movq mm2, mm7;
784 movq mm3, mm7;
785 psllq mm2, 32;
786 psrlq mm3, 32;
787 por mm0, mm2;
788 por mm1, mm3;
790 /* current_upper */
791 movq mm6, qword ptr[eax];
793 /* compute the upper-left pixel for dst on %%mm2 */
794 /* compute the upper-right pixel for dst on %%mm4 */
795 movq mm2, mm0;
796 movq mm4, mm1;
797 movq mm3, mm0;
798 movq mm5, mm1;
799 pcmpeqd mm2, mm6;
800 pcmpeqd mm4, mm6;
801 pcmpeqd mm3, qword ptr[ecx];
802 pcmpeqd mm5, qword ptr[ecx];
803 pandn mm3, mm2;
804 pandn mm5, mm4;
805 movq mm2, mm0;
806 movq mm4, mm1;
807 pcmpeqd mm2, mm1;
808 pcmpeqd mm4, mm0;
809 pandn mm2, mm3;
810 pandn mm4, mm5;
811 movq mm3, mm2;
812 movq mm5, mm4;
813 pand mm2, mm6;
814 pand mm4, mm6;
815 pandn mm3, mm7;
816 pandn mm5, mm7;
817 por mm2, mm3;
818 por mm4, mm5;
820 /* set *dst */
821 movq mm3, mm2;
822 punpckldq mm2, mm4;
823 punpckhdq mm3, mm4;
824 movq qword ptr [edx], mm2;
825 movq qword ptr [edx + 8], mm3;
827 /* next */
828 add eax, 8;
829 add ebx, 8;
830 add ecx, 8;
831 add edx, 16;
833 dec esi;
834 jnz label0;
835 label1:
837 /* final run */
838 /* set the current, current_pre, current_next registers */
839 movq mm1, qword ptr [ebx];
840 movq mm7, qword ptr [ebx];
841 movq mm0, qword ptr [ebx - 8];
842 psrlq mm1, 32;
843 psrlq mm0, 32;
844 psllq mm1, 32;
845 movq mm2, mm7;
846 movq mm3, mm7;
847 psllq mm2, 32;
848 psrlq mm3, 32;
849 por mm0, mm2;
850 por mm1, mm3;
852 /* current_upper */
853 movq mm6, qword ptr [eax];
855 /* compute the upper-left pixel for dst on %%mm2 */
856 /* compute the upper-right pixel for dst on %%mm4 */
857 movq mm2, mm0;
858 movq mm4, mm1;
859 movq mm3, mm0;
860 movq mm5, mm1;
861 pcmpeqd mm2, mm6;
862 pcmpeqd mm4, mm6;
863 pcmpeqd mm3, qword ptr [ecx];
864 pcmpeqd mm5, qword ptr [ecx];
865 pandn mm3, mm2;
866 pandn mm5, mm4;
867 movq mm2, mm0;
868 movq mm4, mm1;
869 pcmpeqd mm2, mm1;
870 pcmpeqd mm4, mm0;
871 pandn mm2, mm3;
872 pandn mm4, mm5;
873 movq mm3, mm2;
874 movq mm5, mm4;
875 pand mm2, mm6;
876 pand mm4, mm6;
877 pandn mm3, mm7;
878 pandn mm5, mm7;
879 por mm2, mm3;
880 por mm4, mm5;
882 /* set *dst */
883 movq mm3, mm2;
884 punpckldq mm2, mm4;
885 punpckhdq mm3, mm4;
886 movq qword ptr [edx], mm2;
887 movq qword ptr [edx + 8], mm3;
889 mov src0, eax;
890 mov src1, ebx;
891 mov src2, ecx;
892 mov dst, edx;
893 mov count, esi;
895 emms;
896 }
897 #endif
898 }
900 static void internal_scale2x_16_mmx(u16 *dst0, u16 *dst1, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)
901 {
902 // assert( count >= 2*4 );
903 internal_scale2x_16_mmx_single(dst0, src0, src1, src2, count);
904 internal_scale2x_16_mmx_single(dst1, src2, src1, src0, count);
905 }
907 static void internal_scale2x_32_mmx(u32 *dst0, u32 *dst1, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count)
908 {
909 // assert( count >= 2*2 );
910 internal_scale2x_32_mmx_single(dst0, src0, src1, src2, count);
911 internal_scale2x_32_mmx_single(dst1, src2, src1, src0, count);
912 }
914 #endif
916 void AdMame2x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
917 u8 *dstPtr, u32 dstPitch, int width, int height)
918 {
919 u16 *dst0 = (u16 *)dstPtr;
920 u16 *dst1 = dst0 + (dstPitch >> 1);
922 u16 *src0 = (u16 *)srcPtr;
923 u16 *src1 = src0 + (srcPitch >> 1);
924 u16 *src2 = src1 + (srcPitch >> 1);
925 #ifdef MMX
926 if (cpu_mmx)
927 {
928 internal_scale2x_16_mmx(dst0, dst1, src0, src0, src1, width);
930 int count = height;
932 count -= 2;
933 while (count)
934 {
935 dst0 += dstPitch;
936 dst1 += dstPitch;
937 internal_scale2x_16_mmx(dst0, dst1, src0, src1, src2, width);
938 src0 = src1;
939 src1 = src2;
940 src2 += srcPitch >> 1;
941 --count;
942 }
943 dst0 += dstPitch;
944 dst1 += dstPitch;
945 internal_scale2x_16_mmx(dst0, dst1, src0, src1, src1, width);
946 }
947 else
948 {
949 #endif
950 internal_scale2x_16_def(dst0, src0, src0, src1, width);
951 internal_scale2x_16_def(dst1, src1, src0, src0, width);
953 int count = height;
955 count -= 2;
956 while (count)
957 {
958 dst0 += dstPitch;
959 dst1 += dstPitch;
960 internal_scale2x_16_def(dst0, src0, src1, src2, width);
961 internal_scale2x_16_def(dst1, src2, src1, src0, width);
962 src0 = src1;
963 src1 = src2;
964 src2 += srcPitch >> 1;
965 --count;
966 }
967 dst0 += dstPitch;
968 dst1 += dstPitch;
969 internal_scale2x_16_def(dst0, src0, src1, src1, width);
970 internal_scale2x_16_def(dst1, src1, src1, src0, width);
971 #ifdef MMX
972 }
974 #endif
975 }
977 void AdMame2x32(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
978 u8 *dstPtr, u32 dstPitch, int width, int height)
979 {
980 u32 *dst0 = (u32 *)dstPtr;
981 u32 *dst1 = dst0 + (dstPitch >> 2);
983 u32 *src0 = (u32 *)srcPtr;
984 u32 *src1 = src0 + (srcPitch >> 2);
985 u32 *src2 = src1 + (srcPitch >> 2);
986 #ifdef MMX
987 if (cpu_mmx)
988 {
989 internal_scale2x_32_mmx(dst0, dst1, src0, src0, src1, width);
991 int count = height;
993 count -= 2;
994 while (count)
995 {
996 dst0 += dstPitch >> 1;
997 dst1 += dstPitch >> 1;
998 internal_scale2x_32_mmx(dst0, dst1, src0, src1, src2, width);
999 src0 = src1;
1000 src1 = src2;
1001 src2 += srcPitch >> 2;
1002 --count;
1004 dst0 += dstPitch >> 1;
1005 dst1 += dstPitch >> 1;
1006 internal_scale2x_32_mmx(dst0, dst1, src0, src1, src1, width);
1008 else
1010 #endif
1011 internal_scale2x_32_def(dst0, src0, src0, src1, width);
1012 internal_scale2x_32_def(dst1, src1, src0, src0, width);
1014 int count = height;
1016 count -= 2;
1017 while (count)
1019 dst0 += dstPitch >> 1;
1020 dst1 += dstPitch >> 1;
1021 internal_scale2x_32_def(dst0, src0, src1, src2, width);
1022 internal_scale2x_32_def(dst1, src2, src1, src0, width);
1023 src0 = src1;
1024 src1 = src2;
1025 src2 += srcPitch >> 2;
1026 --count;
1028 dst0 += dstPitch >> 1;
1029 dst1 += dstPitch >> 1;
1030 internal_scale2x_32_def(dst0, src0, src1, src1, width);
1031 internal_scale2x_32_def(dst1, src1, src1, src0, width);
1032 #ifdef MMX
1035 #endif