Mercurial > vba-clojure
comparison src/filters/admame.cpp @ 27:b970226568d2
brought in filters package
author | Robert McIntyre <rlm@mit.edu> |
---|---|
date | Sun, 04 Mar 2012 20:32:31 -0600 |
parents | f9f4f1b99eed |
children |
comparison
equal
deleted
inserted
replaced
26:18eaae41bde3 | 27:b970226568d2 |
---|---|
1 /* | |
2 * This file is part of the Advance project. | |
3 * | |
4 * Copyright (C) 1999-2002 Andrea Mazzoleni | |
5 * | |
6 * This program is free software; you can redistribute it and/or modify | |
7 * it under the terms of the GNU General Public License as published by | |
8 * the Free Software Foundation; either version 2 of the License, or | |
9 * (at your option) any later version. | |
10 * | |
11 * This program is distributed in the hope that it will be useful, | |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 * GNU General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU General Public License | |
17 * along with this program; if not, write to the Free Software | |
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
19 * | |
20 * In addition, as a special exception, Andrea Mazzoleni | |
21 * gives permission to link the code of this program with | |
22 * the MAME library (or with modified versions of MAME that use the | |
23 * same license as MAME), and distribute linked combinations including | |
24 * the two. You must obey the GNU General Public License in all | |
25 * respects for all of the code used other than MAME. If you modify | |
26 * this file, you may extend this exception to your version of the | |
27 * file, but you are not obligated to do so. If you do not wish to | |
28 * do so, delete this exception statement from your version. | |
29 */ | |
30 | |
31 /* | |
32 * Alternatively at the previous license terms, you are allowed to use this | |
33 * code in your program with these conditions: | |
34 * - the program is not used in commercial activities. | |
35 * - the whole source code of the program is released with the binary. | |
36 */ | |
37 | |
38 #include "../Port.h" | |
39 | |
40 #ifdef MMX | |
41 extern "C" bool cpu_mmx; | |
42 #endif | |
43 | |
44 static void internal_scale2x_16_def(u16 *dst, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count) | |
45 { | |
46 /* first pixel */ | |
47 dst[0] = src1[0]; | |
48 if (src1[1] == src0[0] && src2[0] != src0[0]) | |
49 dst[1] = src0[0]; | |
50 else | |
51 dst[1] = src1[0]; | |
52 ++src0; | |
53 ++src1; | |
54 ++src2; | |
55 dst += 2; | |
56 | |
57 /* central pixels */ | |
58 count -= 2; | |
59 while (count) | |
60 { | |
61 if (src0[0] != src2[0] && src1[-1] != src1[1]) | |
62 { | |
63 dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0]; | |
64 dst[1] = src1[1] == src0[0] ? src0[0] : src1[0]; | |
65 } | |
66 else | |
67 { | |
68 dst[0] = src1[0]; | |
69 dst[1] = src1[0]; | |
70 } | |
71 | |
72 ++src0; | |
73 ++src1; | |
74 ++src2; | |
75 dst += 2; | |
76 --count; | |
77 } | |
78 | |
79 /* last pixel */ | |
80 if (src1[-1] == src0[0] && src2[0] != src0[0]) | |
81 dst[0] = src0[0]; | |
82 else | |
83 dst[0] = src1[0]; | |
84 dst[1] = src1[0]; | |
85 } | |
86 | |
87 static void internal_scale2x_32_def(u32 *dst, | |
88 const u32 *src0, | |
89 const u32 *src1, | |
90 const u32 *src2, | |
91 unsigned count) | |
92 { | |
93 /* first pixel */ | |
94 dst[0] = src1[0]; | |
95 if (src1[1] == src0[0] && src2[0] != src0[0]) | |
96 dst[1] = src0[0]; | |
97 else | |
98 dst[1] = src1[0]; | |
99 ++src0; | |
100 ++src1; | |
101 ++src2; | |
102 dst += 2; | |
103 | |
104 /* central pixels */ | |
105 count -= 2; | |
106 while (count) | |
107 { | |
108 if (src0[0] != src2[0] && src1[-1] != src1[1]) | |
109 { | |
110 dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0]; | |
111 dst[1] = src1[1] == src0[0] ? src0[0] : src1[0]; | |
112 } | |
113 else | |
114 { | |
115 dst[0] = src1[0]; | |
116 dst[1] = src1[0]; | |
117 } | |
118 | |
119 ++src0; | |
120 ++src1; | |
121 ++src2; | |
122 dst += 2; | |
123 --count; | |
124 } | |
125 | |
126 /* last pixel */ | |
127 if (src1[-1] == src0[0] && src2[0] != src0[0]) | |
128 dst[0] = src0[0]; | |
129 else | |
130 dst[0] = src1[0]; | |
131 dst[1] = src1[0]; | |
132 } | |
133 | |
134 #ifdef MMX | |
135 static void internal_scale2x_16_mmx_single(u16 *dst, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count) | |
136 { | |
137 /* always do the first and last run */ | |
138 count -= 2 * 4; | |
139 | |
140 #ifdef __GNUC__ | |
141 __asm__ __volatile__ ( | |
142 /* first run */ | |
143 /* set the current, current_pre, current_next registers */ | |
144 "movq 0(%1), %%mm0\n" | |
145 "movq 0(%1),%%mm7\n" | |
146 "movq 8(%1),%%mm1\n" | |
147 "psllq $48,%%mm0\n" | |
148 "psllq $48,%%mm1\n" | |
149 "psrlq $48, %%mm0\n" | |
150 "movq %%mm7,%%mm2\n" | |
151 "movq %%mm7,%%mm3\n" | |
152 "psllq $16,%%mm2\n" | |
153 "psrlq $16,%%mm3\n" | |
154 "por %%mm2,%%mm0\n" | |
155 "por %%mm3,%%mm1\n" | |
156 | |
157 /* current_upper */ | |
158 "movq (%0),%%mm6\n" | |
159 | |
160 /* compute the upper-left pixel for dst on %%mm2 */ | |
161 /* compute the upper-right pixel for dst on %%mm4 */ | |
162 "movq %%mm0,%%mm2\n" | |
163 "movq %%mm1,%%mm4\n" | |
164 "movq %%mm0,%%mm3\n" | |
165 "movq %%mm1,%%mm5\n" | |
166 "pcmpeqw %%mm6,%%mm2\n" | |
167 "pcmpeqw %%mm6,%%mm4\n" | |
168 "pcmpeqw (%2),%%mm3\n" | |
169 "pcmpeqw (%2),%%mm5\n" | |
170 "pandn %%mm2,%%mm3\n" | |
171 "pandn %%mm4,%%mm5\n" | |
172 "movq %%mm0,%%mm2\n" | |
173 "movq %%mm1,%%mm4\n" | |
174 "pcmpeqw %%mm1,%%mm2\n" | |
175 "pcmpeqw %%mm0,%%mm4\n" | |
176 "pandn %%mm3,%%mm2\n" | |
177 "pandn %%mm5,%%mm4\n" | |
178 "movq %%mm2,%%mm3\n" | |
179 "movq %%mm4,%%mm5\n" | |
180 "pand %%mm6,%%mm2\n" | |
181 "pand %%mm6,%%mm4\n" | |
182 "pandn %%mm7,%%mm3\n" | |
183 "pandn %%mm7,%%mm5\n" | |
184 "por %%mm3,%%mm2\n" | |
185 "por %%mm5,%%mm4\n" | |
186 | |
187 /* set *dst */ | |
188 "movq %%mm2,%%mm3\n" | |
189 "punpcklwd %%mm4,%%mm2\n" | |
190 "punpckhwd %%mm4,%%mm3\n" | |
191 "movq %%mm2,(%3)\n" | |
192 "movq %%mm3,8(%3)\n" | |
193 | |
194 /* next */ | |
195 "addl $8,%0\n" | |
196 "addl $8,%1\n" | |
197 "addl $8,%2\n" | |
198 "addl $16,%3\n" | |
199 | |
200 /* central runs */ | |
201 "shrl $2,%4\n" | |
202 "jz 1f\n" | |
203 | |
204 "0:\n" | |
205 | |
206 /* set the current, current_pre, current_next registers */ | |
207 "movq -8(%1),%%mm0\n" | |
208 "movq (%1),%%mm7\n" | |
209 "movq 8(%1),%%mm1\n" | |
210 "psrlq $48,%%mm0\n" | |
211 "psllq $48,%%mm1\n" | |
212 "movq %%mm7,%%mm2\n" | |
213 "movq %%mm7,%%mm3\n" | |
214 "psllq $16,%%mm2\n" | |
215 "psrlq $16,%%mm3\n" | |
216 "por %%mm2,%%mm0\n" | |
217 "por %%mm3,%%mm1\n" | |
218 | |
219 /* current_upper */ | |
220 "movq (%0),%%mm6\n" | |
221 | |
222 /* compute the upper-left pixel for dst on %%mm2 */ | |
223 /* compute the upper-right pixel for dst on %%mm4 */ | |
224 "movq %%mm0,%%mm2\n" | |
225 "movq %%mm1,%%mm4\n" | |
226 "movq %%mm0,%%mm3\n" | |
227 "movq %%mm1,%%mm5\n" | |
228 "pcmpeqw %%mm6,%%mm2\n" | |
229 "pcmpeqw %%mm6,%%mm4\n" | |
230 "pcmpeqw (%2),%%mm3\n" | |
231 "pcmpeqw (%2),%%mm5\n" | |
232 "pandn %%mm2,%%mm3\n" | |
233 "pandn %%mm4,%%mm5\n" | |
234 "movq %%mm0,%%mm2\n" | |
235 "movq %%mm1,%%mm4\n" | |
236 "pcmpeqw %%mm1,%%mm2\n" | |
237 "pcmpeqw %%mm0,%%mm4\n" | |
238 "pandn %%mm3,%%mm2\n" | |
239 "pandn %%mm5,%%mm4\n" | |
240 "movq %%mm2,%%mm3\n" | |
241 "movq %%mm4,%%mm5\n" | |
242 "pand %%mm6,%%mm2\n" | |
243 "pand %%mm6,%%mm4\n" | |
244 "pandn %%mm7,%%mm3\n" | |
245 "pandn %%mm7,%%mm5\n" | |
246 "por %%mm3,%%mm2\n" | |
247 "por %%mm5,%%mm4\n" | |
248 | |
249 /* set *dst */ | |
250 "movq %%mm2,%%mm3\n" | |
251 "punpcklwd %%mm4,%%mm2\n" | |
252 "punpckhwd %%mm4,%%mm3\n" | |
253 "movq %%mm2,(%3)\n" | |
254 "movq %%mm3,8(%3)\n" | |
255 | |
256 /* next */ | |
257 "addl $8,%0\n" | |
258 "addl $8,%1\n" | |
259 "addl $8,%2\n" | |
260 "addl $16,%3\n" | |
261 | |
262 "decl %4\n" | |
263 "jnz 0b\n" | |
264 "1:\n" | |
265 | |
266 /* final run */ | |
267 /* set the current, current_pre, current_next registers */ | |
268 "movq (%1),%%mm1\n" | |
269 "movq (%1),%%mm7\n" | |
270 "movq -8(%1),%%mm0\n" | |
271 "psrlq $48,%%mm1\n" | |
272 "psrlq $48,%%mm0\n" | |
273 "psllq $48,%%mm1\n" | |
274 "movq %%mm7,%%mm2\n" | |
275 "movq %%mm7,%%mm3\n" | |
276 "psllq $16,%%mm2\n" | |
277 "psrlq $16,%%mm3\n" | |
278 "por %%mm2,%%mm0\n" | |
279 "por %%mm3,%%mm1\n" | |
280 | |
281 /* current_upper */ | |
282 "movq (%0),%%mm6\n" | |
283 | |
284 /* compute the upper-left pixel for dst on %%mm2 */ | |
285 /* compute the upper-right pixel for dst on %%mm4 */ | |
286 "movq %%mm0,%%mm2\n" | |
287 "movq %%mm1,%%mm4\n" | |
288 "movq %%mm0,%%mm3\n" | |
289 "movq %%mm1,%%mm5\n" | |
290 "pcmpeqw %%mm6,%%mm2\n" | |
291 "pcmpeqw %%mm6,%%mm4\n" | |
292 "pcmpeqw (%2),%%mm3\n" | |
293 "pcmpeqw (%2),%%mm5\n" | |
294 "pandn %%mm2,%%mm3\n" | |
295 "pandn %%mm4,%%mm5\n" | |
296 "movq %%mm0,%%mm2\n" | |
297 "movq %%mm1,%%mm4\n" | |
298 "pcmpeqw %%mm1,%%mm2\n" | |
299 "pcmpeqw %%mm0,%%mm4\n" | |
300 "pandn %%mm3,%%mm2\n" | |
301 "pandn %%mm5,%%mm4\n" | |
302 "movq %%mm2,%%mm3\n" | |
303 "movq %%mm4,%%mm5\n" | |
304 "pand %%mm6,%%mm2\n" | |
305 "pand %%mm6,%%mm4\n" | |
306 "pandn %%mm7,%%mm3\n" | |
307 "pandn %%mm7,%%mm5\n" | |
308 "por %%mm3,%%mm2\n" | |
309 "por %%mm5,%%mm4\n" | |
310 | |
311 /* set *dst */ | |
312 "movq %%mm2,%%mm3\n" | |
313 "punpcklwd %%mm4,%%mm2\n" | |
314 "punpckhwd %%mm4,%%mm3\n" | |
315 "movq %%mm2,(%3)\n" | |
316 "movq %%mm3,8(%3)\n" | |
317 "emms\n" | |
318 | |
319 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count) | |
320 : | |
321 : "cc" | |
322 ); | |
323 #else | |
324 __asm { | |
325 mov eax, src0; | |
326 mov ebx, src1; | |
327 mov ecx, src2; | |
328 mov edx, dst; | |
329 mov esi, count; | |
330 | |
331 /* first run */ | |
332 /* set the current, current_pre, current_next registers */ | |
333 movq mm0, qword ptr [ebx]; | |
334 movq mm7, qword ptr [ebx]; | |
335 movq mm1, qword ptr [ebx + 8]; | |
336 psllq mm0, 48; | |
337 psllq mm1, 48; | |
338 psrlq mm0, 48; | |
339 movq mm2, mm7; | |
340 movq mm3, mm7; | |
341 psllq mm2, 16; | |
342 psrlq mm3, 16; | |
343 por mm0, mm2; | |
344 por mm1, mm3; | |
345 | |
346 /* current_upper */ | |
347 movq mm6, qword ptr [eax]; | |
348 | |
349 /* compute the upper-left pixel for dst on %%mm2 */ | |
350 /* compute the upper-right pixel for dst on %%mm4 */ | |
351 movq mm2, mm0; | |
352 movq mm4, mm1; | |
353 movq mm3, mm0; | |
354 movq mm5, mm1; | |
355 pcmpeqw mm2, mm6; | |
356 pcmpeqw mm4, mm6; | |
357 pcmpeqw mm3, qword ptr [ecx]; | |
358 pcmpeqw mm5, qword ptr [ecx]; | |
359 pandn mm3, mm2; | |
360 pandn mm5, mm4; | |
361 movq mm2, mm0; | |
362 movq mm4, mm1; | |
363 pcmpeqw mm2, mm1; | |
364 pcmpeqw mm4, mm0; | |
365 pandn mm2, mm3; | |
366 pandn mm4, mm5; | |
367 movq mm3, mm2; | |
368 movq mm5, mm4; | |
369 pand mm2, mm6; | |
370 pand mm4, mm6; | |
371 pandn mm3, mm7; | |
372 pandn mm5, mm7; | |
373 por mm2, mm3; | |
374 por mm4, mm5; | |
375 | |
376 /* set *dst0 */ | |
377 movq mm3, mm2; | |
378 punpcklwd mm2, mm4; | |
379 punpckhwd mm3, mm4; | |
380 movq qword ptr [edx], mm2; | |
381 movq qword ptr [edx + 8], mm3; | |
382 | |
383 /* next */ | |
384 add eax, 8; | |
385 add ebx, 8; | |
386 add ecx, 8; | |
387 add edx, 16; | |
388 | |
389 /* central runs */ | |
390 shr esi, 2; | |
391 jz label1; | |
392 align 4; | |
393 label0: | |
394 | |
395 /* set the current, current_pre, current_next registers */ | |
396 movq mm0, qword ptr [ebx - 8]; | |
397 movq mm7, qword ptr [ebx]; | |
398 movq mm1, qword ptr [ebx + 8]; | |
399 psrlq mm0, 48; | |
400 psllq mm1, 48; | |
401 movq mm2, mm7; | |
402 movq mm3, mm7; | |
403 psllq mm2, 16; | |
404 psrlq mm3, 16; | |
405 por mm0, mm2; | |
406 por mm1, mm3; | |
407 | |
408 /* current_upper */ | |
409 movq mm6, qword ptr [eax]; | |
410 | |
411 /* compute the upper-left pixel for dst on %%mm2 */ | |
412 /* compute the upper-right pixel for dst on %%mm4 */ | |
413 movq mm2, mm0; | |
414 movq mm4, mm1; | |
415 movq mm3, mm0; | |
416 movq mm5, mm1; | |
417 pcmpeqw mm2, mm6; | |
418 pcmpeqw mm4, mm6; | |
419 pcmpeqw mm3, qword ptr [ecx]; | |
420 pcmpeqw mm5, qword ptr [ecx]; | |
421 pandn mm3, mm2; | |
422 pandn mm5, mm4; | |
423 movq mm2, mm0; | |
424 movq mm4, mm1; | |
425 pcmpeqw mm2, mm1; | |
426 pcmpeqw mm4, mm0; | |
427 pandn mm2, mm3; | |
428 pandn mm4, mm5; | |
429 movq mm3, mm2; | |
430 movq mm5, mm4; | |
431 pand mm2, mm6; | |
432 pand mm4, mm6; | |
433 pandn mm3, mm7; | |
434 pandn mm5, mm7; | |
435 por mm2, mm3; | |
436 por mm4, mm5; | |
437 | |
438 /* set *dst */ | |
439 movq mm3, mm2; | |
440 punpcklwd mm2, mm4; | |
441 punpckhwd mm3, mm4; | |
442 movq qword ptr [edx], mm2; | |
443 movq qword ptr [edx + 8], mm3; | |
444 | |
445 /* next */ | |
446 add eax, 8; | |
447 add ebx, 8; | |
448 add ecx, 8; | |
449 add edx, 16; | |
450 | |
451 dec esi; | |
452 jnz label0; | |
453 label1: | |
454 | |
455 /* final run */ | |
456 /* set the current, current_pre, current_next registers */ | |
457 movq mm1, qword ptr [ebx]; | |
458 movq mm7, qword ptr [ebx]; | |
459 movq mm0, qword ptr [ebx - 8]; | |
460 psrlq mm1, 48; | |
461 psrlq mm0, 48; | |
462 psllq mm1, 48; | |
463 movq mm2, mm7; | |
464 movq mm3, mm7; | |
465 psllq mm2, 16; | |
466 psrlq mm3, 16; | |
467 por mm0, mm2; | |
468 por mm1, mm3; | |
469 | |
470 /* current_upper */ | |
471 movq mm6, qword ptr [eax]; | |
472 | |
473 /* compute the upper-left pixel for dst on %%mm2 */ | |
474 /* compute the upper-right pixel for dst on %%mm4 */ | |
475 movq mm2, mm0; | |
476 movq mm4, mm1; | |
477 movq mm3, mm0; | |
478 movq mm5, mm1; | |
479 pcmpeqw mm2, mm6; | |
480 pcmpeqw mm4, mm6; | |
481 pcmpeqw mm3, qword ptr [ecx]; | |
482 pcmpeqw mm5, qword ptr [ecx]; | |
483 pandn mm3, mm2; | |
484 pandn mm5, mm4; | |
485 movq mm2, mm0; | |
486 movq mm4, mm1; | |
487 pcmpeqw mm2, mm1; | |
488 pcmpeqw mm4, mm0; | |
489 pandn mm2, mm3; | |
490 pandn mm4, mm5; | |
491 movq mm3, mm2; | |
492 movq mm5, mm4; | |
493 pand mm2, mm6; | |
494 pand mm4, mm6; | |
495 pandn mm3, mm7; | |
496 pandn mm5, mm7; | |
497 por mm2, mm3; | |
498 por mm4, mm5; | |
499 | |
500 /* set *dst */ | |
501 movq mm3, mm2; | |
502 punpcklwd mm2, mm4; | |
503 punpckhwd mm3, mm4; | |
504 movq qword ptr [edx], mm2; | |
505 movq qword ptr [edx + 8], mm3; | |
506 | |
507 mov src0, eax; | |
508 mov src1, ebx; | |
509 mov src2, ecx; | |
510 mov dst, edx; | |
511 mov count, esi; | |
512 | |
513 emms; | |
514 } | |
515 #endif | |
516 } | |
517 | |
518 static void internal_scale2x_32_mmx_single(u32 *dst, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count) | |
519 { | |
520 /* always do the first and last run */ | |
521 count -= 2 * 2; | |
522 | |
523 #ifdef __GNUC__ | |
524 __asm__ __volatile__ ( | |
525 /* first run */ | |
526 /* set the current, current_pre, current_next registers */ | |
527 "movq 0(%1),%%mm0\n" | |
528 "movq 0(%1),%%mm7\n" | |
529 "movq 8(%1),%%mm1\n" | |
530 "psllq $32,%%mm0\n" | |
531 "psllq $32,%%mm1\n" | |
532 "psrlq $32,%%mm0\n" | |
533 "movq %%mm7,%%mm2\n" | |
534 "movq %%mm7,%%mm3\n" | |
535 "psllq $32,%%mm2\n" | |
536 "psrlq $32,%%mm3\n" | |
537 "por %%mm2,%%mm0\n" | |
538 "por %%mm3,%%mm1\n" | |
539 | |
540 /* current_upper */ | |
541 "movq (%0),%%mm6\n" | |
542 | |
543 /* compute the upper-left pixel for dst on %%mm2 */ | |
544 /* compute the upper-right pixel for dst on %%mm4 */ | |
545 "movq %%mm0,%%mm2\n" | |
546 "movq %%mm1,%%mm4\n" | |
547 "movq %%mm0,%%mm3\n" | |
548 "movq %%mm1,%%mm5\n" | |
549 "pcmpeqd %%mm6,%%mm2\n" | |
550 "pcmpeqd %%mm6,%%mm4\n" | |
551 "pcmpeqd (%2),%%mm3\n" | |
552 "pcmpeqd (%2),%%mm5\n" | |
553 "pandn %%mm2,%%mm3\n" | |
554 "pandn %%mm4,%%mm5\n" | |
555 "movq %%mm0,%%mm2\n" | |
556 "movq %%mm1,%%mm4\n" | |
557 "pcmpeqd %%mm1,%%mm2\n" | |
558 "pcmpeqd %%mm0,%%mm4\n" | |
559 "pandn %%mm3,%%mm2\n" | |
560 "pandn %%mm5,%%mm4\n" | |
561 "movq %%mm2,%%mm3\n" | |
562 "movq %%mm4,%%mm5\n" | |
563 "pand %%mm6,%%mm2\n" | |
564 "pand %%mm6,%%mm4\n" | |
565 "pandn %%mm7,%%mm3\n" | |
566 "pandn %%mm7,%%mm5\n" | |
567 "por %%mm3,%%mm2\n" | |
568 "por %%mm5,%%mm4\n" | |
569 | |
570 /* set *dst */ | |
571 "movq %%mm2,%%mm3\n" | |
572 "punpckldq %%mm4,%%mm2\n" | |
573 "punpckhdq %%mm4,%%mm3\n" | |
574 "movq %%mm2,(%3)\n" | |
575 "movq %%mm3, 8(%3)\n" | |
576 | |
577 /* next */ | |
578 "addl $8,%0\n" | |
579 "addl $8,%1\n" | |
580 "addl $8,%2\n" | |
581 "addl $16,%3\n" | |
582 | |
583 /* central runs */ | |
584 "shrl $1,%4\n" | |
585 "jz 1f\n" | |
586 | |
587 "0:\n" | |
588 | |
589 /* set the current, current_pre, current_next registers */ | |
590 "movq -8(%1),%%mm0\n" | |
591 "movq (%1),%%mm7\n" | |
592 "movq 8(%1),%%mm1\n" | |
593 "psrlq $32,%%mm0\n" | |
594 "psllq $32,%%mm1\n" | |
595 "movq %%mm7,%%mm2\n" | |
596 "movq %%mm7,%%mm3\n" | |
597 "psllq $32,%%mm2\n" | |
598 "psrlq $32,%%mm3\n" | |
599 "por %%mm2,%%mm0\n" | |
600 "por %%mm3,%%mm1\n" | |
601 | |
602 /* current_upper */ | |
603 "movq (%0),%%mm6\n" | |
604 | |
605 /* compute the upper-left pixel for dst on %%mm2 */ | |
606 /* compute the upper-right pixel for dst on %%mm4 */ | |
607 "movq %%mm0,%%mm2\n" | |
608 "movq %%mm1,%%mm4\n" | |
609 "movq %%mm0,%%mm3\n" | |
610 "movq %%mm1,%%mm5\n" | |
611 "pcmpeqd %%mm6,%%mm2\n" | |
612 "pcmpeqd %%mm6,%%mm4\n" | |
613 "pcmpeqd (%2),%%mm3\n" | |
614 "pcmpeqd (%2),%%mm5\n" | |
615 "pandn %%mm2,%%mm3\n" | |
616 "pandn %%mm4,%%mm5\n" | |
617 "movq %%mm0,%%mm2\n" | |
618 "movq %%mm1,%%mm4\n" | |
619 "pcmpeqd %%mm1,%%mm2\n" | |
620 "pcmpeqd %%mm0,%%mm4\n" | |
621 "pandn %%mm3,%%mm2\n" | |
622 "pandn %%mm5,%%mm4\n" | |
623 "movq %%mm2,%%mm3\n" | |
624 "movq %%mm4,%%mm5\n" | |
625 "pand %%mm6,%%mm2\n" | |
626 "pand %%mm6,%%mm4\n" | |
627 "pandn %%mm7,%%mm3\n" | |
628 "pandn %%mm7,%%mm5\n" | |
629 "por %%mm3,%%mm2\n" | |
630 "por %%mm5,%%mm4\n" | |
631 | |
632 /* set *dst */ | |
633 "movq %%mm2,%%mm3\n" | |
634 "punpckldq %%mm4,%%mm2\n" | |
635 "punpckhdq %%mm4,%%mm3\n" | |
636 "movq %%mm2,(%3)\n" | |
637 "movq %%mm3,8(%3)\n" | |
638 | |
639 /* next */ | |
640 "addl $8,%0\n" | |
641 "addl $8,%1\n" | |
642 "addl $8,%2\n" | |
643 "addl $16,%3\n" | |
644 | |
645 "decl %4\n" | |
646 "jnz 0b\n" | |
647 "1:\n" | |
648 | |
649 /* final run */ | |
650 /* set the current, current_pre, current_next registers */ | |
651 "movq (%1),%%mm1\n" | |
652 "movq (%1),%%mm7\n" | |
653 "movq -8(%1), %%mm0\n" | |
654 "psrlq $32,%%mm1\n" | |
655 "psrlq $32,%%mm0\n" | |
656 "psllq $32,%%mm1\n" | |
657 "movq %%mm7,%%mm2\n" | |
658 "movq %%mm7,%%mm3\n" | |
659 "psllq $32,%%mm2\n" | |
660 "psrlq $32,%%mm3\n" | |
661 "por %%mm2,%%mm0\n" | |
662 "por %%mm3,%%mm1\n" | |
663 | |
664 /* current_upper */ | |
665 "movq (%0),%%mm6\n" | |
666 | |
667 /* compute the upper-left pixel for dst on %%mm2 */ | |
668 /* compute the upper-right pixel for dst on %%mm4 */ | |
669 "movq %%mm0,%%mm2\n" | |
670 "movq %%mm1,%%mm4\n" | |
671 "movq %%mm0,%%mm3\n" | |
672 "movq %%mm1,%%mm5\n" | |
673 "pcmpeqd %%mm6,%%mm2\n" | |
674 "pcmpeqd %%mm6,%%mm4\n" | |
675 "pcmpeqd (%2),%%mm3\n" | |
676 "pcmpeqd (%2),%%mm5\n" | |
677 "pandn %%mm2,%%mm3\n" | |
678 "pandn %%mm4,%%mm5\n" | |
679 "movq %%mm0,%%mm2\n" | |
680 "movq %%mm1,%%mm4\n" | |
681 "pcmpeqd %%mm1,%%mm2\n" | |
682 "pcmpeqd %%mm0,%%mm4\n" | |
683 "pandn %%mm3,%%mm2\n" | |
684 "pandn %%mm5,%%mm4\n" | |
685 "movq %%mm2,%%mm3\n" | |
686 "movq %%mm4,%%mm5\n" | |
687 "pand %%mm6,%%mm2\n" | |
688 "pand %%mm6,%%mm4\n" | |
689 "pandn %%mm7,%%mm3\n" | |
690 "pandn %%mm7,%%mm5\n" | |
691 "por %%mm3,%%mm2\n" | |
692 "por %%mm5,%%mm4\n" | |
693 | |
694 /* set *dst */ | |
695 "movq %%mm2,%%mm3\n" | |
696 "punpckldq %%mm4,%%mm2\n" | |
697 "punpckhdq %%mm4,%%mm3\n" | |
698 "movq %%mm2,(%3)\n" | |
699 "movq %%mm3,8(%3)\n" | |
700 "emms\n" | |
701 | |
702 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count) | |
703 : | |
704 : "cc" | |
705 ); | |
706 #else | |
707 __asm { | |
708 mov eax, src0; | |
709 mov ebx, src1; | |
710 mov ecx, src2; | |
711 mov edx, dst; | |
712 mov esi, count; | |
713 | |
714 /* first run */ | |
715 /* set the current, current_pre, current_next registers */ | |
716 movq mm0, qword ptr [ebx]; | |
717 movq mm7, qword ptr [ebx]; | |
718 movq mm1, qword ptr [ebx + 8]; | |
719 psllq mm0, 32; | |
720 psllq mm1, 32; | |
721 psrlq mm0, 32; | |
722 movq mm2, mm7; | |
723 movq mm3, mm7; | |
724 psllq mm2, 32; | |
725 psrlq mm3, 32; | |
726 por mm0, mm2; | |
727 por mm1, mm3; | |
728 | |
729 /* current_upper */ | |
730 movq mm6, qword ptr [eax]; | |
731 | |
732 /* compute the upper-left pixel for dst on %%mm2 */ | |
733 /* compute the upper-right pixel for dst on %%mm4 */ | |
734 movq mm2, mm0; | |
735 movq mm4, mm1; | |
736 movq mm3, mm0; | |
737 movq mm5, mm1; | |
738 pcmpeqd mm2, mm6; | |
739 pcmpeqd mm4, mm6; | |
740 pcmpeqd mm3, qword ptr [ecx]; | |
741 pcmpeqd mm5, qword ptr [ecx]; | |
742 pandn mm3, mm2; | |
743 pandn mm5, mm4; | |
744 movq mm2, mm0; | |
745 movq mm4, mm1; | |
746 pcmpeqd mm2, mm1; | |
747 pcmpeqd mm4, mm0; | |
748 pandn mm2, mm3; | |
749 pandn mm4, mm5; | |
750 movq mm3, mm2; | |
751 movq mm5, mm4; | |
752 pand mm2, mm6; | |
753 pand mm4, mm6; | |
754 pandn mm3, mm7; | |
755 pandn mm5, mm7; | |
756 por mm2, mm3; | |
757 por mm4, mm5; | |
758 | |
759 /* set *dst */ | |
760 movq mm3, mm2; | |
761 punpckldq mm2, mm4; | |
762 punpckhdq mm3, mm4; | |
763 movq qword ptr [edx], mm2; | |
764 movq qword ptr [edx + 8], mm3; | |
765 | |
766 /* next */ | |
767 add eax, 8; | |
768 add ebx, 8; | |
769 add ecx, 8; | |
770 add edx, 16; | |
771 | |
772 /* central runs */ | |
773 shr esi, 1; | |
774 jz label1; | |
775 label0: | |
776 | |
777 /* set the current, current_pre, current_next registers */ | |
778 movq mm0, qword ptr [ebx - 8]; | |
779 movq mm7, qword ptr [ebx]; | |
780 movq mm1, qword ptr [ebx + 8]; | |
781 psrlq mm0, 32; | |
782 psllq mm1, 32; | |
783 movq mm2, mm7; | |
784 movq mm3, mm7; | |
785 psllq mm2, 32; | |
786 psrlq mm3, 32; | |
787 por mm0, mm2; | |
788 por mm1, mm3; | |
789 | |
790 /* current_upper */ | |
791 movq mm6, qword ptr[eax]; | |
792 | |
793 /* compute the upper-left pixel for dst on %%mm2 */ | |
794 /* compute the upper-right pixel for dst on %%mm4 */ | |
795 movq mm2, mm0; | |
796 movq mm4, mm1; | |
797 movq mm3, mm0; | |
798 movq mm5, mm1; | |
799 pcmpeqd mm2, mm6; | |
800 pcmpeqd mm4, mm6; | |
801 pcmpeqd mm3, qword ptr[ecx]; | |
802 pcmpeqd mm5, qword ptr[ecx]; | |
803 pandn mm3, mm2; | |
804 pandn mm5, mm4; | |
805 movq mm2, mm0; | |
806 movq mm4, mm1; | |
807 pcmpeqd mm2, mm1; | |
808 pcmpeqd mm4, mm0; | |
809 pandn mm2, mm3; | |
810 pandn mm4, mm5; | |
811 movq mm3, mm2; | |
812 movq mm5, mm4; | |
813 pand mm2, mm6; | |
814 pand mm4, mm6; | |
815 pandn mm3, mm7; | |
816 pandn mm5, mm7; | |
817 por mm2, mm3; | |
818 por mm4, mm5; | |
819 | |
820 /* set *dst */ | |
821 movq mm3, mm2; | |
822 punpckldq mm2, mm4; | |
823 punpckhdq mm3, mm4; | |
824 movq qword ptr [edx], mm2; | |
825 movq qword ptr [edx + 8], mm3; | |
826 | |
827 /* next */ | |
828 add eax, 8; | |
829 add ebx, 8; | |
830 add ecx, 8; | |
831 add edx, 16; | |
832 | |
833 dec esi; | |
834 jnz label0; | |
835 label1: | |
836 | |
837 /* final run */ | |
838 /* set the current, current_pre, current_next registers */ | |
839 movq mm1, qword ptr [ebx]; | |
840 movq mm7, qword ptr [ebx]; | |
841 movq mm0, qword ptr [ebx - 8]; | |
842 psrlq mm1, 32; | |
843 psrlq mm0, 32; | |
844 psllq mm1, 32; | |
845 movq mm2, mm7; | |
846 movq mm3, mm7; | |
847 psllq mm2, 32; | |
848 psrlq mm3, 32; | |
849 por mm0, mm2; | |
850 por mm1, mm3; | |
851 | |
852 /* current_upper */ | |
853 movq mm6, qword ptr [eax]; | |
854 | |
855 /* compute the upper-left pixel for dst on %%mm2 */ | |
856 /* compute the upper-right pixel for dst on %%mm4 */ | |
857 movq mm2, mm0; | |
858 movq mm4, mm1; | |
859 movq mm3, mm0; | |
860 movq mm5, mm1; | |
861 pcmpeqd mm2, mm6; | |
862 pcmpeqd mm4, mm6; | |
863 pcmpeqd mm3, qword ptr [ecx]; | |
864 pcmpeqd mm5, qword ptr [ecx]; | |
865 pandn mm3, mm2; | |
866 pandn mm5, mm4; | |
867 movq mm2, mm0; | |
868 movq mm4, mm1; | |
869 pcmpeqd mm2, mm1; | |
870 pcmpeqd mm4, mm0; | |
871 pandn mm2, mm3; | |
872 pandn mm4, mm5; | |
873 movq mm3, mm2; | |
874 movq mm5, mm4; | |
875 pand mm2, mm6; | |
876 pand mm4, mm6; | |
877 pandn mm3, mm7; | |
878 pandn mm5, mm7; | |
879 por mm2, mm3; | |
880 por mm4, mm5; | |
881 | |
882 /* set *dst */ | |
883 movq mm3, mm2; | |
884 punpckldq mm2, mm4; | |
885 punpckhdq mm3, mm4; | |
886 movq qword ptr [edx], mm2; | |
887 movq qword ptr [edx + 8], mm3; | |
888 | |
889 mov src0, eax; | |
890 mov src1, ebx; | |
891 mov src2, ecx; | |
892 mov dst, edx; | |
893 mov count, esi; | |
894 | |
895 emms; | |
896 } | |
897 #endif | |
898 } | |
899 | |
900 static void internal_scale2x_16_mmx(u16 *dst0, u16 *dst1, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count) | |
901 { | |
902 // assert( count >= 2*4 ); | |
903 internal_scale2x_16_mmx_single(dst0, src0, src1, src2, count); | |
904 internal_scale2x_16_mmx_single(dst1, src2, src1, src0, count); | |
905 } | |
906 | |
907 static void internal_scale2x_32_mmx(u32 *dst0, u32 *dst1, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count) | |
908 { | |
909 // assert( count >= 2*2 ); | |
910 internal_scale2x_32_mmx_single(dst0, src0, src1, src2, count); | |
911 internal_scale2x_32_mmx_single(dst1, src2, src1, src0, count); | |
912 } | |
913 | |
914 #endif | |
915 | |
916 void AdMame2x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */, | |
917 u8 *dstPtr, u32 dstPitch, int width, int height) | |
918 { | |
919 u16 *dst0 = (u16 *)dstPtr; | |
920 u16 *dst1 = dst0 + (dstPitch >> 1); | |
921 | |
922 u16 *src0 = (u16 *)srcPtr; | |
923 u16 *src1 = src0 + (srcPitch >> 1); | |
924 u16 *src2 = src1 + (srcPitch >> 1); | |
925 #ifdef MMX | |
926 if (cpu_mmx) | |
927 { | |
928 internal_scale2x_16_mmx(dst0, dst1, src0, src0, src1, width); | |
929 | |
930 int count = height; | |
931 | |
932 count -= 2; | |
933 while (count) | |
934 { | |
935 dst0 += dstPitch; | |
936 dst1 += dstPitch; | |
937 internal_scale2x_16_mmx(dst0, dst1, src0, src1, src2, width); | |
938 src0 = src1; | |
939 src1 = src2; | |
940 src2 += srcPitch >> 1; | |
941 --count; | |
942 } | |
943 dst0 += dstPitch; | |
944 dst1 += dstPitch; | |
945 internal_scale2x_16_mmx(dst0, dst1, src0, src1, src1, width); | |
946 } | |
947 else | |
948 { | |
949 #endif | |
950 internal_scale2x_16_def(dst0, src0, src0, src1, width); | |
951 internal_scale2x_16_def(dst1, src1, src0, src0, width); | |
952 | |
953 int count = height; | |
954 | |
955 count -= 2; | |
956 while (count) | |
957 { | |
958 dst0 += dstPitch; | |
959 dst1 += dstPitch; | |
960 internal_scale2x_16_def(dst0, src0, src1, src2, width); | |
961 internal_scale2x_16_def(dst1, src2, src1, src0, width); | |
962 src0 = src1; | |
963 src1 = src2; | |
964 src2 += srcPitch >> 1; | |
965 --count; | |
966 } | |
967 dst0 += dstPitch; | |
968 dst1 += dstPitch; | |
969 internal_scale2x_16_def(dst0, src0, src1, src1, width); | |
970 internal_scale2x_16_def(dst1, src1, src1, src0, width); | |
971 #ifdef MMX | |
972 } | |
973 | |
974 #endif | |
975 } | |
976 | |
977 void AdMame2x32(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */, | |
978 u8 *dstPtr, u32 dstPitch, int width, int height) | |
979 { | |
980 u32 *dst0 = (u32 *)dstPtr; | |
981 u32 *dst1 = dst0 + (dstPitch >> 2); | |
982 | |
983 u32 *src0 = (u32 *)srcPtr; | |
984 u32 *src1 = src0 + (srcPitch >> 2); | |
985 u32 *src2 = src1 + (srcPitch >> 2); | |
986 #ifdef MMX | |
987 if (cpu_mmx) | |
988 { | |
989 internal_scale2x_32_mmx(dst0, dst1, src0, src0, src1, width); | |
990 | |
991 int count = height; | |
992 | |
993 count -= 2; | |
994 while (count) | |
995 { | |
996 dst0 += dstPitch >> 1; | |
997 dst1 += dstPitch >> 1; | |
998 internal_scale2x_32_mmx(dst0, dst1, src0, src1, src2, width); | |
999 src0 = src1; | |
1000 src1 = src2; | |
1001 src2 += srcPitch >> 2; | |
1002 --count; | |
1003 } | |
1004 dst0 += dstPitch >> 1; | |
1005 dst1 += dstPitch >> 1; | |
1006 internal_scale2x_32_mmx(dst0, dst1, src0, src1, src1, width); | |
1007 } | |
1008 else | |
1009 { | |
1010 #endif | |
1011 internal_scale2x_32_def(dst0, src0, src0, src1, width); | |
1012 internal_scale2x_32_def(dst1, src1, src0, src0, width); | |
1013 | |
1014 int count = height; | |
1015 | |
1016 count -= 2; | |
1017 while (count) | |
1018 { | |
1019 dst0 += dstPitch >> 1; | |
1020 dst1 += dstPitch >> 1; | |
1021 internal_scale2x_32_def(dst0, src0, src1, src2, width); | |
1022 internal_scale2x_32_def(dst1, src2, src1, src0, width); | |
1023 src0 = src1; | |
1024 src1 = src2; | |
1025 src2 += srcPitch >> 2; | |
1026 --count; | |
1027 } | |
1028 dst0 += dstPitch >> 1; | |
1029 dst1 += dstPitch >> 1; | |
1030 internal_scale2x_32_def(dst0, src0, src1, src1, width); | |
1031 internal_scale2x_32_def(dst1, src1, src1, src0, width); | |
1032 #ifdef MMX | |
1033 } | |
1034 | |
1035 #endif | |
1036 } |