rlm@1
|
1 /*
|
rlm@1
|
2 * This file is part of the Advance project.
|
rlm@1
|
3 *
|
rlm@1
|
4 * Copyright (C) 1999-2002 Andrea Mazzoleni
|
rlm@1
|
5 *
|
rlm@1
|
6 * This program is free software; you can redistribute it and/or modify
|
rlm@1
|
7 * it under the terms of the GNU General Public License as published by
|
rlm@1
|
8 * the Free Software Foundation; either version 2 of the License, or
|
rlm@1
|
9 * (at your option) any later version.
|
rlm@1
|
10 *
|
rlm@1
|
11 * This program is distributed in the hope that it will be useful,
|
rlm@1
|
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
rlm@1
|
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
rlm@1
|
14 * GNU General Public License for more details.
|
rlm@1
|
15 *
|
rlm@1
|
16 * You should have received a copy of the GNU General Public License
|
rlm@1
|
17 * along with this program; if not, write to the Free Software
|
rlm@1
|
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
rlm@1
|
19 *
|
rlm@1
|
20 * In addition, as a special exception, Andrea Mazzoleni
|
rlm@1
|
21 * gives permission to link the code of this program with
|
rlm@1
|
22 * the MAME library (or with modified versions of MAME that use the
|
rlm@1
|
23 * same license as MAME), and distribute linked combinations including
|
rlm@1
|
24 * the two. You must obey the GNU General Public License in all
|
rlm@1
|
25 * respects for all of the code used other than MAME. If you modify
|
rlm@1
|
26 * this file, you may extend this exception to your version of the
|
rlm@1
|
27 * file, but you are not obligated to do so. If you do not wish to
|
rlm@1
|
28 * do so, delete this exception statement from your version.
|
rlm@1
|
29 */
|
rlm@1
|
30
|
rlm@1
|
31 /*
|
rlm@1
|
32 * Alternatively at the previous license terms, you are allowed to use this
|
rlm@1
|
33 * code in your program with these conditions:
|
rlm@1
|
34 * - the program is not used in commercial activities.
|
rlm@1
|
35 * - the whole source code of the program is released with the binary.
|
rlm@1
|
36 */
|
rlm@1
|
37
|
rlm@1
|
38 #include "../Port.h"
|
rlm@1
|
39
|
rlm@1
|
40 #ifdef MMX
|
rlm@1
|
41 extern "C" bool cpu_mmx;
|
rlm@1
|
42 #endif
|
rlm@1
|
43
|
rlm@1
|
44 static void internal_scale2x_16_def(u16 *dst, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)
|
rlm@1
|
45 {
|
rlm@1
|
46 /* first pixel */
|
rlm@1
|
47 dst[0] = src1[0];
|
rlm@1
|
48 if (src1[1] == src0[0] && src2[0] != src0[0])
|
rlm@1
|
49 dst[1] = src0[0];
|
rlm@1
|
50 else
|
rlm@1
|
51 dst[1] = src1[0];
|
rlm@1
|
52 ++src0;
|
rlm@1
|
53 ++src1;
|
rlm@1
|
54 ++src2;
|
rlm@1
|
55 dst += 2;
|
rlm@1
|
56
|
rlm@1
|
57 /* central pixels */
|
rlm@1
|
58 count -= 2;
|
rlm@1
|
59 while (count)
|
rlm@1
|
60 {
|
rlm@1
|
61 if (src0[0] != src2[0] && src1[-1] != src1[1])
|
rlm@1
|
62 {
|
rlm@1
|
63 dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
|
rlm@1
|
64 dst[1] = src1[1] == src0[0] ? src0[0] : src1[0];
|
rlm@1
|
65 }
|
rlm@1
|
66 else
|
rlm@1
|
67 {
|
rlm@1
|
68 dst[0] = src1[0];
|
rlm@1
|
69 dst[1] = src1[0];
|
rlm@1
|
70 }
|
rlm@1
|
71
|
rlm@1
|
72 ++src0;
|
rlm@1
|
73 ++src1;
|
rlm@1
|
74 ++src2;
|
rlm@1
|
75 dst += 2;
|
rlm@1
|
76 --count;
|
rlm@1
|
77 }
|
rlm@1
|
78
|
rlm@1
|
79 /* last pixel */
|
rlm@1
|
80 if (src1[-1] == src0[0] && src2[0] != src0[0])
|
rlm@1
|
81 dst[0] = src0[0];
|
rlm@1
|
82 else
|
rlm@1
|
83 dst[0] = src1[0];
|
rlm@1
|
84 dst[1] = src1[0];
|
rlm@1
|
85 }
|
rlm@1
|
86
|
rlm@1
|
87 static void internal_scale2x_32_def(u32 *dst,
|
rlm@1
|
88 const u32 *src0,
|
rlm@1
|
89 const u32 *src1,
|
rlm@1
|
90 const u32 *src2,
|
rlm@1
|
91 unsigned count)
|
rlm@1
|
92 {
|
rlm@1
|
93 /* first pixel */
|
rlm@1
|
94 dst[0] = src1[0];
|
rlm@1
|
95 if (src1[1] == src0[0] && src2[0] != src0[0])
|
rlm@1
|
96 dst[1] = src0[0];
|
rlm@1
|
97 else
|
rlm@1
|
98 dst[1] = src1[0];
|
rlm@1
|
99 ++src0;
|
rlm@1
|
100 ++src1;
|
rlm@1
|
101 ++src2;
|
rlm@1
|
102 dst += 2;
|
rlm@1
|
103
|
rlm@1
|
104 /* central pixels */
|
rlm@1
|
105 count -= 2;
|
rlm@1
|
106 while (count)
|
rlm@1
|
107 {
|
rlm@1
|
108 if (src0[0] != src2[0] && src1[-1] != src1[1])
|
rlm@1
|
109 {
|
rlm@1
|
110 dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
|
rlm@1
|
111 dst[1] = src1[1] == src0[0] ? src0[0] : src1[0];
|
rlm@1
|
112 }
|
rlm@1
|
113 else
|
rlm@1
|
114 {
|
rlm@1
|
115 dst[0] = src1[0];
|
rlm@1
|
116 dst[1] = src1[0];
|
rlm@1
|
117 }
|
rlm@1
|
118
|
rlm@1
|
119 ++src0;
|
rlm@1
|
120 ++src1;
|
rlm@1
|
121 ++src2;
|
rlm@1
|
122 dst += 2;
|
rlm@1
|
123 --count;
|
rlm@1
|
124 }
|
rlm@1
|
125
|
rlm@1
|
126 /* last pixel */
|
rlm@1
|
127 if (src1[-1] == src0[0] && src2[0] != src0[0])
|
rlm@1
|
128 dst[0] = src0[0];
|
rlm@1
|
129 else
|
rlm@1
|
130 dst[0] = src1[0];
|
rlm@1
|
131 dst[1] = src1[0];
|
rlm@1
|
132 }
|
rlm@1
|
133
|
rlm@1
|
134 #ifdef MMX
|
rlm@1
|
135 static void internal_scale2x_16_mmx_single(u16 *dst, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)
|
rlm@1
|
136 {
|
rlm@1
|
137 /* always do the first and last run */
|
rlm@1
|
138 count -= 2 * 4;
|
rlm@1
|
139
|
rlm@1
|
140 #ifdef __GNUC__
|
rlm@1
|
141 __asm__ __volatile__ (
|
rlm@1
|
142 /* first run */
|
rlm@1
|
143 /* set the current, current_pre, current_next registers */
|
rlm@1
|
144 "movq 0(%1), %%mm0\n"
|
rlm@1
|
145 "movq 0(%1),%%mm7\n"
|
rlm@1
|
146 "movq 8(%1),%%mm1\n"
|
rlm@1
|
147 "psllq $48,%%mm0\n"
|
rlm@1
|
148 "psllq $48,%%mm1\n"
|
rlm@1
|
149 "psrlq $48, %%mm0\n"
|
rlm@1
|
150 "movq %%mm7,%%mm2\n"
|
rlm@1
|
151 "movq %%mm7,%%mm3\n"
|
rlm@1
|
152 "psllq $16,%%mm2\n"
|
rlm@1
|
153 "psrlq $16,%%mm3\n"
|
rlm@1
|
154 "por %%mm2,%%mm0\n"
|
rlm@1
|
155 "por %%mm3,%%mm1\n"
|
rlm@1
|
156
|
rlm@1
|
157 /* current_upper */
|
rlm@1
|
158 "movq (%0),%%mm6\n"
|
rlm@1
|
159
|
rlm@1
|
160 /* compute the upper-left pixel for dst on %%mm2 */
|
rlm@1
|
161 /* compute the upper-right pixel for dst on %%mm4 */
|
rlm@1
|
162 "movq %%mm0,%%mm2\n"
|
rlm@1
|
163 "movq %%mm1,%%mm4\n"
|
rlm@1
|
164 "movq %%mm0,%%mm3\n"
|
rlm@1
|
165 "movq %%mm1,%%mm5\n"
|
rlm@1
|
166 "pcmpeqw %%mm6,%%mm2\n"
|
rlm@1
|
167 "pcmpeqw %%mm6,%%mm4\n"
|
rlm@1
|
168 "pcmpeqw (%2),%%mm3\n"
|
rlm@1
|
169 "pcmpeqw (%2),%%mm5\n"
|
rlm@1
|
170 "pandn %%mm2,%%mm3\n"
|
rlm@1
|
171 "pandn %%mm4,%%mm5\n"
|
rlm@1
|
172 "movq %%mm0,%%mm2\n"
|
rlm@1
|
173 "movq %%mm1,%%mm4\n"
|
rlm@1
|
174 "pcmpeqw %%mm1,%%mm2\n"
|
rlm@1
|
175 "pcmpeqw %%mm0,%%mm4\n"
|
rlm@1
|
176 "pandn %%mm3,%%mm2\n"
|
rlm@1
|
177 "pandn %%mm5,%%mm4\n"
|
rlm@1
|
178 "movq %%mm2,%%mm3\n"
|
rlm@1
|
179 "movq %%mm4,%%mm5\n"
|
rlm@1
|
180 "pand %%mm6,%%mm2\n"
|
rlm@1
|
181 "pand %%mm6,%%mm4\n"
|
rlm@1
|
182 "pandn %%mm7,%%mm3\n"
|
rlm@1
|
183 "pandn %%mm7,%%mm5\n"
|
rlm@1
|
184 "por %%mm3,%%mm2\n"
|
rlm@1
|
185 "por %%mm5,%%mm4\n"
|
rlm@1
|
186
|
rlm@1
|
187 /* set *dst */
|
rlm@1
|
188 "movq %%mm2,%%mm3\n"
|
rlm@1
|
189 "punpcklwd %%mm4,%%mm2\n"
|
rlm@1
|
190 "punpckhwd %%mm4,%%mm3\n"
|
rlm@1
|
191 "movq %%mm2,(%3)\n"
|
rlm@1
|
192 "movq %%mm3,8(%3)\n"
|
rlm@1
|
193
|
rlm@1
|
194 /* next */
|
rlm@1
|
195 "addl $8,%0\n"
|
rlm@1
|
196 "addl $8,%1\n"
|
rlm@1
|
197 "addl $8,%2\n"
|
rlm@1
|
198 "addl $16,%3\n"
|
rlm@1
|
199
|
rlm@1
|
200 /* central runs */
|
rlm@1
|
201 "shrl $2,%4\n"
|
rlm@1
|
202 "jz 1f\n"
|
rlm@1
|
203
|
rlm@1
|
204 "0:\n"
|
rlm@1
|
205
|
rlm@1
|
206 /* set the current, current_pre, current_next registers */
|
rlm@1
|
207 "movq -8(%1),%%mm0\n"
|
rlm@1
|
208 "movq (%1),%%mm7\n"
|
rlm@1
|
209 "movq 8(%1),%%mm1\n"
|
rlm@1
|
210 "psrlq $48,%%mm0\n"
|
rlm@1
|
211 "psllq $48,%%mm1\n"
|
rlm@1
|
212 "movq %%mm7,%%mm2\n"
|
rlm@1
|
213 "movq %%mm7,%%mm3\n"
|
rlm@1
|
214 "psllq $16,%%mm2\n"
|
rlm@1
|
215 "psrlq $16,%%mm3\n"
|
rlm@1
|
216 "por %%mm2,%%mm0\n"
|
rlm@1
|
217 "por %%mm3,%%mm1\n"
|
rlm@1
|
218
|
rlm@1
|
219 /* current_upper */
|
rlm@1
|
220 "movq (%0),%%mm6\n"
|
rlm@1
|
221
|
rlm@1
|
222 /* compute the upper-left pixel for dst on %%mm2 */
|
rlm@1
|
223 /* compute the upper-right pixel for dst on %%mm4 */
|
rlm@1
|
224 "movq %%mm0,%%mm2\n"
|
rlm@1
|
225 "movq %%mm1,%%mm4\n"
|
rlm@1
|
226 "movq %%mm0,%%mm3\n"
|
rlm@1
|
227 "movq %%mm1,%%mm5\n"
|
rlm@1
|
228 "pcmpeqw %%mm6,%%mm2\n"
|
rlm@1
|
229 "pcmpeqw %%mm6,%%mm4\n"
|
rlm@1
|
230 "pcmpeqw (%2),%%mm3\n"
|
rlm@1
|
231 "pcmpeqw (%2),%%mm5\n"
|
rlm@1
|
232 "pandn %%mm2,%%mm3\n"
|
rlm@1
|
233 "pandn %%mm4,%%mm5\n"
|
rlm@1
|
234 "movq %%mm0,%%mm2\n"
|
rlm@1
|
235 "movq %%mm1,%%mm4\n"
|
rlm@1
|
236 "pcmpeqw %%mm1,%%mm2\n"
|
rlm@1
|
237 "pcmpeqw %%mm0,%%mm4\n"
|
rlm@1
|
238 "pandn %%mm3,%%mm2\n"
|
rlm@1
|
239 "pandn %%mm5,%%mm4\n"
|
rlm@1
|
240 "movq %%mm2,%%mm3\n"
|
rlm@1
|
241 "movq %%mm4,%%mm5\n"
|
rlm@1
|
242 "pand %%mm6,%%mm2\n"
|
rlm@1
|
243 "pand %%mm6,%%mm4\n"
|
rlm@1
|
244 "pandn %%mm7,%%mm3\n"
|
rlm@1
|
245 "pandn %%mm7,%%mm5\n"
|
rlm@1
|
246 "por %%mm3,%%mm2\n"
|
rlm@1
|
247 "por %%mm5,%%mm4\n"
|
rlm@1
|
248
|
rlm@1
|
249 /* set *dst */
|
rlm@1
|
250 "movq %%mm2,%%mm3\n"
|
rlm@1
|
251 "punpcklwd %%mm4,%%mm2\n"
|
rlm@1
|
252 "punpckhwd %%mm4,%%mm3\n"
|
rlm@1
|
253 "movq %%mm2,(%3)\n"
|
rlm@1
|
254 "movq %%mm3,8(%3)\n"
|
rlm@1
|
255
|
rlm@1
|
256 /* next */
|
rlm@1
|
257 "addl $8,%0\n"
|
rlm@1
|
258 "addl $8,%1\n"
|
rlm@1
|
259 "addl $8,%2\n"
|
rlm@1
|
260 "addl $16,%3\n"
|
rlm@1
|
261
|
rlm@1
|
262 "decl %4\n"
|
rlm@1
|
263 "jnz 0b\n"
|
rlm@1
|
264 "1:\n"
|
rlm@1
|
265
|
rlm@1
|
266 /* final run */
|
rlm@1
|
267 /* set the current, current_pre, current_next registers */
|
rlm@1
|
268 "movq (%1),%%mm1\n"
|
rlm@1
|
269 "movq (%1),%%mm7\n"
|
rlm@1
|
270 "movq -8(%1),%%mm0\n"
|
rlm@1
|
271 "psrlq $48,%%mm1\n"
|
rlm@1
|
272 "psrlq $48,%%mm0\n"
|
rlm@1
|
273 "psllq $48,%%mm1\n"
|
rlm@1
|
274 "movq %%mm7,%%mm2\n"
|
rlm@1
|
275 "movq %%mm7,%%mm3\n"
|
rlm@1
|
276 "psllq $16,%%mm2\n"
|
rlm@1
|
277 "psrlq $16,%%mm3\n"
|
rlm@1
|
278 "por %%mm2,%%mm0\n"
|
rlm@1
|
279 "por %%mm3,%%mm1\n"
|
rlm@1
|
280
|
rlm@1
|
281 /* current_upper */
|
rlm@1
|
282 "movq (%0),%%mm6\n"
|
rlm@1
|
283
|
rlm@1
|
284 /* compute the upper-left pixel for dst on %%mm2 */
|
rlm@1
|
285 /* compute the upper-right pixel for dst on %%mm4 */
|
rlm@1
|
286 "movq %%mm0,%%mm2\n"
|
rlm@1
|
287 "movq %%mm1,%%mm4\n"
|
rlm@1
|
288 "movq %%mm0,%%mm3\n"
|
rlm@1
|
289 "movq %%mm1,%%mm5\n"
|
rlm@1
|
290 "pcmpeqw %%mm6,%%mm2\n"
|
rlm@1
|
291 "pcmpeqw %%mm6,%%mm4\n"
|
rlm@1
|
292 "pcmpeqw (%2),%%mm3\n"
|
rlm@1
|
293 "pcmpeqw (%2),%%mm5\n"
|
rlm@1
|
294 "pandn %%mm2,%%mm3\n"
|
rlm@1
|
295 "pandn %%mm4,%%mm5\n"
|
rlm@1
|
296 "movq %%mm0,%%mm2\n"
|
rlm@1
|
297 "movq %%mm1,%%mm4\n"
|
rlm@1
|
298 "pcmpeqw %%mm1,%%mm2\n"
|
rlm@1
|
299 "pcmpeqw %%mm0,%%mm4\n"
|
rlm@1
|
300 "pandn %%mm3,%%mm2\n"
|
rlm@1
|
301 "pandn %%mm5,%%mm4\n"
|
rlm@1
|
302 "movq %%mm2,%%mm3\n"
|
rlm@1
|
303 "movq %%mm4,%%mm5\n"
|
rlm@1
|
304 "pand %%mm6,%%mm2\n"
|
rlm@1
|
305 "pand %%mm6,%%mm4\n"
|
rlm@1
|
306 "pandn %%mm7,%%mm3\n"
|
rlm@1
|
307 "pandn %%mm7,%%mm5\n"
|
rlm@1
|
308 "por %%mm3,%%mm2\n"
|
rlm@1
|
309 "por %%mm5,%%mm4\n"
|
rlm@1
|
310
|
rlm@1
|
311 /* set *dst */
|
rlm@1
|
312 "movq %%mm2,%%mm3\n"
|
rlm@1
|
313 "punpcklwd %%mm4,%%mm2\n"
|
rlm@1
|
314 "punpckhwd %%mm4,%%mm3\n"
|
rlm@1
|
315 "movq %%mm2,(%3)\n"
|
rlm@1
|
316 "movq %%mm3,8(%3)\n"
|
rlm@1
|
317 "emms\n"
|
rlm@1
|
318
|
rlm@1
|
319 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
|
rlm@1
|
320 :
|
rlm@1
|
321 : "cc"
|
rlm@1
|
322 );
|
rlm@1
|
323 #else
|
rlm@1
|
324 __asm {
|
rlm@1
|
325 mov eax, src0;
|
rlm@1
|
326 mov ebx, src1;
|
rlm@1
|
327 mov ecx, src2;
|
rlm@1
|
328 mov edx, dst;
|
rlm@1
|
329 mov esi, count;
|
rlm@1
|
330
|
rlm@1
|
331 /* first run */
|
rlm@1
|
332 /* set the current, current_pre, current_next registers */
|
rlm@1
|
333 movq mm0, qword ptr [ebx];
|
rlm@1
|
334 movq mm7, qword ptr [ebx];
|
rlm@1
|
335 movq mm1, qword ptr [ebx + 8];
|
rlm@1
|
336 psllq mm0, 48;
|
rlm@1
|
337 psllq mm1, 48;
|
rlm@1
|
338 psrlq mm0, 48;
|
rlm@1
|
339 movq mm2, mm7;
|
rlm@1
|
340 movq mm3, mm7;
|
rlm@1
|
341 psllq mm2, 16;
|
rlm@1
|
342 psrlq mm3, 16;
|
rlm@1
|
343 por mm0, mm2;
|
rlm@1
|
344 por mm1, mm3;
|
rlm@1
|
345
|
rlm@1
|
346 /* current_upper */
|
rlm@1
|
347 movq mm6, qword ptr [eax];
|
rlm@1
|
348
|
rlm@1
|
349 /* compute the upper-left pixel for dst on %%mm2 */
|
rlm@1
|
350 /* compute the upper-right pixel for dst on %%mm4 */
|
rlm@1
|
351 movq mm2, mm0;
|
rlm@1
|
352 movq mm4, mm1;
|
rlm@1
|
353 movq mm3, mm0;
|
rlm@1
|
354 movq mm5, mm1;
|
rlm@1
|
355 pcmpeqw mm2, mm6;
|
rlm@1
|
356 pcmpeqw mm4, mm6;
|
rlm@1
|
357 pcmpeqw mm3, qword ptr [ecx];
|
rlm@1
|
358 pcmpeqw mm5, qword ptr [ecx];
|
rlm@1
|
359 pandn mm3, mm2;
|
rlm@1
|
360 pandn mm5, mm4;
|
rlm@1
|
361 movq mm2, mm0;
|
rlm@1
|
362 movq mm4, mm1;
|
rlm@1
|
363 pcmpeqw mm2, mm1;
|
rlm@1
|
364 pcmpeqw mm4, mm0;
|
rlm@1
|
365 pandn mm2, mm3;
|
rlm@1
|
366 pandn mm4, mm5;
|
rlm@1
|
367 movq mm3, mm2;
|
rlm@1
|
368 movq mm5, mm4;
|
rlm@1
|
369 pand mm2, mm6;
|
rlm@1
|
370 pand mm4, mm6;
|
rlm@1
|
371 pandn mm3, mm7;
|
rlm@1
|
372 pandn mm5, mm7;
|
rlm@1
|
373 por mm2, mm3;
|
rlm@1
|
374 por mm4, mm5;
|
rlm@1
|
375
|
rlm@1
|
376 /* set *dst0 */
|
rlm@1
|
377 movq mm3, mm2;
|
rlm@1
|
378 punpcklwd mm2, mm4;
|
rlm@1
|
379 punpckhwd mm3, mm4;
|
rlm@1
|
380 movq qword ptr [edx], mm2;
|
rlm@1
|
381 movq qword ptr [edx + 8], mm3;
|
rlm@1
|
382
|
rlm@1
|
383 /* next */
|
rlm@1
|
384 add eax, 8;
|
rlm@1
|
385 add ebx, 8;
|
rlm@1
|
386 add ecx, 8;
|
rlm@1
|
387 add edx, 16;
|
rlm@1
|
388
|
rlm@1
|
389 /* central runs */
|
rlm@1
|
390 shr esi, 2;
|
rlm@1
|
391 jz label1;
|
rlm@1
|
392 align 4;
|
rlm@1
|
393 label0:
|
rlm@1
|
394
|
rlm@1
|
395 /* set the current, current_pre, current_next registers */
|
rlm@1
|
396 movq mm0, qword ptr [ebx - 8];
|
rlm@1
|
397 movq mm7, qword ptr [ebx];
|
rlm@1
|
398 movq mm1, qword ptr [ebx + 8];
|
rlm@1
|
399 psrlq mm0, 48;
|
rlm@1
|
400 psllq mm1, 48;
|
rlm@1
|
401 movq mm2, mm7;
|
rlm@1
|
402 movq mm3, mm7;
|
rlm@1
|
403 psllq mm2, 16;
|
rlm@1
|
404 psrlq mm3, 16;
|
rlm@1
|
405 por mm0, mm2;
|
rlm@1
|
406 por mm1, mm3;
|
rlm@1
|
407
|
rlm@1
|
408 /* current_upper */
|
rlm@1
|
409 movq mm6, qword ptr [eax];
|
rlm@1
|
410
|
rlm@1
|
411 /* compute the upper-left pixel for dst on %%mm2 */
|
rlm@1
|
412 /* compute the upper-right pixel for dst on %%mm4 */
|
rlm@1
|
413 movq mm2, mm0;
|
rlm@1
|
414 movq mm4, mm1;
|
rlm@1
|
415 movq mm3, mm0;
|
rlm@1
|
416 movq mm5, mm1;
|
rlm@1
|
417 pcmpeqw mm2, mm6;
|
rlm@1
|
418 pcmpeqw mm4, mm6;
|
rlm@1
|
419 pcmpeqw mm3, qword ptr [ecx];
|
rlm@1
|
420 pcmpeqw mm5, qword ptr [ecx];
|
rlm@1
|
421 pandn mm3, mm2;
|
rlm@1
|
422 pandn mm5, mm4;
|
rlm@1
|
423 movq mm2, mm0;
|
rlm@1
|
424 movq mm4, mm1;
|
rlm@1
|
425 pcmpeqw mm2, mm1;
|
rlm@1
|
426 pcmpeqw mm4, mm0;
|
rlm@1
|
427 pandn mm2, mm3;
|
rlm@1
|
428 pandn mm4, mm5;
|
rlm@1
|
429 movq mm3, mm2;
|
rlm@1
|
430 movq mm5, mm4;
|
rlm@1
|
431 pand mm2, mm6;
|
rlm@1
|
432 pand mm4, mm6;
|
rlm@1
|
433 pandn mm3, mm7;
|
rlm@1
|
434 pandn mm5, mm7;
|
rlm@1
|
435 por mm2, mm3;
|
rlm@1
|
436 por mm4, mm5;
|
rlm@1
|
437
|
rlm@1
|
438 /* set *dst */
|
rlm@1
|
439 movq mm3, mm2;
|
rlm@1
|
440 punpcklwd mm2, mm4;
|
rlm@1
|
441 punpckhwd mm3, mm4;
|
rlm@1
|
442 movq qword ptr [edx], mm2;
|
rlm@1
|
443 movq qword ptr [edx + 8], mm3;
|
rlm@1
|
444
|
rlm@1
|
445 /* next */
|
rlm@1
|
446 add eax, 8;
|
rlm@1
|
447 add ebx, 8;
|
rlm@1
|
448 add ecx, 8;
|
rlm@1
|
449 add edx, 16;
|
rlm@1
|
450
|
rlm@1
|
451 dec esi;
|
rlm@1
|
452 jnz label0;
|
rlm@1
|
453 label1:
|
rlm@1
|
454
|
rlm@1
|
455 /* final run */
|
rlm@1
|
456 /* set the current, current_pre, current_next registers */
|
rlm@1
|
457 movq mm1, qword ptr [ebx];
|
rlm@1
|
458 movq mm7, qword ptr [ebx];
|
rlm@1
|
459 movq mm0, qword ptr [ebx - 8];
|
rlm@1
|
460 psrlq mm1, 48;
|
rlm@1
|
461 psrlq mm0, 48;
|
rlm@1
|
462 psllq mm1, 48;
|
rlm@1
|
463 movq mm2, mm7;
|
rlm@1
|
464 movq mm3, mm7;
|
rlm@1
|
465 psllq mm2, 16;
|
rlm@1
|
466 psrlq mm3, 16;
|
rlm@1
|
467 por mm0, mm2;
|
rlm@1
|
468 por mm1, mm3;
|
rlm@1
|
469
|
rlm@1
|
470 /* current_upper */
|
rlm@1
|
471 movq mm6, qword ptr [eax];
|
rlm@1
|
472
|
rlm@1
|
473 /* compute the upper-left pixel for dst on %%mm2 */
|
rlm@1
|
474 /* compute the upper-right pixel for dst on %%mm4 */
|
rlm@1
|
475 movq mm2, mm0;
|
rlm@1
|
476 movq mm4, mm1;
|
rlm@1
|
477 movq mm3, mm0;
|
rlm@1
|
478 movq mm5, mm1;
|
rlm@1
|
479 pcmpeqw mm2, mm6;
|
rlm@1
|
480 pcmpeqw mm4, mm6;
|
rlm@1
|
481 pcmpeqw mm3, qword ptr [ecx];
|
rlm@1
|
482 pcmpeqw mm5, qword ptr [ecx];
|
rlm@1
|
483 pandn mm3, mm2;
|
rlm@1
|
484 pandn mm5, mm4;
|
rlm@1
|
485 movq mm2, mm0;
|
rlm@1
|
486 movq mm4, mm1;
|
rlm@1
|
487 pcmpeqw mm2, mm1;
|
rlm@1
|
488 pcmpeqw mm4, mm0;
|
rlm@1
|
489 pandn mm2, mm3;
|
rlm@1
|
490 pandn mm4, mm5;
|
rlm@1
|
491 movq mm3, mm2;
|
rlm@1
|
492 movq mm5, mm4;
|
rlm@1
|
493 pand mm2, mm6;
|
rlm@1
|
494 pand mm4, mm6;
|
rlm@1
|
495 pandn mm3, mm7;
|
rlm@1
|
496 pandn mm5, mm7;
|
rlm@1
|
497 por mm2, mm3;
|
rlm@1
|
498 por mm4, mm5;
|
rlm@1
|
499
|
rlm@1
|
500 /* set *dst */
|
rlm@1
|
501 movq mm3, mm2;
|
rlm@1
|
502 punpcklwd mm2, mm4;
|
rlm@1
|
503 punpckhwd mm3, mm4;
|
rlm@1
|
504 movq qword ptr [edx], mm2;
|
rlm@1
|
505 movq qword ptr [edx + 8], mm3;
|
rlm@1
|
506
|
rlm@1
|
507 mov src0, eax;
|
rlm@1
|
508 mov src1, ebx;
|
rlm@1
|
509 mov src2, ecx;
|
rlm@1
|
510 mov dst, edx;
|
rlm@1
|
511 mov count, esi;
|
rlm@1
|
512
|
rlm@1
|
513 emms;
|
rlm@1
|
514 }
|
rlm@1
|
515 #endif
|
rlm@1
|
516 }
|
rlm@1
|
517
|
rlm@1
|
518 static void internal_scale2x_32_mmx_single(u32 *dst, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count)
|
rlm@1
|
519 {
|
rlm@1
|
520 /* always do the first and last run */
|
rlm@1
|
521 count -= 2 * 2;
|
rlm@1
|
522
|
rlm@1
|
523 #ifdef __GNUC__
|
rlm@1
|
524 __asm__ __volatile__ (
|
rlm@1
|
525 /* first run */
|
rlm@1
|
526 /* set the current, current_pre, current_next registers */
|
rlm@1
|
527 "movq 0(%1),%%mm0\n"
|
rlm@1
|
528 "movq 0(%1),%%mm7\n"
|
rlm@1
|
529 "movq 8(%1),%%mm1\n"
|
rlm@1
|
530 "psllq $32,%%mm0\n"
|
rlm@1
|
531 "psllq $32,%%mm1\n"
|
rlm@1
|
532 "psrlq $32,%%mm0\n"
|
rlm@1
|
533 "movq %%mm7,%%mm2\n"
|
rlm@1
|
534 "movq %%mm7,%%mm3\n"
|
rlm@1
|
535 "psllq $32,%%mm2\n"
|
rlm@1
|
536 "psrlq $32,%%mm3\n"
|
rlm@1
|
537 "por %%mm2,%%mm0\n"
|
rlm@1
|
538 "por %%mm3,%%mm1\n"
|
rlm@1
|
539
|
rlm@1
|
540 /* current_upper */
|
rlm@1
|
541 "movq (%0),%%mm6\n"
|
rlm@1
|
542
|
rlm@1
|
543 /* compute the upper-left pixel for dst on %%mm2 */
|
rlm@1
|
544 /* compute the upper-right pixel for dst on %%mm4 */
|
rlm@1
|
545 "movq %%mm0,%%mm2\n"
|
rlm@1
|
546 "movq %%mm1,%%mm4\n"
|
rlm@1
|
547 "movq %%mm0,%%mm3\n"
|
rlm@1
|
548 "movq %%mm1,%%mm5\n"
|
rlm@1
|
549 "pcmpeqd %%mm6,%%mm2\n"
|
rlm@1
|
550 "pcmpeqd %%mm6,%%mm4\n"
|
rlm@1
|
551 "pcmpeqd (%2),%%mm3\n"
|
rlm@1
|
552 "pcmpeqd (%2),%%mm5\n"
|
rlm@1
|
553 "pandn %%mm2,%%mm3\n"
|
rlm@1
|
554 "pandn %%mm4,%%mm5\n"
|
rlm@1
|
555 "movq %%mm0,%%mm2\n"
|
rlm@1
|
556 "movq %%mm1,%%mm4\n"
|
rlm@1
|
557 "pcmpeqd %%mm1,%%mm2\n"
|
rlm@1
|
558 "pcmpeqd %%mm0,%%mm4\n"
|
rlm@1
|
559 "pandn %%mm3,%%mm2\n"
|
rlm@1
|
560 "pandn %%mm5,%%mm4\n"
|
rlm@1
|
561 "movq %%mm2,%%mm3\n"
|
rlm@1
|
562 "movq %%mm4,%%mm5\n"
|
rlm@1
|
563 "pand %%mm6,%%mm2\n"
|
rlm@1
|
564 "pand %%mm6,%%mm4\n"
|
rlm@1
|
565 "pandn %%mm7,%%mm3\n"
|
rlm@1
|
566 "pandn %%mm7,%%mm5\n"
|
rlm@1
|
567 "por %%mm3,%%mm2\n"
|
rlm@1
|
568 "por %%mm5,%%mm4\n"
|
rlm@1
|
569
|
rlm@1
|
570 /* set *dst */
|
rlm@1
|
571 "movq %%mm2,%%mm3\n"
|
rlm@1
|
572 "punpckldq %%mm4,%%mm2\n"
|
rlm@1
|
573 "punpckhdq %%mm4,%%mm3\n"
|
rlm@1
|
574 "movq %%mm2,(%3)\n"
|
rlm@1
|
575 "movq %%mm3, 8(%3)\n"
|
rlm@1
|
576
|
rlm@1
|
577 /* next */
|
rlm@1
|
578 "addl $8,%0\n"
|
rlm@1
|
579 "addl $8,%1\n"
|
rlm@1
|
580 "addl $8,%2\n"
|
rlm@1
|
581 "addl $16,%3\n"
|
rlm@1
|
582
|
rlm@1
|
583 /* central runs */
|
rlm@1
|
584 "shrl $1,%4\n"
|
rlm@1
|
585 "jz 1f\n"
|
rlm@1
|
586
|
rlm@1
|
587 "0:\n"
|
rlm@1
|
588
|
rlm@1
|
589 /* set the current, current_pre, current_next registers */
|
rlm@1
|
590 "movq -8(%1),%%mm0\n"
|
rlm@1
|
591 "movq (%1),%%mm7\n"
|
rlm@1
|
592 "movq 8(%1),%%mm1\n"
|
rlm@1
|
593 "psrlq $32,%%mm0\n"
|
rlm@1
|
594 "psllq $32,%%mm1\n"
|
rlm@1
|
595 "movq %%mm7,%%mm2\n"
|
rlm@1
|
596 "movq %%mm7,%%mm3\n"
|
rlm@1
|
597 "psllq $32,%%mm2\n"
|
rlm@1
|
598 "psrlq $32,%%mm3\n"
|
rlm@1
|
599 "por %%mm2,%%mm0\n"
|
rlm@1
|
600 "por %%mm3,%%mm1\n"
|
rlm@1
|
601
|
rlm@1
|
602 /* current_upper */
|
rlm@1
|
603 "movq (%0),%%mm6\n"
|
rlm@1
|
604
|
rlm@1
|
605 /* compute the upper-left pixel for dst on %%mm2 */
|
rlm@1
|
606 /* compute the upper-right pixel for dst on %%mm4 */
|
rlm@1
|
607 "movq %%mm0,%%mm2\n"
|
rlm@1
|
608 "movq %%mm1,%%mm4\n"
|
rlm@1
|
609 "movq %%mm0,%%mm3\n"
|
rlm@1
|
610 "movq %%mm1,%%mm5\n"
|
rlm@1
|
611 "pcmpeqd %%mm6,%%mm2\n"
|
rlm@1
|
612 "pcmpeqd %%mm6,%%mm4\n"
|
rlm@1
|
613 "pcmpeqd (%2),%%mm3\n"
|
rlm@1
|
614 "pcmpeqd (%2),%%mm5\n"
|
rlm@1
|
615 "pandn %%mm2,%%mm3\n"
|
rlm@1
|
616 "pandn %%mm4,%%mm5\n"
|
rlm@1
|
617 "movq %%mm0,%%mm2\n"
|
rlm@1
|
618 "movq %%mm1,%%mm4\n"
|
rlm@1
|
619 "pcmpeqd %%mm1,%%mm2\n"
|
rlm@1
|
620 "pcmpeqd %%mm0,%%mm4\n"
|
rlm@1
|
621 "pandn %%mm3,%%mm2\n"
|
rlm@1
|
622 "pandn %%mm5,%%mm4\n"
|
rlm@1
|
623 "movq %%mm2,%%mm3\n"
|
rlm@1
|
624 "movq %%mm4,%%mm5\n"
|
rlm@1
|
625 "pand %%mm6,%%mm2\n"
|
rlm@1
|
626 "pand %%mm6,%%mm4\n"
|
rlm@1
|
627 "pandn %%mm7,%%mm3\n"
|
rlm@1
|
628 "pandn %%mm7,%%mm5\n"
|
rlm@1
|
629 "por %%mm3,%%mm2\n"
|
rlm@1
|
630 "por %%mm5,%%mm4\n"
|
rlm@1
|
631
|
rlm@1
|
632 /* set *dst */
|
rlm@1
|
633 "movq %%mm2,%%mm3\n"
|
rlm@1
|
634 "punpckldq %%mm4,%%mm2\n"
|
rlm@1
|
635 "punpckhdq %%mm4,%%mm3\n"
|
rlm@1
|
636 "movq %%mm2,(%3)\n"
|
rlm@1
|
637 "movq %%mm3,8(%3)\n"
|
rlm@1
|
638
|
rlm@1
|
639 /* next */
|
rlm@1
|
640 "addl $8,%0\n"
|
rlm@1
|
641 "addl $8,%1\n"
|
rlm@1
|
642 "addl $8,%2\n"
|
rlm@1
|
643 "addl $16,%3\n"
|
rlm@1
|
644
|
rlm@1
|
645 "decl %4\n"
|
rlm@1
|
646 "jnz 0b\n"
|
rlm@1
|
647 "1:\n"
|
rlm@1
|
648
|
rlm@1
|
649 /* final run */
|
rlm@1
|
650 /* set the current, current_pre, current_next registers */
|
rlm@1
|
651 "movq (%1),%%mm1\n"
|
rlm@1
|
652 "movq (%1),%%mm7\n"
|
rlm@1
|
653 "movq -8(%1), %%mm0\n"
|
rlm@1
|
654 "psrlq $32,%%mm1\n"
|
rlm@1
|
655 "psrlq $32,%%mm0\n"
|
rlm@1
|
656 "psllq $32,%%mm1\n"
|
rlm@1
|
657 "movq %%mm7,%%mm2\n"
|
rlm@1
|
658 "movq %%mm7,%%mm3\n"
|
rlm@1
|
659 "psllq $32,%%mm2\n"
|
rlm@1
|
660 "psrlq $32,%%mm3\n"
|
rlm@1
|
661 "por %%mm2,%%mm0\n"
|
rlm@1
|
662 "por %%mm3,%%mm1\n"
|
rlm@1
|
663
|
rlm@1
|
664 /* current_upper */
|
rlm@1
|
665 "movq (%0),%%mm6\n"
|
rlm@1
|
666
|
rlm@1
|
667 /* compute the upper-left pixel for dst on %%mm2 */
|
rlm@1
|
668 /* compute the upper-right pixel for dst on %%mm4 */
|
rlm@1
|
669 "movq %%mm0,%%mm2\n"
|
rlm@1
|
670 "movq %%mm1,%%mm4\n"
|
rlm@1
|
671 "movq %%mm0,%%mm3\n"
|
rlm@1
|
672 "movq %%mm1,%%mm5\n"
|
rlm@1
|
673 "pcmpeqd %%mm6,%%mm2\n"
|
rlm@1
|
674 "pcmpeqd %%mm6,%%mm4\n"
|
rlm@1
|
675 "pcmpeqd (%2),%%mm3\n"
|
rlm@1
|
676 "pcmpeqd (%2),%%mm5\n"
|
rlm@1
|
677 "pandn %%mm2,%%mm3\n"
|
rlm@1
|
678 "pandn %%mm4,%%mm5\n"
|
rlm@1
|
679 "movq %%mm0,%%mm2\n"
|
rlm@1
|
680 "movq %%mm1,%%mm4\n"
|
rlm@1
|
681 "pcmpeqd %%mm1,%%mm2\n"
|
rlm@1
|
682 "pcmpeqd %%mm0,%%mm4\n"
|
rlm@1
|
683 "pandn %%mm3,%%mm2\n"
|
rlm@1
|
684 "pandn %%mm5,%%mm4\n"
|
rlm@1
|
685 "movq %%mm2,%%mm3\n"
|
rlm@1
|
686 "movq %%mm4,%%mm5\n"
|
rlm@1
|
687 "pand %%mm6,%%mm2\n"
|
rlm@1
|
688 "pand %%mm6,%%mm4\n"
|
rlm@1
|
689 "pandn %%mm7,%%mm3\n"
|
rlm@1
|
690 "pandn %%mm7,%%mm5\n"
|
rlm@1
|
691 "por %%mm3,%%mm2\n"
|
rlm@1
|
692 "por %%mm5,%%mm4\n"
|
rlm@1
|
693
|
rlm@1
|
694 /* set *dst */
|
rlm@1
|
695 "movq %%mm2,%%mm3\n"
|
rlm@1
|
696 "punpckldq %%mm4,%%mm2\n"
|
rlm@1
|
697 "punpckhdq %%mm4,%%mm3\n"
|
rlm@1
|
698 "movq %%mm2,(%3)\n"
|
rlm@1
|
699 "movq %%mm3,8(%3)\n"
|
rlm@1
|
700 "emms\n"
|
rlm@1
|
701
|
rlm@1
|
702 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
|
rlm@1
|
703 :
|
rlm@1
|
704 : "cc"
|
rlm@1
|
705 );
|
rlm@1
|
706 #else
|
rlm@1
|
707 __asm {
|
rlm@1
|
708 mov eax, src0;
|
rlm@1
|
709 mov ebx, src1;
|
rlm@1
|
710 mov ecx, src2;
|
rlm@1
|
711 mov edx, dst;
|
rlm@1
|
712 mov esi, count;
|
rlm@1
|
713
|
rlm@1
|
714 /* first run */
|
rlm@1
|
715 /* set the current, current_pre, current_next registers */
|
rlm@1
|
716 movq mm0, qword ptr [ebx];
|
rlm@1
|
717 movq mm7, qword ptr [ebx];
|
rlm@1
|
718 movq mm1, qword ptr [ebx + 8];
|
rlm@1
|
719 psllq mm0, 32;
|
rlm@1
|
720 psllq mm1, 32;
|
rlm@1
|
721 psrlq mm0, 32;
|
rlm@1
|
722 movq mm2, mm7;
|
rlm@1
|
723 movq mm3, mm7;
|
rlm@1
|
724 psllq mm2, 32;
|
rlm@1
|
725 psrlq mm3, 32;
|
rlm@1
|
726 por mm0, mm2;
|
rlm@1
|
727 por mm1, mm3;
|
rlm@1
|
728
|
rlm@1
|
729 /* current_upper */
|
rlm@1
|
730 movq mm6, qword ptr [eax];
|
rlm@1
|
731
|
rlm@1
|
732 /* compute the upper-left pixel for dst on %%mm2 */
|
rlm@1
|
733 /* compute the upper-right pixel for dst on %%mm4 */
|
rlm@1
|
734 movq mm2, mm0;
|
rlm@1
|
735 movq mm4, mm1;
|
rlm@1
|
736 movq mm3, mm0;
|
rlm@1
|
737 movq mm5, mm1;
|
rlm@1
|
738 pcmpeqd mm2, mm6;
|
rlm@1
|
739 pcmpeqd mm4, mm6;
|
rlm@1
|
740 pcmpeqd mm3, qword ptr [ecx];
|
rlm@1
|
741 pcmpeqd mm5, qword ptr [ecx];
|
rlm@1
|
742 pandn mm3, mm2;
|
rlm@1
|
743 pandn mm5, mm4;
|
rlm@1
|
744 movq mm2, mm0;
|
rlm@1
|
745 movq mm4, mm1;
|
rlm@1
|
746 pcmpeqd mm2, mm1;
|
rlm@1
|
747 pcmpeqd mm4, mm0;
|
rlm@1
|
748 pandn mm2, mm3;
|
rlm@1
|
749 pandn mm4, mm5;
|
rlm@1
|
750 movq mm3, mm2;
|
rlm@1
|
751 movq mm5, mm4;
|
rlm@1
|
752 pand mm2, mm6;
|
rlm@1
|
753 pand mm4, mm6;
|
rlm@1
|
754 pandn mm3, mm7;
|
rlm@1
|
755 pandn mm5, mm7;
|
rlm@1
|
756 por mm2, mm3;
|
rlm@1
|
757 por mm4, mm5;
|
rlm@1
|
758
|
rlm@1
|
759 /* set *dst */
|
rlm@1
|
760 movq mm3, mm2;
|
rlm@1
|
761 punpckldq mm2, mm4;
|
rlm@1
|
762 punpckhdq mm3, mm4;
|
rlm@1
|
763 movq qword ptr [edx], mm2;
|
rlm@1
|
764 movq qword ptr [edx + 8], mm3;
|
rlm@1
|
765
|
rlm@1
|
766 /* next */
|
rlm@1
|
767 add eax, 8;
|
rlm@1
|
768 add ebx, 8;
|
rlm@1
|
769 add ecx, 8;
|
rlm@1
|
770 add edx, 16;
|
rlm@1
|
771
|
rlm@1
|
772 /* central runs */
|
rlm@1
|
773 shr esi, 1;
|
rlm@1
|
774 jz label1;
|
rlm@1
|
775 label0:
|
rlm@1
|
776
|
rlm@1
|
777 /* set the current, current_pre, current_next registers */
|
rlm@1
|
778 movq mm0, qword ptr [ebx - 8];
|
rlm@1
|
779 movq mm7, qword ptr [ebx];
|
rlm@1
|
780 movq mm1, qword ptr [ebx + 8];
|
rlm@1
|
781 psrlq mm0, 32;
|
rlm@1
|
782 psllq mm1, 32;
|
rlm@1
|
783 movq mm2, mm7;
|
rlm@1
|
784 movq mm3, mm7;
|
rlm@1
|
785 psllq mm2, 32;
|
rlm@1
|
786 psrlq mm3, 32;
|
rlm@1
|
787 por mm0, mm2;
|
rlm@1
|
788 por mm1, mm3;
|
rlm@1
|
789
|
rlm@1
|
790 /* current_upper */
|
rlm@1
|
791 movq mm6, qword ptr[eax];
|
rlm@1
|
792
|
rlm@1
|
793 /* compute the upper-left pixel for dst on %%mm2 */
|
rlm@1
|
794 /* compute the upper-right pixel for dst on %%mm4 */
|
rlm@1
|
795 movq mm2, mm0;
|
rlm@1
|
796 movq mm4, mm1;
|
rlm@1
|
797 movq mm3, mm0;
|
rlm@1
|
798 movq mm5, mm1;
|
rlm@1
|
799 pcmpeqd mm2, mm6;
|
rlm@1
|
800 pcmpeqd mm4, mm6;
|
rlm@1
|
801 pcmpeqd mm3, qword ptr[ecx];
|
rlm@1
|
802 pcmpeqd mm5, qword ptr[ecx];
|
rlm@1
|
803 pandn mm3, mm2;
|
rlm@1
|
804 pandn mm5, mm4;
|
rlm@1
|
805 movq mm2, mm0;
|
rlm@1
|
806 movq mm4, mm1;
|
rlm@1
|
807 pcmpeqd mm2, mm1;
|
rlm@1
|
808 pcmpeqd mm4, mm0;
|
rlm@1
|
809 pandn mm2, mm3;
|
rlm@1
|
810 pandn mm4, mm5;
|
rlm@1
|
811 movq mm3, mm2;
|
rlm@1
|
812 movq mm5, mm4;
|
rlm@1
|
813 pand mm2, mm6;
|
rlm@1
|
814 pand mm4, mm6;
|
rlm@1
|
815 pandn mm3, mm7;
|
rlm@1
|
816 pandn mm5, mm7;
|
rlm@1
|
817 por mm2, mm3;
|
rlm@1
|
818 por mm4, mm5;
|
rlm@1
|
819
|
rlm@1
|
820 /* set *dst */
|
rlm@1
|
821 movq mm3, mm2;
|
rlm@1
|
822 punpckldq mm2, mm4;
|
rlm@1
|
823 punpckhdq mm3, mm4;
|
rlm@1
|
824 movq qword ptr [edx], mm2;
|
rlm@1
|
825 movq qword ptr [edx + 8], mm3;
|
rlm@1
|
826
|
rlm@1
|
827 /* next */
|
rlm@1
|
828 add eax, 8;
|
rlm@1
|
829 add ebx, 8;
|
rlm@1
|
830 add ecx, 8;
|
rlm@1
|
831 add edx, 16;
|
rlm@1
|
832
|
rlm@1
|
833 dec esi;
|
rlm@1
|
834 jnz label0;
|
rlm@1
|
835 label1:
|
rlm@1
|
836
|
rlm@1
|
837 /* final run */
|
rlm@1
|
838 /* set the current, current_pre, current_next registers */
|
rlm@1
|
839 movq mm1, qword ptr [ebx];
|
rlm@1
|
840 movq mm7, qword ptr [ebx];
|
rlm@1
|
841 movq mm0, qword ptr [ebx - 8];
|
rlm@1
|
842 psrlq mm1, 32;
|
rlm@1
|
843 psrlq mm0, 32;
|
rlm@1
|
844 psllq mm1, 32;
|
rlm@1
|
845 movq mm2, mm7;
|
rlm@1
|
846 movq mm3, mm7;
|
rlm@1
|
847 psllq mm2, 32;
|
rlm@1
|
848 psrlq mm3, 32;
|
rlm@1
|
849 por mm0, mm2;
|
rlm@1
|
850 por mm1, mm3;
|
rlm@1
|
851
|
rlm@1
|
852 /* current_upper */
|
rlm@1
|
853 movq mm6, qword ptr [eax];
|
rlm@1
|
854
|
rlm@1
|
855 /* compute the upper-left pixel for dst on %%mm2 */
|
rlm@1
|
856 /* compute the upper-right pixel for dst on %%mm4 */
|
rlm@1
|
857 movq mm2, mm0;
|
rlm@1
|
858 movq mm4, mm1;
|
rlm@1
|
859 movq mm3, mm0;
|
rlm@1
|
860 movq mm5, mm1;
|
rlm@1
|
861 pcmpeqd mm2, mm6;
|
rlm@1
|
862 pcmpeqd mm4, mm6;
|
rlm@1
|
863 pcmpeqd mm3, qword ptr [ecx];
|
rlm@1
|
864 pcmpeqd mm5, qword ptr [ecx];
|
rlm@1
|
865 pandn mm3, mm2;
|
rlm@1
|
866 pandn mm5, mm4;
|
rlm@1
|
867 movq mm2, mm0;
|
rlm@1
|
868 movq mm4, mm1;
|
rlm@1
|
869 pcmpeqd mm2, mm1;
|
rlm@1
|
870 pcmpeqd mm4, mm0;
|
rlm@1
|
871 pandn mm2, mm3;
|
rlm@1
|
872 pandn mm4, mm5;
|
rlm@1
|
873 movq mm3, mm2;
|
rlm@1
|
874 movq mm5, mm4;
|
rlm@1
|
875 pand mm2, mm6;
|
rlm@1
|
876 pand mm4, mm6;
|
rlm@1
|
877 pandn mm3, mm7;
|
rlm@1
|
878 pandn mm5, mm7;
|
rlm@1
|
879 por mm2, mm3;
|
rlm@1
|
880 por mm4, mm5;
|
rlm@1
|
881
|
rlm@1
|
882 /* set *dst */
|
rlm@1
|
883 movq mm3, mm2;
|
rlm@1
|
884 punpckldq mm2, mm4;
|
rlm@1
|
885 punpckhdq mm3, mm4;
|
rlm@1
|
886 movq qword ptr [edx], mm2;
|
rlm@1
|
887 movq qword ptr [edx + 8], mm3;
|
rlm@1
|
888
|
rlm@1
|
889 mov src0, eax;
|
rlm@1
|
890 mov src1, ebx;
|
rlm@1
|
891 mov src2, ecx;
|
rlm@1
|
892 mov dst, edx;
|
rlm@1
|
893 mov count, esi;
|
rlm@1
|
894
|
rlm@1
|
895 emms;
|
rlm@1
|
896 }
|
rlm@1
|
897 #endif
|
rlm@1
|
898 }
|
rlm@1
|
899
|
rlm@1
|
900 static void internal_scale2x_16_mmx(u16 *dst0, u16 *dst1, const u16 *src0, const u16 *src1, const u16 *src2, unsigned count)
|
rlm@1
|
901 {
|
rlm@1
|
902 // assert( count >= 2*4 );
|
rlm@1
|
903 internal_scale2x_16_mmx_single(dst0, src0, src1, src2, count);
|
rlm@1
|
904 internal_scale2x_16_mmx_single(dst1, src2, src1, src0, count);
|
rlm@1
|
905 }
|
rlm@1
|
906
|
rlm@1
|
907 static void internal_scale2x_32_mmx(u32 *dst0, u32 *dst1, const u32 *src0, const u32 *src1, const u32 *src2, unsigned count)
|
rlm@1
|
908 {
|
rlm@1
|
909 // assert( count >= 2*2 );
|
rlm@1
|
910 internal_scale2x_32_mmx_single(dst0, src0, src1, src2, count);
|
rlm@1
|
911 internal_scale2x_32_mmx_single(dst1, src2, src1, src0, count);
|
rlm@1
|
912 }
|
rlm@1
|
913
|
rlm@1
|
914 #endif
|
rlm@1
|
915
|
rlm@1
|
916 void AdMame2x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
|
rlm@1
|
917 u8 *dstPtr, u32 dstPitch, int width, int height)
|
rlm@1
|
918 {
|
rlm@1
|
919 u16 *dst0 = (u16 *)dstPtr;
|
rlm@1
|
920 u16 *dst1 = dst0 + (dstPitch >> 1);
|
rlm@1
|
921
|
rlm@1
|
922 u16 *src0 = (u16 *)srcPtr;
|
rlm@1
|
923 u16 *src1 = src0 + (srcPitch >> 1);
|
rlm@1
|
924 u16 *src2 = src1 + (srcPitch >> 1);
|
rlm@1
|
925 #ifdef MMX
|
rlm@1
|
926 if (cpu_mmx)
|
rlm@1
|
927 {
|
rlm@1
|
928 internal_scale2x_16_mmx(dst0, dst1, src0, src0, src1, width);
|
rlm@1
|
929
|
rlm@1
|
930 int count = height;
|
rlm@1
|
931
|
rlm@1
|
932 count -= 2;
|
rlm@1
|
933 while (count)
|
rlm@1
|
934 {
|
rlm@1
|
935 dst0 += dstPitch;
|
rlm@1
|
936 dst1 += dstPitch;
|
rlm@1
|
937 internal_scale2x_16_mmx(dst0, dst1, src0, src1, src2, width);
|
rlm@1
|
938 src0 = src1;
|
rlm@1
|
939 src1 = src2;
|
rlm@1
|
940 src2 += srcPitch >> 1;
|
rlm@1
|
941 --count;
|
rlm@1
|
942 }
|
rlm@1
|
943 dst0 += dstPitch;
|
rlm@1
|
944 dst1 += dstPitch;
|
rlm@1
|
945 internal_scale2x_16_mmx(dst0, dst1, src0, src1, src1, width);
|
rlm@1
|
946 }
|
rlm@1
|
947 else
|
rlm@1
|
948 {
|
rlm@1
|
949 #endif
|
rlm@1
|
950 internal_scale2x_16_def(dst0, src0, src0, src1, width);
|
rlm@1
|
951 internal_scale2x_16_def(dst1, src1, src0, src0, width);
|
rlm@1
|
952
|
rlm@1
|
953 int count = height;
|
rlm@1
|
954
|
rlm@1
|
955 count -= 2;
|
rlm@1
|
956 while (count)
|
rlm@1
|
957 {
|
rlm@1
|
958 dst0 += dstPitch;
|
rlm@1
|
959 dst1 += dstPitch;
|
rlm@1
|
960 internal_scale2x_16_def(dst0, src0, src1, src2, width);
|
rlm@1
|
961 internal_scale2x_16_def(dst1, src2, src1, src0, width);
|
rlm@1
|
962 src0 = src1;
|
rlm@1
|
963 src1 = src2;
|
rlm@1
|
964 src2 += srcPitch >> 1;
|
rlm@1
|
965 --count;
|
rlm@1
|
966 }
|
rlm@1
|
967 dst0 += dstPitch;
|
rlm@1
|
968 dst1 += dstPitch;
|
rlm@1
|
969 internal_scale2x_16_def(dst0, src0, src1, src1, width);
|
rlm@1
|
970 internal_scale2x_16_def(dst1, src1, src1, src0, width);
|
rlm@1
|
971 #ifdef MMX
|
rlm@1
|
972 }
|
rlm@1
|
973
|
rlm@1
|
974 #endif
|
rlm@1
|
975 }
|
rlm@1
|
976
|
rlm@1
|
977 void AdMame2x32(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
|
rlm@1
|
978 u8 *dstPtr, u32 dstPitch, int width, int height)
|
rlm@1
|
979 {
|
rlm@1
|
980 u32 *dst0 = (u32 *)dstPtr;
|
rlm@1
|
981 u32 *dst1 = dst0 + (dstPitch >> 2);
|
rlm@1
|
982
|
rlm@1
|
983 u32 *src0 = (u32 *)srcPtr;
|
rlm@1
|
984 u32 *src1 = src0 + (srcPitch >> 2);
|
rlm@1
|
985 u32 *src2 = src1 + (srcPitch >> 2);
|
rlm@1
|
986 #ifdef MMX
|
rlm@1
|
987 if (cpu_mmx)
|
rlm@1
|
988 {
|
rlm@1
|
989 internal_scale2x_32_mmx(dst0, dst1, src0, src0, src1, width);
|
rlm@1
|
990
|
rlm@1
|
991 int count = height;
|
rlm@1
|
992
|
rlm@1
|
993 count -= 2;
|
rlm@1
|
994 while (count)
|
rlm@1
|
995 {
|
rlm@1
|
996 dst0 += dstPitch >> 1;
|
rlm@1
|
997 dst1 += dstPitch >> 1;
|
rlm@1
|
998 internal_scale2x_32_mmx(dst0, dst1, src0, src1, src2, width);
|
rlm@1
|
999 src0 = src1;
|
rlm@1
|
1000 src1 = src2;
|
rlm@1
|
1001 src2 += srcPitch >> 2;
|
rlm@1
|
1002 --count;
|
rlm@1
|
1003 }
|
rlm@1
|
1004 dst0 += dstPitch >> 1;
|
rlm@1
|
1005 dst1 += dstPitch >> 1;
|
rlm@1
|
1006 internal_scale2x_32_mmx(dst0, dst1, src0, src1, src1, width);
|
rlm@1
|
1007 }
|
rlm@1
|
1008 else
|
rlm@1
|
1009 {
|
rlm@1
|
1010 #endif
|
rlm@1
|
1011 internal_scale2x_32_def(dst0, src0, src0, src1, width);
|
rlm@1
|
1012 internal_scale2x_32_def(dst1, src1, src0, src0, width);
|
rlm@1
|
1013
|
rlm@1
|
1014 int count = height;
|
rlm@1
|
1015
|
rlm@1
|
1016 count -= 2;
|
rlm@1
|
1017 while (count)
|
rlm@1
|
1018 {
|
rlm@1
|
1019 dst0 += dstPitch >> 1;
|
rlm@1
|
1020 dst1 += dstPitch >> 1;
|
rlm@1
|
1021 internal_scale2x_32_def(dst0, src0, src1, src2, width);
|
rlm@1
|
1022 internal_scale2x_32_def(dst1, src2, src1, src0, width);
|
rlm@1
|
1023 src0 = src1;
|
rlm@1
|
1024 src1 = src2;
|
rlm@1
|
1025 src2 += srcPitch >> 2;
|
rlm@1
|
1026 --count;
|
rlm@1
|
1027 }
|
rlm@1
|
1028 dst0 += dstPitch >> 1;
|
rlm@1
|
1029 dst1 += dstPitch >> 1;
|
rlm@1
|
1030 internal_scale2x_32_def(dst0, src0, src1, src1, width);
|
rlm@1
|
1031 internal_scale2x_32_def(dst1, src1, src1, src0, width);
|
rlm@1
|
1032 #ifdef MMX
|
rlm@1
|
1033 }
|
rlm@1
|
1034
|
rlm@1
|
1035 #endif
|
rlm@1
|
1036 }
|