view src/filters/hq_shared32.cpp @ 135:eb6ba88088d3

Wrote a more efficient input-number-assembly program; 91 oc -> 60 oc.
author Dylan Holmes <ocsenave@gmail.com>
date Sun, 18 Mar 2012 05:13:19 -0500
parents f9f4f1b99eed
children
line wrap: on
line source
1 #include "../Port.h"
2 #include "hq_shared32.h"
3 #include "interp.h"
5 const unsigned __int64 reg_blank = 0x0000000000000000;
6 const unsigned __int64 const7 = 0x0000000700070007;
7 const unsigned __int64 treshold = 0x0000000000300706;
9 void Interp1(unsigned char *pc, unsigned int c1, unsigned int c2)
10 {
11 //*((int*)pc) = (c1*3+c2)/4;
13 #ifdef MMX
14 __asm
15 {
16 mov eax, pc
17 movd mm1, c1
18 movd mm2, c2
19 movq mm0, mm1
20 pslld mm0, 2
21 psubd mm0, mm1
22 paddd mm0, mm2
23 psrld mm0, 2
24 movd [eax], mm0
25 EMMS
26 }
27 #else
28 __asm
29 {
30 mov eax, pc
31 mov edx, c1
32 shl edx, 2
33 add edx, c2
34 sub edx, c1
35 shr edx, 2
36 mov [eax], edx
37 }
38 #endif
39 }
41 void Interp2(unsigned char *pc, unsigned int c1, unsigned int c2, unsigned int c3)
42 {
43 //*((int*)pc) = (c1*2+c2+c3)/4;
45 #ifdef MMX
46 __asm
47 {
48 mov eax, pc
49 movd mm0, c1
50 movd mm1, c2
51 movd mm2, c3
52 pslld mm0, 1
53 paddd mm0, mm1
54 paddd mm0, mm2
55 psrad mm0, 2
56 movd [eax], mm0
57 EMMS
58 }
59 #else
60 __asm
61 {
62 mov eax, pc
63 mov edx, c1
64 shl edx, 1
65 add edx, c2
66 add edx, c3
67 shr edx, 2
68 mov [eax], edx
69 }
70 #endif
71 }
73 void Interp3(unsigned char *pc, unsigned int c1, unsigned int c2)
74 {
75 //*((int*)pc) = (c1*7+c2)/8;
76 //*((int*)pc) = ((((c1 & 0x00FF00)*7 + (c2 & 0x00FF00) ) & 0x0007F800) +
77 // (((c1 & 0xFF00FF)*7 + (c2 & 0xFF00FF) ) & 0x07F807F8)) >> 3;
79 #ifdef MMX
80 __asm
81 {
82 mov eax, pc
83 movd mm1, c1
84 movd mm2, c2
85 punpcklbw mm1, reg_blank
86 punpcklbw mm2, reg_blank
87 pmullw mm1, const7
88 paddw mm1, mm2
89 psrlw mm1, 3
90 packuswb mm1, reg_blank
91 movd [eax], mm1
92 EMMS
93 }
94 #else
95 __asm
96 {
97 mov eax, c1
98 mov ebx, c2
99 mov ecx, eax
100 shl ecx, 3
101 sub ecx, eax
102 add ecx, ebx
103 shr ecx, 3
104 mov eax, pc
105 mov [eax], ecx
106 }
107 #endif
108 }
110 void Interp4(unsigned char *pc, unsigned int c1, unsigned int c2, unsigned int c3)
111 {
112 //*((int*)pc) = (c1*2+(c2+c3)*7)/16;
113 //*((int*)pc) = ((((c1 & 0x00FF00)*2 + ((c2 & 0x00FF00) + (c3 & 0x00FF00))*7 ) & 0x000FF000) +
114 // (((c1 & 0xFF00FF)*2 + ((c2 & 0xFF00FF) + (c3 & 0xFF00FF))*7 ) & 0x0FF00FF0)) >> 4;
116 #ifdef MMX
117 __asm
118 {
119 mov eax, pc
120 movd mm1, c1
121 movd mm2, c2
122 movd mm3, c3
123 punpcklbw mm1, reg_blank
124 punpcklbw mm2, reg_blank
125 punpcklbw mm3, reg_blank
126 psllw mm1, 1
127 paddw mm2, mm3
128 pmullw mm2, const7
129 paddw mm1, mm2
130 psrlw mm1, 4
131 packuswb mm1, reg_blank
132 movd [eax], mm1
133 EMMS
134 }
135 #else
137 __asm
138 {
139 mov eax, [c1]
140 and eax, 0FF00h
141 shl eax, 1
142 mov ecx, [c2]
143 and ecx, 0FF00h
144 mov edx, [c3]
145 and edx, 0FF00h
146 add ecx, edx
147 imul ecx, ecx, 7
148 add eax, ecx
149 and eax, 0FF000h
151 mov ebx, [c1]
152 and ebx, 0FF00FFh
153 shl ebx, 1
154 mov ecx, [c2]
155 and ecx, 0FF00FFh
156 mov edx, [c3]
157 and edx, 0FF00FFh
158 add ecx, edx
159 imul ecx, ecx, 7
160 add ebx, ecx
161 and ebx, 0FF00FF0h
163 add eax, ebx
164 shr eax, 4
166 mov ebx, pc
167 mov [ebx], eax
168 }
169 #endif
170 }
172 void Interp5(unsigned char *pc, unsigned int c1, unsigned int c2)
173 {
174 //*((int*)pc) = (c1+c2)/2;
176 #ifdef MMX
177 __asm
178 {
179 mov eax, pc
180 movd mm0, c1
181 movd mm1, c2
182 paddd mm0, mm1
183 psrad mm0, 1
184 movd [eax], mm0
185 EMMS
186 }
187 #else
188 __asm
189 {
190 mov eax, pc
191 mov edx, c1
192 add edx, c2
193 shr edx, 1
194 mov [eax], edx
195 }
196 #endif
197 }
199 void Interp1_16(unsigned char *pc, unsigned short c1, unsigned short c2)
200 {
201 *((unsigned short *)pc) = interp_16_31(c1, c2);
202 //*((int*)pc) = (c1*3+c2)/4;
203 }
205 void Interp2_16(unsigned char *pc, unsigned short c1, unsigned short c2, unsigned short c3)
206 {
207 *((unsigned short *)pc) = interp_16_211(c1, c2, c3);
208 //*((int*)pc) = (c1*2+c2+c3)/4;
209 }
211 void Interp3_16(unsigned char *pc, unsigned short c1, unsigned short c2)
212 {
213 *((unsigned short *)pc) = interp_16_71(c1, c2);
214 // *((unsigned short*)pc) = (c1*7+c2)/8;
215 // *((unsigned short*)pc) = ((((c1 & 0x00FF00)*7 + (c2 & 0x00FF00) ) & 0x0007F800) +
216 // (((c1 & 0xFF00FF)*7 + (c2 & 0xFF00FF) ) & 0x07F807F8)) >> 3;
217 }
219 void Interp4_16(unsigned char *pc, unsigned short c1, unsigned short c2, unsigned short c3)
220 {
221 *((unsigned short *)pc) = interp_16_772(c2, c3, c1);
222 // *((unsigned short*)pc) = (c1*2+(c2+c3)*7)/16;
223 // *((unsigned short*)pc) = ((((c1 & 0x00FF00)*2 + ((c2 & 0x00FF00) + (c3 & 0x00FF00))*7 ) & 0x000FF000) +
224 // (((c1 & 0xFF00FF)*2 + ((c2 & 0xFF00FF) + (c3 & 0xFF00FF))*7 ) & 0x0FF00FF0)) >> 4;
225 }
227 void Interp5_16(unsigned char *pc, unsigned short c1, unsigned short c2)
228 {
229 *((unsigned short *)pc) = interp_16_11(c1, c2);
230 }
232 bool Diff(unsigned int c1, unsigned int c2)
233 {
234 unsigned int
235 YUV1 = RGBtoYUV(c1),
236 YUV2 = RGBtoYUV(c2);
238 if (YUV1 == YUV2) return false; // Save some processing power
240 #ifdef MMX
241 unsigned int retval;
242 __asm
243 {
244 mov eax, 0x7FFFFFFF
245 movd mm7, eax; mm7 = ABS_MASK = 0x7FFFFFFF
247 ; Copy source colors in first reg
248 movd mm0, YUV1
249 movd mm1, YUV2
251 mov eax, 0x00FF0000
252 movd mm6, eax; mm6 = Ymask = 0x00FF0000
254 ; Calculate color Y difference
255 movq mm2, mm0
256 movq mm3, mm1
257 pand mm2, mm6
258 pand mm3, mm6
259 psubd mm2, mm3
260 pand mm2, mm7
262 mov eax, 0x0000FF00
263 movd mm6, eax; mm6 = Umask = 0x0000FF00
265 ; Calculate color U difference
266 movq mm3, mm0
267 movq mm4, mm1
268 pand mm3, mm6
269 pand mm4, mm6
270 psubd mm3, mm4
271 pand mm3, mm7
273 mov eax, 0x000000FF
274 movd mm6, eax; mm6 = Vmask = 0x000000FF
276 ; Calculate color V difference
277 movq mm4, mm0
278 movq mm5, mm1
279 pand mm4, mm6
280 pand mm5, mm6
281 psubd mm4, mm5
282 pand mm4, mm7
284 mov eax, 0x00300000
285 movd mm5, eax; mm5 = trY = 0x00300000
286 mov eax, 0x00000700
287 movd mm6, eax; mm6 = trU = 0x00000700
288 mov eax, 0x00000006
289 movd mm7, eax; mm7 = trV = 0x00000006
291 ; Compare the results
292 pcmpgtd mm2, trY
293 pcmpgtd mm3, trU
294 pcmpgtd mm4, trV
295 por mm2, mm3
296 por mm2, mm4
298 movd retval, mm2
300 EMMS
301 }
302 return (retval != 0);
303 #else
304 return
305 (abs32((YUV1 & Ymask) - (YUV2 & Ymask)) > trY) ||
306 (abs32((YUV1 & Umask) - (YUV2 & Umask)) > trU) ||
307 (abs32((YUV1 & Vmask) - (YUV2 & Vmask)) > trV);
308 #endif
309 }
311 unsigned int RGBtoYUV(unsigned int c)
312 { // Division through 3 slows down the emulation about 10% !!!
313 #ifdef MMX
314 unsigned int retval;
315 __asm
316 {
317 movd mm0, c
318 movq mm1, mm0
319 movq mm2, mm0; mm0 = mm1 = mm2 = c
321 mov eax, 0x000000FF
322 movd mm5, eax; mm5 = REDMASK = 0x000000FF
323 mov eax, 0x0000FF00
324 movd mm6, eax; mm6 = GREENMASK = 0x0000FF00
325 mov eax, 0x00FF0000
326 movd mm7, eax; mm7 = BLUEMASK = 0x00FF0000
328 pand mm0, mm5
329 pand mm1, mm6
330 pand mm2, mm7; mm0 = R mm1 = G mm2 = B
332 movq mm3, mm0
333 paddd mm3, mm1
334 paddd mm3, mm2
335 ; psrld mm3, 2; mm3 = Y
336 ; pslld mm3, 16
337 pslld mm3, 14; mm3 = Y << 16
339 mov eax, 512
340 movd mm7, eax; mm7 = 128 << 2 = 512
342 movq mm4, mm0
343 psubd mm4, mm2
344 ; psrld mm4, 2
345 ; paddd mm4, mm7; mm4 = U
346 ; pslld mm4, 8; mm4 = U << 8
347 paddd mm4, mm7
348 pslld mm4, 6
350 mov eax, 128
351 movd mm7, eax; mm7 = 128
353 movq mm5, mm1
354 pslld mm5, 1
355 psubd mm5, mm0
356 psubd mm5, mm2
357 psrld mm5, 3
358 paddd mm5, mm7; mm5 = V
360 paddd mm5, mm4
361 paddd mm5, mm3
363 movd retval, mm5
365 EMMS
366 }
367 return retval;
368 #else
369 unsigned char r, g, b, Y, u, v;
370 r = (c & 0x000000FF);
371 g = (c & 0x0000FF00) >> 8;
372 b = (c & 0x00FF0000) >> 16;
373 Y = (r + g + b) >> 2;
374 u = 128 + ((r - b) >> 2);
375 v = 128 + ((-r + 2 * g - b) >> 3);
376 return (Y << 16) + (u << 8) + v;
378 // Extremely High Quality Code
379 //unsigned char r, g, b;
380 //r = c & 0xFF;
381 //g = (c >> 8) & 0xFF;
382 //b = (c >> 16) & 0xFF;
383 //unsigned char y, u, v;
384 //y = (0.256788 * r + 0.504129 * g + 0.097906 * b) + 16;
385 //u = (-0.148223 * r - 0.290993 * g + 0.439216 * b) + 128;
386 //v = (0.439216 * r - 0.367788 * g - 0.071427 * b) + 128;
387 //return (y << 16) + (u << 8) + v;
388 #endif