1 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
3 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
5 * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
6 * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
7 * for Intel's performance analysis of the MMX vs. non-MMX code.
9 * libpng version 1.2.0 - September 1, 2001
10 * For conditions of distribution and use, see copyright notice in png.h
11 * Copyright (c) 1998-2001 Glenn Randers-Pehrson
12 * Copyright (c) 1998, Intel Corporation
14 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
15 * Interface to libpng contributed by Gilles Vollant, 1999.
16 * GNU C port by Greg Roelofs, 1999-2001.
18 * Lines 2350-4300 converted in place with intel2gas 1.3.1:
20 * intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
22 * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
24 * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
25 * is required to assemble the newer MMX instructions such as movq.
28 * ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
30 * (or a later version in the same directory). For Linux, check your
31 * distribution's web site(s) or try these links:
33 * http://rufus.w3.org/linux/RPM/binutils.html
34 * http://www.debian.org/Packages/stable/devel/binutils.html
35 * ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
38 * For other platforms, see the main GNU site:
40 * ftp://ftp.gnu.org/pub/gnu/binutils/
42 * Version 2.5.2l.15 is definitely too old...
46 * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47 * =====================================
50 * - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
53 * - additional optimizations (possible or definite):
54 * x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55 * - write MMX code for 48-bit case (pixel_bytes == 6)
56 * - figure out what's up with 24-bit case (pixel_bytes == 3):
57 * why subtract 8 from width_mmx in the pass 4/5 case?
58 * (only width_mmx case) (near line 1606)
59 * x [DONE] replace pixel_bytes within each block with the true
60 * constant value (or are compilers smart enough to do that?)
61 * - rewrite all MMX interlacing code so it's aligned with
62 * the *beginning* of the row buffer, not the end. This
63 * would not only allow one to eliminate half of the memory
64 * writes for odd passes (that is, pass == odd), it may also
65 * eliminate some unaligned-data-access exceptions (assuming
66 * there's a penalty for not aligning 64-bit accesses on
67 * 64-bit boundaries). The only catch is that the "leftover"
68 * pixel(s) at the end of the row would have to be saved,
69 * but there are enough unused MMX registers in every case,
70 * so this is not a problem. A further benefit is that the
71 * post-MMX cleanup code (C code) in at least some of the
72 * cases could be done within the assembler block.
73 * x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74 * inconsistent, and don't match the MMX Programmer's Reference
75 * Manual conventions anyway. They should be changed to
76 * "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77 * was lowest in memory (e.g., corresponding to a left pixel)
78 * and b7 is the byte that was highest (e.g., a right pixel).
81 * - Brennan's Guide notwithstanding, gcc under Linux does *not*
82 * want globals prefixed by underscores when referencing them--
83 * i.e., if the variable is const4, then refer to it as const4,
84 * not _const4. This seems to be a djgpp-specific requirement.
85 * Also, such variables apparently *must* be declared outside
86 * of functions; neither static nor automatic variables work if
87 * defined within the scope of a single function, but both
88 * static and truly global (multi-module) variables work fine.
91 * - fixed png_combine_row() non-MMX replication bug (odd passes only?)
92 * - switched from string-concatenation-with-macros to cleaner method of
93 * renaming global variables for djgpp--i.e., always use prefixes in
94 * inlined assembler code (== strings) and conditionally rename the
95 * variables, not the other way around. Hence _const4, _mask8_0, etc.
98 * - fixed mmxsupport()/png_do_read_interlace() first-row bug
99 * This one was severely weird: even though mmxsupport() doesn't touch
100 * ebx (where "row" pointer was stored), it nevertheless managed to zero
101 * the register (even in static/non-fPIC code--see below), which in turn
102 * caused png_do_read_interlace() to return prematurely on the first row of
103 * interlaced images (i.e., without expanding the interlaced pixels).
104 * Inspection of the generated assembly code didn't turn up any clues,
105 * although it did point at a minor optimization (i.e., get rid of
106 * mmx_supported_local variable and just use eax). Possibly the CPUID
107 * instruction is more destructive than it looks? (Not yet checked.)
108 * - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
109 * listings... Apparently register spillage has to do with ebx, since
110 * it's used to index the global offset table. Commenting it out of the
111 * input-reg lists in png_combine_row() eliminated compiler barfage, so
112 * ifdef'd with __PIC__ macro: if defined, use a global for unmask
115 * - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
116 * "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
119 * - made "diff" variable (now "_dif") global to simplify conversion of
120 * filtering routines (running out of regs, sigh). "diff" is still used
121 * in interlacing routines, however.
122 * - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
123 * macro determines which is used); original not yet tested.
126 * - when compiling with gcc, be sure to use -fomit-frame-pointer
129 * - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
130 * pass == 4 or 5, that caused visible corruption of interlaced images
133 * - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
134 * many of the form "forbidden register 0 (ax) was spilled for class AREG."
135 * This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
136 * Chuck Wilson supplied a patch involving dummy output registers. See
137 * http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
138 * for the original (anonymous) SourceForge bug report.
141 * - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
142 * pnggccrd.c: In function `png_combine_row':
143 * pnggccrd.c:525: more than 10 operands in `asm'
144 * pnggccrd.c:669: more than 10 operands in `asm'
145 * pnggccrd.c:828: more than 10 operands in `asm'
146 * pnggccrd.c:994: more than 10 operands in `asm'
147 * pnggccrd.c:1177: more than 10 operands in `asm'
148 * They are all the same problem and can be worked around by using the
149 * global _unmask variable unconditionally, not just in the -fPIC case.
150 * Reportedly earlier versions of gcc also have the problem with more than
151 * 10 operands; they just don't report it. Much strangeness ensues, etc.
154 * - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
155 * MMX routine); began converting png_read_filter_row_mmx_sub()
156 * - to finish remaining sections:
157 * - clean up indentation and comments
158 * - preload local variables
159 * - add output and input regs (order of former determines numerical
161 * - avoid all usage of ebx (including bx, bh, bl) register [20000823]
162 * - remove "$" from addressing of Shift and Mask variables [20000823]
165 * - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
168 * - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
169 * shared-library (-fPIC) version! Code works just fine as part of static
170 * library. Damn damn damn damn damn, should have tested that sooner.
171 * ebx is getting clobbered again (explicitly this time); need to save it
172 * on stack or rewrite asm code to avoid using it altogether. Blargh!
175 * - first section was trickiest; all remaining sections have ebx -> edx now.
176 * (-fPIC works again.) Also added missing underscores to various Shift*
177 * and *Mask* globals and got rid of leading "$" signs.
180 * - added visual separators to help navigate microscopic printed copies
181 * (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
182 * on png_read_filter_row_mmx_avg()
185 * - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
186 * What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
187 * cleaned up/shortened in either routine, but functionality is complete
188 * and seems to be working fine.
191 * - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
192 * as an input reg (with dummy output variables, etc.), then it *cannot*
193 * also appear in the clobber list or gcc 2.95.2 will barf. The solution
194 * is simple enough...
197 * - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
198 * correctly (but 48-bit RGB just fine)
201 * - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
202 * - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
203 * - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
204 * - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
207 * - added new png_init_mmx_flags() function (here only because it needs to
208 * call mmxsupport(), which should probably become global png_mmxsupport());
209 * modified other MMX routines to run conditionally (png_ptr->asm_flags)
212 * - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
213 * and made it public; moved png_init_mmx_flags() to png.c as internal func
216 * - removed dependency on png_read_filter_row_c() (C code already duplicated
217 * within MMX version of png_read_filter_row()) so no longer necessary to
218 * compile it into pngrutil.o
221 * - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
224 * - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
225 * - write MMX code for 48-bit case (pixel_bytes == 6)
226 * - figure out what's up with 24-bit case (pixel_bytes == 3):
227 * why subtract 8 from width_mmx in the pass 4/5 case?
228 * (only width_mmx case) (near line 1606)
229 * - rewrite all MMX interlacing code so it's aligned with beginning
230 * of the row buffer, not the end (see 19991007 for details)
231 * x pick one version of mmxsupport() and get rid of the other
232 * - add error messages to any remaining bogus default cases
233 * - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
234 * x add support for runtime enable/disable/query of various MMX routines
240 #if defined(PNG_USE_PNGGCCRD)
242 int PNGAPI png_mmx_support(void);
244 #ifdef PNG_USE_LOCAL_ARRAYS
/* Interlace pass tables, each indexed by png_ptr->pass (7 entries).
 * As used by the C fallback of png_combine_row() in this file:
 *   png_pass_start - index of the first pixel handled in the pass
 *   png_pass_inc   - step, in pixels, between successive handled positions
 *   png_pass_width - number of pixels copied at each handled position
 * The fallback multiplies all three by the bytes-per-pixel count. */
245 static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
246 static const int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
247 static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
250 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
251 /* djgpp, Win32, and Cygwin add their own underscores to global variables,
252 * so define them without: */
253 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
254 # define _mmx_supported mmx_supported
255 # define _const4 const4
256 # define _const6 const6
257 # define _mask8_0 mask8_0
258 # define _mask16_1 mask16_1
259 # define _mask16_0 mask16_0
260 # define _mask24_2 mask24_2
261 # define _mask24_1 mask24_1
262 # define _mask24_0 mask24_0
263 # define _mask32_3 mask32_3
264 # define _mask32_2 mask32_2
265 # define _mask32_1 mask32_1
266 # define _mask32_0 mask32_0
267 # define _mask48_5 mask48_5
268 # define _mask48_4 mask48_4
269 # define _mask48_3 mask48_3
270 # define _mask48_2 mask48_2
271 # define _mask48_1 mask48_1
272 # define _mask48_0 mask48_0
273 # define _LBCarryMask LBCarryMask
274 # define _HBClearMask HBClearMask
275 # define _ActiveMask ActiveMask
276 # define _ActiveMask2 ActiveMask2
277 # define _ActiveMaskEnd ActiveMaskEnd
278 # define _ShiftBpp ShiftBpp
279 # define _ShiftRem ShiftRem
280 #ifdef PNG_THREAD_UNSAFE_OK
281 # define _unmask unmask
282 # define _FullLength FullLength
283 # define _MMXLength MMXLength
285 # define _patemp patemp
286 # define _pbtemp pbtemp
287 # define _pctemp pctemp
292 /* These constants are used in the inlined MMX assembly code.
293 Ignore gcc's "At top level: defined but not used" warnings. */
295 /* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
296 * since that case uses the %ebx register for indexing the Global Offset Table
297 * and there were no other registers available. But gcc 2.95 and later emit
298 * "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
299 * in the non-PIC case, so we'll just use the global unconditionally now.
301 #ifdef PNG_THREAD_UNSAFE_OK
/* Per-byte bit selectors for the MMX paths of png_combine_row().
 * The asm replicates the low byte of _unmask (== ~mask) into all 8 bytes of
 * an MMX register, ANDs it with one of these constants and compares the
 * result against zero.  That yields 0xFF in every byte whose pixel has its
 * mask bit set (take the byte from the new row) and 0x00 elsewhere (keep the
 * destination byte).
 * Each byte of a constant holds the mask bit of the pixel it belongs to: the
 * byte lowest in memory belongs to pixel 0 and uses 0x80, the next pixel uses
 * 0x40, and so on down to 0x01 for pixel 7.  _maskNN_k is applied to the k-th
 * 8-byte group of an 8-pixel run at NN bits per pixel. */
/* 8 bits/pixel: one byte per pixel, one 8-byte group. */
305 static unsigned long long _mask8_0 = 0x0102040810204080LL;
/* 16 bits/pixel: two bytes per pixel, two 8-byte groups. */
307 static unsigned long long _mask16_1 = 0x0101020204040808LL;
308 static unsigned long long _mask16_0 = 0x1010202040408080LL;
/* 24 bits/pixel: three bytes per pixel, so pixels straddle group boundaries. */
310 static unsigned long long _mask24_2 = 0x0101010202020404LL;
311 static unsigned long long _mask24_1 = 0x0408080810101020LL;
312 static unsigned long long _mask24_0 = 0x2020404040808080LL;
/* 32 bits/pixel: four bytes per pixel, two pixels per group. */
314 static unsigned long long _mask32_3 = 0x0101010102020202LL;
315 static unsigned long long _mask32_2 = 0x0404040408080808LL;
316 static unsigned long long _mask32_1 = 0x1010101020202020LL;
317 static unsigned long long _mask32_0 = 0x4040404080808080LL;
/* 48 bits/pixel: six bytes per pixel, six 8-byte groups. */
319 static unsigned long long _mask48_5 = 0x0101010101010202LL;
320 static unsigned long long _mask48_4 = 0x0202020204040404LL;
321 static unsigned long long _mask48_3 = 0x0404080808080808LL;
322 static unsigned long long _mask48_2 = 0x1010101010102020LL;
323 static unsigned long long _mask48_1 = 0x2020202040404040LL;
324 static unsigned long long _mask48_0 = 0x4040808080808080LL;
/* Byte-lane masks selecting the low three bytes (_const4) and the low byte
 * (_const6) of a quadword.
 * NOTE(review): their users are not visible in this part of the file;
 * presumably the interlace-expansion asm -- confirm. */
326 static unsigned long long _const4 = 0x0000000000FFFFFFLL;
327 //static unsigned long long _const5 = 0x000000FFFFFF0000LL; // NOT USED
328 static unsigned long long _const6 = 0x00000000000000FFLL;
330 // These are used in the row-filter routines and should/would be local
331 // variables if not for gcc addressing limitations.
332 // WARNING: Their presence probably defeats the thread safety of libpng.
334 #ifdef PNG_THREAD_UNSAFE_OK
335 static png_uint_32 _FullLength;
336 static png_uint_32 _MMXLength;
338 static int _patemp; // temp variables for Paeth routine
/* Assigns each file-scope MMX constant/temporary to itself so that gcc counts
 * it as used.  These variables are otherwise referenced only by name inside
 * inline-asm strings, which gcc does not see, and would trigger "defined but
 * not used" warnings.  The assignments do not change any value.
 * NOTE(review): presumably this function is never called -- confirm against
 * the rest of the file. */
344 png_squelch_warnings(void)
346 #ifdef PNG_THREAD_UNSAFE_OK
351 _MMXLength = _MMXLength;
356 _mask16_1 = _mask16_1;
357 _mask16_0 = _mask16_0;
358 _mask24_2 = _mask24_2;
359 _mask24_1 = _mask24_1;
360 _mask24_0 = _mask24_0;
361 _mask32_3 = _mask32_3;
362 _mask32_2 = _mask32_2;
363 _mask32_1 = _mask32_1;
364 _mask32_0 = _mask32_0;
365 _mask48_5 = _mask48_5;
366 _mask48_4 = _mask48_4;
367 _mask48_3 = _mask48_3;
368 _mask48_2 = _mask48_2;
369 _mask48_1 = _mask48_1;
370 _mask48_0 = _mask48_0;
372 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
/* Cached MMX-capability state.  The initial value 2 serves as a "not yet
 * determined" sentinel: png_combine_row() emits a warning if it still sees 2,
 * since png_init_mmx_flags() is expected to have run by then. */
375 static int _mmx_supported = 2;
377 /*===========================================================================*/
379 /* P N G _ C O M B I N E _ R O W */
381 /*===========================================================================*/
383 #if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
386 #define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
388 #define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
391 /* Combines the row recently read in with the previous row.
392 This routine takes care of alpha and transparency if requested.
393 This routine also handles the two methods of progressive display
394 of interlaced images, depending on the mask value.
395 The mask value describes which pixels are to be combined with
396 the row. The pattern always repeats every 8 pixels, so just 8
397 bits are needed. A one indicates the pixel is to be combined; a
398 zero indicates the pixel is to be skipped. This is in addition
399 to any alpha or transparency value associated with the pixel.
400 If you want all pixels to be combined, pass 0xff (255) in mask. */
402 /* Use this routine for the x86 platform - it uses a faster MMX routine
403 if the machine supports MMX. */
406 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
408 png_debug(1, "in png_combine_row (pnggccrd.c)\n");
410 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
411 if (_mmx_supported == 2) {
412 /* this should have happened in png_init_mmx_flags() already */
413 png_warning(png_ptr, "asm_flags may not have been initialized");
420 png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
421 png_memcpy(row, png_ptr->row_buf + 1,
422 (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
424 else /* (png_combine_row() is never called with mask == 0) */
426 switch (png_ptr->row_info.pixel_depth)
428 case 1: /* png_ptr->row_info.pixel_depth */
432 int s_inc, s_start, s_end;
437 sp = png_ptr->row_buf + 1;
440 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
441 if (png_ptr->transformations & PNG_PACKSWAP)
457 for (i = 0; i < png_ptr->width; i++)
463 value = (*sp >> shift) & 0x1;
464 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
465 *dp |= (png_byte)(value << shift);
485 case 2: /* png_ptr->row_info.pixel_depth */
489 int s_start, s_end, s_inc;
495 sp = png_ptr->row_buf + 1;
498 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
499 if (png_ptr->transformations & PNG_PACKSWAP)
515 for (i = 0; i < png_ptr->width; i++)
519 value = (*sp >> shift) & 0x3;
520 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
521 *dp |= (png_byte)(value << shift);
540 case 4: /* png_ptr->row_info.pixel_depth */
544 int s_start, s_end, s_inc;
550 sp = png_ptr->row_buf + 1;
553 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
554 if (png_ptr->transformations & PNG_PACKSWAP)
569 for (i = 0; i < png_ptr->width; i++)
573 value = (*sp >> shift) & 0xf;
574 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
575 *dp |= (png_byte)(value << shift);
594 case 8: /* png_ptr->row_info.pixel_depth */
599 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
600 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
601 /* && _mmx_supported */ )
605 int dummy_value_a; // fix 'forbidden register spilled' error
610 _unmask = ~mask; // global variable for -fPIC version
611 srcptr = png_ptr->row_buf + 1;
613 len = png_ptr->width &~7; // reduce to multiple of 8
614 diff = (int) (png_ptr->width & 7); // amount lost
616 __asm__ __volatile__ (
617 "movd _unmask, %%mm7 \n\t" // load bit pattern
618 "psubb %%mm6, %%mm6 \n\t" // zero mm6
619 "punpcklbw %%mm7, %%mm7 \n\t"
620 "punpcklwd %%mm7, %%mm7 \n\t"
621 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
623 "movq _mask8_0, %%mm0 \n\t"
624 "pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
625 "pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
627 // preload "movl len, %%ecx \n\t" // load length of line
628 // preload "movl srcptr, %%esi \n\t" // load source
629 // preload "movl dstptr, %%edi \n\t" // load dest
631 "cmpl $0, %%ecx \n\t" // len == 0 ?
632 "je mainloop8end \n\t"
635 "movq (%%esi), %%mm4 \n\t" // *srcptr
636 "pand %%mm0, %%mm4 \n\t"
637 "movq %%mm0, %%mm6 \n\t"
638 "pandn (%%edi), %%mm6 \n\t" // *dstptr
639 "por %%mm6, %%mm4 \n\t"
640 "movq %%mm4, (%%edi) \n\t"
641 "addl $8, %%esi \n\t" // inc by 8 bytes processed
642 "addl $8, %%edi \n\t"
643 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
647 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
648 "movl %%eax, %%ecx \n\t"
649 "cmpl $0, %%ecx \n\t"
651 // preload "movl mask, %%edx \n\t"
652 "sall $24, %%edx \n\t" // make low byte, high byte
655 "sall %%edx \n\t" // move high bit to CF
656 "jnc skip8 \n\t" // if CF = 0
657 "movb (%%esi), %%al \n\t"
658 "movb %%al, (%%edi) \n\t"
664 "jnz secondloop8 \n\t"
669 : "=a" (dummy_value_a), // output regs (dummy)
670 "=d" (dummy_value_d),
671 "=c" (dummy_value_c),
672 "=S" (dummy_value_S),
675 : "3" (srcptr), // esi // input regs
678 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
682 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
683 : "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
687 else /* mmx _not supported - Use modified C routine */
688 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
690 register png_uint_32 i;
691 png_uint_32 initial_val = png_pass_start[png_ptr->pass];
692 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
693 register int stride = png_pass_inc[png_ptr->pass];
694 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
695 register int rep_bytes = png_pass_width[png_ptr->pass];
696 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
697 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
698 int diff = (int) (png_ptr->width & 7); /* amount lost */
699 register png_uint_32 final_val = len; /* GRR bugfix */
701 srcptr = png_ptr->row_buf + 1 + initial_val;
702 dstptr = row + initial_val;
704 for (i = initial_val; i < final_val; i += stride)
706 png_memcpy(dstptr, srcptr, rep_bytes);
710 if (diff) /* number of leftover pixels: 3 for pngtest */
712 final_val+=diff /* *BPP1 */ ;
713 for (; i < final_val; i += stride)
715 if (rep_bytes > (int)(final_val-i))
716 rep_bytes = (int)(final_val-i);
717 png_memcpy(dstptr, srcptr, rep_bytes);
723 } /* end of else (_mmx_supported) */
728 case 16: /* png_ptr->row_info.pixel_depth */
733 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
734 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
735 /* && _mmx_supported */ )
739 int dummy_value_a; // fix 'forbidden register spilled' error
744 _unmask = ~mask; // global variable for -fPIC version
745 srcptr = png_ptr->row_buf + 1;
747 len = png_ptr->width &~7; // reduce to multiple of 8
748 diff = (int) (png_ptr->width & 7); // amount lost //
750 __asm__ __volatile__ (
751 "movd _unmask, %%mm7 \n\t" // load bit pattern
752 "psubb %%mm6, %%mm6 \n\t" // zero mm6
753 "punpcklbw %%mm7, %%mm7 \n\t"
754 "punpcklwd %%mm7, %%mm7 \n\t"
755 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
757 "movq _mask16_0, %%mm0 \n\t"
758 "movq _mask16_1, %%mm1 \n\t"
760 "pand %%mm7, %%mm0 \n\t"
761 "pand %%mm7, %%mm1 \n\t"
763 "pcmpeqb %%mm6, %%mm0 \n\t"
764 "pcmpeqb %%mm6, %%mm1 \n\t"
766 // preload "movl len, %%ecx \n\t" // load length of line
767 // preload "movl srcptr, %%esi \n\t" // load source
768 // preload "movl dstptr, %%edi \n\t" // load dest
770 "cmpl $0, %%ecx \n\t"
771 "jz mainloop16end \n\t"
774 "movq (%%esi), %%mm4 \n\t"
775 "pand %%mm0, %%mm4 \n\t"
776 "movq %%mm0, %%mm6 \n\t"
777 "movq (%%edi), %%mm7 \n\t"
778 "pandn %%mm7, %%mm6 \n\t"
779 "por %%mm6, %%mm4 \n\t"
780 "movq %%mm4, (%%edi) \n\t"
782 "movq 8(%%esi), %%mm5 \n\t"
783 "pand %%mm1, %%mm5 \n\t"
784 "movq %%mm1, %%mm7 \n\t"
785 "movq 8(%%edi), %%mm6 \n\t"
786 "pandn %%mm6, %%mm7 \n\t"
787 "por %%mm7, %%mm5 \n\t"
788 "movq %%mm5, 8(%%edi) \n\t"
790 "addl $16, %%esi \n\t" // inc by 16 bytes processed
791 "addl $16, %%edi \n\t"
792 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
795 "mainloop16end: \n\t"
796 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
797 "movl %%eax, %%ecx \n\t"
798 "cmpl $0, %%ecx \n\t"
800 // preload "movl mask, %%edx \n\t"
801 "sall $24, %%edx \n\t" // make low byte, high byte
804 "sall %%edx \n\t" // move high bit to CF
805 "jnc skip16 \n\t" // if CF = 0
806 "movw (%%esi), %%ax \n\t"
807 "movw %%ax, (%%edi) \n\t"
810 "addl $2, %%esi \n\t"
811 "addl $2, %%edi \n\t"
813 "jnz secondloop16 \n\t"
818 : "=a" (dummy_value_a), // output regs (dummy)
819 "=c" (dummy_value_c),
820 "=d" (dummy_value_d),
821 "=S" (dummy_value_S),
824 : "0" (diff), // eax // input regs
825 // was (unmask) " " RESERVED // ebx // Global Offset Table idx
831 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
832 : "%mm0", "%mm1", "%mm4" // clobber list
833 , "%mm5", "%mm6", "%mm7"
837 else /* mmx _not supported - Use modified C routine */
838 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
840 register png_uint_32 i;
841 png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
842 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
843 register int stride = BPP2 * png_pass_inc[png_ptr->pass];
844 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
845 register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
846 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
847 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
848 int diff = (int) (png_ptr->width & 7); /* amount lost */
849 register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
851 srcptr = png_ptr->row_buf + 1 + initial_val;
852 dstptr = row + initial_val;
854 for (i = initial_val; i < final_val; i += stride)
856 png_memcpy(dstptr, srcptr, rep_bytes);
860 if (diff) /* number of leftover pixels: 3 for pngtest */
862 final_val+=diff*BPP2;
863 for (; i < final_val; i += stride)
865 if (rep_bytes > (int)(final_val-i))
866 rep_bytes = (int)(final_val-i);
867 png_memcpy(dstptr, srcptr, rep_bytes);
872 } /* end of else (_mmx_supported) */
877 case 24: /* png_ptr->row_info.pixel_depth */
882 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
883 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
884 /* && _mmx_supported */ )
888 int dummy_value_a; // fix 'forbidden register spilled' error
893 _unmask = ~mask; // global variable for -fPIC version
894 srcptr = png_ptr->row_buf + 1;
896 len = png_ptr->width &~7; // reduce to multiple of 8
897 diff = (int) (png_ptr->width & 7); // amount lost //
899 __asm__ __volatile__ (
900 "movd _unmask, %%mm7 \n\t" // load bit pattern
901 "psubb %%mm6, %%mm6 \n\t" // zero mm6
902 "punpcklbw %%mm7, %%mm7 \n\t"
903 "punpcklwd %%mm7, %%mm7 \n\t"
904 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
906 "movq _mask24_0, %%mm0 \n\t"
907 "movq _mask24_1, %%mm1 \n\t"
908 "movq _mask24_2, %%mm2 \n\t"
910 "pand %%mm7, %%mm0 \n\t"
911 "pand %%mm7, %%mm1 \n\t"
912 "pand %%mm7, %%mm2 \n\t"
914 "pcmpeqb %%mm6, %%mm0 \n\t"
915 "pcmpeqb %%mm6, %%mm1 \n\t"
916 "pcmpeqb %%mm6, %%mm2 \n\t"
918 // preload "movl len, %%ecx \n\t" // load length of line
919 // preload "movl srcptr, %%esi \n\t" // load source
920 // preload "movl dstptr, %%edi \n\t" // load dest
922 "cmpl $0, %%ecx \n\t"
923 "jz mainloop24end \n\t"
926 "movq (%%esi), %%mm4 \n\t"
927 "pand %%mm0, %%mm4 \n\t"
928 "movq %%mm0, %%mm6 \n\t"
929 "movq (%%edi), %%mm7 \n\t"
930 "pandn %%mm7, %%mm6 \n\t"
931 "por %%mm6, %%mm4 \n\t"
932 "movq %%mm4, (%%edi) \n\t"
934 "movq 8(%%esi), %%mm5 \n\t"
935 "pand %%mm1, %%mm5 \n\t"
936 "movq %%mm1, %%mm7 \n\t"
937 "movq 8(%%edi), %%mm6 \n\t"
938 "pandn %%mm6, %%mm7 \n\t"
939 "por %%mm7, %%mm5 \n\t"
940 "movq %%mm5, 8(%%edi) \n\t"
942 "movq 16(%%esi), %%mm6 \n\t"
943 "pand %%mm2, %%mm6 \n\t"
944 "movq %%mm2, %%mm4 \n\t"
945 "movq 16(%%edi), %%mm7 \n\t"
946 "pandn %%mm7, %%mm4 \n\t"
947 "por %%mm4, %%mm6 \n\t"
948 "movq %%mm6, 16(%%edi) \n\t"
950 "addl $24, %%esi \n\t" // inc by 24 bytes processed
951 "addl $24, %%edi \n\t"
952 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
956 "mainloop24end: \n\t"
957 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
958 "movl %%eax, %%ecx \n\t"
959 "cmpl $0, %%ecx \n\t"
961 // preload "movl mask, %%edx \n\t"
962 "sall $24, %%edx \n\t" // make low byte, high byte
965 "sall %%edx \n\t" // move high bit to CF
966 "jnc skip24 \n\t" // if CF = 0
967 "movw (%%esi), %%ax \n\t"
968 "movw %%ax, (%%edi) \n\t"
969 "xorl %%eax, %%eax \n\t"
970 "movb 2(%%esi), %%al \n\t"
971 "movb %%al, 2(%%edi) \n\t"
974 "addl $3, %%esi \n\t"
975 "addl $3, %%edi \n\t"
977 "jnz secondloop24 \n\t"
982 : "=a" (dummy_value_a), // output regs (dummy)
983 "=d" (dummy_value_d),
984 "=c" (dummy_value_c),
985 "=S" (dummy_value_S),
988 : "3" (srcptr), // esi // input regs
991 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
995 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
996 : "%mm0", "%mm1", "%mm2" // clobber list
997 , "%mm4", "%mm5", "%mm6", "%mm7"
1001 else /* mmx _not supported - Use modified C routine */
1002 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1004 register png_uint_32 i;
1005 png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
1006 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1007 register int stride = BPP3 * png_pass_inc[png_ptr->pass];
1008 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1009 register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
1010 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1011 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1012 int diff = (int) (png_ptr->width & 7); /* amount lost */
1013 register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */
1015 srcptr = png_ptr->row_buf + 1 + initial_val;
1016 dstptr = row + initial_val;
1018 for (i = initial_val; i < final_val; i += stride)
1020 png_memcpy(dstptr, srcptr, rep_bytes);
1024 if (diff) /* number of leftover pixels: 3 for pngtest */
1026 final_val+=diff*BPP3;
1027 for (; i < final_val; i += stride)
1029 if (rep_bytes > (int)(final_val-i))
1030 rep_bytes = (int)(final_val-i);
1031 png_memcpy(dstptr, srcptr, rep_bytes);
1036 } /* end of else (_mmx_supported) */
1041 case 32: /* png_ptr->row_info.pixel_depth */
1046 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1047 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1048 /* && _mmx_supported */ )
1052 int dummy_value_a; // fix 'forbidden register spilled' error
1057 _unmask = ~mask; // global variable for -fPIC version
1058 srcptr = png_ptr->row_buf + 1;
1060 len = png_ptr->width &~7; // reduce to multiple of 8
1061 diff = (int) (png_ptr->width & 7); // amount lost //
1063 __asm__ __volatile__ (
1064 "movd _unmask, %%mm7 \n\t" // load bit pattern
1065 "psubb %%mm6, %%mm6 \n\t" // zero mm6
1066 "punpcklbw %%mm7, %%mm7 \n\t"
1067 "punpcklwd %%mm7, %%mm7 \n\t"
1068 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1070 "movq _mask32_0, %%mm0 \n\t"
1071 "movq _mask32_1, %%mm1 \n\t"
1072 "movq _mask32_2, %%mm2 \n\t"
1073 "movq _mask32_3, %%mm3 \n\t"
1075 "pand %%mm7, %%mm0 \n\t"
1076 "pand %%mm7, %%mm1 \n\t"
1077 "pand %%mm7, %%mm2 \n\t"
1078 "pand %%mm7, %%mm3 \n\t"
1080 "pcmpeqb %%mm6, %%mm0 \n\t"
1081 "pcmpeqb %%mm6, %%mm1 \n\t"
1082 "pcmpeqb %%mm6, %%mm2 \n\t"
1083 "pcmpeqb %%mm6, %%mm3 \n\t"
1085 // preload "movl len, %%ecx \n\t" // load length of line
1086 // preload "movl srcptr, %%esi \n\t" // load source
1087 // preload "movl dstptr, %%edi \n\t" // load dest
1089 "cmpl $0, %%ecx \n\t" // lcr
1090 "jz mainloop32end \n\t"
1093 "movq (%%esi), %%mm4 \n\t"
1094 "pand %%mm0, %%mm4 \n\t"
1095 "movq %%mm0, %%mm6 \n\t"
1096 "movq (%%edi), %%mm7 \n\t"
1097 "pandn %%mm7, %%mm6 \n\t"
1098 "por %%mm6, %%mm4 \n\t"
1099 "movq %%mm4, (%%edi) \n\t"
1101 "movq 8(%%esi), %%mm5 \n\t"
1102 "pand %%mm1, %%mm5 \n\t"
1103 "movq %%mm1, %%mm7 \n\t"
1104 "movq 8(%%edi), %%mm6 \n\t"
1105 "pandn %%mm6, %%mm7 \n\t"
1106 "por %%mm7, %%mm5 \n\t"
1107 "movq %%mm5, 8(%%edi) \n\t"
1109 "movq 16(%%esi), %%mm6 \n\t"
1110 "pand %%mm2, %%mm6 \n\t"
1111 "movq %%mm2, %%mm4 \n\t"
1112 "movq 16(%%edi), %%mm7 \n\t"
1113 "pandn %%mm7, %%mm4 \n\t"
1114 "por %%mm4, %%mm6 \n\t"
1115 "movq %%mm6, 16(%%edi) \n\t"
1117 "movq 24(%%esi), %%mm7 \n\t"
1118 "pand %%mm3, %%mm7 \n\t"
1119 "movq %%mm3, %%mm5 \n\t"
1120 "movq 24(%%edi), %%mm4 \n\t"
1121 "pandn %%mm4, %%mm5 \n\t"
1122 "por %%mm5, %%mm7 \n\t"
1123 "movq %%mm7, 24(%%edi) \n\t"
1125 "addl $32, %%esi \n\t" // inc by 32 bytes processed
1126 "addl $32, %%edi \n\t"
1127 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
1128 "ja mainloop32 \n\t"
1130 "mainloop32end: \n\t"
1131 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
1132 "movl %%eax, %%ecx \n\t"
1133 "cmpl $0, %%ecx \n\t"
1135 // preload "movl mask, %%edx \n\t"
1136 "sall $24, %%edx \n\t" // low byte => high byte
1138 "secondloop32: \n\t"
1139 "sall %%edx \n\t" // move high bit to CF
1140 "jnc skip32 \n\t" // if CF = 0
1141 "movl (%%esi), %%eax \n\t"
1142 "movl %%eax, (%%edi) \n\t"
1145 "addl $4, %%esi \n\t"
1146 "addl $4, %%edi \n\t"
1148 "jnz secondloop32 \n\t"
1153 : "=a" (dummy_value_a), // output regs (dummy)
1154 "=d" (dummy_value_d),
1155 "=c" (dummy_value_c),
1156 "=S" (dummy_value_S),
1157 "=D" (dummy_value_D)
1159 : "3" (srcptr), // esi // input regs
1160 "4" (dstptr), // edi
1162 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1166 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1167 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1168 , "%mm4", "%mm5", "%mm6", "%mm7"
1172 else /* mmx _not supported - Use modified C routine */
1173 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1175 register png_uint_32 i;
1176 png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
1177 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1178 register int stride = BPP4 * png_pass_inc[png_ptr->pass];
1179 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1180 register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
1181 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1182 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1183 int diff = (int) (png_ptr->width & 7); /* amount lost */
1184 register png_uint_32 final_val = BPP4 * len; /* GRR bugfix */
1186 srcptr = png_ptr->row_buf + 1 + initial_val;
1187 dstptr = row + initial_val;
1189 for (i = initial_val; i < final_val; i += stride)
1191 png_memcpy(dstptr, srcptr, rep_bytes);
1195 if (diff) /* number of leftover pixels: 3 for pngtest */
1197 final_val+=diff*BPP4;
1198 for (; i < final_val; i += stride)
1200 if (rep_bytes > (int)(final_val-i))
1201 rep_bytes = (int)(final_val-i);
1202 png_memcpy(dstptr, srcptr, rep_bytes);
1207 } /* end of else (_mmx_supported) */
1212 case 48: /* png_ptr->row_info.pixel_depth */
1217 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1218 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1219 /* && _mmx_supported */ )
1223 int dummy_value_a; // fix 'forbidden register spilled' error
1228 _unmask = ~mask; // global variable for -fPIC version
1229 srcptr = png_ptr->row_buf + 1;
1231 len = png_ptr->width &~7; // reduce to multiple of 8
1232 diff = (int) (png_ptr->width & 7); // amount lost //
1234 __asm__ __volatile__ (
1235 "movd _unmask, %%mm7 \n\t" // load bit pattern
1236 "psubb %%mm6, %%mm6 \n\t" // zero mm6
1237 "punpcklbw %%mm7, %%mm7 \n\t"
1238 "punpcklwd %%mm7, %%mm7 \n\t"
1239 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1241 "movq _mask48_0, %%mm0 \n\t"
1242 "movq _mask48_1, %%mm1 \n\t"
1243 "movq _mask48_2, %%mm2 \n\t"
1244 "movq _mask48_3, %%mm3 \n\t"
1245 "movq _mask48_4, %%mm4 \n\t"
1246 "movq _mask48_5, %%mm5 \n\t"
1248 "pand %%mm7, %%mm0 \n\t"
1249 "pand %%mm7, %%mm1 \n\t"
1250 "pand %%mm7, %%mm2 \n\t"
1251 "pand %%mm7, %%mm3 \n\t"
1252 "pand %%mm7, %%mm4 \n\t"
1253 "pand %%mm7, %%mm5 \n\t"
1255 "pcmpeqb %%mm6, %%mm0 \n\t"
1256 "pcmpeqb %%mm6, %%mm1 \n\t"
1257 "pcmpeqb %%mm6, %%mm2 \n\t"
1258 "pcmpeqb %%mm6, %%mm3 \n\t"
1259 "pcmpeqb %%mm6, %%mm4 \n\t"
1260 "pcmpeqb %%mm6, %%mm5 \n\t"
1262 // preload "movl len, %%ecx \n\t" // load length of line
1263 // preload "movl srcptr, %%esi \n\t" // load source
1264 // preload "movl dstptr, %%edi \n\t" // load dest
1266 "cmpl $0, %%ecx \n\t"
1267 "jz mainloop48end \n\t"
1270 "movq (%%esi), %%mm7 \n\t"
1271 "pand %%mm0, %%mm7 \n\t"
1272 "movq %%mm0, %%mm6 \n\t"
1273 "pandn (%%edi), %%mm6 \n\t"
1274 "por %%mm6, %%mm7 \n\t"
1275 "movq %%mm7, (%%edi) \n\t"
1277 "movq 8(%%esi), %%mm6 \n\t"
1278 "pand %%mm1, %%mm6 \n\t"
1279 "movq %%mm1, %%mm7 \n\t"
1280 "pandn 8(%%edi), %%mm7 \n\t"
1281 "por %%mm7, %%mm6 \n\t"
1282 "movq %%mm6, 8(%%edi) \n\t"
1284 "movq 16(%%esi), %%mm6 \n\t"
1285 "pand %%mm2, %%mm6 \n\t"
1286 "movq %%mm2, %%mm7 \n\t"
1287 "pandn 16(%%edi), %%mm7 \n\t"
1288 "por %%mm7, %%mm6 \n\t"
1289 "movq %%mm6, 16(%%edi) \n\t"
1291 "movq 24(%%esi), %%mm7 \n\t"
1292 "pand %%mm3, %%mm7 \n\t"
1293 "movq %%mm3, %%mm6 \n\t"
1294 "pandn 24(%%edi), %%mm6 \n\t"
1295 "por %%mm6, %%mm7 \n\t"
1296 "movq %%mm7, 24(%%edi) \n\t"
1298 "movq 32(%%esi), %%mm6 \n\t"
1299 "pand %%mm4, %%mm6 \n\t"
1300 "movq %%mm4, %%mm7 \n\t"
1301 "pandn 32(%%edi), %%mm7 \n\t"
1302 "por %%mm7, %%mm6 \n\t"
1303 "movq %%mm6, 32(%%edi) \n\t"
1305 "movq 40(%%esi), %%mm7 \n\t"
1306 "pand %%mm5, %%mm7 \n\t"
1307 "movq %%mm5, %%mm6 \n\t"
1308 "pandn 40(%%edi), %%mm6 \n\t"
1309 "por %%mm6, %%mm7 \n\t"
1310 "movq %%mm7, 40(%%edi) \n\t"
1312 "addl $48, %%esi \n\t" // inc by 48 bytes processed
1313 "addl $48, %%edi \n\t"
1314 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
1316 "ja mainloop48 \n\t"
1318 "mainloop48end: \n\t"
1319 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
1320 "movl %%eax, %%ecx \n\t"
1321 "cmpl $0, %%ecx \n\t"
1323 // preload "movl mask, %%edx \n\t"
1324 "sall $24, %%edx \n\t" // low byte => high byte
1326 "secondloop48: \n\t"
1327 "sall %%edx \n\t" // move high bit to CF
1328 "jnc skip48 \n\t" // if CF = 0
1329 "movl (%%esi), %%eax \n\t"
1330 "movl %%eax, (%%edi) \n\t"
1333 "addl $4, %%esi \n\t"
1334 "addl $4, %%edi \n\t"
1336 "jnz secondloop48 \n\t"
1341 : "=a" (dummy_value_a), // output regs (dummy)
1342 "=d" (dummy_value_d),
1343 "=c" (dummy_value_c),
1344 "=S" (dummy_value_S),
1345 "=D" (dummy_value_D)
1347 : "3" (srcptr), // esi // input regs
1348 "4" (dstptr), // edi
1350 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1354 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1355 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1356 , "%mm4", "%mm5", "%mm6", "%mm7"
1360 else /* mmx _not supported - Use modified C routine */
1361 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1363 register png_uint_32 i;
1364 png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
1365 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1366 register int stride = BPP6 * png_pass_inc[png_ptr->pass];
1367 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1368 register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
1369 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1370 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1371 int diff = (int) (png_ptr->width & 7); /* amount lost */
1372 register png_uint_32 final_val = BPP6 * len; /* GRR bugfix */
1374 srcptr = png_ptr->row_buf + 1 + initial_val;
1375 dstptr = row + initial_val;
1377 for (i = initial_val; i < final_val; i += stride)
1379 png_memcpy(dstptr, srcptr, rep_bytes);
1383 if (diff) /* number of leftover pixels: 3 for pngtest */
1385 final_val+=diff*BPP6;
1386 for (; i < final_val; i += stride)
1388 if (rep_bytes > (int)(final_val-i))
1389 rep_bytes = (int)(final_val-i);
1390 png_memcpy(dstptr, srcptr, rep_bytes);
1395 } /* end of else (_mmx_supported) */
1400 case 64: /* png_ptr->row_info.pixel_depth */
1404 register png_uint_32 i;
1405 png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
1406 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1407 register int stride = BPP8 * png_pass_inc[png_ptr->pass];
1408 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1409 register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
1410 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1411 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1412 int diff = (int) (png_ptr->width & 7); /* amount lost */
1413 register png_uint_32 final_val = BPP8 * len; /* GRR bugfix */
1415 srcptr = png_ptr->row_buf + 1 + initial_val;
1416 dstptr = row + initial_val;
1418 for (i = initial_val; i < final_val; i += stride)
1420 png_memcpy(dstptr, srcptr, rep_bytes);
1424 if (diff) /* number of leftover pixels: 3 for pngtest */
1426 final_val+=diff*BPP8;
1427 for (; i < final_val; i += stride)
1429 if (rep_bytes > (int)(final_val-i))
1430 rep_bytes = (int)(final_val-i);
1431 png_memcpy(dstptr, srcptr, rep_bytes);
1440 default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1442 /* this should never happen */
1443 png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
1446 } /* end switch (png_ptr->row_info.pixel_depth) */
1448 } /* end if (non-trivial mask) */
1450 } /* end png_combine_row() */
1452 #endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
1457 /*===========================================================================*/
1459 /* P N G _ D O _ R E A D _ I N T E R L A C E */
1461 /*===========================================================================*/
1463 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1464 #if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1466 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1467 * has taken place. [GRR: what other steps come before and/or after?]
1471 png_do_read_interlace(png_structp png_ptr)
1473 png_row_infop row_info = &(png_ptr->row_info);
1474 png_bytep row = png_ptr->row_buf + 1;
1475 int pass = png_ptr->pass;
1476 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1477 png_uint_32 transformations = png_ptr->transformations;
1480 png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1482 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1483 if (_mmx_supported == 2) {
1484 /* this should have happened in png_init_mmx_flags() already */
1485 png_warning(png_ptr, "asm_flags may not have been initialized");
1490 if (row != NULL && row_info != NULL)
1492 png_uint_32 final_width;
1494 final_width = row_info->width * png_pass_inc[pass];
1496 switch (row_info->pixel_depth)
1502 int s_start, s_end, s_inc;
1507 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1508 dp = row + (png_size_t)((final_width - 1) >> 3);
1509 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1510 if (transformations & PNG_PACKSWAP)
1512 sshift = (int)((row_info->width + 7) & 7);
1513 dshift = (int)((final_width + 7) & 7);
1521 sshift = 7 - (int)((row_info->width + 7) & 7);
1522 dshift = 7 - (int)((final_width + 7) & 7);
1528 for (i = row_info->width; i; i--)
1530 v = (png_byte)((*sp >> sshift) & 0x1);
1531 for (j = 0; j < png_pass_inc[pass]; j++)
1533 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1534 *dp |= (png_byte)(v << dshift);
1535 if (dshift == s_end)
1543 if (sshift == s_end)
1558 int s_start, s_end, s_inc;
1561 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1562 dp = row + (png_size_t)((final_width - 1) >> 2);
1563 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1564 if (transformations & PNG_PACKSWAP)
1566 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1567 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1575 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1576 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1582 for (i = row_info->width; i; i--)
1587 v = (png_byte)((*sp >> sshift) & 0x3);
1588 for (j = 0; j < png_pass_inc[pass]; j++)
1590 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1591 *dp |= (png_byte)(v << dshift);
1592 if (dshift == s_end)
1600 if (sshift == s_end)
1615 int s_start, s_end, s_inc;
1618 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1619 dp = row + (png_size_t)((final_width - 1) >> 1);
1620 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1621 if (transformations & PNG_PACKSWAP)
1623 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1624 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1632 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1633 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1639 for (i = row_info->width; i; i--)
1644 v = (png_byte)((*sp >> sshift) & 0xf);
1645 for (j = 0; j < png_pass_inc[pass]; j++)
1647 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1648 *dp |= (png_byte)(v << dshift);
1649 if (dshift == s_end)
1657 if (sshift == s_end)
1668 /*====================================================================*/
1670 default: /* 8-bit or larger (this is where the routine is modified) */
1673 // static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1674 // static unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1675 // unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1676 // unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1680 png_size_t pixel_bytes;
1681 int width = (int)row_info->width;
1683 pixel_bytes = (row_info->pixel_depth >> 3);
1685 /* point sptr at the last pixel in the pre-expanded row: */
1686 sptr = row + (width - 1) * pixel_bytes;
1688 /* point dp at the last pixel position in the expanded row: */
1689 dp = row + (final_width - 1) * pixel_bytes;
1691 /* New code by Nirav Chhatrapati - Intel Corporation */
1693 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1694 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1695 /* && _mmx_supported */ )
1697 //--------------------------------------------------------------
1698 if (pixel_bytes == 3)
1700 if (((pass == 0) || (pass == 1)) && width)
1702 int dummy_value_c; // fix 'forbidden register spilled'
1706 __asm__ __volatile__ (
1707 "subl $21, %%edi \n\t"
1708 // (png_pass_inc[pass] - 1)*pixel_bytes
1710 ".loop3_pass0: \n\t"
1711 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1712 "pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
1713 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1714 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1715 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1716 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1717 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1718 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1719 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1720 "movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
1721 "psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
1722 "movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
1723 "punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
1724 "movq %%mm4, 16(%%edi) \n\t"
1725 "psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
1726 "movq %%mm3, 8(%%edi) \n\t"
1727 "punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
1728 "subl $3, %%esi \n\t"
1729 "movq %%mm0, (%%edi) \n\t"
1730 "subl $24, %%edi \n\t"
1732 "jnz .loop3_pass0 \n\t"
1735 : "=c" (dummy_value_c), // output regs (dummy)
1736 "=S" (dummy_value_S),
1737 "=D" (dummy_value_D)
1739 : "1" (sptr), // esi // input regs
1742 // doesn't work "i" (0x0000000000FFFFFFLL) // %1 (a.k.a. _const4)
1744 #if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1745 : "%mm0", "%mm1", "%mm2" // clobber list
1750 else if (((pass == 2) || (pass == 3)) && width)
1752 int dummy_value_c; // fix 'forbidden register spilled'
1756 __asm__ __volatile__ (
1757 "subl $9, %%edi \n\t"
1758 // (png_pass_inc[pass] - 1)*pixel_bytes
1760 ".loop3_pass2: \n\t"
1761 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1762 "pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
1763 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1764 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1765 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1766 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1767 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1768 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1769 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1770 "movq %%mm0, 4(%%edi) \n\t"
1771 "psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
1772 "subl $3, %%esi \n\t"
1773 "movd %%mm0, (%%edi) \n\t"
1774 "subl $12, %%edi \n\t"
1776 "jnz .loop3_pass2 \n\t"
1779 : "=c" (dummy_value_c), // output regs (dummy)
1780 "=S" (dummy_value_S),
1781 "=D" (dummy_value_D)
1783 : "1" (sptr), // esi // input regs
1787 #if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1788 : "%mm0", "%mm1", "%mm2" // clobber list
1792 else if (width) /* && ((pass == 4) || (pass == 5)) */
1794 int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh?
1797 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1800 // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1801 // sptr points at last pixel in pre-expanded row
1802 // dp points at last pixel position in expanded row
1803 int dummy_value_c; // fix 'forbidden register spilled'
1807 __asm__ __volatile__ (
1808 "subl $3, %%esi \n\t"
1809 "subl $9, %%edi \n\t"
1810 // (png_pass_inc[pass] + 1)*pixel_bytes
1812 ".loop3_pass4: \n\t"
1813 "movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
1814 "movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
1815 "movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
1816 "psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
1817 "pand _const4, %%mm1 \n\t" // z z z z z 2 1 0
1818 "psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
1819 "por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
1820 "movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
1821 "psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
1822 "movq %%mm0, (%%edi) \n\t"
1823 "psrlq $16, %%mm3 \n\t" // z z z z z x x 5
1824 "pand _const6, %%mm3 \n\t" // z z z z z z z 5
1825 "por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
1826 "subl $6, %%esi \n\t"
1827 "movd %%mm2, 8(%%edi) \n\t"
1828 "subl $12, %%edi \n\t"
1829 "subl $2, %%ecx \n\t"
1830 "jnz .loop3_pass4 \n\t"
1833 : "=c" (dummy_value_c), // output regs (dummy)
1834 "=S" (dummy_value_S),
1835 "=D" (dummy_value_D)
1837 : "1" (sptr), // esi // input regs
1839 "0" (width_mmx) // ecx
1841 #if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1842 : "%mm0", "%mm1" // clobber list
1848 sptr -= width_mmx*3;
1850 for (i = width; i; i--)
1855 png_memcpy(v, sptr, 3);
1856 for (j = 0; j < png_pass_inc[pass]; j++)
1858 png_memcpy(dp, v, 3);
1864 } /* end of pixel_bytes == 3 */
1866 //--------------------------------------------------------------
1867 else if (pixel_bytes == 1)
1869 if (((pass == 0) || (pass == 1)) && width)
1871 int width_mmx = ((width >> 2) << 2);
1872 width -= width_mmx; // 0-3 pixels => 0-3 bytes
1875 int dummy_value_c; // fix 'forbidden register spilled'
1879 __asm__ __volatile__ (
1880 "subl $3, %%esi \n\t"
1881 "subl $31, %%edi \n\t"
1883 ".loop1_pass0: \n\t"
1884 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1885 "movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
1886 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1887 "movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
1888 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1889 "movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
1890 "punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
1891 "punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
1892 "movq %%mm0, (%%edi) \n\t"
1893 "punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
1894 "movq %%mm3, 8(%%edi) \n\t"
1895 "movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
1896 "punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
1897 "punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
1898 "movq %%mm2, 16(%%edi) \n\t"
1899 "subl $4, %%esi \n\t"
1900 "movq %%mm4, 24(%%edi) \n\t"
1901 "subl $32, %%edi \n\t"
1902 "subl $4, %%ecx \n\t"
1903 "jnz .loop1_pass0 \n\t"
1906 : "=c" (dummy_value_c), // output regs (dummy)
1907 "=S" (dummy_value_S),
1908 "=D" (dummy_value_D)
1910 : "1" (sptr), // esi // input regs
1912 "0" (width_mmx) // ecx
1914 #if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1915 : "%mm0", "%mm1", "%mm2" // clobber list
1923 for (i = width; i; i--)
1927 /* I simplified this part in version 1.0.4e
1928 * here and in several other instances where
1929 * pixel_bytes == 1 -- GR-P
1934 * png_memcpy(v, sptr, pixel_bytes);
1935 * for (j = 0; j < png_pass_inc[pass]; j++)
1937 * png_memcpy(dp, v, pixel_bytes);
1938 * dp -= pixel_bytes;
1940 * sptr -= pixel_bytes;
1942 * Replacement code is in the next three lines:
1945 for (j = 0; j < png_pass_inc[pass]; j++)
1952 else if (((pass == 2) || (pass == 3)) && width)
1954 int width_mmx = ((width >> 2) << 2);
1955 width -= width_mmx; // 0-3 pixels => 0-3 bytes
1958 int dummy_value_c; // fix 'forbidden register spilled'
1962 __asm__ __volatile__ (
1963 "subl $3, %%esi \n\t"
1964 "subl $15, %%edi \n\t"
1966 ".loop1_pass2: \n\t"
1967 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1968 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1969 "movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
1970 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1971 "punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
1972 "movq %%mm0, (%%edi) \n\t"
1973 "subl $4, %%esi \n\t"
1974 "movq %%mm1, 8(%%edi) \n\t"
1975 "subl $16, %%edi \n\t"
1976 "subl $4, %%ecx \n\t"
1977 "jnz .loop1_pass2 \n\t"
1980 : "=c" (dummy_value_c), // output regs (dummy)
1981 "=S" (dummy_value_S),
1982 "=D" (dummy_value_D)
1984 : "1" (sptr), // esi // input regs
1986 "0" (width_mmx) // ecx
1988 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1989 : "%mm0", "%mm1" // clobber list
1996 for (i = width; i; i--)
2000 for (j = 0; j < png_pass_inc[pass]; j++)
2007 else if (width) /* && ((pass == 4) || (pass == 5)) */
2009 int width_mmx = ((width >> 3) << 3);
2010 width -= width_mmx; // 0-7 pixels => 0-7 bytes
2013 int dummy_value_c; // fix 'forbidden register spilled'
2017 __asm__ __volatile__ (
2018 "subl $7, %%esi \n\t"
2019 "subl $15, %%edi \n\t"
2021 ".loop1_pass4: \n\t"
2022 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2023 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2024 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2025 "punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
2026 "movq %%mm1, 8(%%edi) \n\t"
2027 "subl $8, %%esi \n\t"
2028 "movq %%mm0, (%%edi) \n\t"
2029 "subl $16, %%edi \n\t"
2030 "subl $8, %%ecx \n\t"
2031 "jnz .loop1_pass4 \n\t"
2034 : "=c" (dummy_value_c), // output regs (dummy)
2035 "=S" (dummy_value_S),
2036 "=D" (dummy_value_D)
2038 : "1" (sptr), // esi // input regs
2040 "0" (width_mmx) // ecx
2042 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2043 : "%mm0", "%mm1" // clobber list
2050 for (i = width; i; i--)
2054 for (j = 0; j < png_pass_inc[pass]; j++)
2061 } /* end of pixel_bytes == 1 */
2063 //--------------------------------------------------------------
2064 else if (pixel_bytes == 2)
2066 if (((pass == 0) || (pass == 1)) && width)
2068 int width_mmx = ((width >> 1) << 1);
2069 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2072 int dummy_value_c; // fix 'forbidden register spilled'
2076 __asm__ __volatile__ (
2077 "subl $2, %%esi \n\t"
2078 "subl $30, %%edi \n\t"
2080 ".loop2_pass0: \n\t"
2081 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2082 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2083 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2084 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2085 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2086 "movq %%mm0, (%%edi) \n\t"
2087 "movq %%mm0, 8(%%edi) \n\t"
2088 "movq %%mm1, 16(%%edi) \n\t"
2089 "subl $4, %%esi \n\t"
2090 "movq %%mm1, 24(%%edi) \n\t"
2091 "subl $32, %%edi \n\t"
2092 "subl $2, %%ecx \n\t"
2093 "jnz .loop2_pass0 \n\t"
2096 : "=c" (dummy_value_c), // output regs (dummy)
2097 "=S" (dummy_value_S),
2098 "=D" (dummy_value_D)
2100 : "1" (sptr), // esi // input regs
2102 "0" (width_mmx) // ecx
2104 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2105 : "%mm0", "%mm1" // clobber list
2110 sptr -= (width_mmx*2 - 2); // sign fixed
2111 dp -= (width_mmx*16 - 2); // sign fixed
2112 for (i = width; i; i--)
2117 png_memcpy(v, sptr, 2);
2118 for (j = 0; j < png_pass_inc[pass]; j++)
2121 png_memcpy(dp, v, 2);
2125 else if (((pass == 2) || (pass == 3)) && width)
2127 int width_mmx = ((width >> 1) << 1) ;
2128 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2131 int dummy_value_c; // fix 'forbidden register spilled'
2135 __asm__ __volatile__ (
2136 "subl $2, %%esi \n\t"
2137 "subl $14, %%edi \n\t"
2139 ".loop2_pass2: \n\t"
2140 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2141 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2142 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2143 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2144 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2145 "movq %%mm0, (%%edi) \n\t"
2146 "subl $4, %%esi \n\t"
2147 "movq %%mm1, 8(%%edi) \n\t"
2148 "subl $16, %%edi \n\t"
2149 "subl $2, %%ecx \n\t"
2150 "jnz .loop2_pass2 \n\t"
2153 : "=c" (dummy_value_c), // output regs (dummy)
2154 "=S" (dummy_value_S),
2155 "=D" (dummy_value_D)
2157 : "1" (sptr), // esi // input regs
2159 "0" (width_mmx) // ecx
2161 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2162 : "%mm0", "%mm1" // clobber list
2167 sptr -= (width_mmx*2 - 2); // sign fixed
2168 dp -= (width_mmx*8 - 2); // sign fixed
2169 for (i = width; i; i--)
2174 png_memcpy(v, sptr, 2);
2175 for (j = 0; j < png_pass_inc[pass]; j++)
2178 png_memcpy(dp, v, 2);
2182 else if (width) // pass == 4 or 5
2184 int width_mmx = ((width >> 1) << 1) ;
2185 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2188 int dummy_value_c; // fix 'forbidden register spilled'
2192 __asm__ __volatile__ (
2193 "subl $2, %%esi \n\t"
2194 "subl $6, %%edi \n\t"
2196 ".loop2_pass4: \n\t"
2197 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2198 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2199 "subl $4, %%esi \n\t"
2200 "movq %%mm0, (%%edi) \n\t"
2201 "subl $8, %%edi \n\t"
2202 "subl $2, %%ecx \n\t"
2203 "jnz .loop2_pass4 \n\t"
2206 : "=c" (dummy_value_c), // output regs (dummy)
2207 "=S" (dummy_value_S),
2208 "=D" (dummy_value_D)
2210 : "1" (sptr), // esi // input regs
2212 "0" (width_mmx) // ecx
2214 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2215 : "%mm0" // clobber list
2220 sptr -= (width_mmx*2 - 2); // sign fixed
2221 dp -= (width_mmx*4 - 2); // sign fixed
2222 for (i = width; i; i--)
2227 png_memcpy(v, sptr, 2);
2228 for (j = 0; j < png_pass_inc[pass]; j++)
2231 png_memcpy(dp, v, 2);
2235 } /* end of pixel_bytes == 2 */
2237 //--------------------------------------------------------------
2238 else if (pixel_bytes == 4)
2240 if (((pass == 0) || (pass == 1)) && width)
2242 int width_mmx = ((width >> 1) << 1);
2243 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2246 int dummy_value_c; // fix 'forbidden register spilled'
2250 __asm__ __volatile__ (
2251 "subl $4, %%esi \n\t"
2252 "subl $60, %%edi \n\t"
2254 ".loop4_pass0: \n\t"
2255 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2256 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2257 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2258 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2259 "movq %%mm0, (%%edi) \n\t"
2260 "movq %%mm0, 8(%%edi) \n\t"
2261 "movq %%mm0, 16(%%edi) \n\t"
2262 "movq %%mm0, 24(%%edi) \n\t"
2263 "movq %%mm1, 32(%%edi) \n\t"
2264 "movq %%mm1, 40(%%edi) \n\t"
2265 "movq %%mm1, 48(%%edi) \n\t"
2266 "subl $8, %%esi \n\t"
2267 "movq %%mm1, 56(%%edi) \n\t"
2268 "subl $64, %%edi \n\t"
2269 "subl $2, %%ecx \n\t"
2270 "jnz .loop4_pass0 \n\t"
2273 : "=c" (dummy_value_c), // output regs (dummy)
2274 "=S" (dummy_value_S),
2275 "=D" (dummy_value_D)
2277 : "1" (sptr), // esi // input regs
2279 "0" (width_mmx) // ecx
2281 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2282 : "%mm0", "%mm1" // clobber list
2287 sptr -= (width_mmx*4 - 4); // sign fixed
2288 dp -= (width_mmx*32 - 4); // sign fixed
2289 for (i = width; i; i--)
2294 png_memcpy(v, sptr, 4);
2295 for (j = 0; j < png_pass_inc[pass]; j++)
2298 png_memcpy(dp, v, 4);
2302 else if (((pass == 2) || (pass == 3)) && width)
2304 int width_mmx = ((width >> 1) << 1);
2305 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2308 int dummy_value_c; // fix 'forbidden register spilled'
2312 __asm__ __volatile__ (
2313 "subl $4, %%esi \n\t"
2314 "subl $28, %%edi \n\t"
2316 ".loop4_pass2: \n\t"
2317 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2318 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2319 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2320 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2321 "movq %%mm0, (%%edi) \n\t"
2322 "movq %%mm0, 8(%%edi) \n\t"
2323 "movq %%mm1, 16(%%edi) \n\t"
2324 "movq %%mm1, 24(%%edi) \n\t"
2325 "subl $8, %%esi \n\t"
2326 "subl $32, %%edi \n\t"
2327 "subl $2, %%ecx \n\t"
2328 "jnz .loop4_pass2 \n\t"
2331 : "=c" (dummy_value_c), // output regs (dummy)
2332 "=S" (dummy_value_S),
2333 "=D" (dummy_value_D)
2335 : "1" (sptr), // esi // input regs
2337 "0" (width_mmx) // ecx
2339 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2340 : "%mm0", "%mm1" // clobber list
2345 sptr -= (width_mmx*4 - 4); // sign fixed
2346 dp -= (width_mmx*16 - 4); // sign fixed
2347 for (i = width; i; i--)
2352 png_memcpy(v, sptr, 4);
2353 for (j = 0; j < png_pass_inc[pass]; j++)
2356 png_memcpy(dp, v, 4);
2360 else if (width) // pass == 4 or 5
2362 int width_mmx = ((width >> 1) << 1) ;
2363 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2366 int dummy_value_c; // fix 'forbidden register spilled'
2370 __asm__ __volatile__ (
2371 "subl $4, %%esi \n\t"
2372 "subl $12, %%edi \n\t"
2374 ".loop4_pass4: \n\t"
2375 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2376 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2377 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2378 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2379 "movq %%mm0, (%%edi) \n\t"
2380 "subl $8, %%esi \n\t"
2381 "movq %%mm1, 8(%%edi) \n\t"
2382 "subl $16, %%edi \n\t"
2383 "subl $2, %%ecx \n\t"
2384 "jnz .loop4_pass4 \n\t"
2387 : "=c" (dummy_value_c), // output regs (dummy)
2388 "=S" (dummy_value_S),
2389 "=D" (dummy_value_D)
2391 : "1" (sptr), // esi // input regs
2393 "0" (width_mmx) // ecx
2395 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2396 : "%mm0", "%mm1" // clobber list
2401 sptr -= (width_mmx*4 - 4); // sign fixed
2402 dp -= (width_mmx*8 - 4); // sign fixed
2403 for (i = width; i; i--)
2408 png_memcpy(v, sptr, 4);
2409 for (j = 0; j < png_pass_inc[pass]; j++)
2412 png_memcpy(dp, v, 4);
2416 } /* end of pixel_bytes == 4 */
2418 //--------------------------------------------------------------
2419 else if (pixel_bytes == 8)
2421 // GRR TEST: should work, but needs testing (special 64-bit version of rpng2?)
2422 // GRR NOTE: no need to combine passes here!
2423 if (((pass == 0) || (pass == 1)) && width)
2425 int dummy_value_c; // fix 'forbidden register spilled'
2429 // source is 8-byte RRGGBBAA
2430 // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2431 __asm__ __volatile__ (
2432 "subl $56, %%edi \n\t" // start of last block
2434 ".loop8_pass0: \n\t"
2435 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2436 "movq %%mm0, (%%edi) \n\t"
2437 "movq %%mm0, 8(%%edi) \n\t"
2438 "movq %%mm0, 16(%%edi) \n\t"
2439 "movq %%mm0, 24(%%edi) \n\t"
2440 "movq %%mm0, 32(%%edi) \n\t"
2441 "movq %%mm0, 40(%%edi) \n\t"
2442 "movq %%mm0, 48(%%edi) \n\t"
2443 "subl $8, %%esi \n\t"
2444 "movq %%mm0, 56(%%edi) \n\t"
2445 "subl $64, %%edi \n\t"
2447 "jnz .loop8_pass0 \n\t"
2450 : "=c" (dummy_value_c), // output regs (dummy)
2451 "=S" (dummy_value_S),
2452 "=D" (dummy_value_D)
2454 : "1" (sptr), // esi // input regs
2458 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2459 : "%mm0" // clobber list
2463 else if (((pass == 2) || (pass == 3)) && width)
2465 // source is 8-byte RRGGBBAA
2466 // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2467 int width_mmx = ((width >> 1) << 1) ;
2471 int dummy_value_c; // fix 'forbidden register spilled'
2475 __asm__ __volatile__ (
2476 "subl $24, %%edi \n\t" // start of last block
2478 ".loop8_pass2: \n\t"
2479 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2480 "movq %%mm0, (%%edi) \n\t"
2481 "movq %%mm0, 8(%%edi) \n\t"
2482 "movq %%mm0, 16(%%edi) \n\t"
2483 "subl $8, %%esi \n\t"
2484 "movq %%mm0, 24(%%edi) \n\t"
2485 "subl $32, %%edi \n\t"
2487 "jnz .loop8_pass2 \n\t"
2490 : "=c" (dummy_value_c), // output regs (dummy)
2491 "=S" (dummy_value_S),
2492 "=D" (dummy_value_D)
2494 : "1" (sptr), // esi // input regs
2498 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2499 : "%mm0" // clobber list
2504 else if (width) // pass == 4 or 5
2506 // source is 8-byte RRGGBBAA
2507 // dest is 16-byte RRGGBBAA RRGGBBAA
2508 int width_mmx = ((width >> 1) << 1) ;
2512 int dummy_value_c; // fix 'forbidden register spilled'
2516 __asm__ __volatile__ (
2517 "subl $8, %%edi \n\t" // start of last block
2519 ".loop8_pass4: \n\t"
2520 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2521 "movq %%mm0, (%%edi) \n\t"
2522 "subl $8, %%esi \n\t"
2523 "movq %%mm0, 8(%%edi) \n\t"
2524 "subl $16, %%edi \n\t"
2526 "jnz .loop8_pass4 \n\t"
2529 : "=c" (dummy_value_c), // output regs (dummy)
2530 "=S" (dummy_value_S),
2531 "=D" (dummy_value_D)
2533 : "1" (sptr), // esi // input regs
2537 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2538 : "%mm0" // clobber list
2544 } /* end of pixel_bytes == 8 */
2546 //--------------------------------------------------------------
2547 else if (pixel_bytes == 6)
2549 for (i = width; i; i--)
2553 png_memcpy(v, sptr, 6);
2554 for (j = 0; j < png_pass_inc[pass]; j++)
2556 png_memcpy(dp, v, 6);
2561 } /* end of pixel_bytes == 6 */
2563 //--------------------------------------------------------------
2566 for (i = width; i; i--)
2570 png_memcpy(v, sptr, pixel_bytes);
2571 for (j = 0; j < png_pass_inc[pass]; j++)
2573 png_memcpy(dp, v, pixel_bytes);
2579 } // end of _mmx_supported ========================================
2581 else /* MMX not supported: use modified C code - takes advantage
2582 * of inlining of png_memcpy for a constant */
2583 /* GRR 19991007: does it? or should pixel_bytes in each
2584 * block be replaced with immediate value (e.g., 1)? */
2585 /* GRR 19991017: replaced with constants in each case */
2586 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
2588 if (pixel_bytes == 1)
2590 for (i = width; i; i--)
2593 for (j = 0; j < png_pass_inc[pass]; j++)
2600 else if (pixel_bytes == 3)
2602 for (i = width; i; i--)
2606 png_memcpy(v, sptr, 3);
2607 for (j = 0; j < png_pass_inc[pass]; j++)
2609 png_memcpy(dp, v, 3);
2615 else if (pixel_bytes == 2)
2617 for (i = width; i; i--)
2621 png_memcpy(v, sptr, 2);
2622 for (j = 0; j < png_pass_inc[pass]; j++)
2624 png_memcpy(dp, v, 2);
2630 else if (pixel_bytes == 4)
2632 for (i = width; i; i--)
2636 png_memcpy(v, sptr, 4);
2637 for (j = 0; j < png_pass_inc[pass]; j++)
2640 if (dp < row || dp+3 > row+png_ptr->row_buf_size)
2642 printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2643 row, dp, row+png_ptr->row_buf_size);
2644 printf("row_buf=%d\n",png_ptr->row_buf_size);
2647 png_memcpy(dp, v, 4);
2653 else if (pixel_bytes == 6)
2655 for (i = width; i; i--)
2659 png_memcpy(v, sptr, 6);
2660 for (j = 0; j < png_pass_inc[pass]; j++)
2662 png_memcpy(dp, v, 6);
2668 else if (pixel_bytes == 8)
2670 for (i = width; i; i--)
2674 png_memcpy(v, sptr, 8);
2675 for (j = 0; j < png_pass_inc[pass]; j++)
2677 png_memcpy(dp, v, 8);
2683 else /* GRR: should never be reached */
2685 for (i = width; i; i--)
2689 png_memcpy(v, sptr, pixel_bytes);
2690 for (j = 0; j < png_pass_inc[pass]; j++)
2692 png_memcpy(dp, v, pixel_bytes);
2695 sptr -= pixel_bytes;
2699 } /* end if (MMX not supported) */
2702 } /* end switch (row_info->pixel_depth) */
2704 row_info->width = final_width;
2705 row_info->rowbytes = ((final_width *
2706 (png_uint_32)row_info->pixel_depth + 7) >> 3);
2709 } /* end png_do_read_interlace() */
2711 #endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2712 #endif /* PNG_READ_INTERLACING_SUPPORTED */
2716 #if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
2717 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
2719 // These variables are utilized in the functions below. They are declared
2720 // globally here to ensure alignment on 8-byte boundaries.
2725 } _LBCarryMask = {0x0101010101010101LL},
2726 _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
2727 _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
2729 #ifdef PNG_THREAD_UNSAFE_OK
2730 //===========================================================================//
2732 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G //
2734 //===========================================================================//
2736 // Optimized code for PNG Average filter decoder
// png_read_filter_row_mmx_avg: undo the PNG "Average" filter on one row, in
// place, following the recurrence quoted in the assembly comments below:
//
//     Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x)) / 2)
//
// where the Raw(x-bpp) term is dropped for the first bpp bytes.
//
//   row_info - supplies pixel_depth (bpp is derived from it) and rowbytes
//              (the number of bytes to filter, stored in _FullLength)
//   row      - Avg(x) bytes on entry (preloaded into edi); overwritten with
//              the Raw(x) bytes
//   prev_row - Prior(x) bytes (preloaded into esi); only read by the code here
//
// The work is done in four phases:
//   1. a byte loop over the first bpp bytes;
//   2. a byte loop up to the offset _dif, computed from the row address
//      rounded to an 8-byte boundary;
//   3. an MMX loop, 8 bytes per iteration, from _dif up to _MMXLength, with a
//      separate variant per bytes-per-pixel value;
//   4. a byte loop over the bytes left between _MMXLength and _FullLength,
//      followed by EMMS.
//
// NOTE(review): _dif, _FullLength, _MMXLength, _ActiveMask, _ShiftBpp and
// _ShiftRem are addressed by symbol name from the assembly rather than through
// operands, i.e. they are shared storage outside this function. That is
// presumably why the routine is only built under PNG_THREAD_UNSAFE_OK --
// confirm before calling it from more than one thread.
2738 static void /* PRIVATE */
2739 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2743 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
2747 bpp = (row_info->pixel_depth + 7) >> 3; // get # bytes per pixel
2748 _FullLength = row_info->rowbytes; // # of bytes to filter
// Phases 1 and 2 (scalar): decode the first bpp bytes, then continue one byte
// at a time up to _dif; finally derive _MMXLength for the MMX loops below.
2750 __asm__ __volatile__ (
2751 // initialize address pointers and offset
2753 "pushl %%ebx \n\t" // save index to Global Offset Table
2755 //pre "movl row, %%edi \n\t" // edi: Avg(x)
2756 "xorl %%ebx, %%ebx \n\t" // ebx: x
2757 "movl %%edi, %%edx \n\t"
2758 //pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
2759 //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
2760 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
2762 "xorl %%eax,%%eax \n\t"
2764 // Compute the Raw value for the first bpp bytes
2765 // Raw(x) = Avg(x) + (Prior(x)/2)
2767 "movb (%%esi,%%ebx,),%%al \n\t" // load al with Prior(x)
2769 "shrb %%al \n\t" // divide by 2
2770 "addb -1(%%edi,%%ebx,),%%al \n\t" // add Avg(x); -1 to offset inc ebx
2771 //pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
2772 "cmpl %%ecx, %%ebx \n\t"
2773 "movb %%al,-1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2774 "jb avg_rlp \n\t" // mov does not affect flags
2776 // get # of bytes to alignment
// _dif = ((edi + ebx + 0xf) & 0xfffffff8) - edi; it is kept in memory so that
// the later asm blocks can reload it as their starting offset.
2777 "movl %%edi, _dif \n\t" // take start of row
2778 "addl %%ebx, _dif \n\t" // add bpp
2779 "addl $0xf, _dif \n\t" // add 7+8 to incr past alignment bdry
2780 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
2781 "subl %%edi, _dif \n\t" // subtract from start => value ebx at
2782 "jz avg_go \n\t" // alignment
2785 // Compute the Raw value for the bytes up to the alignment boundary
2786 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2787 "xorl %%ecx, %%ecx \n\t"
// The two bytes are summed in 16-bit registers (cx, ax) so that the carry out
// of bit 7 survives until the shift that halves the sum.
2790 "xorl %%eax, %%eax \n\t"
2791 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
2792 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
2793 "addw %%cx, %%ax \n\t"
2795 "shrw %%ax \n\t" // divide by 2
2796 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2797 "cmpl _dif, %%ebx \n\t" // check if at alignment boundary
2798 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2799 "jb avg_lp1 \n\t" // repeat until at alignment boundary
// _MMXLength = _FullLength - ((_FullLength - ebx) & 7): the end offset used by
// the 8-bytes-per-iteration MMX loops; the remainder is left to the final
// scalar clean-up block.
2802 "movl _FullLength, %%eax \n\t"
2803 "movl %%eax, %%ecx \n\t"
2804 "subl %%ebx, %%eax \n\t" // subtract alignment fix
2805 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
2806 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
2807 "movl %%ecx, _MMXLength \n\t"
2809 "popl %%ebx \n\t" // restore index to Global Offset Table
2812 : "=c" (dummy_value_c), // output regs (dummy)
2813 "=S" (dummy_value_S),
2814 "=D" (dummy_value_D)
2816 : "0" (bpp), // ecx // input regs
2817 "1" (prev_row), // esi
2820 : "%eax", "%edx" // clobber list
2824 // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
2825 // (seems to work fine without...)
2828 // now do the math for the rest of the row
// Phase 3 (MMX). All variants below average bytes without overflowing them:
// each operand is halved on its own (psrlq $1, then pand with _HBClearMask to
// discard the bit shifted in from the neighbouring byte), and _LBCarryMask is
// used to add 1 back wherever both operands had their low bit set.
//
// Variant for a 24-bit shift (3 bytes per pixel): each 8-byte group is
// completed in three masked steps, because the Raw(x-bpp) bytes needed by a
// later step are produced by the step before it.
2833 _ActiveMask.use = 0x0000000000ffffffLL;
2834 _ShiftBpp.use = 24; // == 3 * 8
2835 _ShiftRem.use = 40; // == 64 - 24
2837 __asm__ __volatile__ (
2838 // re-init address pointers and offset
2839 "movq _ActiveMask, %%mm7 \n\t"
2840 "movl _dif, %%ecx \n\t" // ecx: x = offset to
2841 "movq _LBCarryMask, %%mm5 \n\t" // alignment boundary
2842 // preload "movl row, %%edi \n\t" // edi: Avg(x)
2843 "movq _HBClearMask, %%mm4 \n\t"
2844 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2846 // prime the pump: load the first Raw(x-bpp) data set
2847 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2848 // (correct pos. in loop below)
2850 "movq (%%edi,%%ecx,), %%mm0 \n\t" // load mm0 with Avg(x)
2851 "movq %%mm5, %%mm3 \n\t"
2852 "psrlq _ShiftRem, %%mm2 \n\t" // correct position Raw(x-bpp)
2854 "movq (%%esi,%%ecx,), %%mm1 \n\t" // load mm1 with Prior(x)
2855 "movq %%mm7, %%mm6 \n\t"
2856 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2857 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2858 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
2860 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
2862 // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2863 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2865 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2867 // lsb's were == 1 (only valid for active group)
2868 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2869 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2871 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2873 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
2874 // bytes to add to Avg
2875 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2876 // Avg for each Active
2878 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2879 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
2881 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2882 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2883 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2885 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2887 // lsb's were == 1 (only valid for active group)
2888 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2889 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2891 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2893 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
2894 // bytes to add to Avg
2895 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2896 // Avg for each Active
2899 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2900 "psllq _ShiftBpp, %%mm6 \n\t" // shift mm6 mask to cover last
2903 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2904 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2905 // Data only needs to be shifted once here to
2906 // get the correct x-bpp offset.
2907 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2909 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2911 // lsb's were == 1 (only valid for active group)
2912 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2913 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2915 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2917 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 3
2918 // bytes to add to Avg
2919 "addl $8, %%ecx \n\t"
2920 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2921 // Avg for each Active
2923 // now ready to write back to memory
2924 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2925 // move updated Raw(x) to use as Raw(x-bpp) for next loop
2926 "cmpl _MMXLength, %%ecx \n\t"
2927 "movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2
2930 : "=S" (dummy_value_S), // output regs (dummy)
2931 "=D" (dummy_value_D)
2933 : "0" (prev_row), // esi // input regs
2936 : "%ecx" // clobber list
2937 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2938 , "%mm0", "%mm1", "%mm2", "%mm3"
2939 , "%mm4", "%mm5", "%mm6", "%mm7"
2947 //case 7: // who wrote this? PNG doesn't support 5 or 7 bytes/pixel
2948 //case 5: // GRR BOGUS
// Variant with run-time shift counts (closed by "end 4,6 bpp" below): the
// masks are derived from bpp, and each 8-byte group is completed in two
// masked steps (mm7 selects the first active group, mm6 the second).
2950 _ActiveMask.use = 0xffffffffffffffffLL; // use shift below to clear
2951 // appropriate inactive bytes
2952 _ShiftBpp.use = bpp << 3;
2953 _ShiftRem.use = 64 - _ShiftBpp.use;
2955 __asm__ __volatile__ (
2956 "movq _HBClearMask, %%mm4 \n\t"
2958 // re-init address pointers and offset
2959 "movl _dif, %%ecx \n\t" // ecx: x = offset to
2960 // alignment boundary
2962 // load _ActiveMask and clear all bytes except for 1st active group
2963 "movq _ActiveMask, %%mm7 \n\t"
2964 // preload "movl row, %%edi \n\t" // edi: Avg(x)
2965 "psrlq _ShiftRem, %%mm7 \n\t"
2966 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2967 "movq %%mm7, %%mm6 \n\t"
2968 "movq _LBCarryMask, %%mm5 \n\t"
2969 "psllq _ShiftBpp, %%mm6 \n\t" // create mask for 2nd active
2972 // prime the pump: load the first Raw(x-bpp) data set
2973 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2974 // (we correct pos. in loop below)
2976 "movq (%%edi,%%ecx,), %%mm0 \n\t"
2977 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
2978 "movq (%%esi,%%ecx,), %%mm1 \n\t"
2979 // add (Prev_row/2) to average
2980 "movq %%mm5, %%mm3 \n\t"
2981 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2982 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2983 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
2985 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
2987 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
2988 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2990 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2992 // lsb's were == 1 (only valid for active group)
2993 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2994 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2996 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2998 "pand %%mm7, %%mm2 \n\t" // leave only Active Group 1
2999 // bytes to add to Avg
3000 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3003 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3004 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3005 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3006 "addl $8, %%ecx \n\t"
3007 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3009 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3011 // lsb's were == 1 (only valid for active group)
3012 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3013 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3015 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3017 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3018 // bytes to add to Avg
3019 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3020 // Avg for each Active
3022 "cmpl _MMXLength, %%ecx \n\t"
3023 // now ready to write back to memory
3024 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3025 // prep Raw(x-bpp) for next loop
3026 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3029 : "=S" (dummy_value_S), // output regs (dummy)
3030 "=D" (dummy_value_D)
3032 : "0" (prev_row), // esi // input regs
3035 : "%ecx" // clobber list
3036 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3037 , "%mm0", "%mm1", "%mm2", "%mm3"
3038 , "%mm4", "%mm5", "%mm6", "%mm7"
3042 break; // end 4,6 bpp
// Variant for a 16-bit shift (2 bytes per pixel): each 8-byte group is
// completed in four masked steps, one per 2-byte active group.
3046 _ActiveMask.use = 0x000000000000ffffLL;
3047 _ShiftBpp.use = 16; // == 2 * 8
3048 _ShiftRem.use = 48; // == 64 - 16
3050 __asm__ __volatile__ (
3052 "movq _ActiveMask, %%mm7 \n\t"
3053 // re-init address pointers and offset
3054 "movl _dif, %%ecx \n\t" // ecx: x = offset to alignment
3056 "movq _LBCarryMask, %%mm5 \n\t"
3057 // preload "movl row, %%edi \n\t" // edi: Avg(x)
3058 "movq _HBClearMask, %%mm4 \n\t"
3059 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3061 // prime the pump: load the first Raw(x-bpp) data set
3062 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3063 // (we correct pos. in loop below)
3065 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3066 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
3067 "movq (%%esi,%%ecx,), %%mm1 \n\t" // (GRR BUGFIX: was psllq)
3068 // add (Prev_row/2) to average
3069 "movq %%mm5, %%mm3 \n\t"
3070 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3071 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3072 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3074 "movq %%mm7, %%mm6 \n\t"
3075 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3078 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3079 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3081 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3083 // lsb's were == 1 (only valid
3084 // for active group)
3085 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3086 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3088 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3090 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
3091 // bytes to add to Avg
3092 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3093 // for each Active byte
3095 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3096 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3098 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3099 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3100 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3102 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3104 // lsb's were == 1 (only valid
3105 // for active group)
3106 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3107 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3109 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3111 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3112 // bytes to add to Avg
3113 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3114 // Avg for each Active byte
3116 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3117 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3119 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3120 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3121 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3123 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3124 // where both lsb's were == 1
3125 // (only valid for active group)
3126 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3127 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3129 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3131 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 3
3132 // bytes to add to Avg
3133 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3134 // Avg for each Active byte
3136 // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
3137 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3139 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3140 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3141 "addl $8, %%ecx \n\t"
3142 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3144 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3146 // lsb's were == 1 (only valid
3147 // for active group)
3148 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3149 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3151 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3153 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 4
3154 // bytes to add to Avg
3155 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3156 // Avg for each Active byte
3158 "cmpl _MMXLength, %%ecx \n\t"
3159 // now ready to write back to memory
3160 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3161 // prep Raw(x-bpp) for next loop
3162 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3165 : "=S" (dummy_value_S), // output regs (dummy)
3166 "=D" (dummy_value_D)
3168 : "0" (prev_row), // esi // input regs
3171 : "%ecx" // clobber list
3172 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3173 , "%mm0", "%mm1", "%mm2", "%mm3"
3174 , "%mm4", "%mm5", "%mm6", "%mm7"
// Scalar variant (closed by "end 1 bpp" below): the rest of the row, from _dif
// to _FullLength, is decoded one byte at a time and the function then returns
// directly instead of falling through to the shared clean-up block. No MMX
// register is touched by this block.
3182 __asm__ __volatile__ (
3183 // re-init address pointers and offset
3185 "pushl %%ebx \n\t" // save Global Offset Table index
3187 "movl _dif, %%ebx \n\t" // ebx: x = offset to alignment
3189 // preload "movl row, %%edi \n\t" // edi: Avg(x)
3190 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3192 // do Avg decode for remaining bytes
3193 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3194 "movl %%edi, %%edx \n\t"
3195 // preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3196 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3197 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
3200 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3201 "xorl %%eax, %%eax \n\t"
3202 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3203 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3204 "addw %%cx, %%ax \n\t"
3206 "shrw %%ax \n\t" // divide by 2
3207 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
3209 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
3210 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
3211 // mov does not affect flags; -1 to offset inc ebx
3216 "popl %%ebx \n\t" // Global Offset Table index
3219 : "=c" (dummy_value_c), // output regs (dummy)
3220 "=S" (dummy_value_S),
3221 "=D" (dummy_value_D)
3223 : "0" (bpp), // ecx // input regs
3224 "1" (prev_row), // esi
3227 : "%eax", "%edx" // clobber list
3233 return; // end 1 bpp
// Unshifted variant: Raw(x-bpp) is taken to be the whole preceding 8-byte
// group (mm2), so no mask or shift is applied and a single pass per group is
// enough. NOTE(review): presumably the bpp == 8 case -- confirm against the
// case label.
3237 __asm__ __volatile__ (
3238 // re-init address pointers and offset
3239 "movl _dif, %%ecx \n\t" // ecx: x == offset to alignment
3240 "movq _LBCarryMask, %%mm5 \n\t" // boundary
3241 // preload "movl row, %%edi \n\t" // edi: Avg(x)
3242 "movq _HBClearMask, %%mm4 \n\t"
3243 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3245 // prime the pump: load the first Raw(x-bpp) data set
3246 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3247 // (NO NEED to correct pos. in loop below)
3250 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3251 "movq %%mm5, %%mm3 \n\t"
3252 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3253 "addl $8, %%ecx \n\t"
3254 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3255 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3256 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3257 // where both lsb's were == 1
3258 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3259 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7, each byte
3260 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg, each byte
3261 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7, each byte
3262 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each
3263 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3264 "cmpl _MMXLength, %%ecx \n\t"
3265 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3266 "movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
3269 : "=S" (dummy_value_S), // output regs (dummy)
3270 "=D" (dummy_value_D)
3272 : "0" (prev_row), // esi // input regs
3275 : "%ecx" // clobber list
3276 #if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3277 , "%mm0", "%mm1", "%mm2"
3278 , "%mm3", "%mm4", "%mm5"
// Fallback for any other bpp. The generic assembly that follows still names
// row, prev_row and bpp as symbols and has empty operand lists (see the
// FIXASM/CHECKASM notes); it ends at an "#endif /* 0 - NEVER REACHED */".
3284 default: // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
3288 // GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED
3290 "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
3294 __asm__ __volatile__ (
3295 "movq _LBCarryMask, %%mm5 \n\t"
3296 // re-init address pointers and offset
3297 "movl _dif, %%ebx \n\t" // ebx: x = offset to
3298 // alignment boundary
3299 "movl row, %%edi \n\t" // edi: Avg(x)
3300 "movq _HBClearMask, %%mm4 \n\t"
3301 "movl %%edi, %%edx \n\t"
3302 "movl prev_row, %%esi \n\t" // esi: Prior(x)
3303 "subl bpp, %%edx \n\t" // edx: Raw(x-bpp)
3305 "movq (%%edi,%%ebx,), %%mm0 \n\t"
3306 "movq %%mm5, %%mm3 \n\t"
3307 "movq (%%esi,%%ebx,), %%mm1 \n\t"
3308 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3309 "movq (%%edx,%%ebx,), %%mm2 \n\t"
3310 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3311 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3312 // where both lsb's were == 1
3313 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3314 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3316 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg for each
3318 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3320 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3322 "addl $8, %%ebx \n\t"
3323 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3325 "cmpl _MMXLength, %%ebx \n\t"
3326 "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3329 : // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3331 : // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3333 : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3335 #endif /* 0 - NEVER REACHED */
3339 } // end switch (bpp)
// Phase 4 (scalar): decode the bytes between _MMXLength and _FullLength that
// did not fill a whole 8-byte group, then issue EMMS so that floating-point
// code can run after the MMX work above.
3341 __asm__ __volatile__ (
3342 // MMX acceleration complete; now do clean-up
3343 // check if any remaining bytes left to decode
3345 "pushl %%ebx \n\t" // save index to Global Offset Table
3347 "movl _MMXLength, %%ebx \n\t" // ebx: x == offset bytes after MMX
3348 //pre "movl row, %%edi \n\t" // edi: Avg(x)
3349 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3352 // do Avg decode for remaining bytes
3353 //pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
3354 "movl %%edi, %%edx \n\t"
3355 //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3356 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3357 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
3360 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3361 "xorl %%eax, %%eax \n\t"
3362 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3363 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3364 "addw %%cx, %%ax \n\t"
3366 "shrw %%ax \n\t" // divide by 2
3367 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3368 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
3369 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
3370 "jb avg_lp2 \n\t" // affect flags; -1 to offset inc ebx]
3373 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
3375 "popl %%ebx \n\t" // restore index to Global Offset Table
3378 : "=c" (dummy_value_c), // output regs (dummy)
3379 "=S" (dummy_value_S),
3380 "=D" (dummy_value_D)
3382 : "0" (bpp), // ecx // input regs
3383 "1" (prev_row), // esi
3386 : "%eax", "%edx" // clobber list
3392 } /* end png_read_filter_row_mmx_avg() */
3397 #ifdef PNG_THREAD_UNSAFE_OK
3398 //===========================================================================//
3400 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H //
3402 //===========================================================================//
3404 // Optimized code for PNG Paeth filter decoder
3406 static void /* PRIVATE */
3407 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3411 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
3415 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3416 _FullLength = row_info->rowbytes; // # of bytes to filter
3418 __asm__ __volatile__ (
3420 "pushl %%ebx \n\t" // save index to Global Offset Table
3422 "xorl %%ebx, %%ebx \n\t" // ebx: x offset
3423 //pre "movl row, %%edi \n\t"
3424 "xorl %%edx, %%edx \n\t" // edx: x-bpp offset
3425 //pre "movl prev_row, %%esi \n\t"
3426 "xorl %%eax, %%eax \n\t"
3428 // Compute the Raw value for the first bpp bytes
3429 // Note: the formula works out to be always
3430 // Paeth(x) = Raw(x) + Prior(x) where x < bpp
3432 "movb (%%edi,%%ebx,), %%al \n\t"
3433 "addb (%%esi,%%ebx,), %%al \n\t"
3435 //pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx)
3436 "cmpl %%ecx, %%ebx \n\t"
3437 "movb %%al, -1(%%edi,%%ebx,) \n\t"
3439 // get # of bytes to alignment
3440 "movl %%edi, _dif \n\t" // take start of row
3441 "addl %%ebx, _dif \n\t" // add bpp
3442 "xorl %%ecx, %%ecx \n\t"
3443 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment
3445 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
3446 "subl %%edi, _dif \n\t" // subtract from start ==> value ebx
3452 "xorl %%eax, %%eax \n\t"
3453 // pav = p - a = (a + b - c) - a = b - c
3454 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
3455 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3456 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3457 "movl %%eax, _patemp \n\t" // Save pav for later use
3458 "xorl %%eax, %%eax \n\t"
3459 // pbv = p - b = (a + b - c) - b = a - c
3460 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
3461 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3462 "movl %%eax, %%ecx \n\t"
3463 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3464 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
3466 "testl $0x80000000, %%eax \n\t"
3468 "negl %%eax \n\t" // reverse sign of neg values
3471 "movl %%eax, _pctemp \n\t" // save pc for later use
3473 "testl $0x80000000, %%ecx \n\t"
3475 "negl %%ecx \n\t" // reverse sign of neg values
3478 "movl %%ecx, _pbtemp \n\t" // save pb for later use
3480 "movl _patemp, %%eax \n\t"
3481 "testl $0x80000000, %%eax \n\t"
3483 "negl %%eax \n\t" // reverse sign of neg values
3486 "movl %%eax, _patemp \n\t" // save pa for later use
3488 "cmpl %%ecx, %%eax \n\t"
3489 "jna paeth_abb \n\t"
3490 // pa > pb; now test if pb <= pc
3491 "cmpl _pctemp, %%ecx \n\t"
3492 "jna paeth_bbc \n\t"
3493 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3494 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3495 "jmp paeth_paeth \n\t"
3498 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3499 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
3500 "jmp paeth_paeth \n\t"
3503 // pa <= pb; now test if pa <= pc
3504 "cmpl _pctemp, %%eax \n\t"
3505 "jna paeth_abc \n\t"
3506 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3507 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3508 "jmp paeth_paeth \n\t"
3511 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3512 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
3517 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3518 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3519 "cmpl _dif, %%ebx \n\t"
3523 "movl _FullLength, %%ecx \n\t"
3524 "movl %%ecx, %%eax \n\t"
3525 "subl %%ebx, %%eax \n\t" // subtract alignment fix
3526 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
3527 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
3528 "movl %%ecx, _MMXLength \n\t"
3530 "popl %%ebx \n\t" // restore index to Global Offset Table
3533 : "=c" (dummy_value_c), // output regs (dummy)
3534 "=S" (dummy_value_S),
3535 "=D" (dummy_value_D)
3537 : "0" (bpp), // ecx // input regs
3538 "1" (prev_row), // esi
3541 : "%eax", "%edx" // clobber list
3547 // now do the math for the rest of the row
3552 _ActiveMask.use = 0x0000000000ffffffLL;
3553 _ActiveMaskEnd.use = 0xffff000000000000LL;
3554 _ShiftBpp.use = 24; // == bpp(3) * 8
3555 _ShiftRem.use = 40; // == 64 - 24
3557 __asm__ __volatile__ (
3558 "movl _dif, %%ecx \n\t"
3559 // preload "movl row, %%edi \n\t"
3560 // preload "movl prev_row, %%esi \n\t"
3561 "pxor %%mm0, %%mm0 \n\t"
3562 // prime the pump: load the first Raw(x-bpp) data set
3563 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3565 "psrlq _ShiftRem, %%mm1 \n\t" // shift last 3 bytes to 1st
3567 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3568 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3569 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
3570 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3571 "psrlq _ShiftRem, %%mm3 \n\t" // shift last 3 bytes to 1st
3573 // pav = p - a = (a + b - c) - a = b - c
3574 "movq %%mm2, %%mm4 \n\t"
3575 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3576 // pbv = p - b = (a + b - c) - b = a - c
3577 "movq %%mm1, %%mm5 \n\t"
3578 "psubw %%mm3, %%mm4 \n\t"
3579 "pxor %%mm7, %%mm7 \n\t"
3580 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3581 "movq %%mm4, %%mm6 \n\t"
3582 "psubw %%mm3, %%mm5 \n\t"
3584 // pa = abs(p-a) = abs(pav)
3585 // pb = abs(p-b) = abs(pbv)
3586 // pc = abs(p-c) = abs(pcv)
3587 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3588 "paddw %%mm5, %%mm6 \n\t"
3589 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3590 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3591 "psubw %%mm0, %%mm4 \n\t"
3592 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3593 "psubw %%mm0, %%mm4 \n\t"
3594 "psubw %%mm7, %%mm5 \n\t"
3595 "pxor %%mm0, %%mm0 \n\t"
3596 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3597 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3598 "psubw %%mm7, %%mm5 \n\t"
3599 "psubw %%mm0, %%mm6 \n\t"
3601 "movq %%mm4, %%mm7 \n\t"
3602 "psubw %%mm0, %%mm6 \n\t"
3603 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3604 "movq %%mm7, %%mm0 \n\t"
3605 // use mm7 mask to merge pa & pb
3606 "pand %%mm7, %%mm5 \n\t"
3607 // use mm0 mask copy to merge a & b
3608 "pand %%mm0, %%mm2 \n\t"
3609 "pandn %%mm4, %%mm7 \n\t"
3610 "pandn %%mm1, %%mm0 \n\t"
3611 "paddw %%mm5, %%mm7 \n\t"
3612 "paddw %%mm2, %%mm0 \n\t"
3613 // test ((pa <= pb)? pa:pb) <= pc
3614 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3615 "pxor %%mm1, %%mm1 \n\t"
3616 "pand %%mm7, %%mm3 \n\t"
3617 "pandn %%mm0, %%mm7 \n\t"
3618 "paddw %%mm3, %%mm7 \n\t"
3619 "pxor %%mm0, %%mm0 \n\t"
3620 "packuswb %%mm1, %%mm7 \n\t"
3621 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3622 "pand _ActiveMask, %%mm7 \n\t"
3623 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
3624 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3625 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3626 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3627 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as
3629 // now do Paeth for 2nd set of bytes (3-5)
3630 "psrlq _ShiftBpp, %%mm2 \n\t" // load b=Prior(x) step 2
3631 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3632 "pxor %%mm7, %%mm7 \n\t"
3633 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3634 // pbv = p - b = (a + b - c) - b = a - c
3635 "movq %%mm1, %%mm5 \n\t"
3636 // pav = p - a = (a + b - c) - a = b - c
3637 "movq %%mm2, %%mm4 \n\t"
3638 "psubw %%mm3, %%mm5 \n\t"
3639 "psubw %%mm3, %%mm4 \n\t"
3640 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3641 // pav + pbv = pbv + pav
3642 "movq %%mm5, %%mm6 \n\t"
3643 "paddw %%mm4, %%mm6 \n\t"
3645 // pa = abs(p-a) = abs(pav)
3646 // pb = abs(p-b) = abs(pbv)
3647 // pc = abs(p-c) = abs(pcv)
3648 "pcmpgtw %%mm5, %%mm0 \n\t" // create mask pbv bytes < 0
3649 "pcmpgtw %%mm4, %%mm7 \n\t" // create mask pav bytes < 0
3650 "pand %%mm5, %%mm0 \n\t" // only pbv bytes < 0 in mm0
3651 "pand %%mm4, %%mm7 \n\t" // only pav bytes < 0 in mm7
3652 "psubw %%mm0, %%mm5 \n\t"
3653 "psubw %%mm7, %%mm4 \n\t"
3654 "psubw %%mm0, %%mm5 \n\t"
3655 "psubw %%mm7, %%mm4 \n\t"
3656 "pxor %%mm0, %%mm0 \n\t"
3657 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3658 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3659 "psubw %%mm0, %%mm6 \n\t"
3661 "movq %%mm4, %%mm7 \n\t"
3662 "psubw %%mm0, %%mm6 \n\t"
3663 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3664 "movq %%mm7, %%mm0 \n\t"
3665 // use mm7 mask to merge pa & pb
3666 "pand %%mm7, %%mm5 \n\t"
3667 // use mm0 mask copy to merge a & b
3668 "pand %%mm0, %%mm2 \n\t"
3669 "pandn %%mm4, %%mm7 \n\t"
3670 "pandn %%mm1, %%mm0 \n\t"
3671 "paddw %%mm5, %%mm7 \n\t"
3672 "paddw %%mm2, %%mm0 \n\t"
3673 // test ((pa <= pb)? pa:pb) <= pc
3674 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3675 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3676 "pand %%mm7, %%mm3 \n\t"
3677 "pandn %%mm0, %%mm7 \n\t"
3678 "pxor %%mm1, %%mm1 \n\t"
3679 "paddw %%mm3, %%mm7 \n\t"
3680 "pxor %%mm0, %%mm0 \n\t"
3681 "packuswb %%mm1, %%mm7 \n\t"
3682 "movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
3683 "pand _ActiveMask, %%mm7 \n\t"
3684 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3685 "psllq _ShiftBpp, %%mm7 \n\t" // shift bytes to 2nd group of
3687 // pav = p - a = (a + b - c) - a = b - c
3688 "movq %%mm2, %%mm4 \n\t"
3689 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3690 "psllq _ShiftBpp, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
3691 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3692 "movq %%mm7, %%mm1 \n\t"
3693 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3694 "psllq _ShiftBpp, %%mm1 \n\t" // shift bytes
3695 // now mm1 will be used as Raw(x-bpp)
3696 // now do Paeth for 3rd, and final, set of bytes (6-7)
3697 "pxor %%mm7, %%mm7 \n\t"
3698 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3699 "psubw %%mm3, %%mm4 \n\t"
3700 // pbv = p - b = (a + b - c) - b = a - c
3701 "movq %%mm1, %%mm5 \n\t"
3702 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3703 "movq %%mm4, %%mm6 \n\t"
3704 "psubw %%mm3, %%mm5 \n\t"
3705 "pxor %%mm0, %%mm0 \n\t"
3706 "paddw %%mm5, %%mm6 \n\t"
3708 // pa = abs(p-a) = abs(pav)
3709 // pb = abs(p-b) = abs(pbv)
3710 // pc = abs(p-c) = abs(pcv)
3711 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3712 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3713 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3714 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3715 "psubw %%mm0, %%mm4 \n\t"
3716 "psubw %%mm7, %%mm5 \n\t"
3717 "psubw %%mm0, %%mm4 \n\t"
3718 "psubw %%mm7, %%mm5 \n\t"
3719 "pxor %%mm0, %%mm0 \n\t"
3720 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3721 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3722 "psubw %%mm0, %%mm6 \n\t"
3724 "movq %%mm4, %%mm7 \n\t"
3725 "psubw %%mm0, %%mm6 \n\t"
3726 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3727 "movq %%mm7, %%mm0 \n\t"
3728 // use mm0 mask copy to merge a & b
3729 "pand %%mm0, %%mm2 \n\t"
3730 // use mm7 mask to merge pa & pb
3731 "pand %%mm7, %%mm5 \n\t"
3732 "pandn %%mm1, %%mm0 \n\t"
3733 "pandn %%mm4, %%mm7 \n\t"
3734 "paddw %%mm2, %%mm0 \n\t"
3735 "paddw %%mm5, %%mm7 \n\t"
3736 // test ((pa <= pb)? pa:pb) <= pc
3737 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3738 "pand %%mm7, %%mm3 \n\t"
3739 "pandn %%mm0, %%mm7 \n\t"
3740 "paddw %%mm3, %%mm7 \n\t"
3741 "pxor %%mm1, %%mm1 \n\t"
3742 "packuswb %%mm7, %%mm1 \n\t"
3743 // step ecx to next set of 8 bytes and repeat loop til done
3744 "addl $8, %%ecx \n\t"
3745 "pand _ActiveMaskEnd, %%mm1 \n\t"
3746 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
3749 "cmpl _MMXLength, %%ecx \n\t"
3750 "pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
3751 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3752 // mm1 will be used as Raw(x-bpp) next loop
3753 // mm3 ready to be used as Prior(x-bpp) next loop
3756 : "=S" (dummy_value_S), // output regs (dummy)
3757 "=D" (dummy_value_D)
3759 : "0" (prev_row), // esi // input regs
3762 : "%ecx" // clobber list
3763 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3764 , "%mm0", "%mm1", "%mm2", "%mm3"
3765 , "%mm4", "%mm5", "%mm6", "%mm7"
3772 //case 7: // GRR BOGUS
3773 //case 5: // GRR BOGUS
3775 _ActiveMask.use = 0x00000000ffffffffLL;
3776 _ActiveMask2.use = 0xffffffff00000000LL;
3777 _ShiftBpp.use = bpp << 3; // == bpp * 8
3778 _ShiftRem.use = 64 - _ShiftBpp.use;
3780 __asm__ __volatile__ (
3781 "movl _dif, %%ecx \n\t"
3782 // preload "movl row, %%edi \n\t"
3783 // preload "movl prev_row, %%esi \n\t"
3784 // prime the pump: load the first Raw(x-bpp) data set
3785 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3786 "pxor %%mm0, %%mm0 \n\t"
3789 // must shift to position Raw(x-bpp) data
3790 "psrlq _ShiftRem, %%mm1 \n\t"
3791 // do first set of 4 bytes
3792 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3793 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3794 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3795 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3796 // must shift to position Prior(x-bpp) data
3797 "psrlq _ShiftRem, %%mm3 \n\t"
3798 // pav = p - a = (a + b - c) - a = b - c
3799 "movq %%mm2, %%mm4 \n\t"
3800 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3801 // pbv = p - b = (a + b - c) - b = a - c
3802 "movq %%mm1, %%mm5 \n\t"
3803 "psubw %%mm3, %%mm4 \n\t"
3804 "pxor %%mm7, %%mm7 \n\t"
3805 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3806 "movq %%mm4, %%mm6 \n\t"
3807 "psubw %%mm3, %%mm5 \n\t"
3808 // pa = abs(p-a) = abs(pav)
3809 // pb = abs(p-b) = abs(pbv)
3810 // pc = abs(p-c) = abs(pcv)
3811 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3812 "paddw %%mm5, %%mm6 \n\t"
3813 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3814 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3815 "psubw %%mm0, %%mm4 \n\t"
3816 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3817 "psubw %%mm0, %%mm4 \n\t"
3818 "psubw %%mm7, %%mm5 \n\t"
3819 "pxor %%mm0, %%mm0 \n\t"
3820 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3821 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3822 "psubw %%mm7, %%mm5 \n\t"
3823 "psubw %%mm0, %%mm6 \n\t"
3825 "movq %%mm4, %%mm7 \n\t"
3826 "psubw %%mm0, %%mm6 \n\t"
3827 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3828 "movq %%mm7, %%mm0 \n\t"
3829 // use mm7 mask to merge pa & pb
3830 "pand %%mm7, %%mm5 \n\t"
3831 // use mm0 mask copy to merge a & b
3832 "pand %%mm0, %%mm2 \n\t"
3833 "pandn %%mm4, %%mm7 \n\t"
3834 "pandn %%mm1, %%mm0 \n\t"
3835 "paddw %%mm5, %%mm7 \n\t"
3836 "paddw %%mm2, %%mm0 \n\t"
3837 // test ((pa <= pb)? pa:pb) <= pc
3838 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3839 "pxor %%mm1, %%mm1 \n\t"
3840 "pand %%mm7, %%mm3 \n\t"
3841 "pandn %%mm0, %%mm7 \n\t"
3842 "paddw %%mm3, %%mm7 \n\t"
3843 "pxor %%mm0, %%mm0 \n\t"
3844 "packuswb %%mm1, %%mm7 \n\t"
3845 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3846 "pand _ActiveMask, %%mm7 \n\t"
3847 "psrlq _ShiftRem, %%mm3 \n\t"
3848 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x) step 1
3849 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
3850 "movq %%mm2, %%mm6 \n\t"
3851 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3852 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3853 "psllq _ShiftBpp, %%mm6 \n\t"
3854 "movq %%mm7, %%mm5 \n\t"
3855 "psrlq _ShiftRem, %%mm1 \n\t"
3856 "por %%mm6, %%mm3 \n\t"
3857 "psllq _ShiftBpp, %%mm5 \n\t"
3858 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3859 "por %%mm5, %%mm1 \n\t"
3860 // do second set of 4 bytes
3861 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3862 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3863 // pav = p - a = (a + b - c) - a = b - c
3864 "movq %%mm2, %%mm4 \n\t"
3865 // pbv = p - b = (a + b - c) - b = a - c
3866 "movq %%mm1, %%mm5 \n\t"
3867 "psubw %%mm3, %%mm4 \n\t"
3868 "pxor %%mm7, %%mm7 \n\t"
3869 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3870 "movq %%mm4, %%mm6 \n\t"
3871 "psubw %%mm3, %%mm5 \n\t"
3872 // pa = abs(p-a) = abs(pav)
3873 // pb = abs(p-b) = abs(pbv)
3874 // pc = abs(p-c) = abs(pcv)
3875 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3876 "paddw %%mm5, %%mm6 \n\t"
3877 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3878 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3879 "psubw %%mm0, %%mm4 \n\t"
3880 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3881 "psubw %%mm0, %%mm4 \n\t"
3882 "psubw %%mm7, %%mm5 \n\t"
3883 "pxor %%mm0, %%mm0 \n\t"
3884 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3885 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3886 "psubw %%mm7, %%mm5 \n\t"
3887 "psubw %%mm0, %%mm6 \n\t"
3889 "movq %%mm4, %%mm7 \n\t"
3890 "psubw %%mm0, %%mm6 \n\t"
3891 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3892 "movq %%mm7, %%mm0 \n\t"
3893 // use mm7 mask to merge pa & pb
3894 "pand %%mm7, %%mm5 \n\t"
3895 // use mm0 mask copy to merge a & b
3896 "pand %%mm0, %%mm2 \n\t"
3897 "pandn %%mm4, %%mm7 \n\t"
3898 "pandn %%mm1, %%mm0 \n\t"
3899 "paddw %%mm5, %%mm7 \n\t"
3900 "paddw %%mm2, %%mm0 \n\t"
3901 // test ((pa <= pb)? pa:pb) <= pc
3902 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3903 "pxor %%mm1, %%mm1 \n\t"
3904 "pand %%mm7, %%mm3 \n\t"
3905 "pandn %%mm0, %%mm7 \n\t"
3906 "pxor %%mm1, %%mm1 \n\t"
3907 "paddw %%mm3, %%mm7 \n\t"
3908 "pxor %%mm0, %%mm0 \n\t"
3909 // step ecx to next set of 8 bytes and repeat loop til done
3910 "addl $8, %%ecx \n\t"
3911 "packuswb %%mm7, %%mm1 \n\t"
3912 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3913 "cmpl _MMXLength, %%ecx \n\t"
3914 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3915 // mm1 will be used as Raw(x-bpp) next loop
3918 : "=S" (dummy_value_S), // output regs (dummy)
3919 "=D" (dummy_value_D)
3921 : "0" (prev_row), // esi // input regs
3924 : "%ecx" // clobber list
3925 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3926 , "%mm0", "%mm1", "%mm2", "%mm3"
3927 , "%mm4", "%mm5", "%mm6", "%mm7"
3935 _ActiveMask.use = 0x00000000ffffffffLL;
3937 __asm__ __volatile__ (
3938 "movl _dif, %%ecx \n\t"
3939 // preload "movl row, %%edi \n\t"
3940 // preload "movl prev_row, %%esi \n\t"
3941 "pxor %%mm0, %%mm0 \n\t"
3942 // prime the pump: load the first Raw(x-bpp) data set
3943 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3944 // a=Raw(x-bpp) bytes
3946 // do first set of 4 bytes
3947 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3948 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3949 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3950 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3951 // pav = p - a = (a + b - c) - a = b - c
3952 "movq %%mm2, %%mm4 \n\t"
3953 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3954 // pbv = p - b = (a + b - c) - b = a - c
3955 "movq %%mm1, %%mm5 \n\t"
3956 "psubw %%mm3, %%mm4 \n\t"
3957 "pxor %%mm7, %%mm7 \n\t"
3958 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3959 "movq %%mm4, %%mm6 \n\t"
3960 "psubw %%mm3, %%mm5 \n\t"
3961 // pa = abs(p-a) = abs(pav)
3962 // pb = abs(p-b) = abs(pbv)
3963 // pc = abs(p-c) = abs(pcv)
3964 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3965 "paddw %%mm5, %%mm6 \n\t"
3966 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3967 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3968 "psubw %%mm0, %%mm4 \n\t"
3969 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3970 "psubw %%mm0, %%mm4 \n\t"
3971 "psubw %%mm7, %%mm5 \n\t"
3972 "pxor %%mm0, %%mm0 \n\t"
3973 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3974 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3975 "psubw %%mm7, %%mm5 \n\t"
3976 "psubw %%mm0, %%mm6 \n\t"
3978 "movq %%mm4, %%mm7 \n\t"
3979 "psubw %%mm0, %%mm6 \n\t"
3980 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3981 "movq %%mm7, %%mm0 \n\t"
3982 // use mm7 mask to merge pa & pb
3983 "pand %%mm7, %%mm5 \n\t"
3984 // use mm0 mask copy to merge a & b
3985 "pand %%mm0, %%mm2 \n\t"
3986 "pandn %%mm4, %%mm7 \n\t"
3987 "pandn %%mm1, %%mm0 \n\t"
3988 "paddw %%mm5, %%mm7 \n\t"
3989 "paddw %%mm2, %%mm0 \n\t"
3990 // test ((pa <= pb)? pa:pb) <= pc
3991 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3992 "pxor %%mm1, %%mm1 \n\t"
3993 "pand %%mm7, %%mm3 \n\t"
3994 "pandn %%mm0, %%mm7 \n\t"
3995 "paddw %%mm3, %%mm7 \n\t"
3996 "pxor %%mm0, %%mm0 \n\t"
3997 "packuswb %%mm1, %%mm7 \n\t"
3998 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3999 "pand _ActiveMask, %%mm7 \n\t"
4000 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
4001 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4002 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
4003 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4004 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
4005 // do second set of 4 bytes
4006 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4007 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4008 // pav = p - a = (a + b - c) - a = b - c
4009 "movq %%mm2, %%mm4 \n\t"
4010 // pbv = p - b = (a + b - c) - b = a - c
4011 "movq %%mm1, %%mm5 \n\t"
4012 "psubw %%mm3, %%mm4 \n\t"
4013 "pxor %%mm7, %%mm7 \n\t"
4014 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4015 "movq %%mm4, %%mm6 \n\t"
4016 "psubw %%mm3, %%mm5 \n\t"
4017 // pa = abs(p-a) = abs(pav)
4018 // pb = abs(p-b) = abs(pbv)
4019 // pc = abs(p-c) = abs(pcv)
4020 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4021 "paddw %%mm5, %%mm6 \n\t"
4022 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4023 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4024 "psubw %%mm0, %%mm4 \n\t"
4025 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4026 "psubw %%mm0, %%mm4 \n\t"
4027 "psubw %%mm7, %%mm5 \n\t"
4028 "pxor %%mm0, %%mm0 \n\t"
4029 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4030 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4031 "psubw %%mm7, %%mm5 \n\t"
4032 "psubw %%mm0, %%mm6 \n\t"
4034 "movq %%mm4, %%mm7 \n\t"
4035 "psubw %%mm0, %%mm6 \n\t"
4036 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4037 "movq %%mm7, %%mm0 \n\t"
4038 // use mm7 mask to merge pa & pb
4039 "pand %%mm7, %%mm5 \n\t"
4040 // use mm0 mask copy to merge a & b
4041 "pand %%mm0, %%mm2 \n\t"
4042 "pandn %%mm4, %%mm7 \n\t"
4043 "pandn %%mm1, %%mm0 \n\t"
4044 "paddw %%mm5, %%mm7 \n\t"
4045 "paddw %%mm2, %%mm0 \n\t"
4046 // test ((pa <= pb)? pa:pb) <= pc
4047 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4048 "pxor %%mm1, %%mm1 \n\t"
4049 "pand %%mm7, %%mm3 \n\t"
4050 "pandn %%mm0, %%mm7 \n\t"
4051 "pxor %%mm1, %%mm1 \n\t"
4052 "paddw %%mm3, %%mm7 \n\t"
4053 "pxor %%mm0, %%mm0 \n\t"
4054 // step ecx to next set of 8 bytes and repeat loop til done
4055 "addl $8, %%ecx \n\t"
4056 "packuswb %%mm7, %%mm1 \n\t"
4057 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
4058 "cmpl _MMXLength, %%ecx \n\t"
4059 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4060 // mm1 will be used as Raw(x-bpp) next loop
4063 : "=S" (dummy_value_S), // output regs (dummy)
4064 "=D" (dummy_value_D)
4066 : "0" (prev_row), // esi // input regs
4069 : "%ecx" // clobber list
4070 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4071 , "%mm0", "%mm1", "%mm2", "%mm3"
4072 , "%mm4", "%mm5", "%mm6", "%mm7"
4080 _ActiveMask.use = 0x00000000ffffffffLL;
4082 __asm__ __volatile__ (
4083 "movl _dif, %%ecx \n\t"
4084 // preload "movl row, %%edi \n\t"
4085 // preload "movl prev_row, %%esi \n\t"
4086 "pxor %%mm0, %%mm0 \n\t"
4087 // prime the pump: load the first Raw(x-bpp) data set
4088 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
4089 // a=Raw(x-bpp) bytes
4091 // do first set of 4 bytes
4092 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4093 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4094 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4095 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4096 // pav = p - a = (a + b - c) - a = b - c
4097 "movq %%mm2, %%mm4 \n\t"
4098 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
4099 // pbv = p - b = (a + b - c) - b = a - c
4100 "movq %%mm1, %%mm5 \n\t"
4101 "psubw %%mm3, %%mm4 \n\t"
4102 "pxor %%mm7, %%mm7 \n\t"
4103 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4104 "movq %%mm4, %%mm6 \n\t"
4105 "psubw %%mm3, %%mm5 \n\t"
4106 // pa = abs(p-a) = abs(pav)
4107 // pb = abs(p-b) = abs(pbv)
4108 // pc = abs(p-c) = abs(pcv)
4109 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4110 "paddw %%mm5, %%mm6 \n\t"
4111 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4112 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4113 "psubw %%mm0, %%mm4 \n\t"
4114 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4115 "psubw %%mm0, %%mm4 \n\t"
4116 "psubw %%mm7, %%mm5 \n\t"
4117 "pxor %%mm0, %%mm0 \n\t"
4118 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4119 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4120 "psubw %%mm7, %%mm5 \n\t"
4121 "psubw %%mm0, %%mm6 \n\t"
4123 "movq %%mm4, %%mm7 \n\t"
4124 "psubw %%mm0, %%mm6 \n\t"
4125 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4126 "movq %%mm7, %%mm0 \n\t"
4127 // use mm7 mask to merge pa & pb
4128 "pand %%mm7, %%mm5 \n\t"
4129 // use mm0 mask copy to merge a & b
4130 "pand %%mm0, %%mm2 \n\t"
4131 "pandn %%mm4, %%mm7 \n\t"
4132 "pandn %%mm1, %%mm0 \n\t"
4133 "paddw %%mm5, %%mm7 \n\t"
4134 "paddw %%mm2, %%mm0 \n\t"
4135 // test ((pa <= pb)? pa:pb) <= pc
4136 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4137 "pxor %%mm1, %%mm1 \n\t"
4138 "pand %%mm7, %%mm3 \n\t"
4139 "pandn %%mm0, %%mm7 \n\t"
4140 "paddw %%mm3, %%mm7 \n\t"
4141 "pxor %%mm0, %%mm0 \n\t"
4142 "packuswb %%mm1, %%mm7 \n\t"
4143 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4144 "pand _ActiveMask, %%mm7 \n\t"
4145 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4146 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4147 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4148 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4149 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
4151 // do second set of 4 bytes
4152 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4153 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
4154 // pav = p - a = (a + b - c) - a = b - c
4155 "movq %%mm2, %%mm4 \n\t"
4156 // pbv = p - b = (a + b - c) - b = a - c
4157 "movq %%mm1, %%mm5 \n\t"
4158 "psubw %%mm3, %%mm4 \n\t"
4159 "pxor %%mm7, %%mm7 \n\t"
4160 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4161 "movq %%mm4, %%mm6 \n\t"
4162 "psubw %%mm3, %%mm5 \n\t"
4163 // pa = abs(p-a) = abs(pav)
4164 // pb = abs(p-b) = abs(pbv)
4165 // pc = abs(p-c) = abs(pcv)
4166 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4167 "paddw %%mm5, %%mm6 \n\t"
4168 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4169 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4170 "psubw %%mm0, %%mm4 \n\t"
4171 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4172 "psubw %%mm0, %%mm4 \n\t"
4173 "psubw %%mm7, %%mm5 \n\t"
4174 "pxor %%mm0, %%mm0 \n\t"
4175 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4176 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4177 "psubw %%mm7, %%mm5 \n\t"
4178 "psubw %%mm0, %%mm6 \n\t"
4180 "movq %%mm4, %%mm7 \n\t"
4181 "psubw %%mm0, %%mm6 \n\t"
4182 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4183 "movq %%mm7, %%mm0 \n\t"
4184 // use mm7 mask to merge pa & pb
4185 "pand %%mm7, %%mm5 \n\t"
4186 // use mm0 mask copy to merge a & b
4187 "pand %%mm0, %%mm2 \n\t"
4188 "pandn %%mm4, %%mm7 \n\t"
4189 "pandn %%mm1, %%mm0 \n\t"
4190 "paddw %%mm5, %%mm7 \n\t"
4191 "paddw %%mm2, %%mm0 \n\t"
4192 // test ((pa <= pb)? pa:pb) <= pc
4193 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4194 "pxor %%mm1, %%mm1 \n\t"
4195 "pand %%mm7, %%mm3 \n\t"
4196 "pandn %%mm0, %%mm7 \n\t"
4197 "pxor %%mm1, %%mm1 \n\t"
4198 "paddw %%mm3, %%mm7 \n\t"
4199 "pxor %%mm0, %%mm0 \n\t"
4200 // step ecx to next set of 8 bytes and repeat loop til done
4201 "addl $8, %%ecx \n\t"
4202 "packuswb %%mm7, %%mm1 \n\t"
4203 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
4204 "cmpl _MMXLength, %%ecx \n\t"
4205 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4206 // mm1 will be used as Raw(x-bpp) next loop
4209 : "=S" (dummy_value_S), // output regs (dummy)
4210 "=D" (dummy_value_D)
4212 : "0" (prev_row), // esi // input regs
4215 : "%ecx" // clobber list
4216 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4217 , "%mm0", "%mm1", "%mm2", "%mm3"
4218 , "%mm4", "%mm5", "%mm6", "%mm7"
4228 __asm__ __volatile__ (
4230 "pushl %%ebx \n\t" // save Global Offset Table index
4232 "movl _dif, %%ebx \n\t"
4233 "cmpl _FullLength, %%ebx \n\t"
4234 "jnb paeth_dend \n\t"
4236 // preload "movl row, %%edi \n\t"
4237 // preload "movl prev_row, %%esi \n\t"
4238 // do Paeth decode for remaining bytes
4239 "movl %%ebx, %%edx \n\t"
4240 // preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4241 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4242 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
4245 "xorl %%eax, %%eax \n\t"
4246 // pav = p - a = (a + b - c) - a = b - c
4247 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4248 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4249 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4250 "movl %%eax, _patemp \n\t" // Save pav for later use
4251 "xorl %%eax, %%eax \n\t"
4252 // pbv = p - b = (a + b - c) - b = a - c
4253 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4254 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4255 "movl %%eax, %%ecx \n\t"
4256 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4257 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4259 "testl $0x80000000, %%eax \n\t"
4260 "jz paeth_dpca \n\t"
4261 "negl %%eax \n\t" // reverse sign of neg values
4264 "movl %%eax, _pctemp \n\t" // save pc for later use
4266 "testl $0x80000000, %%ecx \n\t"
4267 "jz paeth_dpba \n\t"
4268 "negl %%ecx \n\t" // reverse sign of neg values
4271 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4273 "movl _patemp, %%eax \n\t"
4274 "testl $0x80000000, %%eax \n\t"
4275 "jz paeth_dpaa \n\t"
4276 "negl %%eax \n\t" // reverse sign of neg values
4279 "movl %%eax, _patemp \n\t" // save pa for later use
4281 "cmpl %%ecx, %%eax \n\t"
4282 "jna paeth_dabb \n\t"
4283 // pa > pb; now test if pb <= pc
4284 "cmpl _pctemp, %%ecx \n\t"
4285 "jna paeth_dbbc \n\t"
4286 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4287 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4288 "jmp paeth_dpaeth \n\t"
4291 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4292 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4293 "jmp paeth_dpaeth \n\t"
4296 // pa <= pb; now test if pa <= pc
4297 "cmpl _pctemp, %%eax \n\t"
4298 "jna paeth_dabc \n\t"
4299 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4300 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4301 "jmp paeth_dpaeth \n\t"
4304 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4305 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4307 "paeth_dpaeth: \n\t"
4310 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4311 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4312 "cmpl _FullLength, %%ebx \n\t"
4317 "popl %%ebx \n\t" // index to Global Offset Table
4320 : "=c" (dummy_value_c), // output regs (dummy)
4321 "=S" (dummy_value_S),
4322 "=D" (dummy_value_D)
4324 : "0" (bpp), // ecx // input regs
4325 "1" (prev_row), // esi
4328 : "%eax", "%edx" // clobber list
4334 return; // No need to go further with this one
4336 } // end switch (bpp)
4338 __asm__ __volatile__ (
4339 // MMX acceleration complete; now do clean-up
4340 // check if any remaining bytes left to decode
4342 "pushl %%ebx \n\t" // save index to Global Offset Table
4344 "movl _MMXLength, %%ebx \n\t"
4345 "cmpl _FullLength, %%ebx \n\t"
4346 "jnb paeth_end \n\t"
4347 //pre "movl row, %%edi \n\t"
4348 //pre "movl prev_row, %%esi \n\t"
4349 // do Paeth decode for remaining bytes
4350 "movl %%ebx, %%edx \n\t"
4351 //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4352 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4353 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
4356 "xorl %%eax, %%eax \n\t"
4357 // pav = p - a = (a + b - c) - a = b - c
4358 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4359 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4360 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4361 "movl %%eax, _patemp \n\t" // Save pav for later use
4362 "xorl %%eax, %%eax \n\t"
4363 // pbv = p - b = (a + b - c) - b = a - c
4364 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4365 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4366 "movl %%eax, %%ecx \n\t"
4367 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4368 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4370 "testl $0x80000000, %%eax \n\t"
4371 "jz paeth_pca2 \n\t"
4372 "negl %%eax \n\t" // reverse sign of neg values
4375 "movl %%eax, _pctemp \n\t" // save pc for later use
4377 "testl $0x80000000, %%ecx \n\t"
4378 "jz paeth_pba2 \n\t"
4379 "negl %%ecx \n\t" // reverse sign of neg values
4382 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4384 "movl _patemp, %%eax \n\t"
4385 "testl $0x80000000, %%eax \n\t"
4386 "jz paeth_paa2 \n\t"
4387 "negl %%eax \n\t" // reverse sign of neg values
4390 "movl %%eax, _patemp \n\t" // save pa for later use
4392 "cmpl %%ecx, %%eax \n\t"
4393 "jna paeth_abb2 \n\t"
4394 // pa > pb; now test if pb <= pc
4395 "cmpl _pctemp, %%ecx \n\t"
4396 "jna paeth_bbc2 \n\t"
4397 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4398 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4399 "jmp paeth_paeth2 \n\t"
4402 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4403 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4404 "jmp paeth_paeth2 \n\t"
4407 // pa <= pb; now test if pa <= pc
4408 "cmpl _pctemp, %%eax \n\t"
4409 "jna paeth_abc2 \n\t"
4410 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4411 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4412 "jmp paeth_paeth2 \n\t"
4415 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4416 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4418 "paeth_paeth2: \n\t"
4421 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4422 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4423 "cmpl _FullLength, %%ebx \n\t"
4427 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
4429 "popl %%ebx \n\t" // restore index to Global Offset Table
4432 : "=c" (dummy_value_c), // output regs (dummy)
4433 "=S" (dummy_value_S),
4434 "=D" (dummy_value_D)
4436 : "0" (bpp), // ecx // input regs
4437 "1" (prev_row), // esi
4440 : "%eax", "%edx" // clobber list (no input regs!)
4446 } /* end png_read_filter_row_mmx_paeth() */
4452 #ifdef PNG_THREAD_UNSAFE_OK
4453 //===========================================================================//
4455 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B //
4457 //===========================================================================//
4459 // MMX-optimized decoder for the PNG Sub filter: Raw(x) = Sub(x) + Raw(x-bpp)
4461 static void /* PRIVATE */
4462 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4468 bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel
4469 _FullLength = row_info->rowbytes - bpp; // number of bytes to filter
4471 __asm__ __volatile__ (
4472 //pre "movl row, %%edi \n\t"
4473 "movl %%edi, %%esi \n\t" // lp = row
4474 //pre "movl bpp, %%eax \n\t"
4475 "addl %%eax, %%edi \n\t" // rp = row + bpp
4476 //irr "xorl %%eax, %%eax \n\t"
4477 // get # of bytes to alignment
4478 "movl %%edi, _dif \n\t" // take start of row
4479 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past
4480 // alignment boundary
4481 "xorl %%ecx, %%ecx \n\t"
4482 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
4483 "subl %%edi, _dif \n\t" // subtract from start ==> value
4484 "jz sub_go \n\t" // ecx at alignment
4486 "sub_lp1: \n\t" // fix alignment
4487 "movb (%%esi,%%ecx,), %%al \n\t"
4488 "addb %%al, (%%edi,%%ecx,) \n\t"
4490 "cmpl _dif, %%ecx \n\t"
4494 "movl _FullLength, %%eax \n\t"
4495 "movl %%eax, %%edx \n\t"
4496 "subl %%ecx, %%edx \n\t" // subtract alignment fix
4497 "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
4498 "subl %%edx, %%eax \n\t" // drop over bytes from length
4499 "movl %%eax, _MMXLength \n\t"
4501 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4502 "=D" (dummy_value_D) // 1
4504 : "0" (bpp), // eax // input regs
4507 : "%ebx", "%ecx", "%edx" // clobber list
4510 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4511 , "%mm0", "%mm1", "%mm2", "%mm3"
4512 , "%mm4", "%mm5", "%mm6", "%mm7"
4516 // now do the math for the rest of the row
4521 _ActiveMask.use = 0x0000ffffff000000LL;
4522 _ShiftBpp.use = 24; // == 3 * 8
4523 _ShiftRem.use = 40; // == 64 - 24
4525 __asm__ __volatile__ (
4526 // preload "movl row, %%edi \n\t"
4527 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4528 // active byte group
4529 "movl %%edi, %%esi \n\t" // lp = row
4530 // preload "movl bpp, %%eax \n\t"
4531 "addl %%eax, %%edi \n\t" // rp = row + bpp
4532 "movq %%mm7, %%mm6 \n\t"
4533 "movl _dif, %%edx \n\t"
4534 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4535 // 3rd active byte group
4536 // prime the pump: load the first Raw(x-bpp) data set
4537 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4539 "sub_3lp: \n\t" // shift data for adding first
4540 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4541 // shift clears inactive bytes)
4542 // add 1st active group
4543 "movq (%%edi,%%edx,), %%mm0 \n\t"
4544 "paddb %%mm1, %%mm0 \n\t"
4546 // add 2nd active group
4547 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4548 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4549 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4550 "paddb %%mm1, %%mm0 \n\t"
4552 // add 3rd active group
4553 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4554 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4555 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4556 "addl $8, %%edx \n\t"
4557 "paddb %%mm1, %%mm0 \n\t"
4559 "cmpl _MMXLength, %%edx \n\t"
4560 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4561 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4564 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4565 "=D" (dummy_value_D) // 1
4567 : "0" (bpp), // eax // input regs
4570 : "%edx", "%esi" // clobber list
4571 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4572 , "%mm0", "%mm1", "%mm6", "%mm7"
4580 __asm__ __volatile__ (
4581 "movl _dif, %%edx \n\t"
4582 // preload "movl row, %%edi \n\t"
4583 "cmpl _FullLength, %%edx \n\t"
4585 "movl %%edi, %%esi \n\t" // lp = row
4586 "xorl %%eax, %%eax \n\t"
4587 // preload "movl bpp, %%eax \n\t"
4588 "addl %%eax, %%edi \n\t" // rp = row + bpp
4591 "movb (%%esi,%%edx,), %%al \n\t"
4592 "addb %%al, (%%edi,%%edx,) \n\t"
4594 "cmpl _FullLength, %%edx \n\t"
4599 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4600 "=D" (dummy_value_D) // 1
4602 : "0" (bpp), // eax // input regs
4605 : "%edx", "%esi" // clobber list
4612 //case 7: // GRR BOGUS
4613 //case 5: // GRR BOGUS
4615 _ShiftBpp.use = bpp << 3;
4616 _ShiftRem.use = 64 - _ShiftBpp.use;
4618 __asm__ __volatile__ (
4619 // preload "movl row, %%edi \n\t"
4620 "movl _dif, %%edx \n\t"
4621 "movl %%edi, %%esi \n\t" // lp = row
4622 // preload "movl bpp, %%eax \n\t"
4623 "addl %%eax, %%edi \n\t" // rp = row + bpp
4625 // prime the pump: load the first Raw(x-bpp) data set
4626 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4628 "sub_4lp: \n\t" // shift data for adding first
4629 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4630 // shift clears inactive bytes)
4631 "movq (%%edi,%%edx,), %%mm0 \n\t"
4632 "paddb %%mm1, %%mm0 \n\t"
4634 // add 2nd active group
4635 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4636 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4637 "addl $8, %%edx \n\t"
4638 "paddb %%mm1, %%mm0 \n\t"
4640 "cmpl _MMXLength, %%edx \n\t"
4641 "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4642 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4645 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4646 "=D" (dummy_value_D) // 1
4648 : "0" (bpp), // eax // input regs
4651 : "%edx", "%esi" // clobber list
4652 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4661 _ActiveMask.use = 0x00000000ffff0000LL;
4662 _ShiftBpp.use = 16; // == 2 * 8
4663 _ShiftRem.use = 48; // == 64 - 16
4665 __asm__ __volatile__ (
4666 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4667 // active byte group
4668 "movl _dif, %%edx \n\t"
4669 "movq %%mm7, %%mm6 \n\t"
4670 // preload "movl row, %%edi \n\t"
4671 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4672 // 3rd active byte group
4673 "movl %%edi, %%esi \n\t" // lp = row
4674 "movq %%mm6, %%mm5 \n\t"
4675 // preload "movl bpp, %%eax \n\t"
4676 "addl %%eax, %%edi \n\t" // rp = row + bpp
4677 "psllq _ShiftBpp, %%mm5 \n\t" // move mask in mm5 to cover
4678 // 4th active byte group
4679 // prime the pump: load the first Raw(x-bpp) data set
4680 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4682 "sub_2lp: \n\t" // shift data for adding first
4683 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4684 // shift clears inactive bytes)
4685 // add 1st active group
4686 "movq (%%edi,%%edx,), %%mm0 \n\t"
4687 "paddb %%mm1, %%mm0 \n\t"
4689 // add 2nd active group
4690 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4691 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4692 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4693 "paddb %%mm1, %%mm0 \n\t"
4695 // add 3rd active group
4696 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4697 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4698 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4699 "paddb %%mm1, %%mm0 \n\t"
4701 // add 4th active group
4702 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4703 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4704 "pand %%mm5, %%mm1 \n\t" // mask to use 4th active group
4705 "addl $8, %%edx \n\t"
4706 "paddb %%mm1, %%mm0 \n\t"
4707 "cmpl _MMXLength, %%edx \n\t"
4708 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4709 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4712 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4713 "=D" (dummy_value_D) // 1
4715 : "0" (bpp), // eax // input regs
4718 : "%edx", "%esi" // clobber list
4719 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4720 , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4728 __asm__ __volatile__ (
4729 // preload "movl row, %%edi \n\t"
4730 "movl _dif, %%edx \n\t"
4731 "movl %%edi, %%esi \n\t" // lp = row
4732 // preload "movl bpp, %%eax \n\t"
4733 "addl %%eax, %%edi \n\t" // rp = row + bpp
4734 "movl _MMXLength, %%ecx \n\t"
4736 // prime the pump: load the first Raw(x-bpp) data set
4737 "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4738 "andl $0x0000003f, %%ecx \n\t" // calc bytes over mult of 64
4741 "movq (%%edi,%%edx,), %%mm0 \n\t" // load Sub(x) for 1st 8 bytes
4742 "paddb %%mm7, %%mm0 \n\t"
4743 "movq 8(%%edi,%%edx,), %%mm1 \n\t" // load Sub(x) for 2nd 8 bytes
4744 "movq %%mm0, (%%edi,%%edx,) \n\t" // write Raw(x) for 1st 8 bytes
4746 // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
4747 // This will be repeated for each group of 8 bytes with the 8th
4748 // group being used as the Raw(x-bpp) for the 1st group of the
4751 "paddb %%mm0, %%mm1 \n\t"
4752 "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
4753 "movq %%mm1, 8(%%edi,%%edx,) \n\t" // write Raw(x) for 2nd 8 bytes
4754 "paddb %%mm1, %%mm2 \n\t"
4755 "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
4756 "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
4757 "paddb %%mm2, %%mm3 \n\t"
4758 "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
4759 "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
4760 "paddb %%mm3, %%mm4 \n\t"
4761 "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
4762 "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
4763 "paddb %%mm4, %%mm5 \n\t"
4764 "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
4765 "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
4766 "paddb %%mm5, %%mm6 \n\t"
4767 "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
4768 "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
4769 "addl $64, %%edx \n\t"
4770 "paddb %%mm6, %%mm7 \n\t"
4771 "cmpl %%ecx, %%edx \n\t"
4772 "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
4775 "cmpl _MMXLength, %%edx \n\t"
4779 "movq (%%edi,%%edx,), %%mm0 \n\t"
4780 "addl $8, %%edx \n\t"
4781 "paddb %%mm7, %%mm0 \n\t"
4782 "cmpl _MMXLength, %%edx \n\t"
4783 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
4784 "movq %%mm0, %%mm7 \n\t" // move calculated Raw(x) data
4785 // to mm7 to be new Raw(x-bpp)
4791 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4792 "=D" (dummy_value_D) // 1
4794 : "0" (bpp), // eax // input regs
4797 : "%ecx", "%edx", "%esi" // clobber list
4798 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4799 , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4805 default: // bpp greater than 8 bytes GRR BOGUS
4807 __asm__ __volatile__ (
4808 "movl _dif, %%edx \n\t"
4809 // preload "movl row, %%edi \n\t"
4810 "movl %%edi, %%esi \n\t" // lp = row
4811 // preload "movl bpp, %%eax \n\t"
4812 "addl %%eax, %%edi \n\t" // rp = row + bpp
4815 "movq (%%edi,%%edx,), %%mm0 \n\t"
4816 "movq (%%esi,%%edx,), %%mm1 \n\t"
4817 "addl $8, %%edx \n\t"
4818 "paddb %%mm1, %%mm0 \n\t"
4819 "cmpl _MMXLength, %%edx \n\t"
4820 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
4821 // -8 to offset addl edx
4824 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4825 "=D" (dummy_value_D) // 1
4827 : "0" (bpp), // eax // input regs
4830 : "%edx", "%esi" // clobber list
4831 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4838 } // end switch (bpp)
4840 __asm__ __volatile__ (
4841 "movl _MMXLength, %%edx \n\t"
4842 //pre "movl row, %%edi \n\t"
4843 "cmpl _FullLength, %%edx \n\t"
4846 "movl %%edi, %%esi \n\t" // lp = row
4847 //pre "movl bpp, %%eax \n\t"
4848 "addl %%eax, %%edi \n\t" // rp = row + bpp
4849 "xorl %%eax, %%eax \n\t"
4852 "movb (%%esi,%%edx,), %%al \n\t"
4853 "addb %%al, (%%edi,%%edx,) \n\t"
4855 "cmpl _FullLength, %%edx \n\t"
4859 "EMMS \n\t" // end MMX instructions
4861 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4862 "=D" (dummy_value_D) // 1
4864 : "0" (bpp), // eax // input regs
4867 : "%edx", "%esi" // clobber list
4870 } // end of png_read_filter_row_mmx_sub()
4876 //===========================================================================//
4878 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P //
4880 //===========================================================================//
4882 // Optimized code for PNG Up filter decoder
// png_read_filter_row_mmx_up -- reverse the PNG "Up" filter on one row, in place.
//
// Every byte of the current row gets the byte at the same offset in prev_row
// added to it (addb / paddb, i.e. byte-wise addition).
//
//   row_info  supplies rowbytes, the number of bytes to process
//   row       current row, updated in place.  NOTE(review): the operand line
//             that binds row to edi is not visible in this listing; the "pre"
//             comments and the "=D" dummy output imply it -- confirm.
//   prev_row  previous row, bound to esi through the "1" input operand
//
// The asm runs in four stages, with ebx as the running byte index:
//   1. byte loop (up_lp1) until row + ebx reaches an 8-byte boundary
//   2. unrolled MMX loop, 64 bytes per pass, using mm0-mm7
//   3. MMX loop (up_lpA), 8 bytes per pass
//   4. byte loop (up_lp2) for whatever is left
// EMMS is issued at the end to leave MMX state.
4884 static void /* PRIVATE */
4885 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4889 int dummy_value_d; // fix 'forbidden register 3 (dx) was spilled' error
4893 len = row_info->rowbytes; // number of bytes to filter
4895 __asm__ __volatile__ (
4896 //pre "movl row, %%edi \n\t"
4897 // get # of bytes to alignment
// Stage 1 setup: ecx = ((row + 7) & ~7) - row, the number of bytes before the
// next 8-byte boundary; ebx (index) and eax (byte scratch) start at zero.
4898 "movl %%edi, %%ecx \n\t"
4899 "xorl %%ebx, %%ebx \n\t"
4900 "addl $0x7, %%ecx \n\t"
4901 "xorl %%eax, %%eax \n\t"
4902 "andl $0xfffffff8, %%ecx \n\t"
4903 //pre "movl prev_row, %%esi \n\t"
4904 "subl %%edi, %%ecx \n\t"
4907 "up_lp1: \n\t" // fix alignment
4908 "movb (%%edi,%%ebx,), %%al \n\t"
4909 "addb (%%esi,%%ebx,), %%al \n\t"
// NOTE(review): the ebx increment that the -1 displacement below compensates
// for is not visible in this listing.
4911 "cmpl %%ecx, %%ebx \n\t"
4912 "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
4913 "jb up_lp1 \n\t" // offset incl ebx
// Stage 2 setup: ecx = len; edx = (len - ebx) & 63 is the tail that does not
// fill a 64-byte pass; after the last subl, ecx is the index at which the
// unrolled loop stops.
4916 //pre "movl len, %%edx \n\t"
4917 "movl %%edx, %%ecx \n\t"
4918 "subl %%ebx, %%edx \n\t" // subtract alignment fix
4919 "andl $0x0000003f, %%edx \n\t" // calc bytes over mult of 64
4920 "subl %%edx, %%ecx \n\t" // drop over bytes from length
4922 // unrolled loop - use all MMX registers and interleave to reduce
4923 // number of branch instructions (loops) and reduce partial stalls
// Each pair below loads 8 bytes of prev_row (esi) and 8 bytes of row (edi),
// adds them byte-wise and stores the sum back into row.  The up_loop label
// targeted by the closing jb is not visible in this listing.
4925 "movq (%%esi,%%ebx,), %%mm1 \n\t"
4926 "movq (%%edi,%%ebx,), %%mm0 \n\t"
4927 "movq 8(%%esi,%%ebx,), %%mm3 \n\t"
4928 "paddb %%mm1, %%mm0 \n\t"
4929 "movq 8(%%edi,%%ebx,), %%mm2 \n\t"
4930 "movq %%mm0, (%%edi,%%ebx,) \n\t"
4931 "paddb %%mm3, %%mm2 \n\t"
4932 "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4933 "movq %%mm2, 8(%%edi,%%ebx,) \n\t"
4934 "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4935 "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4936 "paddb %%mm5, %%mm4 \n\t"
4937 "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4938 "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4939 "paddb %%mm7, %%mm6 \n\t"
4940 "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4941 "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4942 "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4943 "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4944 "paddb %%mm1, %%mm0 \n\t"
4945 "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4946 "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4947 "paddb %%mm3, %%mm2 \n\t"
4948 "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4949 "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4950 "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4951 "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4952 "paddb %%mm5, %%mm4 \n\t"
4953 "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
4954 "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
4955 "addl $64, %%ebx \n\t"
4956 "paddb %%mm7, %%mm6 \n\t"
4957 "cmpl %%ecx, %%ebx \n\t"
4958 "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
4959 "jb up_loop \n\t" // -8 to offset addl ebx
// Stage 3 setup: edx still holds the leftover count from stage 2.
// NOTE(review): the branch that consumes the cmpl $0 result is not visible in
// this listing, and neither is the up_lt8 label.
4961 "cmpl $0, %%edx \n\t" // test for bytes over mult of 64
4964 "cmpl $8, %%edx \n\t" // test for less than 8 bytes
4965 "jb up_lt8 \n\t" // [added by lcreeve@netins.net]
// Extend the stop index by the leftover, then trim it back to a multiple of 8
// bytes; edx keeps the final 0..7 byte remainder.
4967 "addl %%edx, %%ecx \n\t"
4968 "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
4969 "subl %%edx, %%ecx \n\t" // drop over bytes from length
4972 "up_lpA: \n\t" // use MMX regs to update 8 bytes sim.
4973 "movq (%%esi,%%ebx,), %%mm1 \n\t"
4974 "movq (%%edi,%%ebx,), %%mm0 \n\t"
4975 "addl $8, %%ebx \n\t"
4976 "paddb %%mm1, %%mm0 \n\t"
4977 "cmpl %%ecx, %%ebx \n\t"
4978 "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
4979 "jb up_lpA \n\t" // offset add ebx
// Stage 4: finish the last edx bytes one at a time.
// NOTE(review): the branch after this cmpl, and the ebx increment inside
// up_lp2, are not visible in this listing.
4980 "cmpl $0, %%edx \n\t" // test for bytes over mult of 8
4984 "xorl %%eax, %%eax \n\t"
4985 "addl %%edx, %%ecx \n\t" // move over byte count into counter
4987 "up_lp2: \n\t" // use x86 regs for remaining bytes
4988 "movb (%%edi,%%ebx,), %%al \n\t"
4989 "addb (%%esi,%%ebx,), %%al \n\t"
4991 "cmpl %%ecx, %%ebx \n\t"
4992 "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
4993 "jb up_lp2 \n\t" // offset inc ebx
4996 "EMMS \n\t" // conversion of filtered row complete
// The dummy outputs exist only so that edx, esi and edi, which are loaded as
// inputs and then modified, are declared as written by the asm.
4998 : "=d" (dummy_value_d), // 0 // output regs (dummy)
4999 "=S" (dummy_value_S), // 1
5000 "=D" (dummy_value_D) // 2
5002 : "0" (len), // edx // input regs
5003 "1" (prev_row), // esi
5006 : "%eax", "%ebx", "%ecx" // clobber list (no input regs!)
5008 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
5009 , "%mm0", "%mm1", "%mm2", "%mm3"
5010 , "%mm4", "%mm5", "%mm6", "%mm7"
5014 } // end of png_read_filter_row_mmx_up()
5016 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5021 /*===========================================================================*/
5023 /* P N G _ R E A D _ F I L T E R _ R O W */
5025 /*===========================================================================*/
5028 /* Optimized png_read_filter_row routines */
// png_read_filter_row -- undo the PNG row filter named by `filter` on `row`,
// in place, using `prev_row` where the filter needs the previous scanline.
//
//   png_ptr   supplies asm_flags and the two MMX thresholds, and receives
//             warnings through png_warning()
//   row_info  supplies pixel_depth (bits per pixel) and rowbytes
//   row       the row to reconstruct, updated in place
//   prev_row  the previous row; read by the UP, AVG and PAETH cases
//   filter    one of the PNG_FILTER_VALUE_* constants; any other value only
//             produces the "Ignoring bad row-filter type" warning
//
// For SUB, UP, AVG and PAETH the MMX routine is called when the matching
// PNG_ASM_FLAG_MMX_READ_FILTER_* bit is set in png_ptr->asm_flags and both
// pixel_depth and rowbytes reach their thresholds.  The plain C loop after
// each #endif is the non-MMX path (see the "end !UseMMX_*" markers).
// SUB, AVG and PAETH compile the MMX test only when PNG_THREAD_UNSAFE_OK is
// also defined; UP needs PNG_ASSEMBLER_CODE_SUPPORTED alone.
//
// NOTE(review): the switch statement, several declarations (i, rp, lp) and the
// else / break lines are not visible in this listing.
5031 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
5032 row, png_bytep prev_row, int filter)
5038 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5039 /* GRR: these are superseded by png_ptr->asm_flags: */
5040 #define UseMMX_sub 1 // GRR: converted 20000730
5041 #define UseMMX_up 1 // GRR: converted 20000729
5042 #define UseMMX_avg 1 // GRR: converted 20000828 (+ 16-bit bugfix 20000916)
5043 #define UseMMX_paeth 1 // GRR: converted 20000828
// presumably 2 is the "not yet probed" value of _mmx_supported -- TODO confirm
// against its definition, which is outside this listing.
5045 if (_mmx_supported == 2) {
5046 /* this should have happened in png_init_mmx_flags() already */
5047 png_warning(png_ptr, "asm_flags may not have been initialized");
5050 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5053 png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
// Debug tracing only: build a short label in filnm naming the filter and
// whether its MMX flag is set, then print the row parameters.
5056 case 0: sprintf(filnm, "none");
5058 case 1: sprintf(filnm, "sub-%s",
5059 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5060 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
5064 case 2: sprintf(filnm, "up-%s",
5065 #ifdef PNG_ASSEMBLER_CODE_SUPPORTED
5066 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
5070 case 3: sprintf(filnm, "avg-%s",
5071 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5072 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
5076 case 4: sprintf(filnm, "Paeth-%s",
5077 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5078 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
5082 default: sprintf(filnm, "unknw");
5085 png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
5086 png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
5087 png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
5088 (int)((row_info->pixel_depth + 7) >> 3));
5089 png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
5090 #endif /* PNG_DEBUG */
5094 case PNG_FILTER_VALUE_NONE:
5097 case PNG_FILTER_VALUE_SUB:
5098 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5099 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
5100 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5101 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5103 png_read_filter_row_mmx_sub(row_info, row);
5106 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
// C path for SUB: bpp is the pixel size in whole bytes, rounded up.  Starting
// at byte bpp, each byte has the byte read through lp added to it, modulo 256.
// lp is declared on a line not visible here; presumably it starts at row.
5109 png_uint_32 istop = row_info->rowbytes;
5110 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5111 png_bytep rp = row + bpp;
5114 for (i = bpp; i < istop; i++)
5116 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
5119 } /* end !UseMMX_sub */
5122 case PNG_FILTER_VALUE_UP:
5123 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5124 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
5125 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5126 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5128 png_read_filter_row_mmx_up(row_info, row, prev_row);
5131 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
// C path for UP: add the previous row's byte to every byte of the row,
// modulo 256.
5134 png_uint_32 istop = row_info->rowbytes;
5136 png_bytep pp = prev_row;
5138 for (i = 0; i < istop; ++i)
5140 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5143 } /* end !UseMMX_up */
5146 case PNG_FILTER_VALUE_AVG:
5147 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5148 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
5149 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5150 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5152 png_read_filter_row_mmx_avg(row_info, row, prev_row);
5155 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
// C path for AVG: the first bpp bytes add half of the previous row's byte;
// the remaining rowbytes - bpp bytes add half of the sum of the previous
// row's byte and the byte read through lp.
5159 png_bytep pp = prev_row;
5161 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5162 png_uint_32 istop = row_info->rowbytes - bpp;
5164 for (i = 0; i < bpp; i++)
5166 *rp = (png_byte)(((int)(*rp) +
5167 ((int)(*pp++) >> 1)) & 0xff);
5171 for (i = 0; i < istop; i++)
5173 *rp = (png_byte)(((int)(*rp) +
5174 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
5177 } /* end !UseMMX_avg */
5180 case PNG_FILTER_VALUE_PAETH:
5181 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5182 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
5183 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5184 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5186 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
5189 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
// C path for PAETH: the first bpp bytes simply add the previous row's byte.
// The remaining bytes add whichever of a, b, c the distances pa, pb, pc select.
5193 png_bytep pp = prev_row;
5195 png_bytep cp = prev_row;
5196 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5197 png_uint_32 istop = row_info->rowbytes - bpp;
5199 for (i = 0; i < bpp; i++)
5201 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5205 for (i = 0; i < istop; i++) /* use leftover rp,pp */
5207 int a, b, c, pa, pb, pc, p;
// NOTE(review): the lines that load a, b, c and seed p and pc are not visible
// in this listing.  The three lines below turn p, pc and p + pc into absolute
// values stored in pa, pb and pc respectively.
5221 pa = p < 0 ? -p : p;
5222 pb = pc < 0 ? -pc : pc;
5223 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
// NOTE(review): the body of the if below is not visible in this listing; the
// conditional expression after it picks a when pa is smallest, otherwise b
// when pb <= pc, otherwise c.
5227 if (pa <= pb && pa <= pc)
5235 p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
5237 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
5240 } /* end !UseMMX_paeth */
// Unrecognised filter value: warn only; no filter-undo code is visible on
// this path.
5244 png_warning(png_ptr, "Ignoring bad row-filter type");
5250 #endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
5253 /*===========================================================================*/
5255 /* P N G _ M M X _ S U P P O R T */
5257 /*===========================================================================*/
5259 /* GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
5260 * (2) all instructions compile with gcc 2.7.2.3 and later
5261 * (3) the function is moved down here to prevent gcc from
5262 * inlining it in multiple places and then barfing be-
5263 * cause the ".NOT_SUPPORTED" label is multiply defined
5264 * [is there a way to signal that a *single* function should
5265 * not be inlined? is there a way to modify the label for
5266 * each inlined instance, e.g., by appending _1, _2, etc.?
5267 * maybe if don't use leading "." in label name? (nope...sigh)]
// png_mmx_support -- probe the CPU for MMX, store the answer in the global
// _mmx_supported (1 = MMX present, 0 = absent) and return that value.
//
// When PNG_MMX_CODE_SUPPORTED is defined the probe has three steps:
//   1. toggle EFLAGS bit 21 (ID); if the bit does not change, CPUID is
//      unavailable
//   2. CPUID with eax = 0; a result below 1 in eax means function 1 is
//      unavailable
//   3. CPUID with eax = 1; bit 23 of edx is the MMX feature bit
// Any failed step jumps to .NOT_SUPPORTED.
//
// ebx, ecx and edx are saved and restored by hand with push/pop, so only eax
// is listed as clobbered.
//
// NOTE(review): the asm stores to _mmx_supported and changes the flags, yet
// the "memory" and "cc" clobbers are commented out below.  No instruction
// visible here restores the original EFLAGS after the ID bit is toggled.
// NOTE(review): the lines that close the asm statement, and what the function
// does when PNG_MMX_CODE_SUPPORTED is undefined, are not visible in this
// listing.
5271 png_mmx_support(void)
5273 #if defined(PNG_MMX_CODE_SUPPORTED)
5274 __asm__ __volatile__ (
5275 "pushl %%ebx \n\t" // ebx gets clobbered by CPUID instruction
5276 "pushl %%ecx \n\t" // so does ecx...
5277 "pushl %%edx \n\t" // ...and edx (but ecx & edx safe on Linux)
5278 // ".byte 0x66 \n\t" // convert 16-bit pushf to 32-bit pushfd
5279 // "pushf \n\t" // 16-bit pushf
// Step 1: read EFLAGS, flip the ID bit, write it back, read it again and
// compare with the copy kept in ecx.
5280 "pushfl \n\t" // save Eflag to stack
5281 "popl %%eax \n\t" // get Eflag from stack into eax
5282 "movl %%eax, %%ecx \n\t" // make another copy of Eflag in ecx
5283 "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
5284 "pushl %%eax \n\t" // save modified Eflag back to stack
5285 // ".byte 0x66 \n\t" // convert 16-bit popf to 32-bit popfd
5286 // "popf \n\t" // 16-bit popf
5287 "popfl \n\t" // restore modified value to Eflag reg
5288 "pushfl \n\t" // save Eflag to stack
5289 "popl %%eax \n\t" // get Eflag from stack
5290 "xorl %%ecx, %%eax \n\t" // compare new Eflag with original Eflag
5291 "jz .NOT_SUPPORTED \n\t" // if same, CPUID instr. is not supported
// Step 2: CPUID function 0; eax comes back holding the value tested below.
5293 "xorl %%eax, %%eax \n\t" // set eax to zero
5294 // ".byte 0x0f, 0xa2 \n\t" // CPUID instruction (two-byte opcode)
5295 "cpuid \n\t" // get the CPU identification info
5296 "cmpl $1, %%eax \n\t" // make sure eax return non-zero value
5297 "jl .NOT_SUPPORTED \n\t" // if eax is zero, MMX is not supported
// Step 3: CPUID function 1; test the MMX bit in edx.
5299 "xorl %%eax, %%eax \n\t" // set eax to zero and...
5300 "incl %%eax \n\t" // ...increment eax to 1. This pair is
5301 // faster than the instruction "mov eax, 1"
5302 "cpuid \n\t" // get the CPU identification info again
5303 "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
5304 "cmpl $0, %%edx \n\t" // 0 = MMX not supported
5305 "jz .NOT_SUPPORTED \n\t" // non-zero = yes, MMX IS supported
5307 "movl $1, %%eax \n\t" // set return value to 1
5308 "jmp .RETURN \n\t" // DONE: have MMX support
5310 ".NOT_SUPPORTED: \n\t" // target label for jump instructions
5311 "movl $0, %%eax \n\t" // set return value to 0
5312 ".RETURN: \n\t" // target label for jump instructions
// Both paths arrive here with the 0 or 1 result in eax.
5313 "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
5314 "popl %%edx \n\t" // restore edx
5315 "popl %%ecx \n\t" // restore ecx
5316 "popl %%ebx \n\t" // restore ebx
5318 // "ret \n\t" // DONE: no MMX support
5319 // (fall through to standard C "ret")
5321 : // output list (none)
5323 : // any variables used on input (none)
5325 : "%eax" // clobber list
5326 // , "%ebx", "%ecx", "%edx" // GRR: we handle these manually
5327 // , "memory" // if write to a variable gcc thought was in a reg
5328 // , "cc" // "condition codes" (flag bits)
5332 #endif /* PNG_MMX_CODE_SUPPORTED */
5334 return _mmx_supported;
5338 #endif /* PNG_USE_PNGGCCRD */