1 /* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
3 * For Intel x86 CPU and Microsoft Visual C++ compiler
5 * libpng 1.2.0 - September 1, 2001
6 * For conditions of distribution and use, see copyright notice in png.h
7 * Copyright (c) 1998-2001 Glenn Randers-Pehrson
8 * Copyright (c) 1998, Intel Corporation
10 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
11 * Interface to libpng contributed by Gilles Vollant, 1999
14 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
15 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
16 * in bad pixels at the beginning of some rows of some images, and also
17 * (due to out-of-range memory reads and writes) caused heap corruption
18 * when compiled with MSVC 6.0. The error was fixed in version 1.0.4e.
20 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
22 * [runtime MMX configuration, GRR 20010102]
29 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
31 static int mmx_supported=2;
/* Run-time MMX detection: toggle EFLAGS.ID (bit 21) to prove the CPUID
 * instruction exists, then test the CPUID feature flag for MMX.
 * Caches the result in the file-scope mmx_supported and returns it
 * (1 = MMX available, 0 = not available). */
37 int mmx_supported_local = 0;
39 push ebx //CPUID will trash these
42 pushfd //Save Eflag to stack
43 pop eax //Get Eflag from stack into eax
44 mov ecx, eax //Make another copy of Eflag in ecx
45 xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
46 push eax //Save modified Eflag back to stack
48 popfd //Restore modified value back to Eflag reg
49 pushfd //Save Eflag to stack
50 pop eax //Get Eflag from stack
51 xor eax, ecx //Compare the new Eflag with the original Eflag
52 jz NOT_SUPPORTED //If the same, CPUID instruction is not supported,
53 //skip following instructions and jump to
56 xor eax, eax //Set eax to zero
58 _asm _emit 0x0f //CPUID instruction (two bytes opcode)
61 cmp eax, 1 //make sure eax returns a non-zero value
62 jl NOT_SUPPORTED //If eax is zero, mmx not supported
64 xor eax, eax //set eax to zero
65 inc eax //Now increment eax to 1. This instruction is
66 //faster than the instruction "mov eax, 1"
68 _asm _emit 0x0f //CPUID instruction
71 and edx, 0x00800000 //mask out all bits but the MMX bit (bit 23)
72 cmp edx, 0 // 0 = mmx not supported
73 jz NOT_SUPPORTED // zero = mmx is NOT supported
75 mov mmx_supported_local, 1 //set return value to 1
78 mov eax, mmx_supported_local //move return value to eax
79 pop edx //CPUID trashed these
84 //mmx_supported_local=0; // test code for force don't support MMX
85 //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
87 mmx_supported = mmx_supported_local; /* cache result in file-scope flag */
88 return mmx_supported_local;
91 /* Combines the row recently read in with the previous row.
92 This routine takes care of alpha and transparency if requested.
93 This routine also handles the two methods of progressive display
94 of interlaced images, depending on the mask value.
95 The mask value describes which pixels are to be combined with
96 the row. The pattern always repeats every 8 pixels, so just 8
97 bits are needed. A one indicates the pixel is to be combined; a
98 zero indicates the pixel is to be skipped. This is in addition
99 to any alpha or transparency value associated with the pixel. If
100 you want all pixels to be combined, pass 0xff (255) in mask. */
102 /* Use this routine for x86 platform - uses faster MMX routine if machine
106 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
108 #ifdef PNG_USE_LOCAL_ARRAYS
109 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
112 png_debug(1,"in png_combine_row_asm\n");
114 if (mmx_supported == 2) {
115 /* this should have happened in png_init_mmx_flags() already */
116 png_warning(png_ptr, "asm_flags may not have been initialized");
/* mask == 0xff: every pixel is kept, so a straight row copy suffices */
122 png_memcpy(row, png_ptr->row_buf + 1,
123 (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
125 /* GRR: add "else if (mask == 0)" case?
126 * or does png_combine_row() not even get called in that case? */
129 switch (png_ptr->row_info.pixel_depth)
/* 1 bpp: merge single bits under the mask, honoring PACKSWAP bit order */
135 int s_inc, s_start, s_end;
140 sp = png_ptr->row_buf + 1;
143 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
144 if (png_ptr->transformations & PNG_PACKSWAP)
160 for (i = 0; i < png_ptr->width; i++)
166 value = (*sp >> shift) & 0x1;
167 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
168 *dp |= (png_byte)(value << shift);
/* 2 bpp: same scheme with 2-bit fields */
192 int s_start, s_end, s_inc;
198 sp = png_ptr->row_buf + 1;
201 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
202 if (png_ptr->transformations & PNG_PACKSWAP)
218 for (i = 0; i < png_ptr->width; i++)
222 value = (*sp >> shift) & 0x3;
223 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
224 *dp |= (png_byte)(value << shift);
/* 4 bpp: same scheme with 4-bit fields */
247 int s_start, s_end, s_inc;
253 sp = png_ptr->row_buf + 1;
256 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
257 if (png_ptr->transformations & PNG_PACKSWAP)
272 for (i = 0; i < png_ptr->width; i++)
276 value = (*sp >> shift) & 0xf;
277 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
278 *dp |= (png_byte)(value << shift);
305 __int64 mask0=0x0102040810204080; //8bpp: one mask byte per 1-byte pixel
307 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
308 /* && mmx_supported */ )
310 srcptr = png_ptr->row_buf + 1;
314 len = png_ptr->width &~7; //reduce to multiple of 8
315 diff = png_ptr->width & 7; //amount lost
319 movd mm7, unmask //load bit pattern
320 psubb mm6,mm6 //zero mm6
323 punpckldq mm7,mm7 //fill register with 8 masks
327 pand mm0,mm7 //nonzero if keep byte
328 pcmpeqb mm0,mm6 //zeros->1s, vice versa
330 mov ecx,len //load length of line (pixels)
331 mov esi,srcptr //load source
332 mov ebx,dstptr //load dest
344 add esi,8 //inc by 8 bytes processed
346 sub ecx,8 //dec by 8 pixels processed
356 sal edx,24 //make low byte the high byte
359 sal edx,1 //move high bit to CF
360 jnc skip8 //if CF = 0
373 else /* mmx not supported - use modified C routine */
375 register unsigned int incr1, initial_val, final_val;
376 png_size_t pixel_bytes;
378 register int disp = png_pass_inc[png_ptr->pass];
379 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
381 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
382 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
384 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
385 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
386 final_val = png_ptr->width*pixel_bytes;
387 incr1 = (disp)*pixel_bytes;
388 for (i = initial_val; i < final_val; i += incr1)
390 png_memcpy(dstptr, srcptr, pixel_bytes);
405 __int64 mask1=0x0101020204040808, //16bpp
406 mask0=0x1010202040408080;
408 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
409 /* && mmx_supported */ )
411 srcptr = png_ptr->row_buf + 1;
415 len = (png_ptr->width)&~7;
416 diff = (png_ptr->width)&7;
419 movd mm7, unmask //load bit pattern
420 psubb mm6,mm6 //zero mm6
423 punpckldq mm7,mm7 //fill register with 8 masks
434 mov ecx,len //load length of line
435 mov esi,srcptr //load source
436 mov ebx,dstptr //load dest
457 add esi,16 //inc by 16 bytes processed
459 sub ecx,8 //dec by 8 pixels processed
469 sal edx,24 //make low byte the high byte
471 sal edx,1 //move high bit to CF
472 jnc skip16 //if CF = 0
485 else /* mmx not supported - use modified C routine */
487 register unsigned int incr1, initial_val, final_val;
488 png_size_t pixel_bytes;
490 register int disp = png_pass_inc[png_ptr->pass];
491 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
493 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
494 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
496 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
497 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
498 final_val = png_ptr->width*pixel_bytes;
499 incr1 = (disp)*pixel_bytes;
500 for (i = initial_val; i < final_val; i += incr1)
502 png_memcpy(dstptr, srcptr, pixel_bytes);
518 __int64 mask2=0x0101010202020404, //24bpp
519 mask1=0x0408080810101020,
520 mask0=0x2020404040808080;
522 srcptr = png_ptr->row_buf + 1;
526 len = (png_ptr->width)&~7;
527 diff = (png_ptr->width)&7;
529 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
530 /* && mmx_supported */ )
534 movd mm7, unmask //load bit pattern
535 psubb mm6,mm6 //zero mm6
538 punpckldq mm7,mm7 //fill register with 8 masks
552 mov ecx,len //load length of line
553 mov esi,srcptr //load source
554 mov ebx,dstptr //load dest
584 add esi,24 //inc by 24 bytes processed
586 sub ecx,8 //dec by 8 pixels processed
596 sal edx,24 //make low byte the high byte
598 sal edx,1 //move high bit to CF
599 jnc skip24 //if CF = 0
616 else /* mmx not supported - use modified C routine */
618 register unsigned int incr1, initial_val, final_val;
619 png_size_t pixel_bytes;
621 register int disp = png_pass_inc[png_ptr->pass];
622 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
624 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
625 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
627 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
628 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
629 final_val = png_ptr->width*pixel_bytes;
630 incr1 = (disp)*pixel_bytes;
631 for (i = initial_val; i < final_val; i += incr1)
633 png_memcpy(dstptr, srcptr, pixel_bytes);
649 __int64 mask3=0x0101010102020202, //32bpp
650 mask2=0x0404040408080808,
651 mask1=0x1010101020202020,
652 mask0=0x4040404080808080;
654 srcptr = png_ptr->row_buf + 1;
658 len = (png_ptr->width)&~7;
659 diff = (png_ptr->width)&7;
661 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
662 /* && mmx_supported */ )
666 movd mm7, unmask //load bit pattern
667 psubb mm6,mm6 //zero mm6
670 punpckldq mm7,mm7 //fill register with 8 masks
687 mov ecx,len //load length of line
688 mov esi,srcptr //load source
689 mov ebx,dstptr //load dest
727 add esi,32 //inc by 32 bytes processed
729 sub ecx,8 //dec by 8 pixels processed
739 sal edx,24 //make low byte the high byte
741 sal edx,1 //move high bit to CF
742 jnc skip32 //if CF = 0
756 else /* mmx not supported - Use modified C routine */
758 register unsigned int incr1, initial_val, final_val;
759 png_size_t pixel_bytes;
761 register int disp = png_pass_inc[png_ptr->pass];
762 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
764 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
765 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
767 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
768 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
769 final_val = png_ptr->width*pixel_bytes;
770 incr1 = (disp)*pixel_bytes;
771 for (i = initial_val; i < final_val; i += incr1)
773 png_memcpy(dstptr, srcptr, pixel_bytes);
789 __int64 mask5=0x0101010101010202, //48bpp
790 mask4=0x0202020204040404,
791 mask3=0x0404080808080808,
792 mask2=0x1010101010102020,
793 mask1=0x2020202040404040,
794 mask0=0x4040808080808080;
796 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
797 /* && mmx_supported */ )
799 srcptr = png_ptr->row_buf + 1;
803 len = (png_ptr->width)&~7;
804 diff = (png_ptr->width)&7;
807 movd mm7, unmask //load bit pattern
808 psubb mm6,mm6 //zero mm6
811 punpckldq mm7,mm7 //fill register with 8 masks
834 mov ecx,len //load length of line
835 mov esi,srcptr //load source
836 mov ebx,dstptr //load dest
884 add esi,48 //inc by 48 bytes processed
886 sub ecx,8 //dec by 8 pixels processed
896 sal edx,24 //make low byte the high byte
899 sal edx,1 //move high bit to CF
900 jnc skip48 //if CF = 0
914 else /* mmx not supported - Use modified C routine */
916 register unsigned int incr1, initial_val, final_val;
917 png_size_t pixel_bytes;
919 register int disp = png_pass_inc[png_ptr->pass];
920 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
922 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
923 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
925 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
926 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
927 final_val = png_ptr->width*pixel_bytes;
928 incr1 = (disp)*pixel_bytes;
929 for (i = initial_val; i < final_val; i += incr1)
931 png_memcpy(dstptr, srcptr, pixel_bytes);
/* default: any other depth (e.g. 64 bpp) — plain C copy, no MMX path */
944 png_size_t pixel_bytes;
945 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
947 register int disp = png_pass_inc[png_ptr->pass]; // get the offset
948 register unsigned int incr1, initial_val, final_val;
950 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
951 sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
953 dp = row + offset_table[png_ptr->pass]*pixel_bytes;
954 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
955 final_val = png_ptr->width*pixel_bytes;
956 incr1 = (disp)*pixel_bytes;
957 for (i = initial_val; i < final_val; i += incr1)
959 png_memcpy(dp, sptr, pixel_bytes);
965 } /* end switch (png_ptr->row_info.pixel_depth) */
966 } /* end if (non-trivial mask) */
968 } /* end png_combine_row() */
971 #if defined(PNG_READ_INTERLACING_SUPPORTED)
/* Expand an interlaced row in place: each source pixel is replicated
 * png_pass_inc[pass] times, working right-to-left so source and
 * destination can share the same buffer. MMX fast paths exist for
 * 1/2/3/4-byte pixels; sub-byte depths and 6-byte pixels use C code. */
974 png_do_read_interlace(png_structp png_ptr)
976 png_row_infop row_info = &(png_ptr->row_info);
977 png_bytep row = png_ptr->row_buf + 1;
978 int pass = png_ptr->pass;
979 png_uint_32 transformations = png_ptr->transformations;
980 #ifdef PNG_USE_LOCAL_ARRAYS
981 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
984 png_debug(1,"in png_do_read_interlace\n");
986 if (mmx_supported == 2) {
987 /* this should have happened in png_init_mmx_flags() already */
988 png_warning(png_ptr, "asm_flags may not have been initialized");
992 if (row != NULL && row_info != NULL)
994 png_uint_32 final_width;
996 final_width = row_info->width * png_pass_inc[pass];
998 switch (row_info->pixel_depth)
/* 1 bpp: replicate bits right-to-left, honoring PACKSWAP bit order */
1004 int s_start, s_end, s_inc;
1009 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1010 dp = row + (png_size_t)((final_width - 1) >> 3);
1011 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1012 if (transformations & PNG_PACKSWAP)
1014 sshift = (int)((row_info->width + 7) & 7);
1015 dshift = (int)((final_width + 7) & 7);
1023 sshift = 7 - (int)((row_info->width + 7) & 7);
1024 dshift = 7 - (int)((final_width + 7) & 7);
1030 for (i = row_info->width; i; i--)
1032 v = (png_byte)((*sp >> sshift) & 0x1);
1033 for (j = 0; j < png_pass_inc[pass]; j++)
1035 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1036 *dp |= (png_byte)(v << dshift);
1037 if (dshift == s_end)
1045 if (sshift == s_end)
/* 2 bpp: same scheme with 2-bit fields */
1060 int s_start, s_end, s_inc;
1063 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1064 dp = row + (png_size_t)((final_width - 1) >> 2);
1065 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1066 if (transformations & PNG_PACKSWAP)
1068 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1069 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1077 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1078 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1084 for (i = row_info->width; i; i--)
1089 v = (png_byte)((*sp >> sshift) & 0x3);
1090 for (j = 0; j < png_pass_inc[pass]; j++)
1092 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1093 *dp |= (png_byte)(v << dshift);
1094 if (dshift == s_end)
1102 if (sshift == s_end)
/* 4 bpp: same scheme with 4-bit fields */
1117 int s_start, s_end, s_inc;
1120 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1121 dp = row + (png_size_t)((final_width - 1) >> 1);
1122 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1123 if (transformations & PNG_PACKSWAP)
1125 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1126 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1134 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1135 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1141 for (i = row_info->width; i; i--)
1146 v = (png_byte)((*sp >> sshift) & 0xf);
1147 for (j = 0; j < png_pass_inc[pass]; j++)
1149 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1150 *dp |= (png_byte)(v << dshift);
1151 if (dshift == s_end)
1159 if (sshift == s_end)
1170 default: // This is the place where the routine is modified
1172 __int64 const4 = 0x0000000000FFFFFF; // keeps low 3 bytes (one RGB pixel)
1173 // __int64 const5 = 0x000000FFFFFF0000; // unused...
1174 __int64 const6 = 0x00000000000000FF; // keeps low byte only
1177 png_size_t pixel_bytes;
1178 int width = row_info->width;
1180 pixel_bytes = (row_info->pixel_depth >> 3);
/* point at the LAST pixel of source and destination; expansion runs
 * right-to-left so the source is not overwritten before it is read */
1182 sptr = row + (width - 1) * pixel_bytes;
1183 dp = row + (final_width - 1) * pixel_bytes;
1184 // New code by Nirav Chhatrapati - Intel Corporation
1186 // NOTE: there is NO MMX code for 48-bit and 64-bit images
1188 // use MMX routine if machine supports it
1189 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1190 /* && mmx_supported */ )
1192 if (pixel_bytes == 3)
/* 3-byte pixels, passes 0/1: replicate each pixel 8 times (24 bytes) */
1194 if (((pass == 0) || (pass == 1)) && width)
1201 sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes
1203 movd mm0, [esi] ; X X X X X v2 v1 v0
1204 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1205 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1206 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1207 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1208 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1209 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1210 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1211 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1212 movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
1213 psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
1214 movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
1215 punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
1217 psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
1219 punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
/* 3-byte pixels, passes 2/3: replicate each pixel 4 times (12 bytes) */
1229 else if (((pass == 2) || (pass == 3)) && width)
1236 sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes
1238 movd mm0, [esi] ; X X X X X v2 v1 v0
1239 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1240 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1241 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1242 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1243 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1244 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1245 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1246 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1247 movq [edi+4], mm0 ; move to memory
1248 psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
1249 movd [edi], mm0 ; move to memory
/* 3-byte pixels, passes 4/5: replicate each pixel twice, two at a time */
1257 else if (width) /* && ((pass == 4) || (pass == 5)) */
1259 int width_mmx = ((width >> 1) << 1) - 8;
1262 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1273 movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
1274 movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
1275 movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
1276 psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
1277 pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
1278 psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
1279 por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
1280 movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
1281 psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
1282 movq [edi], mm0 ; move quad to memory
1283 psrlq mm5, 16 ; 0 0 0 0 0 X X v2
1284 pand mm5, const6 ; 0 0 0 0 0 0 0 v2
1285 por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
1286 movd [edi+8], mm6 ; move double to memory
1295 sptr -= width_mmx*3;
/* C cleanup for the pixels the MMX loop did not cover */
1297 for (i = width; i; i--)
1302 png_memcpy(v, sptr, 3);
1303 for (j = 0; j < png_pass_inc[pass]; j++)
1305 png_memcpy(dp, v, 3);
1311 } /* end of pixel_bytes == 3 */
1313 else if (pixel_bytes == 1)
/* 1-byte pixels, passes 0/1: replicate each of 4 bytes 8 times */
1315 if (((pass == 0) || (pass == 1)) && width)
1317 int width_mmx = ((width >> 2) << 2);
1329 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1330 movq mm1, mm0 ; X X X X v0 v1 v2 v3
1331 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1332 movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1333 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1334 movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1335 punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
1336 punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
1337 movq [edi], mm0 ; move to memory v3
1338 punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1339 movq [edi+8], mm3 ; move to memory v2
1340 movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1341 punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
1342 punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
1343 movq [edi+16], mm2 ; move to memory v1
1344 movq [edi+24], mm4 ; move to memory v0
1355 for (i = width; i; i--)
1359 /* I simplified this part in version 1.0.4e
1360 * here and in several other instances where
1361 * pixel_bytes == 1 -- GR-P
1366 * png_memcpy(v, sptr, pixel_bytes);
1367 * for (j = 0; j < png_pass_inc[pass]; j++)
1369 * png_memcpy(dp, v, pixel_bytes);
1370 * dp -= pixel_bytes;
1372 * sptr -= pixel_bytes;
1374 * Replacement code is in the next three lines:
1377 for (j = 0; j < png_pass_inc[pass]; j++)
/* 1-byte pixels, passes 2/3: replicate each of 4 bytes 4 times */
1382 else if (((pass == 2) || (pass == 3)) && width)
1384 int width_mmx = ((width >> 2) << 2);
1396 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1397 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1398 movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1399 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1400 punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
1401 movq [edi], mm0 ; move to memory v2 and v3
1403 movq [edi+8], mm1 ; move to memory v1 and v0
1413 for (i = width; i; i--)
1417 for (j = 0; j < png_pass_inc[pass]; j++)
/* 1-byte pixels, passes 4/5: double each of 8 bytes */
1424 else if (width) /* && ((pass == 4) || (pass == 5))) */
1426 int width_mmx = ((width >> 3) << 3);
1438 movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
1439 movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
1440 punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
1441 //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1442 punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
1443 movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
1445 movq [edi], mm0 ; move to memory v4 v5 v6 and v7
1456 for (i = width; i; i--)
1460 for (j = 0; j < png_pass_inc[pass]; j++)
1467 } /* end of pixel_bytes == 1 */
1469 else if (pixel_bytes == 2)
/* 2-byte pixels, passes 0/1: replicate each of 2 words 8 times */
1471 if (((pass == 0) || (pass == 1)) && width)
1473 int width_mmx = ((width >> 1) << 1);
1485 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1486 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1487 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1488 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1489 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1492 movq [edi + 16], mm1
1493 movq [edi + 24], mm1
1502 sptr -= (width_mmx*2 - 2); // sign fixed
1503 dp -= (width_mmx*16 - 2); // sign fixed
1504 for (i = width; i; i--)
1509 png_memcpy(v, sptr, 2);
1510 for (j = 0; j < png_pass_inc[pass]; j++)
1513 png_memcpy(dp, v, 2);
/* 2-byte pixels, passes 2/3: replicate each of 2 words 4 times */
1517 else if (((pass == 2) || (pass == 3)) && width)
1519 int width_mmx = ((width >> 1) << 1) ;
1531 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1532 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1533 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1534 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1535 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1547 sptr -= (width_mmx*2 - 2); // sign fixed
1548 dp -= (width_mmx*8 - 2); // sign fixed
1549 for (i = width; i; i--)
1554 png_memcpy(v, sptr, 2);
1555 for (j = 0; j < png_pass_inc[pass]; j++)
1558 png_memcpy(dp, v, 2);
/* 2-byte pixels, passes 4/5: double each of 2 words */
1562 else if (width) // pass == 4 or 5
1564 int width_mmx = ((width >> 1) << 1) ;
1576 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1577 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1587 sptr -= (width_mmx*2 - 2); // sign fixed
1588 dp -= (width_mmx*4 - 2); // sign fixed
1589 for (i = width; i; i--)
1594 png_memcpy(v, sptr, 2);
1595 for (j = 0; j < png_pass_inc[pass]; j++)
1598 png_memcpy(dp, v, 2);
1602 } /* end of pixel_bytes == 2 */
1604 else if (pixel_bytes == 4)
/* 4-byte pixels, passes 0/1: replicate each of 2 dwords 8 times */
1606 if (((pass == 0) || (pass == 1)) && width)
1608 int width_mmx = ((width >> 1) << 1) ;
1620 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1621 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1622 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1623 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1626 movq [edi + 16], mm0
1627 movq [edi + 24], mm0
1629 movq [edi + 40], mm1
1632 movq [edi + 56], mm1
1640 sptr -= (width_mmx*4 - 4); // sign fixed
1641 dp -= (width_mmx*32 - 4); // sign fixed
1642 for (i = width; i; i--)
1647 png_memcpy(v, sptr, 4);
1648 for (j = 0; j < png_pass_inc[pass]; j++)
1651 png_memcpy(dp, v, 4);
/* 4-byte pixels, passes 2/3: replicate each of 2 dwords 4 times */
1655 else if (((pass == 2) || (pass == 3)) && width)
1657 int width_mmx = ((width >> 1) << 1) ;
1669 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1670 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1671 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1672 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1676 movq [edi + 24], mm1
1685 sptr -= (width_mmx*4 - 4); // sign fixed
1686 dp -= (width_mmx*16 - 4); // sign fixed
1687 for (i = width; i; i--)
1692 png_memcpy(v, sptr, 4);
1693 for (j = 0; j < png_pass_inc[pass]; j++)
1696 png_memcpy(dp, v, 4);
/* 4-byte pixels, passes 4/5: double each of 2 dwords */
1700 else if (width) // pass == 4 or 5
1702 int width_mmx = ((width >> 1) << 1) ;
1714 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1715 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1716 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1717 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1728 sptr -= (width_mmx*4 - 4); // sign fixed
1729 dp -= (width_mmx*8 - 4); // sign fixed
1730 for (i = width; i; i--)
1735 png_memcpy(v, sptr, 4);
1736 for (j = 0; j < png_pass_inc[pass]; j++)
1739 png_memcpy(dp, v, 4);
1744 } /* end of pixel_bytes == 4 */
1746 else if (pixel_bytes == 6)
/* 6-byte pixels: no MMX path (see NOTE above) — plain C replication */
1748 for (i = width; i; i--)
1752 png_memcpy(v, sptr, 6);
1753 for (j = 0; j < png_pass_inc[pass]; j++)
1755 png_memcpy(dp, v, 6);
1760 } /* end of pixel_bytes == 6 */
/* any other pixel size: generic C replication */
1764 for (i = width; i; i--)
1768 png_memcpy(v, sptr, pixel_bytes);
1769 for (j = 0; j < png_pass_inc[pass]; j++)
1771 png_memcpy(dp, v, pixel_bytes);
1777 } /* end of mmx_supported */
1779 else /* MMX not supported: use modified C code - takes advantage
1780 * of inlining of memcpy for a constant */
1782 if (pixel_bytes == 1)
1784 for (i = width; i; i--)
1787 for (j = 0; j < png_pass_inc[pass]; j++)
1792 else if (pixel_bytes == 3)
1794 for (i = width; i; i--)
1798 png_memcpy(v, sptr, pixel_bytes);
1799 for (j = 0; j < png_pass_inc[pass]; j++)
1801 png_memcpy(dp, v, pixel_bytes);
1804 sptr -= pixel_bytes;
1807 else if (pixel_bytes == 2)
1809 for (i = width; i; i--)
1813 png_memcpy(v, sptr, pixel_bytes);
1814 for (j = 0; j < png_pass_inc[pass]; j++)
1816 png_memcpy(dp, v, pixel_bytes);
1819 sptr -= pixel_bytes;
1822 else if (pixel_bytes == 4)
1824 for (i = width; i; i--)
1828 png_memcpy(v, sptr, pixel_bytes);
1829 for (j = 0; j < png_pass_inc[pass]; j++)
1831 png_memcpy(dp, v, pixel_bytes);
1834 sptr -= pixel_bytes;
1837 else if (pixel_bytes == 6)
1839 for (i = width; i; i--)
1843 png_memcpy(v, sptr, pixel_bytes);
1844 for (j = 0; j < png_pass_inc[pass]; j++)
1846 png_memcpy(dp, v, pixel_bytes);
1849 sptr -= pixel_bytes;
1854 for (i = width; i; i--)
1858 png_memcpy(v, sptr, pixel_bytes);
1859 for (j = 0; j < png_pass_inc[pass]; j++)
1861 png_memcpy(dp, v, pixel_bytes);
1864 sptr -= pixel_bytes;
1868 } /* end of MMX not supported */
1871 } /* end switch (row_info->pixel_depth) */
/* update row_info to describe the expanded row */
1873 row_info->width = final_width;
1874 row_info->rowbytes = ((final_width *
1875 (png_uint_32)row_info->pixel_depth + 7) >> 3);
1880 #endif /* PNG_READ_INTERLACING_SUPPORTED */
1883 // These variables are utilized in the functions below. They are declared
1884 // globally here to ensure alignment on 8-byte boundaries.
1889 } LBCarryMask = {0x0101010101010101},
1890 HBClearMask = {0x7f7f7f7f7f7f7f7f},
1891 ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
1894 // Optimized code for PNG Average filter decoder
1896 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1897 , png_bytep prev_row)
1900 png_uint_32 FullLength;
1901 png_uint_32 MMXLength;
1905 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
1906 FullLength = row_info->rowbytes; // # of bytes to filter
1908 // Init address pointers and offset
1909 mov edi, row // edi ==> Avg(x)
1910 xor ebx, ebx // ebx ==> x
1912 mov esi, prev_row // esi ==> Prior(x)
1913 sub edx, bpp // edx ==> Raw(x-bpp)
1916 // Compute the Raw value for the first bpp bytes
1917 // Raw(x) = Avg(x) + (Prior(x)/2)
1919 mov al, [esi + ebx] // Load al with Prior(x)
1921 shr al, 1 // divide by 2
1922 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1924 mov [edi+ebx-1], al // Write back Raw(x);
1925 // mov does not affect flags; -1 to offset inc ebx
1927 // get # of bytes to alignment
1928 mov diff, edi // take start of row
1929 add diff, ebx // add bpp
1930 add diff, 0xf // add 7 + 8 to incr past alignment boundary
1931 and diff, 0xfffffff8 // mask to alignment boundary
1932 sub diff, edi // subtract from start ==> value ebx at alignment
1935 // Compute the Raw value for the bytes upto the alignment boundary
1936 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
1940 mov cl, [esi + ebx] // load cl with Prior(x)
1941 mov al, [edx + ebx] // load al with Raw(x-bpp)
1944 shr ax, 1 // divide by 2
1945 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1946 cmp ebx, diff // Check if at alignment boundary
1947 mov [edi+ebx-1], al // Write back Raw(x);
1948 // mov does not affect flags; -1 to offset inc ebx
1949 jb davglp1 // Repeat until at alignment boundary
1953 sub eax, ebx // subtract alignment fix
1954 and eax, 0x00000007 // calc bytes over mult of 8
1955 sub ecx, eax // drop over bytes from original length
1958 // Now do the math for the rest of the row
1963 ActiveMask.use = 0x0000000000ffffff;
1964 ShiftBpp.use = 24; // == 3 * 8
1965 ShiftRem.use = 40; // == 64 - 24
1967 // Re-init address pointers and offset
1968 movq mm7, ActiveMask
1969 mov ebx, diff // ebx ==> x = offset to alignment boundary
1970 movq mm5, LBCarryMask
1971 mov edi, row // edi ==> Avg(x)
1972 movq mm4, HBClearMask
1973 mov esi, prev_row // esi ==> Prior(x)
1974 // PRIME the pump (load the first Raw(x-bpp) data set
1975 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
1976 // (we correct position in loop below)
1978 movq mm0, [edi + ebx] // Load mm0 with Avg(x)
1979 // Add (Prev_row/2) to Average
1981 psrlq mm2, ShiftRem // Correct position Raw(x-bpp) data
1982 movq mm1, [esi + ebx] // Load mm1 with Prior(x)
1984 pand mm3, mm1 // get lsb for each prev_row byte
1985 psrlq mm1, 1 // divide prev_row bytes by 2
1986 pand mm1, mm4 // clear invalid bit 7 of each byte
1987 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
1988 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
1989 movq mm1, mm3 // now use mm1 for getting LBCarrys
1990 pand mm1, mm2 // get LBCarrys for each byte where both
1991 // lsb's were == 1 (Only valid for active group)
1992 psrlq mm2, 1 // divide raw bytes by 2
1993 pand mm2, mm4 // clear invalid bit 7 of each byte
1994 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
1995 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
1996 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
1998 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
1999 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 3-5
2000 movq mm2, mm0 // mov updated Raws to mm2
2001 psllq mm2, ShiftBpp // shift data to position correctly
2002 movq mm1, mm3 // now use mm1 for getting LBCarrys
2003 pand mm1, mm2 // get LBCarrys for each byte where both
2004 // lsb's were == 1 (Only valid for active group)
2005 psrlq mm2, 1 // divide raw bytes by 2
2006 pand mm2, mm4 // clear invalid bit 7 of each byte
2007 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2008 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2009 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2012 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2013 psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two
2015 movq mm2, mm0 // mov updated Raws to mm2
2016 psllq mm2, ShiftBpp // shift data to position correctly
2017 // Data only needs to be shifted once here to
2018 // get the correct x-bpp offset.
2019 movq mm1, mm3 // now use mm1 for getting LBCarrys
2020 pand mm1, mm2 // get LBCarrys for each byte where both
2021 // lsb's were == 1 (Only valid for active group)
2022 psrlq mm2, 1 // divide raw bytes by 2
2023 pand mm2, mm4 // clear invalid bit 7 of each byte
2024 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2025 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2027 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2030 // Now ready to write back to memory
2031 movq [edi + ebx - 8], mm0
2032 // Move updated Raw(x) to use as Raw(x-bpp) for next loop
2034 movq mm2, mm0 // mov updated Raw(x) to mm2
2045 ActiveMask.use = 0xffffffffffffffff; // use shift below to clear
2046 // appropriate inactive bytes
2047 ShiftBpp.use = bpp << 3;
2048 ShiftRem.use = 64 - ShiftBpp.use;
2050 movq mm4, HBClearMask
2051 // Re-init address pointers and offset
2052 mov ebx, diff // ebx ==> x = offset to alignment boundary
2053 // Load ActiveMask and clear all bytes except for 1st active group
2054 movq mm7, ActiveMask
2055 mov edi, row // edi ==> Avg(x)
2057 mov esi, prev_row // esi ==> Prior(x)
2059 movq mm5, LBCarryMask
2060 psllq mm6, ShiftBpp // Create mask for 2nd active group
2061 // PRIME the pump (load the first Raw(x-bpp) data set
2062 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2063 // (we correct position in loop below)
2065 movq mm0, [edi + ebx]
2066 psrlq mm2, ShiftRem // shift data to position correctly
2067 movq mm1, [esi + ebx]
2068 // Add (Prev_row/2) to Average
2070 pand mm3, mm1 // get lsb for each prev_row byte
2071 psrlq mm1, 1 // divide prev_row bytes by 2
2072 pand mm1, mm4 // clear invalid bit 7 of each byte
2073 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2074 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2075 movq mm1, mm3 // now use mm1 for getting LBCarrys
2076 pand mm1, mm2 // get LBCarrys for each byte where both
2077 // lsb's were == 1 (Only valid for active group)
2078 psrlq mm2, 1 // divide raw bytes by 2
2079 pand mm2, mm4 // clear invalid bit 7 of each byte
2080 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2081 pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg
2082 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2084 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2085 movq mm2, mm0 // mov updated Raws to mm2
2086 psllq mm2, ShiftBpp // shift data to position correctly
2088 movq mm1, mm3 // now use mm1 for getting LBCarrys
2089 pand mm1, mm2 // get LBCarrys for each byte where both
2090 // lsb's were == 1 (Only valid for active group)
2091 psrlq mm2, 1 // divide raw bytes by 2
2092 pand mm2, mm4 // clear invalid bit 7 of each byte
2093 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2094 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2095 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2098 // Now ready to write back to memory
2099 movq [edi + ebx - 8], mm0
2100 // Prep Raw(x-bpp) for next loop
2101 movq mm2, mm0 // mov updated Raws to mm2
2108 ActiveMask.use = 0x000000000000ffff;
2109 ShiftBpp.use = 16; // == 2 * 8 [BUGFIX]
2110 ShiftRem.use = 48; // == 64 - 16 [BUGFIX]
2113 movq mm7, ActiveMask
2114 // Re-init address pointers and offset
2115 mov ebx, diff // ebx ==> x = offset to alignment boundary
2116 movq mm5, LBCarryMask
2117 mov edi, row // edi ==> Avg(x)
2118 movq mm4, HBClearMask
2119 mov esi, prev_row // esi ==> Prior(x)
2120 // PRIME the pump (load the first Raw(x-bpp) data set
2121 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2122 // (we correct position in loop below)
2124 movq mm0, [edi + ebx]
2125 psrlq mm2, ShiftRem // shift data to position correctly [BUGFIX]
2126 movq mm1, [esi + ebx]
2127 // Add (Prev_row/2) to Average
2129 pand mm3, mm1 // get lsb for each prev_row byte
2130 psrlq mm1, 1 // divide prev_row bytes by 2
2131 pand mm1, mm4 // clear invalid bit 7 of each byte
2133 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2134 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2135 movq mm1, mm3 // now use mm1 for getting LBCarrys
2136 pand mm1, mm2 // get LBCarrys for each byte where both
2137 // lsb's were == 1 (Only valid for active group)
2138 psrlq mm2, 1 // divide raw bytes by 2
2139 pand mm2, mm4 // clear invalid bit 7 of each byte
2140 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2141 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2142 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2143 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2144 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2145 movq mm2, mm0 // mov updated Raws to mm2
2146 psllq mm2, ShiftBpp // shift data to position correctly
2147 movq mm1, mm3 // now use mm1 for getting LBCarrys
2148 pand mm1, mm2 // get LBCarrys for each byte where both
2149 // lsb's were == 1 (Only valid for active group)
2150 psrlq mm2, 1 // divide raw bytes by 2
2151 pand mm2, mm4 // clear invalid bit 7 of each byte
2152 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2153 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2154 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2156 // Add rdd active group (Raw(x-bpp)/2) to Average with LBCarry
2157 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2158 movq mm2, mm0 // mov updated Raws to mm2
2159 psllq mm2, ShiftBpp // shift data to position correctly
2160 // Data only needs to be shifted once here to
2161 // get the correct x-bpp offset.
2162 movq mm1, mm3 // now use mm1 for getting LBCarrys
2163 pand mm1, mm2 // get LBCarrys for each byte where both
2164 // lsb's were == 1 (Only valid for active group)
2165 psrlq mm2, 1 // divide raw bytes by 2
2166 pand mm2, mm4 // clear invalid bit 7 of each byte
2167 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2168 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2169 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2171 // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2172 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7
2173 movq mm2, mm0 // mov updated Raws to mm2
2174 psllq mm2, ShiftBpp // shift data to position correctly
2175 // Data only needs to be shifted once here to
2176 // get the correct x-bpp offset.
2178 movq mm1, mm3 // now use mm1 for getting LBCarrys
2179 pand mm1, mm2 // get LBCarrys for each byte where both
2180 // lsb's were == 1 (Only valid for active group)
2181 psrlq mm2, 1 // divide raw bytes by 2
2182 pand mm2, mm4 // clear invalid bit 7 of each byte
2183 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2184 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2185 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2188 // Now ready to write back to memory
2189 movq [edi + ebx - 8], mm0
2190 // Prep Raw(x-bpp) for next loop
2191 movq mm2, mm0 // mov updated Raws to mm2
2200 // Re-init address pointers and offset
2201 mov ebx, diff // ebx ==> x = offset to alignment boundary
2202 mov edi, row // edi ==> Avg(x)
2203 cmp ebx, FullLength // Test if offset at end of array
2205 // Do Paeth decode for remaining bytes
2206 mov esi, prev_row // esi ==> Prior(x)
2208 xor ecx, ecx // zero ecx before using cl & cx in loop below
2209 sub edx, bpp // edx ==> Raw(x-bpp)
2211 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2213 mov cl, [esi + ebx] // load cl with Prior(x)
2214 mov al, [edx + ebx] // load al with Raw(x-bpp)
2217 shr ax, 1 // divide by 2
2218 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2219 cmp ebx, FullLength // Check if at end of array
2220 mov [edi+ebx-1], al // Write back Raw(x);
2221 // mov does not affect flags; -1 to offset inc ebx
2231 // Re-init address pointers and offset
2232 mov ebx, diff // ebx ==> x = offset to alignment boundary
2233 movq mm5, LBCarryMask
2234 mov edi, row // edi ==> Avg(x)
2235 movq mm4, HBClearMask
2236 mov esi, prev_row // esi ==> Prior(x)
2237 // PRIME the pump (load the first Raw(x-bpp) data set
2238 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2239 // (NO NEED to correct position in loop below)
2241 movq mm0, [edi + ebx]
2243 movq mm1, [esi + ebx]
2245 pand mm3, mm1 // get lsb for each prev_row byte
2246 psrlq mm1, 1 // divide prev_row bytes by 2
2247 pand mm3, mm2 // get LBCarrys for each byte where both
2249 psrlq mm2, 1 // divide raw bytes by 2
2250 pand mm1, mm4 // clear invalid bit 7 of each byte
2251 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2252 pand mm2, mm4 // clear invalid bit 7 of each byte
2253 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2254 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2256 movq [edi + ebx - 8], mm0
2257 movq mm2, mm0 // reuse as Raw(x-bpp)
2262 default: // bpp greater than 8
2265 movq mm5, LBCarryMask
2266 // Re-init address pointers and offset
2267 mov ebx, diff // ebx ==> x = offset to alignment boundary
2268 mov edi, row // edi ==> Avg(x)
2269 movq mm4, HBClearMask
2271 mov esi, prev_row // esi ==> Prior(x)
2272 sub edx, bpp // edx ==> Raw(x-bpp)
2274 movq mm0, [edi + ebx]
2276 movq mm1, [esi + ebx]
2277 pand mm3, mm1 // get lsb for each prev_row byte
2278 movq mm2, [edx + ebx]
2279 psrlq mm1, 1 // divide prev_row bytes by 2
2280 pand mm3, mm2 // get LBCarrys for each byte where both
2282 psrlq mm2, 1 // divide raw bytes by 2
2283 pand mm1, mm4 // clear invalid bit 7 of each byte
2284 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2285 pand mm2, mm4 // clear invalid bit 7 of each byte
2286 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2288 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2290 movq [edi + ebx - 8], mm0
2295 } // end switch ( bpp )
2298 // MMX acceleration complete now do clean-up
2299 // Check if any remaining bytes left to decode
2300 mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
2301 mov edi, row // edi ==> Avg(x)
2302 cmp ebx, FullLength // Test if offset at end of array
2304 // Do Paeth decode for remaining bytes
2305 mov esi, prev_row // esi ==> Prior(x)
2307 xor ecx, ecx // zero ecx before using cl & cx in loop below
2308 sub edx, bpp // edx ==> Raw(x-bpp)
2310 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2312 mov cl, [esi + ebx] // load cl with Prior(x)
2313 mov al, [edx + ebx] // load al with Raw(x-bpp)
2316 shr ax, 1 // divide by 2
2317 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2318 cmp ebx, FullLength // Check if at end of array
2319 mov [edi+ebx-1], al // Write back Raw(x);
2320 // mov does not affect flags; -1 to offset inc ebx
2323 emms // End MMX instructions; prep for possible FP instrs.
2327 // Optimized code for PNG Paeth filter decoder
2329 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2332 png_uint_32 FullLength;
2333 png_uint_32 MMXLength;
2338 int patemp, pbtemp, pctemp;
2340 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2341 FullLength = row_info->rowbytes; // # of bytes to filter
2344 xor ebx, ebx // ebx ==> x offset
2346 xor edx, edx // edx ==> x-bpp offset
2350 // Compute the Raw value for the first bpp bytes
2351 // Note: the formula works out to be always
2352 // Paeth(x) = Raw(x) + Prior(x) where x < bpp
2358 mov [edi + ebx - 1], al
2360 // get # of bytes to alignment
2361 mov diff, edi // take start of row
2362 add diff, ebx // add bpp
2364 add diff, 0xf // add 7 + 8 to incr past alignment boundary
2365 and diff, 0xfffffff8 // mask to alignment boundary
2366 sub diff, edi // subtract from start ==> value ebx at alignment
2371 // pav = p - a = (a + b - c) - a = b - c
2372 mov al, [esi + ebx] // load Prior(x) into al
2373 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2374 sub eax, ecx // subtract Prior(x-bpp)
2375 mov patemp, eax // Save pav for later use
2377 // pbv = p - b = (a + b - c) - b = a - c
2378 mov al, [edi + edx] // load Raw(x-bpp) into al
2379 sub eax, ecx // subtract Prior(x-bpp)
2381 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2382 add eax, patemp // pcv = pav + pbv
2384 test eax, 0x80000000
2386 neg eax // reverse sign of neg values
2388 mov pctemp, eax // save pc for later use
2390 test ecx, 0x80000000
2392 neg ecx // reverse sign of neg values
2394 mov pbtemp, ecx // save pb for later use
2397 test eax, 0x80000000
2399 neg eax // reverse sign of neg values
2401 mov patemp, eax // save pa for later use
2405 // pa > pb; now test if pb <= pc
2408 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2409 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2412 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2413 mov cl, [esi + ebx] // load Prior(x) into cl
2416 // pa <= pb; now test if pa <= pc
2419 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2420 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2423 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2424 mov cl, [edi + edx] // load Raw(x-bpp) into cl
2428 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2429 add [edi + ebx - 1], cl
2435 sub eax, ebx // subtract alignment fix
2436 and eax, 0x00000007 // calc bytes over mult of 8
2437 sub ecx, eax // drop over bytes from original length
2440 // Now do the math for the rest of the row
2445 ActiveMask.use = 0x0000000000ffffff;
2446 ActiveMaskEnd.use = 0xffff000000000000;
2447 ShiftBpp.use = 24; // == bpp(3) * 8
2448 ShiftRem.use = 40; // == 64 - 24
2455 // PRIME the pump (load the first Raw(x-bpp) data set
2456 movq mm1, [edi+ebx-8]
2458 psrlq mm1, ShiftRem // shift last 3 bytes to 1st 3 bytes
2459 movq mm2, [esi + ebx] // load b=Prior(x)
2460 punpcklbw mm1, mm0 // Unpack High bytes of a
2461 movq mm3, [esi+ebx-8] // Prep c=Prior(x-bpp) bytes
2462 punpcklbw mm2, mm0 // Unpack High bytes of b
2463 psrlq mm3, ShiftRem // shift last 3 bytes to 1st 3 bytes
2464 // pav = p - a = (a + b - c) - a = b - c
2466 punpcklbw mm3, mm0 // Unpack High bytes of c
2467 // pbv = p - b = (a + b - c) - b = a - c
2471 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2475 // pa = abs(p-a) = abs(pav)
2476 // pb = abs(p-b) = abs(pbv)
2477 // pc = abs(p-c) = abs(pcv)
2478 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2480 pand mm0, mm4 // Only pav bytes < 0 in mm7
2481 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2483 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2487 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2488 pand mm0, mm6 // Only pav bytes < 0 in mm7
2494 pcmpgtw mm7, mm5 // pa > pb?
2496 // use mm7 mask to merge pa & pb
2498 // use mm0 mask copy to merge a & b
2504 // test ((pa <= pb)? pa:pb) <= pc
2505 pcmpgtw mm7, mm6 // pab > pc?
2512 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2513 pand mm7, ActiveMask
2514 movq mm2, mm3 // load b=Prior(x) step 1
2515 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2516 punpcklbw mm3, mm0 // Unpack High bytes of c
2517 movq [edi + ebx], mm7 // write back updated value
2518 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2519 // Now do Paeth for 2nd set of bytes (3-5)
2520 psrlq mm2, ShiftBpp // load b=Prior(x) step 2
2521 punpcklbw mm1, mm0 // Unpack High bytes of a
2523 punpcklbw mm2, mm0 // Unpack High bytes of b
2524 // pbv = p - b = (a + b - c) - b = a - c
2526 // pav = p - a = (a + b - c) - a = b - c
2530 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2531 // pav + pbv = pbv + pav
2535 // pa = abs(p-a) = abs(pav)
2536 // pb = abs(p-b) = abs(pbv)
2537 // pc = abs(p-c) = abs(pcv)
2538 pcmpgtw mm0, mm5 // Create mask pbv bytes < 0
2539 pcmpgtw mm7, mm4 // Create mask pav bytes < 0
2540 pand mm0, mm5 // Only pbv bytes < 0 in mm0
2541 pand mm7, mm4 // Only pav bytes < 0 in mm7
2547 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2548 pand mm0, mm6 // Only pav bytes < 0 in mm7
2553 pcmpgtw mm7, mm5 // pa > pb?
2555 // use mm7 mask to merge pa & pb
2557 // use mm0 mask copy to merge a & b
2563 // test ((pa <= pb)? pa:pb) <= pc
2564 pcmpgtw mm7, mm6 // pab > pc?
2565 movq mm2, [esi + ebx] // load b=Prior(x)
2572 movq mm3, mm2 // load c=Prior(x-bpp) step 1
2573 pand mm7, ActiveMask
2574 punpckhbw mm2, mm0 // Unpack High bytes of b
2575 psllq mm7, ShiftBpp // Shift bytes to 2nd group of 3 bytes
2576 // pav = p - a = (a + b - c) - a = b - c
2578 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2579 psllq mm3, ShiftBpp // load c=Prior(x-bpp) step 2
2580 movq [edi + ebx], mm7 // write back updated value
2582 punpckhbw mm3, mm0 // Unpack High bytes of c
2583 psllq mm1, ShiftBpp // Shift bytes
2584 // Now mm1 will be used as Raw(x-bpp)
2585 // Now do Paeth for 3rd, and final, set of bytes (6-7)
2587 punpckhbw mm1, mm0 // Unpack High bytes of a
2589 // pbv = p - b = (a + b - c) - b = a - c
2591 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2597 // pa = abs(p-a) = abs(pav)
2598 // pb = abs(p-b) = abs(pbv)
2599 // pc = abs(p-c) = abs(pcv)
2600 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2601 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2602 pand mm0, mm4 // Only pav bytes < 0 in mm7
2603 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2609 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2610 pand mm0, mm6 // Only pav bytes < 0 in mm7
2615 pcmpgtw mm7, mm5 // pa > pb?
2617 // use mm0 mask copy to merge a & b
2619 // use mm7 mask to merge pa & pb
2625 // test ((pa <= pb)? pa:pb) <= pc
2626 pcmpgtw mm7, mm6 // pab > pc?
2632 // Step ebx to next set of 8 bytes and repeat loop til done
2634 pand mm1, ActiveMaskEnd
2635 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2638 pxor mm0, mm0 // pxor does not affect flags
2639 movq [edi + ebx - 8], mm1 // write back updated value
2640 // mm1 will be used as Raw(x-bpp) next loop
2641 // mm3 ready to be used as Prior(x-bpp) next loop
2651 ActiveMask.use = 0x00000000ffffffff;
2652 ActiveMask2.use = 0xffffffff00000000;
2653 ShiftBpp.use = bpp << 3; // == bpp * 8
2654 ShiftRem.use = 64 - ShiftBpp.use;
2660 // PRIME the pump (load the first Raw(x-bpp) data set
2661 movq mm1, [edi+ebx-8]
2664 // Must shift to position Raw(x-bpp) data
2666 // Do first set of 4 bytes
2667 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2668 punpcklbw mm1, mm0 // Unpack Low bytes of a
2669 movq mm2, [esi + ebx] // load b=Prior(x)
2670 punpcklbw mm2, mm0 // Unpack Low bytes of b
2671 // Must shift to position Prior(x-bpp) data
2673 // pav = p - a = (a + b - c) - a = b - c
2675 punpcklbw mm3, mm0 // Unpack Low bytes of c
2676 // pbv = p - b = (a + b - c) - b = a - c
2680 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2683 // pa = abs(p-a) = abs(pav)
2684 // pb = abs(p-b) = abs(pbv)
2685 // pc = abs(p-c) = abs(pcv)
2686 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2688 pand mm0, mm4 // Only pav bytes < 0 in mm7
2689 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2691 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2695 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2696 pand mm0, mm6 // Only pav bytes < 0 in mm7
2702 pcmpgtw mm7, mm5 // pa > pb?
2704 // use mm7 mask to merge pa & pb
2706 // use mm0 mask copy to merge a & b
2712 // test ((pa <= pb)? pa:pb) <= pc
2713 pcmpgtw mm7, mm6 // pab > pc?
2720 movq mm3, [esi + ebx - 8] // load c=Prior(x-bpp)
2721 pand mm7, ActiveMask
2723 movq mm2, [esi + ebx] // load b=Prior(x) step 1
2724 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2726 movq [edi + ebx], mm7 // write back updated value
2727 movq mm1, [edi+ebx-8]
2733 punpckhbw mm3, mm0 // Unpack High bytes of c
2735 // Do second set of 4 bytes
2736 punpckhbw mm2, mm0 // Unpack High bytes of b
2737 punpckhbw mm1, mm0 // Unpack High bytes of a
2738 // pav = p - a = (a + b - c) - a = b - c
2740 // pbv = p - b = (a + b - c) - b = a - c
2744 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2747 // pa = abs(p-a) = abs(pav)
2748 // pb = abs(p-b) = abs(pbv)
2749 // pc = abs(p-c) = abs(pcv)
2750 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2752 pand mm0, mm4 // Only pav bytes < 0 in mm7
2753 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2755 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2759 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2760 pand mm0, mm6 // Only pav bytes < 0 in mm7
2766 pcmpgtw mm7, mm5 // pa > pb?
2768 // use mm7 mask to merge pa & pb
2770 // use mm0 mask copy to merge a & b
2776 // test ((pa <= pb)? pa:pb) <= pc
2777 pcmpgtw mm7, mm6 // pab > pc?
2784 // Step ex to next set of 8 bytes and repeat loop til done
2787 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2789 movq [edi + ebx - 8], mm1 // write back updated value
2790 // mm1 will be used as Raw(x-bpp) next loop
2798 ActiveMask.use = 0x00000000ffffffff;
2804 // PRIME the pump (load the first Raw(x-bpp) data set
2805 movq mm1, [edi+ebx-8] // Only time should need to read
2806 // a=Raw(x-bpp) bytes
2808 // Do first set of 4 bytes
2809 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2810 punpckhbw mm1, mm0 // Unpack Low bytes of a
2811 movq mm2, [esi + ebx] // load b=Prior(x)
2812 punpcklbw mm2, mm0 // Unpack High bytes of b
2813 // pav = p - a = (a + b - c) - a = b - c
2815 punpckhbw mm3, mm0 // Unpack High bytes of c
2816 // pbv = p - b = (a + b - c) - b = a - c
2820 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2823 // pa = abs(p-a) = abs(pav)
2824 // pb = abs(p-b) = abs(pbv)
2825 // pc = abs(p-c) = abs(pcv)
2826 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2828 pand mm0, mm4 // Only pav bytes < 0 in mm7
2829 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2831 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2835 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2836 pand mm0, mm6 // Only pav bytes < 0 in mm7
2842 pcmpgtw mm7, mm5 // pa > pb?
2844 // use mm7 mask to merge pa & pb
2846 // use mm0 mask copy to merge a & b
2852 // test ((pa <= pb)? pa:pb) <= pc
2853 pcmpgtw mm7, mm6 // pab > pc?
2860 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2861 pand mm7, ActiveMask
2862 movq mm2, mm3 // load b=Prior(x) step 1
2863 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2864 punpcklbw mm3, mm0 // Unpack High bytes of c
2865 movq [edi + ebx], mm7 // write back updated value
2866 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2867 // Do second set of 4 bytes
2868 punpckhbw mm2, mm0 // Unpack Low bytes of b
2869 punpcklbw mm1, mm0 // Unpack Low bytes of a
2870 // pav = p - a = (a + b - c) - a = b - c
2872 // pbv = p - b = (a + b - c) - b = a - c
2876 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2879 // pa = abs(p-a) = abs(pav)
2880 // pb = abs(p-b) = abs(pbv)
2881 // pc = abs(p-c) = abs(pcv)
2882 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2884 pand mm0, mm4 // Only pav bytes < 0 in mm7
2885 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2887 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2891 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2892 pand mm0, mm6 // Only pav bytes < 0 in mm7
2898 pcmpgtw mm7, mm5 // pa > pb?
2900 // use mm7 mask to merge pa & pb
2902 // use mm0 mask copy to merge a & b
2908 // test ((pa <= pb)? pa:pb) <= pc
2909 pcmpgtw mm7, mm6 // pab > pc?
2916 // Step ex to next set of 8 bytes and repeat loop til done
2919 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2921 movq [edi + ebx - 8], mm1 // write back updated value
2922 // mm1 will be used as Raw(x-bpp) next loop
2929 ActiveMask.use = 0x00000000ffffffff;
2935 // PRIME the pump (load the first Raw(x-bpp) data set
2936 movq mm1, [edi+ebx-8] // Only time should need to read
2937 // a=Raw(x-bpp) bytes
2939 // Do first set of 4 bytes
2940 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2941 punpcklbw mm1, mm0 // Unpack Low bytes of a
2942 movq mm2, [esi + ebx] // load b=Prior(x)
2943 punpcklbw mm2, mm0 // Unpack Low bytes of b
2944 // pav = p - a = (a + b - c) - a = b - c
2946 punpcklbw mm3, mm0 // Unpack Low bytes of c
2947 // pbv = p - b = (a + b - c) - b = a - c
2951 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2954 // pa = abs(p-a) = abs(pav)
2955 // pb = abs(p-b) = abs(pbv)
2956 // pc = abs(p-c) = abs(pcv)
2957 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2959 pand mm0, mm4 // Only pav bytes < 0 in mm7
2960 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2962 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2966 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2967 pand mm0, mm6 // Only pav bytes < 0 in mm7
2973 pcmpgtw mm7, mm5 // pa > pb?
2975 // use mm7 mask to merge pa & pb
2977 // use mm0 mask copy to merge a & b
2983 // test ((pa <= pb)? pa:pb) <= pc
2984 pcmpgtw mm7, mm6 // pab > pc?
2991 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2992 pand mm7, ActiveMask
2993 movq mm2, [esi + ebx] // load b=Prior(x)
2994 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2995 punpckhbw mm3, mm0 // Unpack High bytes of c
2996 movq [edi + ebx], mm7 // write back updated value
2997 movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes
2999 // Do second set of 4 bytes
3000 punpckhbw mm2, mm0 // Unpack High bytes of b
3001 punpckhbw mm1, mm0 // Unpack High bytes of a
3002 // pav = p - a = (a + b - c) - a = b - c
3004 // pbv = p - b = (a + b - c) - b = a - c
3008 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3011 // pa = abs(p-a) = abs(pav)
3012 // pb = abs(p-b) = abs(pbv)
3013 // pc = abs(p-c) = abs(pcv)
3014 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
3016 pand mm0, mm4 // Only pav bytes < 0 in mm7
3017 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
3019 pand mm7, mm5 // Only pbv bytes < 0 in mm0
3023 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
3024 pand mm0, mm6 // Only pav bytes < 0 in mm7
3030 pcmpgtw mm7, mm5 // pa > pb?
3032 // use mm7 mask to merge pa & pb
3034 // use mm0 mask copy to merge a & b
3040 // test ((pa <= pb)? pa:pb) <= pc
3041 pcmpgtw mm7, mm6 // pab > pc?
3048 // Step ex to next set of 8 bytes and repeat loop til done
3051 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
3053 movq [edi + ebx - 8], mm1 // write back updated value
3054 // mm1 will be used as Raw(x-bpp) next loop
3070 // Do Paeth decode for remaining bytes
3072 xor ecx, ecx // zero ecx before using cl & cx in loop below
3073 sub edx, bpp // Set edx = ebx - bpp
3076 // pav = p - a = (a + b - c) - a = b - c
3077 mov al, [esi + ebx] // load Prior(x) into al
3078 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3079 sub eax, ecx // subtract Prior(x-bpp)
3080 mov patemp, eax // Save pav for later use
3082 // pbv = p - b = (a + b - c) - b = a - c
3083 mov al, [edi + edx] // load Raw(x-bpp) into al
3084 sub eax, ecx // subtract Prior(x-bpp)
3086 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3087 add eax, patemp // pcv = pav + pbv
3089 test eax, 0x80000000
3091 neg eax // reverse sign of neg values
3093 mov pctemp, eax // save pc for later use
3095 test ecx, 0x80000000
3097 neg ecx // reverse sign of neg values
3099 mov pbtemp, ecx // save pb for later use
3102 test eax, 0x80000000
3104 neg eax // reverse sign of neg values
3106 mov patemp, eax // save pa for later use
3110 // pa > pb; now test if pb <= pc
3113 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3114 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3117 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3118 mov cl, [esi + ebx] // load Prior(x) into cl
3121 // pa <= pb; now test if pa <= pc
3124 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3125 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3128 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3129 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3133 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3134 add [edi + ebx - 1], cl
3140 return; // No need to go further with this one
3141 } // end switch ( bpp )
3144 // MMX acceleration complete now do clean-up
3145 // Check if any remaining bytes left to decode
3151 // Do Paeth decode for remaining bytes
3153 xor ecx, ecx // zero ecx before using cl & cx in loop below
3154 sub edx, bpp // Set edx = ebx - bpp
3157 // pav = p - a = (a + b - c) - a = b - c
3158 mov al, [esi + ebx] // load Prior(x) into al
3159 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3160 sub eax, ecx // subtract Prior(x-bpp)
3161 mov patemp, eax // Save pav for later use
3163 // pbv = p - b = (a + b - c) - b = a - c
3164 mov al, [edi + edx] // load Raw(x-bpp) into al
3165 sub eax, ecx // subtract Prior(x-bpp)
3167 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3168 add eax, patemp // pcv = pav + pbv
3170 test eax, 0x80000000
3172 neg eax // reverse sign of neg values
3174 mov pctemp, eax // save pc for later use
3176 test ecx, 0x80000000
3178 neg ecx // reverse sign of neg values
3180 mov pbtemp, ecx // save pb for later use
3183 test eax, 0x80000000
3185 neg eax // reverse sign of neg values
3187 mov patemp, eax // save pa for later use
3191 // pa > pb; now test if pb <= pc
3194 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3195 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3198 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3199 mov cl, [esi + ebx] // load Prior(x) into cl
3202 // pa <= pb; now test if pa <= pc
3205 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3206 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3209 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3210 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3214 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3215 add [edi + ebx - 1], cl
3219 emms // End MMX instructions; prep for possible FP instrs.
3223 // Optimized code for PNG Sub filter decoder
3225 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3229 png_uint_32 FullLength;
3230 png_uint_32 MMXLength;
3233 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3234 FullLength = row_info->rowbytes - bpp; // # of bytes to filter
3237 mov esi, edi // lp = row
3238 add edi, bpp // rp = row + bpp
3240 // get # of bytes to alignment
3241 mov diff, edi // take start of row
3242 add diff, 0xf // add 7 + 8 to incr past
3243 // alignment boundary
3245 and diff, 0xfffffff8 // mask to alignment boundary
3246 sub diff, edi // subtract from start ==> value
3259 sub edx, ebx // subtract alignment fix
3260 and edx, 0x00000007 // calc bytes over mult of 8
3261 sub ecx, edx // drop over bytes from length
3265 // Now do the math for the rest of the row
3270 ActiveMask.use = 0x0000ffffff000000;
3271 ShiftBpp.use = 24; // == 3 * 8
3272 ShiftRem.use = 40; // == 64 - 24
3275 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3276 mov esi, edi // lp = row
3277 add edi, bpp // rp = row + bpp
3280 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3282 // PRIME the pump (load the first Raw(x-bpp) data set
3283 movq mm1, [edi+ebx-8]
3285 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3286 // no need for mask; shift clears inactive bytes
3287 // Add 1st active group
3290 // Add 2nd active group
3291 movq mm1, mm0 // mov updated Raws to mm1
3292 psllq mm1, ShiftBpp // shift data to position correctly
3293 pand mm1, mm7 // mask to use only 2nd active group
3295 // Add 3rd active group
3296 movq mm1, mm0 // mov updated Raws to mm1
3297 psllq mm1, ShiftBpp // shift data to position correctly
3298 pand mm1, mm6 // mask to use only 3rd active group
3302 movq [edi+ebx-8], mm0 // Write updated Raws back to array
// NOTE(review): This region is a partially-extracted fragment of libpng's
// png_read_filter_row_mmx_sub() (MSVC inline-asm Sub-filter decoder).  Each
// line begins with a fused original-file line number, and many intervening
// lines (loop labels, the switch(bpp) headers, paddb instructions) are
// missing from this copy — the code as it stands here is NOT compilable.
// Comments below describe only what the visible lines show; recover the
// missing lines from an intact pngvcrd.c before relying on this region.
//
// Sub filter: Raw(x) = Sub(x) + Raw(x-bpp).  The visible fragments cover
// the per-bpp MMX cases (3-byte, apparently a shifted-group case, a 2-byte
// case using ActiveMask, the 8-byte case with an 8-way unrolled loop) and
// the scalar/default tails.
3303 // Prep for doing 1st add at top of loop
3312 // Placed here just in case this is a duplicate of the
3313 // non-MMX code for the SUB filter in png_read_filter_row below
3318 // bpp = (row_info->pixel_depth + 7) >> 3;
3319 // for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3320 // i < row_info->rowbytes; i++, rp++, lp++)
3322 // *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
3329 mov esi, edi // lp = row
3331 add edi, bpp // rp = row + bpp
// ShiftBpp/ShiftRem appear to be 64-bit operand unions used as psllq/psrlq
// shift counts: bpp in bits, and its complement to 64 — TODO confirm their
// declarations, which are not in this fragment.
3348 ShiftBpp.use = bpp << 3;
3349 ShiftRem.use = 64 - ShiftBpp.use;
3353 mov esi, edi // lp = row
3354 add edi, bpp // rp = row + bpp
3355 // PRIME the pump (load the first Raw(x-bpp) data set
3356 movq mm1, [edi+ebx-8]
3358 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3359 // no need for mask; shift clears inactive bytes
// (the paddb that consumes mm1 here is missing from this copy)
3362 // Add 2nd active group
3363 movq mm1, mm0 // mov updated Raws to mm1
3364 psllq mm1, ShiftBpp // shift data to position correctly
3365 // there is no need for any mask
3366 // since shift clears inactive bits/bytes
3370 movq [edi+ebx-8], mm0
3371 movq mm1, mm0 // Prep for doing 1st add at top of loop
// --- bpp == 2 case: four 2-byte groups per quadword, selected via masks
// in mm7/mm6/mm5 that are shifted left 16 bits per group. ---
3379 ActiveMask.use = 0x00000000ffff0000;
3380 ShiftBpp.use = 16; // == 2 * 8
3381 ShiftRem.use = 48; // == 64 - 16
3383 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3387 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3389 mov esi, edi // lp = row
3391 add edi, bpp // rp = row + bpp
3392 psllq mm5, ShiftBpp // Move mask in mm5 to cover 4th active
3394 // PRIME the pump (load the first Raw(x-bpp) data set
3395 movq mm1, [edi+ebx-8]
3397 // Add 1st active group
3398 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3399 // no need for mask; shift clears inactive
3403 // Add 2nd active group
3404 movq mm1, mm0 // mov updated Raws to mm1
3405 psllq mm1, ShiftBpp // shift data to position correctly
3406 pand mm1, mm7 // mask to use only 2nd active group
3408 // Add 3rd active group
3409 movq mm1, mm0 // mov updated Raws to mm1
3410 psllq mm1, ShiftBpp // shift data to position correctly
3411 pand mm1, mm6 // mask to use only 3rd active group
3413 // Add 4th active group
3414 movq mm1, mm0 // mov updated Raws to mm1
3415 psllq mm1, ShiftBpp // shift data to position correctly
3416 pand mm1, mm5 // mask to use only 4th active group
3420 movq [edi+ebx-8], mm0 // Write updated Raws back to array
3421 movq mm1, mm0 // Prep for doing 1st add at top of loop
// --- bpp == 8 case: each 8-byte Sub(x) group is added to the previous
// 8-byte Raw group; the loop is unrolled 8x (64 bytes per iteration),
// interleaving loads and stores so each store frees the register that
// becomes the next group's Raw(x-bpp).  The paddb instructions between
// these load/store pairs are missing from this copy. ---
3431 mov esi, edi // lp = row
3432 add edi, bpp // rp = row + bpp
3434 movq mm7, [edi+ebx-8] // PRIME the pump (load the first
3435 // Raw(x-bpp) data set
3436 and ecx, 0x0000003f // calc bytes over mult of 64
3438 movq mm0, [edi+ebx] // Load Sub(x) for 1st 8 bytes
3440 movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes
3441 movq [edi+ebx], mm0 // Write Raw(x) for 1st 8 bytes
3442 // Now mm0 will be used as Raw(x-bpp) for
3443 // the 2nd group of 8 bytes. This will be
3444 // repeated for each group of 8 bytes with
3445 // the 8th group being used as the Raw(x-bpp)
3446 // for the 1st group of the next loop.
3448 movq mm2, [edi+ebx+16] // Load Sub(x) for 3rd 8 bytes
3449 movq [edi+ebx+8], mm1 // Write Raw(x) for 2nd 8 bytes
3451 movq mm3, [edi+ebx+24] // Load Sub(x) for 4th 8 bytes
3452 movq [edi+ebx+16], mm2 // Write Raw(x) for 3rd 8 bytes
3454 movq mm4, [edi+ebx+32] // Load Sub(x) for 5th 8 bytes
3455 movq [edi+ebx+24], mm3 // Write Raw(x) for 4th 8 bytes
3457 movq mm5, [edi+ebx+40] // Load Sub(x) for 6th 8 bytes
3458 movq [edi+ebx+32], mm4 // Write Raw(x) for 5th 8 bytes
3460 movq mm6, [edi+ebx+48] // Load Sub(x) for 7th 8 bytes
3461 movq [edi+ebx+40], mm5 // Write Raw(x) for 6th 8 bytes
3463 movq mm7, [edi+ebx+56] // Load Sub(x) for 8th 8 bytes
3464 movq [edi+ebx+48], mm6 // Write Raw(x) for 7th 8 bytes
3468 movq [edi+ebx-8], mm7 // Write Raw(x) for 8th 8 bytes
3477 movq [edi+ebx-8], mm0 // use -8 to offset early add to ebx
3478 movq mm7, mm0 // Move calculated Raw(x) data to mm1 to
3479 // be the new Raw(x-bpp) for the next loop
3486 default: // bpp greater than 8 bytes
3491 mov esi, edi // lp = row
3492 add edi, bpp // rp = row + bpp
3499 movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset
3506 } // end switch ( bpp )
// Scalar tail: presumably handles the bytes left over after the MMX loops
// — the loop body itself is missing from this copy; verify against an
// intact pngvcrd.c.
3513 mov esi, edi // lp = row
3515 add edi, bpp // rp = row + bpp
3523 emms // End MMX instructions; prep for possible FP instrs.
// NOTE(review): Partially-extracted fragment of png_read_filter_row_mmx_up()
// (Up filter: Raw(x) = Up(x) + Prior(x), i.e. each row byte gets the byte
// directly above it added, mod 256).  The function's return type, the _asm
// block braces, the paddb instructions, loop labels and jumps are missing
// from this copy; lines carry fused original-file numbers.  Recover the
// missing lines from an intact pngvcrd.c before relying on this region.
3527 // Optimized code for PNG Up filter decoder
3529 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3533 len = row_info->rowbytes; // # of bytes to filter
3536 // get # of bytes to alignment
// Byte-at-a-time alignment loop (body sampled out); the -1 offset
// compensates for an `inc ebx` that precedes this store in the original.
3551 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3556 sub edx, ebx // subtract alignment fix
3557 and edx, 0x0000003f // calc bytes over mult of 64
3558 sub ecx, edx // drop over bytes from length
3559 // Unrolled loop - use all MMX registers and interleave to reduce
3560 // number of branch instructions (loops) and reduce partial stalls
// 64 bytes per iteration: esi = prev_row, edi = row (presumably — the
// register setup lines are missing here; confirm against intact source).
// Loads from both rows are interleaved with stores of already-summed
// quadwords; the paddb between each pair is missing from this copy.
3564 movq mm3, [esi+ebx+8]
3566 movq mm2, [edi+ebx+8]
3569 movq mm5, [esi+ebx+16]
3570 movq [edi+ebx+8], mm2
3571 movq mm4, [edi+ebx+16]
3572 movq mm7, [esi+ebx+24]
3574 movq mm6, [edi+ebx+24]
3575 movq [edi+ebx+16], mm4
3577 movq mm1, [esi+ebx+32]
3578 movq [edi+ebx+24], mm6
3579 movq mm0, [edi+ebx+32]
3580 movq mm3, [esi+ebx+40]
3582 movq mm2, [edi+ebx+40]
3583 movq [edi+ebx+32], mm0
3585 movq mm5, [esi+ebx+48]
3586 movq [edi+ebx+40], mm2
3587 movq mm4, [edi+ebx+48]
3588 movq mm7, [esi+ebx+56]
3590 movq mm6, [edi+ebx+56]
3591 movq [edi+ebx+48], mm4
3595 movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
3596 // -8 to offset add ebx
3599 cmp edx, 0 // Test for bytes over mult of 64
3603 // 2 lines added by lcreeve@netins.net
3604 // (mail 11 Jul 98 in png-implement list)
3605 cmp edx, 8 //test for less than 8 bytes
3610 and edx, 0x00000007 // calc bytes over mult of 8
3611 sub ecx, edx // drop over bytes from length
3613 // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
3620 movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
3622 cmp edx, 0 // Test for bytes over mult of 8
3626 add ecx, edx // move over byte count into counter
3627 // Loop using x86 registers to update remaining bytes
3633 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3636 // Conversion of filtered row completed
3637 emms // End MMX instructions; prep for possible FP instrs.
// NOTE(review): Partially-extracted fragment of png_read_filter_row(), the
// C dispatcher that applies the PNG row filter `filter` in place to `row`,
// using `prev_row` for Up/Avg/Paeth.  For each filter it calls the MMX
// routine when the asm flag is set and pixel_depth/rowbytes meet the
// configured thresholds, else runs the scalar C fallback.  The `switch`
// headers, `break` statements, variable declarations and closing braces
// are missing from this copy (lines carry fused original-file numbers) —
// recover them from an intact pngvcrd.c before relying on this region.
3642 // Optimized png_read_filter_row routines
3644 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3645 row, png_bytep prev_row, int filter)
// mmx_supported starts as 2 ("not yet probed"); png_init_mmx_flags()
// should have resolved it to 0/1 before any row is filtered.
3651 if (mmx_supported == 2) {
3652 /* this should have happened in png_init_mmx_flags() already */
3653 png_warning(png_ptr, "asm_flags may not have been initialized");
// PNG_DEBUG-only block: build a label naming the filter and whether the
// MMX or x86 path will be taken, then log row number / depth / length.
3658 png_debug(1, "in png_read_filter_row\n");
3661 case 0: sprintf(filnm, "none");
3663 case 1: sprintf(filnm, "sub-%s",
3664 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
3666 case 2: sprintf(filnm, "up-%s",
3667 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
3669 case 3: sprintf(filnm, "avg-%s",
3670 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
3672 case 4: sprintf(filnm, "Paeth-%s",
3673 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
3675 default: sprintf(filnm, "unknw");
3678 png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3679 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3680 (int)((row_info->pixel_depth + 7) >> 3));
3681 png_debug1(0,"len=%8d, ", row_info->rowbytes);
3682 #endif /* PNG_DEBUG */
3686 case PNG_FILTER_VALUE_NONE:
// --- Sub: Raw(x) = Sub(x) + Raw(x-bpp); first bpp bytes are unchanged. ---
3689 case PNG_FILTER_VALUE_SUB:
3691 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
3692 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3693 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3695 png_read_filter_row_mmx_sub(row_info, row);
3700 png_uint_32 istop = row_info->rowbytes;
3701 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3702 png_bytep rp = row + bpp;
3705 for (i = bpp; i < istop; i++)
3707 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
// --- Up: Raw(x) = Up(x) + Prior(x) for every byte of the row. ---
3714 case PNG_FILTER_VALUE_UP:
3716 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
3717 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3718 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3720 png_read_filter_row_mmx_up(row_info, row, prev_row);
3725 png_uint_32 istop = row_info->rowbytes;
3727 png_bytep pp = prev_row;
3729 for (i = 0; i < istop; ++i)
3731 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
// --- Average: Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2);
// the first bpp bytes use only Prior(x)/2 since Raw(x-bpp) is absent. ---
3738 case PNG_FILTER_VALUE_AVG:
3740 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
3741 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3742 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3744 png_read_filter_row_mmx_avg(row_info, row, prev_row);
3750 png_bytep pp = prev_row;
3752 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3753 png_uint_32 istop = row_info->rowbytes - bpp;
3755 for (i = 0; i < bpp; i++)
3757 *rp = (png_byte)(((int)(*rp) +
3758 ((int)(*pp++) >> 1)) & 0xff);
3762 for (i = 0; i < istop; i++)
3764 *rp = (png_byte)(((int)(*rp) +
3765 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
// --- Paeth: Raw(x) = Paeth(x) + predictor(a,b,c) where a = Raw(x-bpp),
// b = Prior(x), c = Prior(x-bpp); predictor picks whichever of a/b/c is
// closest to a+b-c, with ties resolved a, then b, then c. ---
3772 case PNG_FILTER_VALUE_PAETH:
3774 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
3775 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3776 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3778 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3784 png_bytep pp = prev_row;
3786 png_bytep cp = prev_row;
3787 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3788 png_uint_32 istop=row_info->rowbytes - bpp;
3790 for (i = 0; i < bpp; i++)
3792 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3796 for (i = 0; i < istop; i++) // use leftover rp,pp
3798 int a, b, c, pa, pb, pc, p;
// At this point (per intact libpng source) p == b - c and pc == a - c;
// those assignments are among the lines missing from this copy.
3812 pa = p < 0 ? -p : p;
3813 pb = pc < 0 ? -pc : pc;
3814 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
// NOTE(review): the intact source selects the predictor either via this
// if/else ladder or via the single conditional expression below (two
// alternative #ifdef branches); both appear here because the lines
// between them were sampled out.
3818 if (pa <= pb && pa <= pc)
3826 p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3828 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
// default: unknown filter byte — warn and leave the row unfiltered.
3836 png_warning(png_ptr, "Ignoring bad row filter type");
3842 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */