Main MRPT website > C++ reference for MRPT 1.5.7
CImage_SSE3.cpp
Go to the documentation of this file.
1 /* +---------------------------------------------------------------------------+
2  | Mobile Robot Programming Toolkit (MRPT) |
3  | http://www.mrpt.org/ |
4  | |
5  | Copyright (c) 2005-2017, Individual contributors, see AUTHORS file |
6  | See: http://www.mrpt.org/Authors - All rights reserved. |
7  | Released under BSD License. See details in http://www.mrpt.org/License |
8  +---------------------------------------------------------------------------+ */
9 
10 #include "base-precomp.h" // Precompiled headers
11 
12 // ---------------------------------------------------------------------------
13 // This file contains the SSE3/SSSE3 optimized functions for mrpt::utils::CImage
14 // See the sources and the doxygen documentation page "sse_optimizations" for more details.
15 // ---------------------------------------------------------------------------
16 #if MRPT_HAS_SSE3
17 
18 #include <mrpt/utils/CImage.h>
19 #include <mrpt/utils/SSE_types.h>
20 #include <mrpt/utils/SSE_macros.h>
21 #include "CImage_SSEx.h"
22 
23 
24 /** \addtogroup sse_optimizations
25  * SSE optimized functions
26  * @{
27  */
28 
29 /** Subsample each 2x2 pixel block into 1x1 pixel, taking the first pixel & ignoring the other 3
30  * - <b>Input format:</b> uint8_t, 3 channels (RGB or BGR)
31  * - <b>Output format:</b> uint8_t, 3 channels (RGB or BGR)
32  * - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in pixels), widthStep=w*3
33  * - <b>Notes:</b>
34  * - <b>Requires:</b> SSSE3
35  * - <b>Invoked from:</b> mrpt::utils::CImage::scaleHalf()
36  */
37 void image_SSSE3_scale_half_3c8u(const uint8_t* in, uint8_t* out, int w, int h)
38 {
39  MRPT_ALIGN16 const unsigned long long mask0[2] = { 0x0D0C080706020100ull, 0x808080808080800Eull }; // Long words are in inverse order due to little endianness
40  MRPT_ALIGN16 const unsigned long long mask1[2] = { 0x8080808080808080ull, 0x0E0A090804030280ull };
41  MRPT_ALIGN16 const unsigned long long mask2[2] = { 0x0C0B0A0605040080ull, 0x8080808080808080ull };
42  MRPT_ALIGN16 const unsigned long long mask3[2] = { 0x808080808080800Full, 0x8080808080808080ull };
43 
44  const __m128i m0 = _mm_load_si128((const __m128i*)mask0);
45  const __m128i m1 = _mm_load_si128((const __m128i*)mask1);
46  const __m128i m2 = _mm_load_si128((const __m128i*)mask2);
47  const __m128i m3 = _mm_load_si128((const __m128i*)mask3);
48 
49  const int sw = w >> 4; // This are the number of 3*16 blocks in each row
50  const int sh = h >> 1;
51 
52  int odd_row = 0;
53 
54  for (int i=0; i<sh; i++)
55  {
56  for (int j=0; j<sw; j++)
57  {
58  // 16-byte blocks #0,#1,#2:
59  __m128i d0 = _mm_load_si128((const __m128i*)in); in += 16;
60  __m128i d1 = _mm_load_si128((const __m128i*)in); in += 16;
61 
62  // First 16 bytes:
63  __m128i shuf0 = _mm_shuffle_epi8(d0,m0);
64  __m128i shuf1 = _mm_shuffle_epi8(d1,m1);
65 
66  __m128i res0 = _mm_or_si128(shuf0,shuf1);
67 
68  if ((odd_row&0x1)!=0)
69  _mm_storeu_si128((__m128i*)out,res0); // unaligned output
70  else _mm_store_si128 ((__m128i*)out,res0); // aligned output
71  out += 16;
72 
73  // Last 8 bytes:
74  __m128i d2 = _mm_load_si128((const __m128i*)in); in += 16;
75 
76  _mm_storel_epi64( // Write lower 8 bytes only
77  (__m128i*)out,
78  _mm_or_si128(_mm_shuffle_epi8(d2,m2),_mm_shuffle_epi8(d1,m3))
79  );
80  odd_row++;
81  out += 8;
82  }
83  in += 3*w;
84  }
85 }
86 
87 
88 // This is the actual function behind both: image_SSSE3_rgb_to_gray_8u() and image_SSSE3_bgr_to_gray_8u():
89 template <bool IS_RGB>
91 {
92  // Masks: 0 1 2 3 4 5 6 7 8 9 A B C D E F
93  BUILD_128BIT_CONST(mask0, 80,00, 80,03, 80,06, 80,09, 80,0C, 80,0F, 80,80, 80,80) // reds[0-7] from D0
94  BUILD_128BIT_CONST(mask1, 80,80, 80,80, 80,80, 80,80, 80,80, 80,80, 80,02, 80,05) // reds[0-7] from D1
95 
96  BUILD_128BIT_CONST(mask2, 80,01, 80,04, 80,07, 80,0A, 80,0D, 80,80, 80,80, 80,80) // greens[0-7] from D0
97  BUILD_128BIT_CONST(mask3, 80,80, 80,80, 80,80, 80,80, 80,80, 80,00, 80,03, 80,06) // greens[0-7] from D1
98 
99  BUILD_128BIT_CONST(mask4, 80,02, 80,05, 80,08, 80,0B, 80,0E, 80,80, 80,80, 80,80) // blues[0-7] from D0
100  BUILD_128BIT_CONST(mask5, 80,80, 80,80, 80,80, 80,80, 80,80, 80,01, 80,04, 80,07) // blues[0-7] from D1
101 
102 
103  BUILD_128BIT_CONST(mask6, 80,08, 80,0B, 80,0E, 80,80, 80,80, 80,80, 80,80, 80,80) // reds[8-15] from D1
104  BUILD_128BIT_CONST(mask7, 80,80, 80,80, 80,80, 80,01, 80,04, 80,07, 80,0A, 80,0D) // reds[8-15] from D2
105 
106  BUILD_128BIT_CONST(mask8, 80,09, 80,0C, 80,0F, 80,80, 80,80, 80,80, 80,80, 80,80) // greens[8-15] from D1
107  BUILD_128BIT_CONST(mask9, 80,80, 80,80, 80,80, 80,02, 80,05, 80,08, 80,0B, 80,0E) // greens[8-15] from D2
108 
109  BUILD_128BIT_CONST(mask10,80,0A, 80,0D, 80,80, 80,80, 80,80, 80,80, 80,80, 80,80) // blues[8-15] from D1
110  BUILD_128BIT_CONST(mask11,80,80, 80,80, 80,00, 80,03, 80,06, 80,09, 80,0C, 80,0F) // blues[8-15] from D2
111 
112 
113  BUILD_128BIT_CONST(mask_to_low, 01,03,05,07,09,0B,0D,0F, 80,80,80,80,80,80,80,80)
114 
115 
116  // Conversion factors for RGB->Y
117  BUILD_128BIT_CONST(val_red , 00,1D, 00,1D, 00,1D, 00,1D, 00,1D, 00,1D, 00,1D, 00,1D)
118  BUILD_128BIT_CONST(val_green , 00,96, 00,96, 00,96, 00,96, 00,96, 00,96, 00,96, 00,96)
119  BUILD_128BIT_CONST(val_blue , 00,4D, 00,4D, 00,4D, 00,4D, 00,4D, 00,4D, 00,4D, 00,4D)
120 
121  const __m128i m0 = _mm_load_si128( IS_RGB ? (const __m128i*)mask4 : (const __m128i*)mask0);
122  const __m128i m1 = _mm_load_si128( IS_RGB ? (const __m128i*)mask5 : (const __m128i*)mask1);
123  const __m128i m2 = _mm_load_si128((const __m128i*)mask2);
124  const __m128i m3 = _mm_load_si128((const __m128i*)mask3);
125  const __m128i m4 = _mm_load_si128( IS_RGB ? (const __m128i*)mask0 : (const __m128i*)mask4);
126  const __m128i m5 = _mm_load_si128( IS_RGB ? (const __m128i*)mask1 : (const __m128i*)mask5);
127 
128  const __m128i m6 = _mm_load_si128( IS_RGB ? (const __m128i*)mask10 : (const __m128i*)mask6);
129  const __m128i m7 = _mm_load_si128( IS_RGB ? (const __m128i*)mask11 : (const __m128i*)mask7);
130  const __m128i m8 = _mm_load_si128((const __m128i*)mask8);
131  const __m128i m9 = _mm_load_si128((const __m128i*)mask9);
132  const __m128i m10= _mm_load_si128( IS_RGB ? (const __m128i*)mask6 : (const __m128i*)mask10);
133  const __m128i m11= _mm_load_si128( IS_RGB ? (const __m128i*)mask7 : (const __m128i*)mask11);
134 
135  const __m128i mask_low= _mm_load_si128((const __m128i*)mask_to_low);
136 
137  const __m128i VAL_R = _mm_load_si128((const __m128i*)val_red);
138  const __m128i VAL_G = _mm_load_si128((const __m128i*)val_green);
139  const __m128i VAL_B = _mm_load_si128((const __m128i*)val_blue);
140 
141  const int sw = w >> 4; // This are the number of 3*16 blocks in each row
142  const int sh = h ;
143 
144  for (int i=0; i<sh; i++)
145  {
146  for (int j=0; j<sw; j++)
147  {
148  // We process RGB data in blocks of 3 x 16byte blocks:
149  const __m128i d0 = _mm_load_si128((const __m128i*)in); in += 16;
150  const __m128i d1 = _mm_load_si128((const __m128i*)in); in += 16;
151  const __m128i d2 = _mm_load_si128((const __m128i*)in); in += 16;
152 
153  // First 8 bytes of gray levels:
154  {
155  const __m128i BLUES_0_7 = _mm_or_si128(_mm_shuffle_epi8(d0,m0),_mm_shuffle_epi8(d1,m1));
156  const __m128i GREENS_0_7 = _mm_or_si128(_mm_shuffle_epi8(d0,m2),_mm_shuffle_epi8(d1,m3));
157  const __m128i REDS_0_7 = _mm_or_si128(_mm_shuffle_epi8(d0,m4),_mm_shuffle_epi8(d1,m5));
158 
159  // _mm_mulhi_epu16(): Multiplies the 8 unsigned 16-bit integers from a by the 8 unsigned 16-bit integers from b.
160  //r0 := (a0 * b0)[31:16]
161  //r1 := (a1 * b1)[31:16]
162  //...
163  //r7 := (a7 * b7)[31:16]
164  //
165  const __m128i GRAYS_0_7 =
166  _mm_adds_epu16(
167  _mm_mulhi_epu16(REDS_0_7, VAL_R),
168  _mm_adds_epu16(
169  _mm_mulhi_epu16(GREENS_0_7, VAL_G),
170  _mm_mulhi_epu16(BLUES_0_7, VAL_B)
171  ));
172 
173  _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi8(GRAYS_0_7,mask_low));
174  out+=8;
175  }
176 
177  // Second 8 bytes of gray levels:
178  {
179  const __m128i BLUES_8_15 = _mm_or_si128(_mm_shuffle_epi8(d1,m6),_mm_shuffle_epi8(d2,m7));
180  const __m128i GREENS_8_15 = _mm_or_si128(_mm_shuffle_epi8(d1,m8),_mm_shuffle_epi8(d2,m9));
181  const __m128i REDS_8_15 = _mm_or_si128(_mm_shuffle_epi8(d1,m10),_mm_shuffle_epi8(d2,m11));
182 
183  const __m128i GRAYS_8_15 =
184  _mm_adds_epu16(
185  _mm_mulhi_epu16(REDS_8_15, VAL_R),
186  _mm_adds_epu16(
187  _mm_mulhi_epu16(GREENS_8_15, VAL_G),
188  _mm_mulhi_epu16(BLUES_8_15, VAL_B)
189  ));
190 
191  _mm_storel_epi64((__m128i*)out, _mm_shuffle_epi8(GRAYS_8_15,mask_low));
192  out+=8;
193  }
194  }
195  }
196 
197 } // end private_image_SSSE3_rgb_or_bgr_to_gray_8u()
198 
199 
200 /** Convert a RGB image (3cu8) into a GRAYSCALE (1c8u) image, using Y=77*R+150*G+29*B
201  * - <b>Input format:</b> uint8_t, 3 channels (BGR order)
202  * - <b>Output format:</b> uint8_t, 1 channel
203  * - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in pixels), widthStep=w*3 and w*1
204  * - <b>Notes:</b>
205  * - <b>Requires:</b> SSSE3
206  * - <b>Invoked from:</b> mrpt::utils::CImage::grayscale(), mrpt::utils::CImage::grayscaleInPlace()
207  */
208 void image_SSSE3_bgr_to_gray_8u(const uint8_t* in, uint8_t* out, int w, int h)
209 {
210  private_image_SSSE3_rgb_or_bgr_to_gray_8u<false>(in,out,w,h);
211 }
212 
213 /** Convert a RGB image (3cu8) into a GRAYSCALE (1c8u) image, using Y=77*R+150*G+29*B
214  * - <b>Input format:</b> uint8_t, 3 channels (RGB order)
215  * - <b>Output format:</b> uint8_t, 1 channel
216  * - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in pixels), widthStep=w*3 and w*1
217  * - <b>Notes:</b>
218  * - <b>Requires:</b> SSSE3
219  * - <b>Invoked from:</b> mrpt::utils::CImage::grayscale(), mrpt::utils::CImage::grayscaleInPlace()
220  */
221 void image_SSSE3_rgb_to_gray_8u(const uint8_t* in, uint8_t* out, int w, int h)
222 {
223  private_image_SSSE3_rgb_or_bgr_to_gray_8u<true>(in,out,w,h);
224 }
225 
226 
227 /** @} */
228 
229 #endif // end of MRPT_HAS_SSE3
void image_SSSE3_bgr_to_gray_8u(const uint8_t *in, uint8_t *out, int w, int h)
Convert a RGB image (3cu8) into a GRAYSCALE (1c8u) image, using Y=77*R+150*G+29*B.
GLubyte GLubyte GLubyte GLubyte w
Definition: glext.h:3962
unsigned char uint8_t
Definition: rptypes.h:43
void image_SSSE3_scale_half_3c8u(const uint8_t *in, uint8_t *out, int w, int h)
Subsample each 2x2 pixel block into 1x1 pixel, taking the first pixel & ignoring the other 3...
Definition: CImage_SSE3.cpp:37
GLuint in
Definition: glext.h:6301
#define BUILD_128BIT_CONST(_name, B0, B1, B2, B3, B4, B5, B6, B7, B8, B9, B10, B11, B12, B13, B14, B15)
Definition: SSE_macros.h:14
void image_SSSE3_rgb_to_gray_8u(const uint8_t *in, uint8_t *out, int w, int h)
Convert a RGB image (3cu8) into a GRAYSCALE (1c8u) image, using Y=77*R+150*G+29*B.
#define MRPT_ALIGN16
void private_image_SSSE3_rgb_or_bgr_to_gray_8u(const uint8_t *in, uint8_t *out, int w, int h)
Definition: CImage_SSE3.cpp:90



Page generated by Doxygen 1.8.14 for MRPT 1.5.7 Git: 5902e14cc Wed Apr 24 15:04:01 2019 +0200 at lun oct 28 01:39:17 CET 2019