MRPT  1.9.9
CImage_SSE3.cpp
Go to the documentation of this file.
1 /* +------------------------------------------------------------------------+
2  | Mobile Robot Programming Toolkit (MRPT) |
3  | http://www.mrpt.org/ |
4  | |
5  | Copyright (c) 2005-2018, Individual contributors, see AUTHORS file |
6  | See: http://www.mrpt.org/Authors - All rights reserved. |
7  | Released under BSD License. See details in http://www.mrpt.org/License |
8  +------------------------------------------------------------------------+ */
9 
10 #include "img-precomp.h" // Precompiled headers
11 
12 // ---------------------------------------------------------------------------
13 // This file contains the SSE3/SSSE3 optimized functions for
14 // mrpt::img::CImage
15 // See the sources and the doxygen documentation page "sse_optimizations" for
16 // more details.
17 // ---------------------------------------------------------------------------
18 #if MRPT_HAS_SSE3
19 
20 #include <mrpt/img/CImage.h>
21 #include <mrpt/core/SSE_types.h>
22 #include <mrpt/core/SSE_macros.h>
23 #include "CImage_SSEx.h"
24 
25 /** \addtogroup sse_optimizations
26  * SSE optimized functions
27  * @{
28  */
29 
30 /** Subsample each 2x2 pixel block into 1x1 pixel, taking the first pixel &
31  * ignoring the other 3
32  * - <b>Input format:</b> uint8_t, 3 channels (RGB or BGR)
33  * - <b>Output format:</b> uint8_t, 3 channels (RGB or BGR)
34  * - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in
35  * pixels), widthStep=w*3
36  * - <b>Notes:</b>
37  * - <b>Requires:</b> SSSE3
38  * - <b>Invoked from:</b> mrpt::img::CImage::scaleHalf()
39  */
40 void image_SSSE3_scale_half_3c8u(const uint8_t* in, uint8_t* out, int w, int h)
41 {
42  alignas(MRPT_MAX_ALIGN_BYTES) const unsigned long long mask0[2] = {
43  0x0D0C080706020100ull, 0x808080808080800Eull}; // Long words are in
44  // inverse order due to
45  // little endianness
46  alignas(MRPT_MAX_ALIGN_BYTES) const unsigned long long mask1[2] = {0x8080808080808080ull,
47  0x0E0A090804030280ull};
48  alignas(MRPT_MAX_ALIGN_BYTES) const unsigned long long mask2[2] = {0x0C0B0A0605040080ull,
49  0x8080808080808080ull};
50  alignas(MRPT_MAX_ALIGN_BYTES) const unsigned long long mask3[2] = {0x808080808080800Full,
51  0x8080808080808080ull};
52 
53  const __m128i m0 = _mm_load_si128((const __m128i*)mask0);
54  const __m128i m1 = _mm_load_si128((const __m128i*)mask1);
55  const __m128i m2 = _mm_load_si128((const __m128i*)mask2);
56  const __m128i m3 = _mm_load_si128((const __m128i*)mask3);
57 
58  const int sw = w >> 4; // This are the number of 3*16 blocks in each row
59  const int sh = h >> 1;
60 
61  int odd_row = 0;
62 
63  for (int i = 0; i < sh; i++)
64  {
65  for (int j = 0; j < sw; j++)
66  {
67  // 16-byte blocks #0,#1,#2:
68  __m128i d0 = _mm_load_si128((const __m128i*)in);
69  in += 16;
70  __m128i d1 = _mm_load_si128((const __m128i*)in);
71  in += 16;
72 
73  // First 16 bytes:
74  __m128i shuf0 = _mm_shuffle_epi8(d0, m0);
75  __m128i shuf1 = _mm_shuffle_epi8(d1, m1);
76 
77  __m128i res0 = _mm_or_si128(shuf0, shuf1);
78 
79  if ((odd_row & 0x1) != 0)
80  _mm_storeu_si128((__m128i*)out, res0); // unaligned output
81  else
82  _mm_store_si128((__m128i*)out, res0); // aligned output
83  out += 16;
84 
85  // Last 8 bytes:
86  __m128i d2 = _mm_load_si128((const __m128i*)in);
87  in += 16;
88 
89  _mm_storel_epi64( // Write lower 8 bytes only
90  (__m128i*)out,
91  _mm_or_si128(
92  _mm_shuffle_epi8(d2, m2), _mm_shuffle_epi8(d1, m3)));
93  odd_row++;
94  out += 8;
95  }
96  in += 3 * w;
97  }
98 }
99 
100 // This is the actual function behind both: image_SSSE3_rgb_to_gray_8u() and
101 // image_SSSE3_bgr_to_gray_8u():
102 template <bool IS_RGB>
104  const uint8_t* in, uint8_t* out, int w, int h)
105 {
106  // Masks: 0 1 2 3 4 5 6 7 8 9 A B C D E
107  // F
109  mask0, 80, 00, 80, 03, 80, 06, 80, 09, 80, 0C, 80, 0F, 80, 80, 80,
110  80) // reds[0-7] from D0
112  mask1, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 02, 80,
113  05) // reds[0-7] from D1
114 
116  mask2, 80, 01, 80, 04, 80, 07, 80, 0A, 80, 0D, 80, 80, 80, 80, 80,
117  80) // greens[0-7] from D0
119  mask3, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 00, 80, 03, 80,
120  06) // greens[0-7] from D1
121 
123  mask4, 80, 02, 80, 05, 80, 08, 80, 0B, 80, 0E, 80, 80, 80, 80, 80,
124  80) // blues[0-7] from D0
126  mask5, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 01, 80, 04, 80,
127  07) // blues[0-7] from D1
128 
130  mask6, 80, 08, 80, 0B, 80, 0E, 80, 80, 80, 80, 80, 80, 80, 80, 80,
131  80) // reds[8-15] from D1
133  mask7, 80, 80, 80, 80, 80, 80, 80, 01, 80, 04, 80, 07, 80, 0A, 80,
134  0D) // reds[8-15] from D2
135 
137  mask8, 80, 09, 80, 0C, 80, 0F, 80, 80, 80, 80, 80, 80, 80, 80, 80,
138  80) // greens[8-15] from D1
140  mask9, 80, 80, 80, 80, 80, 80, 80, 02, 80, 05, 80, 08, 80, 0B, 80,
141  0E) // greens[8-15] from D2
142 
144  mask10, 80, 0A, 80, 0D, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
145  80) // blues[8-15] from D1
147  mask11, 80, 80, 80, 80, 80, 00, 80, 03, 80, 06, 80, 09, 80, 0C, 80,
148  0F) // blues[8-15] from D2
149 
151  mask_to_low, 01, 03, 05, 07, 09, 0B, 0D, 0F, 80, 80, 80, 80, 80, 80, 80,
152  80)
153 
154  // Conversion factors for RGB->Y
156  val_red, 00, 1D, 00, 1D, 00, 1D, 00, 1D, 00, 1D, 00, 1D, 00, 1D, 00, 1D)
158  val_green, 00, 96, 00, 96, 00, 96, 00, 96, 00, 96, 00, 96, 00, 96, 00,
159  96)
161  val_blue, 00, 4D, 00, 4D, 00, 4D, 00, 4D, 00, 4D, 00, 4D, 00, 4D, 00,
162  4D)
163 
164  const __m128i m0 =
165  _mm_load_si128(IS_RGB ? (const __m128i*)mask4 : (const __m128i*)mask0);
166  const __m128i m1 =
167  _mm_load_si128(IS_RGB ? (const __m128i*)mask5 : (const __m128i*)mask1);
168  const __m128i m2 = _mm_load_si128((const __m128i*)mask2);
169  const __m128i m3 = _mm_load_si128((const __m128i*)mask3);
170  const __m128i m4 =
171  _mm_load_si128(IS_RGB ? (const __m128i*)mask0 : (const __m128i*)mask4);
172  const __m128i m5 =
173  _mm_load_si128(IS_RGB ? (const __m128i*)mask1 : (const __m128i*)mask5);
174 
175  const __m128i m6 =
176  _mm_load_si128(IS_RGB ? (const __m128i*)mask10 : (const __m128i*)mask6);
177  const __m128i m7 =
178  _mm_load_si128(IS_RGB ? (const __m128i*)mask11 : (const __m128i*)mask7);
179  const __m128i m8 = _mm_load_si128((const __m128i*)mask8);
180  const __m128i m9 = _mm_load_si128((const __m128i*)mask9);
181  const __m128i m10 =
182  _mm_load_si128(IS_RGB ? (const __m128i*)mask6 : (const __m128i*)mask10);
183  const __m128i m11 =
184  _mm_load_si128(IS_RGB ? (const __m128i*)mask7 : (const __m128i*)mask11);
185 
186  const __m128i mask_low = _mm_load_si128((const __m128i*)mask_to_low);
187 
188  const __m128i VAL_R = _mm_load_si128((const __m128i*)val_red);
189  const __m128i VAL_G = _mm_load_si128((const __m128i*)val_green);
190  const __m128i VAL_B = _mm_load_si128((const __m128i*)val_blue);
191 
192  const int sw = w >> 4; // This are the number of 3*16 blocks in each row
193  const int sh = h;
194 
195  for (int i = 0; i < sh; i++)
196  {
197  for (int j = 0; j < sw; j++)
198  {
199  // We process RGB data in blocks of 3 x 16byte blocks:
200  const __m128i d0 = _mm_load_si128((const __m128i*)in);
201  in += 16;
202  const __m128i d1 = _mm_load_si128((const __m128i*)in);
203  in += 16;
204  const __m128i d2 = _mm_load_si128((const __m128i*)in);
205  in += 16;
206 
207  // First 8 bytes of gray levels:
208  {
209  const __m128i BLUES_0_7 = _mm_or_si128(
210  _mm_shuffle_epi8(d0, m0), _mm_shuffle_epi8(d1, m1));
211  const __m128i GREENS_0_7 = _mm_or_si128(
212  _mm_shuffle_epi8(d0, m2), _mm_shuffle_epi8(d1, m3));
213  const __m128i REDS_0_7 = _mm_or_si128(
214  _mm_shuffle_epi8(d0, m4), _mm_shuffle_epi8(d1, m5));
215 
216  // _mm_mulhi_epu16(): Multiplies the 8 unsigned 16-bit integers
217  // from a by the 8 unsigned 16-bit integers from b.
218  // r0 := (a0 * b0)[31:16]
219  // r1 := (a1 * b1)[31:16]
220  //...
221  // r7 := (a7 * b7)[31:16]
222  //
223  const __m128i GRAYS_0_7 = _mm_adds_epu16(
224  _mm_mulhi_epu16(REDS_0_7, VAL_R),
225  _mm_adds_epu16(
226  _mm_mulhi_epu16(GREENS_0_7, VAL_G),
227  _mm_mulhi_epu16(BLUES_0_7, VAL_B)));
228 
229  _mm_storel_epi64(
230  (__m128i*)out, _mm_shuffle_epi8(GRAYS_0_7, mask_low));
231  out += 8;
232  }
233 
234  // Second 8 bytes of gray levels:
235  {
236  const __m128i BLUES_8_15 = _mm_or_si128(
237  _mm_shuffle_epi8(d1, m6), _mm_shuffle_epi8(d2, m7));
238  const __m128i GREENS_8_15 = _mm_or_si128(
239  _mm_shuffle_epi8(d1, m8), _mm_shuffle_epi8(d2, m9));
240  const __m128i REDS_8_15 = _mm_or_si128(
241  _mm_shuffle_epi8(d1, m10), _mm_shuffle_epi8(d2, m11));
242 
243  const __m128i GRAYS_8_15 = _mm_adds_epu16(
244  _mm_mulhi_epu16(REDS_8_15, VAL_R),
245  _mm_adds_epu16(
246  _mm_mulhi_epu16(GREENS_8_15, VAL_G),
247  _mm_mulhi_epu16(BLUES_8_15, VAL_B)));
248 
249  _mm_storel_epi64(
250  (__m128i*)out, _mm_shuffle_epi8(GRAYS_8_15, mask_low));
251  out += 8;
252  }
253  }
254  }
255 
256 } // end private_image_SSSE3_rgb_or_bgr_to_gray_8u()
257 
258 /** Convert a RGB image (3cu8) into a GRAYSCALE (1c8u) image, using
259  * Y=77*R+150*G+29*B
260  * - <b>Input format:</b> uint8_t, 3 channels (BGR order)
261  * - <b>Output format:</b> uint8_t, 1 channel
262  * - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in
263  * pixels), widthStep=w*3 and w*1
264  * - <b>Notes:</b>
265  * - <b>Requires:</b> SSSE3
266  * - <b>Invoked from:</b> mrpt::img::CImage::grayscale(),
267  * mrpt::img::CImage::grayscaleInPlace()
268  */
269 void image_SSSE3_bgr_to_gray_8u(const uint8_t* in, uint8_t* out, int w, int h)
270 {
271  private_image_SSSE3_rgb_or_bgr_to_gray_8u<false>(in, out, w, h);
272 }
273 
274 /** Convert a RGB image (3cu8) into a GRAYSCALE (1c8u) image, using
275  * Y=77*R+150*G+29*B
276  * - <b>Input format:</b> uint8_t, 3 channels (RGB order)
277  * - <b>Output format:</b> uint8_t, 1 channel
278  * - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in
279  * pixels), widthStep=w*3 and w*1
280  * - <b>Notes:</b>
281  * - <b>Requires:</b> SSSE3
282  * - <b>Invoked from:</b> mrpt::img::CImage::grayscale(),
283  * mrpt::img::CImage::grayscaleInPlace()
284  */
285 void image_SSSE3_rgb_to_gray_8u(const uint8_t* in, uint8_t* out, int w, int h)
286 {
287  private_image_SSSE3_rgb_or_bgr_to_gray_8u<true>(in, out, w, h);
288 }
289 
290 /** @} */
291 
292 #endif // end of MRPT_HAS_SSE3
#define MRPT_MAX_ALIGN_BYTES
void image_SSSE3_bgr_to_gray_8u(const uint8_t *in, uint8_t *out, int w, int h)
Convert a BGR image (3c8u) into a GRAYSCALE (1c8u) image, using Y=77*R+150*G+29*B.
#define BUILD_128BIT_CONST( _name, B0, B1, B2, B3, B4, B5, B6, B7, B8, B9, B10, B11, B12, B13, B14, B15)
Definition: SSE_macros.h:13
GLubyte GLubyte GLubyte GLubyte w
Definition: glext.h:4178
unsigned char uint8_t
Definition: rptypes.h:41
void image_SSSE3_scale_half_3c8u(const uint8_t *in, uint8_t *out, int w, int h)
Subsample each 2x2 pixel block into 1x1 pixel, taking the first pixel & ignoring the other 3...
Definition: CImage_SSE3.cpp:40
GLuint in
Definition: glext.h:7274
void image_SSSE3_rgb_to_gray_8u(const uint8_t *in, uint8_t *out, int w, int h)
Convert an RGB image (3c8u) into a GRAYSCALE (1c8u) image, using Y=77*R+150*G+29*B.
void private_image_SSSE3_rgb_or_bgr_to_gray_8u(const uint8_t *in, uint8_t *out, int w, int h)



Page generated by Doxygen 1.8.14 for MRPT 1.9.9 Git: 7d5e6d718 Fri Aug 24 01:51:28 2018 +0200 at lun nov 2 08:35:50 CET 2020