MRPT  1.9.9
CImage_SSE2.cpp
Go to the documentation of this file.
1 /* +------------------------------------------------------------------------+
2  | Mobile Robot Programming Toolkit (MRPT) |
3  | http://www.mrpt.org/ |
4  | |
5  | Copyright (c) 2005-2018, Individual contributors, see AUTHORS file |
6  | See: http://www.mrpt.org/Authors - All rights reserved. |
7  | Released under BSD License. See details in http://www.mrpt.org/License |
8  +------------------------------------------------------------------------+ */
9 
10 #include "img-precomp.h" // Precompiled headers
11 
12 #if MRPT_HAS_SSE2
13 // ---------------------------------------------------------------------------
14 // This file contains the SSE2 optimized functions for mrpt::img::CImage
15 // See the sources and the doxygen documentation page "sse_optimizations" for
16 // more details.
17 //
18 // Some functions here are derived from sources in libcvd, released
19 // under BSD. https://www.edwardrosten.com/cvd/
20 //
21 // ---------------------------------------------------------------------------
22 
23 #include <mrpt/img/CImage.h>
24 #include <mrpt/core/SSE_types.h>
25 #include <mrpt/core/SSE_macros.h>
26 #include "CImage_SSEx.h"
27 
28 /** \addtogroup sse_optimizations
29  * SSE optimized functions
30  * @{
31  */
32 
33 /** Subsample each 2x2 pixel block into 1x1 pixel, taking the first pixel &
34  * ignoring the other 3
35  * - <b>Input format:</b> uint8_t, 1 channel
36  * - <b>Output format:</b> uint8_t, 1 channel
37  * - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in
38  * pixels), widthStep=w*1
39  * - <b>Notes:</b> Each pair of input rows yields one output row of w/2 bytes, built from the even columns of the first row of the pair; with an odd h the last row is never read.
40  * - <b>Requires:</b> SSE2
41  * - <b>Invoked from:</b> mrpt::img::CImage::scaleHalf()
42  */
43 void image_SSE2_scale_half_1c8u(const uint8_t* in, uint8_t* out, int w, int h)
44 {
45  alignas(MRPT_MAX_ALIGN_BYTES) const unsigned long long mask[2] = {0x00FF00FF00FF00FFull, // same alignment macro as the smooth variant below
46  0x00FF00FF00FF00FFull}; // 0x00FF per 16-bit lane: keep the low byte, zero the high byte
47  const __m128i m = _mm_load_si128((const __m128i*)mask);
48 
49  int sw = w >> 4; // 16-byte groups per input row
50  int sh = h >> 1; // output rows (one per pair of input rows)
51 
52  for (int i = 0; i < sh; i++)
53  {
54  for (int j = 0; j < sw; j++)
55  {
56  const __m128i here_sampled =
57  _mm_and_si128(_mm_load_si128((const __m128i*)in), m); // 16 input pixels -> 8 kept pixels, one per 16-bit lane
58  _mm_storel_epi64( // writes only the low 8 bytes
59  (__m128i*)out, _mm_packus_epi16(here_sampled, here_sampled)); // narrow the eight 16-bit lanes back to bytes
60  in += 16;
61  out += 8;
62  }
63  in += w; // skip the second row of the pair, which is ignored
64  }
65 }
66 
67 /** Average each 2x2 pixels into 1x1 pixel (arithmetic average)
68  * - <b>Input format:</b> uint8_t, 1 channel
69  * - <b>Output format:</b> uint8_t, 1 channel
70  * - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in
71  * pixels), widthStep=w*1
72  * - <b>Notes:</b> The 2x2 mean is computed as two successive averages: vertical (_mm_avg_epu8 of the two rows), then horizontal (_mm_avg_epu16 of neighbouring columns).
73  * - <b>Requires:</b> SSE2
74  * - <b>Invoked from:</b> mrpt::img::CImage::scaleHalfSmooth()
75  */
76 void image_SSE2_scale_half_smooth_1c8u(
77  const uint8_t* in, uint8_t* out, int w, int h)
78 {
79  alignas(MRPT_MAX_ALIGN_BYTES) const unsigned long long mask[2] = {0x00FF00FF00FF00FFull,
80  0x00FF00FF00FF00FFull}; // 0x00FF per 16-bit lane: keep the low byte, zero the high byte
81  const uint8_t* nextRow = in + w; // row directly below `in` (row stride is w bytes)
82  __m128i m = _mm_load_si128((const __m128i*)mask);
83  int sw = w >> 4; // 16-byte groups per input row
84  int sh = h >> 1; // output rows (one per pair of input rows)
85 
86  for (int i = 0; i < sh; i++)
87  {
88  for (int j = 0; j < sw; j++)
89  {
90  __m128i here = _mm_load_si128((const __m128i*)in);
91  __m128i next = _mm_load_si128((const __m128i*)nextRow);
92  here = _mm_avg_epu8(here, next); // vertical average of the two rows, per byte
93  next = _mm_and_si128(_mm_srli_si128(here, 1), m); // odd columns moved into the low byte of each 16-bit lane
94  here = _mm_and_si128(here, m); // even columns in the low byte of each 16-bit lane
95  here = _mm_avg_epu16(here, next); // horizontal average of each even/odd column pair
96  _mm_storel_epi64((__m128i*)out, _mm_packus_epi16(here, here)); // narrow to bytes and write the low 8 bytes
97  in += 16;
98  nextRow += 16;
99  out += 8;
100  }
101 
102  in += w; // jump over the row already consumed through nextRow
103  nextRow += w;
104  }
105 }
106 
107 /** KLT score at a given point of a grayscale image.
108  * - <b>Requires:</b> SSE2
109  * - <b>Invoked from:</b> mrpt::img::CImage::KLT_response()
110  *
111  * This function is not manually optimized for SSE2 but templatized for
112  * different
113  * window sizes so that the compiler can optimize automatically for each
114  * size.
115  *
116  * These templates are instantiated only for the most common window sizes
117  * (W=[2-16] and W=32),
118  * falling back to
119  * a generic implementation otherwise. The next figure shows the performance
120  * (time for
121  * KLT_response() to compute the score for one single pixel) for different
122  * window sizes.
123  *
124  * <img src="KLT_response_performance_SSE2.png" >
125  * NOTE(review): only this parameterless declaration is visible here; the templated implementation is presumably defined elsewhere - confirm.
126  */
127 float KLT_response_optimized();
128 
129 // TODO:
130 // Sum of absolute differences: Use _mm_sad_epu8
131 
132 /** @} */
133 
134 #endif // end if MRPT_HAS_SSE2
GLenum GLint GLuint mask
Definition: glext.h:4050
#define MRPT_MAX_ALIGN_BYTES
GLubyte GLubyte GLubyte GLubyte w
Definition: glext.h:4178
unsigned char uint8_t
Definition: rptypes.h:41
void image_SSE2_scale_half_smooth_1c8u(const uint8_t *in, uint8_t *out, int w, int h)
Average each 2x2 pixels into 1x1 pixel (arithmetic average)
Definition: CImage_SSE2.cpp:76
void image_SSE2_scale_half_1c8u(const uint8_t *in, uint8_t *out, int w, int h)
Subsample each 2x2 pixel block into 1x1 pixel, taking the first pixel & ignoring the other 3...
Definition: CImage_SSE2.cpp:43
GLuint in
Definition: glext.h:7274
float KLT_response_optimized()
KLT score at a given point of a grayscale image.



Page generated by Doxygen 1.8.14 for MRPT 1.9.9 Git: 7d5e6d718 Fri Aug 24 01:51:28 2018 +0200 at lun nov 2 08:35:50 CET 2020