45     alignas(32) const unsigned long long mask[2] = {0x00FF00FF00FF00FFull,
    46                                                     0x00FF00FF00FF00FFull};
    47     const __m128i m = _mm_load_si128((const __m128i*)mask);
    52     for (int i = 0; i < sh; i++)
    54         for (int j = 0; j < sw; j++)
    56             const __m128i here_sampled =
    57                 _mm_and_si128(_mm_load_si128((const __m128i*)in), m);
    59                 (__m128i*)out, _mm_packus_epi16(here_sampled, here_sampled));
    80                                                     0x00FF00FF00FF00FFull};
    82     __m128i m = _mm_load_si128((const __m128i*)mask);
    86     for (int i = 0; i < sh; i++)
    88         for (int j = 0; j < sw; j++)
    90             __m128i here = _mm_load_si128((const __m128i*)in);
    91             __m128i next = _mm_load_si128((const __m128i*)nextRow);
    92             here = _mm_avg_epu8(here, next);
    93             next = _mm_and_si128(_mm_srli_si128(here, 1), m);
    94             here = _mm_and_si128(here, m);
    95             here = _mm_avg_epu16(here, next);
    96             _mm_storel_epi64((__m128i*)out, _mm_packus_epi16(here, here));
   134 #endif  // end if MRPT_HAS_SSE2 
#define MRPT_MAX_ALIGN_BYTES
GLubyte GLubyte GLubyte GLubyte w
void image_SSE2_scale_half_smooth_1c8u(const uint8_t *in, uint8_t *out, int w, int h)
Average each 2x2 pixel block into a single pixel (arithmetic mean).
void image_SSE2_scale_half_1c8u(const uint8_t *in, uint8_t *out, int w, int h)
Subsample each 2x2 pixel block into a single pixel, keeping the first pixel and ignoring the other three.
float KLT_response_optimized()
KLT score at a given point of a grayscale image.