_mm_maddubs_epi16

Microsoft Specific

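Emits the Supplemental Streaming SIMD Extensions 3 (SSSE3) instruction pmaddubsw. This instruction multiplies corresponding unsigned and signed 8-bit integers and adds each adjacent pair of products, with signed 16-bit saturation.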

/*
 * Multiplies each unsigned 8-bit element of a by the corresponding signed
 * 8-bit element of b, then adds adjacent pairs of products with signed
 * 16-bit saturation.
 *
 * a: sixteen 8-bit unsigned integers.
 * b: sixteen 8-bit signed integers.
 * Returns eight 16-bit signed integers (the saturated pair sums).
 */
__m128i _mm_maddubs_epi16( 
   __m128i a,
   __m128i b
);

Parameters

  • [in] a
    A 128-bit parameter that contains sixteen 8-bit unsigned integers.

  • [in] b
    A 128-bit parameter that contains sixteen 8-bit signed integers.

Return value

A 128-bit result that contains eight 16-bit signed integers, where each result element represents the saturated sum of adjacent SIMD products. This can be expressed with the following equations:

r0 := SATURATE_16((a0 * b0) + (a1 * b1))
r1 := SATURATE_16((a2 * b2) + (a3 * b3))
...
r7 := SATURATE_16((a14 * b14) + (a15 * b15))

Requirements

Intrinsic            Architecture

_mm_maddubs_epi16    x86, x64

Header file <tmmintrin.h>

Remarks

r0-r7 are the sequentially ordered 16-bit components of return value r. r0 indicates the least significant 16 bits.

a0-a15 and b0-b15 are the sequentially ordered 8-bit components of parameters a and b, respectively. a0 and b0 are the least significant 8 bits. Parameter a contains unsigned bytes. Parameter b contains signed bytes.

SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x))

Before you use this intrinsic, software must ensure that the underlying processor supports the instruction.

Example

#include <stdio.h>
#include <tmmintrin.h>

int main ()
{
    /*
     * Input bytes for the two operands, listed from element 0 (the least
     * significant byte) upward. a holds unsigned bytes, b holds signed bytes.
     * The pairs exercise cancellation (elements 0-1), a negative sum
     * (elements 4-5), and negative saturation (elements 6-7).
     */
    static const unsigned char a_init[16] = {
        1, 1, 1, 2, 10, 12, 255, 255, 0, 20, 10, 11, 12, 13, 14, 15
    };
    static const signed char b_init[16] = {
        32, -32, 2, 4, -128, 12, -128, -128, 100, 20, 10, 11, 12, 13, 14, 15
    };

    __m128i a, b, final, res;
    int i, pair;

    /* Load every byte lane of both operands. */
    for (i = 0; i < 16; i++)
    {
        a.m128i_u8[i] = a_init[i];
        b.m128i_i8[i] = b_init[i];
    }

    /*
     * Scalar reference: for each adjacent pair of byte lanes, sum the two
     * products in int precision, then clamp to the signed 16-bit range.
     */
    for (pair = 0; pair < 8; pair++)
    {
        int lo = 2 * pair;
        int hi = lo + 1;
        int sum = (a.m128i_u8[lo] * b.m128i_i8[lo]) + (a.m128i_u8[hi] * b.m128i_i8[hi]);

        if (sum > 32767)
        {
            sum = 32767;
        }
        else if (sum < -32768)
        {
            sum = -32768;
        }
        final.m128i_i16[pair] = sum;
    }

    /* The intrinsic under demonstration. */
    res = _mm_maddubs_epi16(a, b);

    /* Print the expected (scalar) value next to the intrinsic's result. */
    printf_s("Res0 should be %d: %d\nRes1 should be %d: %d\n",
             final.m128i_i16[0], res.m128i_i16[0],
             final.m128i_i16[1], res.m128i_i16[1]);
    printf_s("Res2 should be %d: %d\nRes3 should be %d: %d\n",
             final.m128i_i16[2], res.m128i_i16[2],
             final.m128i_i16[3], res.m128i_i16[3]);
    printf_s("Res4 should be %d: %d\nRes5 should be %d: %d\n",
             final.m128i_i16[4], res.m128i_i16[4],
             final.m128i_i16[5], res.m128i_i16[5]);
    printf_s("Res6 should be %d: %d\nRes7 should be %d: %d\n",
             final.m128i_i16[6], res.m128i_i16[6],
             final.m128i_i16[7], res.m128i_i16[7]);

    return 0;
}

Res0 should be 0: 0
Res1 should be 10: 10
Res2 should be -1136: -1136
Res3 should be -32768: -32768
Res4 should be 400: 400
Res5 should be 221: 221
Res6 should be 313: 313
Res7 should be 421: 421

See Also

Concepts

Compiler Intrinsics