For convenience, the lsxintrin.h file was imported into
lasxintrin.h and 18 new interface functions for 128 and 256 vector
conversions were added, using the -mlasx option.
__m256 __lasx_cast_128_s (__m128); __m256d __lasx_cast_128_d (__m128d); __m256i __lasx_cast_128 (__m128i); __m256 __lasx_concat_128_s (__m128, __m128); __m256d __lasx_concat_128_d (__m128d, __m128d); __m256i __lasx_concat_128 (__m128i, __m128i); __m128 __lasx_extract_128_lo_s (__m256); __m128 __lasx_extract_128_hi_s (__m256); __m128d __lasx_extract_128_lo_d (__m256d); __m128d __lasx_extract_128_hi_d (__m256d); __m128i __lasx_extract_128_lo (__m256i); __m128i __lasx_extract_128_hi (__m256i); __m256 __lasx_insert_128_lo_s (__m256, __m128); __m256 __lasx_insert_128_hi_s (__m256, __m128); __m256d __lasx_insert_128_lo_d (__m256d, __m128d); __m256d __lasx_insert_128_hi_d (__m256d, __m128d); __m256i __lasx_insert_128_lo (__m256i, __m128i); __m256i __lasx_insert_128_hi (__m256i, __m128i);
When gcc does not support interfaces for 128 and 256 conversions, use the following code for equivalent substitution.
#ifndef __loongarch_asx_sx_conv
#include <lasxintrin.h>
#include <lsxintrin.h>
__m256 inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_cast_128_s (__m128 src)
{
__m256 dest;
asm ("" : "=f"(dest) : "0"(src));
return dest;
}
__m256d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_cast_128_d (__m128d src)
{
__m256d dest;
asm ("" : "=f"(dest) : "0"(src));
return dest;
}
__m256i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_cast_128 (__m128i src)
{
__m256i dest;
asm ("" : "=f"(dest) : "0"(src));
return dest;
}
__m256 inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_concat_128_s (__m128 src1, __m128 src2)
{
__m256 dest;
asm ("xvpermi.q %u0,%u2,0x02\n"
: "=f"(dest)
: "0"(src1), "f"(src2));
return dest;
}
__m256d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_concat_128_d (__m128d src1, __m128d src2)
{
__m256d dest;
asm ("xvpermi.q %u0,%u2,0x02\n"
: "=f"(dest)
: "0"(src1), "f"(src2));
return dest;
}
__m256i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_concat_128 (__m128i src1, __m128i src2)
{
__m256i dest;
asm ("xvpermi.q %u0,%u2,0x02\n"
: "=f"(dest)
: "0"(src1), "f"(src2));
return dest;
}
__m128 inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_extract_128_lo_s (__m256 src)
{
__m128 dest;
asm ("" : "=f"(dest) : "0"(src));
return dest;
}
__m128d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_extract_128_lo_d (__m256d src)
{
__m128d dest;
asm ("" : "=f"(dest) : "0"(src));
return dest;
}
__m128i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_extract_128_lo (__m256i src)
{
__m128i dest;
asm ("" : "=f"(dest) : "0"(src));
return dest;
}
__m128 inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_extract_128_hi_s (__m256 src)
{
__m128 dest;
asm ("xvpermi.d %u0,%u1,0xe\n"
: "=f"(dest)
: "f"(src));
return dest;
}
__m128d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_extract_128_hi_d (__m256d src)
{
__m128d dest;
asm ("xvpermi.d %u0,%u1,0xe\n"
: "=f"(dest)
: "f"(src));
return dest;
}
__m128i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_extract_128_hi (__m256i src)
{
__m128i dest;
asm ("xvpermi.d %u0,%u1,0xe\n"
: "=f"(dest)
: "f"(src));
return dest;
}
__m256 inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_insert_128_lo_s (__m256 src1, __m128 src2)
{
__m256 dest;
asm ("xvpermi.q %u0,%u2,0x30\n"
: "=f"(dest)
: "0"(src1), "f"(src2));
return dest;
}
__m256d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_insert_128_lo_d (__m256d a, __m128d b)
{
__m256d dest;
asm ("xvpermi.q %u0,%u2,0x30\n"
: "=f"(dest)
: "0"(src1), "f"(src2));
return dest;
}
__m256i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_insert_128_lo (__m256i src1, __m128i src2)
{
__m256i dest;
asm ("xvpermi.q %u0,%u2,0x30\n"
: "=f"(dest)
: "0"(src1), "f"(src2));
return dest;
}
__m256 inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_insert_128_hi_s (__m256 src1, __m128 src2)
{
__m256 dest;
asm ("xvpermi.q %u0,%u2,0x02\n"
: "=f"(dest)
: "0"(src1), "f"(src2));
return dest;
}
__m256d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_insert_128_hi_d (__m256d src1, __m128d src2)
{
__m256d dest;
asm ("xvpermi.q %u0,%u2,0x02\n"
: "=f"(dest)
: "0"(src1), "f"(src2));
return dest;
}
__m256i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lasx_insert_128_hi (__m256i src1, __m128i src2)
{
__m256i dest;
asm ("xvpermi.q %u0,%u2,0x02\n"
: "=f"(dest)
: "0"(src1), "f"(src2));
return dest;
}
#endif