3.1.11. Nuclei Custom Xxlvw Extension
3.1.11.1. Introduction
3.1.11.2. 扩展
使用该扩展时,需要打开 V 扩展和 xxlvw 扩展,同时该扩展只支持rv32
示例:-march=rv32imafcv_xxlvw -mabi=ilp32f
Attention
使用该扩展的相关 intrinsic 时,需要添加以下头文件:
#include <riscv_vector.h>
3.1.11.3. 支持的指令
- Complex number format convert
vcpack.vv vd, vs2, vs1, vm
vcunpackr.v vd, vs2, vm
vcunpacki.v vd, vs2, vm
- Fix point dynamic scaling operations
vdsmul.vv/vs vd, vs2, vs1, vm
vdsmacini.v vs2, vm
vdsmacini.s rs1, vm
vdsmacini.i uimm, vm
vdsmac.vv/vs vs2, vs1, vm
vdsmaco.vv/vs vd, vs2, vs1, vm
vlsb.v vd, vs2, vm
- Complex dynamic scaling operations
vconj.v vd, vs2, vm
vdscmul.vv/vs vd, vs2, vs1, vm
vdscmulj.vv/vs vd, vs2, vs1, vm
vdscredsum.v vd, vs2, vm
vdscmac.vv/vs vs2, vs1, vm
vdscmacj.vv/vs vs2, vs1, vm
vdscmaco.vv/vs vd, vs2, vs1, vm
vdscmacjo.vv/vs vd, vs2, vs1, vm
vdscmacor.vv/vs vd, vs2, vs1, vm
vdscmacoi.vv/vs vd, vs2, vs1, vm
vdscmacjor.vv/vs vd, vs2, vs1, vm
vdscmacjoi.vv/vs vd, vs2, vs1, vm
vdscmulr.vv/vs vd, vs2, vs1, vm
vdscmuli.vv/vs vd, vs2, vs1, vm
vdscmuljr.vv/vs vd, vs2, vs1, vm
vdscmulji.vv/vs vd, vs2, vs1, vm
- Dynamic scaling Reduced operation
vdsredsum.v vd, vs2, vm
vdsredsumn.vs vd, vs2, rs1
vdsredsumn.vi vd, vs2, uimm
vredmaxi.vv vd, vs2, vs1, vm
vredmini.vv vd, vs2, vs1, vm
- Inter-element operation instructions
vperm.vi vd, vs2, uimm
vfsl.vv vd, vs2, vs1
vfsr.vv vd, vs2, vs1
- Fast non-linear operations
vlnlp0.v vnlpr0, vs2
vlnlp1.v vnlpr1, vs2
vnle.vv vd, vs2, vs1, vm
vnle.vs vd, vs2, vs1, vm
vnlm.vv vd, vs2, vs1, vm
vnlm.vs vd, vs2, vs1, vm
- Format conversion instructions
vfcvt.b2h.v vd, vs2
vfcvt.b2w.v vd, vs2
vfcvt.h2w.v vd, vs2
vfcvt.p2c.v vd, vs2
vfcvt.h2b.v vd, vs2
vfcvt.w2b.v vd, vs2
vfcvt.w2h.v vd, vs2
vfcvt.c2p.v vd, vs2
3.1.11.4. intrinsic 命名规则
rvv intrinsic 命名规则: https://github.com/riscv-non-isa/rvv-intrinsic-doc/releases/tag/v1.0.0-rc7 v-intrinsic-spec.pdf-> Chapter 6.
我们的命名规则遵循上述的命名规则,并在此基础上在 __riscv_ 前缀之后添加了 xl_,即所有自定义 intrinsic 均以 __riscv_xl_ 开头(例如 __riscv_xl_vcpack_vv_i32m1)。
3.1.11.5. Nuclei 自定义的 intrinsic
Note
每一条指令对应的intrinsic,会给出示例,全部的intrinsic请参考rvv intrinsic 命名规则和示例进行构建。
下文出现的sew指的是指令支持的数据类型的宽度,full_preds 指的是intrinsic函数支持的后缀形式, full 是全部都支持,包含 none(无后缀),_m,_tu,_tum,_tumu,_mu等六种形式。
vcpack.vv
sew = 32 full_preds
vint32m1_t __riscv_xl_vcpack_vv_i32m1(vint32m1_t vs2, vint32m1_t vs1, size_t vl);
vint32m1_t __riscv_xl_vcpack_vv_i32m1_m(vbool32_t vm, vint32m1_t vs2, vint32m1_t vs1, size_t vl);
vint32m1_t __riscv_xl_vcpack_vv_i32m1_tu(vint32m1_t vd, vint32m1_t vs2, vint32m1_t vs1, size_t vl);
vint32m1_t __riscv_xl_vcpack_vv_i32m1_tum(vbool32_t vm,vint32m1_t vd, vint32m1_t vs2, vint32m1_t vs1, size_t vl);
vint32m1_t __riscv_xl_vcpack_vv_i32m1_tumu(vbool32_t vm,vint32m1_t vd, vint32m1_t vs2, vint32m1_t vs1, size_t vl);
vint32m1_t __riscv_xl_vcpack_vv_i32m1_mu(vbool32_t vm,vint32m1_t vd, vint32m1_t vs2, vint32m1_t vs1, size_t vl);
Note
后续指令对应的intrinsic,只会给出none(无后缀)的全部intrinsic,其余后缀的需要使用时,参考上面的构建规则
vcunpackr.v
sew = 32 full_preds
vint32mf2_t __riscv_xl_vcunpackr_v_i32mf2(vint32mf2_t vs2, size_t vl);
vint32m1_t __riscv_xl_vcunpackr_v_i32m1(vint32m1_t vs2, size_t vl);
vint32m2_t __riscv_xl_vcunpackr_v_i32m2(vint32m2_t vs2, size_t vl);
vint32m4_t __riscv_xl_vcunpackr_v_i32m4(vint32m4_t vs2, size_t vl);
vint32m8_t __riscv_xl_vcunpackr_v_i32m8(vint32m8_t vs2, size_t vl);
vcunpacki.v
sew = 32 full_preds
vint32mf2_t __riscv_xl_vcunpacki_v_i32mf2(vint32mf2_t vs2, size_t vl);
vint32m1_t __riscv_xl_vcunpacki_v_i32m1(vint32m1_t vs2, size_t vl);
vint32m2_t __riscv_xl_vcunpacki_v_i32m2(vint32m2_t vs2, size_t vl);
vint32m4_t __riscv_xl_vcunpacki_v_i32m4(vint32m4_t vs2, size_t vl);
vint32m8_t __riscv_xl_vcunpacki_v_i32m8(vint32m8_t vs2, size_t vl);
vdsmul.vv
sew = 8/16/32 full_preds
vint8mf8_t __riscv_xl_vdsmul_vv_i8mf8(vint8mf8_t vs2, vint8mf8_t vs1, size_t vl);
vint8mf4_t __riscv_xl_vdsmul_vv_i8mf4(vint8mf4_t vs2, vint8mf4_t vs1, size_t vl);
vint8mf2_t __riscv_xl_vdsmul_vv_i8mf2(vint8mf2_t vs2, vint8mf2_t vs1, size_t vl);
vint8m1_t __riscv_xl_vdsmul_vv_i8m1(vint8m1_t vs2, vint8m1_t vs1, size_t vl);
vint8m2_t __riscv_xl_vdsmul_vv_i8m2(vint8m2_t vs2, vint8m2_t vs1, size_t vl);
vint8m4_t __riscv_xl_vdsmul_vv_i8m4(vint8m4_t vs2, vint8m4_t vs1, size_t vl);
vint8m8_t __riscv_xl_vdsmul_vv_i8m8(vint8m8_t vs2, vint8m8_t vs1, size_t vl);
vint16mf4_t __riscv_xl_vdsmul_vv_i16mf4(vint16mf4_t vs2, vint16mf4_t vs1, size_t vl);
vint16mf2_t __riscv_xl_vdsmul_vv_i16mf2(vint16mf2_t vs2, vint16mf2_t vs1, size_t vl);
vint16m1_t __riscv_xl_vdsmul_vv_i16m1(vint16m1_t vs2, vint16m1_t vs1, size_t vl);
vint16m2_t __riscv_xl_vdsmul_vv_i16m2(vint16m2_t vs2, vint16m2_t vs1, size_t vl);
vint16m4_t __riscv_xl_vdsmul_vv_i16m4(vint16m4_t vs2, vint16m4_t vs1, size_t vl);
vint16m8_t __riscv_xl_vdsmul_vv_i16m8(vint16m8_t vs2, vint16m8_t vs1, size_t vl);
vint32mf2_t __riscv_xl_vdsmul_vv_i32mf2(vint32mf2_t vs2, vint32mf2_t vs1, size_t vl);
vint32m1_t __riscv_xl_vdsmul_vv_i32m1(vint32m1_t vs2, vint32m1_t vs1, size_t vl);
vint32m2_t __riscv_xl_vdsmul_vv_i32m2(vint32m2_t vs2, vint32m2_t vs1, size_t vl);
vint32m4_t __riscv_xl_vdsmul_vv_i32m4(vint32m4_t vs2, vint32m4_t vs1, size_t vl);
vint32m8_t __riscv_xl_vdsmul_vv_i32m8(vint32m8_t vs2, vint32m8_t vs1, size_t vl);
vdsmul.vs
sew = 8/16/32 full_preds
vint8mf8_t __riscv_xl_vdsmul_vs_i8mf8_i8mf8(vint8mf8_t vs2, vint8mf8_t vs1, size_t vl);
vint8mf4_t __riscv_xl_vdsmul_vs_i8mf4_i8mf4(vint8mf4_t vs2, vint8mf4_t vs1, size_t vl);
vint8mf2_t __riscv_xl_vdsmul_vs_i8mf2_i8mf2(vint8mf2_t vs2, vint8mf2_t vs1, size_t vl);
vint8m1_t __riscv_xl_vdsmul_vs_i8m1_i8m1(vint8m1_t vs2, vint8m1_t vs1, size_t vl);
vint8m2_t __riscv_xl_vdsmul_vs_i8m2_i8m2(vint8m2_t vs2, vint8m2_t vs1, size_t vl);
vint8m4_t __riscv_xl_vdsmul_vs_i8m4_i8m4(vint8m4_t vs2, vint8m4_t vs1, size_t vl);
vint8m8_t __riscv_xl_vdsmul_vs_i8m8_i8m8(vint8m8_t vs2, vint8m8_t vs1, size_t vl);
vint16mf4_t __riscv_xl_vdsmul_vs_i16mf4_i16mf4(vint16mf4_t vs2, vint16mf4_t vs1, size_t vl);
vint16mf2_t __riscv_xl_vdsmul_vs_i16mf2_i16mf2(vint16mf2_t vs2, vint16mf2_t vs1, size_t vl);
vint16m1_t __riscv_xl_vdsmul_vs_i16m1_i16m1(vint16m1_t vs2, vint16m1_t vs1, size_t vl);
vint16m2_t __riscv_xl_vdsmul_vs_i16m2_i16m2(vint16m2_t vs2, vint16m2_t vs1, size_t vl);
vint16m4_t __riscv_xl_vdsmul_vs_i16m4_i16m4(vint16m4_t vs2, vint16m4_t vs1, size_t vl);
vint16m8_t __riscv_xl_vdsmul_vs_i16m8_i16m8(vint16m8_t vs2, vint16m8_t vs1, size_t vl);
vint32mf2_t __riscv_xl_vdsmul_vs_i32mf2_i32mf2(vint32mf2_t vs2, vint32mf2_t vs1, size_t vl);
vint32m1_t __riscv_xl_vdsmul_vs_i32m1_i32m1(vint32m1_t vs2, vint32m1_t vs1, size_t vl);
vint32m2_t __riscv_xl_vdsmul_vs_i32m2_i32m2(vint32m2_t vs2, vint32m2_t vs1, size_t vl);
vint32m4_t __riscv_xl_vdsmul_vs_i32m4_i32m4(vint32m4_t vs2, vint32m4_t vs1, size_t vl);
vint32m8_t __riscv_xl_vdsmul_vs_i32m8_i32m8(vint32m8_t vs2, vint32m8_t vs1, size_t vl);
vdsmacini.v
sew = 8/16/32 none_m_preds[none,_m]
vint8mf8_t __riscv_xl_vdsmacini_v_i8mf8(vint8mf8_t vs2, size_t vl);
vint8mf4_t __riscv_xl_vdsmacini_v_i8mf4(vint8mf4_t vs2, size_t vl);
vint8mf2_t __riscv_xl_vdsmacini_v_i8mf2(vint8mf2_t vs2, size_t vl);
vint8m1_t __riscv_xl_vdsmacini_v_i8m1(vint8m1_t vs2, size_t vl);
vint8m2_t __riscv_xl_vdsmacini_v_i8m2(vint8m2_t vs2, size_t vl);
vint8m4_t __riscv_xl_vdsmacini_v_i8m4(vint8m4_t vs2, size_t vl);
vint8m8_t __riscv_xl_vdsmacini_v_i8m8(vint8m8_t vs2, size_t vl);
vint16mf4_t __riscv_xl_vdsmacini_v_i16mf4(vint16mf4_t vs2, size_t vl);
vint16mf2_t __riscv_xl_vdsmacini_v_i16mf2(vint16mf2_t vs2, size_t vl);
vint16m1_t __riscv_xl_vdsmacini_v_i16m1(vint16m1_t vs2, size_t vl);
vint16m2_t __riscv_xl_vdsmacini_v_i16m2(vint16m2_t vs2, size_t vl);
vint16m4_t __riscv_xl_vdsmacini_v_i16m4(vint16m4_t vs2, size_t vl);
vint16m8_t __riscv_xl_vdsmacini_v_i16m8(vint16m8_t vs2, size_t vl);
vint32mf2_t __riscv_xl_vdsmacini_v_i32mf2(vint32mf2_t vs2, size_t vl);
vint32m1_t __riscv_xl_vdsmacini_v_i32m1(vint32m1_t vs2, size_t vl);
vint32m2_t __riscv_xl_vdsmacini_v_i32m2(vint32m2_t vs2, size_t vl);
vint32m4_t __riscv_xl_vdsmacini_v_i32m4(vint32m4_t vs2, size_t vl);
vint32m8_t __riscv_xl_vdsmacini_v_i32m8(vint32m8_t vs2, size_t vl);
Note
虽然该指令没有 vd parameter,但是该指令的intrinsic使用时是需要一个返回值的,其返回值不为空。具体示例可参考 Examples 部分。
暂时没有 vd parameter 的指令intrinsic,都需要一个返回值。
未来该类型指令的intrinsic的使用方法可能会有变化,目前只是一个workaround版本。
vdsmacini.s
sew = 8/16/32 none_m_preds[none,_m]
vint8mf8_t __riscv_xl_vdsmacini_x_i8mf8(int8_t rs1, size_t vl);
vint8mf4_t __riscv_xl_vdsmacini_x_i8mf4(int8_t rs1, size_t vl);
vint8mf2_t __riscv_xl_vdsmacini_x_i8mf2(int8_t rs1, size_t vl);
vint8m1_t __riscv_xl_vdsmacini_x_i8m1(int8_t rs1, size_t vl);
vint8m2_t __riscv_xl_vdsmacini_x_i8m2(int8_t rs1, size_t vl);
vint8m4_t __riscv_xl_vdsmacini_x_i8m4(int8_t rs1, size_t vl);
vint8m8_t __riscv_xl_vdsmacini_x_i8m8(int8_t rs1, size_t vl);
vint16mf4_t __riscv_xl_vdsmacini_x_i16mf4(int16_t rs1, size_t vl);
vint16mf2_t __riscv_xl_vdsmacini_x_i16mf2(int16_t rs1, size_t vl);
vint16m1_t __riscv_xl_vdsmacini_x_i16m1(int16_t rs1, size_t vl);
vint16m2_t __riscv_xl_vdsmacini_x_i16m2(int16_t rs1, size_t vl);
vint16m4_t __riscv_xl_vdsmacini_x_i16m4(int16_t rs1, size_t vl);
vint16m8_t __riscv_xl_vdsmacini_x_i16m8(int16_t rs1, size_t vl);
vint32mf2_t __riscv_xl_vdsmacini_x_i32mf2(int32_t rs1, size_t vl);
vint32m1_t __riscv_xl_vdsmacini_x_i32m1(int32_t rs1, size_t vl);
vint32m2_t __riscv_xl_vdsmacini_x_i32m2(int32_t rs1, size_t vl);
vint32m4_t __riscv_xl_vdsmacini_x_i32m4(int32_t rs1, size_t vl);
vint32m8_t __riscv_xl_vdsmacini_x_i32m8(int32_t rs1, size_t vl);
vdsmac.vv/vs
sew = 8/16/32 none_m_preds[none,_m]
Tip
intrinsic 的名字参考 vdsmul.vv/vs intrinsic 的名字,只需要将 vdsmul 替换为 vdsmac 即可。
vdsmaco.vv/vs
sew = 8/16/32 full_preds
Tip
intrinsic 的名字参考 vdsmul.vv/vs intrinsic 的名字,只需要将 vdsmul 替换为 vdsmaco 即可。
vlsb.v
sew = 8/16/32 full_preds
Tip
intrinsic 的名字参考 vdsmacini.v intrinsic 的名字,只需要将 vdsmacini 替换为 vlsb 即可。
vconj.v
sew = 8/16/32 full_preds
Tip
intrinsic 的名字参考 vdsmacini.v intrinsic 的名字,只需要将 vdsmacini 替换为 vconj 即可。
vdscmul.vv/vs
sew = 8/16/32 full_preds
Tip
intrinsic 的名字参考 vdsmul.vv/vs intrinsic 的名字,只需要将 vdsmul 替换为 vdscmul 即可。
vdscmulj.vv/vs
sew = 8/16/32 full_preds
Tip
intrinsic 的名字参考 vdsmul.vv/vs intrinsic 的名字,只需要将 vdsmul 替换为 vdscmulj 即可。
vdscredsum.v
sew = 8/16/32 full_preds
Tip
intrinsic 的名字参考 vdsmacini.v intrinsic 的名字,只需要将 vdsmacini 替换为 vdscredsum 即可。
vdscmac.vv/vs
sew = 8/16/32 none_m_preds[none,_m]
Tip
intrinsic 的名字参考 vdsmul.vv/vs intrinsic 的名字,只需要将 vdsmul 替换为 vdscmac 即可。
vdscmacj.vv/vs
sew = 8/16/32 none_m_preds[none,_m]
Tip
intrinsic 的名字参考 vdsmul.vv/vs intrinsic 的名字,只需要将 vdsmul 替换为 vdscmacj 即可。
vdscmaco.vv/vs
sew = 8/16/32 full_preds
Tip
intrinsic 的名字参考 vdsmul.vv/vs intrinsic 的名字,只需要将 vdsmul 替换为 vdscmaco 即可。
vdscmacjo.vv/vs
sew = 8/16/32 full_preds
Tip
intrinsic 的名字参考 vdsmul.vv/vs intrinsic 的名字,只需要将 vdsmul 替换为 vdscmacjo 即可。
vdscmacor.vv
sew = 32 full_preds
vint32mf2_t __riscv_xl_vdscmacor_vv_i32mf2(vint32mf2_t vs2, vint32mf2_t vs1, size_t vl);
vint32m1_t __riscv_xl_vdscmacor_vv_i32m1(vint32m1_t vs2, vint32m1_t vs1, size_t vl);
vint32m2_t __riscv_xl_vdscmacor_vv_i32m2(vint32m2_t vs2, vint32m2_t vs1, size_t vl);
vint32m4_t __riscv_xl_vdscmacor_vv_i32m4(vint32m4_t vs2, vint32m4_t vs1, size_t vl);
vint32m8_t __riscv_xl_vdscmacor_vv_i32m8(vint32m8_t vs2, vint32m8_t vs1, size_t vl);
vdscmacor.vs
sew = 32 full_preds
vint32mf2_t __riscv_xl_vdscmacor_vs_i32mf2_i32mf2(vint32mf2_t vs2, vint32mf2_t vs1, size_t vl);
vint32m1_t __riscv_xl_vdscmacor_vs_i32m1_i32m1(vint32m1_t vs2, vint32m1_t vs1, size_t vl);
vint32m2_t __riscv_xl_vdscmacor_vs_i32m2_i32m2(vint32m2_t vs2, vint32m2_t vs1, size_t vl);
vint32m4_t __riscv_xl_vdscmacor_vs_i32m4_i32m4(vint32m4_t vs2, vint32m4_t vs1, size_t vl);
vint32m8_t __riscv_xl_vdscmacor_vs_i32m8_i32m8(vint32m8_t vs2, vint32m8_t vs1, size_t vl);
vdscmacoi.vv/vs
sew = 32 full_preds
Tip
intrinsic 的名字参考 vdscmacor.vv/vs intrinsic 的名字,只需要将 vdscmacor 替换为 vdscmacoi 即可。
vdscmacjor.vv/vs
sew = 32 full_preds
Tip
intrinsic 的名字参考 vdscmacor.vv/vs intrinsic 的名字,只需要将 vdscmacor 替换为 vdscmacjor 即可。
vdscmacjoi.vv/vs
sew = 32 full_preds
Tip
intrinsic 的名字参考 vdscmacor.vv/vs intrinsic 的名字,只需要将 vdscmacor 替换为 vdscmacjoi 即可。
vdscmulr.vv/vs
sew = 8/16/32 full_preds
Tip
intrinsic 的名字参考 vdsmul.vv/vs intrinsic 的名字,只需要将 vdsmul 替换为 vdscmulr 即可。
vdscmuli.vv/vs
sew = 8/16/32 full_preds
Tip
intrinsic 的名字参考 vdsmul.vv/vs intrinsic 的名字,只需要将 vdsmul 替换为 vdscmuli 即可。
vdscmuljr.vv/vs
sew = 8/16/32 full_preds
Tip
intrinsic 的名字参考 vdsmul.vv/vs intrinsic 的名字,只需要将 vdsmul 替换为 vdscmuljr 即可。
vdscmulji.vv/vs
sew = 8/16/32 full_preds
Tip
intrinsic 的名字参考 vdsmul.vv/vs intrinsic 的名字,只需要将 vdsmul 替换为 vdscmulji 即可。
vdsredsum.v
sew = 8/16/32 full_preds
Tip
intrinsic 的名字参考 vdsmacini.v intrinsic 的名字,只需要将 vdsmacini 替换为 vdsredsum 即可。
vdsredsumn.vs vd, vs2, rs1
sew = 8/16/32 none_tu_preds
vint8mf8_t __riscv_xl_vdsredsumn_vx_i8mf8(vint8mf8_t vs2, int8_t rs1, size_t vl);
vint8mf4_t __riscv_xl_vdsredsumn_vx_i8mf4(vint8mf4_t vs2, int8_t rs1, size_t vl);
vint8mf2_t __riscv_xl_vdsredsumn_vx_i8mf2(vint8mf2_t vs2, int8_t rs1, size_t vl);
vint8m1_t __riscv_xl_vdsredsumn_vx_i8m1(vint8m1_t vs2, int8_t rs1, size_t vl);
vint8m2_t __riscv_xl_vdsredsumn_vx_i8m2(vint8m2_t vs2, int8_t rs1, size_t vl);
vint8m4_t __riscv_xl_vdsredsumn_vx_i8m4(vint8m4_t vs2, int8_t rs1, size_t vl);
vint8m8_t __riscv_xl_vdsredsumn_vx_i8m8(vint8m8_t vs2, int8_t rs1, size_t vl);
vint16mf4_t __riscv_xl_vdsredsumn_vx_i16mf4(vint16mf4_t vs2, int16_t rs1, size_t vl);
vint16mf2_t __riscv_xl_vdsredsumn_vx_i16mf2(vint16mf2_t vs2, int16_t rs1, size_t vl);
vint16m1_t __riscv_xl_vdsredsumn_vx_i16m1(vint16m1_t vs2, int16_t rs1, size_t vl);
vint16m2_t __riscv_xl_vdsredsumn_vx_i16m2(vint16m2_t vs2, int16_t rs1, size_t vl);
vint16m4_t __riscv_xl_vdsredsumn_vx_i16m4(vint16m4_t vs2, int16_t rs1, size_t vl);
vint16m8_t __riscv_xl_vdsredsumn_vx_i16m8(vint16m8_t vs2, int16_t rs1, size_t vl);
vint32mf2_t __riscv_xl_vdsredsumn_vx_i32mf2(vint32mf2_t vs2, int32_t rs1, size_t vl);
vint32m1_t __riscv_xl_vdsredsumn_vx_i32m1(vint32m1_t vs2, int32_t rs1, size_t vl);
vint32m2_t __riscv_xl_vdsredsumn_vx_i32m2(vint32m2_t vs2, int32_t rs1, size_t vl);
vint32m4_t __riscv_xl_vdsredsumn_vx_i32m4(vint32m4_t vs2, int32_t rs1, size_t vl);
vint32m8_t __riscv_xl_vdsredsumn_vx_i32m8(vint32m8_t vs2, int32_t rs1, size_t vl);
vdsredsumn.vi vd, vs2, uimm
sew = 8/16/32 none_tu_preds
Tip
intrinsic 的名字参考 vdsredsumn.vs intrinsic 的名字,只需要将 _vx 替换为 _vi 即可。
vdsredsumn.vs/vi 指令的 rs1 和 uimm 的值必须是 1、2、3、4 之一的整数。
vredmaxi.vv
sew = 8/16/32 full_preds
Tip
intrinsic 的名字参考 vdsmul.vv intrinsic 的名字,只需要将 vdsmul 替换为 vredmaxi 即可。
vredmini.vv
sew = 8/16/32 full_preds
Tip
intrinsic 的名字参考 vdsmul.vv intrinsic 的名字,只需要将 vdsmul 替换为 vredmini 即可。
vperm.vi
sew = 8/16/32 none_tu_preds
vint8mf8_t __riscv_xl_vperm_vi_i8mf8(vint8mf8_t vs2, int8_t rs1, size_t vl);
vint8mf4_t __riscv_xl_vperm_vi_i8mf4(vint8mf4_t vs2, int8_t rs1, size_t vl);
vint8mf2_t __riscv_xl_vperm_vi_i8mf2(vint8mf2_t vs2, int8_t rs1, size_t vl);
vint8m1_t __riscv_xl_vperm_vi_i8m1(vint8m1_t vs2, int8_t rs1, size_t vl);
vint8m2_t __riscv_xl_vperm_vi_i8m2(vint8m2_t vs2, int8_t rs1, size_t vl);
vint8m4_t __riscv_xl_vperm_vi_i8m4(vint8m4_t vs2, int8_t rs1, size_t vl);
vint8m8_t __riscv_xl_vperm_vi_i8m8(vint8m8_t vs2, int8_t rs1, size_t vl);
vint16mf4_t __riscv_xl_vperm_vi_i16mf4(vint16mf4_t vs2, int16_t rs1, size_t vl);
vint16mf2_t __riscv_xl_vperm_vi_i16mf2(vint16mf2_t vs2, int16_t rs1, size_t vl);
vint16m1_t __riscv_xl_vperm_vi_i16m1(vint16m1_t vs2, int16_t rs1, size_t vl);
vint16m2_t __riscv_xl_vperm_vi_i16m2(vint16m2_t vs2, int16_t rs1, size_t vl);
vint16m4_t __riscv_xl_vperm_vi_i16m4(vint16m4_t vs2, int16_t rs1, size_t vl);
vint16m8_t __riscv_xl_vperm_vi_i16m8(vint16m8_t vs2, int16_t rs1, size_t vl);
vint32mf2_t __riscv_xl_vperm_vi_i32mf2(vint32mf2_t vs2, int32_t rs1, size_t vl);
vint32m1_t __riscv_xl_vperm_vi_i32m1(vint32m1_t vs2, int32_t rs1, size_t vl);
vint32m2_t __riscv_xl_vperm_vi_i32m2(vint32m2_t vs2, int32_t rs1, size_t vl);
vint32m4_t __riscv_xl_vperm_vi_i32m4(vint32m4_t vs2, int32_t rs1, size_t vl);
vint32m8_t __riscv_xl_vperm_vi_i32m8(vint32m8_t vs2, int32_t rs1, size_t vl);
vfsl.vv
sew = 8/16/32 none_tu_preds
Tip
intrinsic 的名字参考 vdsmul.vv intrinsic 的名字,只需要将 vdsmul 替换为 vfsl 即可。
vfsr.vv
sew = 8/16/32 none_tu_preds
Tip
intrinsic 的名字参考 vdsmul.vv intrinsic 的名字,只需要将 vdsmul 替换为 vfsr 即可。
vlnlp0.v/vlnlp1.v
sew = 8/16/32 none_preds
该指令的intrinsic的使用需要满足以下关系
当VLEN=128时,LMUL=8
当VLEN=256时,LMUL=4
当VLEN=512时,LMUL=2
当VLEN=1024时,LMUL=1
编译器可通过 -march=*_zvl${vlen}b 来控制vlen的长度,其中 vlen 可取值{128,256,512,1024,…}等,默认不指定的情况下是128
_zvl1024b 以下intrinsic可以使用
vint8m1_t __riscv_xl_vlnlp0_v_i8m1(vint8m1_t vs, size_t vl);
vint16m1_t __riscv_xl_vlnlp0_v_i16m1(vint16m1_t vs, size_t vl);
vint32m1_t __riscv_xl_vlnlp0_v_i32m1(vint32m1_t vs, size_t vl);
vint8m1_t __riscv_xl_vlnlp1_v_i8m1(vint8m1_t vs, size_t vl);
vint16m1_t __riscv_xl_vlnlp1_v_i16m1(vint16m1_t vs, size_t vl);
vint32m1_t __riscv_xl_vlnlp1_v_i32m1(vint32m1_t vs, size_t vl);
_zvl512b 以下intrinsic可以使用
vint8m2_t __riscv_xl_vlnlp0_v_i8m2(vint8m2_t vs, size_t vl);
vint16m2_t __riscv_xl_vlnlp0_v_i16m2(vint16m2_t vs, size_t vl);
vint32m2_t __riscv_xl_vlnlp0_v_i32m2(vint32m2_t vs, size_t vl);
vint8m2_t __riscv_xl_vlnlp1_v_i8m2(vint8m2_t vs, size_t vl);
vint16m2_t __riscv_xl_vlnlp1_v_i16m2(vint16m2_t vs, size_t vl);
vint32m2_t __riscv_xl_vlnlp1_v_i32m2(vint32m2_t vs, size_t vl);
_zvl256b 以下intrinsic可以使用
vint8m4_t __riscv_xl_vlnlp0_v_i8m4(vint8m4_t vs, size_t vl);
vint16m4_t __riscv_xl_vlnlp0_v_i16m4(vint16m4_t vs, size_t vl);
vint32m4_t __riscv_xl_vlnlp0_v_i32m4(vint32m4_t vs, size_t vl);
vint8m4_t __riscv_xl_vlnlp1_v_i8m4(vint8m4_t vs, size_t vl);
vint16m4_t __riscv_xl_vlnlp1_v_i16m4(vint16m4_t vs, size_t vl);
vint32m4_t __riscv_xl_vlnlp1_v_i32m4(vint32m4_t vs, size_t vl);
_zvl128b 以下intrinsic可以使用
vint8m8_t __riscv_xl_vlnlp0_v_i8m8(vint8m8_t vs, size_t vl);
vint16m8_t __riscv_xl_vlnlp0_v_i16m8(vint16m8_t vs, size_t vl);
vint32m8_t __riscv_xl_vlnlp0_v_i32m8(vint32m8_t vs, size_t vl);
vint8m8_t __riscv_xl_vlnlp1_v_i8m8(vint8m8_t vs, size_t vl);
vint16m8_t __riscv_xl_vlnlp1_v_i16m8(vint16m8_t vs, size_t vl);
vint32m8_t __riscv_xl_vlnlp1_v_i32m8(vint32m8_t vs, size_t vl);
Tip
在使用上述intrinsic的时候,如果遇到以下这种 unrecognizable insn 错误:
vlnlp_m1.c: In function 'test_vlnlp0_v_i8m1':
vlnlp_m1.c:8:1: error: unrecognizable insn:
8 | }
| ^
(insn 7 4 11 2 (set (reg:RVVM1QI 134 [ <retval> ])
(if_then_else:RVVM1QI (unspec:RVVMF8BI [
(const_vector:RVVMF8BI repeat [
(const_int 1 [0x1])
])
(reg/v:SI 136 [ vl ])
(const_int 2 [0x2]) repeated x2
(const_int 0 [0])
(reg:SI 66 vl)
(reg:SI 67 vtype)
] UNSPEC_VPREDICATE)
(unspec:RVVM1QI [
(reg/v:RVVM1QI 135 [ vs ])
] UNSPEC_VLNLP0)
(unspec:RVVM1QI [
(reg:SI 0 zero)
] UNSPEC_VUNDEF))) "vlnlp_m1.c":7:10 -1
(nil))
during RTL pass: vregs
vlnlp_m1.c:8:1: internal compiler error: in extract_insn, at recog.cc:2812
0x7f8636076082 __libc_start_main
../csu/libc-start.c:308
Please submit a full bug report, with preprocessed source (by using -freport-bug).
Please include the complete backtrace with any bug report.
See <https://gcc.gnu.org/bugs/> for instructions.
可能就是vlen长度和intrinsic使用时的lmul没有对应导致的
vnle.vv/vs
sew = 8/16/32 full_preds
Tip
intrinsic 的名字参考 vdsmul.vv/vs intrinsic 的名字,只需要将 vdsmul 替换为 vnle 即可。
vnlm.vv/vs
sew = 8/16/32 full_preds
Tip
intrinsic 的名字参考 vdsmul.vv/vs intrinsic 的名字,只需要将 vdsmul 替换为 vnlm 即可。
vfcvt.b2h.v
none_tu_preds
vint16mf4_t __riscv_xl_vfcvt_b2h_v_i16mf4(vint8mf8_t vs2, size_t vl);
vint16mf2_t __riscv_xl_vfcvt_b2h_v_i16mf2(vint8mf4_t vs2, size_t vl);
vint16m1_t __riscv_xl_vfcvt_b2h_v_i16m1(vint8mf2_t vs2, size_t vl);
vint16m2_t __riscv_xl_vfcvt_b2h_v_i16m2(vint8m1_t vs2, size_t vl);
vint16m4_t __riscv_xl_vfcvt_b2h_v_i16m4(vint8m2_t vs2, size_t vl);
vint16m8_t __riscv_xl_vfcvt_b2h_v_i16m8(vint8m4_t vs2, size_t vl);
vfcvt.b2w.v
none_tu_preds
vint32mf2_t __riscv_xl_vfcvt_b2w_v_i32mf2(vint8mf8_t vs2, size_t vl);
vint32m1_t __riscv_xl_vfcvt_b2w_v_i32m1(vint8mf4_t vs2, size_t vl);
vint32m2_t __riscv_xl_vfcvt_b2w_v_i32m2(vint8mf2_t vs2, size_t vl);
vint32m4_t __riscv_xl_vfcvt_b2w_v_i32m4(vint8m1_t vs2, size_t vl);
vint32m8_t __riscv_xl_vfcvt_b2w_v_i32m8(vint8m2_t vs2, size_t vl);
vfcvt.h2w.v
none_tu_preds
vint32mf2_t __riscv_xl_vfcvt_h2w_v_i32mf2(vint16mf4_t vs2, size_t vl);
vint32m1_t __riscv_xl_vfcvt_h2w_v_i32m1(vint16mf2_t vs2, size_t vl);
vint32m2_t __riscv_xl_vfcvt_h2w_v_i32m2(vint16m1_t vs2, size_t vl);
vint32m4_t __riscv_xl_vfcvt_h2w_v_i32m4(vint16m2_t vs2, size_t vl);
vint32m8_t __riscv_xl_vfcvt_h2w_v_i32m8(vint16m4_t vs2, size_t vl);
vfcvt.p2c.v
none_tu_preds
vint32mf2_t __riscv_xl_vfcvt_p2c_v_i32mf2(vint16mf4_t vs2, size_t vl);
vint32m1_t __riscv_xl_vfcvt_p2c_v_i32m1(vint16mf2_t vs2, size_t vl);
vint32m2_t __riscv_xl_vfcvt_p2c_v_i32m2(vint16m1_t vs2, size_t vl);
vint32m4_t __riscv_xl_vfcvt_p2c_v_i32m4(vint16m2_t vs2, size_t vl);
vint32m8_t __riscv_xl_vfcvt_p2c_v_i32m8(vint16m4_t vs2, size_t vl);
vfcvt.h2b.v
none_tu_preds
vint8mf8_t __riscv_xl_vfcvt_h2b_v_i8mf8(vint16mf4_t vs2, size_t vl);
vint8mf4_t __riscv_xl_vfcvt_h2b_v_i8mf4(vint16mf2_t vs2, size_t vl);
vint8mf2_t __riscv_xl_vfcvt_h2b_v_i8mf2(vint16m1_t vs2, size_t vl);
vint8m1_t __riscv_xl_vfcvt_h2b_v_i8m1(vint16m2_t vs2, size_t vl);
vint8m2_t __riscv_xl_vfcvt_h2b_v_i8m2(vint16m4_t vs2, size_t vl);
vint8m4_t __riscv_xl_vfcvt_h2b_v_i8m4(vint16m8_t vs2, size_t vl);
vfcvt.w2b.v
none_tu_preds
vint8mf8_t __riscv_xl_vfcvt_w2b_v_i8mf8(vint32mf2_t vs2, size_t vl);
vint8mf4_t __riscv_xl_vfcvt_w2b_v_i8mf4(vint32m1_t vs2, size_t vl);
vint8mf2_t __riscv_xl_vfcvt_w2b_v_i8mf2(vint32m2_t vs2, size_t vl);
vint8m1_t __riscv_xl_vfcvt_w2b_v_i8m1(vint32m4_t vs2, size_t vl);
vint8m2_t __riscv_xl_vfcvt_w2b_v_i8m2(vint32m8_t vs2, size_t vl);
vfcvt.w2h.v
none_tu_preds
vint16mf4_t __riscv_xl_vfcvt_w2h_v_i16mf4(vint32mf2_t vs2, size_t vl);
vint16mf2_t __riscv_xl_vfcvt_w2h_v_i16mf2(vint32m1_t vs2, size_t vl);
vint16m1_t __riscv_xl_vfcvt_w2h_v_i16m1(vint32m2_t vs2, size_t vl);
vint16m2_t __riscv_xl_vfcvt_w2h_v_i16m2(vint32m4_t vs2, size_t vl);
vint16m4_t __riscv_xl_vfcvt_w2h_v_i16m4(vint32m8_t vs2, size_t vl);
vfcvt.c2p.v
none_tu_preds
vint16mf4_t __riscv_xl_vfcvt_c2p_v_i16mf4(vint32mf2_t vs2, size_t vl);
vint16mf2_t __riscv_xl_vfcvt_c2p_v_i16mf2(vint32m1_t vs2, size_t vl);
vint16m1_t __riscv_xl_vfcvt_c2p_v_i16m1(vint32m2_t vs2, size_t vl);
vint16m2_t __riscv_xl_vfcvt_c2p_v_i16m2(vint32m4_t vs2, size_t vl);
vint16m4_t __riscv_xl_vfcvt_c2p_v_i16m4(vint32m8_t vs2, size_t vl);
3.1.11.6. Examples
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <riscv_vector.h>
/*
 * One complex sample: a pair of 16-bit signed components (re, im),
 * overlaid with a single int32_t member so the same storage can be
 * accessed as one 32-bit element.
 */
typedef union {
    struct {
        int16_t re; /* real component */
        int16_t im; /* imaginary component */
    } sc16;
    int32_t i32;    /* the same storage viewed as one 32-bit integer */
} cplx_sc16;

/* Input operand buffers, four samples each. */
cplx_sc16 src1[4], src2[4];

/* Output buffers, four samples each. */
cplx_sc16 dst_cmul[4], dst_mac[4], dst_cmac[4];
/*
 * Scalar element-wise complex multiply: dst[k] = (src1[k] * src2[k]) >> 15.
 *
 * Products are accumulated in 64-bit integers, shifted right by 15 and then
 * narrowed to int16_t on store (no saturation is applied).
 *
 * src1, src2: input arrays of at least `len` samples.
 * dst:        output array of at least `len` samples.
 * len:        number of samples to process; nothing is done when len <= 0.
 */
void cplx_mul(const cplx_sc16 *src1, const cplx_sc16 *src2, cplx_sc16 *dst, int len) {
    int k = 0;

    while (k < len) {
        /* Load all four operands first, widened to 64 bits. */
        const int64_t a_re = src1[k].sc16.re;
        const int64_t a_im = src1[k].sc16.im;
        const int64_t b_re = src2[k].sc16.re;
        const int64_t b_im = src2[k].sc16.im;

        /* (a_re + j*a_im) * (b_re + j*b_im) */
        const int64_t prod_re = a_re * b_re - a_im * b_im;
        const int64_t prod_im = a_re * b_im + a_im * b_re;

        dst[k].sc16.re = (int16_t)(prod_re >> 15);
        dst[k].sc16.im = (int16_t)(prod_im >> 15);
        k++;
    }
}
/*
 * Example program for the Xxlvw intrinsics.
 *
 * Writes a configuration value to CSR 0xf and prints the read-back value,
 * fills src1/src2 with small random complex samples, then:
 *   - runs __riscv_xl_vdscmul_vv_i32m1 and stores the result to dst_cmul;
 *   - runs __riscv_xl_vdsmacini_x_i32m1 followed by
 *     __riscv_xl_vdscmaco_vv_i32m1 and stores the result to dst_cmac.
 *
 * Always returns 0.
 */
int main(void)
{
    /* Number of complex samples in each global buffer. */
    const size_t count = sizeof src1 / sizeof src1[0];

    /* NOTE(review): the meaning of the individual bits is not shown here --
     * confirm against the core's CSR documentation. */
    unsigned long vcsr = (0x0 << 8) | (0x1 << 3);
    asm volatile ("csrw 0xf,%0" : : "r"(vcsr));
    asm volatile("csrr %0,0xf" : "=r"(vcsr));
    /* vcsr is an unsigned long, so the matching conversion is %lx. */
    printf("vcsr = %lx\r\n", vcsr);

    memset(src1, 0, sizeof src1);
    memset(src2, 0, sizeof src2);
    memset(dst_cmul, 0, sizeof dst_cmul);
    memset(dst_mac, 0, sizeof dst_mac);
    memset(dst_cmac, 0, sizeof dst_cmac);

    /* Each component is drawn from [-16, 15]. */
    for (size_t i = 0; i < count; ++i) {
        src1[i].sc16.re = rand() % 32 - 16;
        src1[i].sc16.im = rand() % 32 - 16;
        src2[i].sc16.re = rand() % 32 - 16;
        src2[i].sc16.im = rand() % 32 - 16;
    }

    size_t vl = count;
    vint32m1_t vs1 = __riscv_vle32_v_i32m1(&src1[0].i32, vl);
    vint32m1_t vs2 = __riscv_vle32_v_i32m1(&src2[0].i32, vl);

    vint32m1_t vd = __riscv_xl_vdscmul_vv_i32m1(vs2, vs1, vl);
    __riscv_vse32_v_i32m1(&dst_cmul[0].i32, vd, vl);

    /* The value of tmp may go unused, but the intrinsic's result must be
     * captured for the vdsmacini instruction to be usable. */
    vint32m1_t tmp = __riscv_xl_vdsmacini_x_i32m1(1, vl);
    (void)tmp;

    vd = __riscv_xl_vdscmaco_vv_i32m1(vs2, vs1, vl);
    __riscv_vse32_v_i32m1(&dst_cmac[0].i32, vd, vl);

    return 0;
}