首页服务器Web服务器 学习在kernel态下使用NEON对算法进行加速的方法

学习在kernel态下使用NEON对算法进行加速的方法

本文跟着小编一起来学习在linux kernel态下如何使用NEON对算法进行加速的技巧,内容通过图文实例给大家做了详细分析,一起来看下。 ARM处理器从cortex系列开始集成NEON处理单…

本文跟着小编一起来学习在linux kernel态下如何使用NEON对算法进行加速的技巧,内容通过图文实例给大家做了详细分析,一起来看下。

ARM处理器从Cortex系列开始集成NEON处理单元,该单元可以简单理解为协处理器,专门为矩阵运算等算法设计,特别适用于图像、视频、音频处理等场景,应用也很广泛。

本文先对NEON处理单元进行简要介绍,然后介绍如何在内核态下使用NEON,最后列举实例说明。

一.NEON简介

其实最好的资料就是官方文档《Cortex™-A Series Programmer's Guide》,以下描述摘自该文档。

1.1 SIMD

NEON采用SIMD(Single Instruction, Multiple Data,单指令多数据)架构,即一条指令同时处理多个数据。NEON中一条指令可处理的数据个数较多,而且配置灵活(以8bit、16bit、32bit为单位,可包含多个单位数据),这是其优势所在。

如下图,APU需要至少四条指令才能完成这些加法操作,而NEON只需要1条;如果再考虑load(ld)和store(st)操作,节省的指令更多。

 kernel,NEON,算法,加速

上述特性,使NEON特别适合处理块数据、图像、视频、音频等。

 1.2 NEON architecture overview

NEON也是load/store架构,寄存器为64bit/128bit,可形成向量化数据,配合若干便于向量操作的指令。

1.2.1 commonality with VFP

1.2.2 data type

kernel,NEON,算法,加速

 指令中的数据类型表示,例如VMLAL.S8:

kernel,NEON,算法,加速

1.2.3 registers 

32个64bit寄存器,D0~D31;同时可组成16个128bit寄存器,Q0~Q15。这些寄存器与VFP共用。

kernel,NEON,算法,加速

寄存器内部的数据单位为8bit、16bit、32bit,可以根据需要灵活配置。

kernel,NEON,算法,加速

NEON的指令有Normal、Long、Wide、Narrow和Saturating variants等几种变体,具体属于哪一种是根据操作的源(src)寄存器和目的(dst)寄存器的类型确定的。

kernel,NEON,算法,加速

   kernel,NEON,算法,加速

1.2.4 instruction set

kernel,NEON,算法,加速

                     kernel,NEON,算法,加速

1.3 NEON 指令分类概述

指令比较多, 详细可参考Cortex™-A Series Programmer's Guide。可大体分为:

- NEON general data processing instructions
- NEON shift instructions
- NEON logical and compare operations
- NEON arithmetic instructions
- NEON multiply instructions
- NEON load and store element and structure instructions
- NEON and VFP pseudo-instructions(对应该文档附录 B.8)

简单罗列一下各指令

kernel,NEON,算法,加速

                kernel,NEON,算法,加速

                  kernel,NEON,算法,加速

                kernel,NEON,算法,加速

  kernel,NEON,算法,加速

NEON没有循环左移指令;左移指令的移位量为负数时,按右移处理。

load和store指令不太好理解,说明一下。

  kernel,NEON,算法,加速

1.4 NEON 使用方式      

1.4.1 NEON使用方式

NEON有若干种使用方式:

- C语言被编译器自动向量化:需要增加编译选项,且C语言编码时有若干注意事项。这种方式不确定性太大,没啥实用价值。
- NEON汇编:可行,汇编稍微复杂一点,但是对核心算法来说还是值得的。
- intrinsics:gcc和armcc等编译器提供了若干与NEON指令对应的inline函数,可直接在C语言里调用,这些函数反汇编时会直接变成相应的NEON指令。这种方式比较适用于C语言环境,且相对简单。本文后续使用这种方式进行详细说明。

1.4.2 C语言NEON数据类型

需包含arm_neon.h头文件,该头文件位于gcc的目录中。其中定义的类型都是向量数据类型。

typedef __builtin_neon_qi int8x8_t  __attribute__ ((__vector_size__ (8)));typedef __builtin_neon_hi int16x4_t  __attribute__ ((__vector_size__ (8)));typedef __builtin_neon_si int32x2_t  __attribute__ ((__vector_size__ (8)));typedef __builtin_neon_di int64x1_t;typedef __builtin_neon_sf float32x2_t  __attribute__ ((__vector_size__ (8)));typedef __builtin_neon_poly8 poly8x8_t  __attribute__ ((__vector_size__ (8)));typedef __builtin_neon_poly16 poly16x4_t  __attribute__ ((__vector_size__ (8)));typedef __builtin_neon_uqi uint8x8_t  __attribute__ ((__vector_size__ (8)));typedef __builtin_neon_uhi uint16x4_t  __attribute__ ((__vector_size__ (8)));typedef __builtin_neon_usi uint32x2_t  __attribute__ ((__vector_size__ (8)));typedef __builtin_neon_udi uint64x1_t;typedef __builtin_neon_qi int8x16_t  __attribute__ ((__vector_size__ (16)));typedef __builtin_neon_hi int16x8_t  __attribute__ ((__vector_size__ (16)));typedef __builtin_neon_si int32x4_t  __attribute__ ((__vector_size__ (16)));typedef __builtin_neon_di int64x2_t  __attribute__ ((__vector_size__ (16)));typedef __builtin_neon_sf float32x4_t  __attribute__ ((__vector_size__ (16)));typedef __builtin_neon_poly8 poly8x16_t  __attribute__ ((__vector_size__ (16)));typedef __builtin_neon_poly16 poly16x8_t  __attribute__ ((__vector_size__ (16)));typedef __builtin_neon_uqi uint8x16_t  __attribute__ ((__vector_size__ (16)));typedef __builtin_neon_uhi uint16x8_t  __attribute__ ((__vector_size__ (16)));typedef __builtin_neon_usi uint32x4_t  __attribute__ ((__vector_size__ (16)));typedef __builtin_neon_udi uint64x2_t  __attribute__ ((__vector_size__ (16)));typedef float float32_t;typedef __builtin_neon_poly8 poly8_t;typedef __builtin_neon_poly16 poly16_t;typedef struct int8x8x2_t{ int8x8_t val[2];} int8x8x2_t;typedef struct int8x16x2_t{ int8x16_t val[2];} int8x16x2_t;typedef struct int16x4x2_t{ int16x4_t val[2];} int16x4x2_t;typedef struct int16x8x2_t{ int16x8_t val[2];} int16x8x2_t;typedef struct int32x2x2_t{ int32x2_t val[2];} 
int32x2x2_t;typedef struct int32x4x2_t{ int32x4_t val[2];} int32x4x2_t;typedef struct int64x1x2_t{ int64x1_t val[2];} int64x1x2_t;typedef struct int64x2x2_t{ int64x2_t val[2];} int64x2x2_t;typedef struct uint8x8x2_t{ uint8x8_t val[2];} uint8x8x2_t;typedef struct uint8x16x2_t{ uint8x16_t val[2];} uint8x16x2_t;typedef struct uint16x4x2_t{ uint16x4_t val[2];} uint16x4x2_t;typedef struct uint16x8x2_t{ uint16x8_t val[2];} uint16x8x2_t;typedef struct uint32x2x2_t{ uint32x2_t val[2];} uint32x2x2_t;typedef struct uint32x4x2_t{ uint32x4_t val[2];} uint32x4x2_t;typedef struct uint64x1x2_t{ uint64x1_t val[2];} uint64x1x2_t;typedef struct uint64x2x2_t{ uint64x2_t val[2];} uint64x2x2_t;typedef struct float32x2x2_t{ float32x2_t val[2];} float32x2x2_t;typedef struct float32x4x2_t{ float32x4_t val[2];} float32x4x2_t;typedef struct poly8x8x2_t{ poly8x8_t val[2];} poly8x8x2_t;typedef struct poly8x16x2_t{ poly8x16_t val[2];} poly8x16x2_t;typedef struct poly16x4x2_t{ poly16x4_t val[2];} poly16x4x2_t;typedef struct poly16x8x2_t{ poly16x8_t val[2];} poly16x8x2_t;typedef struct int8x8x3_t{ int8x8_t val[3];} int8x8x3_t;typedef struct int8x16x3_t{ int8x16_t val[3];} int8x16x3_t;typedef struct int16x4x3_t{ int16x4_t val[3];} int16x4x3_t;typedef struct int16x8x3_t{ int16x8_t val[3];} int16x8x3_t;typedef struct int32x2x3_t{ int32x2_t val[3];} int32x2x3_t;typedef struct int32x4x3_t{ int32x4_t val[3];} int32x4x3_t;typedef struct int64x1x3_t{ int64x1_t val[3];} int64x1x3_t;typedef struct int64x2x3_t{ int64x2_t val[3];} int64x2x3_t;typedef struct uint8x8x3_t{ uint8x8_t val[3];} uint8x8x3_t;typedef struct uint8x16x3_t{ uint8x16_t val[3];} uint8x16x3_t;typedef struct uint16x4x3_t{ uint16x4_t val[3];} uint16x4x3_t;typedef struct uint16x8x3_t{ uint16x8_t val[3];} uint16x8x3_t;typedef struct uint32x2x3_t{ uint32x2_t val[3];} uint32x2x3_t;typedef struct uint32x4x3_t{ uint32x4_t val[3];} uint32x4x3_t;typedef struct uint64x1x3_t{ uint64x1_t val[3];} uint64x1x3_t;typedef struct uint64x2x3_t{ uint64x2_t 
val[3];} uint64x2x3_t;typedef struct float32x2x3_t{ float32x2_t val[3];} float32x2x3_t;typedef struct float32x4x3_t{ float32x4_t val[3];} float32x4x3_t;typedef struct poly8x8x3_t{ poly8x8_t val[3];} poly8x8x3_t;typedef struct poly8x16x3_t{ poly8x16_t val[3];} poly8x16x3_t;typedef struct poly16x4x3_t{ poly16x4_t val[3];} poly16x4x3_t;typedef struct poly16x8x3_t{ poly16x8_t val[3];} poly16x8x3_t;typedef struct int8x8x4_t{ int8x8_t val[4];} int8x8x4_t;typedef struct int8x16x4_t{ int8x16_t val[4];} int8x16x4_t;typedef struct int16x4x4_t{ int16x4_t val[4];} int16x4x4_t;typedef struct int16x8x4_t{ int16x8_t val[4];} int16x8x4_t;typedef struct int32x2x4_t{ int32x2_t val[4];} int32x2x4_t;typedef struct int32x4x4_t{ int32x4_t val[4];} int32x4x4_t;typedef struct int64x1x4_t{ int64x1_t val[4];} int64x1x4_t;typedef struct int64x2x4_t{ int64x2_t val[4];} int64x2x4_t;typedef struct uint8x8x4_t{ uint8x8_t val[4];} uint8x8x4_t;typedef struct uint8x16x4_t{ uint8x16_t val[4];} uint8x16x4_t;typedef struct uint16x4x4_t{ uint16x4_t val[4];} uint16x4x4_t;typedef struct uint16x8x4_t{ uint16x8_t val[4];} uint16x8x4_t;typedef struct uint32x2x4_t{ uint32x2_t val[4];} uint32x2x4_t;typedef struct uint32x4x4_t{ uint32x4_t val[4];} uint32x4x4_t;typedef struct uint64x1x4_t{ uint64x1_t val[4];} uint64x1x4_t;typedef struct uint64x2x4_t{ uint64x2_t val[4];} uint64x2x4_t;typedef struct float32x2x4_t{ float32x2_t val[4];} float32x2x4_t;typedef struct float32x4x4_t{ float32x4_t val[4];} float32x4x4_t;typedef struct poly8x8x4_t{ poly8x8_t val[4];} poly8x8x4_t;typedef struct poly8x16x4_t{ poly8x16_t val[4];} poly8x16x4_t;typedef struct poly16x4x4_t{ poly16x4_t val[4];} poly16x4x4_t;typedef struct poly16x8x4_t{ poly16x8_t val[4];} poly16x8x4_t;
本文来自网络,不代表1号站长-站长学院|资讯交流平台立场。转载请注明出处: https://www.1cn.cc/fwq/web/3432.html
上一篇Vim如何使用相对行号实现一切操作详解
下一篇 Shell中如何删除文本比较长的行的实现方法
admin

作者: admin

这里可以在内容模板中定义一些文字和说明,也可以调用对应作者的简介!或者写一些网站的描述之类的文字或者HTML!

为您推荐

评论列表()

    联系我们

    联系我们

    0898-88888888

    在线咨询: QQ交谈

    邮箱: email@wangzhan.com

    工作时间:周一至周五,9:00-17:30,节假日休息

    关注微信
    微信扫一扫关注我们

    微信扫一扫关注我们

    关注微博
    返回顶部