/* filter_neon.S - NEON optimised filter functions
 *
 * Copyright (c) 2014 Glenn Randers-Pehrson
 * Written by Mans Rullgard, 2011.
 * Last changed in libpng 1.6.16 [December 22, 2014]
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */
/* This is required to get the symbol renames, which are #defines, and the
 * definitions (or not) of PNG_ARM_NEON_OPT and PNG_ARM_NEON_IMPLEMENTATION.
 */
#define PNG_VERSION_INFO_ONLY
#include "../pngpriv.h"

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
#endif

#ifdef PNG_READ_SUPPORTED

/* Assembler NEON support - only works for 32-bit ARM (i.e. it does not work for
 * ARM64).  The code in arm/filter_neon_intrinsics.c supports ARM64, however it
 * only works if -mfpu=neon is specified on the GCC command line.  See pngpriv.h
 * for the logic which sets PNG_USE_ARM_NEON_ASM.
 */
#if PNG_ARM_NEON_IMPLEMENTATION == 2 /* hand-coded assembler */

#if PNG_ARM_NEON_OPT > 0

/* ELF is a line prefix for directives that only make sense in ELF objects.
 * When __ELF__ is defined it expands to nothing, so the directive that
 * follows it is assembled.  Otherwise it expands to '@', the ARM GAS comment
 * character, which turns the rest of that line into a comment.
 */
#ifdef __ELF__
#   define ELF
#else
#   define ELF @
#endif

        /* Target and FPU selection for every instruction in this file. */
        .arch armv7-a
        .fpu  neon
/* func name, export=0
 *
 * Opens a function called 'name':
 *   - switches to the .text section and aligns to a 4-byte boundary,
 *   - makes the symbol global when 'export' is non-zero,
 *   - on ELF targets marks the symbol as a function (STT_FUNC),
 *   - emits the .func debug directive and the entry label itself.
 *
 * It also defines a one-shot 'endfunc' macro for the same name.  endfunc
 * records the symbol size on ELF targets, emits .endfunc and then purges
 * itself, so that the next use of 'func' can define endfunc afresh.  Every
 * 'func' must therefore be closed by exactly one 'endfunc'.
 */
.macro  func    name, export=0
    .macro endfunc
ELF     .size   \name, . - \name
        .endfunc
        .purgem endfunc
    .endm
        .text

        /* Explicitly specifying alignment here because some versions of
         * GAS don't align code correctly.  This is harmless in correctly
         * written versions of GAS.  .p2align always takes a power of two,
         * so this requests 4-byte alignment regardless of target.
         */
        .p2align 2

    .if \export
        .global \name
    .endif
ELF     .type   \name, STT_FUNC
        .func   \name
\name:
.endm
/* png_read_filter_row_sub4_neon
 *
 * Running byte-wise sum over a row of 4-byte pixels, done in place: every
 * output pixel is the stored pixel plus the output pixel to its left
 * (modulo 256 per byte).  The pixel left of the first one is zero.
 *
 * In:    r0 = pointer to a structure whose word at byte offset 4 is the row
 *             length in bytes (presumably png_row_info - confirm in png.h)
 *        r1 = row buffer, read and rewritten in place.  It is accessed with
 *             a :128 alignment qualifier, so it must be 16-byte aligned.
 * Clobb: r1, r3, d0-d7, flags
 * Note:  handles 16 bytes (four pixels) per pass and always runs at least
 *        one pass; a length that is not a multiple of 16 is rounded up.
 */
func    png_read_filter_row_sub4_neon, export=1
        ldr             r3,  [r0, #4]           @ rowbytes
        vmov.i8         d3,  #0                 @ left neighbour of pixel 0
1:
        /* Load four pixels; pixel i is replicated into both lanes of d(4+i). */
        vld4.32         {d4[],d5[],d6[],d7[]},    [r1,:128]
        vadd.u8         d0,  d3,  d4            @ out0 = previous out3 + in0
        vadd.u8         d1,  d0,  d5            @ out1 = out0 + in1
        vadd.u8         d2,  d1,  d6            @ out2 = out1 + in2
        vadd.u8         d3,  d2,  d7            @ out3, carried into next pass
        /* Store lane 0 of each result and advance r1 by 16 bytes. */
        vst4.32         {d0[0],d1[0],d2[0],d3[0]},[r1,:128]!
        subs            r3,  r3,  #16           @ bytes still to process
        bgt             1b

        bx              lr
endfunc
/* png_read_filter_row_sub3_neon
 *
 * Running byte-wise sum over a row of 3-byte pixels, done in place: every
 * output pixel is the stored pixel plus the output pixel to its left
 * (modulo 256 per byte).  The pixel left of the first one is zero.
 *
 * In:    r0 = pointer to a structure whose word at byte offset 4 is the row
 *             length in bytes (presumably png_row_info - confirm in png.h)
 *        r1 = row buffer, read and rewritten in place.  The first store of
 *             each 12-byte group uses a :32 qualifier, so the buffer must be
 *             4-byte aligned.
 * Clobb: r0, r1, r2, r3, r12, d0-d3, d5-d7, q11 (d22-d23), flags
 *
 * Each pass handles 12 bytes (four pixels) but loads 16, and the group for
 * the next pass is loaded before the current pass finishes.  Only the low
 * three bytes of each d register are meaningful.  Every store writes four
 * bytes and then advances by three, so the stray fourth byte is overwritten
 * by the following store.
 *
 * Consequences for the caller:
 *   - reads reach up to 16 bytes past the last group processed;
 *   - the final store writes one byte past the last group processed.
 * The buffer must be padded accordingly.
 */
func    png_read_filter_row_sub3_neon, export=1
        ldr             r3,  [r0, #4]           @ rowbytes
        vmov.i8         d3,  #0                 @ left neighbour of pixel 0
        mov             r0,  r1                 @ r0 = read cursor, r1 = write cursor
        mov             r2,  #3                 @ store stride: one pixel
        mov             r12, #12                @ load stride: four pixels
        vld1.8          {q11},    [r0], r12     @ first group in d22:d23
1:
        vext.8          d5,  d22, d23, #3       @ in1 = bytes 3.. of the group
        vadd.u8         d0,  d3,  d22           @ out0 = previous out3 + in0
        vext.8          d6,  d22, d23, #6       @ in2 = bytes 6..
        vadd.u8         d1,  d0,  d5            @ out1 = out0 + in1
        vext.8          d7,  d23, d23, #1       @ in3 = bytes 9..
        /* Fetch the next group now: the last store below spills one byte
         * into it, so it has to be read before that store happens.
         */
        vld1.8          {q11},    [r0], r12
        vst1.32         {d0[0]},  [r1,:32], r2
        vadd.u8         d2,  d1,  d6            @ out2 = out1 + in2
        vst1.32         {d1[0]},  [r1], r2
        vadd.u8         d3,  d2,  d7            @ out3, carried into next pass
        vst1.32         {d2[0]},  [r1], r2
        vst1.32         {d3[0]},  [r1], r2
        subs            r3,  r3,  #12           @ bytes still to process
        bgt             1b

        bx              lr
endfunc
/* png_read_filter_row_up_neon
 *
 * Adds the buffer at r2 to the buffer at r1, byte by byte (modulo 256), and
 * writes the result back to r1.
 *
 * In:    r0 = pointer to a structure whose word at byte offset 4 is the row
 *             length in bytes (presumably png_row_info - confirm in png.h)
 *        r1 = row buffer, read and rewritten in place
 *        r2 = second buffer, read only (presumably the previous row -
 *             confirm against the caller)
 *        Both buffers are accessed with a :128 alignment qualifier, so they
 *        must be 16-byte aligned.
 * Clobb: r1, r2, r3, q0, q1, flags
 * Note:  handles 16 bytes per pass and always runs at least one pass; a
 *        length that is not a multiple of 16 is rounded up.
 */
func    png_read_filter_row_up_neon, export=1
        ldr             r3,  [r0, #4]           @ rowbytes
1:
        vld1.8          {q0}, [r1,:128]         @ 16 bytes of the row
        vld1.8          {q1}, [r2,:128]!        @ 16 bytes of the other buffer
        vadd.u8         q0,  q0,  q1
        vst1.8          {q0}, [r1,:128]!        @ write back, r1 += 16
        subs            r3,  r3,  #16           @ bytes still to process
        bgt             1b

        bx              lr
endfunc
/* png_read_filter_row_avg4_neon
 *
 * In-place update of a row of 4-byte pixels.  For every pixel:
 *     out = in + ((left + up) >> 1)          (per byte, modulo 256)
 * where 'left' is the output pixel to the left (zero for the first pixel)
 * and 'up' is the pixel at the same position in the buffer at r2.  The
 * halving add keeps the full 9-bit sum before shifting.
 *
 * In:    r0 = pointer to a structure whose word at byte offset 4 is the row
 *             length in bytes (presumably png_row_info - confirm in png.h)
 *        r1 = row buffer, read and rewritten in place
 *        r2 = second buffer, read only (presumably the previous row -
 *             confirm against the caller)
 *        Both buffers are accessed with a :128 alignment qualifier, so they
 *        must be 16-byte aligned.
 * Clobb: r1, r2, r12, d0-d7, d16-d19, flags
 * Note:  handles 16 bytes (four pixels) per pass and always runs at least
 *        one pass; a length that is not a multiple of 16 is rounded up.
 */
func    png_read_filter_row_avg4_neon, export=1
        ldr             r12, [r0, #4]           @ rowbytes
        vmov.i8         d3,  #0                 @ left neighbour of pixel 0
1:
        /* d4..d7 = four pixels of the row, d16..d19 = the four pixels above;
         * each pixel is replicated into both lanes of its register.
         */
        vld4.32         {d4[],d5[],d6[],d7[]},    [r1,:128]
        vld4.32         {d16[],d17[],d18[],d19[]},[r2,:128]!
        vhadd.u8        d0,  d3,  d16           @ (left + up) >> 1
        vadd.u8         d0,  d0,  d4            @ out0
        vhadd.u8        d1,  d0,  d17
        vadd.u8         d1,  d1,  d5            @ out1
        vhadd.u8        d2,  d1,  d18
        vadd.u8         d2,  d2,  d6            @ out2
        vhadd.u8        d3,  d2,  d19
        vadd.u8         d3,  d3,  d7            @ out3, carried into next pass
        /* Store lane 0 of each result and advance r1 by 16 bytes. */
        vst4.32         {d0[0],d1[0],d2[0],d3[0]},[r1,:128]!
        subs            r12, r12, #16           @ bytes still to process
        bgt             1b

        bx              lr
endfunc
/* png_read_filter_row_avg3_neon
 *
 * In-place update of a row of 3-byte pixels.  For every pixel:
 *     out = in + ((left + up) >> 1)          (per byte, modulo 256)
 * where 'left' is the output pixel to the left (zero for the first pixel)
 * and 'up' is the pixel at the same position in the buffer at r2.
 *
 * In:    r0 = pointer to a structure whose word at byte offset 4 is the row
 *             length in bytes (presumably png_row_info - confirm in png.h)
 *        r1 = row buffer, read and rewritten in place.  The first store of
 *             each 12-byte group uses a :32 qualifier, so the buffer must be
 *             4-byte aligned.
 *        r2 = second buffer, read only (presumably the previous row -
 *             confirm against the caller)
 * Clobb: r0, r1, r2, r12, d0-d3, d5-d7, d17-d19, q10, q11, flags
 *        (r4 and lr are used as scratch but saved and restored)
 *
 * Each pass handles 12 bytes (four pixels) but loads 16 from both buffers,
 * and the row group for the next pass is loaded before the current pass
 * finishes.  Only the low three bytes of each d register are meaningful.
 * Every store writes four bytes and then advances by three, so the stray
 * fourth byte is overwritten by the following store.
 *
 * Consequences for the caller:
 *   - reads of the row reach up to 16 bytes past the last group processed;
 *   - reads of the r2 buffer reach 4 bytes past the last group processed;
 *   - the final store writes one byte past the last group processed.
 * Both buffers must be padded accordingly.
 */
func    png_read_filter_row_avg3_neon, export=1
        push            {r4,lr}                 @ r4 is callee-saved, lr is reused
        ldr             r12, [r0, #4]           @ rowbytes
        vmov.i8         d3,  #0                 @ left neighbour of pixel 0
        mov             r0,  r1                 @ r0 = read cursor, r1 = write cursor
        mov             r4,  #3                 @ store stride: one pixel
        mov             lr,  #12                @ load stride: four pixels
        vld1.8          {q11},    [r0], lr      @ first row group in d22:d23
1:
        vld1.8          {q10},    [r2], lr      @ pixels above in d20:d21
        vext.8          d5,  d22, d23, #3       @ in1
        vhadd.u8        d0,  d3,  d20           @ (left + up0) >> 1
        vext.8          d17, d20, d21, #3       @ up1
        vadd.u8         d0,  d0,  d22           @ out0
        vext.8          d6,  d22, d23, #6       @ in2
        vhadd.u8        d1,  d0,  d17
        vext.8          d18, d20, d21, #6       @ up2
        vadd.u8         d1,  d1,  d5            @ out1
        vext.8          d7,  d23, d23, #1       @ in3
        /* Fetch the next row group now: the last store below spills one
         * byte into it, so it has to be read before that store happens.
         */
        vld1.8          {q11},    [r0], lr
        vst1.32         {d0[0]},  [r1,:32], r4
        vhadd.u8        d2,  d1,  d18
        vst1.32         {d1[0]},  [r1], r4
        vext.8          d19, d21, d21, #1       @ up3
        vadd.u8         d2,  d2,  d6            @ out2
        vhadd.u8        d3,  d2,  d19
        vst1.32         {d2[0]},  [r1], r4
        vadd.u8         d3,  d3,  d7            @ out3, carried into next pass
        vst1.32         {d3[0]},  [r1], r4
        subs            r12, r12, #12           @ bytes still to process
        bgt             1b

        pop             {r4,pc}                 @ restore r4 and return
endfunc
/* paeth rx, ra, rb, rc
 *
 * Per-byte three-way predictor over 64-bit (d) registers.  With
 *     pa = |b - c|,  pb = |a - c|,  pc = |a + b - 2*c|
 * computed in 16 bits so nothing overflows, each byte of rx becomes
 *     a  if pa <= pb and pa <= pc,
 *     b  else if pb <= pc,
 *     c  otherwise.
 *
 * In:    ra, rb, rc = d registers holding a, b and c
 * Out:   rx = d register receiving the selected bytes
 * Clobb: q12-q15 (d24-d31)
 * Note:  rx is written before ra, rb and rc are read for the last time, so
 *        rx must be a different register from all three inputs.  None of
 *        the operands may lie in d24-d31.
 */
.macro  paeth           rx,  ra,  rb,  rc
        vaddl.u8        q12, \ra, \rb           @ a + b
        vaddl.u8        q15, \rc, \rc           @ 2*c
        vabdl.u8        q13, \rb, \rc           @ pa
        vabdl.u8        q14, \ra, \rc           @ pb
        vabd.u16        q15, q12, q15           @ pc
        vcle.u16        q12, q13, q14           @ pa <= pb
        vcle.u16        q13, q13, q15           @ pa <= pc
        vcle.u16        q14, q14, q15           @ pb <= pc
        vand            q12, q12, q13           @ pa <= pb && pa <= pc
        vmovn.u16       d28, q14                @ narrow the masks to bytes
        vmovn.u16       \rx, q12
        vbsl            d28, \rb, \rc           @ d28 = (pb <= pc) ? b : c
        vbsl            \rx, \ra, d28           @ rx = (a wins) ? a : d28
.endm
/* png_read_filter_row_paeth4_neon
 *
 * In-place update of a row of 4-byte pixels.  For every pixel:
 *     out = in + paeth(left, up, upleft)     (per byte, modulo 256)
 * where 'left' is the output pixel to the left, 'up' is the pixel at the
 * same position in the buffer at r2 and 'upleft' is the pixel before 'up'.
 * 'left' and 'upleft' are zero for the first pixel.  The predictor itself
 * is the 'paeth' macro.
 *
 * In:    r0 = pointer to a structure whose word at byte offset 4 is the row
 *             length in bytes (presumably png_row_info - confirm in png.h)
 *        r1 = row buffer, read and rewritten in place
 *        r2 = second buffer, read only (presumably the previous row -
 *             confirm against the caller)
 *        Both buffers are accessed with a :128 alignment qualifier, so they
 *        must be 16-byte aligned.
 * Clobb: r1, r2, r12, d0-d7, d16-d20, q12-q15, flags
 * Note:  handles 16 bytes (four pixels) per pass and always runs at least
 *        one pass; a length that is not a multiple of 16 is rounded up.
 */
func    png_read_filter_row_paeth4_neon, export=1
        ldr             r12, [r0, #4]           @ rowbytes
        vmov.i8         d3,  #0                 @ left neighbour of pixel 0
        vmov.i8         d20, #0                 @ upper-left neighbour of pixel 0
1:
        /* d4..d7 = four pixels of the row, d16..d19 = the four pixels above;
         * each pixel is replicated into both lanes of its register.
         */
        vld4.32         {d4[],d5[],d6[],d7[]},    [r1,:128]
        vld4.32         {d16[],d17[],d18[],d19[]},[r2,:128]!
        paeth           d0,  d3,  d16, d20
        vadd.u8         d0,  d0,  d4            @ out0
        paeth           d1,  d0,  d17, d16
        vadd.u8         d1,  d1,  d5            @ out1
        paeth           d2,  d1,  d18, d17
        vadd.u8         d2,  d2,  d6            @ out2
        paeth           d3,  d2,  d19, d18
        vmov            d20, d19                @ up3 is upleft of the next pass
        vadd.u8         d3,  d3,  d7            @ out3, carried into next pass
        /* Store lane 0 of each result and advance r1 by 16 bytes. */
        vst4.32         {d0[0],d1[0],d2[0],d3[0]},[r1,:128]!
        subs            r12, r12, #16           @ bytes still to process
        bgt             1b

        bx              lr
endfunc
/* png_read_filter_row_paeth3_neon
 *
 * In-place update of a row of 3-byte pixels.  For every pixel:
 *     out = in + paeth(left, up, upleft)     (per byte, modulo 256)
 * where 'left' is the output pixel to the left, 'up' is the pixel at the
 * same position in the buffer at r2 and 'upleft' is the pixel before 'up'.
 * 'left' and 'upleft' are zero for the first pixel.  The predictor itself
 * is the 'paeth' macro.
 *
 * In:    r0 = pointer to a structure whose word at byte offset 4 is the row
 *             length in bytes (presumably png_row_info - confirm in png.h)
 *        r1 = row buffer, read and rewritten in place.  The first store of
 *             each 12-byte group uses a :32 qualifier, so the buffer must be
 *             4-byte aligned.
 *        r2 = second buffer, read only (presumably the previous row -
 *             confirm against the caller)
 * Clobb: r0, r1, r2, r12, d0-d7, d17-d19, q10, q11, q12-q15, flags
 *        (r4 and lr are used as scratch but saved and restored)
 *
 * Each pass handles 12 bytes (four pixels) but loads 16 from both buffers,
 * and the row group for the next pass is loaded before the current pass
 * finishes.  Only the low three bytes of each d register are meaningful.
 * Every store writes four bytes and then advances by three, so the stray
 * fourth byte is overwritten by the following store.
 *
 * Consequences for the caller:
 *   - reads of the row reach up to 16 bytes past the last group processed;
 *   - reads of the r2 buffer reach 4 bytes past the last group processed;
 *   - the final store writes one byte past the last group processed.
 * Both buffers must be padded accordingly.
 */
func    png_read_filter_row_paeth3_neon, export=1
        push            {r4,lr}                 @ r4 is callee-saved, lr is reused
        ldr             r12, [r0, #4]           @ rowbytes
        vmov.i8         d3,  #0                 @ left neighbour of pixel 0
        vmov.i8         d4,  #0                 @ upper-left neighbour of pixel 0
        mov             r0,  r1                 @ r0 = read cursor, r1 = write cursor
        mov             r4,  #3                 @ store stride: one pixel
        mov             lr,  #12                @ load stride: four pixels
        vld1.8          {q11},    [r0], lr      @ first row group in d22:d23
1:
        vld1.8          {q10},    [r2], lr      @ pixels above in d20:d21
        paeth           d0,  d3,  d20, d4
        vext.8          d5,  d22, d23, #3       @ in1
        vadd.u8         d0,  d0,  d22           @ out0
        vext.8          d17, d20, d21, #3       @ up1
        paeth           d1,  d0,  d17, d20
        vst1.32         {d0[0]},  [r1,:32], r4
        vext.8          d6,  d22, d23, #6       @ in2
        vadd.u8         d1,  d1,  d5            @ out1
        vext.8          d18, d20, d21, #6       @ up2
        paeth           d2,  d1,  d18, d17
        vext.8          d7,  d23, d23, #1       @ in3
        /* Fetch the next row group now: the last store below spills one
         * byte into it, so it has to be read before that store happens.
         */
        vld1.8          {q11},    [r0], lr
        vst1.32         {d1[0]},  [r1], r4
        vadd.u8         d2,  d2,  d6            @ out2
        vext.8          d19, d21, d21, #1       @ up3
        paeth           d3,  d2,  d19, d18
        vst1.32         {d2[0]},  [r1], r4
        vmov            d4,  d19                @ up3 is upleft of the next pass
        vadd.u8         d3,  d3,  d7            @ out3, carried into next pass
        vst1.32         {d3[0]},  [r1], r4
        subs            r12, r12, #12           @ bytes still to process
        bgt             1b

        pop             {r4,pc}                 @ restore r4 and return
endfunc
#endif /* PNG_ARM_NEON_OPT > 0 */
#endif /* PNG_ARM_NEON_IMPLEMENTATION == 2 (assembler) */
#endif /* PNG_READ_SUPPORTED */