/*
 * MIPS DSPr2 optimizations for libjpeg-turbo
 *
 * Copyright (C) 2013, MIPS Technologies, Inc., California.
 * Copyright (C) 2018, Matthieu Darbois.
 * All Rights Reserved.
 * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
 *           Darko Laus       (darko.laus@imgtec.com)
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/*
 * Symbolic names for the 32 general-purpose registers ($0-$31).
 * fp and s8 are two names for the same register ($30).
 */
#define zero  $0
#define AT    $1
#define v0    $2
#define v1    $3
#define a0    $4
#define a1    $5
#define a2    $6
#define a3    $7
#define t0    $8
#define t1    $9
#define t2    $10
#define t3    $11
#define t4    $12
#define t5    $13
#define t6    $14
#define t7    $15
#define s0    $16
#define s1    $17
#define s2    $18
#define s3    $19
#define s4    $20
#define s5    $21
#define s6    $22
#define s7    $23
#define t8    $24
#define t9    $25
#define k0    $26
#define k1    $27
#define gp    $28
#define sp    $29
#define fp    $30
#define s8    $30
#define ra    $31

/* Symbolic names for the floating-point registers $f0-$f31. */
#define f0    $f0
#define f1    $f1
#define f2    $f2
#define f3    $f3
#define f4    $f4
#define f5    $f5
#define f6    $f6
#define f7    $f7
#define f8    $f8
#define f9    $f9
#define f10   $f10
#define f11   $f11
#define f12   $f12
#define f13   $f13
#define f14   $f14
#define f15   $f15
#define f16   $f16
#define f17   $f17
#define f18   $f18
#define f19   $f19
#define f20   $f20
#define f21   $f21
#define f22   $f22
#define f23   $f23
#define f24   $f24
#define f25   $f25
#define f26   $f26
#define f27   $f27
#define f28   $f28
#define f29   $f29
#define f30   $f30
#define f31   $f31

/*
 * HIDDEN_SYMBOL - emit a ".hidden" directive for symbol when the target
 * object format is ELF (__ELF__ defined); expands to nothing otherwise.
 * The expansion carries its own trailing ';' so it can be dropped into a
 * single-line statement sequence without an extra separator.
 */
#ifdef __ELF__
#define HIDDEN_SYMBOL(symbol)  .hidden symbol;
#else
#define HIDDEN_SYMBOL(symbol)
#endif

/*
 * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
 *
 * Exports symbol (marked hidden on ELF through HIDDEN_SYMBOL), aligns it
 * with ".align 2", tags it as a function, opens the procedure with
 * .ent/.frame and defines the label.  The current assembler option state is
 * then pushed before mips32r2, noreorder and noat are selected; END() pops
 * that state again, so every use of this macro must be closed with END().
 */
#define LEAF_MIPS32R2(symbol) \
    .globl      symbol; \
    HIDDEN_SYMBOL(symbol) \
    .align      2; \
    .type       symbol, @function; \
    .ent        symbol, 0; \
symbol: \
    .frame      sp, 0, ra; \
    .set        push; \
    .set        arch = mips32r2; \
    .set        noreorder; \
    .set        noat;

/*
 * LEAF_DSPR2 - declare leaf routine for MIPS DSPr2
 *
 * Expands to LEAF_MIPS32R2(symbol) followed by ".set dspr2", which is
 * issued after the option state has been pushed and is therefore undone by
 * the ".set pop" in END().
 */
#define LEAF_DSPR2(symbol) \
LEAF_MIPS32R2(symbol) \
    .set        dspr2;

/*
 * END - mark end of function
 *
 * Pops the assembler option state (the matching ".set push" is issued by
 * LEAF_MIPS32R2), closes the procedure with .end and records the size of
 * function as the distance from its label to the current location.
 */
#define END(function) \
    .set        pop; \
    .end        function; \
    .size       function, .-function

/*
 * Checks if stack offset is big enough for storing/restoring regs_num
 * registers to/from stack. Stack offset must be greater than or equal to
 * the number of bytes needed for storing registers (regs_num*4) minus 16.
 * The 16 bytes are subtracted because the MIPS ABI allows usage of the
 * first 16 bytes of the stack frame (this area is reserved for the input
 * arguments of the function, which are already held in a0-a3), so the
 * stack size can be further optimized by utilizing this space.
 *
 * Raises an assembly-time error when the offset is too small; emits no code.
 */
.macro CHECK_STACK_OFFSET regs_num, stack_offset
.if \stack_offset < \regs_num * 4 - 16
.error "Stack offset too small."
.endif
.endm

/*
 * Saves set of registers on stack. Maximum number of registers that
 * can be saved on stack is limited to 14 (a0-a3, v0-v1 and s0-s7).
 * Stack offset is number of bytes that are subtracted from stack pointer
 * (sp) before registers are pushed in order to provide enough space on
 * stack (offset must be non-negative, a multiple of 4, and big enough, as
 * described by CHECK_STACK_OFFSET macro).
 *
 * r1 is mandatory; r2-r14 default to 0 and are skipped when left out.
 * Register rN is stored at (N-1)*4(sp) relative to the adjusted sp.  The
 * first four slots are never checked against stack_offset, and later slots
 * may exceed it by up to 16 bytes: the overflow lands in the 16-byte
 * argument area described at CHECK_STACK_OFFSET.
 *
 * Modifies: sp (when stack_offset != 0) and the stack slots written.
 *
 * This macro is intended to be used in combination with
 * RESTORE_REGS_FROM_STACK macro. Example:
 *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
 *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
 */
.macro SAVE_REGS_ON_STACK  stack_offset = 0, r1, \
                           r2  = 0, r3  = 0, r4  = 0, \
                           r5  = 0, r6  = 0, r7  = 0, \
                           r8  = 0, r9  = 0, r10 = 0, \
                           r11 = 0, r12 = 0, r13 = 0, \
                           r14 = 0
/* Reject negative offsets and offsets whose two low bits are set. */
.if (\stack_offset < 0) || (\stack_offset & 3)
    .error "Stack offset must be non-negative and a multiple of 4."
.endif
.if \stack_offset != 0
    addiu       sp, sp, -\stack_offset
.endif
    sw          \r1, 0(sp)
.if \r2 != 0
    sw          \r2, 4(sp)
.endif
.if \r3 != 0
    sw          \r3, 8(sp)
.endif
.if \r4 != 0
    sw          \r4, 12(sp)
.endif
/* From the fifth register on, the frame size has to be validated. */
.if \r5 != 0
    CHECK_STACK_OFFSET 5, \stack_offset
    sw          \r5, 16(sp)
.endif
.if \r6 != 0
    CHECK_STACK_OFFSET 6, \stack_offset
    sw          \r6, 20(sp)
.endif
.if \r7 != 0
    CHECK_STACK_OFFSET 7, \stack_offset
    sw          \r7, 24(sp)
.endif
.if \r8 != 0
    CHECK_STACK_OFFSET 8, \stack_offset
    sw          \r8, 28(sp)
.endif
.if \r9 != 0
    CHECK_STACK_OFFSET 9, \stack_offset
    sw          \r9, 32(sp)
.endif
.if \r10 != 0
    CHECK_STACK_OFFSET 10, \stack_offset
    sw          \r10, 36(sp)
.endif
.if \r11 != 0
    CHECK_STACK_OFFSET 11, \stack_offset
    sw          \r11, 40(sp)
.endif
.if \r12 != 0
    CHECK_STACK_OFFSET 12, \stack_offset
    sw          \r12, 44(sp)
.endif
.if \r13 != 0
    CHECK_STACK_OFFSET 13, \stack_offset
    sw          \r13, 48(sp)
.endif
.if \r14 != 0
    CHECK_STACK_OFFSET 14, \stack_offset
    sw          \r14, 52(sp)
.endif
.endm

/*
 * Restores set of registers from stack. Maximum number of registers that
 * can be restored from stack is limited to 14 (a0-a3, v0-v1 and s0-s7).
 * Stack offset is number of bytes that are added to stack pointer (sp)
 * after registers are restored (offset must be non-negative, a multiple of
 * 4, and big enough, as described by CHECK_STACK_OFFSET macro).
 *
 * r1 is mandatory; r2-r14 default to 0 and are skipped when left out.
 * Register rN is loaded from (N-1)*4(sp), i.e. from the slot that
 * SAVE_REGS_ON_STACK uses for the same argument position, so both macros
 * must be invoked with the same offset and the same register list.
 *
 * Modifies: the listed registers and sp (when stack_offset != 0).
 *
 * This macro is intended to be used in combination with
 * SAVE_REGS_ON_STACK macro. Example:
 *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
 *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
 */
.macro RESTORE_REGS_FROM_STACK  stack_offset = 0, r1, \
                                r2  = 0, r3  = 0, r4  = 0, \
                                r5  = 0, r6  = 0, r7  = 0, \
                                r8  = 0, r9  = 0, r10 = 0, \
                                r11 = 0, r12 = 0, r13 = 0, \
                                r14 = 0
/* Reject negative offsets and offsets whose two low bits are set. */
.if (\stack_offset < 0) || (\stack_offset & 3)
    .error "Stack offset must be non-negative and a multiple of 4."
.endif
    lw          \r1, 0(sp)
.if \r2 != 0
    lw          \r2, 4(sp)
.endif
.if \r3 != 0
    lw          \r3, 8(sp)
.endif
.if \r4 != 0
    lw          \r4, 12(sp)
.endif
/* From the fifth register on, the frame size has to be validated. */
.if \r5 != 0
    CHECK_STACK_OFFSET 5, \stack_offset
    lw          \r5, 16(sp)
.endif
.if \r6 != 0
    CHECK_STACK_OFFSET 6, \stack_offset
    lw          \r6, 20(sp)
.endif
.if \r7 != 0
    CHECK_STACK_OFFSET 7, \stack_offset
    lw          \r7, 24(sp)
.endif
.if \r8 != 0
    CHECK_STACK_OFFSET 8, \stack_offset
    lw          \r8, 28(sp)
.endif
.if \r9 != 0
    CHECK_STACK_OFFSET 9, \stack_offset
    lw          \r9, 32(sp)
.endif
.if \r10 != 0
    CHECK_STACK_OFFSET 10, \stack_offset
    lw          \r10, 36(sp)
.endif
.if \r11 != 0
    CHECK_STACK_OFFSET 11, \stack_offset
    lw          \r11, 40(sp)
.endif
.if \r12 != 0
    CHECK_STACK_OFFSET 12, \stack_offset
    lw          \r12, 44(sp)
.endif
.if \r13 != 0
    CHECK_STACK_OFFSET 13, \stack_offset
    lw          \r13, 48(sp)
.endif
.if \r14 != 0
    CHECK_STACK_OFFSET 14, \stack_offset
    lw          \r14, 52(sp)
.endif
/* Release the frame only after every register has been reloaded. */
.if \stack_offset != 0
    addiu       sp, sp, \stack_offset
.endif
.endm