shun-iwasawa 82a8f5
;
shun-iwasawa 82a8f5
; jsimdcpu.asm - SIMD instruction support check
shun-iwasawa 82a8f5
;
shun-iwasawa 82a8f5
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
shun-iwasawa 82a8f5
; Copyright (C) 2016, D. R. Commander.
shun-iwasawa 82a8f5
;
shun-iwasawa 82a8f5
; Based on
shun-iwasawa 82a8f5
; x86 SIMD extension for IJG JPEG library
shun-iwasawa 82a8f5
; Copyright (C) 1999-2006, MIYASAKA Masaru.
shun-iwasawa 82a8f5
; For conditions of distribution and use, see copyright notice in jsimdext.inc
shun-iwasawa 82a8f5
;
shun-iwasawa 82a8f5
; This file should be assembled with NASM (Netwide Assembler),
shun-iwasawa 82a8f5
; can *not* be assembled with Microsoft's MASM or any compatible
shun-iwasawa 82a8f5
; assembler (including Borland's Turbo Assembler).
shun-iwasawa 82a8f5
; NASM is available from http://nasm.sourceforge.net/ or
shun-iwasawa 82a8f5
; http://sourceforge.net/project/showfiles.php?group_id=6208
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
%include "jsimdext.inc"
shun-iwasawa 82a8f5
shun-iwasawa 82a8f5
; --------------------------------------------------------------------------
shun-iwasawa 82a8f5
    SECTION     SEG_TEXT
    BITS        64

;
; Report which SIMD instruction sets this machine can use.
;
; GLOBAL(unsigned int)
; jpeg_simd_cpu_support(void)
;
; Out:   rax = bitwise OR of JSIMD_* flags.  JSIMD_SSE and JSIMD_SSE2 are
;        always set.  JSIMD_AVX2 is added only when every check below passes:
;          1. CPUID reports a maximum basic leaf of at least 07H
;          2. CPUID leaf 07H (sub-leaf 0) sets EBX bit 5 (AVX2)
;          3. CPUID leaf 01H sets ECX bit 27 (OSXSAVE) and bit 28 (AVX)
;          4. XGETBV (ECX = 0) reports bits 1 and 2 both set, i.e. the O/S
;             manages XMM and YMM state using XSAVE
; Saved: rbx (overwritten by CPUID) and rdi (used as the flag accumulator)
;        are pushed on entry and restored before returning.
; Clobb: rcx, rdx, flags
;
; 32-bit register forms are used for the CPUID/XGETBV operands and results;
; writing a 32-bit register zeroes the upper half of the 64-bit register.
;

    align       32
    GLOBAL_FUNCTION(jpeg_simd_cpu_support)

EXTN(jpeg_simd_cpu_support):
    push        rbx
    push        rdi

    ; rdi = result.  All x86-64 processors are assumed to support SSE & SSE2.
    xor         edi, edi
    or          rdi, JSIMD_SSE | JSIMD_SSE2

    ; Leaf 00H: eax = highest basic CPUID leaf supported.
    xor         eax, eax
    cpuid
    cmp         eax, 7
    jb          short .return           ; leaf 07H unavailable, so no AVX2

    ; Leaf 07H, sub-leaf 0: ebx = extended feature flags.
    mov         eax, 7
    xor         ecx, ecx
    cpuid
    test        ebx, 1<<5               ; bit 5: AVX2
    jz          short .return

    ; Leaf 01H: ecx = feature flags.  Both bits below must be set.
    mov         eax, 1
    xor         ecx, ecx
    cpuid
    and         ecx, (1<<27) | (1<<28)  ; bit 27: OSXSAVE, bit 28: AVX
    cmp         ecx, (1<<27) | (1<<28)
    jne         short .return

    ; XGETBV with ecx = 0: eax bits 1 and 2 must both be set, otherwise the
    ; O/S does not manage XMM/YMM state using XSAVE.
    xor         ecx, ecx
    xgetbv
    and         eax, 6
    cmp         eax, 6
    jne         short .return

    or          rdi, JSIMD_AVX2

.return:
    mov         rax, rdi                ; return the accumulated flags
    pop         rdi
    pop         rbx
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32