;
; jsimdcpu.asm - SIMD instruction support check
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
; Everything below is 32-bit code placed in the section named by SEG_TEXT
; (SEG_TEXT is not defined in this file -- presumably it comes from
; jsimdext.inc).
    SECTION     SEG_TEXT
    BITS        32
;
; Check if the CPU supports SIMD instructions
;
; GLOBAL(unsigned int)
; jpeg_simd_cpu_support(void)
;
; Returns in eax the bitwise OR of the JSIMD_* flags (JSIMD_MMX, JSIMD_3DNOW,
; JSIMD_SSE, JSIMD_SSE2, JSIMD_AVX2) for each instruction set that was
; detected.  The result is 0 if the CPUID instruction is unavailable or if
; CPUID reports a maximum standard leaf of 0.
;
; Registers: ebx and edi are saved and restored; eax, ecx, edx and the
; arithmetic flags are clobbered.  edi accumulates the result.
; NOTE: when CPUID is available, the EFLAGS ID bit is left toggled on return.
; (The JSIMD_* constants and the EXTN/GLOBAL_FUNCTION macros are not defined
; in this file -- presumably they come from jsimdext.inc.)
;

    align       32
    GLOBAL_FUNCTION(jpeg_simd_cpu_support)

EXTN(jpeg_simd_cpu_support):
    push        ebx                     ; CPUID overwrites ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
;   push        esi                     ; unused
    push        edi

    xor         edi, edi                ; simd support flag

    ; CPUID exists only if software is able to toggle the ID bit (bit 21)
    ; of EFLAGS, so try to flip it and see whether the change sticks.
    pushfd
    pop         eax
    mov         edx, eax                ; edx = original EFLAGS
    xor         eax, 1<<21              ; flip ID bit in EFLAGS
    push        eax
    popfd
    pushfd
    pop         eax                     ; eax = EFLAGS as actually stored
    xor         eax, edx                ; nonzero only if the ID bit changed
    jz          near .return            ; CPUID is not supported

    ; Check whether CPUID leaf 07H is supported
    ; (leaf 07H is used to check for AVX2 instruction support)
    xor         eax, eax
    cpuid                               ; eax = maximum standard leaf
    test        eax, eax
    jz          near .return            ; not even leaf 01H is available
    cmp         eax, 7
    jb          short .no_avx2          ; Maximum leaf < 07H (the leaf number
                                        ; is unsigned, so compare unsigned)

    ; Check for AVX2 instruction support
    mov         eax, 7
    xor         ecx, ecx                ; sub-leaf 0
    cpuid
    mov         eax, ebx
    test        eax, 1<<5               ; bit5:AVX2
    jz          short .no_avx2

    ; Check for AVX2 O/S support
    mov         eax, 1
    xor         ecx, ecx
    cpuid
    test        ecx, 1<<27              ; bit27:OSXSAVE
    jz          short .no_avx2          ; O/S has not enabled XSAVE/XGETBV
    test        ecx, 1<<28              ; bit28:AVX
    jz          short .no_avx2          ; CPU does not support AVX

    xor         ecx, ecx                ; select XCR0
    xgetbv
    and         eax, 6                  ; keep bit1 (XMM state) and
                                        ; bit2 (YMM state)
    cmp         eax, 6
    jnz         short .no_avx2          ; O/S does not manage XMM/YMM state
                                        ; using XSAVE

    or          edi, JSIMD_AVX2
.no_avx2:

    ; Check CPUID leaf 01H for MMX, SSE, and SSE2 support
    xor         eax, eax
    inc         eax
    cpuid
    mov         eax, edx                ; eax = Standard feature flags

    ; Check for MMX instruction support
    test        eax, 1<<23              ; bit23:MMX
    jz          short .no_mmx
    or          edi, byte JSIMD_MMX
.no_mmx:
    ; Check for SSE instruction support
    test        eax, 1<<25              ; bit25:SSE
    jz          short .no_sse
    or          edi, byte JSIMD_SSE
.no_sse:
    ; Check for SSE2 instruction support
    test        eax, 1<<26              ; bit26:SSE2
    jz          short .no_sse2
    or          edi, byte JSIMD_SSE2
.no_sse2:

    ; Check for 3DNow! instruction support
    mov         eax, 0x80000000
    cpuid                               ; eax = maximum extended leaf
    cmp         eax, 0x80000000
    jbe         short .return           ; leaf 80000001H is not available

    mov         eax, 0x80000001
    cpuid
    mov         eax, edx                ; eax = Extended feature flags

    test        eax, 1<<31              ; bit31:3DNow!(vendor independent)
    jz          short .no_3dnow
    or          edi, byte JSIMD_3DNOW
.no_3dnow:

.return:
    mov         eax, edi                ; return the accumulated flags

    pop         edi
;   pop         esi                     ; unused
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    ret

; Pad the end of the segment to a 32-byte boundary.  For some reason, the
; OS X linker does not honor the request to align the segment unless we do
; this.
    align       32