Blob Blame Raw
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

   1. Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

   2. Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
   3. Neither the name of the ISCAS nor the names of its contributors may 
      be used to endorse or promote products derived from this software 
      without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

**********************************************************************************/

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

//#undef  DEBUG

#include "common.h"
#include <errno.h>

#ifdef OS_WINDOWS
#define ALLOC_WINDOWS
#ifndef MEM_LARGE_PAGES
#define MEM_LARGE_PAGES  0x20000000
#endif
#else
#define ALLOC_MMAP
#define ALLOC_MALLOC
#endif

#include <stdlib.h>
#include <stdio.h>
#include <fcntl.h>

#ifndef OS_WINDOWS
#include <sys/mman.h>
#include <sys/shm.h>
#include <sys/ipc.h>
#endif

#include <sys/types.h>

#ifdef OS_LINUX
#include <sys/sysinfo.h>
#include <sched.h>
#include <errno.h>
#include <linux/unistd.h>
#include <sys/syscall.h>
#endif

#if defined(OS_FreeBSD) || defined(OS_Darwin)
#include <sys/sysctl.h>
#endif

#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
#include <conio.h>
#undef  printf
#define printf	_cprintf
#endif

#ifdef OS_LINUX

#ifndef MPOL_PREFERRED
#define MPOL_PREFERRED  1
#endif

#endif

#if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
#define NO_WARMUP
#endif

#ifdef ALLOC_HUGETLB
#define SHM_HUGETLB 04000
#endif

#ifndef FIXED_PAGESIZE
#define FIXED_PAGESIZE 4096
#endif

#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))

#define CONSTRUCTOR	__attribute__ ((constructor)) 
#define DESTRUCTOR	__attribute__ ((destructor)) 

#ifdef DYNAMIC_ARCH
gotoblas_t *gotoblas = NULL;
#endif

#ifndef SMP

#define blas_cpu_number 1
#define blas_num_threads 1

/* Dummy Function */
int  goto_get_num_procs  (void) { return 1;};
void goto_set_num_threads(int num_threads) {};

#else

#ifdef OS_LINUX
#ifndef NO_AFFINITY
int get_num_procs(void);
#else
int get_num_procs(void) {
  static int nums = 0;
  if (!nums) nums = get_nprocs();
  return nums;
}
#endif
#endif

#ifdef OS_WINDOWS

int get_num_procs(void) {
  
  static int nums = 0;

  if (nums == 0) {

    SYSTEM_INFO sysinfo;
    
    GetSystemInfo(&sysinfo);

    nums = sysinfo.dwNumberOfProcessors;
  }
  
  return nums;
}

#endif

#if defined(OS_FreeBSD) || defined(OS_Darwin)

int get_num_procs(void) {
  
  static int nums = 0;

  int m[2];
  size_t len;
  
  if (nums == 0) {
    m[0] = CTL_HW;
    m[1] = HW_NCPU;
    len = sizeof(int);
    sysctl(m, 2, &nums, &len, NULL, 0);
  }

  return nums;
}

#endif

int blas_cpu_number  = 0;
int blas_num_threads = 0;

int  goto_get_num_procs  (void) {
  return blas_cpu_number;
}

int blas_get_cpu_number(void){
  char *p;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin)
  int max_num;
#endif
  int blas_goto_num   = 0;
  int blas_omp_num    = 0;

  if (blas_num_threads) return blas_num_threads;

#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin)
  max_num = get_num_procs();
#endif

  blas_goto_num = 0;
#ifndef USE_OPENMP
  p = getenv("OPENBLAS_NUM_THREADS");
  if (p) blas_goto_num = atoi(p);
  if (blas_goto_num < 0) blas_goto_num = 0;

  if (blas_goto_num == 0) {
	    p = getenv("GOTO_NUM_THREADS");
		if (p) blas_goto_num = atoi(p);
		if (blas_goto_num < 0) blas_goto_num = 0;
  }
  
#endif

  blas_omp_num = 0;
  p = getenv("OMP_NUM_THREADS");
  if (p) blas_omp_num = atoi(p);
  if (blas_omp_num < 0) blas_omp_num = 0;

  if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
  else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
  else blas_num_threads = MAX_CPU_NUMBER;

#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin)
  if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif

  if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;

#ifdef DEBUG
  printf( "Adjusted number of threads : %3d\n", blas_num_threads);
#endif
  
  blas_cpu_number = blas_num_threads;  

  return blas_num_threads;
}
#endif

struct release_t {
  void *address;
  void (*func)(struct release_t *);
  long attr;
};

int hugetlb_allocated = 0;

static struct release_t release_info[NUM_BUFFERS];
static int release_pos = 0;

#if defined(OS_LINUX) && !defined(NO_WARMUP)
static int hot_alloc = 0;
#endif

#ifdef ALLOC_MMAP

static void alloc_mmap_free(struct release_t *release){

  if (munmap(release -> address, BUFFER_SIZE)) {
    printf("OpenBLAS : munmap failed\n");
  }
}

#ifdef NO_WARMUP

static void *alloc_mmap(void *address){
  void *map_address;

  if (address){
    map_address = mmap(address, 
		       BUFFER_SIZE, 
		       MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  } else {
    map_address = mmap(address, 
		       BUFFER_SIZE, 
		       MMAP_ACCESS, MMAP_POLICY, -1, 0);
  }

  if (map_address != (void *)-1) {
    release_info[release_pos].address = map_address;
    release_info[release_pos].func    = alloc_mmap_free;
    release_pos ++;
  }

#ifdef OS_LINUX
  my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
#endif

  return map_address;
}

#else

#define BENCH_ITERATION 4
#define SCALING		2

static inline BLASULONG run_bench(BLASULONG address, long size) {

  BLASULONG original, *p;
  BLASULONG start, stop, min;
  int iter, i, count;
  
  min = (BLASULONG)-1;

  original = *(BLASULONG *)(address + size - PAGESIZE);

  *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;

  for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {

    p = (BLASULONG *)address;
    
    count = size / PAGESIZE;
    
    start = rpcc();
    
    for (i = 0; i < count; i ++) {
      p = (BLASULONG *)(*p);
    }
    
    stop = rpcc();
    
    if (min > stop - start) min = stop - start;
  }
  
  *(BLASULONG *)(address + size - PAGESIZE +  0) = original;
  *(BLASULONG *)(address + size - PAGESIZE +  8) = (BLASULONG)p;

  return min;
}

static void *alloc_mmap(void *address){
  void *map_address, *best_address;
  BLASULONG best, start, current;
  BLASULONG allocsize;

  if (address){
    /* Just give up use advanced operation */
    map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);

#ifdef OS_LINUX
    my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
#endif

  } else {
#if defined(OS_LINUX) && !defined(NO_WARMUP)
    if (hot_alloc == 0) {
      map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);

#ifdef OS_LINUX
      my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
#endif

    } else {
#endif

      map_address = mmap(NULL, BUFFER_SIZE * SCALING, 
			 MMAP_ACCESS, MMAP_POLICY, -1, 0);
      
      if (map_address != (void *)-1) {
		  
#ifdef OS_LINUX
#ifdef DEBUG
		  int ret=0;
		  ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
		  if(ret==-1){
			  int errsv=errno;
			  perror("OpenBLAS alloc_mmap:");
			  printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
		  }

#else
		  my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
#endif
#endif

	
	allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
	
	start   = (BLASULONG)map_address;
	current = (SCALING - 1) * BUFFER_SIZE;
	
	while(current > 0) {
	  *(long *)start = (long)start + PAGESIZE;
	  start += PAGESIZE;
	  current -= PAGESIZE;
	}
	
	*(long *)(start - PAGESIZE) = (BLASULONG)map_address;
	
	start = (BLASULONG)map_address;
	
	best = (BLASULONG)-1;
	best_address = map_address;
	
	while ((start + allocsize  < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
	  
	  current = run_bench(start, allocsize);
	  
	  if (best > current) {
	    best = current;
	    best_address = (void *)start;
	  }
	  
	  start += PAGESIZE;
	  
	}
	
      if ((BLASULONG)best_address > (BLASULONG)map_address)
	munmap(map_address,  (BLASULONG)best_address - (BLASULONG)map_address);
      
      munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
      
      map_address = best_address;
      
#if defined(OS_LINUX) && !defined(NO_WARMUP)
      hot_alloc = 2;
#endif
      }
    }
#if defined(OS_LINUX) && !defined(NO_WARMUP)
  }
#endif

  if (map_address != (void *)-1) {
    release_info[release_pos].address = map_address;
    release_info[release_pos].func    = alloc_mmap_free;
    release_pos ++;
  }

  return map_address;
}

#endif

#endif


#ifdef ALLOC_MALLOC

static void alloc_malloc_free(struct release_t *release){

  free(release -> address);

}

static void *alloc_malloc(void *address){

  void *map_address;

  map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);

  if (map_address == (void *)NULL) map_address = (void *)-1;

  if (map_address != (void *)-1) {
    release_info[release_pos].address = map_address;
    release_info[release_pos].func    = alloc_malloc_free;
    release_pos ++;
  }

  return map_address;

}

#endif

#ifdef ALLOC_QALLOC

void *qalloc(int flags, size_t bytes);
void *qfree (void *address);

#define QNONCACHE 0x1
#define QCOMMS    0x2
#define QFAST     0x4

static void alloc_qalloc_free(struct release_t *release){

  qfree(release -> address);

}

static void *alloc_qalloc(void *address){
  void *map_address;

  map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);

  if (map_address == (void *)NULL) map_address = (void *)-1;

  if (map_address != (void *)-1) {
    release_info[release_pos].address = map_address;
    release_info[release_pos].func    = alloc_qalloc_free;
    release_pos ++;
  }

  return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
}

#endif

#ifdef ALLOC_WINDOWS

static void alloc_windows_free(struct release_t *release){

  VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);

}

static void *alloc_windows(void *address){
  void *map_address;

  map_address  = VirtualAlloc(address,
			      BUFFER_SIZE,
			      MEM_RESERVE | MEM_COMMIT,
			      PAGE_READWRITE);

  if (map_address == (void *)NULL) map_address = (void *)-1;

  if (map_address != (void *)-1) {
    release_info[release_pos].address = map_address;
    release_info[release_pos].func    = alloc_windows_free;
    release_pos ++;
  }

  return map_address;
}

#endif

#ifdef ALLOC_DEVICEDRIVER
#ifndef DEVICEDRIVER_NAME
#define DEVICEDRIVER_NAME "/dev/mapper"
#endif

static void alloc_devicedirver_free(struct release_t *release){

  if (munmap(release -> address, BUFFER_SIZE)) {
    printf("OpenBLAS : Bugphysarea unmap failed.\n");
  }

  if (close(release -> attr)) {
    printf("OpenBLAS : Bugphysarea close failed.\n");
  }

}

static void *alloc_devicedirver(void *address){
  
  int fd;
  void *map_address;

  if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {

    return (void *)-1;

  }

  map_address = mmap(address, BUFFER_SIZE,
		     PROT_READ | PROT_WRITE,
		     MAP_FILE | MAP_SHARED,
		     fd, 0);
  
  if (map_address != (void *)-1) {
    release_info[release_pos].address = map_address;
    release_info[release_pos].attr    = fd;
    release_info[release_pos].func    = alloc_devicedirver_free;
    release_pos ++;
  }

  return map_address;
}

#endif

#ifdef ALLOC_SHM

static void alloc_shm_free(struct release_t *release){

  if (shmdt(release -> address)) {
    printf("OpenBLAS : Shared memory unmap failed.\n");
    }
}

static void *alloc_shm(void *address){
  void *map_address;
  int shmid;
  
  shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
  
  map_address = (void *)shmat(shmid, address, 0);

  if (map_address != (void *)-1){

#ifdef OS_LINUX
    my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
#endif

    shmctl(shmid, IPC_RMID, 0);

    release_info[release_pos].address = map_address;
    release_info[release_pos].attr    = shmid;
    release_info[release_pos].func    = alloc_shm_free;
    release_pos ++;
  }

  return map_address;
}

#endif

#if defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS

static void alloc_hugetlb_free(struct release_t *release){

#if defined(OS_LINUX) || defined(OS_AIX)
  if (shmdt(release -> address)) {
    printf("OpenBLAS : Hugepage unmap failed.\n");
  }
#endif

#ifdef __sun__

  munmap(release -> address, BUFFER_SIZE);

#endif

#ifdef OS_WINDOWS

  VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT);

#endif

}

static void *alloc_hugetlb(void *address){

  void *map_address = (void *)-1;

#if defined(OS_LINUX) || defined(OS_AIX)
  int shmid;
  
  shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
#ifdef OS_LINUX
		 SHM_HUGETLB |
#endif
#ifdef OS_AIX
		 SHM_LGPAGE | SHM_PIN |
#endif
		 IPC_CREAT | SHM_R | SHM_W);
  
  if (shmid != -1) {
    map_address = (void *)shmat(shmid, address, SHM_RND);
    
#ifdef OS_LINUX
    my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
#endif

    if (map_address != (void *)-1){
      shmctl(shmid, IPC_RMID, 0);
    }
  }
#endif

#ifdef __sun__
  struct memcntl_mha mha;
  
  mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
  mha.mha_flags = 0;
  mha.mha_pagesize = HUGE_PAGESIZE;
  memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);

  map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
#endif

#ifdef OS_WINDOWS

  HANDLE hToken;
  TOKEN_PRIVILEGES tp;

  if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;

  tp.PrivilegeCount = 1;
  tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
  
  if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) return (void *) -1;

  if (AdjustTokenPrivileges(hToken, FALSE, (PTOKEN_PRIVILEGES)&tp, 0, NULL, NULL) != TRUE) return (void *) -1;

  map_address  = (void *)VirtualAlloc(address,
				      BUFFER_SIZE,
				      MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
				      PAGE_READWRITE);

  AdjustTokenPrivileges(hToken, TRUE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, NULL);

  if (map_address == (void *)NULL) map_address = (void *)-1;
  
#endif

  if (map_address != (void *)-1){
    release_info[release_pos].address = map_address;
    release_info[release_pos].func    = alloc_hugetlb_free;
    release_pos ++;
  }

  return map_address;
}
#endif

#ifdef  ALLOC_HUGETLBFILE

static int hugetlb_pid = 0;

static void alloc_hugetlbfile_free(struct release_t *release){

  if (munmap(release -> address, BUFFER_SIZE)) {
    printf("OpenBLAS : HugeTLBfs unmap failed.\n");
  }

  if (close(release -> attr)) {
    printf("OpenBLAS : HugeTLBfs close failed.\n");
  }
}

static void *alloc_hugetlbfile(void *address){

  void *map_address = (void *)-1;
  int fd;
  char filename[64];

  if (!hugetlb_pid) hugetlb_pid = getpid();

  sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);

  if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
    return (void *)-1;
  }

  unlink(filename);

  map_address = mmap(address, BUFFER_SIZE,
		     PROT_READ | PROT_WRITE,
		     MAP_SHARED,
		     fd, 0);
  
  if (map_address != (void *)-1) {
    release_info[release_pos].address = map_address;
    release_info[release_pos].attr    = fd;
    release_info[release_pos].func    = alloc_hugetlbfile_free;
    release_pos ++;
  }

  return map_address;
}
#endif

/* Global lock for memory allocation */

#if   defined(USE_PTHREAD_LOCK)
static pthread_mutex_t    alloc_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t alloc_lock = 0;
#else
static BLASULONG  alloc_lock = 0UL;
#endif

#ifdef SEEK_ADDRESS
static BLASULONG base_address      = 0UL;
#else
static BLASULONG base_address      = BASE_ADDRESS;
#endif

static volatile struct {
  BLASULONG lock;
  void *addr;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
  int   pos;
#endif
  int used;
#ifndef __64BIT__
  char dummy[48];
#else
  char dummy[40];
#endif

} memory[NUM_BUFFERS];

static int memory_initialized = 0;
static void gotoblas_memory_init(void);

/*       Memory allocation routine           */
/* procpos ... indicates where it comes from */
/*                0 : Level 3 functions      */
/*                1 : Level 2 functions      */
/*                2 : Thread                 */

void *blas_memory_alloc(int procpos){
  
  int position;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
  int mypos;
#endif

  void *map_address;

  void *(*memoryalloc[])(void *address) = {
#ifdef ALLOC_DEVICEDRIVER
    alloc_devicedirver,
#endif
#if defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS
    alloc_hugetlb,
#endif
#ifdef ALLOC_SHM
    alloc_shm,
#endif
#ifdef ALLOC_MMAP
    alloc_mmap,
#endif
#ifdef ALLOC_QALLOC
    alloc_qalloc,
#endif
#ifdef ALLOC_WINDOWS
    alloc_windows,
#endif
#ifdef ALLOC_MALLOC
    alloc_malloc,
#endif
    NULL,
  };
  void *(**func)(void *address);

  if (!memory_initialized) {
    
    LOCK_COMMAND(&alloc_lock);
    
    if (!memory_initialized) {
      
#if defined(WHEREAMI) && !defined(USE_OPENMP)
      for (position = 0; position < NUM_BUFFERS; position ++){
	memory[position].addr   = (void *)0;
	memory[position].pos    = -1;
	memory[position].used   = 0;
	memory[position].lock   = 0;
      }
#endif
      
#ifdef DYNAMIC_ARCH
      gotoblas_dynamic_init();
#endif

#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
      gotoblas_affinity_init();
#endif
      
#ifdef SMP
      if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
#endif
      
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
#ifndef DYNAMIC_ARCH
      blas_set_parameter();
#endif
#endif

      memory_initialized = 1;
    }

    UNLOCK_COMMAND(&alloc_lock);
  }

#ifdef DEBUG
  printf("Alloc Start ...\n");
#endif

#if defined(WHEREAMI) && !defined(USE_OPENMP)

  mypos = WhereAmI();

  position = mypos;
  while (position > NUM_BUFFERS) position >>= 1;

  do {
    if (!memory[position].used && (memory[position].pos == mypos)) {
      
      blas_lock(&memory[position].lock);
      
      if (!memory[position].used) goto allocation;
      
      blas_unlock(&memory[position].lock);
    }
    
    position ++;
    
  } while (position < NUM_BUFFERS);


#endif

  position = 0;

  do {
    if (!memory[position].used) {
      
      blas_lock(&memory[position].lock);

      if (!memory[position].used) goto allocation;
      
      blas_unlock(&memory[position].lock);
    }
    
    position ++;
    
  } while (position < NUM_BUFFERS);
  
  goto error;

  allocation :

#ifdef DEBUG
  printf("  Position -> %d\n", position);
#endif

  memory[position].used = 1;

  blas_unlock(&memory[position].lock);

  if (!memory[position].addr) {
    do {
#ifdef DEBUG
      printf("Allocation Start : %lx\n", base_address);
#endif

      map_address = (void *)-1;

      func = &memoryalloc[0];

      while ((func != NULL) && (map_address == (void *) -1)) {

	map_address = (*func)((void *)base_address);

#ifdef ALLOC_DEVICEDRIVER
	if ((*func ==  alloc_devicedirver) && (map_address == (void *)-1)) {
	    fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
	}
#endif

#ifdef ALLOC_HUGETLBFILE
	if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
#ifndef OS_WINDOWS
	    fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
#endif
	}
#endif

#if defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS
	if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
#endif

	func ++;
      }

#ifdef DEBUG
      printf("  Success -> %08lx\n", map_address);
#endif
      if (((BLASLONG) map_address) == -1) base_address = 0UL;

      if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;

    } while ((BLASLONG)map_address == -1);

    memory[position].addr = map_address; 

#ifdef DEBUG
    printf("  Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
#endif
  }
 
#if defined(WHEREAMI) && !defined(USE_OPENMP)

  if (memory[position].pos == -1) memory[position].pos = mypos;

#endif

#ifdef DYNAMIC_ARCH

  if (memory_initialized == 1) {
    
    LOCK_COMMAND(&alloc_lock);
    
    if (memory_initialized == 1) {
      
      if (!gotoblas) gotoblas_dynamic_init();
      
      memory_initialized = 2;
    }
    
    UNLOCK_COMMAND(&alloc_lock);
    
  }
#endif


#ifdef DEBUG
  printf("Mapped   : %p  %3d\n\n",
	  (void *)memory[position].addr, position);
#endif  
  
  return (void *)memory[position].addr;

 error:
  printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");

  return NULL;
}

void blas_memory_free(void *free_area){

  int position;

#ifdef DEBUG
  printf("Unmapped Start : %p ...\n", free_area);
#endif  
 
  position = 0;

  while ((memory[position].addr != free_area)
	 && (position < NUM_BUFFERS)) position++;

  if (memory[position].addr != free_area) goto error;

#ifdef DEBUG
  printf("  Position : %d\n", position);
#endif  

  memory[position].used = 0;

#ifdef DEBUG
  printf("Unmap Succeeded.\n\n");
#endif  

  return;
  
 error:
  printf("BLAS : Bad memory unallocation! : %4d  %p\n", position,  free_area);

#ifdef DEBUG
  for (position = 0; position < NUM_BUFFERS; position++) 
    printf("%4ld  %p : %d\n", position, memory[position].addr, memory[position].used);
#endif

  return;
}

void blas_shutdown(void){

  int pos;

#ifdef SMP
  BLASFUNC(blas_thread_shutdown)();
#endif

  LOCK_COMMAND(&alloc_lock);

  for (pos = 0; pos < release_pos; pos ++) {
    release_info[pos].func(&release_info[pos]);
  }
  
#ifdef SEEK_ADDRESS
  base_address      = 0UL;
#else
  base_address      = BASE_ADDRESS;
#endif

  for (pos = 0; pos < NUM_BUFFERS; pos ++){
    memory[pos].addr   = (void *)0;
    memory[pos].used   = 0;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
    memory[pos].pos    = -1;
#endif
    memory[pos].lock   = 0;
  }

  UNLOCK_COMMAND(&alloc_lock);

  return;
}

#if defined(OS_LINUX) && !defined(NO_WARMUP)
 
#ifdef SMP
#if   defined(USE_PTHREAD_LOCK)
static pthread_mutex_t    init_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t init_lock = 0;
#else
static BLASULONG   init_lock = 0UL;
#endif
#endif

static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, 
			  void *sa, void *sb, BLASLONG pos) {

#ifndef ARCH_POWER

  long size;
  BLASULONG buffer;

  size   = BUFFER_SIZE - PAGESIZE;
  buffer = (BLASULONG)sa + GEMM_OFFSET_A;

#if defined(OS_LINUX) && !defined(NO_WARMUP)
    if (hot_alloc != 2) {
#endif

#ifdef SMP
  LOCK_COMMAND(&init_lock);
#endif

  while (size > 0) {
    *(int *)buffer = size;
    buffer  += PAGESIZE;
    size    -= PAGESIZE;
  }

#ifdef SMP
  UNLOCK_COMMAND(&init_lock);
#endif

  size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
  buffer = (BLASULONG)sa + GEMM_OFFSET_A;

  while (size > 0) {
    *(int *)buffer = size;
    buffer  += 64;
    size    -= 64;
  }

#if defined(OS_LINUX) && !defined(NO_WARMUP)
    }
#endif

#endif
}

#ifdef SMP

static void _init_thread_memory(void *buffer) {

  blas_queue_t queue[MAX_CPU_NUMBER];
  int num_cpu;

  for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {

    blas_queue_init(&queue[num_cpu]);
    queue[num_cpu].mode    = BLAS_DOUBLE | BLAS_REAL;
    queue[num_cpu].routine = &_touch_memory;
    queue[num_cpu].args    = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];
  }

  queue[num_cpu - 1].next = NULL;
  queue[0].sa = buffer;
  
  exec_blas(num_cpu, queue);

}
#endif

static void gotoblas_memory_init(void) {

  void *buffer;

  hot_alloc = 1;

  buffer = (void *)blas_memory_alloc(0);

#ifdef SMP
  if (blas_cpu_number == 0) blas_get_cpu_number();
#ifdef SMP_SERVER
  if (blas_server_avail == 0) blas_thread_init();
#endif
    
  _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
  
#else
  
  _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
  
#endif
  
  blas_memory_free(buffer);
}
#endif

/* Initialization for all function; this function should be called before main */

static int gotoblas_initialized = 0;

void CONSTRUCTOR gotoblas_init(void) {

  if (gotoblas_initialized) return;

#ifdef PROFILE
   moncontrol (0);
#endif

#ifdef DYNAMIC_ARCH
   gotoblas_dynamic_init();
#endif

#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
   gotoblas_affinity_init();
#endif

#if defined(OS_LINUX) && !defined(NO_WARMUP)
   gotoblas_memory_init();
#endif

#ifdef SMP
  if (blas_cpu_number == 0) blas_get_cpu_number();
#ifdef SMP_SERVER
  if (blas_server_avail == 0) blas_thread_init();
#endif
#endif

#ifdef FUNCTION_PROFILE
   gotoblas_profile_init();
#endif

   gotoblas_initialized = 1;

#ifdef PROFILE
   moncontrol (1);
#endif

}

void DESTRUCTOR gotoblas_quit(void) {

  if (gotoblas_initialized == 0) return;

#ifdef PROFILE
   moncontrol (0);
#endif

#ifdef FUNCTION_PROFILE
   gotoblas_profile_quit();
#endif

#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
   gotoblas_affinity_quit();
#endif

#ifdef DYNAMIC_ARCH
   gotoblas_dynamic_quit();
#endif

   gotoblas_initialized = 0;

#ifdef PROFILE
   moncontrol (1);
#endif

}

#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
/* Don't call me; this is just work around for PGI / Sun bug */
void gotoblas_dummy_for_PGI(void) {

  gotoblas_init();
  gotoblas_quit();

#if 0
  asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
  asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
#else
  asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
  asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
}
#endif