diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index b1c8dd140..df92cf4ef 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -124,7 +124,13 @@ In chronological order:
 * Jerome Robert
   * [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478)
   * [2015-12-23] `stack_check` in `gemv.c` (bug #722)
+  * [2015-12-28] Allow forcing the number of parallel make jobs
+  * [2015-12-28] Fix AMD E2-3200 detection
+  * [2015-12-31] Let `make MAX_STACK_ALLOC=0` do what is expected
   * [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731)
+  * [2016-01-24] Use `GEMM_MULTITHREAD_THRESHOLD` as a number of ops (bug #742)
+  * [2016-01-26] Let `openblas_get_num_threads` return the number of active threads (bug #760)
+  * [2016-01-30] Speed-up small `zger`, `zgemv`, `ztrmv` using stack allocation (bug #727)

 * Dan Kortschak
   * [2015-01-07] Added test for drotmg bug #484.
diff --git a/benchmark/Makefile b/benchmark/Makefile
index 492d2617f..bcf3da2cc 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -166,7 +166,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
 	sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
 	sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
 	spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
-	ssymm.goto dsymm.goto csymm.goto zsymm.goto
+	ssymm.goto dsymm.goto csymm.goto zsymm.goto \
+	smallscaling

 acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
 	scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
@@ -2132,6 +2133,8 @@ cgemm3m.$(SUFFIX) : gemm3m.c
 zgemm3m.$(SUFFIX) : gemm3m.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

+smallscaling: smallscaling.c ../$(LIBNAME)
+	$(CC) $(CFLAGS) -fopenmp -o $(@F) $^ -lpthread -lm

 clean ::
 	@rm -f *.goto *.mkl *.acml *.atlas *.veclib
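The new smallscaling benchmark added below sweeps NB_SIZE matrix sizes spaced geometrically between MIN_SIZE and MAX_SIZE, and divides the loop count by the matrix size so every measurement performs a comparable amount of work. A minimal standalone sketch of that sweep, reusing the constants defined in the file (the printed list is the set of sizes that actually get benchmarked):

#include <math.h>
#include <stdio.h>

// Sketch of the geometric size sweep in smallscaling.c: each size is
// inc_factor times the previous one, giving NB_SIZE steps from
// MIN_SIZE up to MAX_SIZE.
int main(void) {
    const double min_size = 5, max_size = 60;  // MIN_SIZE, MAX_SIZE
    const int nb_size = 10;                    // NB_SIZE
    double inc_factor = exp(log(max_size / min_size) / nb_size);
    for (double size = min_size; size <= max_size; size *= inc_factor)
        printf("%d ", (int)(size + 0.5));      // same rounding as the benchmark
    printf("\n");
    return 0;
}

Geometric spacing keeps the relative resolution constant across the range, which is what matters when hunting for the size at which threading starts to pay off.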
diff --git a/benchmark/smallscaling.c b/benchmark/smallscaling.c
new file mode 100644
index 000000000..daed8f3da
--- /dev/null
+++ b/benchmark/smallscaling.c
@@ -0,0 +1,191 @@
+// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <math.h>
+#include <pthread.h>
+#include <omp.h>
+#define MIN_SIZE 5
+#define MAX_SIZE 60
+#define NB_SIZE 10
+
+// number of loops for a 1x1 matrix. Lower it if the test is
+// too slow on your computer.
+#define NLOOP 2e7
+
+typedef struct {
+    int matrix_size;
+    int n_loop;
+    void (* bench_func)();
+    void (* blas_func)();
+    void * (* create_matrix)(int size);
+} BenchParam;
+
+void * s_create_matrix(int size) {
+    float * r = malloc(size * sizeof(float));
+    for(int i = 0; i < size; i++)
+        r[i] = 1e3 * i / size;
+    return r;
+}
+
+void * c_create_matrix(int size) {
+    float * r = malloc(size * 2 * sizeof(float));
+    for(int i = 0; i < 2 * size; i++)
+        r[i] = 1e3 * i / size;
+    return r;
+}
+
+void * z_create_matrix(int size) {
+    double * r = malloc(size * 2 * sizeof(double));
+    for(int i = 0; i < 2 * size; i++)
+        r[i] = 1e3 * i / size;
+    return r;
+}
+
+void * d_create_matrix(int size) {
+    double * r = malloc(size * sizeof(double));
+    for(int i = 0; i < size; i++)
+        r[i] = 1e3 * i / size;
+    return r;
+}
+
+void trmv_bench(BenchParam * param)
+{
+    int i, n;
+    int size = param->matrix_size;
+    n = param->n_loop / size;
+    int one = 1;
+    void * A = param->create_matrix(size * size);
+    void * y = param->create_matrix(size);
+    for(i = 0; i < n; i++) {
+        param->blas_func("U", "N", "N", &size, A, &size, y, &one);
+    }
+    free(A);
+    free(y);
+}
+
+void gemv_bench(BenchParam * param)
+{
+    int i, n;
+    int size = param->matrix_size;
+    n = param->n_loop / size;
+    double v = 1.01;
+    int one = 1;
+    void * A = param->create_matrix(size * size);
+    void * y = param->create_matrix(size);
+    for(i = 0; i < n; i++) {
+        param->blas_func("N", &size, &size, &v, A, &size, y, &one, &v, y, &one);
+    }
+    free(A);
+    free(y);
+}
+
+void ger_bench(BenchParam * param) {
+    int i, n;
+    int size = param->matrix_size;
+    n = param->n_loop / size;
+    double v = 1.01;
+    int one = 1;
+    void * A = param->create_matrix(size * size);
+    void * y = param->create_matrix(size);
+    for(i = 0; i < n; i++) {
+        param->blas_func(&size, &size, &v, y, &one, y, &one, A, &size);
+    }
+    free(A);
+    free(y);
+}
+
+#ifndef _WIN32
+void * pthread_func_wrapper(void * param) {
+    ((BenchParam *)param)->bench_func(param);
+    pthread_exit(NULL);
+}
+#endif
+
+#define NB_TESTS 5
+void * TESTS[4 * NB_TESTS] = {
+    trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
+    gemv_bench, dgemv_, d_create_matrix, "dgemv",
+    gemv_bench, zgemv_, z_create_matrix, "zgemv",
+    ger_bench, dger_, d_create_matrix, "dger",
+    ger_bench, zgerc_, z_create_matrix, "zgerc",
+};
+
+inline static double delta_time(struct timespec tick) {
+    struct timespec tock;
+    clock_gettime(CLOCK_MONOTONIC, &tock);
+    return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9;
+}
+
+double pthread_bench(BenchParam * param, int nb_threads)
+{
+#ifdef _WIN32
+    return 0;
+#else
+    BenchParam threaded_param = *param;
+    pthread_t threads[nb_threads];
+    int t, rc;
+    struct timespec tick;
+    threaded_param.n_loop /= nb_threads;
+    clock_gettime(CLOCK_MONOTONIC, &tick);
+    for(t = 0; t < nb_threads; t++) {
+        rc = pthread_create(&threads[t], NULL, pthread_func_wrapper, &threaded_param);
+        if(rc) {
+            fprintf(stderr, "ERROR: pthread_create() returned %d\n", rc);
+            exit(EXIT_FAILURE);
+        }
+    }
+    for(t = 0; t < nb_threads; t++)
+        pthread_join(threads[t], NULL);
+    return delta_time(tick);
+#endif
+}
+
+double seq_bench(BenchParam * param) {
+    struct timespec tick;
+    clock_gettime(CLOCK_MONOTONIC, &tick);
+    param->bench_func(param);
+    return delta_time(tick);
+}
+
+double omp_bench(BenchParam * param) {
+    BenchParam threaded_param = *param;
+    struct timespec tick;
+    int t;
+    int nb_threads = omp_get_max_threads();
+    threaded_param.n_loop /= nb_threads;
+    clock_gettime(CLOCK_MONOTONIC, &tick);
+    #pragma omp parallel for
+    for(t = 0; t < nb_threads; t++) {
+        param->bench_func(&threaded_param);
+    }
+    return delta_time(tick);
+}
+
+int main(int argc, char * argv[]) {
+    double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
+    BenchParam param;
+    int test_id;
+    printf("Running on %d threads\n", omp_get_max_threads());
+    for(test_id = 0; test_id < NB_TESTS; test_id++) {
+        double size = MIN_SIZE;
+        // TESTS packs four pointers per benchmark: driver function,
+        // BLAS routine, matrix allocator and printable name.
+        param.bench_func = TESTS[test_id * 4];
+        param.blas_func = TESTS[test_id * 4 + 1];
+        param.create_matrix = TESTS[test_id * 4 + 2];
+        printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]);
+        param.n_loop = NLOOP;
+        while(size <= MAX_SIZE) {
+            param.matrix_size = (int)(size + 0.5);
+            double seq_time = seq_bench(&param);
+            double omp_time = omp_bench(&param);
+            double pthread_time = pthread_bench(&param, omp_get_max_threads());
+            printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
+                   "pthread %gs, speedup %g\n",
+                   param.matrix_size, seq_time,
+                   omp_time, seq_time / omp_time,
+                   pthread_time, seq_time / pthread_time);
+            size *= inc_factor;
+        }
+    }
+}
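A caveat for anyone building smallscaling.c outside this Makefile: the file calls the Fortran-ABI symbols ztrmv_, dgemv_, zgemv_, dger_ and zgerc_ through void(*)() pointers without declaring them, so the compiler falls back on implicit declarations. Hedged prototypes along these lines would silence the warnings; they are written here with plain int, whereas OpenBLAS itself uses blasint, and the z-routine pointers actually reference double-complex pairs:

// Illustrative Fortran-ABI prototypes (every argument passed by reference).
extern void ztrmv_(char *uplo, char *trans, char *diag, int *n,
                   void *a, int *lda, void *x, int *incx);
extern void dgemv_(char *trans, int *m, int *n, double *alpha,
                   double *a, int *lda, double *x, int *incx,
                   double *beta, double *y, int *incy);
extern void zgemv_(char *trans, int *m, int *n, void *alpha,
                   void *a, int *lda, void *x, int *incx,
                   void *beta, void *y, int *incy);
extern void dger_(int *m, int *n, double *alpha, double *x, int *incx,
                  double *y, int *incy, double *a, int *lda);
extern void zgerc_(int *m, int *n, void *alpha, void *x, int *incx,
                   void *y, int *incy, void *a, int *lda);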
diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c
index a9dc2dc62..42edb83cb 100644
--- a/driver/level2/trmv_thread.c
+++ b/driver/level2/trmv_thread.c
@@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
 #endif

       x = buffer;
-      buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
+      buffer += ((COMPSIZE * args -> m + 3) & ~3);
     }

 #ifndef TRANS
@@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
   if (num_cpu) {

     queue[0].sa = NULL;
-    queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
+    queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE;

     queue[num_cpu - 1].next = NULL;
diff --git a/driver/level2/ztrmv_U.c b/driver/level2/ztrmv_U.c
index f9671c9d6..063de6cbc 100644
--- a/driver/level2/ztrmv_U.c
+++ b/driver/level2/ztrmv_U.c
@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
   if (incb != 1) {
     B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
+    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
     COPY_K(m, b, incb, buffer, 1);
   }
diff --git a/interface/zgemv.c b/interface/zgemv.c
index 584080d30..e5ba3757c 100644
--- a/interface/zgemv.c
+++ b/interface/zgemv.c
@@ -77,6 +77,7 @@ void NAME(char *TRANS, blasint *M, blasint *N,
   blasint incy = *INCY;
   FLOAT *buffer;
+  int buffer_size;
 #ifdef SMP
   int nthreads;
 #endif
@@ -141,7 +142,7 @@ void CNAME(enum CBLAS_ORDER order,
   FLOAT *buffer;
   blasint lenx, leny;
-  int trans;
+  int trans, buffer_size;
   blasint info, t;
 #ifdef SMP
   int nthreads;
 #endif
@@ -230,7 +231,19 @@ void CNAME(enum CBLAS_ORDER order,
   if (incx < 0) x -= (lenx - 1) * incx * 2;
   if (incy < 0) y -= (leny - 1) * incy * 2;

-  buffer = (FLOAT *)blas_memory_alloc(1);
+  buffer_size = 2 * (m + n) + 128 / sizeof(FLOAT);
+#ifdef WINDOWS_ABI
+  buffer_size += 160 / sizeof(FLOAT);
+#endif
+  // for alignment
+  buffer_size = (buffer_size + 3) & ~3;
+  STACK_ALLOC(buffer_size, FLOAT, buffer);
+
+#if defined(ARCH_X86_64) && defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
+  // cgemv_t.S returns NaN if there are NaN or Inf in the buffer (see bug #746)
+  if(trans && stack_alloc_size)
+    memset(buffer, 0, MIN(BUFFER_SIZE, sizeof(FLOAT) * buffer_size));
+#endif

 #ifdef SMP
@@ -253,7 +266,7 @@ void CNAME(enum CBLAS_ORDER order,
   }
 #endif

-  blas_memory_free(buffer);
+  STACK_FREE(buffer);

   FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
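All three interface files touched by this patch (zgemv.c above, zger.c and ztrmv.c below) swap blas_memory_alloc(1)/blas_memory_free() for the STACK_ALLOC/STACK_FREE macros from common_stackalloc.h. A simplified sketch of the mechanism, assuming the real macros additionally install stack-overrun protection and mark the size volatile; stack_alloc_size deliberately stays visible at the call site, which is what the memset guard in zgemv.c above relies on:

/* Simplified STACK_ALLOC/STACK_FREE sketch: the macro expands to
 * declarations in the caller's scope, so the VLA lives until the
 * routine returns. A zero-length VLA (GCC extension) marks the
 * heap fallback for buffers above MAX_STACK_ALLOC. */
#define STACK_ALLOC(SIZE, TYPE, BUFFER)                         \
  int stack_alloc_size = (SIZE);                                \
  if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE))        \
    stack_alloc_size = 0;          /* too big: use the heap */  \
  TYPE stack_buffer[stack_alloc_size];                          \
  BUFFER = stack_alloc_size ? stack_buffer                      \
                            : (TYPE *)blas_memory_alloc(1);

#define STACK_FREE(BUFFER)                                      \
  if (!stack_alloc_size)                                        \
    blas_memory_free(BUFFER);

Because stack_alloc_size is zero whenever the heap path was taken, STACK_FREE knows whether the buffer must be returned to the pool, and later code can cheaply test which kind of buffer it got.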
diff --git a/interface/zger.c b/interface/zger.c
index f7354d26d..db72b4e4c 100644
--- a/interface/zger.c
+++ b/interface/zger.c
@@ -210,7 +210,7 @@ void CNAME(enum CBLAS_ORDER order,
   if (incy < 0) y -= (n - 1) * incy * 2;
   if (incx < 0) x -= (m - 1) * incx * 2;

-  buffer = (FLOAT *)blas_memory_alloc(1);
+  STACK_ALLOC(2 * m, FLOAT, buffer);

 #ifdef SMPTEST
   // Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
@@ -249,7 +249,7 @@ void CNAME(enum CBLAS_ORDER order,
   }
 #endif

-  blas_memory_free(buffer);
+  STACK_FREE(buffer);

   FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
diff --git a/interface/ztrmv.c b/interface/ztrmv.c
index 1abaac920..2be915c32 100644
--- a/interface/ztrmv.c
+++ b/interface/ztrmv.c
@@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
   blasint info;
   int uplo;
   int unit;
-  int trans;
+  int trans, buffer_size;
   FLOAT *buffer;
 #ifdef SMP
   int nthreads;
 #endif
@@ -154,7 +154,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
          enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
          blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) {

-  int trans, uplo, unit;
+  int trans, uplo, unit, buffer_size;
   blasint info;
   FLOAT *buffer;
 #ifdef SMP
@@ -227,11 +227,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,

   if (incx < 0) x -= (n - 1) * incx * 2;

-  buffer = (FLOAT *)blas_memory_alloc(1);
+#ifdef SMP
+  // Calibrated on a Xeon E5-2630
+  if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) {
+    nthreads = num_cpu_avail(2);
+    if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
+      nthreads = 2;
+  } else
+    nthreads = 1;
+
+  if(nthreads > 1) {
+    buffer_size = n > 16 ? 0 : n * 4 + 40;
+  }
+  else
+#endif
+  {
+    buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
+    if(incx != 1)
+      buffer_size += n * 2;
+  }
+  STACK_ALLOC(buffer_size, FLOAT, buffer);

 #ifdef SMP
-  nthreads = num_cpu_avail(2);
-
   if (nthreads == 1) {
 #endif
@@ -245,7 +262,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
   }
 #endif

-  blas_memory_free(buffer);
+  STACK_FREE(buffer);

   FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n);
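To make the new ztrmv threading cutover concrete: assuming the default GEMM_MULTITHREAD_THRESHOLD of 4 and sizeof(FLOAT) == 8 (double precision), threading kicks in for n just above 96, and more than two threads are used only beyond n = 128. A quick arithmetic check:

#include <math.h>
#include <stdio.h>

/* Evaluates the two ztrmv thresholds above, assuming the default
 * GEMM_MULTITHREAD_THRESHOLD of 4 and sizeof(FLOAT) == 8. */
int main(void) {
    const double threshold = 4.0, fsize = 8.0;
    printf("multithread for n > %.0f\n",
           sqrt(36.0 * fsize * fsize * threshold));   /* 96 */
    printf("more than 2 threads for n > %.0f\n",
           sqrt(64.0 * fsize * fsize * threshold));   /* 128 */
    return 0;
}

Both bounds scale with GEMM_MULTITHREAD_THRESHOLD, so platforms that raise the threshold to suppress threading overhead automatically push the trmv cutover up as well.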