I'm please (but also bemused) to report that I got BLAS invoked from PHP via FFI--bemused because I'm unable to get it working with the self-compiled cblas_LINUX.so
. I was able to get a speedy BLAS matrix multiplication working on all my various ubuntu machines by first installing libblas-dev:
sudo apt install libblas-dev
and by using the cblas.h
file that I had previously reduced by removing openblas functions etc. from the ghostjat/np blas.h file. Here is the entire reduced file, which I have saved as cblas.h
:
#define FFI_SCOPE "blas"
#define FFI_LIB "libblas.so"
typedef size_t CBLAS_INDEX_t;
size_t cblas_idamax(const int N, const float *X, const int incX);
double cblas_dsdot(const int N, const float *X, const int incX, const float *Y,
const int incY);
double cblas_ddot(const int N, const double *X, const int incX,
const double *Y, const int incY);
double cblas_dnrm2(const int N, const double *X, const int incX);
double cblas_dasum(const int N, const double *X, const int incX);
void cblas_dswap(const int N, double *X, const int incX,
double *Y, const int incY);
void cblas_dcopy(const int N, const double *X, const int incX,
double *Y, const int incY);
void cblas_daxpy(const int N, const double alpha, const double *X,
const int incX, double *Y, const int incY);
void cblas_drotg(double *a, double *b, double *c, double *s);
void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
void cblas_drot(const int N, double *X, const int incX,
double *Y, const int incY, const double c, const double s);
void cblas_drotm(const int N, double *X, const int incX,
double *Y, const int incY, const double *P);
void cblas_dscal(const int N, const double alpha, double *X, const int incX);
void cblas_dgemv(const enum CBLAS_ORDER order,
const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
const double alpha, const double *A, const int lda,
const double *X, const int incX, const double beta,
double *Y, const int incY);
void cblas_dgbmv(const enum CBLAS_ORDER order,
const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
const int KL, const int KU, const double alpha,
const double *A, const int lda, const double *X,
const int incX, const double beta, double *Y, const int incY);
void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const int N, const double *A, const int lda,
double *X, const int incX);
void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const int N, const int K, const double *A, const int lda,
double *X, const int incX);
void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const int N, const double *Ap, double *X, const int incX);
void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const int N, const double *A, const int lda, double *X,
const int incX);
void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const int N, const int K, const double *A, const int lda,
double *X, const int incX);
void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const int N, const double *Ap, double *X, const int incX);
void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
const int N, const double alpha, const double *A,
const int lda, const double *X, const int incX,
const double beta, double *Y, const int incY);
void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
const int N, const int K, const double alpha, const double *A,
const int lda, const double *X, const int incX,
const double beta, double *Y, const int incY);
void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
const int N, const double alpha, const double *Ap,
const double *X, const int incX,
const double beta, double *Y, const int incY);
void cblas_dger(const enum CBLAS_ORDER order, const int M, const int N,
const double alpha, const double *X, const int incX,
const double *Y, const int incY, double *A, const int lda);
void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
const int N, const double alpha, const double *X,
const int incX, double *A, const int lda);
void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
const int N, const double alpha, const double *X,
const int incX, double *Ap);
void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
const int N, const double alpha, const double *X,
const int incX, const double *Y, const int incY, double *A,
const int lda);
void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
const int N, const double alpha, const double *X,
const int incX, const double *Y, const int incY, double *A);
void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
const int K, const double alpha, const double *A,
const int lda, const double *B, const int ldb,
const double beta, double *C, const int ldc);
void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
const enum CBLAS_UPLO Uplo, const int M, const int N,
const double alpha, const double *A, const int lda,
const double *B, const int ldb, const double beta,
double *C, const int ldc);
void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
const double alpha, const double *A, const int lda,
const double beta, double *C, const int ldc);
void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
const double alpha, const double *A, const int lda,
const double *B, const int ldb, const double beta,
double *C, const int ldc);
void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
const enum CBLAS_DIAG Diag, const int M, const int N,
const double alpha, const double *A, const int lda,
double *B, const int ldb);
void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
const enum CBLAS_DIAG Diag, const int M, const int N,
const double alpha, const double *A, const int lda,
double *B, const int ldb);
NOTE : FFI_LIB
is simply 'libblas.so' without any path information. This should result in linux using the shared library installed with the libblas-ev package.
Lastly, here's a simplified script that loads my JSON file and multiplies my training set by its transpose. The cblas_dgemm call executes in about 284ms on my workstation. Slower on the virtual machines for whatever reason.
$data = json_decode(file_get_contents(__DIR__ . '/../machine-learning/training_data_sets.json'), TRUE);
// more trial and error...cooked this up after reading the ghostjat/np source code
$ffi_blas = FFI::load(__DIR__ . '/cblas.h');
$m1 = $data['Xtrain'];
$start = microtime(true);
$m1t = array_map(null, ...$m1);
echo "transpose elapsed time: " . (microtime(true) - $start) . "seconds\n";
// lifted from ghostjat/np, converts matrix to obj with FFI\CData
function get_cdata($m) {
// this check should be more specific, but works for now
if (!is_array($m) || !is_array($m[0])) {
throw new Exception('param must be array of arrays');
}
$rowcount = sizeof($m);
$colcount = sizeof($m[0]);
$flat_m = [];
$size = $rowcount * $colcount;
$cdata = \FFI::cast('double *', \FFI::new("double[$size]"));
$i = 0;
foreach($m as $row) {
foreach($row as $val) {
$flat_m[$i] = $val;
$cdata[$i] = $val;
$i++;
}
}
return [
'rows' => $rowcount,
'cols' => $colcount,
'flat_m' => $flat_m,
'cdata' => $cdata
];
}
$m1_c = get_cdata($m1);
$m1t_c = get_cdata($m1t);
define('CBLAS_ROW_MAJOR', 101);
define('CBLAS_NO_TRANS', 111);
$mr_size = $m1_c['rows'] * $m1t_c['cols'];
$mr_rows = $m1_c['rows'];
$mr_cols = $m1t_c['cols'];
$mr_cdata = \FFI::cast('double *', \FFI::new("double[$mr_size]"));
$start = microtime(true);
// this fn works by pointer, returns some void object, I think?
$some_val = $ffi_blas->cblas_dgemm(CBLAS_ROW_MAJOR, CBLAS_NO_TRANS, CBLAS_NO_TRANS, $m1_c['rows'], $m1t_c['cols'], $m1_c['cols'], 1.0, $m1_c['cdata'], $m1_c['cols'], $m1t_c['cdata'], $m1t_c['cols'], 0.0, $mr_cdata, $mr_cols);
echo "cblas_dgemm elapsed time: " . (microtime(true) - $start) . "seconds\n";
// convert cdata to php object
$idx = 0;
$mr = [];
for($i=0; $i<$mr_rows; $i++) {
$mr[$i] = [];
for($j=0; $j<$mr_cols; $j++) {
$mr[$i][$j] = $mr_cdata[$idx];
$idx++;
}
}
echo "rows: " . sizeof($mr) . "\n";
echo "cols: " . sizeof($mr[0]) . "\n";
$json = json_encode($mr, JSON_PRETTY_PRINT);
file_put_contents(__DIR__ . '/cblas-product.json', $json);
echo "json md5 is " . md5($json) . "\n";
The result, $mr
is precisely what the other matrix multiply/dot functions returned. The output:
transpose elapsed time: 0.057420015335083seconds
cblas_dgemm elapsed time: 0.28933596611023seconds
rows: 1375
cols: 1375
json md5 is 4a33a230c1cf8bb6ab75dbd0d3d7bf0e