Not having much luck with SplFixedArray. Here's a script:

class JMatrix {
	// SplFixedArray of SplFixedArray rows; row-major matrix storage.
	public $data;

	/**
	 * Build a matrix.
	 *
	 * Accepts one of:
	 *  - an SplFixedArray (assumed to already be an SplFixedArray of
	 *    SplFixedArray rows; adopted as-is with no validation),
	 *  - a 1D PHP array, treated as a single-row matrix (row vector),
	 *  - a 2D PHP array (array of equal-length row arrays).
	 *
	 * @param array|SplFixedArray $arr source data
	 * @throws Exception if $arr is neither array nor SplFixedArray, is an
	 *                   empty array, or its rows have inconsistent lengths
	 */
	public function __construct($arr) {
		if ($arr instanceof SplFixedArray) {
			// shortcut: trust the caller and adopt the structure directly
			$this->data = $arr;
			return;
		}
		if (!is_array($arr)) {
			throw new Exception('Constructor param must be an array');
		}
		if (count($arr) === 0) {
			// the original emitted an undefined-index notice on $arr[0]
			// and built a broken object; fail loudly instead
			throw new Exception('Constructor param must be a non-empty array');
		}
		if (!is_array($arr[0])) {
			// simple 1D array: construct a 1xN row vector from its elements
			$col_count = count($arr);
			$row = new SplFixedArray($col_count);
			foreach ($arr as $j => $v) {
				$row[$j] = $v;
			}
			$this->data = new SplFixedArray(1);
			$this->data[0] = $row;
			return;
		}
		// array of arrays: sanity check that every row has the same length
		$col_count = count($arr[0]);
		foreach ($arr as $v) {
			if (count($v) !== $col_count) {
				throw new Exception('Invalid matrix array. Inconsistent column count in constructor array.');
			}
		}
		$this->data = new SplFixedArray(count($arr));
		foreach ($arr as $i => $src_row) {
			$row = new SplFixedArray($col_count);
			foreach ($src_row as $j => $v) {
				$row[$j] = $v;
			}
			$this->data[$i] = $row;
		}
	}

	/** @return int number of rows */
	public function rowCount() {
		// SplFixedArray implements Countable, so count() works
		return count($this->data);
	}

	/** @return int number of columns (length of the first row) */
	public function colCount() {
		return count($this->data[0]);
	}

	/** @return array two-element array [rowCount, colCount] */
	public function size() {
		return [$this->rowCount(), $this->colCount()];
	}

	/** @return string dimensions as "RxC", e.g. "2x3" */
	public function sizeAsString() {
		return $this->rowCount() . 'x' . $this->colCount();
	}

	/**
	 * Render the matrix as a bracketed string, one tab-indented row per line.
	 */
	public function __toString() {
		$retval = "[\n";
		foreach ($this->data as $row) {
			$retval .= "\t[" . implode(",", (array)$row) . "]\n";
		}
		$retval .= "]\n";
		return $retval;
	}

	/**
	 * Matrix product $this * $matrix using indexed loops.
	 *
	 * @param JMatrix $matrix right operand; its rowCount must equal this
	 *                        matrix's colCount
	 * @return static product matrix, rowCount x $matrix->colCount
	 * @throws Exception on nonconformant dimensions
	 */
	public function multiply($matrix) {
		$iteration_count = $this->colCount();
		if ($iteration_count !== $matrix->rowCount()) {
			throw new Exception('nonconformant arguments (op1 is ' . $this->sizeAsString() . ', op2 is ' . $matrix->sizeAsString() . ')');
		}

		$result_rows = $this->rowCount();
		$result_cols = $matrix->colCount();

		// accumulate results in this array
		$ret_array = new SplFixedArray($result_rows);

		// local copies avoid repeated object property lookups in the hot loop
		$td = $this->data;
		$md = $matrix->data;

		for ($i = 0; $i < $result_rows; $i++) {
			// current row from this matrix
			$drow = $td[$i];

			// accumulate output matrix row here
			$retrow = new SplFixedArray($result_cols);

			for ($j = 0; $j < $result_cols; $j++) {
				$sum = 0;
				for ($k = 0; $k < $iteration_count; $k++) {
					// BUGFIX: use the cached $md; the original re-read
					// $matrix->data here, defeating its own temp-var optimization
					$sum += $drow[$k] * $md[$k][$j];
				}
				$retrow[$j] = $sum;
			}

			$ret_array[$i] = $retrow;
		}
		return new static($ret_array);
	}

	/**
	 * Matrix product $this * $matrix, walking rows with foreach instead of
	 * indexed row lookups.
	 *
	 * BUGFIX: the original drove iteration with explicit rewind()/valid()/
	 * current()/next() calls, which SplFixedArray no longer exposes in PHP 8
	 * (it changed from Iterator to IteratorAggregate). foreach works in both
	 * PHP 7 and PHP 8 and produces the identical traversal.
	 *
	 * @param JMatrix $matrix right operand; its rowCount must equal this
	 *                        matrix's colCount
	 * @return static product matrix, rowCount x $matrix->colCount
	 * @throws Exception on nonconformant dimensions
	 */
	public function multiply2($matrix) {
		$iteration_count = $this->colCount();
		if ($iteration_count !== $matrix->rowCount()) {
			throw new Exception('nonconformant arguments (op1 is ' . $this->sizeAsString() . ', op2 is ' . $matrix->sizeAsString() . ')');
		}

		$result_cols = $matrix->colCount();

		// accumulate results in this array
		$ret_array = new SplFixedArray($this->rowCount());

		// local copies avoid repeated object property lookups in the hot loop
		$td = $this->data;
		$md = $matrix->data;

		foreach ($td as $i => $drow) {
			// accumulate output matrix row here
			$retrow = new SplFixedArray($result_cols);

			for ($j = 0; $j < $result_cols; $j++) {
				$sum = 0;
				// walk the rows of $matrix; row index $k pairs with this row's columns
				$k = 0;
				foreach ($md as $mrow) {
					$sum += $drow[$k] * $mrow[$j];
					$k++;
				}
				$retrow[$j] = $sum;
			}

			$ret_array[$i] = $retrow;
		}
		return new static($ret_array);
	}

	/**
	 * Return a new matrix with rows and columns swapped.
	 *
	 * BUGFIX: for a single-row matrix the original's array_map(null, ...$arr)
	 * degenerates — with one argument, array_map(null, $row) returns $row
	 * unchanged — so a 1xN matrix "transposed" to 1xN instead of Nx1. The
	 * single-row case is now built explicitly.
	 *
	 * @return static the NxM transpose of this MxN matrix
	 */
	public function transpose() {
		$rows = [];
		foreach ($this->data as $row) {
			$rows[] = (array)$row;
		}
		if (count($rows) === 1) {
			// build the Nx1 column vector explicitly
			$cols = [];
			foreach ($rows[0] as $v) {
				$cols[] = [$v];
			}
			return new static($cols);
		}
		// array_map with a null callback zips the rows into columns
		return new static(array_map(null, ...$rows));
	}
}


// Quick sanity check: multiply a small 2x3 by a 3x4 and transpose the result,
// with values printed so they can be verified by hand.
echo "SANITY CHECK\n";
$a = new JMatrix([
	[1, 4, -2],
	[3, 5, -6]
]);
echo $a;
echo "size: " .  $a->sizeAsString() . "\n";

$b = new JMatrix([
	[5, 2, 8, -1],
	[3, 6, 4, 5],
	[-2, 9, 7, -3]
]);
echo $b;
echo "size: " .  $b->sizeAsString() . "\n";

// 2x3 * 3x4 => 2x4 product
$v = $a->multiply2($b);
echo $v . "\n";
echo "size: " .  $v->sizeAsString() . "\n";

// transpose of the 2x4 product => 4x2
$vt = $v->transpose();
echo "$vt\n";
echo "size: " .  $vt->sizeAsString() . "\n";


// Benchmark against real training data. NOTE(review): assumes the JSON file
// exists next to this script and decodes to an array with an 'Xtrain' key
// holding a 2D numeric array — confirm before running.
$data_file = './training_data_sets.json';
$data = json_decode(file_get_contents($data_file), TRUE);

echo 'Xtrain rows: ' . sizeof($data['Xtrain']) . "\n";
echo 'Xtrain cols: ' . sizeof($data['Xtrain'][0]) . "\n";

$X = new JMatrix($data['Xtrain']);

// time the transpose
$start = microtime(TRUE);
$XT = $X->transpose();
echo "transpose in " . (microtime(TRUE) - $start) . " seconds\n";

// time the (much slower) matrix multiplication X * X'
$start = microtime(TRUE);
$K = $X->multiply2($XT);
echo "multiply2 in " . (microtime(TRUE) - $start) . " seconds\n";

multiply, my first attempt, runs in 973 seconds:

Xtrain rows: 1375
Xtrain cols: 1966
transpose in 0.79622912406921 seconds
multiply in 973.16395616531 seconds

multiply2, where I attempt to avoid indexed lookups, actually takes longer -- about 20 minutes 😢.

Xtrain rows: 1375
Xtrain cols: 1966
transpose in 0.79264497756958 seconds
multiply2 in 1199.6731460094 seconds

    Oh, well. Worth a try. (Incidentally, the instanceof operator could be used instead of the is_a function. Since SplFixedArray implements Countable you can use count() instead of calling ->getSize() manually; implementing IteratorAggregate means that the ->valid() ->rewind(), ->current(), and ->next() methods will be called by foreach as and when needed; and implementing the ArrayAccess interface means the ability to use array index notation as implemented by ->offsetGet() and ->offsetSet(), e.g. $this->data->offsetGet(0)->offsetSet($j, $v); becomes $this->data[0][$j] = $v;, And I'm guessing you're using PHP 7: SplFixedArray dropped the rewind() method in v8 when it was switched from being an Iterator to an IteratorAggregate. Woo. Big sidebar. But I did get some improvement by making those changes. Possibly because of the dedicated opcodes for those things.)

    But I think PHP will (even with JIT enabled) lag behind domain-specific languages that will have had dedicated native code for those tasks for which the language is intended ("a matrix multiplication? I have machine code for that compiled from C++ right here"); PHP's execution would still be passing through the various reference-counted zval structures and hashtables and whatnot before getting right to the gist of the task. (See, e.g. Internal value representation in PHP 7 - Part 2.)

    Weedpacket Oh, well. Worth a try. (Incidentally, the instanceof operator could be used instead of the is_a function...I did get some improvement by making those changes. Possibly because of the dedicated opcodes for those things.)

    Thanks for these helpful clarifications. It had occurred to me that the transpose operation is quite fast, and part of the slowness seems to be due to the array indexing required. Note how the fastest PHP implementation, PHP-ML, makes use of the array_column function and explicitly tries to avoid array indexing as much as possible. It therefore occurred to me that I might quickly transpose the matrix supplied to my multiply function and benefit from slightly more efficient array operations. However, I don't think these little changes will bring us anywhere near the supremely fast performance I see in Octave. I exported my 1375x1966 array to an octave-friendly text format and loaded it up on octave and multipled it with this little octave/matlab script:

    Xtrain = dlmread("Xtrain.txt");
    sz = size(Xtrain)
    t0 = clock();
    K = Xtrain * Xtrain';
    elapsed_time = etime(clock(), t0);
    fprintf("transpose multiplication completed in %d seconds\n", elapsed_time);

    Octave completes the matrix multiplication in a stunning 67ms:

    sz =
    
       1375   1966
    
    transpose multiplication completed in 0.0668411 seconds

    This is 4376.7 times faster than the fastest PHP code I've found, which is accomplished by the matrix classes in the PHP-ML library.

    NOTE that the PHP-ML library apparently punts to some kind of compiled executable to perform its SVM training operations. It constructs a command and invokes the external executable with exec.

    I consulted with a friend who's a talented python jockey who offered some interesting initial observations on matrix multiplication algorithms:

    Strassen's was the first notable speedup, 1970 — O(n ^ 2.8) iirc — and people have knocked the exponent down further and further in the ensuing decades. In fact just within the last two months I read about this recent improvement: as per https://en.wikipedia.org/wiki/Matrix_multiplication#Computational_complexity

    As of December 2020, the best matrix multiplication algorithm is by Josh Alman and
    Virginia Vassilevska Williams and has complexity O(n ^ 2.3728596).

    That statement just quoted has a footnote to this paper: https://arxiv.org/abs/2010.05846, "A Refined Laser Method and Faster Matrix Multiplication".
    It's a tiny speedup over the previous best-known algo: best was formerly 

    O(n^{2.37287 + ε}) for all ε;

    the paper improves that bound to 

    O(n^{2.37286 + ε}) for all ε;

    My friend rightly observes:

    It would not be easy to whip up working code from the paper, written for actual and aspiring specialists.

    He pointed out an interesting exercise in matrix operation optimization and noted that numpy is very fast. Like the paper, these articles get into a lot of the nitty gritty (i.e., the weeds) of bottlenecks and various optimizations.

    I guess Octave and Matlab use BLAS (Basic Linear Algebra Subprograms), originally a Fortran library now ported to C or something, under the hood.

    My kind friend imported my 1375x1966 matrix for some python profiling. Some interesting results on his 2015 Macbook Pro:

    Numpy takes 2.06s to matmult Xtrain by its transpose

    He took the trouble to write a "pure python" function which he ran in a "notebook" :

    PS I'm trying to time the naive algorithm in pure Python, operating either on the same numpy "ndarray"s, or on native Python data structures (list of lists of numbers)
    I've tried both, a few times; I give up waiting, I haven't seen either complete, it takes minutes (at least!) and I get bored.

    Safe conclusion: Performance of pure Python naive algorithm in a notebook is worse than PHP. Way worse: in a practical sense it doesn't even terminate.
    Performance at command line: (minutes later, still waiting… I'll check back in an hour, if it terminates ) 

    Maybe it's faster at the command line than in a notebook? I bet not enough to matter, but let's try. 
    With native Python datatypes: it takes at least tens of seconds... a minute… 5+ minutes… 
    Just checked back: it took 20m 29.15s
    (So, yes: faster than when running in a notebook, but still unusable.)

    He went on to make some minor improvements, but the 'pure python' approach, which doesn't make use of NumPY, is clearly slow like the pure PHP approach. Oddly enough, I find this something of a relief. I'd hate to think python is inherently better than my dear old PHP.

    The lesson here is that it's clearly advantageous--necessary!--to delegate matrix operations to something outside PHP (or python) which takes advantage of decades of algorithmic refinement by the world's finest nerds. I truly doubt that your typical linux server will have octave installed, but it's easy enough with apt if you have su access via CLI. I think your typical server is much more likely to have python installed. Or perhaps I could distribute my own executable to handle matrix operations? Any suggestions about farming out these matrix operations from PHP would be much appreciated. Ideally we'd find some tool for matrix operations that will already exist in your typical server so we can just exec it from PHP with reasonable hope of success.

      The last time I had to do anything numerically-matricy (it was actually a dynamic programming problem) that was just way too slow and/or large to run in PHP, I whipped up a program in C to do it and compiled (TinyCC was good enough) it as an executable that I communicated with via proc_open. That was years and major versions ago. If PHP had had FFI back then I would have considered compiling it as a library and calling the API directly (warning: requires third-party library, which I suspect is the same one NumPY ends up using) instead of that mucking around with stdio. (The PHP manual has a note that accessing FFI structures is twice as slow as accessing PHP ones, so "it makes no sense to use the FFI extension for speed". But accessing the structure isn't the bottleneck here.)

      Thanks, Weedpacket, for the detail. Your post is over my head, and I'll have to do some remedial study on the particulars of compiling/linking/assembling to understand the dynamics here. Offhand, it seems tricky to find some speedy lib external to PHP which can perform fast matrix operations. I expect there are two paths:
      1) try using exec to run some CLI commands in the hope that python or octave or some other package has been installed on the machine. Might also have to download a python lib or two like NumPy.
      2) try invoking some executable file that I cook up myself via exec or FFI or some other process-to-process method. This seems quite involved, and it sounds like a real chore to distribute executables that'll work on linux, MacOS, windows, or whatever.

      Obviously, my goal of a 'pure PHP' library is unattainable. I still hope that I might work up a solution that doesn't require su access to install (e.g., to configure PHP with FFI module or FANN, or to install octave or python).

        8 days later

        I google around for 'php language binding blas' and ran across this NP lib. It requires PHP 8 with FFI, as well as libblas and liblapack (not sure what the latter two are). It's evidently a work in progress and the composer command complains about stable versions on my MacOS laptop until I specified a dev branch:

        composer require ghostjat/np:dev-main

        This gets the composer install to work but then the PHP sample script they provide:

        require __DIR__ . '/vendor/autoload.php';
        use Np\matrix;
        
        $ta = matrix::randn(1000, 1000);    
        $tb = matrix::randn(1000, 1000); // to generate random 2d matrix
        $ta->dot($tb);                  // do a dot operation on given matrix
        $ta->getMemory();              // get memory use
        $ta->time();                  // get time

        fails, complaining that FFI:scope had a problem:

        PHP Fatal error: Uncaught FFI\Exception: Failed loading scope 'blas' in /Users/sneakyimp/np/vendor/ghostjat/np/src/core/blas.php:28
        Stack trace:
        #0 /Users/sneakyimp/np/vendor/ghostjat/np/src/core/blas.php(28): FFI::scope('blas')
        #1 /Users/sneakyimp/np/vendor/ghostjat/np/src/core/blas.php(73): Np\core\blas::init()
        #2 /Users/sneakyimp/np/vendor/ghostjat/np/src/linAlgb/linAlg.php(45): Np\core\blas::gemm(Object(Np\matrix), Object(Np\matrix), Object(Np\matrix))
        #3 /Users/sneakyimp/np/vendor/ghostjat/np/src/linAlgb/linAlg.php(30): Np\matrix->dotMatrix(Object(Np\matrix))
        #4 /Users/sneakyimp/np/foo.php(8): Np\matrix->dot(Object(Np\matrix))
        #5 {main}
        thrown in /Users/sneakyimp/np/vendor/ghostjat/np/src/core/blas.php on line 28
        
        Fatal error: Uncaught FFI\Exception: Failed loading scope 'blas' in /Users/sneakyimp/np/vendor/ghostjat/np/src/core/blas.php:28
        Stack trace:
        #0 /Users/sneakyimp/np/vendor/ghostjat/np/src/core/blas.php(28): FFI::scope('blas')
        #1 /Users/sneakyimp/np/vendor/ghostjat/np/src/core/blas.php(73): Np\core\blas::init()
        #2 /Users/sneakyimp/np/vendor/ghostjat/np/src/linAlgb/linAlg.php(45): Np\core\blas::gemm(Object(Np\matrix), Object(Np\matrix), Object(Np\matrix))
        #3 /Users/sneakyimp/np/vendor/ghostjat/np/src/linAlgb/linAlg.php(30): Np\matrix->dotMatrix(Object(Np\matrix))
        #4 /Users/sneakyimp/np/foo.php(8): Np\matrix->dot(Object(Np\matrix))
        #5 {main}
        thrown in /Users/sneakyimp/np/vendor/ghostjat/np/src/core/blas.php on line 28

        The FFI docs are sorely incomplete, so I've no idea what this command is meant to do or what conditions must be met for it to succeed. There is mention of FFI_SCOPE there in the docs, and there's a blas.h file that contains a #define that looks related to this, but I don't know how to fix this problem.

          I've made a tiny bit of progress with ghostjat/np by reading the source code (which is not well-commented) and by reading the PHP FFI documentation (which is not great). I don't have much faith that it'll be well-behaved and robust, but I have managed to get ghostjat/np to multiply my large matrix by its transpose and performance seems very fast. To get it working on my Ubuntu 20 workstation:

          ./composer.phar require ghostjat/np:dev-main

          Edit the file vendor/ghostjat/np/src/core/blas.h and change the value of FFI_LIB to refer to your BLAS shared library file. On my machine, this works:

          #define FFI_LIB "/usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so"

          NOTE I don't really know much about what this accomplishes. The FFI docs say:

          The header file may contain a #define statement for the FFI_LIB variable to specify the library it exposes. If it is a system library only the file name is required, e.g.: #define FFI_LIB "libc.so.6". If it is a custom library, a relative path is required, e.g.: #define FFI_LIB "./mylib.so".

          I'd also point out that I just got lucky by specifying that file. Just trial and error. A sudo apt search blas command returns a giant list of possible packages that one might or might not have installed on one's machine. My workstation has octave installed and I was able to locate one shared library, libopenblasp-r0.3.8.so, with this command:

          # barfs up lots of 'not permitted' warnings so route stderr to /dev/null
          sudo find / -name "*blas*.so" 2>/dev/null

          I'd like to take a moment to point out that this path is highly specific to my particular machine. THIS IS NOT REALLY PORTABLE AT ALL.

          Finally, I create this file bar.php in my working directory, which loads the JSON file containing my 1375*1966 matrix, among other things:

          require __DIR__ . '/vendor/autoload.php';
          use Np\matrix;
          
          $data = json_decode(file_get_contents(__DIR__ . '/../training_data_sets.json'), TRUE);
          
          // more trial and error...cooked this up after reading the ghostjat/np source code
          Np\core\blas::$ffi_blas = FFI::load(__DIR__ . '/vendor/ghostjat/np/src/core/blas.h');
          
          $ta = matrix::ar($data['Xtrain']);
          
          $start = microtime(true);
          $tb = $ta->transpose(); // to generate random 2d matrix
          echo "transpose elapsed time: " . (microtime(true) - $start) . "seconds\n";
          
          $start = microtime(true);
          $tc = $ta->dot($tb);
          echo "dot elapsed time: " . (microtime(true) - $start) . "seconds\n";
          
          $arr = $tc->asArray();
          echo "rows: " . sizeof($arr) . "\n";
          echo "cols: " . sizeof($arr[0]) . "\n";
          
          $json = json_encode($arr, JSON_PRETTY_PRINT);
          file_put_contents(__DIR__ . '/np-product.json', $json);
          echo "json md5 is " . md5($json) . "\n";

          It's fast!

          transpose elapsed time: 0.17577886581421seconds
          dot elapsed time: 0.089972019195557seconds
          rows: 1375
          cols: 1375
          json md5 is 4a33a230c1cf8bb6ab75dbd0d3d7bf0e

          The resulting JSON file (with pretty print turned on) has 1893376 lines. I calculate the md5 of the JSON for comparison with the other algorithms. These appear to all produce the exact same JSON:

          MarkBaker:

          require_once 'vendor/autoload.php';
          
          $data = json_decode(file_get_contents(__DIR__ . '/../training_data_sets.json'), TRUE);
          
          $X = new Matrix\Matrix($data['Xtrain']);
          
          $start = microtime(TRUE);
          $XT = $X->transpose();
          echo "getTranspose in " . (microtime(TRUE) - $start) . " seconds\n";
          
          $start = microtime(TRUE);
          $K = $X->multiply($XT);
          echo "dot product in " . (microtime(TRUE) - $start) . " seconds\n";
          
          $arr = $K->toArray();
          echo "rows: " . sizeof($arr) . "\n";
          echo "cols: " . sizeof($arr[0]) . "\n";
          $json = json_encode($arr, JSON_PRETTY_PRINT);
          echo "json md5 is " . md5($json) . "\n";
          file_put_contents(__DIR__ . '/markbaker-product.json', $json);
          
          echo "COMPLETE\n";

          Multiplication is crazy slow:

          getTranspose in 0.083314180374146 seconds
          dot product in 1227.2787439823 seconds
          rows: 1375
          cols: 1375
          json md5 is 4a33a230c1cf8bb6ab75dbd0d3d7bf0e
          COMPLETE

          NumPHP:

          require 'vendor/autoload.php';
          
          use NumPHP\Core\NumArray;
          
          $data = json_decode(file_get_contents(__DIR__ . '/../training_data_sets.json'), TRUE);
          
          $X = new NumArray($data['Xtrain']);
          
          $start = microtime(TRUE);
          $XT = $X->getTranspose();
          echo "getTranspose in " . (microtime(TRUE) - $start) . " seconds\n"; // 5.9611051082611 seconds
          
          $start = microtime(TRUE);
          $K = $X->dot($XT);
          echo "dot product in " . (microtime(TRUE) - $start) . " seconds\n"; // 466.75154590607 seconds
          
          $arr = $K->getData();
          echo "rows: " . sizeof($arr) . "\n";
          echo "cols: " . sizeof($arr[0]) . "\n";
          $json = json_encode($arr, JSON_PRETTY_PRINT);
          echo "json md5 is " . md5($json) . "\n";
          file_put_contents(__DIR__ . '/numphp-product.json', $json);
          
          echo "COMPLETE\n";

          transpose in numphp is surprisingly slow. Dot product unacceptably slow for deployment on a web server application:

          getTranspose in 3.0702431201935 seconds
          dot product in 224.07051301003 seconds
          rows: 1375
          cols: 1375
          json md5 is 4a33a230c1cf8bb6ab75dbd0d3d7bf0e
          COMPLETE

          PHP-ML:

          require_once __DIR__ . '/vendor/autoload.php';
          
          use Phpml\Math\Matrix;
          
          $data = json_decode(file_get_contents(__DIR__ . '/../training_data_sets.json'), TRUE);
          
          $X = new Matrix($data['Xtrain']);
          
          $start = microtime(TRUE);
          $XT = $X->transpose();
          echo "transpose in " . (microtime(TRUE) - $start) . " seconds\n";
          
          $start = microtime(TRUE);
          $K = $X->multiply($XT);
          echo "multiply in " . (microtime(TRUE) - $start) . " seconds\n"; // 292.54629206657 seconds
          
          $arr = $K->toArray();
          echo "rows: " . sizeof($arr) . "\n";
          echo "cols: " . sizeof($arr[0]) . "\n";
          $json = json_encode($arr, JSON_PRETTY_PRINT);
          echo "json md5 is " . md5($json) . "\n";
          file_put_contents(__DIR__ . '/php-ml-product.json', $json);
          
          echo "COMPLETE\n";

          Perhaps the best 'pure php' implementation, but still unacceptably slow for web server applications:

          transpose in 0.082174062728882 seconds
          multiply in 178.16531610489 seconds
          rows: 1375
          cols: 1375
          json md5 is 4a33a230c1cf8bb6ab75dbd0d3d7bf0e
          COMPLETE

          Home-rolled JMatrix:

          require_once __DIR__ . '/JMatrix.php';
          
          $data = json_decode(file_get_contents(__DIR__ . '/../training_data_sets.json'), TRUE);
          
          $X = new JMatrix($data['Xtrain']);
          
          $start = microtime(TRUE);
          $XT = $X->transpose();
          echo "transpose in " . (microtime(TRUE) - $start) . " seconds\n";
          
          $start = microtime(TRUE);
          $K = $X->multiply($XT);
          echo "multiply in " . (microtime(TRUE) - $start) . " seconds\n";
          
          $arr = $X->data;
          echo 'rows: ' . sizeof($arr) . "\n";
          echo 'cols: ' . sizeof($arr[0]) . "\n";

          This is also super slow:

          transpose in 0.51202392578125 seconds
          multiply in 739.11785197258 seconds
          rows: 1375
          cols: 1375
          json md5 is 4a33a230c1cf8bb6ab75dbd0d3d7bf0e

          The ghostjat/np method via FFI seems promising, but I have a lot of concerns:

          • where to find the right shared library on a given system? my macos laptop has no BLAS file ending in .so
          • does the blas.h file included in ghostjat/np match up with my shared lib file? Will different libraries have different .h files with different declarations?
          • which BLAS libraries are compatible with ghostjat/np?
          • portability: what to do on windows or macos?
          • how does FFI work? I don't yet understand why ghostjat/np can multiply so much faster, and there appears to be some new-fangled PHP at work. the matrix class extends nd, but neither of these has a dot function.

          sneakyimp I don't yet understand why ghostjat/np can multiply so much faster,

          blas is written in Fortran (and, for the purposes of Octave, translated to C during the build). It gets compiled to machine code via assembly. So the matrix multiplication is being done on CPU-native types using an optimised sequence of CPU instructions:

          	movl	$1, j.8(%rip)
          	jmp	.L2
          .L9:
          	movq	-56(%rbp), %rax
          	movl	(%rax), %eax
          	movl	%eax, -20(%rbp)
          	movl	$1, i__.7(%rip)
          	jmp	.L3
          .L4:
          	movl	j.8(%rip), %eax
          	imull	-8(%rbp), %eax
          	movl	%eax, %edx
          	movl	i__.7(%rip), %eax
          	addl	%edx, %eax
          	cltq
          	leaq	0(,%rax,8), %rdx
          	movq	56(%rbp), %rax
          	addq	%rdx, %rax
          	pxor	%xmm0, %xmm0
          	movsd	%xmm0, (%rax)
          	movl	i__.7(%rip), %eax
          	addl	$1, %eax
          	movl	%eax, i__.7(%rip)
          .L3:
          	movl	i__.7(%rip), %eax
          	cmpl	%eax, -20(%rbp)
          	jge	.L4
          	movq	-72(%rbp), %rax
          	movl	(%rax), %eax
          	movl	%eax, -20(%rbp)
          	movl	$1, l.6(%rip)
          	jmp	.L5
          .L8:
          	movl	j.8(%rip), %eax
          	imull	-16(%rbp), %eax
          	movl	%eax, %edx
          	movl	l.6(%rip), %eax
          	addl	%edx, %eax
          	cltq
          	leaq	0(,%rax,8), %rdx
          	movq	32(%rbp), %rax
          	addq	%rdx, %rax
          	movsd	(%rax), %xmm0
          	movsd	%xmm0, temp.5(%rip)
          	movq	-56(%rbp), %rax
          	movl	(%rax), %eax
          	movl	%eax, -12(%rbp)
          	movl	$1, i__.7(%rip)
          	jmp	.L6
          .L7:
          	movl	j.8(%rip), %eax
          	imull	-8(%rbp), %eax
          	movl	%eax, %edx
          	movl	i__.7(%rip), %eax
          	addl	%edx, %eax
          	cltq
          	leaq	0(,%rax,8), %rdx
          	movq	56(%rbp), %rax
          	addq	%rdx, %rax
          	movsd	(%rax), %xmm1
          	movl	l.6(%rip), %eax
          	imull	-4(%rbp), %eax
          	movl	%eax, %edx
          	movl	i__.7(%rip), %eax
          	addl	%edx, %eax
          	cltq
          	leaq	0(,%rax,8), %rdx
          	movq	16(%rbp), %rax
          	addq	%rdx, %rax
          	movsd	(%rax), %xmm2
          	movsd	temp.5(%rip), %xmm0
          	mulsd	%xmm2, %xmm0
          	movl	j.8(%rip), %eax
          	imull	-8(%rbp), %eax
          	movl	%eax, %edx
          	movl	i__.7(%rip), %eax
          	addl	%edx, %eax
          	cltq
          	leaq	0(,%rax,8), %rdx
          	movq	56(%rbp), %rax
          	addq	%rdx, %rax
          	addsd	%xmm1, %xmm0
          	movsd	%xmm0, (%rax)
          	movl	i__.7(%rip), %eax
          	addl	$1, %eax
          	movl	%eax, i__.7(%rip)
          .L6:
          	movl	i__.7(%rip), %eax
          	cmpl	%eax, -12(%rbp)
          	jge	.L7
          	movl	l.6(%rip), %eax
          	addl	$1, %eax
          	movl	%eax, l.6(%rip)
          .L5:
          	movl	l.6(%rip), %eax
          	cmpl	%eax, -20(%rbp)
          	jge	.L8
          	movl	j.8(%rip), %eax
          	addl	$1, %eax
          	movl	%eax, j.8(%rip)
          .L2:
          	movl	j.8(%rip), %eax
          	cmpl	%eax, -24(%rbp)
          	jge	.L9

          The PHP-only implementations are not. Even if the Zend VM's opcodes are jitted, it would still be doing all the reference-counted copy-on-write zobject manipulation stuff that is PHP's managed environment.

          Thanks, weedpacket. If I'm not mistaken, that is assembler code you posted? Where did this code come from? Are you inspecting source code for some BLAS lib? Thanks also for the blas link.

          It seems obvious to me that pure PHP approaches to matrix multiplication are futile. The fastest PHP implementation is about 2000 times slower on this particular $X->dot($XT) operation. The FFI approach, on the other hand, runs in 80 milliseconds, which is plenty fast for web server applications.

          My jury-rigged approach using the ghostjat/np library evidently does yield the correct matrix product, but I'm really concerned that I might be doing something wrong. I don't really know what could happen, but this warning in the FFI docs seems pretty stark:

          Caution
          FFI is dangerous, since it allows to interface with the system on a very low level. The FFI extension should only be used by developers having a working knowledge of C and the used C APIs. To minimize the risk, the FFI API usage may be restricted with the ffi.enable php.ini directive.

          My approach is clearly not portable, either. A given machine might be running Windows or MacOS or one of the myriad flavors of linux and, if I'm not mistaken, my workstation only happens to have libopenblasp-r0.3.8.so installed because I had previously installed octave on it using the Ubuntu package manager, apt. I hope there might be some way to provide the blazing speed of a proper BLAS library to PHP in a way that doesn't require a user to have root/su access to install system packages. Perhaps there's some way to distribute a shared library with my PHP source?

          I also am concerned that libopenblasp-r0.3.8.so might be totally incompatible with the blas.h file distributed in the ghostjat/np lib. I did a diff between that file and the cblas.h file linked in the netlib link and the two files have quite a few dissimilarities. E.g., ghostjat blas.h defines some things that cblas does not, e.g.

          void    openblas_set_num_threads(int num_threads);

          conversely, cblas defines functions ghostjat blas does not, e.g.

          void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
            const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
            const void *alpha, const void *A, const int lda,
             const void *B, const int ldb, const double beta,
             void *C, const int ldc);

          But they do seem to agree on some functions:

          //cblas.h
          double cblas_dsdot(const int N, const float *X, const int incX, const float *Y,
            const int incY);
          //ghostjat-blas.h
          double cblas_dsdot(const int N, const float *X, const int incX, const float *Y,
            const int incY);

          I'd be content to devise some adaptation of ghostjat's lib that works on linux platforms without the need for root/su permissions to install additional packages via yum/apt. I'd be delighted if I could get it to work on MacOS, too. I note that my mac laptop has some .SO files containing the string "blas" which were apparently installed with python:

          $ sudo find / -name "*blas*.so" 2>/dev/null
          Password:
          /System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/numpy/core/_dotblas.so
          /System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/scipy/linalg/_fblas.so
          /System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/scipy/lib/blas/fblas.so
          /System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/scipy/lib/blas/cblas.so
          /System/Volumes/Data/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/numpy/core/_dotblas.so
          /System/Volumes/Data/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/scipy/linalg/_fblas.so
          /System/Volumes/Data/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/scipy/lib/blas/fblas.so
          /System/Volumes/Data/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/scipy/lib/blas/cblas.so

          I've tried putting the path of both of those cblas.so libs in my blas.h file:

          // tried this, doesn't work
          #define FFI_LIB "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/scipy/lib/blas/cblas.so"
          // this also doesn't work
          #define FFI_LIB "/System/Volumes/Data/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/scipy/lib/blas/cblas.so"

          but FFI complains about both of them:

          PHP Fatal error:  Uncaught FFI\Exception: Failed loading '/System/Volumes/Data/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/scipy/lib/blas/cblas.so' in /Users/sneakyimp/Desktop/biz/machine-learning/np/foo.php:8
          Stack trace:
          #0 /Users/sneakyimp/Desktop/biz/machine-learning/np/foo.php(8): FFI::load('/Users/sneakyimp/Des...')
          #1 {main}
            thrown in /Users/sneakyimp/Desktop/biz/machine-learning/np/foo.php on line 8

          I searched the mac file system for files containing the string "libblas" and found /usr/lib/libblas.dylib. I tried putting that in blas.h:

          #define FFI_LIB "/usr/lib/libblas.dylib"

          And my php script doesn't like that either:

          PHP Fatal error:  Uncaught FFI\Exception: Failed resolving C function 'openblas_set_num_threads' in /Users/sneakyimp/Desktop/biz/machine-learning/np/foo.php:8
          Stack trace:
          #0 /Users/sneakyimp/Desktop/biz/machine-learning/np/foo.php(8): FFI::load('/Users/sneakyimp/Des...')
          #1 {main}
            thrown in /Users/sneakyimp/Desktop/biz/machine-learning/np/foo.php on line 8

          sneakyimp Thanks, weedpacket. If I'm not mistaken, that is assembler code you posted? Where did this code come from? Are you inspecting source code for some BLAS lib? Thanks also for the blas link.

          Yes; DGEMM is the BLAS routine that does matrix-matrix multiplication. I converted that to C (which probably added a layer of indirection complexity, but I didn't use any optimisation switches), cut out the premultiplier and accumulator (set alpha to 1 and beta to 0), compiled it to assembly, and excerpted the part that runs if neither matrix is to be transposed. The corresponding Fortran is

                        DO 90 J = 1,N
                            IF (BETA.EQ.ZERO) THEN
                                DO 50 I = 1,M
                                    C(I,J) = ZERO
             50                 CONTINUE
                            ELSE IF (BETA.NE.ONE) THEN
                                DO 60 I = 1,M
                                    C(I,J) = BETA*C(I,J)
             60                 CONTINUE
                            END IF
                            DO 80 L = 1,K
                                TEMP = ALPHA*B(L,J)
                                DO 70 I = 1,M
                                    C(I,J) = C(I,J) + TEMP*A(I,L)
             70                 CONTINUE
             80             CONTINUE
             90         CONTINUE

          With ALPHA and BETA being hardcoded to 1 and 0, respectively, which is why they don't show up in the assembler.

          sneakyimp My jury-rigged approach using the ghostjat/np library evidently does yield the correct matrix product, but I'm really concerned that I might be doing something wrong. I don't really know what could happen, but this warning in the FFI docs seems pretty stark:

          Well, that's just a warning that external and OS libraries can do anything on your system. It's like the warning around eval only moreso, since now you've got the ability to execute arbitrary code at the operating system level (like, on Windows you can call the InitiateSystemShutdown method in advapi32.dll).

          However, like I said, I haven't had that much experience with FFI. You might like to consider downloading cblas.tgz and cblas.h from netlib above and compiling the library yourself. Then you'd have something you could distribute with the rest of the application.

          Weedpacket However, like I said, I haven't had that much experience with FFI. You might like to consider downloading cblas.tgz and cblas.h from netlib above and compiling the library yourself. Then you'd have something you could distribute with the rest of the application.

          This sounds like a reasonable next step. I would point out that PHP-ML delegates some operations to libsvm, and distributes distinct executables/libs for linux, macos, and windows. If I distribute my own shared library, that would eliminate the need to search for whatever flavor of BLAS may (or may not) be installed on the target machine, and we could be sure that we had the correct blas.h file.

          I am somewhat sheepishly wondering whether a CBLAS library compiled on my linux workstation would work on other linux distros? I dimly suspect that the chip architecture matters, and harbor a feeble hope that if I compile CBLAS on my intel system that the resulting .SO file would also run on red hat or centos if that system also had a 64-bit intel architecture -- and possibly also might run on machines with AMD chips? Sadly, I'm not very knowledgeable about these things.

          It seems to me that ghostjat/np is not well designed to address the need to specifically locate whichever BLAS library may (or may not) be installed on one's machine and then parse the correct blas.h file for that library. The blas.h distributed with ghostjat/np just refers to libblas.so:

          #define FFI_LIB "libblas.so"

            I'm not having much luck compiling CBLAS on either MacOS or my Ubuntu workstation. Part of the problem is that the compiler output is greek to me. If I'm not mistaken, the process appears to build an archive file, lib/cblas_LINUX.a, but the make operation fails. I posted stack overflow seeking help and I'm getting confusing/confused posts about either installing libopenblas instead or installing libblas. I thought compiling cblas would be creating a BLAS library?

            I've been sniffing around the OpenBLAS library releases and the various zip/tgz archives I've downloaded don't appear to contain any .so files.

            Lots of trial and error so far. I've been trying some of the .h files and .a files here but FFI doesn't seem to like any of them but its original blas.h file, modified to refer to the /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so shared lib which happens to be installed on my machine. It also seems like FFI has its own ideas about what a valid blas.h file is. It complains about semicolons and other #define statements in the cblas.h file from that netlib site, for instance.

            for my own reference: further googling i found some suggested steps, which may be helpful.
            EDIT: those instructions didn't work, either. I'm starting to wonder if perhaps the complaint is related to the fortran code? Calling make from the cblas folder fails and this is the final output:

            gcc -I../include -O3 -DADD_ -c c_sblas1.c
            gfortran -O3   -c c_sblat1.f
            c_sblat1.f:214:48:
            
              214 |                CALL STEST1(SNRM2TEST(N,SX,INCX),STEMP,STEMP,SFAC)
                  |                                                1
            Error: Rank mismatch in argument 'strue1' at (1) (scalar and rank-1)
            c_sblat1.f:218:48:
            
              218 |                CALL STEST1(SASUMTEST(N,SX,INCX),STEMP,STEMP,SFAC)
                  |                                                1
            Error: Rank mismatch in argument 'strue1' at (1) (scalar and rank-1)
            make[1]: *** [c_sblat1.o] Error 1
            make: *** [alltst] Error 2

            I wonder if perhaps the BLAS fortran code is old somehow?

              Soooo I have been able to download and make/build the latest OpenBlas and build it both on Ubuntu 20 and MacOs (Catalina, I think? With intel chip) and get it to work with ghostjat/np on both platforms.

              The steps, should anyone be curious. These are the ubuntu steps but should also work with little modification on MacOs -- e.g., download the file with your browser instead of using wget

              #make a dir for openblas:
              mkdir openblas
              cd openblas
              #download latest OpenBlas, extract it, cd into the new dir and make the project.
              # make operation takes a while, and your machine will need a compiler and probably gfortran, etc.:
              wget https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21.tar.gz
              tar -xvzf OpenBLAS-0.3.21.tar.gz
              cd OpenBLAS-0.3.21
              make

              Take note of the library's location, which will probably have a name specific to your machine's CPU architecture. In my case, this is it: libopenblas_sandybridgep-r0.3.21.so

              Take note of the full path to that lib:
              /home/sneakyimp/biz/machine-learning/openblas/OpenBLAS-0.3.21/libopenblas_sandybridgep-r0.3.21.so

              We can now put that shared lib path in the blas.h file included with the ghostjat/np PHP code. I repeat here the steps to set up a brand new ghostjat/np file:

              cd ..
              mkdir np
              cd np
              # you may need to get composer, see: https://getcomposer.org/download/
              ./composer.phar require ghostjat/np:dev-main
              #edit the blas.h file that comes with it:
              nano vendor/ghostjat/np/src/core/blas.h

              In the blas.h file, change the FFI_LIB to point to the full path of the OpenBLAS lib you just built:
              #define FFI_LIB "/home/sneakyimp/biz/machine-learning/openblas/OpenBLAS-0.3.21/libopenblas_sandybridgep-r0.3.21.so"

              Now you should be able to run this PHP file, bar.php which performs a matrix multiplication:

              <?php
              require __DIR__ . '/vendor/autoload.php';
              use Np\matrix;
              
              // NOTE THIS IS MY DATA YOU'LL NEED TO COOK YOUR OWN
              // OR REFER BACK TO OUR PRIOR CORRESPONDENCE
              $data = json_decode(file_get_contents(__DIR__ . '/../training_data_sets.json'), TRUE);
              
              // more trial and error...cooked this up after reading the ghostjat/np source code
              Np\core\blas::$ffi_blas = FFI::load(__DIR__ . '/vendor/ghostjat/np/src/core/blas.h');
              
              $ta = matrix::ar($data['Xtrain']);
              
              $start = microtime(true);
              $tb = $ta->transpose(); // to generate random 2d matrix
              echo "transpose elapsed time: " . (microtime(true) - $start) . "seconds\n";
              
              $start = microtime(true);
              $tc = $ta->dot($tb);
              echo "dot elapsed time: " . (microtime(true) - $start) . "seconds\n";
              
              $arr = $tc->asArray();
              echo "rows: " . sizeof($arr) . "\n";
              echo "cols: " . sizeof($arr[0]) . "\n";
              
              $json = json_encode($arr, JSON_PRETTY_PRINT);
              file_put_contents(__DIR__ . '/np-product.json', $json);
              echo "json md5 is " . md5($json) . "\n";

              While I am delighted that bar.php runs and produces the correct output, I would point out that I'm just using the blas.h file that is distributed with ghostjat/np. I'm still quite concerned that blas.h might be totally mismatched against the latest OpenBlas. I tried using the cblas.h that is distributed with OpenBLAS, but FFI::load complains about it. Here's a typical error:

              PHP Fatal error:  Uncaught FFI\ParserException: ';' expected, got '<STRING>' at line 8 in /home/jaith/biz/machine-learning/np2/bar.php:9
              Stack trace:
              #0 /home/jaith/biz/machine-learning/np2/bar.php(9): FFI::load()
              #1 {main}

              It would appear that FFI has some stunted/limited ability to parse these h files, but its limitations are not very clear, nor am I sure how to convert the OpenBlas header file to something FFI can digest. The documentation offers a little help:

              C preprocessor directives are not supported, i.e. #include, #define and CPP macros do not work, except for special cases listed below.
              The header file should contain a #define statement for the FFI_SCOPE variable, e.g.: #define FFI_SCOPE "MYLIB". Refer to the class introduction for details.
              The header file may contain a #define statement for the FFI_LIB variable to specify the library it exposes. If it is a system library only the file name is required, e.g.: #define FFI_LIB "libc.so.6". If it is a custom library, a relative path is required, e.g.: #define FFI_LIB "./mylib.so".

              There's also a user comment that seems promising:

              Since #include's and #define's other than FFI_LIB and FFI_SCOPE are not supported in the header, you may want to use the C preprocessor to pre-process your headers in order to resolve all #include's and macros.

              I use -D"__attribute__(ARGS)=" to remove function attributes as well, which are not supported by FFI either.

              This is my script:

              echo '#define FFI_SCOPE "YOUR_SCOPE"' > header-ffi.h
              echo '#define FFI_LIB "/path/to/your_lib.so"' >> header-ffi.h
              cpp -P -C -D"__attribute__(ARGS)=" header_original >> header-ffi.h

              Unfortunately, I tried it and, although it produced a 15,000-line header-ffi.h when I ran it on cblas.h, this file also produced FFI errors. I've not been able to use any blas.h/cblas.h file from CBLAS or OpenBLAS and get it working. Any advice on how I might do this would be much appreciated.

              I'm also a bit concerned about using OpenBLAS for a couple of reasons:

              • this is a big project, probably overkill. It takes a while to make and the shared lib libopenblas_sandybridgep-r0.3.21.so is 14MB
              • I worry a bit about security. The contributors include quite a few far flung devs and the Chinese Academy of Sciences

              I have been able to build CBLAS on my ubuntu machine by first separately downloading and building BLAS and editing the CBLAS Makefile to refer to the static library it produces in this line:

              BLLIB = /home/sneakyimp/biz/machine-learning/blas/BLAS-3.11.0/blas_LINUX.a

              This won't build on MacOS, complaining about the previously mentioned fortran stuff, but on Ubuntu 20 it seems to build. Unfortunately, it does not produce any .so shared libraries. It just produces a static library, lib/cblas_LINUX.a. I've tried that with ghostjat but it fails.

              Is there some simple flag I can provide to make so that CBLAS will build the shared libraries (i.e., the .so files)? Can anyone suggest an approach to creating a blas.h file for ghostjat/np that better matches either OpenBLAS or CBLAS?

                More trial and error...

                Looks like ghostjat/np requires OpenBLAS rather than CBLAS. Thanks to this little recipe I was finally able to get CBLAS compiled into a cblas_LINUX.so shared lib by making some changes to the CBLAS makefile. I then edited the blas.h file vendor/ghostjat/np/src/core/blas.h to point to this newly built library:

                #define FFI_LIB "/home/sneakyimp/biz/machine-learning/blas/CBLAS/lib/cblas_LINUX.so"

                and tried to run my bar.php file and ghostjat/np is evidently using openblas functions:

                $ php bar.php
                PHP Fatal error:  Uncaught FFI\Exception: Failed resolving C function 'openblas_set_num_threads' in /home/sneakyimp/biz/machine-learning/np/bar.php:9
                Stack trace:
                #0 /home/sneakyimp/biz/machine-learning/np/bar.php(9): FFI::load()
                #1 {main}
                  thrown in /home/sneakyimp/biz/machine-learning/np/bar.php on line 9

                This is disappointing as CBLAS seemed much more compact. I.e., cblas_LINUX.so is only 130K, whereas libopenblas_sandybridgep-r0.3.21.so is a hulking 14MB. Kinda makes one wonder what is in that shared library?

                This now brings me back to my attempts to try and improve the blas.h file that's currently in ghostjat/np. It is quite dissimilar to the h files in either OpenBlas or CBLAS. I've tried manually removing preprocessor directives from the various blas.h and cblas.h files that exist in these libraries, but I can't seem to get FFI to parse the files that result. I tried that approach mentioned in the PHP-FFI documentation and it yielded a giant, 15000-line h file that also barfed.

                Is it safe to use the blas.h that comes with ghostjat/np if I am compiling OpenBlas or CBLAS from source?

                sneakyimp Is it safe to use the blas.h that comes with ghostjat/np if I am compiling OpenBlas or CBLAS from source?

                For the purposes of FFI, I reckon the header file only needs to declare the interface that you're actually using. So you'd be writing it from scratch (starting with the #define FFI_LIB to name the library).

                sneakyimp Kinda makes one wonder what is in that shared library?

                At a brief glance, for a start it looks like the whole of LAPACK is in there as well.
                https://www.openblas.net/

                  I am not a C/C++ pro but I cooked up some PHP to parse the ghostjat/np blas.h file and the OpenBlas cblas.h file and report some stuff. I can't guarantee this grabs all the fn definitions declared, but it seems pretty good:

                  // ghostjat file
                  $gj_h_file = __DIR__ . '/vendor/ghostjat/np/src/core/blas.h';
                  echo "parsing $gj_h_file\n";
                  $code = file_get_contents($gj_h_file);
                  $pattern = '#^([a-z\*_]+)\s+([a-z0-9_]+)\s*\(([^)]+)\)#ismU';
                  $matches = null;
                  $mcount = preg_match_all($pattern, $code, $matches);
                  echo "$mcount functions declared in ghostjat\n";
                  $gj_fns = $matches[2];
                  
                  // there are several blas.h files in OpenBlas, i just parse the first
                  //openblas/OpenBLAS-0.3.21/cblas.h
                  //openblas/OpenBLAS-0.3.21/lapack-netlib/CBLAS/include/cblas.h
                  //openblas/OpenBLAS-0.3.21/relapack/src/blas.h
                  $ob_h_file = realpath(__DIR__ . '/../openblas/OpenBLAS-0.3.21/cblas.h');
                  echo "parsing $ob_h_file\n";
                  $code = file_get_contents($ob_h_file);
                  $pattern = '#^([a-z\*_]+)\s+([a-z0-9_]+)\s*\(([^)]+)\)#ismU';
                  $matches = null;
                  $mcount = preg_match_all($pattern, $code, $matches);
                  echo "$mcount functions declared in openblas\n";
                  $ob_fns = $matches[2];
                  
                  $gj_not_ob = array_diff($gj_fns, $ob_fns);
                  echo sizeof($gj_not_ob) . " fns in gj but not ob\n";
                  //echo implode("\n", $gj_not_ob), "\n";
                  
                  $ob_not_gj = array_diff($ob_fns, $gj_fns);
                  echo sizeof($ob_not_gj) . " fns in ob but not gj\n";
                  //echo implode("\n", $ob_not_gj), "\n";

                  OpenBlas appears to declare some 200 functions, and FFI doesn't like a lot of the stuff in there. I've not yet managed any version of this file that survives FFI::load.

                  parsing /home/sneakyimp/biz/machine-learning/np/vendor/ghostjat/np/src/core/blas.h
                  44 functions declared in ghostjat
                  parsing /home/sneakyimp/biz/machine-learning/openblas/OpenBLAS-0.3.21/cblas.h
                  201 functions declared in openblas
                  0 fns in gj but not ob
                  157 fns in ob but not gj

                  I haven't bothered to compare return type or parameter declarations, but it looks like every function in the ghostjat/np blas.h file is accounted for in the OpenBlas.h file. I have a dim inkling this is good, and imagine it means that FFI will somehow locate all the functions declared in blas.h in the libopenblas_sandybridgep-r0.3.21.so shared library and, conversely, we aren't declaring functions in PHP that don't exist in the library. It seems like it'd be bad if our PHP code thought there was some function in the library that wasn't in there and we tried to jump to the wrong memory location? I really have no idea, though.

                    Well, the header file is used to define the object returned by FFI::cdef; which methods and types it exposes. If they're not in the header they're not part of the object's interface and trying to call something else would be a plain old "Call to undefined method" error. The basic examples have the header declaration within the PHP code, instead of using the external file/preloader stuff used by FFI::load. The latter is more if you want to provide a full-featured Openblas interface to PHP users as part of your application. If you're only going to call dgemm then that and a couple of enums (CBLAS_ORDER, CBLAS_TRANSPOSE) are all you need to declare. And if dgemm is all you're using you might be able to compile a library from source code containing just that function after translating the Fortran to C yourself (or just write a matrix multiplication routine from scratch and just use the blas code as a guideline).

                    8 days later

                    While the OpenBlas performance is certainly lightning fast, I'm beginning to have serious doubts about how to distribute any code I might write which requires distribution of a libopenblas.so file. I downloaded the openblas source and built it on my Ubuntu 20 workstation:

                    mkdir ~/openblas
                    cd ~/openblas
                    wget https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21.tar.gz
                    tar -xvzf OpenBLAS-0.3.21.tar.gz
                    cd OpenBLAS-0.3.21
                    make

                    Because of the openblas configuration, this yields a file which apparently has a name specific to my intel CPU's architecture, libopenblas_sandybridgep-r0.3.21.so.

                    I uploaded this libopenblas_sandybridgep-r0.3.21.so file and my bar.php along with its dependencies to a freshly set up Ubuntu 22.04 virtual machine (an Amazon EC2 instance with php 8.1 installed). I modified the bar.php script and blas.h file to refer to the correct file locations in the EC2 vm and tried to run it and it hates the so file:

                    $ php bar.php
                    PHP Fatal error:  Uncaught FFI\Exception: Failed loading '/home/sneakyimp/np/libopenblas_sandybridgep-r0.3.21.so' in /home/sneakyimp/np/bar.php:9
                    Stack trace:
                    #0 /home/sneakyimp/np/bar.php(9): FFI::load()
                    #1 {main}
                      thrown in /home/sneakyimp/np/bar.php on line 9

                    It would appear that OpenBlas doesn't build shared libraries that are at all portable, at least not by default. I suspect this might be due to highly optimized code. E.g., assembler or something.

                    I also note that this freshly established Ubuntu 22.04 vm lacks the tools to build openblas. Attempting to the download & make steps above on the vm yields a complaint:

                    $ make
                    Command 'make' not found, but can be installed with:
                    sudo apt install make        # version 4.3-4.1build1, or
                    sudo apt install make-guile  # version 4.3-4.1build1

                    SUMMARY: Portability of OpenBLAS is a problem. It appears to build a lib specific to not just OS but also the CPU architecture. I can't even build a shared lib on my u20 workstation that'll run on another machine running u22. Forget about other linux distros or MacOS or windows. Also, the tools to make the OpenBLAS source code are not installed by default. To even make the source, you'd have to install the tools. It'd be completely impractical to try and precompile a lib SO file for every common architecture/distro/release to include as part of some composer package. I might add that the SO lib file is 14 Megabytes and it takes quite a while to compile -- a minute perhaps? Two minutes?

                    I wonder just how narrowly compatible a particular open blas lib is? My workstation is Intel(R) Core(TM) i7-4820K CPU on Ubuntu 20 and the amazon EC2 Ubuntu 22.04 reports Intel(R) Xeon(R) CPU E5-2676 v3. Is there some way to compile for wider compatibility? I wonder how Numpy deals with this?

                      I've been lurking on this thread for the past couple of weeks while I've been busy testing matrix multiplication algorithms written in pure ANSI C/C++:

                      https://github.com/cubiclesoft/matrix-multiply

                      If you've got a few minutes and don't mind running a stranger's code, I'd like to see what output you get for:

                      matrixmult bench n 6 1 1375 1966 1375
                      matrixmult bench n 8 1 1375 1966 1375
                      matrixmult bench n 9 1 1375 1966 1375
                      matrixmult bench c 6 1 1375 1966 1375
                      matrixmult bench c 8 1 1375 1966 1375
                      matrixmult bench c 9 1 1375 1966 1375

                      I get this as output on my computer:

                      matrixmult bench n 6 1 1375 1966 1375
                      1.702430
                      
                      matrixmult bench n 8 1 1375 1966 1375
                      1.718223
                      
                      matrixmult bench n 9 1 1375 1966 1375
                      1.491469
                      
                      matrixmult bench c 6 1 1375 1966 1375
                      1.972144
                      
                      matrixmult bench c 8 1 1375 1966 1375
                      1.742858
                      
                      matrixmult bench c 9 1 1375 1966 1375
                      1.781659

                      So about 1.5 to 2 seconds for your test array size of 1375x1966 on my computer. Obviously not as fast as CBLAS but it compiles and runs and is roughly 100 times faster than the fastest pure PHP userland implementation you've attempted to use. And should hopefully be in the ballpark of NumPy's runtime.

                      CBLAS might be the fastest BUT getting it to build, based on watching both your AND previously Christoph's (cmb69) attempts to build it:

                      https://github.com/RubixML/Tensor/issues/22

                      Reveals that Tensor/CBLAS/LAPACK is unlikely to build in certain environments and a pain in the neck to build in all other environments. The RubixML Tensor project also appears to be largely dead:

                      https://github.com/RubixML/Tensor/issues/32

                      You might have also noticed that FFI only works with the PHP CLI. The regular web server versions (web SAPIs) of PHP do not have FFI compiled in for security reasons.

                      As to building in the cloud, you need to install the "build-essential" package to get gcc, make, etc. As to architectures for OpenBlas, the clue is in the name "Sandy bridge." Intel has different microcode architectures and it looks like BLAS/CBLAS/OpenBlas targets each one very specifically. As to why BLAS runs so fast, there's a second clue in the path where you find it: pthreads. BLAS starts multiple threads and splits up the workload across them. You either run longer (single threaded) OR draw more wattage from the wall (multithreaded). However, if multiple users are using the same machine, multithreading might actually be a hindrance rather than a help.

                      As to the question about NumPy, the official NumPy website says their implementation is written in pure C. So they probably aren't using BLAS/CBLAS/LAPACK, which explains why it takes a couple of seconds of CPU to complete instead of 0.08 seconds.

                      There is some good news though. I've been working on something for the past couple of months: A new PHP extension that is actually a hodgepodge of functions I'm calling the "Quality of Life Improvement Functions" extension. I figured I was basically done until you suddenly showed up on the PHP Internals list with your comments about matrix multiplication. The next step is to take one of the matrix multiplication algorithms and implement it into the extension. The extension itself already has a bunch of random, fully tested functions that look like someone barfed up 15 different ideas and threw them all into one extension...because that's pretty much what happened. So what's a couple more functions?

                      By the way, transposing an array can already be accomplished with array_map():

                      array_map(null, ...$arraytotranspose);   // The ... prefix expands the array as individual parameters.

                      I'm not certain if array_map() is the fastest way to transpose an array in PHP but it is already built into the language. The extension I've made aims to be free of superfluous stuff that will likely be rejected by the core devs because I intend to push for adoption into PHP core. So a dedicated mat_transpose() function won't fit into that model since array_map() can already handle that operation natively.

                      Multiplying two matrices together in PHP will also need some careful thought about how to avoid someone using all available CPU time on the web SAPIs without making a bunch of unnecessary calls to a timer routine to avoid exceeding the web SAPI timeout period.

                      Hope this at least answers some of your questions. But it will probably be a few more weeks until I have the extension ready for any sort of use since the holidays are upon us.

                      cubic If you've got a few minutes and don't mind running a stranger's code, I'd like to see what output you get for...

                      Here are my results:

                      matrixmult bench n 6 1 1375 1966 1375
                      2.204161
                      matrixmult bench n 8 1 1375 1966 1375
                      2.312766
                      matrixmult bench n 9 1 1375 1966 1375
                      2.729322
                      matrixmult bench c 6 1 1375 1966 1375
                      2.254812
                      matrixmult bench c 8 1 1375 1966 1375
                      2.321597
                      matrixmult bench c 9 1 1375 1966 1375
                      2.810967

                      My machine's CPU is a 4-core/8-thread Intel Core i7-4820K CPU @ 3.70GHz. I've got 28GB of RAM. I note that the ubuntu system monitor shows one thread/core max out while it's running. Assuming you can efficiently feed 1375 x 1966 ints/floats into it, those performance numbers are quite respectable. As it currently is, I have no idea what it's multiplying.

                      Tensor doesn't seem to work for me. My workstation has PHP 8.2 and pecl stubbornly refused to work:

                      $ sudo pecl install tensor
                      [sudo] password for sneakyimp: 
                      WARNING: channel "pecl.php.net" has updated its protocols, use "pecl channel-update pecl.php.net" to update
                      pecl/tensor requires PHP (version >= 7.4.0, version <= 8.1.99), installed version is 8.2.0
                      No valid packages found
                      install failed

                      Trying to compile it directly also fails:

                      /home/sneakyimp/biz/machine-learning/tensor/Tensor/ext/kernel/main.c: In function ‘zephir_function_exists’:
                      /home/sneakyimp/biz/machine-learning/tensor/Tensor/ext/kernel/main.c:285:101: warning: comparison between pointer and integer
                        285 |  if (zend_hash_str_exists(CG(function_table), Z_STRVAL_P(function_name), Z_STRLEN_P(function_name)) != NULL) {
                            |                                                                                                     ^~
                      /home/sneakyimp/biz/machine-learning/tensor/Tensor/ext/kernel/main.c: In function ‘zephir_function_exists_ex’:
                      /home/sneakyimp/biz/machine-learning/tensor/Tensor/ext/kernel/main.c:301:76: warning: comparison between pointer and integer
                        301 |  if (zend_hash_str_exists(CG(function_table), function_name, function_len) != NULL) {
                            |                                                                            ^~
                      /home/sneakyimp/biz/machine-learning/tensor/Tensor/ext/kernel/main.c: In function ‘zephir_get_arg’:
                      /home/sneakyimp/biz/machine-learning/tensor/Tensor/ext/kernel/main.c:571:9: error: too many arguments to function ‘zend_forbid_dynamic_call’
                        571 |     if (zend_forbid_dynamic_call("func_get_arg()") == FAILURE) {
                            |         ^~~~~~~~~~~~~~~~~~~~~~~~
                      In file included from /usr/include/php/20220829/main/php.h:35,
                                       from /home/sneakyimp/biz/machine-learning/tensor/Tensor/ext/kernel/main.c:16:
                      /usr/include/php/20220829/Zend/zend_API.h:782:39: note: declared here
                        782 | static zend_always_inline zend_result zend_forbid_dynamic_call(void)
                            |                                       ^~~~~~~~~~~~~~~~~~~~~~~~
                      make: *** [Makefile:213: kernel/main.lo] Error 1

                      This guy offers an update that will compile and I made/installed it. You get the tensor extension listed in the installed extensions and you get various classes defined, but it doesn't seem to work when you try to multiply my data (click that issue to see my results). It seems to work sometimes, but the dimensions of the result are wrong. Other times it segfaults.

                      cubic You might have also noticed that FFI only works with the PHP CLI. The regular web server versions (web SAPIs) of PHP do not have FFI compiled in for security reasons.

                      I think it's possible to get FFI working in a web server environment. The docs have some detail about preloading FFI definitions and libs 'at PHP startup':

                      FFI definition parsing and shared library loading may take significant time. It is not useful to do it on each HTTP request in a Web environment. However, it is possible to preload FFI definitions and libraries at PHP startup, and to instantiate FFI objects when necessary. Header files may be extended with special FFI_SCOPE defines (e.g. #define FFI_SCOPE "foo"; the default scope is "C") and then loaded by FFI::load() during preloading. This leads to the creation of a persistent binding, that will be available to all the following requests through FFI::scope(). Refer to the complete PHP/FFI/preloading example for details.

                      cubic As to building in the cloud, you need to install the "build-essential" package to get gcc, make, etc. As

                      Yep. And this is more confusing/complicated than just sudo apt install php-module-whatever and therefore might be a greater impediment.

                      cubic As to architectures for OpenBlas, the clue is in the name "Sandy bridge." Intel has different microcode architectures and it looks like BLAS/CBLAS/OpenBlas targets each one very specifically

                      Curiously, my CPU is Ivy Bridge, not Sandy Bridge. I might add that OpenBLAS in its current incarnation is MASSIVE (the compiled shared lib is 14MB!) and features contributions from Russian devs with yandex addresses as well as the Institute of Software Chinese Academy of Sciences.

                      cubic As to why BLAS runs so fast, there's a second clue in the path where you find it: pthreads. BLAS starts multiple threads and splits up the workload across them. You either run longer (single threaded) OR draw more wattage from the wall (multithreaded). However, if multiple users are using the same machine, multithreading might actually be a hindrance rather than a help.

                      Good observations here.

                      cubic A new PHP extension that is actually a hodgepodge of functions I'm calling the "Quality of Life Improvement Functions" extension.

                      I admire your ambition, but the 'hodgepodge' nature might prove an impediment to acceptance by the curmudgeonly community.

                      cubic As to the question about NumPy, the official NumPy website says their implementation is written in pure C. So they probably aren't using BLAS/CBLAS/LAPACK, which explains why it takes a couple of seconds of CPU to complete instead of 0.08 seconds.

                      I have a vague recollection of seeing some BLAS libs in various python paths on my Mac. Sorry I do not have additional information. I suspect NumPy is doing something ffi-like, but can't offer any authoritative detail.

                      cubic transposing an array can already be accomplished with array_map():

                      I saw someone else doing that and that's how my own sad matrix multiplier was defining transpose().

                      cubic Multiplying two matrices together in PHP will also need some careful thought about how to avoid someone using all available CPU time on the web SAPIs without making a bunch of unnecessary calls to a timer routine to avoid exceeding the web SAPI timeout period.

                      You might keep a running tally of how much time a calculation has taken and examine the system load average? Maybe let the calling scope specify some limits on how greedy it can be? I've done something like this in an image-munging asset pipeline.

                      cubic Hope this at least answers some of your questions. But it will probably be a few more weeks until I have the extension ready for any sort of use since the holidays are upon us.

                      It's nice to have input from someone who seems to understand and appreciate the issue, so thanks for your input here. I'd very much like to feed my matrix into your multiplier and get a matrix/array (a php array of arrays) out so I can check the results.