compositor: implement inverse matrix transformation

Implement 4x4 matrix inversion based on LU-decomposition with partial
pivoting.

Instead of simply computing the inverse matrix explicitly, introduce the
type struct weston_inverse_matrix for storing the LU-decomposition and
the permutation from pivoting. Using doubles, this struct has greater
precision than struct weston_matrix.

If you need only few (less than 5, presumably) multiplications with the
inverse matrix, is it cheaper to use weston_inverse_matrix, and not
compute the inverse matrix explicitly into a weston_matrix.

Signed-off-by: Pekka Paalanen <ppaalanen@gmail.com>
diff --git a/src/matrix.c b/src/matrix.c
index 941e958..06cdc5e 100644
--- a/src/matrix.c
+++ b/src/matrix.c
@@ -1,5 +1,6 @@
 /*
  * Copyright © 2011 Intel Corporation
+ * Copyright © 2012 Collabora, Ltd.
  *
  * Permission to use, copy, modify, distribute, and sell this software and
  * its documentation for any purpose is hereby granted without fee, provided
@@ -22,6 +23,7 @@
 
 #include <string.h>
 #include <stdlib.h>
+#include <math.h>
 #include <GLES2/gl2.h>
 #include <wayland-server.h>
 
@@ -101,9 +103,126 @@
 	*v = t;
 }
 
+static inline void
+swap_rows(double *a, double *b)
+{
+	unsigned k;
+	double tmp;
+
+	for (k = 0; k < 13; k += 4) {
+		tmp = a[k];
+		a[k] = b[k];
+		b[k] = tmp;
+	}
+}
+
+static inline unsigned
+find_pivot(double *column, unsigned k)
+{
+	unsigned p = k;
+	for (++k; k < 4; ++k)
+		if (fabs(column[p]) < fabs(column[k]))
+			p = k;
+
+	return p;
+}
+
+/*
+ * reference: Gene H. Golub and Charles F. van Loan. Matrix computations.
+ * 3rd ed. The Johns Hopkins University Press. 1996.
+ * LU decomposition, forward and back substitution: Chapter 3.
+ */
+
 WL_EXPORT int
-weston_matrix_invert(struct weston_matrix *inverse,
+weston_matrix_invert(struct weston_inverse_matrix *inverse,
 		     const struct weston_matrix *matrix)
 {
-	return -1; /* fail */
+	double A[16];
+	unsigned p[4] = { 0, 1, 2, 3 };
+	unsigned i, j, k;
+	unsigned pivot;
+	double pv;
+
+	for (i = 16; i--; )
+		A[i] = matrix->d[i];
+
+	/* LU decomposition with partial pivoting */
+	for (k = 0; k < 4; ++k) {
+		pivot = find_pivot(&A[k * 4], k);
+		if (pivot != k) {
+			unsigned tmp = p[k];
+			p[k] = p[pivot];
+			p[pivot] = tmp;
+			swap_rows(&A[k], &A[pivot]);
+		}
+
+		pv = A[k * 4 + k];
+		if (fabs(pv) < 1e-9)
+			return -1; /* zero pivot, not invertible */
+
+		for (i = k + 1; i < 4; ++i) {
+			A[i + k * 4] /= pv;
+
+			for (j = k + 1; j < 4; ++j)
+				A[i + j * 4] -= A[i + k * 4] * A[k + j * 4];
+		}
+	}
+
+	memcpy(inverse->LU, A, sizeof(A));
+	memcpy(inverse->p, p, sizeof(p));
+	return 0;
+}
+
+WL_EXPORT void
+weston_matrix_inverse_transform(struct weston_inverse_matrix *inverse,
+				struct weston_vector *v)
+{
+	/* Solve A * x = v, when we have P * A = L * U.
+	 * P * A * x = P * v  =>  L * U * x = P * v
+	 * Let U * x = b, then L * b = P * v.
+	 */
+	unsigned *p = inverse->p;
+	double *LU = inverse->LU;
+	double b[4];
+	unsigned k, j;
+
+	/* Forward substitution, column version, solves L * b = P * v */
+	/* The diagonal of L is all ones, and not explicitly stored. */
+	b[0] = v->f[p[0]];
+	b[1] = (double)v->f[p[1]] - b[0] * LU[1 + 0 * 4];
+	b[2] = (double)v->f[p[2]] - b[0] * LU[2 + 0 * 4];
+	b[3] = (double)v->f[p[3]] - b[0] * LU[3 + 0 * 4];
+	b[2] -= b[1] * LU[2 + 1 * 4];
+	b[3] -= b[1] * LU[3 + 1 * 4];
+	b[3] -= b[2] * LU[3 + 2 * 4];
+
+	/* backward substitution, column version, solves U * y = b */
+#if 1
+	/* hand-unrolled, 25% faster for whole function */
+	b[3] /= LU[3 + 3 * 4];
+	b[0] -= b[3] * LU[0 + 3 * 4];
+	b[1] -= b[3] * LU[1 + 3 * 4];
+	b[2] -= b[3] * LU[2 + 3 * 4];
+
+	b[2] /= LU[2 + 2 * 4];
+	b[0] -= b[2] * LU[0 + 2 * 4];
+	b[1] -= b[2] * LU[1 + 2 * 4];
+
+	b[1] /= LU[1 + 1 * 4];
+	b[0] -= b[1] * LU[0 + 1 * 4];
+
+	b[0] /= LU[0 + 0 * 4];
+#else
+	for (j = 3; j > 0; --j) {
+		b[j] /= LU[j + j * 4];
+		for (k = 0; k < j; ++k)
+			b[k] -= b[j] * LU[k + j * 4];
+	}
+
+	b[0] /= LU[0 + 0 * 4];
+#endif
+
+	/* the result */
+	for (j = 0; j < 4; ++j)
+		v->f[j] = b[j];
 }