#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

float powf(float, float);

// Accumulates the product of a 4-vector and a flat 4x4 matrix into out:
//   out[i] += m[i + 0*4]*v[0] + m[i + 1*4]*v[1] + m[i + 2*4]*v[2] + m[i + 3*4]*v[3]
// one term at a time, for i = 0..3.
// out is NOT cleared first - the caller has to zero it to get a plain product.
// m is the "flat" 16-element matrix (described by the original author as
// ordered by rows); the element paired with out[i] and v[j] is m[i + j*4].
void vectorMatrixMult(float *v, float *m, float *out) {
	int dst, src;

	for (dst = 0; dst < 4; ++dst) {
		// stride-4 walk starting at m[dst]
		const float *lane = m + dst;

		src = 0;
		while (src < 4) {
			out[dst] += lane[src * 4] * v[src];
			++src;
		}
	}
}

// Accumulates the n x n product m1 * m2 into out:
//   out[i][j] += m1[i][k] * m2[k][j]   for every k in 0..n-1.
// out is NOT cleared first - zero it beforehand for a plain product.
// Works on arrays of row pointers (float **); it has not been converted to
// the flat matrix layout used by other routines in this file.
void sqMatrixMult(float **m1, float **m2, int n, float **out) {
	int row, col, inner;

	for (row = 0; row < n; ++row) {
		for (col = 0; col < n; ++col) {
			inner = 0;
			while (inner < n) {
				out[row][col] += m1[row][inner] * m2[inner][col];
				++inner;
			}
		}
	}
}

/*
   Determinant of an n x n matrix stored flat in a (n*n floats).

   Element (i, j) is read as a[(j*n)+i].  The value is computed recursively
   by cofactor expansion over the elements with i == 0, i.e. a[j1*n] for
   j1 = 0..n-1; each one is multiplied by the determinant of the minor
   obtained by dropping i == 0 and j == j1, with alternating sign.

   Returns a[0] for n == 1 and 0 for n <= 0.
   Minors of up to 3x3 (n <= 4) live on the stack; larger ones are
   heap-allocated once per call.  If that allocation fails the function
   returns 0, since there is no error channel in the signature.
*/
float Determinant(float *a, int n)
{
   float small[3*3];   /* minor storage when n <= 4, avoids malloc */
   float *m;
   float det = 0;
   int i,j,j1,j2, dec = n - 1;

   if (n < 1)
      return 0;
   if (n == 1)
      return a[0];
   if (n == 2)
      return a[0] * a[3] - a[2] * a[1];

   if (dec <= 3) {
      m = small;
   } else {
      m = malloc((size_t)dec * (size_t)dec * sizeof *m);
      if (m == NULL)
         return 0;
   }

   for (j1=0;j1<n;j1++) {
      /* Build the minor that excludes i == 0 and j == j1 */
      j2 = 0;
      for (j=0;j<n;j++) {
         if (j == j1)
            continue;
         for (i=1;i<n;i++)
            m[(j2*dec)+(i-1)] = a[(j*n)+i];
         j2++;
      }
      /* The expansion element is (i = 0, j = j1), which is a[j1*n] in
         this layout (not a[j1]). */
      det += (j1 % 2 ? -1 : 1) * a[j1*n] * Determinant(m,dec);
   }

   if (m != small)
      free(m);
   return(det);
}

/*
   Signed 3x3 minors of a flat 4x4 matrix.

   a : input, 16 floats, read as a[(jj*4)+ii]
   b : output, 16 floats, every element is overwritten

   For each pair (i, j) a 3x3 minor is built from a by dropping index i on
   the ii axis and index j on the jj axis.  Its determinant, negated when
   i+j is odd, is stored at b[(i*4)+j].

   Note the asymmetry: a is addressed as jj*4+ii but b as i*4+j, so b comes
   out transposed relative to the indexing used for a (in a's layout b holds
   the adjugate rather than the plain cofactor matrix).

   a and b must not overlap: b is written while a is still being read.
*/
void CoFactor(float *a, float *b)
{
   int i,j,ii,jj,i1,j1;
   float c[3*3];   /* scratch minor; fixed size, so no heap allocation */

   for (j=0;j<4;j++) {
      for (i=0;i<4;i++) {

         /* Form the minor for (i, j) */
         i1 = 0;
         for (ii=0;ii<4;ii++) {
            if (ii == i)
               continue;
            j1 = 0;
            for (jj=0;jj<4;jj++) {
               if (jj == j)
                  continue;
               c[(j1*3)+i1] = a[(jj*4)+ii];
               j1++;
            }
            i1++;
         }

         /* Fill in the element: sign is -1 when i+j is odd */
         b[(i*4)+j]= ( ((i+j) % 2 ? -1 : 1) * Determinant(c,3));
      }
   }
}

/*
   Signed 3x3 minors of a flat 4x4 matrix, each multiplied by s.

   a : input, 16 floats, read as a[(jj*4)+ii]
   b : output, 16 floats, every element is overwritten
   s : scale factor applied to every output element

   For each pair (i, j) a 3x3 minor is built from a by dropping index i on
   the ii axis and index j on the jj axis.  Its determinant, negated when
   i+j is odd and then multiplied by s, is stored at b[(i*4)+j].

   Because a is addressed as jj*4+ii and b as i*4+j, b is laid out
   transposed relative to the indexing used for a - hence "transposed" in
   the name.  With s = 1/det(a) the result is the inverse of a.

   a and b must not overlap: b is written while a is still being read.
*/
void transposedScaledCoFactor(float *a, float *b, float s)
{
   int i,j,ii,jj,i1,j1;
   float c[3*3];   /* scratch minor; fixed size, so no heap allocation */

   for (j=0;j<4;j++) {
      for (i=0;i<4;i++) {

         /* Form the minor for (i, j) */
         i1 = 0;
         for (ii=0;ii<4;ii++) {
            if (ii == i)
               continue;
            j1 = 0;
            for (jj=0;jj<4;jj++) {
               if (jj == j)
                  continue;
               c[(j1*3)+i1] = a[(jj*4)+ii];
               j1++;
            }
            i1++;
         }

         /* Fill in the element: sign is -1 when i+j is odd, then scale */
         b[(i*4)+j]= ( ((i+j) % 2 ? -1 : 1) * Determinant(c,3)) * s;
      }
   }
}

/*
   In-place transpose of a flat leng x leng matrix: every off-diagonal
   pair a[(r*leng)+c] / a[(c*leng)+r] is exchanged exactly once.
   Nothing happens for leng <= 1.
*/
void Transpose(float *a, int leng)
{
   int r, c;

   for (r = 0; r < leng; ++r) {
      for (c = r + 1; c < leng; ++c) {
         float *above = &a[(r*leng)+c];
         float *below = &a[(c*leng)+r];
         float held = *above;

         *above = *below;
         *below = held;
      }
   }
}

// Multiplies every element of the flat leng x leng matrix a by scale,
// in place. Does nothing when leng <= 0.
void scalarMatrixMult(float scale, float *a, int leng) {
	int count, k;

	if (leng <= 0) {
		return;
	}

	// the matrix is one contiguous run of leng*leng floats
	count = leng * leng;
	for (k = 0; k < count; ++k) {
		a[k] *= scale;
	}
}


// Prints the n x n row-pointer matrix a to stdout: each element as "%f, ",
// one matrix row per output line.
void printRolledArray(float **a, int n) {
	int r, c;

	for (r = 0; r < n; ++r) {
		c = 0;
		while (c < n) {
			printf("%f, ",a[r][c]);
			++c;
		}
		printf("\n");
	}
}

// Copies the flat leng x leng matrix a into the row-pointer matrix b so that
// b[i][j] = a[(j*leng)+i].
// b and each of its leng rows must already be allocated by the caller.
void rollUpArray(float *a, int leng, float **b) {
	int r, c;

	for (r = 0; r < leng; ++r) {
		// elements feeding row r sit leng apart, starting at a[r]
		const float *src = a + r;

		c = 0;
		while (c < leng) {
			b[r][c] = src[c * leng];
			++c;
		}
	}
}

// Copies the row-pointer leng x leng matrix a into the flat array b so that
// b[(j*leng)+i] = a[i][j].
// b must already provide room for leng*leng floats.
void unrollSqArray(float **a, int leng, float *b) {
	int r, c;

	for (r = 0; r < leng; ++r) {
		// row r lands at stride leng in b, starting at b[r]
		float *dst = b + r;

		c = 0;
		while (c < leng) {
			dst[c * leng] = a[r][c];
			++c;
		}
	}
}

// Legacy in-place inversion of the flat 4x4 matrix a (16 floats):
// a is replaced by CoFactor(a) scaled by 1/Determinant(a,4).
//
// CoFactor() reads a as a[(jj*4)+ii] but writes b[(i*4)+j], so its output is
// already transposed relative to a's layout (it is the adjugate). The extra
// Transpose() that used to follow it therefore produced the transpose of the
// inverse; it has been removed so this routine agrees with invertMatrix().
//
// There is no singularity check: a zero determinant makes the scale factor
// 1.0f/0 and that value is propagated into a.
void invertMatrixOld(float *a) {
	// fixed-size scratch on the stack: nothing to allocate, check or free
	float result[16];

	CoFactor(a, result);
	scalarMatrixMult(1.0f/Determinant(a,4),result, 4);
	memcpy(a,result,sizeof result);
}
// In-place inversion of the flat 4x4 matrix a (16 floats): a is replaced by
// transposedScaledCoFactor(a, ., 1/Determinant(a,4)).
//
// The result is built in a scratch buffer because transposedScaledCoFactor()
// reads its input while writing its output.
//
// There is no singularity check: a zero determinant makes the scale factor
// 1.0f/0 and that value is propagated into a.
void invertMatrix(float *a) {
	// fixed-size scratch on the stack: nothing to allocate, check or free
	float result[16];

	transposedScaledCoFactor(a, result, 1.0f/Determinant(a,4));
	memcpy(a,result,sizeof result);
}

/* 4x4 matrix inversion
 * http://cache-www.intel.com/cd/00/00/01/76/17668_24504301.pdf
 *
 * Invert() is Gauss-Jordan elimination with pivoting (the original,
 *   hand-unrolled version of this routine was labelled "unrolled GEPP")
 * Invert2() is unrolled Cramer's rule inversion (cofactor method)
 *
 * Invert(b, a):
 *   b : input 4x4 matrix, not modified (unless it is the same array as a)
 *   a : output 4x4 matrix; receives a copy of b which is then reduced in
 *       place to the inverse
 *
 * Each of the four passes picks the largest-magnitude entry among the rows
 * and columns that have not yet hosted a pivot, swaps it onto the diagonal,
 * normalises that row and eliminates the pivot column from the other rows.
 * The row swaps are undone at the end as column swaps, in reverse order.
 *
 * Singular input: when a pass finds no non-zero candidate (all remaining
 * entries are zero or NaN) the function returns early and a is left in its
 * partially reduced state.  There is no status return.
 */
void Invert(float b[][4], float a[][4])
{
  int indxc[4], indxr[4];       /* pivot row/column chosen in each pass */
  int used[4] = { 0, 0, 0, 0 }; /* index already served as a pivot? */
  int i, j, k;

  for(j = 0; j < 4; j++) {
    for(k = 0; k < 4; k++) {
      a[j][k] = b[j][k];
    }
  }

  for(i = 0; i < 4; i++) {
    float big = 0.0f;
    float pivinv, temp;
    int irow = -1;
    int icol = -1;

    /* pivot search over unused rows x unused columns */
    for(j = 0; j < 4; j++) {
      if(used[j]) {
        continue;
      }
      for(k = 0; k < 4; k++) {
        float mag;
        if(used[k]) {
          continue;
        }
        mag = a[j][k] < 0.0f ? -a[j][k] : a[j][k];
        if(mag > big) {
          big = mag;
          irow = j;
          icol = k;
        }
      }
    }
    /* Nothing exceeded 0: the matrix is singular.  Bail out instead of
     * indexing with an unset (or stale) pivot position. */
    if(icol < 0) {
      return;
    }
    used[icol] = 1;

    /* move the pivot onto the diagonal */
    if(irow != icol) {
      for(k = 0; k < 4; k++) {
        temp = a[irow][k];
        a[irow][k] = a[icol][k];
        a[icol][k] = temp;
      }
    }
    indxr[i] = irow;
    indxc[i] = icol;

    /* The pivot is non-zero by construction (its magnitude exceeded
     * big >= 0), so the division is safe. */
    pivinv = 1.0f / a[icol][icol];
    a[icol][icol] = 1.0f;
    for(k = 0; k < 4; k++) {
      a[icol][k] *= pivinv;
    }

    /* eliminate the pivot column from every other row */
    for(j = 0; j < 4; j++) {
      float dum;
      if(j == icol) {
        continue;
      }
      dum = a[j][icol];
      a[j][icol] = 0.0f;
      for(k = 0; k < 4; k++) {
        a[j][k] -= a[icol][k] * dum;
      }
    }
  }

  /* undo the row interchanges as column swaps, last pass first */
  for(i = 3; i >= 0; i--) {
    if(indxr[i] != indxc[i]) {
      int ir = indxr[i];
      int ic = indxc[i];
      for(j = 0; j < 4; j++) {
        float temp = a[j][ir];
        a[j][ir] = a[j][ic];
        a[j][ic] = temp;
      }
    }
  }
}

/*
 * Invert2 - 4x4 inversion by cofactors (the "Cramer's rule" variant named in
 * the comment preceding Invert()).
 *
 *   mat : input, 16 floats
 *   dst : output, 16 floats; every element is overwritten
 *
 * mat is first copied, transposed, into the local array src; after that
 * only src is read, so dst may be the same array as mat.
 * The 16 cofactor expressions are written to dst, the determinant is taken
 * as the dot product of src[0..3] with dst[0..3], and dst is then scaled by
 * 1/det.
 *
 * There is no singularity check: if det evaluates to 0 the scale factor is
 * 1/0 and that value is multiplied into every element of dst.
 */
void Invert2(float *mat, float *dst)
{
  float tmp[12];                /* temp array for pairs */
  float src[16];                /* array of transpose source matrix */
  float det;                    /* determinant */
	int i,j;
  /* transpose matrix: src[c*4 + i] = mat[i*4 + c] */
  for(i = 0; i < 4; i++) {
    src[i] = mat[i * 4];
    src[i + 4] = mat[i * 4 + 1];
    src[i + 8] = mat[i * 4 + 2];
    src[i + 12] = mat[i * 4 + 3];
  }
  /* calculate pairs for first 8 elements (cofactors):
   * products of two entries taken from src[8..15] */
  tmp[0] = src[10] * src[15];
  tmp[1] = src[11] * src[14];
  tmp[2] = src[9] * src[15];
  tmp[3] = src[11] * src[13];
  tmp[4] = src[9] * src[14];
  tmp[5] = src[10] * src[13];
  tmp[6] = src[8] * src[15];
  tmp[7] = src[11] * src[12];
  tmp[8] = src[8] * src[14];
  tmp[9] = src[10] * src[12];
  tmp[10] = src[8] * src[13];
  tmp[11] = src[9] * src[12];
  /* calculate first 8 elements (cofactors):
   * each dst[k] is (sum of three products) minus (sum of three products) */
  dst[0] = tmp[0] * src[5] + tmp[3] * src[6] + tmp[4] * src[7];
  dst[0] -= tmp[1] * src[5] + tmp[2] * src[6] + tmp[5] * src[7];
  dst[1] = tmp[1] * src[4] + tmp[6] * src[6] + tmp[9] * src[7];
  dst[1] -= tmp[0] * src[4] + tmp[7] * src[6] + tmp[8] * src[7];
  dst[2] = tmp[2] * src[4] + tmp[7] * src[5] + tmp[10] * src[7];
  dst[2] -= tmp[3] * src[4] + tmp[6] * src[5] + tmp[11] * src[7];
  dst[3] = tmp[5] * src[4] + tmp[8] * src[5] + tmp[11] * src[6];
  dst[3] -= tmp[4] * src[4] + tmp[9] * src[5] + tmp[10] * src[6];
  dst[4] = tmp[1] * src[1] + tmp[2] * src[2] + tmp[5] * src[3];
  dst[4] -= tmp[0] * src[1] + tmp[3] * src[2] + tmp[4] * src[3];
  dst[5] = tmp[0] * src[0] + tmp[7] * src[2] + tmp[8] * src[3];
  dst[5] -= tmp[1] * src[0] + tmp[6] * src[2] + tmp[9] * src[3];
  dst[6] = tmp[3] * src[0] + tmp[6] * src[1] + tmp[11] * src[3];
  dst[6] -= tmp[2] * src[0] + tmp[7] * src[1] + tmp[10] * src[3];
  dst[7] = tmp[4] * src[0] + tmp[9] * src[1] + tmp[10] * src[2];
  dst[7] -= tmp[5] * src[0] + tmp[8] * src[1] + tmp[11] * src[2];
  /* calculate pairs for second 8 elements (cofactors):
   * tmp is reused, now holding products of entries from src[0..7] */
  tmp[0] = src[2] * src[7];
  tmp[1] = src[3] * src[6];
  tmp[2] = src[1] * src[7];
  tmp[3] = src[3] * src[5];
  tmp[4] = src[1] * src[6];
  tmp[5] = src[2] * src[5];
  tmp[6] = src[0] * src[7];
  tmp[7] = src[3] * src[4];
  tmp[8] = src[0] * src[6];
  tmp[9] = src[2] * src[4];
  tmp[10] = src[0] * src[5];
  tmp[11] = src[1] * src[4];
  /* calculate second 8 elements (cofactors) */
  dst[8] = tmp[0] * src[13] + tmp[3] * src[14] + tmp[4] * src[15];
  dst[8] -= tmp[1] * src[13] + tmp[2] * src[14] + tmp[5] * src[15];
  dst[9] = tmp[1] * src[12] + tmp[6] * src[14] + tmp[9] * src[15];
  dst[9] -= tmp[0] * src[12] + tmp[7] * src[14] + tmp[8] * src[15];
  dst[10] = tmp[2] * src[12] + tmp[7] * src[13] + tmp[10] * src[15];
  dst[10] -= tmp[3] * src[12] + tmp[6] * src[13] + tmp[11] * src[15];
  dst[11] = tmp[5] * src[12] + tmp[8] * src[13] + tmp[11] * src[14];
  dst[11] -= tmp[4] * src[12] + tmp[9] * src[13] + tmp[10] * src[14];
  dst[12] = tmp[2] * src[10] + tmp[5] * src[11] + tmp[1] * src[9];
  dst[12] -= tmp[4] * src[11] + tmp[0] * src[9] + tmp[3] * src[10];
  dst[13] = tmp[8] * src[11] + tmp[0] * src[8] + tmp[7] * src[10];
  dst[13] -= tmp[6] * src[10] + tmp[9] * src[11] + tmp[1] * src[8];
  dst[14] = tmp[6] * src[9] + tmp[11] * src[11] + tmp[3] * src[8];
  dst[14] -= tmp[10] * src[11] + tmp[2] * src[8] + tmp[7] * src[9];
  dst[15] = tmp[10] * src[10] + tmp[4] * src[8] + tmp[9] * src[9];
  dst[15] -= tmp[8] * src[9] + tmp[11] * src[10] + tmp[5] * src[8];
  /* calculate determinant: src[0..3] dotted with the first four cofactors */
  det = src[0] * dst[0] + src[1] * dst[1] + src[2] * dst[2] + src[3] * dst[3];
  /* calculate matrix inverse: scale every cofactor by 1/det
   * (det is reused to hold the reciprocal; no check for det == 0) */
  det = 1 / det;
  for(j = 0; j < 16; j++)
    dst[j] *= det;
}
