Nevermind, here it is. Scaled partial pivoting and all.


#include <stdlib.h> /*malloc*/
#include <stdio.h> /*printf*/
#include <math.h> /*fabs*/
#include <memory.h> /*memset*/

typedef double control_type;

void display(control_type *m, int n, int o, int digits)
{/*display a matrix, "null" if null, 3 digits if bad value.
if you are saving thousands of numbers and need
speed, customize this routine; it is relatively slow.
*/
int c,d;
/* int rnd; */
char ch[5000]; /*sometimes bad numerics can
create HUGE numbers, which can create a memory crash if this
is smaller...*/
char p1;
if (( digits > 18) || (digits < 1))
digits = 3;
ch[1] = 0;
if(m == NULL)
{
printf("%s","null\n");
return;
}

for(c = 0; c<n*o; c++)
{
p1 = 0;
sprintf(ch, "%1.19f",m[c]); /*compete w/ matlab*/
if(ch[0] == '9')
p1++;
d = 0;
while(ch[d++] != '.');
ch[d+digits] = 0;
printf("%s ",ch);
if((c+1) % (o) == 0)
printf("\n");
}
printf("\n");
}



int checklegal(control_type *m, int n, int o)
{/*true if no zeros on diagonal.*/
int i;
for( i = 0; i< n; i++) /*n*n*/
{/*if(fabs(m[j+i*o])<1e-14) /* tol version*/
if(m[i+i*o]==0)
return(0);
}
return(1);
}



control_type *rswap(control_type *m, int n, int o, int r1, int r2)
{/*r1,r2 are from row 0
swaps rows
*/
int i;
control_type tmp;
if(r1 == r2) /*fast exit for no exchanges*/
return(m);
r1 = r1*o; /*avoiding the multiply in loop saved time*/
r2 = r2*o;
for(i = 0; i<o; i++)
{
tmp = m[r1+i];
m[r1+i]= m[r2+i];
m[r2+i]=tmp;
}
return(m);
}




control_type *ident(int size)
{
/*returns the size * size identity. */
control_type *i;
int a;
i = (control_type*)malloc(sizeof(control_type)*size*size);
memset(i,0,sizeof(control_type)*size*size);
for (a = 0; a< size; a++)
i[a*size+a] = 1;
return(i);
}






control_type * scale(control_type *m, int n, int o)
{/*scale largest element in each row to one.*/

control_type *scaled;
int i,j;
control_type s;
scaled = (control_type*)malloc(sizeof(control_type)*n*o);
for(i = 0; i<n*o; i++)
scaled[i]=m[i];
for(i = 0; i<n; i++)
{
s = 0;
for(j = 0; j<o; j++)
{
if(fabs(scaled[j+i*o])>s)
s = fabs(scaled[j+i*o]);
}
if(s == 0) /*fix /0 issues*/
s = 0.00000001;
for(j = 0; j<o; j++)
{
scaled[j+i*o] /= s;
}
}
return(scaled);
}

control_type *pivotfix(control_type *m, int n, int o)
{/*seeks good row (1 in proper place) before swapping. (thus avoiding
more swaps later) probably not as fast as it could be, but any lost time

is not noticed here compared to inverse.*/

int i,j,kr;
int er;
long double k;

kr = 0;
er = n-1;
if(m[er*o+er] == 0) /*force last row to not have zero in last col.*/
{
for(j = 0; j<n-1; j++)
{
if(m[er+j*o] != 0)
{
kr = j;
j = n; /*break loop...*/
}
}
m = rswap(m,n,o,er,kr); /*foos orig matrix!*/
}
/*now we don't allow last row to be swapped to fix other rows.
this may fubar small arrays but skip partial pivot for them
if its a problem.*/

for(i = 0; i<n-1; i++)
{
k = 0;
kr = i;
/*quit if 1, the max value after scale, is found as best row*/
if(fabs(m[i+i*o]) != 1) /*if row needs a swap*/
for(j = i+1; j<n-1; j++) /*look for good row*/
{
if(fabs(m[i+j*o]) == 1) /*found good row? */
{
m = rswap(m,n,o,i,j); /*swap good row*/
j = n+1; /*loop breaker*/
}
else /*if not 1 then find and use max remaining.*/
{
if(fabs(m[i+j*o]) > k)
{
k = fabs(m[i+j*o]);
kr = j;
}
if(j == (n-2))
{
m = rswap(m,n,o,i,kr);
}
}/*else*/
}/*for j*/
}
return(m);
}




control_type *invert(control_type *m, int n, int o)
{
double tmp,tmp2;
int i,j,k,dx;
control_type *new_m;
control_type *id;
control_type *result;
control_type *del;
const int o2 = o*2;/*strip out multiplies for speed*/
/*//////////////////////////////////////////////////*/
/*append identity*/
id = ident(n);
new_m = (control_type*)malloc(sizeof( control_type)*o2*n);
dx = 0;
for(i = 0; i<n; i++)
for(j = 0; j<o; j++)
{
new_m[j+i*o2] = m[dx];
new_m[j+i*o2+o] = id[dx];
dx++;
}
/*/////////////////////////////////////////////////////////
scale all rows to have max value of 1 (scaled partial pivoting)*/
del = new_m;
new_m = scale(new_m,n,o2); /*new_m gets a new mem location*/
free(del); /*kill old pointer*/
/*partial pivot to maximize diagonal */
new_m = pivotfix(new_m,n,o2);
/*//////////////////////////////////////////////////////////*/
/*row reduce to zeros below diagonal.*/

for(k = 0;k<n;k++) /*for each row*/
{
tmp = 1/ new_m[k*o2+k]; /*get scale*/
for(j = k; j<o2; j++)
{
new_m[k*o2+j] *= tmp; /*scale row*/
}
for(i = k+1; i < n; i++) /*for each row under row k*/
{
tmp2 = new_m[i*o2+k]; /*get scale */
for(j = k+1; j<o2; j++)
{
if(new_m[k*o2+j] != 0)
new_m[i*o2+j] -= new_m[k*o2+j]*tmp2;
/*zeroing small elements can help reduce error
but must choose cutoff carefully, or it goes bad*/
/*if(fabs(new_m[i*o2+j]) < 1.0e-15)
new_m[i*o2+j] = 0;
*/
}
}
}
/*///////////////////////////////////////////////////////////////*/
/*if not legal then no inverse (non-singular) (use pinv)*/
if(checklegal(new_m,n,o2) == 0)
return(NULL);
/*///////////////////////////////////////////////////////////////
back sub for identity on left, inverse on right.*/
for(k= o-1; k>0; k--)
for(i=(k-1);i >= 0; i--)/*each row b up*/
{
const double tmp = new_m[k+i*o2];
new_m[i*o2+k] -= tmp;/* *new_m[k*o2+k]; /* we know is = 1 */
for (j = o; j< o2; j++) /*each col -value*/
{
new_m[i*o2+j] -= tmp*new_m[k*o2+j];
/*if(fabs(new_m[i*o2+j]) < 1.0e-15)
new_m[i*o2+j] = 0;
*/
}
}
/*/////////////////////////////////////////////////////////////////
split off and return right side = inverse */
result = (control_type*)malloc(sizeof(control_type)*n*o);
for(i = 0; i<n; i++)
for(j = 0; j<n; j++)
result[i*o+j] = new_m[i*o2+j+o];
/*clean memory*/
free(id);
free(new_m);
return(result);
}




/////////////////////////////////

//example:

#include"invert.c"
int main()
{
control_type matrix [] =
{

1,2,3,
7,11,13,
23,67,1131

};


control_type * result;

result = invert(matrix,3,3);

display(result,3,3,10);

free(result);
}