//-------------------------------------------------------------------
//	matmult.cpp
//
//	This program uses separate threads to compute the entries
//	in a matrix-product.  It serves to illustrate certain key
//	principles of 'thread safe' multiprogramming: its thread-
//	routine is made 'reentrant' by never altering a storage-
//	location that any other thread writes (each thread stores
//	only into its own entry of the product), and synchronization
//	is achieved by 'blocking' the parent until every child-thread
//	has terminated.  (With large size matrices there could be
//	a speedup when executing on a machine with multiple CPUs,
//	though clearly no speed-advantages would be achieved on a 
//	single-processor system -- just extra system "overhead".)    
//
//	programmer: ALLAN CRUSE
//	written on: 17 OCT 2004
//-------------------------------------------------------------------

#include <stdio.h>	// for printf(), perror() 
#include <stdlib.h>	// for exit() 
#include <sched.h>	// for clone()
#include <time.h>	// for time()
#include <sys/wait.h>	// for wait()

// N is the order of the square matrices: every MATRIX below is N-by-N,
// and main() launches N * N children (one per entry of the product).
#define N 4
// FLAGS is the flags-argument that main() hands to clone().  Per clone(2):
// CLONE_VM makes the child run in the caller's memory space (so a value
// stored by dot() lands in main()'s own matrix), and SIGCHLD is the
// child's termination signal.
#define FLAGS ( SIGCHLD | CLONE_VM ) 


// MATRIX: an N-by-N array of int, indexed as [row][column].
typedef int MATRIX[ N ][ N ];
// INFO: the argument-record for one call to dot().  u and v point to the
// two factor matrices that dot() reads, w points to the matrix in which
// dot() stores its result, and (row, col) selects the single entry of w
// that the call computes.
typedef struct	{ MATRIX *u, *v, *w; int row, col; } INFO;


int dot( void *data )	// <--- This is the 'reentrant' thread-routine
{
	INFO	*info = (INFO*)data;
	MATRIX	*a = info->u;
	MATRIX	*b = info->v;
	MATRIX	*c = info->w;
	int	r = info->row;
	int	k = info->col;
	
	(*c)[r][k] = 0;
	for (int i = 0; i < N; i++) (*c)[r][k] += (*a)[r][i] * (*b)[i][k];
	return	0;
}

//	show_matrix() -- prints the upper-left r-by-k portion of 'm' on
//	standard output.  Each row is preceded by a newline, each entry
//	is shown in a 5-character field followed by one blank, and one
//	final newline ends the display.
void show_matrix( int r, int k, MATRIX &m )
{
	int	row = 0;
	while ( row < r )
		{
		const int	*entries = m[ row ];

		printf( "\n" );
		for (int col = 0; col < k; ++col)
			{
			printf( "%5d ", entries[ col ] );
			}
		++row;
		}
	printf( "\n" );
}

//	main() -- fills two N-by-N matrices with random entries, starts
//	one clone() child per entry of their product, waits for all of
//	the children, displays the three matrices, and finally rechecks
//	every entry of the product, reporting the number of mismatches.
//
//	Returns 0 on completion.  Exits with status 1 (after a perror()
//	message) if malloc(), clone() or wait() reports a failure.
int main( int argc, char **argv )
{
	(void)argc;	// the command-line is not used
	(void)argv;

	// initialize the matrices a and b (random entries from -9 to 9)
	MATRIX	a, b, c;
	srand( time( NULL ) );
	for (int i = 0; i < N; i++) for (int j = 0; j < N; j++)
		{
		a[ i ][ j ] = ( rand()%19 ) - 9;
		b[ i ][ j ] = ( rand()%19 ) - 9;
		}

	// initialize our array of info-structures
	INFO	info[ N ][ N ];
	for (int r = 0; r < N; r++) for (int k = 0; k < N; k++)
		{
		info[ r ][ k ].u = &a;
		info[ r ][ k ].v = &b;
		info[ r ][ k ].w = &c;
		info[ r ][ k ].row = r;
		info[ r ][ k ].col = k;
		}

	// allocate storage for the threads' stacks (one region apiece)
	enum	{ STACK_BYTES = 4096 };
	int	number_of_threads = N * N;
	char	*storage = (char*)malloc( (size_t)number_of_threads * STACK_BYTES );
	if ( storage == NULL ) { perror( "malloc" ); exit(1); }

	// execute a thread for each entry in the matrix-product
	int	launched = 0;		// children actually started
	int	clone_failed = 0;
	for (int r = 0; r < N && !clone_failed; r++)
		for (int k = 0; k < N; k++)
			{
			// Each child is handed the upper end of its own region
			// (clone(2) asks for the topmost address where stacks
			// grow downward).  The address is formed with char-
			// pointer arithmetic: routing it through an 'int'
			// would discard its upper bits wherever a pointer is
			// wider than an int.
			char	*tos = storage + (size_t)( launched + 1 ) * STACK_BYTES;
			if ( clone( dot, tos, FLAGS, &info[r][k] ) == -1 )
				{
				perror( "clone" );
				clone_failed = 1;
				break;
				}
			++launched;
			}

	// wait until all the started threads finish, then release their stacks
	for (int n = 0; n < launched; n++)
		if ( wait( NULL ) == -1 )
			{
			// a child could still be running on its stack here,
			// so the storage is deliberately not freed on this path
			perror( "wait" );
			exit(1);
			}
	free( storage );

	// with a thread missing, some entries of c were never computed
	if ( clone_failed ) exit(1);

	// display the computation results
	printf( "\nMatrix A:\n" );
	show_matrix( N, N, a );
	printf( "\nMatrix B:\n" );
	show_matrix( N, N, b );
	printf( "\nMatrix C:\n" );
	show_matrix( N, N, c );

	// verify the accuracy of our matrix result: subtracting every
	// product-term from an entry of c should leave exactly zero
	int	errors = 0;
	for (int r = 0; r < N; r++) for (int k = 0; k < N; k++)
		{
		int	residue = c[r][k];
		for (int j = 0; j < N; j++) residue -= a[r][j] * b[j][k];
		if ( residue != 0 ) ++errors;
		}
	printf( "\nNumber of errors = %d \n\n", errors );
	return	0;
}