// Matrix_Mult: CUDA program that multiplies two matrices read from text files (C = A * B).


#include<stdio.h>
#include<stdlib.h>


/*
 * matrix_mult - computes C = A * B for square m x m matrices of long int.
 *
 * dA, dB  input matrices, m*m elements each; element (r, c) is at index r*m + c.
 * dC      output matrix, m*m elements, same layout; every element is overwritten.
 * m       number of rows and columns of all three matrices.
 *
 * Launch layout: a 2D grid of 2D blocks. The x dimension walks the rows of C
 * and the y dimension walks its columns. Each thread starts at its own global
 * (x, y) index and then advances by the total number of threads launched in
 * that dimension, so:
 *   - threads whose starting index is >= m do nothing (no out-of-bounds access);
 *   - a launch with fewer threads than elements still fills the whole matrix.
 * No shared memory is used.
 */
__global__ void matrix_mult(long int *dA,long int *dB,long int *dC,long int m)
{
    /* Total number of threads launched along each dimension. */
    const long int rowStep=(long int)blockDim.x*gridDim.x;
    const long int colStep=(long int)blockDim.y*gridDim.y;

    /* Global position of this thread inside the launch. */
    const long int rowStart=(long int)blockIdx.x*blockDim.x+threadIdx.x;
    const long int colStart=(long int)blockIdx.y*blockDim.y+threadIdx.y;

    for(long int i=rowStart;i<m;i+=rowStep)
    {
        for(long int j=colStart;j<m;j+=colStep)
        {
            /* Dot product of row i of A with column j of B. */
            long int product=0;
            for(long int k=0;k<m;k++)
            {
                product+=dA[i*m+k]*dB[k*m+j];
            }
            dC[i*m+j]=product;
        }
    }
}


/*
 * read_matrix_from_file - reads an x-by-y matrix of integers from a text stream.
 *
 * fp      open stream; values are whitespace-separated and parsed with "%ld".
 * matrix  destination buffer with room for at least x*y elements. It is filled
 *         row by row: element (i, j) is stored at index i*y + j.
 * x       number of rows.
 * y       number of columns.
 *
 * If the stream ends early or contains a token that is not an integer, one
 * diagnostic is written to stderr, reading stops, and every remaining element
 * is set to 0 so the buffer never holds uninitialised values.
 */
void read_matrix_from_file(FILE *fp,long int *matrix,long int x,long int y)
{
    long int i,j;
    int exhausted=0; /* set once fscanf has failed; no further reads are attempted */

    for(i=0;i<x;i++)
    {
        for(j=0;j<y;j++)
        {
            /* The row stride is the column count y, not the row count x. */
            long int *cell=&matrix[i*y+j];

            if(!exhausted&&fscanf(fp,"%ld",cell)!=1)
            {
                fprintf(stderr,"Error: expected %ld x %ld values, input ended or was malformed at row %ld, column %ld\n",x,y,i,j);
                exhausted=1;
            }
            if(exhausted)
            {
                *cell=0;
            }
        }
    }
}


/*
 * print_matrix_file - writes an x-by-y matrix to a text stream.
 *
 * fp      open, writable stream.
 * matrix  source buffer of x*y elements; element (i, j) is read from index i*y + j.
 * x       number of rows.
 * y       number of columns.
 *
 * Each row is written on its own line; every value is followed by one space.
 */
void print_matrix_file(FILE *fp,long int *matrix,long int x,long int y)
{
    long int i,j;
    for(i=0;i<x;i++)
    {
        /* The row stride is the column count y, not the row count x. */
        const long int *row=matrix+i*y;
        for(j=0;j<y;j++)
        {
            fprintf(fp,"%ld ",row[j]);
        }
        fprintf(fp,"\n");
    }
}
/*
 * print_matrix - writes an x-by-y matrix to standard output.
 *
 * matrix  source buffer of x*y elements; element (i, j) is read from index i*y + j.
 * x       number of rows.
 * y       number of columns.
 *
 * Each row is printed on its own line; every value is followed by one space.
 */
void print_matrix(long int *matrix,long int x,long int y)
{
    long int i,j;
    for(i=0;i<x;i++)
    {
        /* The row stride is the column count y, not the row count x. */
        const long int *row=matrix+i*y;
        for(j=0;j<y;j++)
        {
            printf("%ld ",row[j]);
        }
        printf("\n");
    }
}


/*
 * check_cuda - terminates the program if a CUDA runtime call did not succeed.
 *
 * status   value returned by the CUDA runtime call.
 * message  short description of the step, printed before the CUDA error text.
 */
static void check_cuda(cudaError_t status,const char *message)
{
    if(status!=cudaSuccess)
    {
        fprintf(stderr,"%s: %s\n",message,cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * main - multiplies two matrices stored in text files on the GPU.
 *
 * Usage: ./a.out matrixfile1 m n matrixfile2 o p outputfile.txt
 *   matrixfile1  text file holding matrix A (m rows, n columns)
 *   matrixfile2  text file holding matrix B (o rows, p columns)
 *   outputfile   receives the product C = A * B
 *
 * The kernel receives a single dimension m and indexes A, B and C with stride
 * m, so only square matrices of one common size are accepted
 * (m = n = o = p > 0).
 *
 * Returns 0 on success. Any failure prints a diagnostic to stderr and ends the
 * process with EXIT_FAILURE; resources still held at that point are released
 * by process termination.
 */
int main(int argc,char *argv[])
{
    FILE *fp1=NULL,*fp2=NULL,*fp3=NULL;
    long int m,n,o,p;
    float elapsedTime=0.0f;
    long int *dA=NULL,*dB=NULL,*dC=NULL;
    long int *matrixA=NULL,*matrixB=NULL,*matrixC=NULL;
    size_t bytes;
    const long int tile=16; /* threads per block edge: 16*16 = 256 threads per block */
    long int blocks;
    cudaEvent_t start,stop; /* used to compute the elapsed kernel time */

    if(argc!=8)
    {
        fprintf(stderr,"\n(8 Parameters)./a.out matrixfile1 m n matrixfile2 o p outputfile.txt");
        exit(EXIT_FAILURE);
    }

    /* Validate the dimensions before touching any file so that a bad command
       line never truncates an existing output file. */
    m=atol(argv[2]);
    n=atol(argv[3]);
    o=atol(argv[5]);
    p=atol(argv[6]);
    if(m<=0||n!=m||o!=m||p!=m)
    {
        fprintf(stderr,"Error: only square matrices of one common size are supported (m = n = o = p > 0)\n");
        exit(EXIT_FAILURE);
    }
    /* Reject sizes whose byte count would overflow size_t. */
    if((size_t)m>((size_t)-1)/sizeof(long int)/(size_t)m)
    {
        fprintf(stderr,"Error: matrix dimension %ld is too large\n",m);
        exit(EXIT_FAILURE);
    }
    bytes=(size_t)m*(size_t)m*sizeof(long int);

    /* The inputs are only read, so they are opened read-only. */
    fp1=fopen(argv[1],"r");
    fp2=fopen(argv[4],"r");
    fp3=fopen(argv[7],"w+");
    if(fp1==NULL||fp2==NULL||fp3==NULL)
    {
        fprintf(stderr,"\nError in opening the file");
        exit(EXIT_FAILURE);
    }

    matrixA=(long int *)malloc(bytes);
    matrixB=(long int *)malloc(bytes);
    matrixC=(long int *)malloc(bytes);
    if(matrixA==NULL||matrixB==NULL||matrixC==NULL)
    {
        fprintf(stderr,"Error Memory allocation for matrix in host\n");
        exit(EXIT_FAILURE);
    }

    /* Read both operands from their files; the inputs are not needed afterwards. */
    read_matrix_from_file(fp1,matrixA,m,n);
    read_matrix_from_file(fp2,matrixB,o,p);
    fclose(fp1);
    fclose(fp2);

    /* Device buffers for A, B and C. */
    check_cuda(cudaMalloc((void**)&dA,bytes),"Error in allocation of matrix A on device");
    check_cuda(cudaMalloc((void**)&dB,bytes),"Error in allocation of matrix B on device");
    check_cuda(cudaMalloc((void**)&dC,bytes),"Error in allocation of matrix C on device");

    /* Copy the operands from host memory to device memory. */
    check_cuda(cudaMemcpy(dA,matrixA,bytes,cudaMemcpyHostToDevice),"Error in copying matrix A from host to device");
    check_cuda(cudaMemcpy(dB,matrixB,bytes,cudaMemcpyHostToDevice),"Error in copying matrix B from host to device");

    /* Size the grid by ceiling division so every element of C has a thread.
       NOTE(review): the grid is rounded up, so the kernel must ignore threads
       whose row or column index is >= m - confirm matrix_mult does so. */
    blocks=(m+tile-1)/tile;
    dim3 dimBlock((unsigned int)tile,(unsigned int)tile);
    dim3 dimGrid((unsigned int)blocks,(unsigned int)blocks);

    check_cuda(cudaEventCreate(&start),"Error in creating the start event");
    check_cuda(cudaEventCreate(&stop),"Error in creating the stop event");
    check_cuda(cudaEventRecord(start,0),"Error in recording the start event");
    matrix_mult<<<dimGrid,dimBlock>>>(dA,dB,dC,m);
    /* A launch returns no status itself; an invalid configuration is reported here. */
    check_cuda(cudaGetLastError(),"Error in launching the kernel");
    check_cuda(cudaEventRecord(stop,0),"Error in recording the stop event");
    /* Blocks until the stop event has been reached, i.e. the kernel has finished. */
    check_cuda(cudaEventSynchronize(stop),"Error in executing the kernel");
    check_cuda(cudaEventElapsedTime(&elapsedTime,start,stop),"Error in reading the elapsed time");
    check_cuda(cudaEventDestroy(start),"Error in destroying the start event");
    check_cuda(cudaEventDestroy(stop),"Error in destroying the stop event");
    /* cudaEventElapsedTime reports milliseconds; print seconds. */
    printf("\n\nElapsed Time for computation on GPU(Seconds)=%lf\n",(double)(elapsedTime/1000));

    check_cuda(cudaMemcpy(matrixC,dC,bytes,cudaMemcpyDeviceToHost),"Error in copying data from device to host");
    check_cuda(cudaFree(dA),"Error in freeing matrix A on device");
    check_cuda(cudaFree(dB),"Error in freeing matrix B on device");
    check_cuda(cudaFree(dC),"Error in freeing matrix C on device");

    printf("\n\nThe Product matrix is:(C=A*B):\n");
    print_matrix_file(fp3,matrixC,m,p);

    free(matrixA);
    free(matrixB);
    free(matrixC);

    /* fclose flushes buffered output; a failure here means the result was not fully written. */
    if(fclose(fp3)!=0)
    {
        fprintf(stderr,"Error in writing the output file\n");
        exit(EXIT_FAILURE);
    }

    return 0;
}