// Matrix_Multiplication_Using_CUDA_With_Dynamically_Allocated_Matrix

/*
    Matrix multiplication using dynamically allocated matrices - using the GPU

    matrixA m*n
    matrixB o*p

    Method
        1. Allocate memory for the matrices a, b, and c in CUDA
        2. Copy a and b to CUDA memory
        3. Set up the launch parameters
        4. Call the kernel with the parameters and thread configuration
        5. Define the kernel
        6. Copy back the result to CPU memory
        7. Write the result to file


*/

#include<stdio.h>
#include<stdlib.h>
#include<time.h>
/*
    Allocates an m x n matrix of long int on the host.

    Layout: an array of m row pointers, each pointing at its own
    separately malloc'ed array of n elements. Rows are therefore NOT
    contiguous with one another, so the returned pointer must not be
    handed to a bulk copy routine as if it were a flat m*n buffer.

    m, n    : number of rows / columns
    returns : the row-pointer array; element values are uninitialised

    On allocation failure a message is written to stderr and the process
    terminates with a failure status. The caller releases the matrix by
    freeing every row and then the pointer array.
*/
long int ** alloc_matrix(long int m,long int n)
{
    long int **matrix1=NULL;
    long int i;

    matrix1=(long int **)malloc((size_t)m*sizeof(long int *));
    if(matrix1==NULL)
    {
        fprintf(stderr,"\nError in allocating memory");
        exit(EXIT_FAILURE);//a failed allocation must not report success
    }

    for(i=0;i<m;i++)
    {
        matrix1[i]=(long int *)malloc((size_t)n*sizeof(long int));
        if(matrix1[i]==NULL)
        {
            fprintf(stderr,"\nError in allocating memory");
            exit(EXIT_FAILURE);
        }
    }
    return (matrix1);
}//End of matrix allocation function

/*
    Reads an x-by-y matrix from a text stream.

    fp      : stream positioned at the first element; elements are
              integers separated by whitespace, consumed in row-major order
    matrix  : row-pointer matrix with at least x rows of y elements each
    x, y    : number of rows / columns to read

    Every fscanf result is checked: if the stream ends early or holds
    something that is not an integer, a message naming the offending
    position is written to stderr and the process terminates with a
    failure status instead of continuing with uninitialised elements.
*/
void read_matrix(FILE *fp,long int **matrix,long int x,long int y)
{
    long int i,j;

    for(i=0;i<x;i++)
    {
        for(j=0;j<y;j++)
        {
            //fscanf returns the number of converted items; anything but 1 is a failure
            if(fscanf(fp,"%ld",&matrix[i][j])!=1)
            {
                fprintf(stderr,"\nError in reading matrix element at row %ld, column %ld\n",i,j);
                exit(EXIT_FAILURE);
            }
        }
    }
}//End of read_matrix function

/*
    Writes an x-by-y row-pointer matrix to standard output.
    Each element is followed by a tab, each row by a newline.
*/
void print_matrix(long int **matrix,long int x,long int y)
{
    long int row=0;
    while(row<x)
    {
        long int col;
        for(col=0;col<y;++col)
            printf("%ld\t",matrix[row][col]);
        fputs("\n",stdout);//terminate the row
        ++row;
    }
}//End of print function


/*
    Writes an x-by-y row-pointer matrix to the stream fp.
    Each element is followed by a single space, each row by a newline.
*/
void file_print(FILE *fp,long int **matrix,long int x,long int y)
{
    long int row=0;
    while(row<x)
    {
        long int col;
        for(col=0;col<y;++col)
            fprintf(fp,"%ld ",matrix[row][col]);
        fputs("\n",fp);//terminate the row
        ++row;
    }
}//End of file print function

//Multiplies the two matrices and produce the result
/*void matrix_multiply(long int **matrix1,long int **matrix2,long int **matrixR,long int m,long int n,long int p)
{
    long int i,j,k;

    for(i=0;i<m;i++)
    {
        for(j=0;j<p;j++)
        {
            matrixR[i][j]=0;
            for(k=0;k<n;k++)
            {
                matrixR[i][j]+=matrix1[i][k]*matrix2[k][j];
            }
        }
    }


}//End of matrix multiplication function*/

 //CUDA kernel for multiplication

/*
    Computes C = A * B for square dim x dim matrices, one output element
    per thread.

    DmatrixA, DmatrixB : device pointers to the inputs, flat row-major,
                         dim*dim elements each
    DmatrixC           : device pointer to the output, flat row-major,
                         dim*dim elements
    Dm                 : device pointer to a single long int holding dim

    Launch layout: any 2D grid/block combination whose total extent is at
    least dim threads in x and dim threads in y. The row index comes from
    the global x coordinate and the column index from the global y
    coordinate, so a single block of dim x dim threads and a dim x dim
    grid of one-thread blocks both work. Threads that fall outside the
    matrix return without touching memory. No shared memory is used.

    Precondition: every matrix uses dim as its row length, i.e. the
    matrices must be square.
*/
__global__ void matrix_parallel_mult(int long *DmatrixA, int long *DmatrixB,int long *DmatrixC,int long *Dm)
{
    const long int dim=*Dm;

    //Global coordinates: block offset plus position inside the block
    const long int i=(long int)blockIdx.x*blockDim.x+threadIdx.x;
    const long int j=(long int)blockIdx.y*blockDim.y+threadIdx.y;

    //Surplus threads of an oversized launch must not read or write out of bounds
    if(i>=dim||j>=dim)
    {
        return;
    }

    long int product=0;
    for(long int k=0;k<dim;k++)
    {
        product+=DmatrixA[i*dim+k]*DmatrixB[k*dim+j];
    }
    DmatrixC[i*dim+j]=product;
}
//Reports a failed CUDA runtime call on stderr and terminates the program
static void check_cuda(cudaError_t error,const char *what)
{
    if(error!=cudaSuccess)
    {
        fprintf(stderr,"\n%s ::%s\n",what,cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
}

//Allocates an uninitialised contiguous host buffer of rows*cols elements
static long int *alloc_flat(long int rows,long int cols)
{
    long int *flat=(long int *)malloc((size_t)rows*(size_t)cols*sizeof(long int));
    if(flat==NULL)
    {
        fprintf(stderr,"\nError in allocating memory");
        exit(EXIT_FAILURE);
    }
    return flat;
}

//Packs a row-pointer matrix into a newly allocated contiguous row-major buffer
static long int *flatten_matrix(long int **matrix,long int rows,long int cols)
{
    long int r,c;
    long int *flat=alloc_flat(rows,cols);
    for(r=0;r<rows;r++)
    {
        for(c=0;c<cols;c++)
        {
            flat[r*cols+c]=matrix[r][c];
        }
    }
    return flat;
}

//Scatters a contiguous row-major buffer back into a row-pointer matrix
static void unflatten_matrix(const long int *flat,long int **matrix,long int rows,long int cols)
{
    long int r,c;
    for(r=0;r<rows;r++)
    {
        for(c=0;c<cols;c++)
        {
            matrix[r][c]=flat[r*cols+c];
        }
    }
}

//Releases every row of a row-pointer matrix and then the pointer array itself
static void free_matrix(long int **matrix,long int rows)
{
    long int r;
    if(matrix==NULL)
    {
        return;
    }
    for(r=0;r<rows;r++)
    {
        free(matrix[r]);
    }
    free(matrix);
}

//Main Function
/*
    Usage: ./executable m n o p Output_file.txt Matrix1.txt Matrix2.txt

    Reads matrixA (m x n) and matrixB (o x p) from the two input files,
    multiplies them on the GPU, prints the operands and the product to the
    console and writes the product to the output file.

    The host matrices are arrays of separately allocated rows, whereas the
    device works on flat row-major arrays, so every transfer goes through
    a contiguous staging buffer.

    The kernel receives a single dimension and uses it as the row length
    of all three matrices, so only square inputs (m = n = o = p) are
    accepted.

    Returns 0 on success; every failure prints a diagnostic to stderr and
    terminates with a failure status.
*/
int main(int argc,char *argv[])
{
    long int m,n,o,p;
    long int **matrixA=NULL,**matrixB=NULL,**matrixC=NULL;
    long int *flatA=NULL,*flatB=NULL,*flatC=NULL;
    long int *DmatrixA=NULL,*DmatrixB=NULL,*DmatrixC=NULL,*Dm=NULL;
    size_t bytesA,bytesB,bytesC;

    FILE *fp,*fp1,*fp2;
    if(argc!=8)
    {
        fprintf(stderr,"\nThe format is:./executable  Dim._of_A( m n)   Dim._of_B(o p)  Output_file.txt Martix1.txt Matrix2.txt\n");
        exit(EXIT_FAILURE);
    }
    m=atol(argv[1]);
    n=atol(argv[2]);
    o=atol(argv[3]);
    p=atol(argv[4]);

    //Validate the dimensions before touching any file or allocating anything
    if(m<=0||n<=0||o<=0||p<=0)
    {
        fprintf(stderr,"\nMatrix dimensions must be positive integers\n");
        exit(EXIT_FAILURE);
    }
    if(n!=o)
    {
        fprintf(stderr,"\nNo. of columns in matrixA and number of rows in matrixB are not equal");
        exit(EXIT_FAILURE);
    }
    if(m!=n||n!=p)
    {
        //The kernel is given only m and uses it as the row length of A, B and C
        fprintf(stderr,"\nOnly square matrices (m = n = o = p) are supported by the GPU kernel\n");
        exit(EXIT_FAILURE);
    }

    //Inputs are only read, so read-only access is sufficient; the output is
    //opened last so that it is not truncated when an input cannot be opened
    fp1=fopen(argv[6],"r");
    fp2=fopen(argv[7],"r");
    if(fp1==NULL||fp2==NULL)
    {
        fprintf(stderr,"\nError in opening the file:");
        exit(EXIT_FAILURE);
    }
    fp=fopen(argv[5],"w+");
    if(fp==NULL)
    {
        fprintf(stderr,"\nError in opening the file:");
        exit(EXIT_FAILURE);
    }

    matrixA=alloc_matrix(m,n);
    matrixB=alloc_matrix(o,p);
    matrixC=alloc_matrix(m,p);//Allocating space for the product matrix

    read_matrix(fp1,matrixA,m,n);// Read the matrixA
    read_matrix(fp2,matrixB,o,p);//Read the matrixB

    print_matrix(matrixA,m,n);
    print_matrix(matrixB,o,p);

    bytesA=(size_t)m*(size_t)n*sizeof(long int);
    bytesB=(size_t)o*(size_t)p*sizeof(long int);
    bytesC=(size_t)m*(size_t)p*sizeof(long int);

    //Allocating space in gpu
    check_cuda(cudaMalloc((void **)&DmatrixA,bytesA),"MatrixA");
    check_cuda(cudaMalloc((void **)&DmatrixB,bytesB),"MatrixB");
    check_cuda(cudaMalloc((void **)&DmatrixC,bytesC),"MatrixC");
    check_cuda(cudaMalloc((void **)&Dm,sizeof(long int)),"m");

    //Copying essential data needed during kernel invocation.
    //The host rows are gathered into contiguous buffers first: the
    //row-pointer arrays themselves hold addresses, not matrix elements.
    flatA=flatten_matrix(matrixA,m,n);
    flatB=flatten_matrix(matrixB,o,p);
    check_cuda(cudaMemcpy(DmatrixA,flatA,bytesA,cudaMemcpyHostToDevice),"MatrixA");
    check_cuda(cudaMemcpy(DmatrixB,flatB,bytesB,cudaMemcpyHostToDevice),"MatrixB");
    check_cuda(cudaMemcpy(Dm,&m,sizeof(long int),cudaMemcpyHostToDevice),"m");

    //Setting block and grid configuration.
    //One single-thread block per output element: block (x,y) produces C[x][y].
    //This mapping is valid whether the kernel locates its element from the
    //block index alone or from the full global thread index, and it is not
    //limited by the maximum number of threads per block.
    dim3 Grid((unsigned int)m,(unsigned int)p);
    dim3 Block(1,1);

    //Launching kernel with appropriate parameters
    matrix_parallel_mult<<<Grid,Block>>>(DmatrixA,DmatrixB,DmatrixC,Dm);
    check_cuda(cudaGetLastError(),"Kernel launch");//invalid launch configuration
    check_cuda(cudaDeviceSynchronize(),"Kernel execution");//faults raised while running

    //Copying back the result into a staging buffer, then into the host rows
    flatC=alloc_flat(m,p);
    check_cuda(cudaMemcpy(flatC,DmatrixC,bytesC,cudaMemcpyDeviceToHost),"MatrixC");
    unflatten_matrix(flatC,matrixC,m,p);

    printf("\nThe productmatrix is:\n");//Print the result of multiplication to the console
    print_matrix(matrixC,m,p);

    file_print(fp,matrixC,m,p);//Write the product to the file specified in argv[5]  Result file

    //Deallocation in CPU
    free(flatA);
    free(flatB);
    free(flatC);
    free_matrix(matrixA,m);
    free_matrix(matrixB,o);
    free_matrix(matrixC,m);
    //Deallocation in GPU
    check_cuda(cudaFree(DmatrixA),"MatrixA");
    check_cuda(cudaFree(DmatrixB),"MatrixB");
    check_cuda(cudaFree(DmatrixC),"MatrixC");
    check_cuda(cudaFree(Dm),"m");
    //Closing all files
    fclose(fp1);
    fclose(fp2);
    if(fclose(fp)!=0)
    {
        //a failed close of the output means buffered results were not written
        fprintf(stderr,"\nError in writing the output file\n");
        exit(EXIT_FAILURE);
    }
return 0;
}//End of main function