Best

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <math.h>
#include "evaluate.h"
#include <omp.h>

void initialize(double** A, double** B, double** C, int* n);
void* tune_matrix_multiply();
void matrix_multiply(double* A, double* B, double* C, int n, void* tunning_params);
void getUserID(char userID[20],char userName[30]);


int main(int argc, char *argv[]){
    srand(time(NULL));
    void* tunning_params = tune_matrix_multiply();

    int n;
    double *A, *B, *C, tolerance = pow(10,-7);
    char userID[20];
    char userName[30];
    getUserID(userID,userName);

    if (debug_run(argc, argv,userID,userName)) {  /* This is debug run for students to test their code. Do not edit this. */
        initialize(&A, &B, &C, &n);
        double etOpenMPst, etOpenMPsp;
        etOpenMPst = omp_get_wtime();
        matrix_multiply(A, B, C, n, tunning_params); /* Multiply B and C to store the results into A <- B * C */
        etOpenMPsp = omp_get_wtime();
        double total_flops = myCalcFlops(etOpenMPst, etOpenMPsp, n);
        printf("Flops : %lf\n",total_flops);

        int check = check_Correctness(A, B, C, n, tolerance);
        if(check) printf("Correct : true \n");
        else printf("Correct : false \n");
        free(A);
        free(B);
        free(C);

    } else{
        /* This is an evaluation run
         * and it will be used for evaluation of student answers
         * so dont wory about the code in the else block.
        */
        int sizes[4];

        find_matrix_sizes(argc, argv, sizes);

        double limit = 100.0, flops[4]= {0.0,0.0,0.0,0.0};
        int s, correctness[4] = {0,0,0,0};
        for(s=0;s<4;s++){
            int n0=sizes[s];
            //allocate memory of A, B, C
            A = (double *) malloc(n0*n0*sizeof(double));
            B = (double *) malloc(n0*n0*sizeof(double));
            C = (double *) malloc(n0*n0*sizeof(double));
            int j, i, k;

            for(j=0;j<5;j++){
                // Randomly initializing elements B & C from 0-limit
                /* B is a Lower Triangular Matrix and C is a n/16 banded Matrix */
                for(i=0;i<n0;i++){
                    for(k=0;k<n0;k++){
                        *(A + n0*i + k) = 0.0;

                        if (k>i) *(B + n0*i + k) = 0.0;
                        else *(B + n0*i + k) = (double)rand()/(double)(RAND_MAX/limit);

                        if (k>i+n0/16-1 || i>k+n0/16-1) *(C + n0*i + k) = 0.0;
                        else *(C + n0*i + k) = (double)rand()/(double)(RAND_MAX/limit);
                    }
                }

                if(j==0){
                    double etOpenMPst, etOpenMPsp;
                    etOpenMPst = omp_get_wtime();
                    matrix_multiply(A, B, C, n0, tunning_params);
                    etOpenMPsp = omp_get_wtime();
                }
                else{
                    double etOpenMPst, etOpenMPsp;
                    etOpenMPst = omp_get_wtime();
                    matrix_multiply(A, B, C, n0, tunning_params);
                    etOpenMPsp = omp_get_wtime();

                    flops[s] += myCalcFlops(etOpenMPst,etOpenMPsp, n0);

                    int check = check_Correctness(A, B, C, n0, tolerance);
                    if(check==1){
                        correctness[s]++;
                    }
                }
            }
            flops[s] = flops[s]/4.0;
            free(A);
            free(B);
            free(C);
        }
        double overall_flop_score = (flops[0] + 2 * flops[1] + 3 * flops[2] + 4 * flops[3])/10;
        int q,overall_correctness = 1;
        for(q=0;q<4;q++){
            if(correctness[q] != 4){
                overall_correctness = 0;
            }
        }
            printf("UserName:%s\tuserID:%s\n",userName,userID);
        printf("Size\tcorrectness\tflops\n");
        printf("small \t\t%d\t%lf\n",correctness[0],flops[0]);
        printf("medium \t\t%d\t%lf\n",correctness[1],flops[1]);
        printf("large \t\t%d\t%lf\n",correctness[2],flops[2]);
        printf("extralarge \t%d\t%lf\n",correctness[3],flops[3]);
        printf("\n");
        printf("Score = %lf\n",overall_correctness * overall_flop_score);

        FILE* outputfile=fopen("output.csv","w");
        fprintf(outputfile, "\"%s\",\"%s\",%d,%lf,%d,%lf,%d,%lf,%d,%lf,%lf",userID,userName,correctness[0],flops[0],correctness[1],flops[1],correctness[2],flops[2],correctness[3],flops[3],overall_correctness * overall_flop_score);
        fclose(outputfile);
        FILE* outputfile2=fopen("output2.csv","w");
        fprintf(outputfile, "\"%s\",%d,%lf,%d,%lf,%d,%lf,%d,%lf,%lf",userName,correctness[0],flops[0],correctness[1],flops[1],correctness[2],flops[2],correctness[3],flops[3],overall_correctness * overall_flop_score);
        fclose(outputfile2);
    }
    return 0;
}
//Add only code in the functions
void getUserID(char userID[20],char userName[30]){
    //Enter your moodle userID here i.e. your entry number
    strcpy(userID,"jca142468");
    //Enter your name here
    strcpy(userName,"Rohit Katariya");
}

void initialize(double** A, double** B, double** C, int* n){
    /*
    Initialize matrix A , B & C where each of them is a square matrix of size n*n
    Try 4 sizes of matrix:
    a) small - should fit in the L1 cache
    b) medium - should fit in the L2 cache
    b) large - should fit in the L3 cache
    b) extra-large - should fit in the memory
    */
    *n=1024;
    *A = (double *) malloc(*n**n*sizeof(double));
    *B = (double *) malloc(*n**n*sizeof(double));
    *C = (double *) malloc(*n**n*sizeof(double));
    int i, k;
    for(i=0;i<*n;i++){
        for(k=0;k<*n;k++){
            *(*A + *n*i + k) = 0.0;

            if(k>i) *(*B + *n*i + k) = 0.0;
            else *(*B + *n*i + k) = (double)rand()/(double)(RAND_MAX/100.0);

            if(k>i+*n/16-1 || i>k+*n/16-1) *(*C + *n*i + k) = 0.0;
            else *(*C + *n*i + k) = (double)rand()/(double)(RAND_MAX/100.0);
        }

    }
}
#include <assert.h>

void* tune_matrix_multiply(){
    /* Write code to test various methods to optimize the matrix multiplication
       and store these parameters in a data structure(for ex, array of values ). This
       data structure can be returned by this function and will be passed to the
       matrix multiply function so that it can efficiently multiply the matrices.
    */


}

#define MAXT(A,B) ({ __typeof__ (A) a = (A); __typeof__ (B) b = (B); a > b ? a : b; })
#define MINT(A,B) ({ __typeof__ (A) a = (A); __typeof__ (B) b = (B); a > b ? b : a; })

#define maX(A,B) ({ int a = (A); int b = (B); a > b ? a : b; })
#define miN(A,B) ({ int a = (A); int b = (B); a > b ? b : a; })
#define MAX(A,B) ({ ((A) > (B)) ? A : B; })
#define MIN(A,B) ({ ((A) > (B)) ? B : A; })

void matrix_multiply(double* A, double* B, double* C, int n, void* tunning_params){
    // Multiply matrix B & C and store the results in A

    int i,j,k,procsize,threadsize;
    // double *pA, *pB, *pC;
    // int chunksize;
    // chunksize = (n==256)?2:1;
    // omp_set_num_threads(2);
    // Change omp_proc_bind
    procsize = omp_get_num_procs();
    threadsize = omp_get_max_threads();

    if(n&32) {
        // #pragma omp for collapse(2) schedule(static, 2)
        // for(i=0;i<n;i++){
        //     for(j=0;j<n;j++){
        //         A[n*i+j] = 0;
        //         int lim1 = MAX(0,j-n/16+1);
        //         int lim2 = MIN(n, j+n/16);
        //         register double temp=0;
        //         for(k=lim1;k<lim2;k++){
        //             temp += B[n*i+k]*C[n*k+j];
        //         }
        //         A[n*i+j] = temp;
        //     }
        // }
        for(i = 0 ; i < n ; i++){
            int l, p ;
            for(j = 0 , l = MIN(i+2,n) , p = i<<5 ; j < i + 1 ; j++){
                A[p+j] += B[p + j] * C[(j<<5) + j];
                if(i && j < i)  A[p + j] += B[p + j + 1] * C[((j+1)<<5) + j];
                if(j + 1 && j + 1 < l)  A[p + j + 1] += B[p + j]*C[(j<<5) + j + 1];
            }
        }
        return;
        // for(i=0;i<n;i++){
        //     double *pA = A + (n*i);
        //     for(k=0;k< (i+1);k++){
        //         double *pC = C + n*k;
        //         int lim2 = MIN(n, k+(n>>4));
        //         int lim1 = MAX(0,k-(n>>4)+1);
        //         // register double temp = B[n*i+k];
        //         for(j=lim1;j<lim2;j++){
        //             *(pA+j) += B[n*i+k]**(pC+j);
        //         }
        //     }
        // }
        // return;
    }

    if(n&64) {
        int l , q , p;
        for(i = 0 ; i < n ; i++){
            for(j = 0 , l = MIN(i+2,n) ,q = MIN(i+3,n) , p = i<<6 ; j < i + 1 ; j++){
                A[p+j] += B[p + j] * C[(j<<6) + j];
                if(i && j < i)  A[p + j] += B[p + j + 1] * C[((j+1)<<6) + j];
                if(i-1 && j < i-1)  A[p + j] += B[p + j + 2] * C[((j+2)<<6) + j];
                if(i-2 && j < i-2)  A[p + j] += B[p + j + 3] * C[((j+3)<<6) + j];
                if(j + 1 && j + 1 < l)  A[p + j + 1] += B[p + j]*C[(j<<6) + j + 1];
                if(j + 2 && j + 2 < q)  A[p + j + 2] += B[p + j]*C[(j<<6) + j + 2];
                if(j + 3 && j + 3 < MIN(i+4,n))  A[p + j + 3] += B[p + j]*C[(j<<6) + j + 3];
            }
        }
        return;
        // #pragma omp parallel for num_threads(2) default(none) private(k,j) shared(n,A,B,C)
        // for(i=0;i<n;i++){
        //     double *pA = A + (n*i);
        //     for(k=0;k< (i+1);k++){
        //         double *pC = C + n*k;
        //         int lim2 = MIN(n, k+(n>>4));
        //         int lim1 = MAX(0,k-(n>>4)+1);
        //         // register double temp = B[n*i+k];
        //         for(j=lim1;j<lim2;j++){
        //             *(pA+j) += B[n*i+k]**(pC+j);
        //         }
        //     }
        // }
        // return;
    }

        // #pragma omp for schedule(guided)
        // for(i=0;i<n;i++){
        //     for(j=i+1;j<n;j++){
        //         double temp=B[n*i+j];
        //         B[n*i+j] = B[n*j+i];
        //         B[n*j+i] = temp;
        //     }
        // }


    if(n&256){
        // #pragma omp parallel default(none) private(i,k,j) shared(n,A,B,C)
        // #pragma omp for schedule(dynamic, 4)
        #pragma omp parallel for default(none) schedule(dynamic,4) private(k,j) shared(n,A,B,C) num_threads(7)
        for(i=n-1 ; i>=0 ; i--){
            double *pA = A+(n*i);
            for(k=0;k<(i+1);k++){
                double *pC = C+n*k;
                // double *pB = B+n*i+k
                int lim2 = MIN(n, k+(n>>4));
                int lim1 = MAX(0,k-(n>>4)+1);
                register double temp = B[n*i+k];
                for(j=lim1;j<lim2;j++){
                    *(pA+j) += temp**(pC+j);
                }
            }
        }
        return;
    }


            // double *pA = A+(n*i);
            // double *pB = B+(n*i);

        #pragma omp parallel for default(none) schedule(dynamic) private(i,k,j) shared(n,A,B,C) num_threads(procsize+1)
        for(i=0; i<n; i++){
            double *pA = A+(n*i);
            for(k=0;k<(i+1);k++){
                double *pC = C+n*k;
                int lim2 = MIN(n, k+(n>>4));
                int lim1 = MAX(0, k-(n>>4)+1);
                register double temp=B[n*i+k];
                for(j=lim1;j<lim2;j++){
                    *(pA+j) += temp**(pC+j);
                }
            }
            // int ii = i+n/2;
            // pA = A + (n*(ii));
            // for(k=0;k<(ii+1);k++){
            //     double *pC = C+n*k;
            //     int lim2 = MIN(n, k+(n>>4));
            //     int lim1 = MAX(0, k-(n>>4)+1);
            //     register double temp=B[n*ii+k];
            //     for(j=lim1;j<lim2;j++){
            //         *(pA+j) += temp**(pC+j);
            //     }
            // }
        }
        return;


    // Basic Parallel
    // int i=0,j=0,k=0;
    // // int temp=0;
    // // printf("Number of procs %d\n",omp_get_num_procs());
    // // omp_set_num_threads(omp_get_num_procs());
    // #pragma omp parallel for private(j)
    //     for(i=0;i<n;i++){
    //         for(k=0;k<n;k++){
    //             for(j=0;j<n;j++){
    //                 A[n*i+j] += B[n*i+k]*C[n*k+j];
    //             }
    //         }
    //     }


/*

Version 2 - 1.65 Gflops
    int i=0,j=0,k=0;
    int chunk=10;
    // int temp=0;
    // printf("Number of procs %d\n",omp_get_num_procs());
    // omp_set_num_threads(omp_get_num_procs());
    #pragma omp parallel default(none) shared(A,B,C,n,chunk) private(i,j,k)
    {
        #pragma omp for collapse(2)
        for(i=0;i<n;i++){
            for(j=0;j<n;j++){
                // A[n*i+j] = 0;
                double temp=0;
                for(k=0;k<n;k++){
                    temp += B[n*i+k]*C[n*k+j];
                }
                A[n*i+j]=temp;
            }
        }
    }
*/
}