lundi 22 juin 2015

OpenCL conditional statement issue

I've attempted to parallelize a set of conditional statements, but the output does not match existing implementation after the first loop which contains the kernel executes (mapI is an int array of 135, and on the 60th index of the second loop it fails, totaling 195 calls to mapI). I've checked that all arrays are passing to and from the kernel correctly by comparing them to the existing implementation and am baffled as to why this computation does not return the correct result, as it does for the first loop of the code. All OpenCL overhead functions return CL_SUCCESS.

cl_mem Q1cl, Q3cl;
Q1cl = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(double)*um->Npts*um->Nel, Q1, &err);
Q3cl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(double)*um->Npts*um->Nel, Q3, &err);
nxcl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(double)*um->Nel*um->Nfaces*um->Nfq, nx, &err);
nycl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(double)*um->Nel*um->Nfaces*um->Nfq, ny, &err);
mapIcl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*(um->Nfq+um->Ninflow), mapI, &err);
mapFcl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*(um->Nfq+um->Nfar), mapF, &err);
fluxQ1cl = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(double)*um->Nel*um->Nfaces*um->Nfq, *fluxQ1check, &err);

err = clSetKernelArg(kernel[7], 0, sizeof(cl_mem), (void*)&mapIcl);
err = clSetKernelArg(kernel[7], 1, sizeof(cl_mem), (void*)&nxcl);
err = clSetKernelArg(kernel[7], 2, sizeof(cl_mem), (void*)&nycl);
err = clSetKernelArg(kernel[7], 3, sizeof(cl_mem), (void*)&mapFcl);
err = clSetKernelArg(kernel[7], 4, sizeof(cl_mem), (void*)&Q1cl);
err = clSetKernelArg(kernel[7], 5, sizeof(cl_mem), (void*)&Q3cl);
err = clSetKernelArg(kernel[7], 6, sizeof(cl_mem), (void*)&fluxQ1cl);

globalWorkSize[0] = Ninflow;
globalWorkSize[1] = Nfar;

err = clEnqueueNDRangeKernel(queue[0], kernel[7], 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
err = clEnqueueReadBuffer(queue[0], fluxQ1cl, CL_TRUE, 0, sizeof(double)*um->Nel*um->Nfaces*um->Nfq, *fluxQ1check, 0, NULL, NULL);

Kernel Code:

__kernel void umBC(__global int* mapI,
               __global double* nx,
               __global double* ny,
               __global int* mapF,
               __global double* Q1,
               __global double* Q3,
               __global double* fluxQ1)
{

int id, idF;
double e[9][2] = { {0, 0},  {1, 0}, {0, 1}, {-1, 0}, {0, -1}, {1, 1}, {-1, 1}, {-1, -1}, {1, -1}};
double t_1 = 1. / 9.;
double uf = 0.;
double vf = 0.;

int globalx = get_global_id(0);
int globaly = get_global_id(1);
id = mapI[globalx];
fluxQ1[id] = ((e[1][0]*nx[id] + e[1][1]*ny[id]) < 0)*((Q1[id]-Q3[id] -2.*t_1*1.*(e[1][0]*uf+e[1][0]*vf)*3.) * (e[1][0]*nx[id] + e[1][1]*ny[id])) + 0.;

uf = 0.01;
vf = 0.;
idF = mapF[globaly];
fluxQ1[idF] = ((e[1][0]*nx[idF] + e[1][1]*ny[idF]) < 0)*((Q1[idF]-Q3[idF] -2.*t_1*1.*(e[1][0]*uf+e[1][0]*vf)*3.) * (e[1][0]*nx[idF] + e[1][1]*ny[idF])) + 0.;

}

As always, any help is greatly appreciated.

Aucun commentaire:

Enregistrer un commentaire