diff --git a/CMakeLists.txt b/CMakeLists.txt
index 895c91f..dfe6881 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -87,6 +87,7 @@ set(dart_src
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/obsToMod.h
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/modToObs.h
+    ${PROJECT_SOURCE_DIR}/src/optimization/kernels/kinematics.h
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/intersection.h
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/raycast.h
     ${PROJECT_SOURCE_DIR}/src/optimization/contact_prior.cpp
@@ -152,6 +153,7 @@ set(gpu_specific_src
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/kernel_common.h
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/obsToMod.cu
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/modToObs.cu
+    ${PROJECT_SOURCE_DIR}/src/optimization/kernels/kinematics.cu
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/intersection.cu
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/raycast.cu
diff --git a/src/model/mirrored_model.h b/src/model/mirrored_model.h
index 1689fab..8995ceb 100644
--- a/src/model/mirrored_model.h
+++ b/src/model/mirrored_model.h
@@ -86,6 +86,13 @@ class MirroredModel : public Model {
     inline const SE3 getTransformJointAxisToParent(const int joint) const { return _T_pf->hostPtr()[joint]; }
 
+    // Accessors for the kinematics code.
+    // Note: the `const void` return types of the original sync accessors were
+    // meaningless (a cv-qualifier on void) and have been dropped.
+    inline const SE3 *getTransformsParentJointToFrame() const { return _T_pf->hostPtr(); }
+    inline const SE3 *getDeviceTransformsParentJointToFrame() const { return _T_pf->devicePtr(); }
+    inline void syncKinematics() { _T_mf->syncDeviceToHost(); }
+    inline void syncKinematicsHostToDevice() { _T_mf->syncHostToDevice(); }
+    inline SE3 *getTransformsFrameToModel() { return _T_mf->hostPtr(); }
+
 private:
 
     uint _modelID;
diff --git a/src/model/model.h b/src/model/model.h
index a5647ee..b48af53 100644
--- a/src/model/model.h
+++ b/src/model/model.h
@@ -99,6 +99,7 @@ class Model {
     inline float getJointMin(const int joint) const { return _jointLimits[joint].x; }
     inline float getJointMax(const int joint) const { return _jointLimits[joint].y; }
+    inline const std::vector<float2> &getJointLimits() const { return _jointLimits; }
 
     const std::string & getJointName(const int joint) const { return _jointNames[joint]; }
 
     void renderSdf(const dart::Grid3D<float> & sdf, float levelSet) const;
diff --git a/src/optimization/kernels/intersection.cu b/src/optimization/kernels/intersection.cu
index 287ef64..14aa40c 100644
--- a/src/optimization/kernels/intersection.cu
+++ b/src/optimization/kernels/intersection.cu
@@ -144,6 +144,84 @@ __global__ void gpu_normEqnsSelfIntersection(const float4 * testSites,
 
 }
 
+// Computes, for every source-model test site and every joint configuration,
+// the distance to the nearest destination-model SDF.
+// Launch layout: x indexes test sites, y indexes joint configurations.
+// dbgErr selects compile-time whether per-site debug output is written.
+template <bool dbgErr>
+__global__ void gpu_normEquationsIntersectionRaw(const float4 * testSites,
+                                                 const int nSites,
+                                                 const SE3 T_ds,
+                                                 const SE3 T_sd,
+                                                 const SE3 * T_mfs_src,
+                                                 const SE3 * T_fms_src,
+                                                 const int nFramesSrc,
+                                                 const int * sdfFrames_src,
+                                                 const SE3 * T_mfs_dst,
+                                                 const SE3 * T_fms_dst,
+                                                 const int nFramesDst,
+                                                 const int nConfigs,
+                                                 const int * sdfFrames_dst,
+                                                 const Grid3D<float> * sdfs_dst,
+                                                 const int nSdfs_dst,
+                                                 float * result,
+                                                 int * resultSdf,
+                                                 bool vary_dst,
+                                                 float * debugError) {
+
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    const int configIdx = blockIdx.y*blockDim.y + threadIdx.y;
+    if (configIdx >= nConfigs) { return; }
+
+    // advance the transform arrays to the block owned by this configuration
+    T_mfs_src = T_mfs_src + configIdx * nFramesSrc;
+    T_fms_src = T_fms_src + configIdx * nFramesSrc;
+    if (vary_dst) {
+        T_mfs_dst = T_mfs_dst + configIdx * nFramesDst;
+        T_fms_dst = T_fms_dst + configIdx * nFramesDst;
+    }
+    // each config owns a contiguous run of nSites results (BUG FIX: the
+    // original offset configIdx*(nSites*nConfigs) indexed past the end of
+    // the result buffers for every configIdx > 0)
+    result = result + configIdx * nSites;
+    resultSdf = resultSdf + configIdx * nSites;
+
+    if (index >= nSites) { return; }
+
+    float4 v_src_f = testSites[index];
+    const int srcGrid = round(v_src_f.w);   // .w encodes the source sdf index
+    const int srcFrame = sdfFrames_src[srcGrid];
+
+    v_src_f.w = 1;
+    const float4 v_src_m = T_mfs_src[srcFrame]*v_src_f;
+    const float4 v_dst_m = T_ds*v_src_m;
+
+    int dstIdx = -1;
+    // finite float sentinel; the original 1.0e50 overflows float to +inf
+    float smallestResidual = 1e20f;
+
+    for (int dstGrid=0; dstGrid<nSdfs_dst; ++dstGrid) {
+
+        // NOTE(review): reconstruction — the frame lookup/transform lines were
+        // destroyed by sanitization; this follows the surrounding kernels.
+        const int dstFrame = sdfFrames_dst[dstGrid];
+        const float4 v_dst_f = T_fms_dst[dstFrame]*v_dst_m;
+        const Grid3D<float> & dstSdf = sdfs_dst[dstGrid];
+        const float3 v_dst_g = dstSdf.getGridCoords(make_float3(v_dst_f));
+
+        if (dstSdf.isInBoundsGradientInterp(v_dst_g)) {
+            const float residual = dstSdf.getValueInterpolated(v_dst_g)*dstSdf.resolution;
+            if (residual < smallestResidual) {
+                dstIdx = dstGrid;
+                smallestResidual = residual;
+            }
+        }
+    }
+
+    result[index] = smallestResidual;
+    resultSdf[index] = dstIdx;
+
+    if (dbgErr) {
+        // NOTE(review): debugError is assumed nSites long — confirm with callers
+        debugError[index] = smallestResidual;
+    }
+}
+
 __global__ void gpu_normEqnsSelfIntersectionReduced(const float4 * testSites,
                                                     const int nSites,
                                                     const int fullDims,
@@ -839,6 +917,104 @@ void normEqnsIntersection(const float4 * testSites,
 
 }
 
+// Host wrapper: batched intersection of srcModel test sites against dstModel
+// SDFs, one result row of nSites floats per joint configuration.
+void normEqnsIntersectionRaw(const float4 * testSites,
+                             const int nSites,
+                             const SE3 T_ds,
+                             const SE3 T_sd,
+                             const SE3 *src_T_mfs,
+                             const SE3 *src_T_fms,
+                             const int nFramesSrc, // number of frames per joint config
+                             const SE3 *dst_T_mfs,
+                             const SE3 *dst_T_fms,
+                             const int nFramesDst, // number of frames per joint config
+                             const int nConfigs,   // number of total joint configs
+                             const MirroredModel & srcModel,
+                             const MirroredModel & dstModel,
+                             float * result,
+                             int * resultsdf,
+                             float * debugError) {
+
+    cudaMemset(result,0,(nSites*nConfigs)*sizeof(float));
+    // byte-wise fill with 0xFF yields -1 in every int slot ("no sdf found");
+    // the previous fill value of 1 produced 0x01010101 per int
+    cudaMemset(resultsdf,-1,(nSites*nConfigs)*sizeof(int));
+
+    // 64*16 = 1024 threads: the former 64x64 block (4096 threads) exceeds the
+    // hardware limit of 1024 threads per block, so every launch failed
+    dim3 block(64,16,1);
+    dim3 grid(ceil(nSites/(float)block.x),ceil(nConfigs/(float)block.y),1);
+
+    if (debugError == 0) {
+        gpu_normEquationsIntersectionRaw<false><<<grid,block>>>(testSites, nSites,
+                T_ds, T_sd,
+                src_T_mfs, src_T_fms, nFramesSrc,
+                srcModel.getDeviceSdfFrames(),
+                dst_T_mfs, dst_T_fms, nFramesDst,
+                nConfigs,
+                dstModel.getDeviceSdfFrames(),
+                dstModel.getDeviceSdfs(),
+                dstModel.getNumSdfs(),
+                result, resultsdf, true, debugError);
+    } else {
+        gpu_normEquationsIntersectionRaw<true><<<grid,block>>>(testSites, nSites,
+                T_ds, T_sd,
+                src_T_mfs, src_T_fms, nFramesSrc,
+                srcModel.getDeviceSdfFrames(),
+                dst_T_mfs, dst_T_fms, nFramesDst,
+                nConfigs,
+                dstModel.getDeviceSdfFrames(),
+                dstModel.getDeviceSdfs(),
+                dstModel.getNumSdfs(),
+                result, resultsdf, true, debugError);
+    }
+}
+
+// Same as above, except the destination is a single fixed configuration
+// (vary_dst == false: all configs intersect against the same dst transforms).
+void normEqnsIntersectionRawSingleTgt(const float4 * testSites,
+                                      const int nSites,
+                                      const SE3 T_ds,
+                                      const SE3 T_sd,
+                                      const SE3 *src_T_mfs,
+                                      const SE3 *src_T_fms,
+                                      const int nFramesSrc, // number of frames per joint config
+                                      const SE3 *dst_T_mf,
+                                      const SE3 *dst_T_fm,
+                                      const int nFramesDst, // number of frames per joint config
+                                      const int nConfigs,   // number of total joint configs
+                                      const MirroredModel & srcModel,
+                                      const MirroredModel & dstModel,
+                                      float * result,
+                                      int * resultsdf,
+                                      float * debugError) {
+
+    cudaMemset(result,0,(nSites*nConfigs)*sizeof(float));
+    cudaMemset(resultsdf,-1,(nSites*nConfigs)*sizeof(int));
+
+    dim3 block(64,16,1);  // 1024 threads max per block (see above)
+    dim3 grid(ceil(nSites/(float)block.x),ceil(nConfigs/(float)block.y),1);
+
+    if (debugError == 0) {
+        gpu_normEquationsIntersectionRaw<false><<<grid,block>>>(testSites, nSites,
+                T_ds, T_sd,
+                src_T_mfs, src_T_fms, nFramesSrc,
+                srcModel.getDeviceSdfFrames(),
+                dst_T_mf, dst_T_fm, nFramesDst,
+                nConfigs,
+                dstModel.getDeviceSdfFrames(),
+                dstModel.getDeviceSdfs(),
+                dstModel.getNumSdfs(),
+                result, resultsdf, false, debugError);
+    } else {
+        gpu_normEquationsIntersectionRaw<true><<<grid,block>>>(testSites, nSites,
+                T_ds, T_sd,
+                src_T_mfs, src_T_fms, nFramesSrc,
+                srcModel.getDeviceSdfFrames(),
+                dst_T_mf, dst_T_fm, nFramesDst,
+                nConfigs,
+                dstModel.getDeviceSdfFrames(),
+                dstModel.getDeviceSdfs(),
+                dstModel.getNumSdfs(),
+                result, resultsdf, false, debugError);
+    }
+}
+
 void normEqnsSelfIntersectionReduced(const float4 * testSites,
                                      const int nSites,
                                      const int fullDims,
@@ -994,5 +1170,4 @@ void initDebugIntersectionError(float * debugError,
     gpu_initDebugIntersectionError<<<grid,block>>>(debugError, nSites);
 
 }
-
 }
diff --git a/src/optimization/kernels/intersection.h b/src/optimization/kernels/intersection.h
index 4bd41cd..5b54da2 100644
--- a/src/optimization/kernels/intersection.h
+++ b/src/optimization/kernels/intersection.h
@@ -53,6 +53,62 @@ void normEqnsIntersection(const float4 * testSites,
                           float * result,
                           float * debugError = 0);
 
+/*
+ * src_T_mfs, src_T_fms, dst_T_mfs, dst_T_fms are arrays of transforms on the
+ * gpu as computed by computeKinematicsBatchGPU; they are nConfigs blocks of
+ * model-frame and frame-model transforms of their respective models.
+ *
+ * result and resultsdf are nConfigs * nSites long arrays on the gpu used for
+ * the distance-check output. (result + config_idx * nSites)[site_idx] accesses
+ * the result for joint config config_idx and test site site_idx. (Copy the
+ * memory to the cpu first if that is where you intend to read it.)
+ */
+void normEqnsIntersectionRaw(const float4 * testSites,
+                             const int nSites,
+                             const SE3 T_ds,
+                             const SE3 T_sd,
+                             const SE3 *src_T_mfs,
+                             const SE3 *src_T_fms,
+                             const int nFramesSrc, // number of frames per joint config
+                             const SE3 *dst_T_mfs,
+                             const SE3 *dst_T_fms,
+                             const int nFramesDst, // number of frames per joint config
+                             const int nConfigs,   // number of total joint configs
+                             const MirroredModel & srcModel,
+                             const MirroredModel & dstModel,
+                             // distance to closest object, laid out per joint config:
+                             // nSites results for config 0 at result, config 1 at result+nSites, ...
+                             float * result,    // nSites*nConfigs floats, pre-allocated on the gpu
+                             // index of the sdf corresponding to the distance above (-1 = none)
+                             int * resultsdf,   // nSites*nConfigs ints, pre-allocated on the gpu
+                             float * debugError = 0);
+
+// Same as above except the destination is a single fixed configuration
+// instead of a batch of joint configurations.
+void normEqnsIntersectionRawSingleTgt(const float4 * testSites,
+                                      const int nSites,
+                                      const SE3 T_ds,
+                                      const SE3 T_sd,
+                                      const SE3 *src_T_mfs,
+                                      const SE3 *src_T_fms,
+                                      const int nFramesSrc, // number of frames per joint config
+                                      const SE3 *dst_T_mf,
+                                      const SE3 *dst_T_fm,
+                                      const int nFramesDst, // number of frames per joint config
+                                      const int nConfigs,   // number of total joint configs
+                                      const MirroredModel & srcModel,
+                                      const MirroredModel & dstModel,
+                                      float * result,    // nSites*nConfigs floats on the gpu
+                                      int * resultsdf,   // nSites*nConfigs ints on the gpu
+                                      float * debugError = 0);
+
 void normEqnsIntersectionReduced(const float4 * testSites,
                                  const int nSites,
                                  const int fullDims,
diff --git a/src/optimization/kernels/kinematics.cu b/src/optimization/kernels/kinematics.cu
new file mode 100644
index 0000000..88600f5
--- /dev/null
+++ b/src/optimization/kernels/kinematics.cu
@@ -0,0 +1,450 @@
+#include "kinematics.h"
+
+#include <cmath>
+#include <ctime>
+#include <iostream>
+
+#include "kernel_common.h"
+#include "geometry/grid_3d.h"
+#include "geometry/SE3.h"
+#include "optimization/optimization.h"
+#include "util/mirrored_memory.h"
+
+namespace dart {
+
+// -=-=-=-=-=-=-=-=-=- kernels -=-=-=-=-=-=-=-=-=-
+
+// Computes forward kinematics for a batch of poses.  Thread t processes the
+// poses [t*numtoproc, (t+1)*numtoproc), clipped to maxidx.  Each pose is a
+// reduced articulation of poselen dofs; missing entries are treated as 0 and
+// every value is clamped to the joint limits.
+__global__ void gpu_computeForwardKinematics(
+        const int maxidx,          // number of poses in the batch
+        const int numtoproc,       // poses handled per thread
+        const int poselen,         // dofs per pose
+        const float * poses,       // maxidx*poselen floats, one pose after another
+        SE3 *T_mfss,               // out: numframes frame->model transforms per pose
+        const float2 * jointLims,  // (min,max) per joint
+        const SE3 *T_pfs,          // parent-joint->frame transforms
+        const int *frameParents,   // parent frame index per frame
+        const int numframes,
+        const JointType *jointTypes,
+        const float3 *jointAxes) {
+
+    int start_idx = (threadIdx.x + blockIdx.x * blockDim.x) * numtoproc;
+    for (int idx = start_idx; idx < start_idx + numtoproc; ++idx) {
+        if (idx >= maxidx) return;
+        const float * pose = poses + poselen * idx;
+        SE3 * T_mfs = T_mfss + numframes * idx;
+        T_mfs[0] = SE3();   // root frame sits at the model origin
+
+        // j counts dofs starting after the 6-dof base; joint f-1 drives frame f
+        int j = 6;
+        for (int f=1; f<numframes; ++f) {
+            // NOTE(review): reconstruction — assumes the pose array holds the
+            // articulated joint values only, indexed by j-6; confirm layout.
+            float p_ = (j-6 < poselen) ? pose[j-6] : 0.0f;
+            float p = fminf(fmaxf(jointLims[j-6].x,p_),jointLims[j-6].y);
+
+            const int joint = f-1;
+            SE3 T_pf = T_pfs[joint];
+            switch(jointTypes[joint]) {
+            case RotationalJoint:
+                T_pf = T_pf*SE3Fromse3(se3(0, 0, 0,
+                        p*jointAxes[joint].x, p*jointAxes[joint].y, p*jointAxes[joint].z));
+                ++j;
+                break;
+            case PrismaticJoint:
+                T_pf = T_pf*SE3Fromse3(se3(p*jointAxes[joint].x, p*jointAxes[joint].y, p*jointAxes[joint].z,
+                        0, 0, 0));
+                ++j;
+                break;
+            }
+            const int parent = frameParents[f];
+            T_mfs[f] = T_mfs[parent]*T_pf;
+        }
+    }
+}
+
+// Same as gpu_computeForwardKinematics but additionally stores the inverse
+// (model->frame) transforms in T_fmss.
+__global__ void gpu_computeKinematics(
+        const int maxidx,
+        const int numtoproc,
+        const int poselen,
+        const float * poses,
+        SE3 *T_mfss,               // out: frame->model transforms
+        SE3 *T_fmss,               // out: model->frame transforms (inverses)
+        const float2 * jointLims,
+        const SE3 *T_pfs,
+        const int *frameParents,
+        const int numframes,
+        const JointType *jointTypes,
+        const float3 *jointAxes) {
+
+    int start_idx = (threadIdx.x + blockIdx.x * blockDim.x) * numtoproc;
+    for (int idx = start_idx; idx < start_idx + numtoproc; ++idx) {
+        if (idx >= maxidx) return;
+        const float * pose = poses + poselen * idx;
+        SE3 * T_mfs = T_mfss + numframes * idx;
+        SE3 * T_fms = T_fmss + numframes * idx;
+        T_mfs[0] = SE3();
+        T_fms[0] = SE3();
+
+        int j = 6;
+        for (int f=1; f<numframes; ++f) {
+            float p_ = (j-6 < poselen) ? pose[j-6] : 0.0f;
+            float p = fminf(fmaxf(jointLims[j-6].x,p_),jointLims[j-6].y);
+
+            const int joint = f-1;
+            SE3 T_pf = T_pfs[joint];
+            switch(jointTypes[joint]) {
+            case RotationalJoint:
+                T_pf = T_pf*SE3Fromse3(se3(0, 0, 0,
+                        p*jointAxes[joint].x, p*jointAxes[joint].y, p*jointAxes[joint].z));
+                ++j;
+                break;
+            case PrismaticJoint:
+                T_pf = T_pf*SE3Fromse3(se3(p*jointAxes[joint].x, p*jointAxes[joint].y, p*jointAxes[joint].z,
+                        0, 0, 0));
+                ++j;
+                break;
+            }
+            const int parent = frameParents[f];
+            T_mfs[f] = T_mfs[parent]*T_pf;
+            T_fms[f] = SE3Invert(T_mfs[f]);
+        }
+    }
+}
+
+// -=-=-=-=-=-=-=-=-=- host wrappers -=-=-=-=-=-=-=-=-=-
+
+// Computes FK for a single pose and stores the result in the mirrored model
+// (device side updated, then synced back to the host).
+void computeForwardKinematics(
+        float *host_pose,
+        int poselen,
+        MirroredModel &robot,
+        MirroredVector<float2> *limits) {
+
+    float *device_pose;
+    cudaError_t mal_err = cudaMalloc((void**) &device_pose, sizeof(float) * poselen);
+    if (mal_err != cudaSuccess) {
+        std::cout << "Cuda malloc error:" << cudaGetErrorString(mal_err) << std::endl;
+        return;
+    }
+    cudaError_t mc_err1 = cudaMemcpy(device_pose, host_pose, sizeof(float) * poselen, cudaMemcpyHostToDevice);
+    if (mc_err1 != cudaSuccess) {
+        std::cout << "Cuda copy error1:" << cudaGetErrorString(mc_err1) << std::endl;
+        cudaFree(device_pose);
+        return;
+    }
+
+    robot.syncKinematicsHostToDevice();
+
+    // single pose -> a 1x1 launch suffices
+    gpu_computeForwardKinematics<<<1,1>>>(
+            1, 1,
+            poselen,
+            device_pose,
+            robot.getDeviceTransformsFrameToModel(),
+            limits->devicePtr(),
+            robot.getDeviceTransformsParentJointToFrame(),
+            robot.getDeviceFrameParents(),
+            robot.getNumFrames(),
+            robot.getDeviceJointTypes(),
+            robot.getDeviceJointAxes());
+
+    cudaFree(device_pose);
+    robot.syncKinematics();   // pull the updated transforms back to the host
+}
+
+// Computes FK for a batch of poses; returns an nposes-long array of pointers
+// into a single contiguous allocation (caller frees the returned pointer).
+SE3 **computeForwardKinematicsBatch(
+        float *host_poses,
+        int nposes,
+        int ndims,
+        MirroredModel &robot,
+        MirroredVector<float2> *limits,
+        bool BENCHMARK) {
+
+    clock_t s,e;
+    if (BENCHMARK) s = clock();
+
+    // round the batch up to a power of two so work divides evenly;
+    // NOTE(review): nblk/perthr are untuned magic numbers
+    int calc_num = std::pow(2, std::ceil(std::log(nposes)/std::log(2)));
+    int nblk = 16;
+    int perthr = 128;
+    int threadct = max(calc_num / perthr / nblk, 1);
+    int nframes = robot.getNumFrames();
+
+    float *device_poses;
+    cudaError_t mal_err = cudaMalloc((void**) &device_poses, sizeof(float) * ndims * nposes);
+    if (mal_err != cudaSuccess) {
+        std::cout << "Cuda malloc error:" << cudaGetErrorString(mal_err) << std::endl;
+        return NULL;
+    }
+    SE3 *device_results;
+    mal_err = cudaMalloc((void**) &device_results, sizeof(SE3) * nframes * nposes);
+    if (mal_err != cudaSuccess) {
+        std::cout << "Cuda malloc error:" << cudaGetErrorString(mal_err) << std::endl;
+        cudaFree(device_poses);
+        return NULL;
+    }
+    cudaMemset(device_results, 0, sizeof(SE3) * nframes * nposes);
+
+    if (BENCHMARK) {
+        e = clock();
+        std::cout << "CUDA Mallocs: " << e << " " << s << " " << (e-s) << std::endl;
+        s = clock();
+    }
+
+    cudaError_t mc_err1 = cudaMemcpy(device_poses, host_poses, sizeof(float) * ndims * nposes, cudaMemcpyHostToDevice);
+    if (mc_err1 != cudaSuccess) {
+        std::cout << "Cuda copy error1:" << cudaGetErrorString(mc_err1) << std::endl;
+        cudaFree(device_poses);
+        cudaFree(device_results);
+        return NULL;
+    }
+
+    if (BENCHMARK) {
+        e = clock();
+        std::cout << "Device setup: " << e << " " << s << " " << (e-s) << std::endl;
+        s = clock();
+    }
+
+    gpu_computeForwardKinematics<<<nblk,threadct>>>(
+            nposes,
+            perthr,
+            ndims,
+            device_poses,
+            device_results,
+            limits->devicePtr(),
+            robot.getDeviceTransformsParentJointToFrame(),
+            robot.getDeviceFrameParents(),
+            robot.getNumFrames(),
+            robot.getDeviceJointTypes(),
+            robot.getDeviceJointAxes());
+
+    if (BENCHMARK) {
+        e = clock();
+        std::cout << "Actual computation: " << e << " " << s << " " << (e-s) << std::endl;
+    }
+
+    // package results as a pointer table followed by the contiguous payload,
+    // mirroring computeKinematicsBatch (one free() releases everything)
+    SE3 **results = (SE3 **) malloc(sizeof(SE3*) * nposes + sizeof(SE3) * nframes * nposes);
+    SE3 *datapos = (SE3 *) &results[nposes];
+    for (int i = 0; i < nposes; ++i) {
+        results[i] = datapos + i * nframes;
+    }
+    cudaMemcpy(datapos, device_results, sizeof(SE3) * nframes * nposes, cudaMemcpyDeviceToHost);
+
+    // BUG FIX: both device buffers previously leaked
+    cudaFree(device_poses);
+    cudaFree(device_results);
+    return results;
+}
+
+// Computes FK and the inverse transforms for a batch of poses, leaving both
+// result arrays on the gpu (caller cudaFrees t_mfs and t_fms).
+void computeKinematicsBatchGPU(
+        float *host_poses,
+        int nposes,
+        int ndims,
+        MirroredModel &robot,
+        MirroredVector<float2> *limits,
+        SE3 *&t_mfs,
+        SE3 *&t_fms) {
+
+    int calc_num = std::pow(2, std::ceil(std::log(nposes)/std::log(2)));
+    int nblk = 16;
+    int perthr = 128;
+    int threadct = max(calc_num / perthr / nblk, 1);
+    int nframes = robot.getNumFrames();
+
+    float *device_poses;
+    cudaError_t mal_err = cudaMalloc((void**) &device_poses, sizeof(float) * ndims * nposes);
+    if (mal_err != cudaSuccess) {
+        std::cout << "Cuda malloc error:" << cudaGetErrorString(mal_err) << std::endl;
+        return;
+    }
+    SE3 *device_mfs;
+    mal_err = cudaMalloc((void**) &device_mfs, sizeof(SE3) * nframes * nposes);
+    if (mal_err != cudaSuccess) {
+        std::cout << "Cuda malloc error:" << cudaGetErrorString(mal_err) << std::endl;
+        cudaFree(device_poses);
+        return;
+    }
+    t_mfs = device_mfs;
+    cudaMemset(device_mfs, 0, sizeof(SE3) * nframes * nposes);
+
+    SE3 *device_fms;
+    mal_err = cudaMalloc((void**) &device_fms, sizeof(SE3) * nframes * nposes);
+    if (mal_err != cudaSuccess) {
+        std::cout << "Cuda malloc error:" << cudaGetErrorString(mal_err) << std::endl;
+        cudaFree(device_poses);
+        return;
+    }
+    t_fms = device_fms;
+    cudaMemset(device_fms, 0, sizeof(SE3) * nframes * nposes);
+
+    cudaError_t mc_err1 = cudaMemcpy(device_poses, host_poses, sizeof(float) * ndims * nposes, cudaMemcpyHostToDevice);
+    if (mc_err1 != cudaSuccess) {
+        std::cout << "Cuda copy error1:" << cudaGetErrorString(mc_err1) << std::endl;
+        cudaFree(device_poses);
+        return;
+    }
+
+    gpu_computeKinematics<<<nblk,threadct>>>(
+            nposes,
+            perthr,
+            ndims,
+            device_poses,
+            device_mfs,
+            device_fms,
+            limits->devicePtr(),
+            robot.getDeviceTransformsParentJointToFrame(),
+            robot.getDeviceFrameParents(),
+            robot.getNumFrames(),
+            robot.getDeviceJointTypes(),
+            robot.getDeviceJointAxes());
+
+    // BUG FIX: device_poses previously leaked; wait for the kernel before
+    // releasing the buffer it reads from
+    cudaDeviceSynchronize();
+    cudaFree(device_poses);
+}
+
+// Host-side variant: copies both transform batches back into a single
+// allocation per array (pointer table + payload); caller frees t_mfs/t_fms.
+void computeKinematicsBatch(
+        float *host_poses,
+        int nposes,
+        int ndims,
+        MirroredModel &robot,
+        MirroredVector<float2> *limits,
+        SE3 **&t_mfs,
+        SE3 **&t_fms) {
+
+    int nframes = robot.getNumFrames();
+    t_mfs = (SE3 **) malloc(sizeof(SE3*) * nposes + sizeof(SE3) * nframes * nposes);
+    t_fms = (SE3 **) malloc(sizeof(SE3*) * nposes + sizeof(SE3) * nframes * nposes);
+    SE3 *mfs_pos = (SE3 *) &t_mfs[nposes];
+    for (int i = 0; i < nposes; ++i) {
+        t_mfs[i] = mfs_pos + i * nframes;
+    }
+    SE3 *fms_pos = (SE3 *) &t_fms[nposes];
+    for (int i = 0; i < nposes; ++i) {
+        t_fms[i] = fms_pos + i * nframes;
+    }
+
+    SE3 *device_mfs;
+    SE3 *device_fms;
+    computeKinematicsBatchGPU(host_poses, nposes, ndims, robot, limits, device_mfs, device_fms);
+    cudaMemcpy(mfs_pos, device_mfs, sizeof(SE3) * nframes * nposes, cudaMemcpyDeviceToHost);
+    cudaMemcpy(fms_pos, device_fms, sizeof(SE3) * nframes * nposes, cudaMemcpyDeviceToHost);
+    cudaFree(device_mfs);
+    cudaFree(device_fms);
+}
+
+}
diff --git a/src/optimization/kernels/kinematics.h b/src/optimization/kernels/kinematics.h
new file mode 100644
index 0000000..5336878
--- /dev/null
+++ b/src/optimization/kernels/kinematics.h
@@ -0,0 +1,67 @@
+#ifndef GPUKINEMATICS_H
+#define GPUKINEMATICS_H
+
+#include <vector>
+
+#include <cuda_runtime.h>
+#include "geometry/grid_3d.h"
+#include "geometry/SE3.h"
+#include "model/mirrored_model.h"
+#include "util/dart_types.h"
+
+namespace dart {
+
+// Computes forward kinematics for a single joint configuration on the GPU
+// and puts the result in the mirrored model passed in.
+void computeForwardKinematics(
+        float *pose,
+        int poselen,
+        MirroredModel &robot,
+        MirroredVector<float2> *limits);
+
+// poses is a contiguous block of 1d arrays (floats 0..poselen-1 are pose 1,
+// poselen..2*poselen-1 are pose 2, ... up to nposes).
+// benchmark turns on benchmark logging.
+// Returns a 2D array of transforms; the caller is responsible for freeing the
+// result (a single free()) as well as the array of poses passed in.
+SE3 **computeForwardKinematicsBatch(float *poses, int nposes,
+                                    int poselen,
+                                    MirroredModel &robot,
+                                    MirroredVector<float2> *limits,
+                                    bool benchmark=false);
+
+// Computes both forward and inverse kinematics: FK in t_mfs, inverse
+// transforms in t_fms.
+// WARNING: the caller is responsible for freeing t_mfs and t_fms once they
+// are no longer needed.
+void computeKinematicsBatch(
+        float *poses,
+        int nposes,
+        int poselen,
+        MirroredModel &robot,
+        MirroredVector<float2> *limits,
+        SE3 **&t_mfs,
+        SE3 **&t_fms);
+
+// Same as above except the pointers passed in are set to the gpu-side results
+// and the gpu buffers are not freed here.
+// Note that the type is SE3*: the memory is one contiguous block of results
+// in pose order —
+//   pose 0 is at t_mfs / t_fms,
+//   pose 1 is at t_mfs + numframes,
+//   etc...
+// WARNING: the caller must eventually cudaFree t_mfs and t_fms.
+void computeKinematicsBatchGPU(
+        float *poses,
+        int nposes,
+        int poselen,
+        MirroredModel &robot,
+        MirroredVector<float2> *limits,
+        SE3 *&t_mfs,
+        SE3 *&t_fms);
+
+}
+
+#endif // GPUKINEMATICS_H