diff --git a/CMakeLists.txt b/CMakeLists.txt
index 895c91f..dfe6881 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -87,6 +87,7 @@ set(dart_src
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/obsToMod.h
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/modToObs.h
+    ${PROJECT_SOURCE_DIR}/src/optimization/kernels/kinematics.h
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/intersection.h
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/raycast.h
     ${PROJECT_SOURCE_DIR}/src/optimization/contact_prior.cpp
@@ -152,6 +153,7 @@ set(gpu_specific_src
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/kernel_common.h
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/obsToMod.cu
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/modToObs.cu
+    ${PROJECT_SOURCE_DIR}/src/optimization/kernels/kinematics.cu
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/intersection.cu
     ${PROJECT_SOURCE_DIR}/src/optimization/kernels/raycast.cu
diff --git a/src/model/mirrored_model.h b/src/model/mirrored_model.h
index 1689fab..8995ceb 100644
--- a/src/model/mirrored_model.h
+++ b/src/model/mirrored_model.h
@@ -86,6 +86,13 @@ class MirroredModel : public Model {
     inline const SE3 getTransformJointAxisToParent(const int joint) const { return _T_pf->hostPtr()[joint]; }
 
+    // Accessors for the kinematics code.
+    // Note: the `const void` return types of the original sync accessors were
+    // meaningless (a cv-qualifier on void) and have been dropped.
+    inline const SE3 *getTransformsParentJointToFrame() const { return _T_pf->hostPtr(); }
+    inline const SE3 *getDeviceTransformsParentJointToFrame() const { return _T_pf->devicePtr(); }
+    inline void syncKinematics() { _T_mf->syncDeviceToHost(); }
+    inline void syncKinematicsHostToDevice() { _T_mf->syncHostToDevice(); }
+    inline SE3 *getTransformsFrameToModel() { return _T_mf->hostPtr(); }
+
 private:
 
     uint _modelID;
diff --git a/src/model/model.h b/src/model/model.h
index a5647ee..b48af53 100644
--- a/src/model/model.h
+++ b/src/model/model.h
@@ -99,6 +99,7 @@ class Model {
     inline float getJointMin(const int joint) const { return _jointLimits[joint].x; }
     inline float getJointMax(const int joint) const { return _jointLimits[joint].y; }
+    inline const std::vector<float2> &getJointLimits() const { return _jointLimits; }
 
     const std::string & getJointName(const int joint) const { return _jointNames[joint]; }
 
     void renderSdf(const dart::Grid3D<float> & sdf, float levelSet) const;
diff --git a/src/optimization/kernels/intersection.cu b/src/optimization/kernels/intersection.cu
index 287ef64..14aa40c 100644
--- a/src/optimization/kernels/intersection.cu
+++ b/src/optimization/kernels/intersection.cu
@@ -144,6 +144,84 @@ __global__ void gpu_normEqnsSelfIntersection(const float4 * testSites,
 
 }
 
+// Computes, for every source-model test site and every joint configuration,
+// the distance to the nearest destination-model SDF.
+// Launch layout: x indexes test sites, y indexes joint configurations.
+// dbgErr selects compile-time whether per-site debug output is written.
+template <bool dbgErr>
+__global__ void gpu_normEquationsIntersectionRaw(const float4 * testSites,
+                                                 const int nSites,
+                                                 const SE3 T_ds,
+                                                 const SE3 T_sd,
+                                                 const SE3 * T_mfs_src,
+                                                 const SE3 * T_fms_src,
+                                                 const int nFramesSrc,
+                                                 const int * sdfFrames_src,
+                                                 const SE3 * T_mfs_dst,
+                                                 const SE3 * T_fms_dst,
+                                                 const int nFramesDst,
+                                                 const int nConfigs,
+                                                 const int * sdfFrames_dst,
+                                                 const Grid3D<float> * sdfs_dst,
+                                                 const int nSdfs_dst,
+                                                 float * result,
+                                                 int * resultSdf,
+                                                 bool vary_dst,
+                                                 float * debugError) {
+
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    const int configIdx = blockIdx.y*blockDim.y + threadIdx.y;
+    if (configIdx >= nConfigs) { return; }
+
+    // advance the transform arrays to the block owned by this configuration
+    T_mfs_src = T_mfs_src + configIdx * nFramesSrc;
+    T_fms_src = T_fms_src + configIdx * nFramesSrc;
+    if (vary_dst) {
+        T_mfs_dst = T_mfs_dst + configIdx * nFramesDst;
+        T_fms_dst = T_fms_dst + configIdx * nFramesDst;
+    }
+    // each config owns a contiguous run of nSites results (BUG FIX: the
+    // original offset configIdx*(nSites*nConfigs) indexed past the end of
+    // the result buffers for every configIdx > 0)
+    result = result + configIdx * nSites;
+    resultSdf = resultSdf + configIdx * nSites;
+
+    if (index >= nSites) { return; }
+
+    float4 v_src_f = testSites[index];
+    const int srcGrid = round(v_src_f.w);   // .w encodes the source sdf index
+    const int srcFrame = sdfFrames_src[srcGrid];
+
+    v_src_f.w = 1;
+    const float4 v_src_m = T_mfs_src[srcFrame]*v_src_f;
+    const float4 v_dst_m = T_ds*v_src_m;
+
+    int dstIdx = -1;
+    // finite float sentinel; the original 1.0e50 overflows float to +inf
+    float smallestResidual = 1e20f;
+
+    for (int dstGrid=0; dstGrid<nSdfs_dst; ++dstGrid) {
+
+        // NOTE(review): reconstruction — the frame lookup/transform lines were
+        // destroyed by sanitization; this follows the surrounding kernels.
+        const int dstFrame = sdfFrames_dst[dstGrid];
+        const float4 v_dst_f = T_fms_dst[dstFrame]*v_dst_m;
+        const Grid3D<float> & dstSdf = sdfs_dst[dstGrid];
+        const float3 v_dst_g = dstSdf.getGridCoords(make_float3(v_dst_f));
+
+        if (dstSdf.isInBoundsGradientInterp(v_dst_g)) {
+            const float residual = dstSdf.getValueInterpolated(v_dst_g)*dstSdf.resolution;
+            if (residual < smallestResidual) {
+                dstIdx = dstGrid;
+                smallestResidual = residual;
+            }
+        }
+    }
+
+    result[index] = smallestResidual;
+    resultSdf[index] = dstIdx;
+
+    if (dbgErr) {
+        // NOTE(review): debugError is assumed nSites long — confirm with callers
+        debugError[index] = smallestResidual;
+    }
+}
+
 __global__ void gpu_normEqnsSelfIntersectionReduced(const float4 * testSites,
                                                     const int nSites,
                                                     const int fullDims,
@@ -839,6 +917,104 @@ void normEqnsIntersection(const float4 * testSites,
 
 }
 
+// Host wrapper: batched intersection of srcModel test sites against dstModel
+// SDFs, one result row of nSites floats per joint configuration.
+void normEqnsIntersectionRaw(const float4 * testSites,
+                             const int nSites,
+                             const SE3 T_ds,
+                             const SE3 T_sd,
+                             const SE3 *src_T_mfs,
+                             const SE3 *src_T_fms,
+                             const int nFramesSrc, // number of frames per joint config
+                             const SE3 *dst_T_mfs,
+                             const SE3 *dst_T_fms,
+                             const int nFramesDst, // number of frames per joint config
+                             const int nConfigs,   // number of total joint configs
+                             const MirroredModel & srcModel,
+                             const MirroredModel & dstModel,
+                             float * result,
+                             int * resultsdf,
+                             float * debugError) {
+
+    cudaMemset(result,0,(nSites*nConfigs)*sizeof(float));
+    // byte-wise fill with 0xFF yields -1 in every int slot ("no sdf found");
+    // the previous fill value of 1 produced 0x01010101 per int
+    cudaMemset(resultsdf,-1,(nSites*nConfigs)*sizeof(int));
+
+    // 64*16 = 1024 threads: the former 64x64 block (4096 threads) exceeds the
+    // hardware limit of 1024 threads per block, so every launch failed
+    dim3 block(64,16,1);
+    dim3 grid(ceil(nSites/(float)block.x),ceil(nConfigs/(float)block.y),1);
+
+    if (debugError == 0) {
+        gpu_normEquationsIntersectionRaw<false><<<grid,block>>>(testSites, nSites,
+                T_ds, T_sd,
+                src_T_mfs, src_T_fms, nFramesSrc,
+                srcModel.getDeviceSdfFrames(),
+                dst_T_mfs, dst_T_fms, nFramesDst,
+                nConfigs,
+                dstModel.getDeviceSdfFrames(),
+                dstModel.getDeviceSdfs(),
+                dstModel.getNumSdfs(),
+                result, resultsdf, true, debugError);
+    } else {
+        gpu_normEquationsIntersectionRaw<true><<<grid,block>>>(testSites, nSites,
+                T_ds, T_sd,
+                src_T_mfs, src_T_fms, nFramesSrc,
+                srcModel.getDeviceSdfFrames(),
+                dst_T_mfs, dst_T_fms, nFramesDst,
+                nConfigs,
+                dstModel.getDeviceSdfFrames(),
+                dstModel.getDeviceSdfs(),
+                dstModel.getNumSdfs(),
+                result, resultsdf, true, debugError);
+    }
+}
+
+// Same as above, except the destination is a single fixed configuration
+// (vary_dst == false: all configs intersect against the same dst transforms).
+void normEqnsIntersectionRawSingleTgt(const float4 * testSites,
+                                      const int nSites,
+                                      const SE3 T_ds,
+                                      const SE3 T_sd,
+                                      const SE3 *src_T_mfs,
+                                      const SE3 *src_T_fms,
+                                      const int nFramesSrc, // number of frames per joint config
+                                      const SE3 *dst_T_mf,
+                                      const SE3 *dst_T_fm,
+                                      const int nFramesDst, // number of frames per joint config
+                                      const int nConfigs,   // number of total joint configs
+                                      const MirroredModel & srcModel,
+                                      const MirroredModel & dstModel,
+                                      float * result,
+                                      int * resultsdf,
+                                      float * debugError) {
+
+    cudaMemset(result,0,(nSites*nConfigs)*sizeof(float));
+    cudaMemset(resultsdf,-1,(nSites*nConfigs)*sizeof(int));
+
+    dim3 block(64,16,1);  // 1024 threads max per block (see above)
+    dim3 grid(ceil(nSites/(float)block.x),ceil(nConfigs/(float)block.y),1);
+
+    if (debugError == 0) {
+        gpu_normEquationsIntersectionRaw<false><<<grid,block>>>(testSites, nSites,
+                T_ds, T_sd,
+                src_T_mfs, src_T_fms, nFramesSrc,
+                srcModel.getDeviceSdfFrames(),
+                dst_T_mf, dst_T_fm, nFramesDst,
+                nConfigs,
+                dstModel.getDeviceSdfFrames(),
+                dstModel.getDeviceSdfs(),
+                dstModel.getNumSdfs(),
+                result, resultsdf, false, debugError);
+    } else {
+        gpu_normEquationsIntersectionRaw<true><<<grid,block>>>(testSites, nSites,
+                T_ds, T_sd,
+                src_T_mfs, src_T_fms, nFramesSrc,
+                srcModel.getDeviceSdfFrames(),
+                dst_T_mf, dst_T_fm, nFramesDst,
+                nConfigs,
+                dstModel.getDeviceSdfFrames(),
+                dstModel.getDeviceSdfs(),
+                dstModel.getNumSdfs(),
+                result, resultsdf, false, debugError);
+    }
+}
+
 void normEqnsSelfIntersectionReduced(const float4 * testSites,
                                      const int nSites,
                                      const int fullDims,
@@ -994,5 +1170,4 @@ void initDebugIntersectionError(float * debugError,
     gpu_initDebugIntersectionError<<<grid,block>>>(debugError, nSites);
 
 }
-
 }
diff --git a/src/optimization/kernels/intersection.h b/src/optimization/kernels/intersection.h
index 4bd41cd..5b54da2 100644
--- a/src/optimization/kernels/intersection.h
+++ b/src/optimization/kernels/intersection.h
@@ -53,6 +53,62 @@ void normEqnsIntersection(const float4 * testSites,
                           float * result,
                           float * debugError = 0);
 
+/*
+ * src_T_mfs, src_T_fms, dst_T_mfs, dst_T_fms are arrays of transforms on the
+ * gpu as computed by computeKinematicsBatchGPU; they are nConfigs blocks of
+ * model-frame and frame-model transforms of their respective models.
+ *
+ * result and resultsdf are nConfigs * nSites long arrays on the gpu used for
+ * the distance-check output. (result + config_idx * nSites)[site_idx] accesses
+ * the result for joint config config_idx and test site site_idx. (Copy the
+ * memory to the cpu first if that is where you intend to read it.)
+ */
+void normEqnsIntersectionRaw(const float4 * testSites,
+                             const int nSites,
+                             const SE3 T_ds,
+                             const SE3 T_sd,
+                             const SE3 *src_T_mfs,
+                             const SE3 *src_T_fms,
+                             const int nFramesSrc, // number of frames per joint config
+                             const SE3 *dst_T_mfs,
+                             const SE3 *dst_T_fms,
+                             const int nFramesDst, // number of frames per joint config
+                             const int nConfigs,   // number of total joint configs
+                             const MirroredModel & srcModel,
+                             const MirroredModel & dstModel,
+                             // distance to closest object, laid out per joint config:
+                             // nSites results for config 0 at result, config 1 at result+nSites, ...
+                             float * result,    // nSites*nConfigs floats, pre-allocated on the gpu
+                             // index of the sdf corresponding to the distance above (-1 = none)
+                             int * resultsdf,   // nSites*nConfigs ints, pre-allocated on the gpu
+                             float * debugError = 0);
+
+// Same as above except the destination is a single fixed configuration
+// instead of a batch of joint configurations.
+void normEqnsIntersectionRawSingleTgt(const float4 * testSites,
+                                      const int nSites,
+                                      const SE3 T_ds,
+                                      const SE3 T_sd,
+                                      const SE3 *src_T_mfs,
+                                      const SE3 *src_T_fms,
+                                      const int nFramesSrc, // number of frames per joint config
+                                      const SE3 *dst_T_mf,
+                                      const SE3 *dst_T_fm,
+                                      const int nFramesDst, // number of frames per joint config
+                                      const int nConfigs,   // number of total joint configs
+                                      const MirroredModel & srcModel,
+                                      const MirroredModel & dstModel,
+                                      float * result,    // nSites*nConfigs floats on the gpu
+                                      int * resultsdf,   // nSites*nConfigs ints on the gpu
+                                      float * debugError = 0);
+
 void normEqnsIntersectionReduced(const float4 * testSites,
                                  const int nSites,
                                  const int fullDims,
diff --git a/src/optimization/kernels/kinematics.cu b/src/optimization/kernels/kinematics.cu
new file mode 100644
index 0000000..88600f5
--- /dev/null
+++ b/src/optimization/kernels/kinematics.cu
@@ -0,0 +1,450 @@
+#include "kinematics.h"
+
+#include <cmath>
+#include <ctime>
+#include <iostream>
+
+#include "kernel_common.h"
+#include "geometry/grid_3d.h"
+#include "geometry/SE3.h"
+#include "optimization/optimization.h"
+#include "util/mirrored_memory.h"
+
+namespace dart {
+
+// -=-=-=-=-=-=-=-=-=- kernels -=-=-=-=-=-=-=-=-=-
+
+// Computes forward kinematics for a batch of poses.  Thread t processes the
+// poses [t*numtoproc, (t+1)*numtoproc), clipped to maxidx.  Each pose is a
+// reduced articulation of poselen dofs; missing entries are treated as 0 and
+// every value is clamped to the joint limits.
+__global__ void gpu_computeForwardKinematics(
+        const int maxidx,          // number of poses in the batch
+        const int numtoproc,       // poses handled per thread
+        const int poselen,         // dofs per pose
+        const float * poses,       // maxidx*poselen floats, one pose after another
+        SE3 *T_mfss,               // out: numframes frame->model transforms per pose
+        const float2 * jointLims,  // (min,max) per joint
+        const SE3 *T_pfs,          // parent-joint->frame transforms
+        const int *frameParents,   // parent frame index per frame
+        const int numframes,
+        const JointType *jointTypes,
+        const float3 *jointAxes) {
+
+    int start_idx = (threadIdx.x + blockIdx.x * blockDim.x) * numtoproc;
+    for (int idx = start_idx; idx < start_idx + numtoproc; ++idx) {
+        if (idx >= maxidx) return;
+        const float * pose = poses + poselen * idx;
+        SE3 * T_mfs = T_mfss + numframes * idx;
+        T_mfs[0] = SE3();   // root frame sits at the model origin
+
+        // j counts dofs starting after the 6-dof base; joint f-1 drives frame f
+        int j = 6;
+        for (int f=1; f<numframes; ++f) {
+            // NOTE(review): reconstruction — assumes the pose array holds the
+            // articulated joint values only, indexed by j-6; confirm layout.
+            float p_ = (j-6 < poselen) ? pose[j-6] : 0.0f;
+            float p = fminf(fmaxf(jointLims[j-6].x,p_),jointLims[j-6].y);
+
+            const int joint = f-1;
+            SE3 T_pf = T_pfs[joint];
+            switch(jointTypes[joint]) {
+            case RotationalJoint:
+                T_pf = T_pf*SE3Fromse3(se3(0, 0, 0,
+                        p*jointAxes[joint].x, p*jointAxes[joint].y, p*jointAxes[joint].z));
+                ++j;
+                break;
+            case PrismaticJoint:
+                T_pf = T_pf*SE3Fromse3(se3(p*jointAxes[joint].x, p*jointAxes[joint].y, p*jointAxes[joint].z,
+                        0, 0, 0));
+                ++j;
+                break;
+            }
+            const int parent = frameParents[f];
+            T_mfs[f] = T_mfs[parent]*T_pf;
+        }
+    }
+}
+
+// Same as gpu_computeForwardKinematics but additionally stores the inverse
+// (model->frame) transforms in T_fmss.
+__global__ void gpu_computeKinematics(
+        const int maxidx,
+        const int numtoproc,
+        const int poselen,
+        const float * poses,
+        SE3 *T_mfss,               // out: frame->model transforms
+        SE3 *T_fmss,               // out: model->frame transforms (inverses)
+        const float2 * jointLims,
+        const SE3 *T_pfs,
+        const int *frameParents,
+        const int numframes,
+        const JointType *jointTypes,
+        const float3 *jointAxes) {
+
+    int start_idx = (threadIdx.x + blockIdx.x * blockDim.x) * numtoproc;
+    for (int idx = start_idx; idx < start_idx + numtoproc; ++idx) {
+        if (idx >= maxidx) return;
+        const float * pose = poses + poselen * idx;
+        SE3 * T_mfs = T_mfss + numframes * idx;
+        SE3 * T_fms = T_fmss + numframes * idx;
+        T_mfs[0] = SE3();
+        T_fms[0] = SE3();
+
+        int j = 6;
+        for (int f=1; f<numframes; ++f) {
+            float p_ = (j-6 < poselen) ? pose[j-6] : 0.0f;
+            float p = fminf(fmaxf(jointLims[j-6].x,p_),jointLims[j-6].y);
+
+            const int joint = f-1;
+            SE3 T_pf = T_pfs[joint];
+            switch(jointTypes[joint]) {
+            case RotationalJoint:
+                T_pf = T_pf*SE3Fromse3(se3(0, 0, 0,
+                        p*jointAxes[joint].x, p*jointAxes[joint].y, p*jointAxes[joint].z));
+                ++j;
+                break;
+            case PrismaticJoint:
+                T_pf = T_pf*SE3Fromse3(se3(p*jointAxes[joint].x, p*jointAxes[joint].y, p*jointAxes[joint].z,
+                        0, 0, 0));
+                ++j;
+                break;
+            }
+            const int parent = frameParents[f];
+            T_mfs[f] = T_mfs[parent]*T_pf;
+            T_fms[f] = SE3Invert(T_mfs[f]);
+        }
+    }
+}
+
+// -=-=-=-=-=-=-=-=-=- host wrappers -=-=-=-=-=-=-=-=-=-
+
+// Computes FK for a single pose and stores the result in the mirrored model
+// (device side updated, then synced back to the host).
+void computeForwardKinematics(
+        float *host_pose,
+        int poselen,
+        MirroredModel &robot,
+        MirroredVector<float2> *limits) {
+
+    float *device_pose;
+    cudaError_t mal_err = cudaMalloc((void**) &device_pose, sizeof(float) * poselen);
+    if (mal_err != cudaSuccess) {
+        std::cout << "Cuda malloc error:" << cudaGetErrorString(mal_err) << std::endl;
+        return;
+    }
+    cudaError_t mc_err1 = cudaMemcpy(device_pose, host_pose, sizeof(float) * poselen, cudaMemcpyHostToDevice);
+    if (mc_err1 != cudaSuccess) {
+        std::cout << "Cuda copy error1:" << cudaGetErrorString(mc_err1) << std::endl;
+        cudaFree(device_pose);
+        return;
+    }
+
+    robot.syncKinematicsHostToDevice();
+
+    // single pose -> a 1x1 launch suffices
+    gpu_computeForwardKinematics<<<1,1>>>(
+            1, 1,
+            poselen,
+            device_pose,
+            robot.getDeviceTransformsFrameToModel(),
+            limits->devicePtr(),
+            robot.getDeviceTransformsParentJointToFrame(),
+            robot.getDeviceFrameParents(),
+            robot.getNumFrames(),
+            robot.getDeviceJointTypes(),
+            robot.getDeviceJointAxes());
+
+    cudaFree(device_pose);
+    robot.syncKinematics();   // pull the updated transforms back to the host
+}
+
+// Computes FK for a batch of poses; returns an nposes-long array of pointers
+// into a single contiguous allocation (caller frees the returned pointer).
+SE3 **computeForwardKinematicsBatch(
+        float *host_poses,
+        int nposes,
+        int ndims,
+        MirroredModel &robot,
+        MirroredVector<float2> *limits,
+        bool BENCHMARK) {
+
+    clock_t s,e;
+    if (BENCHMARK) s = clock();
+
+    // round the batch up to a power of two so work divides evenly;
+    // NOTE(review): nblk/perthr are untuned magic numbers
+    int calc_num = std::pow(2, std::ceil(std::log(nposes)/std::log(2)));
+    int nblk = 16;
+    int perthr = 128;
+    int threadct = max(calc_num / perthr / nblk, 1);
+    int nframes = robot.getNumFrames();
+
+    float *device_poses;
+    cudaError_t mal_err = cudaMalloc((void**) &device_poses, sizeof(float) * ndims * nposes);
+    if (mal_err != cudaSuccess) {
+        std::cout << "Cuda malloc error:" << cudaGetErrorString(mal_err) << std::endl;
+        return NULL;
+    }
+    SE3 *device_results;
+    mal_err = cudaMalloc((void**) &device_results, sizeof(SE3) * nframes * nposes);
+    if (mal_err != cudaSuccess) {
+        std::cout << "Cuda malloc error:" << cudaGetErrorString(mal_err) << std::endl;
+        cudaFree(device_poses);
+        return NULL;
+    }
+    cudaMemset(device_results, 0, sizeof(SE3) * nframes * nposes);
+
+    if (BENCHMARK) {
+        e = clock();
+        std::cout << "CUDA Mallocs: " << e << " " << s << " " << (e-s) << std::endl;
+        s = clock();
+    }
+
+    cudaError_t mc_err1 = cudaMemcpy(device_poses, host_poses, sizeof(float) * ndims * nposes, cudaMemcpyHostToDevice);
+    if (mc_err1 != cudaSuccess) {
+        std::cout << "Cuda copy error1:" << cudaGetErrorString(mc_err1) << std::endl;
+        cudaFree(device_poses);
+        cudaFree(device_results);
+        return NULL;
+    }
+
+    if (BENCHMARK) {
+        e = clock();
+        std::cout << "Device setup: " << e << " " << s << " " << (e-s) << std::endl;
+        s = clock();
+    }
+
+    gpu_computeForwardKinematics<<<nblk,threadct>>>(
+            nposes,
+            perthr,
+            ndims,
+            device_poses,
+            device_results,
+            limits->devicePtr(),
+            robot.getDeviceTransformsParentJointToFrame(),
+            robot.getDeviceFrameParents(),
+            robot.getNumFrames(),
+            robot.getDeviceJointTypes(),
+            robot.getDeviceJointAxes());
+
+    if (BENCHMARK) {
+        e = clock();
+        std::cout << "Actual computation: " << e << " " << s << " " << (e-s) << std::endl;
+    }
+
+    // package results as a pointer table followed by the contiguous payload,
+    // mirroring computeKinematicsBatch (one free() releases everything)
+    SE3 **results = (SE3 **) malloc(sizeof(SE3*) * nposes + sizeof(SE3) * nframes * nposes);
+    SE3 *datapos = (SE3 *) &results[nposes];
+    for (int i = 0; i < nposes; ++i) {
+        results[i] = datapos + i * nframes;
+    }
+    cudaMemcpy(datapos, device_results, sizeof(SE3) * nframes * nposes, cudaMemcpyDeviceToHost);
+
+    // BUG FIX: both device buffers previously leaked
+    cudaFree(device_poses);
+    cudaFree(device_results);
+    return results;
+}
+
+// Computes FK and the inverse transforms for a batch of poses, leaving both
+// result arrays on the gpu (caller cudaFrees t_mfs and t_fms).
+void computeKinematicsBatchGPU(
+        float *host_poses,
+        int nposes,
+        int ndims,
+        MirroredModel &robot,
+        MirroredVector<float2> *limits,
+        SE3 *&t_mfs,
+        SE3 *&t_fms) {
+
+    int calc_num = std::pow(2, std::ceil(std::log(nposes)/std::log(2)));
+    int nblk = 16;
+    int perthr = 128;
+    int threadct = max(calc_num / perthr / nblk, 1);
+    int nframes = robot.getNumFrames();
+
+    float *device_poses;
+    cudaError_t mal_err = cudaMalloc((void**) &device_poses, sizeof(float) * ndims * nposes);
+    if (mal_err != cudaSuccess) {
+        std::cout << "Cuda malloc error:" << cudaGetErrorString(mal_err) << std::endl;
+        return;
+    }
+    SE3 *device_mfs;
+    mal_err = cudaMalloc((void**) &device_mfs, sizeof(SE3) * nframes * nposes);
+    if (mal_err != cudaSuccess) {
+        std::cout << "Cuda malloc error:" << cudaGetErrorString(mal_err) << std::endl;
+        cudaFree(device_poses);
+        return;
+    }
+    t_mfs = device_mfs;
+    cudaMemset(device_mfs, 0, sizeof(SE3) * nframes * nposes);
+
+    SE3 *device_fms;
+    mal_err = cudaMalloc((void**) &device_fms, sizeof(SE3) * nframes * nposes);
+    if (mal_err != cudaSuccess) {
+        std::cout << "Cuda malloc error:" << cudaGetErrorString(mal_err) << std::endl;
+        cudaFree(device_poses);
+        return;
+    }
+    t_fms = device_fms;
+    cudaMemset(device_fms, 0, sizeof(SE3) * nframes * nposes);
+
+    cudaError_t mc_err1 = cudaMemcpy(device_poses, host_poses, sizeof(float) * ndims * nposes, cudaMemcpyHostToDevice);
+    if (mc_err1 != cudaSuccess) {
+        std::cout << "Cuda copy error1:" << cudaGetErrorString(mc_err1) << std::endl;
+        cudaFree(device_poses);
+        return;
+    }
+
+    gpu_computeKinematics<<<nblk,threadct>>>(
+            nposes,
+            perthr,
+            ndims,
+            device_poses,
+            device_mfs,
+            device_fms,
+            limits->devicePtr(),
+            robot.getDeviceTransformsParentJointToFrame(),
+            robot.getDeviceFrameParents(),
+            robot.getNumFrames(),
+            robot.getDeviceJointTypes(),
+            robot.getDeviceJointAxes());
+
+    // BUG FIX: device_poses previously leaked; wait for the kernel before
+    // releasing the buffer it reads from
+    cudaDeviceSynchronize();
+    cudaFree(device_poses);
+}
+
+// Host-side variant: copies both transform batches back into a single
+// allocation per array (pointer table + payload); caller frees t_mfs/t_fms.
+void computeKinematicsBatch(
+        float *host_poses,
+        int nposes,
+        int ndims,
+        MirroredModel &robot,
+        MirroredVector<float2> *limits,
+        SE3 **&t_mfs,
+        SE3 **&t_fms) {
+
+    int nframes = robot.getNumFrames();
+    t_mfs = (SE3 **) malloc(sizeof(SE3*) * nposes + sizeof(SE3) * nframes * nposes);
+    t_fms = (SE3 **) malloc(sizeof(SE3*) * nposes + sizeof(SE3) * nframes * nposes);
+    SE3 *mfs_pos = (SE3 *) &t_mfs[nposes];
+    for (int i = 0; i < nposes; ++i) {
+        t_mfs[i] = mfs_pos + i * nframes;
+    }
+    SE3 *fms_pos = (SE3 *) &t_fms[nposes];
+    for (int i = 0; i < nposes; ++i) {
+        t_fms[i] = fms_pos + i * nframes;
+    }
+
+    SE3 *device_mfs;
+    SE3 *device_fms;
+    computeKinematicsBatchGPU(host_poses, nposes, ndims, robot, limits, device_mfs, device_fms);
+    cudaMemcpy(mfs_pos, device_mfs, sizeof(SE3) * nframes * nposes, cudaMemcpyDeviceToHost);
+    cudaMemcpy(fms_pos, device_fms, sizeof(SE3) * nframes * nposes, cudaMemcpyDeviceToHost);
+    cudaFree(device_mfs);
+    cudaFree(device_fms);
+}
+
+}
diff --git a/src/optimization/kernels/kinematics.h b/src/optimization/kernels/kinematics.h
new file mode 100644
index 0000000..5336878
--- /dev/null
+++ b/src/optimization/kernels/kinematics.h
@@ -0,0 +1,67 @@
+#ifndef GPUKINEMATICS_H
+#define GPUKINEMATICS_H
+
+#include <vector>
+
+#include <cuda_runtime.h>
+#include "geometry/grid_3d.h"
+#include "geometry/SE3.h"
+#include "model/mirrored_model.h"
+#include "util/dart_types.h"
+
+namespace dart {
+
+// Computes forward kinematics for a single joint configuration on the GPU
+// and puts the result in the mirrored model passed in.
+void computeForwardKinematics(
+        float *pose,
+        int poselen,
+        MirroredModel &robot,
+        MirroredVector<float2> *limits);
+
+// poses is a contiguous block of 1d arrays (floats 0..poselen-1 are pose 1,
+// poselen..2*poselen-1 are pose 2, ... up to nposes).
+// benchmark turns on benchmark logging.
+// Returns a 2D array of transforms; the caller is responsible for freeing the
+// result (a single free()) as well as the array of poses passed in.
+SE3 **computeForwardKinematicsBatch(float *poses, int nposes,
+                                    int poselen,
+                                    MirroredModel &robot,
+                                    MirroredVector<float2> *limits,
+                                    bool benchmark=false);
+
+// Computes both forward and inverse kinematics: FK in t_mfs, inverse
+// transforms in t_fms.
+// WARNING: the caller is responsible for freeing t_mfs and t_fms once they
+// are no longer needed.
+void computeKinematicsBatch(
+        float *poses,
+        int nposes,
+        int poselen,
+        MirroredModel &robot,
+        MirroredVector<float2> *limits,
+        SE3 **&t_mfs,
+        SE3 **&t_fms);
+
+// Same as above except the pointers passed in are set to the gpu-side results
+// and the gpu buffers are not freed here.
+// Note that the type is SE3*: the memory is one contiguous block of results
+// in pose order —
+//   pose 0 is at t_mfs / t_fms,
+//   pose 1 is at t_mfs + numframes,
+//   etc...
+// WARNING: the caller must eventually cudaFree t_mfs and t_fms.
+void computeKinematicsBatchGPU(
+        float *poses,
+        int nposes,
+        int poselen,
+        MirroredModel &robot,
+        MirroredVector<float2> *limits,
+        SE3 *&t_mfs,
+        SE3 *&t_fms);
+
+}
+
+#endif // GPUKINEMATICS_H