From 33f4beb0bcd838aa11ce8cbbda484e2008d69612 Mon Sep 17 00:00:00 2001
From: Anoop Saldanha <poonaatsoc@gmail.com>
Date: Fri, 28 May 2010 16:22:07 +0530
Subject: [PATCH] batching of packets support for cuda b2g mpm. Supported for
 both 32 and 64 bit platforms

---
 src/Makefile.am                 |    4 +-
 src/cuda-packet-batcher.c       | 1235 ++++++++++++
 src/cuda-packet-batcher.h       |  139 ++
 src/data-queue.c                |   93 +
 src/data-queue.h                |   64 +
 src/decode.h                    |   92 +-
 src/detect-engine-mpm.c         |   50 +-
 src/detect-engine.c             |   41 -
 src/detect.c                    |    1 +
 src/runmodes.c                  |  114 +-
 src/suricata.c                  |   15 +
 src/suricata.h                  |    3 +
 src/tm-modules.h                |    1 +
 src/tm-queues.c                 |    2 +
 src/tm-queues.h                 |    2 +
 src/tm-threads.c                |   59 +-
 src/tm-threads.h                |   28 +
 src/tmqh-simple.c               |   42 +-
 src/tmqh-simple.h               |    7 +-
 src/util-cuda-handlers.c        |  213 +-
 src/util-cuda-handlers.h        |    2 +
 src/util-mpm-b2g-cuda-kernel.cu |   77 +-
 src/util-mpm-b2g-cuda.c         | 3355 ++++++++++++++-----------------
 src/util-mpm-b2g-cuda.h         |   34 +-
 24 files changed, 3529 insertions(+), 2144 deletions(-)
 create mode 100644 src/cuda-packet-batcher.c
 create mode 100644 src/cuda-packet-batcher.h
 create mode 100644 src/data-queue.c
 create mode 100644 src/data-queue.h

diff --git a/src/Makefile.am b/src/Makefile.am
index 9dece0c953..c40cbacb4f 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -8,6 +8,7 @@ bin_PROGRAMS = suricata
 suricata_SOURCES = suricata.c suricata.h \
 runmodes.c runmodes.h \
 packet-queue.c packet-queue.h \
+data-queue.c data-queue.h \
 threads.c threads.h \
 source-nfq.c source-nfq.h \
 source-pcap.c source-pcap.h \
@@ -211,7 +212,8 @@ win32-misc.c win32-misc.h \
 win32-service.c win32-service.h \
 util-action.c util-action.h \
 win32-syslog.h \
-util-profiling.c util-profiling.h
+util-profiling.c util-profiling.h \
+cuda-packet-batcher.c cuda-packet-batcher.h
 
 # set the include path found by configure
 INCLUDES= $(all_includes)
diff --git a/src/cuda-packet-batcher.c b/src/cuda-packet-batcher.c
new file mode 100644
index 0000000000..0e8c7fa8b4
--- /dev/null
+++ b/src/cuda-packet-batcher.c
@@ -0,0 +1,1235 @@
+/**
+ * Copyright (c) 2010 Open Information Security Foundation.
+ *
+ * \author Anoop Saldanha <poonaatsoc@gmail.com>
+ *
+ * \todo Some work yet to be done in this file.  Firstly change the way we get
+ *       the sgh.  Once we implement the retrieval of sghs from the flow, as
+ *       suggested by victor, we can get rid of the sgh retrieval here.
+ *       Make the various parameters user configurable.  Terribly hard-coded now.
+ */
+
+/* compile in, only if we have a CUDA enabled on this machine */
+#ifdef __SC_CUDA_SUPPORT__
+
+#include "suricata-common.h"
+#include "suricata.h"
+
+#include "detect.h"
+#include "decode.h"
+#include "flow.h"
+#include "data-queue.h"
+
+#include "threads.h"
+#include "threadvars.h"
+#include "tm-queuehandlers.h"
+#include "tm-modules.h"
+
+#include "cuda-packet-batcher.h"
+#include "conf.h"
+
+#include "util-error.h"
+#include "util-debug.h"
+#include "util-unittest.h"
+
+#include "util-mpm-b2g-cuda.h"
+#include "detect-engine-address.h"
+#include "detect-engine-port.h"
+#include "detect-engine.h"
+#include "detect-parse.h"
+#include "tm-threads.h"
+#include "tmqh-packetpool.h"
+
+/* \todo Make this user configurable through our yaml file.  Also provide options
+ * where this can be dynamically updated based on the traffic */
+#define SC_CUDA_PB_BATCHER_ALARM_TIME 1
+
+/* holds the inq and outq between the cuda-packet-batcher TM and the cuda b2g mpm
+ * dispatcher thread */
+static Tmq *tmq_inq = NULL;
+static Tmq *tmq_outq = NULL;
+
+/* holds the packet inq between the batcher TM and, the TM feeding it packets
+ * in the runmode sequence.  We will need this to implement the alarm.  We will
+ * have a SIG_ALRM delivered every SC_CUDA_PB_BATCHER_ALARM_TIME seconds, after
+ * which we willf set a flag informing the batcher TM to queue the buffer to the
+ * GPU and wake the batcher thread, in case it is waiting on a conditional for a
+ * packet from the previous TM in the runmode */
+static Tmq *tmq_batcher_inq = NULL;
+
+/* used to indicate if we want to stop buffering the packets anymore.  We
+ * we will need this while we want to shut the engine down
+ * \todo give a better description */
+static int run_batcher = 1;
+
+/* indicates the maximum no of packets we are ready to buffer.  Theoretically the
+ * maximum value held by this var can't exceed the value held by
+ * "max_pending_packets".  Either ways we should make this user configurable like
+ * SC_CUDA_PB_BATCHER_ALARM_TIME.  Also allow dynamic updates to this value based
+ * on the traffic
+ * \todo make this user configurable, as well allow dynamic update of this
+ * variable based on the traffic seen */
+static uint32_t buffer_packet_threshhold = 1280;
+
+/* flag used by the SIG_ALRM handler to indicate that the batcher TM should queue
+ * the buffer to be processed by the Cuda Mpm B2g Batcher Thread for further
+ * processing on the GPU */
+static int queue_buffer = 0;
+
+/**
+ * \internal
+ * \brief The SIG_ALRM handler.  We will set the "queue_buffer" flag thus
+ *        informing the batcher TM that it needs to queue the buffer.  We
+ *        also signal the cond var for the batcher TM inq(the one it
+ *        receives packets from), incase it is waiting on the conditional
+ *        for a new packet from the previous TM in the runmodes list.
+ *
+ * \param signum The signal number that this function just woke up to.  In
+ *               our case it is SIG_ALRM.
+ */
+static void SCCudaPBSetQueueBufferFlag(int signum)
+{
+    SCLogDebug("Cuda Packet Batche alarm generated after %d seconds.  Set the"
+               "queue_buffer flag and signal the cuda TM inq.",
+               SC_CUDA_PB_BATCHER_ALARM_TIME);
+    //queue_buffer = 1;
+    //SCCondSignal(&((&trans_q[tmq_batcher_inq->id])->cond_q));
+
+    return;
+}
+
+/**
+ * \internal.
+ * \brief Set the SIG_ALRM handler
+ */
+static void SCCudaPBSetBatcherAlarmTimeHandler()
+{
+    struct sigaction action;
+
+    SCLogDebug("Setting the SIGALRM handler for the Cuda Batcher TM");
+    action.sa_handler = SCCudaPBSetQueueBufferFlag;
+    sigemptyset(&(action.sa_mask));
+    sigaddset(&(action.sa_mask), SIGALRM);
+    action.sa_flags = 0;
+    sigaction(SIGALRM, &action, 0);
+
+    return;
+}
+
+/**
+ * \internal
+ * \brief Used to retrieve the Signature Group Head for a packet.
+ *
+ * \param de_ctx Pointer the detection engine context to search for the
+ *               sgh for an incoming packet.
+ * \param p      Pointer to the incoming packet for which we will have to
+ *               search for a sgh.
+ *
+ * \retval sgh Pointer to the relevant matching sgh for the Packet.
+ */
+static SigGroupHead *SCCudaPBGetSgh(DetectEngineCtx *de_ctx, Packet *p)
+{
+    int ds, f;
+    SigGroupHead *sgh = NULL;
+
+    /* select the dsize_gh */
+    if (p->payload_len <= 100)
+        ds = 0;
+    else
+        ds = 1;
+
+    /* select the flow_gh */
+    if (p->flowflags & FLOW_PKT_TOCLIENT)
+        f = 0;
+    else
+        f = 1;
+
+    /* find the right mpm instance */
+    DetectAddress *ag = DetectAddressLookupInHead(de_ctx->dsize_gh[ds].flow_gh[f].src_gh[p->proto], &p->src);
+    if (ag != NULL) {
+        /* source group found, lets try a dst group */
+        ag = DetectAddressLookupInHead(ag->dst_gh,&p->dst);
+        if (ag != NULL) {
+            if (ag->port == NULL) {
+                SCLogDebug("we don't have ports");
+                sgh = ag->sh;
+            } else {
+                SCLogDebug("we have ports");
+
+                DetectPort *sport = DetectPortLookupGroup(ag->port,p->sp);
+                if (sport != NULL) {
+                    DetectPort *dport = DetectPortLookupGroup(sport->dst_ph, p->dp);
+                    if (dport != NULL) {
+                        sgh = dport->sh;
+                    } else {
+                        SCLogDebug("no dst port group found for the packet with dp %"PRIu16, p->dp);
+                    }
+                } else {
+                    SCLogDebug("no src port group found for the packet with sp %"PRIu16, p->sp);
+                }
+            }
+        } else {
+            SCLogDebug("no dst address group found for the packet");
+        }
+    } else {
+        SCLogDebug("no src address group found for the packet");
+    }
+
+    return sgh;
+}
+
+/**
+ * \internal
+ * \brief Handles the queuing of the buffer from this batcher TM to the cuda
+ *        mpm b2g dispatcher TM.
+ *
+ * \tctx The batcher thread context that holds the current operational buffer
+ *       which has to be buffered by this function.
+ */
+static void SCCudaPBQueueBuffer(SCCudaPBThreadCtx *tctx)
+{
+    SCCudaPBPacketsBuffer *pb = (SCCudaPBPacketsBuffer *)tctx->curr_pb;
+    uint32_t nop_in_buffer = pb->nop_in_buffer;
+    uint32_t *packets_offset_buffer = pb->packets_offset_buffer;
+    uint32_t offset = *(packets_offset_buffer + nop_in_buffer - 1);
+    SCCudaPBPacketDataForGPU *last_packet = (SCCudaPBPacketDataForGPU *)(pb->packets_buffer +
+                                                                         offset);
+
+    /* if we have no packets buffered in so far, get out */
+    if (pb->nop_in_buffer == 0) {
+        SCLogDebug("No packets buffered in so far in the cuda buffer.  Returning");
+        return;
+    }
+
+    /* calculate the total length of all the packets buffered in */
+    pb->packets_buffer_len = pb->packets_offset_buffer[pb->nop_in_buffer - 1] +
+        sizeof(SCCudaPBPacketDataForGPUNonPayload) +
+        last_packet->payload_len;
+
+    pb->packets_total_payload_len = pb->packets_payload_offset_buffer[pb->nop_in_buffer - 1] +
+        last_packet->payload_len;
+
+    /* enqueue the buffer in the outq to be consumed by the dispatcher TM */
+    SCDQDataQueue *dq_outq = &data_queues[tmq_outq->id];
+    SCMutexLock(&dq_outq->mutex_q);
+    SCDQDataEnqueue(dq_outq, (SCDQGenericQData *)tctx->curr_pb);
+    SCCondSignal(&dq_outq->cond_q);
+    SCMutexUnlock(&dq_outq->mutex_q);
+
+    while (run_batcher) {
+        /* dequeue a new buffer */
+        SCDQDataQueue *dq_inq = &data_queues[tmq_inq->id];
+        SCMutexLock(&dq_inq->mutex_q);
+        if (dq_inq->len == 0) {
+            /* if we have no data in queue, wait... */
+            SCondWait(&dq_inq->cond_q, &dq_inq->mutex_q);
+        }
+
+        if (run_batcher == 0) {
+            break;
+        }
+
+        if (dq_inq->len > 0) {
+            tctx->curr_pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(dq_inq);
+            tctx->curr_pb->nop_in_buffer = 0;
+            tctx->curr_pb->packets_buffer_len = 0;
+            tctx->curr_pb->packets_total_payload_len = 0;
+            SCMutexUnlock(&dq_inq->mutex_q);
+            SCLogDebug("Dequeued a new packet buffer for the cuda batcher TM");
+            break;
+        } else {
+            /* Should only happen on signals. */
+            SCMutexUnlock(&dq_inq->mutex_q);
+            SCLogDebug("Unable to Relooping in the quest to dequeue new buffer\n");
+        }
+    } /* while (run_batcher) */
+
+    return;
+}
+
+/**
+ * \brief Custom slot function used by the Batcher TM.
+ *
+ * \param td Pointer to the ThreadVars instance.  In this case the batcher TM's
+ *           ThreadVars instance.
+ */
+void *SCCudaPBTmThreadsSlot1(void *td)
+{
+    ThreadVars *tv = (ThreadVars *)td;
+    Tm1Slot *s = (Tm1Slot *)tv->tm_slots;
+    Packet *p = NULL;
+    char run = 1;
+    TmEcode r = TM_ECODE_OK;
+
+    /* Set the thread name */
+    SCSetThreadName(tv->name);
+
+    if (tv->thread_setup_flags != 0) {
+        TmThreadSetupOptions(tv);
+    }
+
+    SCLogDebug("%s starting", tv->name);
+
+    if (s->s.SlotThreadInit != NULL) {
+        r = s->s.SlotThreadInit(tv, s->s.slot_initdata, &s->s.slot_data);
+        if (r != TM_ECODE_OK) {
+            EngineKill();
+
+            TmThreadsSetFlag(tv, THV_CLOSED);
+            pthread_exit((void *) -1);
+        }
+    }
+    memset(&s->s.slot_pq, 0, sizeof(PacketQueue));
+
+    TmThreadsSetFlag(tv, THV_INIT_DONE);
+    while(run) {
+        TmThreadTestThreadUnPaused(tv);
+
+        /* input a packet */
+        p = tv->tmqh_in(tv);
+
+        if (p == NULL) {
+            printf("packet is NULL for TM: %s\n", tv->name);
+            /* the only different between the actual Slot1 function in
+             * tm-threads.c and this custom Slot1 function is this call
+             * here.  We need to make the call here, even if we don't
+             * receive a packet from the previous stage in the runmodes.
+             * This is needed in cases where we the SIG_ALRM handler
+             * wants us to queue the buffer to the GPU and ends up waking
+             * the Batcher TM(which is waiting on a cond from the previous
+             * feeder TM).  Please handler the NULL packet case in the
+             * function that you now call */
+            r = s->s.SlotFunc(tv, p, s->s.slot_data, &s->s.slot_pq);
+        } else {
+            r = s->s.SlotFunc(tv, p, s->s.slot_data, &s->s.slot_pq);
+            /* handle error */
+            if (r == TM_ECODE_FAILED) {
+                TmqhReleasePacketsToPacketPool(&s->s.slot_pq);
+                TmqhOutputPacketpool(tv, p);
+                TmThreadsSetFlag(tv, THV_FAILED);
+                break;
+            }
+
+            /* output the packet */
+            tv->tmqh_out(tv, p);
+        }
+
+        if (TmThreadsCheckFlag(tv, THV_KILL)) {
+            SCPerfUpdateCounterArray(tv->sc_perf_pca, &tv->sc_perf_pctx, 0);
+            run = 0;
+        }
+    }
+
+    if (s->s.SlotThreadExitPrintStats != NULL) {
+        s->s.SlotThreadExitPrintStats(tv, s->s.slot_data);
+    }
+
+    if (s->s.SlotThreadDeinit != NULL) {
+        r = s->s.SlotThreadDeinit(tv, s->s.slot_data);
+        if (r != TM_ECODE_OK) {
+            TmThreadsSetFlag(tv, THV_CLOSED);
+            pthread_exit((void *) -1);
+        }
+    }
+
+    SCLogDebug("%s ending", tv->name);
+    TmThreadsSetFlag(tv, THV_CLOSED);
+    pthread_exit((void *) 0);
+}
+
+/**
+ * \brief Used to de-allocate an instance of SCCudaPBPacketsBuffer.
+ *
+ * \param pb Pointer to the SCCudaPacketsBuffer instance to be de-alloced.
+ */
+void SCCudaPBDeAllocSCCudaPBPacketsBuffer(SCCudaPBPacketsBuffer *pb)
+{
+    if (pb == NULL)
+        return;
+
+    if (pb->packets_buffer != NULL)
+        free(pb->packets_buffer);
+    if (pb->packets_offset_buffer != NULL)
+        free(pb->packets_offset_buffer);
+    if (pb->packets_payload_offset_buffer != NULL)
+        free(pb->packets_payload_offset_buffer);
+    if (pb->packets_address_buffer != NULL)
+        free(pb->packets_address_buffer);
+
+    free(pb);
+
+    return;
+}
+
+/**
+ * \brief Allocates a new instance of SCCudaPBPacketsBuffer.
+ *
+ * \param pb The newly created instance of SCCudaPBPacketsBuffer.
+ */
+SCCudaPBPacketsBuffer *SCCudaPBAllocSCCudaPBPacketsBuffer(void)
+{
+    SCCudaPBPacketsBuffer *pb = malloc(sizeof(SCCudaPBPacketsBuffer));
+    if (pb == NULL) {
+        SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
+        exit(EXIT_FAILURE);
+    }
+    memset(pb, 0, sizeof(SCCudaPBPacketsBuffer));
+
+    /* the buffer for the packets to be sent over to the gpu.  We allot space for
+     * a minimum of SC_CUDA_PB_MIN_NO_OF_PACKETS, i.e. if each packet buffered
+     * is full to the brim */
+    pb->packets_buffer = malloc(sizeof(SCCudaPBPacketDataForGPU) *
+                                SC_CUDA_PB_MIN_NO_OF_PACKETS);
+    if (pb->packets_buffer == NULL) {
+        SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
+        exit(EXIT_FAILURE);
+    }
+    memset(pb->packets_buffer, 0, sizeof(SCCudaPBPacketDataForGPU) *
+           SC_CUDA_PB_MIN_NO_OF_PACKETS);
+
+    /* used to hold the offsets of the buffered packets in the packets_buffer */
+    pb->packets_offset_buffer = malloc(sizeof(uint32_t) *
+                                       SC_CUDA_PB_MIN_NO_OF_PACKETS);
+    if (pb->packets_offset_buffer == NULL) {
+        SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
+        exit(EXIT_FAILURE);
+    }
+    memset(pb->packets_offset_buffer, 0, sizeof(uint32_t) *
+           SC_CUDA_PB_MIN_NO_OF_PACKETS);
+
+    /* used to hold the offsets of the packets payload */
+    pb->packets_payload_offset_buffer = malloc(sizeof(uint32_t) *
+                                               SC_CUDA_PB_MIN_NO_OF_PACKETS);
+    if (pb->packets_payload_offset_buffer == NULL) {
+        SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
+        exit(EXIT_FAILURE);
+    }
+    memset(pb->packets_payload_offset_buffer, 0, sizeof(uint32_t) *
+           SC_CUDA_PB_MIN_NO_OF_PACKETS);
+
+    /* used to hold the packet addresses for all the packets buffered inside
+     * packets_buffer */
+    pb->packets_address_buffer = malloc(sizeof(Packet *) *
+                                        SC_CUDA_PB_MIN_NO_OF_PACKETS);
+    if (pb->packets_address_buffer == NULL) {
+        SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
+        exit(EXIT_FAILURE);
+    }
+    memset(pb->packets_address_buffer, 0, sizeof(Packet *) *
+           SC_CUDA_PB_MIN_NO_OF_PACKETS);
+
+    return pb;
+}
+
+/**
+ * \brief Registration function for the Cuda Packet Batcher TM.
+ */
+void TmModuleCudaPacketBatcherRegister(void)
+{
+    tmm_modules[TMM_CUDA_PACKET_BATCHER].name = "CudaPacketBatcher";
+    tmm_modules[TMM_CUDA_PACKET_BATCHER].ThreadInit = SCCudaPBThreadInit;
+    tmm_modules[TMM_CUDA_PACKET_BATCHER].Func = SCCudaPBBatchPackets;
+    tmm_modules[TMM_CUDA_PACKET_BATCHER].ThreadExitPrintStats = SCCudaPBThreadExitStats;
+    tmm_modules[TMM_CUDA_PACKET_BATCHER].ThreadDeinit = SCCudaPBThreadDeInit;
+    tmm_modules[TMM_CUDA_PACKET_BATCHER].RegisterTests = SCCudaPBRegisterTests;
+
+    return;
+}
+
+/**
+ * \brief The cuda batcher TM init function.
+ *
+ * \param tv       The cuda packet batcher TM ThreadVars instance.
+ * \param initdata The initialization data needed by this cuda batcher TM.
+ * \param data     Pointer to a ponter memory location that would be updated
+ *                 with the newly created thread ctx instance.
+ *
+ * \retval TM_ECODE_OK     On success.
+ * \retval TM_ECODE_FAILED On failure.
+ */
+TmEcode SCCudaPBThreadInit(ThreadVars *tv, void *initdata, void **data)
+{
+    SCCudaPBThreadCtx *tctx = NULL;
+
+    if (initdata == NULL) {
+        SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument.  initdata NULL "
+                   "for the cuda batcher TM init thread function");
+        return TM_ECODE_FAILED;
+    }
+
+    tctx = malloc(sizeof(SCCudaPBThreadCtx));
+    if (tctx == NULL) {
+        SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
+        exit(EXIT_FAILURE);
+    }
+    memset(tctx, 0, sizeof(SCCudaPBThreadCtx));
+
+    /* the detection engine context.  We will need it to retrieve the sgh,
+     * when we start receiving and batching the packets */
+    tctx->de_ctx = initdata;
+
+    /* the first packet buffer from the queue */
+    tctx->curr_pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(&data_queues[tmq_inq->id]);
+
+    *data = tctx;
+
+    /* we will need the cuda packet batcher TM's inq for further use later.  Read
+     * the comments associated with this var definition, for its use */
+    tmq_batcher_inq = tv->inq;
+
+    /* set the SIG_ALRM handler */
+    SCCudaPBSetBatcherAlarmTimeHandler();
+
+    /* Set the alarm time limit during which the batcher thread would buffer packets */
+    alarm(SC_CUDA_PB_BATCHER_ALARM_TIME);
+
+    return TM_ECODE_OK;
+}
+
+/**
+ * \brief Batches packets into the packets buffer.
+ *
+ * \param tv   Pointer to the ThreadVars instance, in this case the cuda packet
+ *             batcher TM's TV instance.
+ * \param p    Pointer the the packet to be buffered.
+ * \param data Pointer the the batcher TM thread ctx.
+ * \param pq   Pointer to the packetqueue.  We don't need this.
+ *
+ * \retval TM_ECODE_OK     On success.
+ * \retval TM_ECODE_FAILED On failure.
+ */
+TmEcode SCCudaPBBatchPackets(ThreadVars *tv, Packet *p, void *data, PacketQueue *pq)
+{
+#define ALIGN_UP(offset, alignment) \
+    (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
+
+    /* ah.  we have been signalled that we crossed the time limit within which we
+     * need to buffer packets.  Let us queue the buffer to the GPU */
+    if (queue_buffer) {
+        SCLogDebug("Cuda packet buffer TIME limit exceeded.  Buffering packet "
+                   "buffer and reseting the alarm");
+        queue_buffer = 0;
+        SCCudaPBQueueBuffer(data);
+        alarm(SC_CUDA_PB_BATCHER_ALARM_TIME);
+    }
+
+    /* this is possible, since we are using a custom slot function that calls this
+     * function, even if it receives no packet from the packet queue */
+    if (p == NULL) {
+        SCLogDebug("packet NULL inside Cuda batcher TM");
+        return TM_ECODE_OK;
+    }
+
+    /* we set it for every incoming packet.  We will set this depending on whether
+     * we end up buffering the packet or not */
+    p->cuda_mpm_enabled = 0;
+
+    SCCudaPBThreadCtx *tctx = data;
+    /* the packets buffer */
+    SCCudaPBPacketsBuffer *pb = (SCCudaPBPacketsBuffer *)tctx->curr_pb;
+    /* the previous packet which has been buffered into the packets_buffer */
+    SCCudaPBPacketDataForGPU *prev_buff_packet = NULL;
+    /* holds the position in the packets_buffer where the curr packet would
+     * be buffered in */
+    SCCudaPBPacketDataForGPU *curr_packet = NULL;
+    /* the sgh to which the incoming packet belongs */
+    SigGroupHead *sgh = NULL;
+
+    /* get the signature group head to which this packet belongs.  If it belongs
+     * to no sgh, we don't need to buffer this packet.
+     * \todo Get rid of this, once we get the sgh from the flow */
+    sgh = SCCudaPBGetSgh(tctx->de_ctx, p);
+    if (sgh == NULL) {
+        SCLogDebug("No SigGroupHead match for this packet");
+        return TM_ECODE_OK;
+    }
+
+    /* if the payload is less than the maximum content length in this sgh we
+     * don't need to run the PM on this packet.  Chuck the packet out */
+    if (sgh->mpm_content_maxlen > p->payload_len) {
+        SCLogDebug("not mpm-inspecting as pkt payload is smaller than "
+                   "the largest content length we need to match");
+        return TM_ECODE_OK;
+    }
+
+    /* if one of these conditions fail we don't have to run the mpm on this
+     * packet.  Firstly if the payload_len is == 0, we don't have a payload
+     * to match against.  Next if we don't have a mpm_context against this
+     * sgh, indicating we don't have any patterns in this sgh, again we don't
+     * have anything to run the PM against.  Finally if the flow doesn't want
+     * to analyze packets for this flow, we can chuck this packet out as well */
+    if ( !(p->payload_len > 0 && sgh->mpm_ctx != NULL &&
+           !(p->flags & PKT_NOPAYLOAD_INSPECTION)) ) {
+        SCLogDebug("Either p->payload_len <= 0 or mpm_ctx for the packet is NULL "
+                   "or PKT_NOPAYLOAD_INSPECTION set for this packet");
+        return TM_ECODE_OK;
+    }
+
+    /* the cuda b2g context */
+    B2gCudaCtx *ctx = sgh->mpm_ctx->ctx;
+
+    /* if we have a 1 byte search kernel set we don't buffer this packet for
+     * cuda matching and instead run this non-cuda mpm function to be run on
+     * the packet */
+    if (ctx->Search == B2gCudaSearch1) {
+        SCLogDebug("The packet has a one byte patterns.  run mpm "
+                   "separately");
+        return TM_ECODE_OK;
+    }
+
+#ifdef B2G_CUDA_SEARCH2
+    /* if we have a 2 byte search kernel set we don't buffer this packet for
+     * cuda matching and instead run this non-cuda mpm function to be run on the
+     * packet */
+    if (ctx->Search == B2gCudaSearch2) {
+        SCLogDebug("The packet has two byte patterns.  run mpm "
+                   "separately");
+        return TM_ECODE_OK;
+    }
+#endif
+
+    /* we have passed all the criterions for buffering the packet.  Set the
+     * flag indicating that the packet goes through cuda mpm */
+    p->cuda_mpm_enabled = 1;
+
+    /* first packet to be buffered in */
+    if (pb->nop_in_buffer == 0) {
+        curr_packet = (SCCudaPBPacketDataForGPU *)pb->packets_buffer;
+
+    /* buffer is not empty */
+    } else {
+        prev_buff_packet = (SCCudaPBPacketDataForGPU *)(pb->packets_buffer +
+                                                        pb->packets_offset_buffer[pb->nop_in_buffer - 1]);
+        curr_packet = (SCCudaPBPacketDataForGPU *)((uint8_t *)prev_buff_packet +
+                                                   sizeof(SCCudaPBPacketDataForGPUNonPayload) +
+                                                   prev_buff_packet->payload_len) ;
+        int diff = (int)((uint8_t *)curr_packet - pb->packets_buffer);
+        /* \todo Feel it is the wrong option taken by nvidia by setting CUdeviceptr
+         * to unsigned int.  Keep this option for now.  We will get back to this
+         * once nvidia responds to the filed bug */
+        ALIGN_UP(diff, sizeof(CUdeviceptr));
+        curr_packet = (SCCudaPBPacketDataForGPU *)(pb->packets_buffer + diff);
+    }
+
+    /* store the data in the packets_buffer for this packet, which would be passed
+     * over to the GPU for processing */
+    curr_packet->m = ((B2gCudaCtx *)(sgh->mpm_ctx->ctx))->m;
+    curr_packet->table = ((B2gCudaCtx *)(sgh->mpm_ctx->ctx))->cuda_B2G;
+    curr_packet->payload_len = p->payload_len;
+    memcpy(curr_packet->payload, p->payload, p->payload_len);
+
+    /* store the address of the packet just buffered at the same index.  The
+     * dispatcher thread will need this address to communicate the results back
+     * to the packet */
+    pb->packets_address_buffer[pb->nop_in_buffer] = p;
+
+    /* if it is the first packet to be buffered, the offset is 0.  If it is not,
+     * then take the offset for the buffer from curr_packet */
+    if (pb->nop_in_buffer == 0) {
+        pb->packets_offset_buffer[pb->nop_in_buffer] = 0;
+        pb->packets_payload_offset_buffer[pb->nop_in_buffer] = 0;
+    } else {
+        pb->packets_offset_buffer[pb->nop_in_buffer] = (uint8_t *)curr_packet - pb->packets_buffer;
+        pb->packets_payload_offset_buffer[pb->nop_in_buffer] =
+            pb->packets_payload_offset_buffer[pb->nop_in_buffer - 1] +
+            prev_buff_packet->payload_len;
+    }
+
+    /* indicates the no of packets added so far into the buffer */
+    pb->nop_in_buffer++;
+
+    /* we have hit the threshhold for the total no of packets held in the buffer.
+     * We will change this in the future, instead relying on the remaining space
+     * left in the buffer or we have been informed that we have hit the time limit
+     * to queue the buffer */
+    if ( (pb->nop_in_buffer == buffer_packet_threshhold) || queue_buffer) {
+        queue_buffer = 0;
+        SCLogDebug("Either we have hit the threshold limit for packets(i.e.) we "
+                   "have %d packets limit) OR we have exceeded the buffering "
+                   "time limit.  Buffering the packet buffer and reseting the "
+                   "alarm.", buffer_packet_threshhold);
+        SCCudaPBQueueBuffer(tctx);
+        alarm(SC_CUDA_PB_BATCHER_ALARM_TIME);
+    }
+
+    return TM_ECODE_OK;
+}
+
+void SCCudaPBThreadExitStats(ThreadVars *tv, void *data)
+{
+    return;
+}
+
+/**
+ * \brief The thread de-init function for the cuda packet batcher TM.
+ *
+ * \param tv   Pointer to the cuda packet batcher TM ThreadVars instance.
+ * \param data Pointer the the Thread ctx for the cuda packet batcher TM.
+ *
+ * \retval TM_ECODE_OK     On success.
+ * \retval TM_ECODE_FAILED On failure.  Although we won't be returning this here.
+ */
+TmEcode SCCudaPBThreadDeInit(ThreadVars *tv, void *data)
+{
+    SCCudaPBThreadCtx *tctx = data;
+
+    if (tctx != NULL) {
+        if (tctx->curr_pb != NULL) {
+            SCCudaPBDeAllocSCCudaPBPacketsBuffer(tctx->curr_pb);
+            tctx->curr_pb = NULL;
+        }
+        free(tctx);
+    }
+
+    return TM_ECODE_OK;
+}
+
+/**
+ * \brief Sets up the queues and buffers needed by the cuda batcher TM function.
+ */
+void SCCudaPBSetUpQueuesAndBuffers(void)
+{
+    /* the b2g dispatcher thread would have to use the reverse for incoming
+     * and outgoing queues */
+    char *inq_name = "cuda_batcher_mpm_inqueue";
+    char *outq_name = "cuda_batcher_mpm_outqueue";
+    int i = 0;
+
+    /* set the incoming queue for the cuda_packet_batcher TM and the cuda B2g
+     * dispatcher */
+    tmq_inq = TmqGetQueueByName(inq_name);
+    if (tmq_inq == NULL) {
+        tmq_inq = TmqCreateQueue(inq_name);
+        if (tmq_inq == NULL) {
+            return;
+        }
+    }
+    tmq_inq->reader_cnt++;
+    tmq_inq->writer_cnt++;
+
+    /* set the outgoing queue from the cuda_packet_batcher TM and the cuda B2g
+     * dispatcher */
+    tmq_outq = TmqGetQueueByName(outq_name);
+    if (tmq_outq == NULL) {
+        tmq_outq = TmqCreateQueue(outq_name);
+        if (tmq_outq == NULL) {
+            return;
+        }
+    }
+    tmq_outq->reader_cnt++;
+    tmq_outq->writer_cnt++;
+
+    /* allocate the packet buffer */
+    /* \todo need to work out the right no of packet buffers that we need to
+     * queue.  I doubt we will need more than 4(as long as we don't run it on
+     * low traffic line).  We don't want to get into the business of creating
+     * new ones, when we run out of buffers, since malloc for a huge chunk
+     * like this will take time.  We need to figure out a value based on
+     * various other parameters like alarm time and buffer threshold value */
+    for (i = 0; i < 10; i++) {
+        SCCudaPBPacketsBuffer *pb = SCCudaPBAllocSCCudaPBPacketsBuffer();
+        /* dump the buffer into the inqueue for this batcher TM.  the batcher
+         * thread would be the first consumer for these buffers */
+        SCDQDataEnqueue(&data_queues[tmq_inq->id], (SCDQGenericQData *)pb);
+    }
+
+    /* \todo This needs to be changed ASAP.  This can't exceed max_pending_packets.
+     * Also we need to make this user configurable and allow dynamic updaes
+     * based on live traffic */
+    buffer_packet_threshhold = 1280;
+
+    return;
+}
+
+/**
+ * \brief Clean up all the buffers queued in.  Need to write more on this.
+ */
+void SCCudaPBCleanUpQueuesAndBuffers(void)
+{
+    SCCudaPBPacketsBuffer *pb = NULL;
+    SCDQDataQueue *dq = NULL;
+
+    if (tmq_inq == NULL || tmq_outq == NULL) {
+        SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid arguments.  tmq_inq or "
+                   "tmq_outq NULL");
+        return;
+    }
+
+    /* clean all the buffers present in the inq */
+    dq = &data_queues[tmq_inq->id];
+    SCMutexLock(&dq->mutex_q);
+    while ( (pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(dq)) != NULL) {
+        if (pb->packets_buffer != NULL)
+            free(pb->packets_buffer);
+        if (pb->packets_offset_buffer != NULL)
+            free(pb->packets_offset_buffer);
+        if (pb->packets_payload_offset_buffer != NULL)
+            free(pb->packets_payload_offset_buffer);
+
+        free(pb);
+    }
+    SCMutexUnlock(&dq->mutex_q);
+    SCCondSignal(&dq->cond_q);
+
+    /* clean all the buffers present in the outq */
+    dq = &data_queues[tmq_outq->id];
+    SCMutexLock(&dq->mutex_q);
+    while ( (pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(dq)) != NULL) {
+        if (pb->packets_buffer != NULL)
+            free(pb->packets_buffer);
+        if (pb->packets_offset_buffer != NULL)
+            free(pb->packets_offset_buffer);
+        if (pb->packets_payload_offset_buffer != NULL)
+            free(pb->packets_payload_offset_buffer);
+
+        free(pb);
+    }
+    SCMutexUnlock(&dq->mutex_q);
+    SCCondSignal(&dq->cond_q);
+
+    return;
+}
+
+/**
+ * \brief Function used to set the packet threshhold limit in the packets buffer.
+ *
+ * \param threshhold_override The threshhold limit for the packets_buffer.
+ */
+void SCCudaPBSetBufferPacketThreshhold(uint32_t threshhold_override)
+{
+    buffer_packet_threshhold = threshhold_override;
+
+    return;
+}
+
+/**
+ * \brief Used to inform the cuda packet batcher that packet batching shouldn't
+ *        be done anymore and set the flag to indicate this.  We also need to
+ *        signal the cuda batcher data inq, in case it is waiting on the inq
+ *        for a new free packet buffer.
+ */
+void SCCudaPBKillBatchingPackets(void)
+{
+    run_batcher = 0;
+    SCDQDataQueue *dq = &data_queues[tmq_inq->id];
+    SCCondSignal(&dq->cond_q);
+
+    return;
+}
+
+/***********************************Unittests**********************************/
+
+#ifdef UNITTESTS
+
+int SCCudaPBTest01(void)
+{
+#define ALIGN_UP(offset, alignment) \
+    (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
+
+    uint8_t raw_eth[] = {
+        0x00, 0x25, 0x00, 0x9e, 0xfa, 0xfe, 0x00, 0x02,
+        0xcf, 0x74, 0xfe, 0xe1, 0x08, 0x00, 0x45, 0x00,
+        0x01, 0xcc, 0xcb, 0x91, 0x00, 0x00, 0x34, 0x06,
+        0xdf, 0xa8, 0xd1, 0x55, 0xe3, 0x67, 0xc0, 0xa8,
+        0x64, 0x8c, 0x00, 0x50, 0xc0, 0xb7, 0xd1, 0x11,
+        0xed, 0x63, 0x81, 0xa9, 0x9a, 0x05, 0x80, 0x18,
+        0x00, 0x75, 0x0a, 0xdd, 0x00, 0x00, 0x01, 0x01,
+        0x08, 0x0a, 0x09, 0x8a, 0x06, 0xd0, 0x12, 0x21,
+        0x2a, 0x3b, 0x48, 0x54, 0x54, 0x50, 0x2f, 0x31,
+        0x2e, 0x31, 0x20, 0x33, 0x30, 0x32, 0x20, 0x46,
+        0x6f, 0x75, 0x6e, 0x64, 0x0d, 0x0a, 0x4c, 0x6f,
+        0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20,
+        0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77,
+        0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c,
+        0x65, 0x2e, 0x65, 0x73, 0x2f, 0x0d, 0x0a, 0x43,
+        0x61, 0x63, 0x68, 0x65, 0x2d, 0x43, 0x6f, 0x6e,
+        0x74, 0x72, 0x6f, 0x6c, 0x3a, 0x20, 0x70, 0x72,
+        0x69, 0x76, 0x61, 0x74, 0x65, 0x0d, 0x0a, 0x43,
+        0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54,
+        0x79, 0x70, 0x65, 0x3a, 0x20, 0x74, 0x65, 0x78,
+        0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x20,
+        0x63, 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d,
+        0x55, 0x54, 0x46, 0x2d, 0x38, 0x0d, 0x0a, 0x44,
+        0x61, 0x74, 0x65, 0x3a, 0x20, 0x4d, 0x6f, 0x6e,
+        0x2c, 0x20, 0x31, 0x34, 0x20, 0x53, 0x65, 0x70,
+        0x20, 0x32, 0x30, 0x30, 0x39, 0x20, 0x30, 0x38,
+        0x3a, 0x34, 0x38, 0x3a, 0x33, 0x31, 0x20, 0x47,
+        0x4d, 0x54, 0x0d, 0x0a, 0x53, 0x65, 0x72, 0x76,
+        0x65, 0x72, 0x3a, 0x20, 0x67, 0x77, 0x73, 0x0d,
+        0x0a, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+        0x2d, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3a,
+        0x20, 0x32, 0x31, 0x38, 0x0d, 0x0a, 0x0d, 0x0a,
+        0x3c, 0x48, 0x54, 0x4d, 0x4c, 0x3e, 0x3c, 0x48,
+        0x45, 0x41, 0x44, 0x3e, 0x3c, 0x6d, 0x65, 0x74,
+        0x61, 0x20, 0x68, 0x74, 0x74, 0x70, 0x2d, 0x65,
+        0x71, 0x75, 0x69, 0x76, 0x3d, 0x22, 0x63, 0x6f,
+        0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x74, 0x79,
+        0x70, 0x65, 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74,
+        0x65, 0x6e, 0x74, 0x3d, 0x22, 0x74, 0x65, 0x78,
+        0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x63,
+        0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, 0x75,
+        0x74, 0x66, 0x2d, 0x38, 0x22, 0x3e, 0x0a, 0x3c,
+        0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x33, 0x30,
+        0x32, 0x20, 0x4d, 0x6f, 0x76, 0x65, 0x64, 0x3c,
+        0x2f, 0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x3c,
+        0x2f, 0x48, 0x45, 0x41, 0x44, 0x3e, 0x3c, 0x42,
+        0x4f, 0x44, 0x59, 0x3e, 0x0a, 0x3c, 0x48, 0x31,
+        0x3e, 0x33, 0x30, 0x32, 0x20, 0x4d, 0x6f, 0x76,
+        0x65, 0x64, 0x3c, 0x2f, 0x48, 0x31, 0x3e, 0x0a,
+        0x54, 0x68, 0x65, 0x20, 0x64, 0x6f, 0x63, 0x75,
+        0x6d, 0x65, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x73,
+        0x20, 0x6d, 0x6f, 0x76, 0x65, 0x64, 0x0a, 0x3c,
+        0x41, 0x20, 0x48, 0x52, 0x45, 0x46, 0x3d, 0x22,
+        0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77,
+        0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c,
+        0x65, 0x2e, 0x65, 0x73, 0x2f, 0x22, 0x3e, 0x68,
+        0x65, 0x72, 0x65, 0x3c, 0x2f, 0x41, 0x3e, 0x2e,
+        0x0d, 0x0a, 0x3c, 0x2f, 0x42, 0x4f, 0x44, 0x59,
+        0x3e, 0x3c, 0x2f, 0x48, 0x54, 0x4d, 0x4c, 0x3e,
+        0x0d, 0x0a };
+
+    int result = 0;
+    SCCudaPBThreadCtx *tctx = NULL;
+
+    Packet p;
+    DecodeThreadVars dtv;
+    ThreadVars tv;
+    ThreadVars tv_cuda_PB;
+    DetectEngineCtx *de_ctx = NULL;
+
+    SCCudaPBPacketsBuffer *pb = NULL;
+    SCCudaPBPacketDataForGPU *buff_packet = NULL;
+    SCDQDataQueue *dq = NULL;
+
+    uint32_t i = 0;
+
+    char *strings[] = {"test_one",
+                       "test_two",
+                       "test_three",
+                       "test_four",
+                       "test_five",
+                       "test_six",
+                       "test_seven",
+                       "test_eight",
+                       "test_nine",
+                       "test_ten"};
+
+    uint32_t packets_payload_offset_buffer[sizeof(strings)/sizeof(char *)];
+    memset(packets_payload_offset_buffer, 0, sizeof(packets_payload_offset_buffer));
+    uint32_t packets_offset_buffer[sizeof(strings)/sizeof(char *)];
+    memset(packets_offset_buffer, 0, sizeof(packets_offset_buffer));
+
+    uint32_t packets_total_payload_len = 0;
+    uint32_t packets_buffer_len = 0;
+
+    for (i = 0; i < sizeof(strings)/sizeof(char *); i++) {
+        packets_total_payload_len += strlen(strings[i]);
+    }
+
+    for (i = 1; i < sizeof(strings)/sizeof(char *); i++) {
+        packets_payload_offset_buffer[i] = packets_payload_offset_buffer[i - 1] + strlen(strings[i - 1]);
+        packets_offset_buffer[i] = packets_offset_buffer[i - 1] +
+            sizeof(SCCudaPBPacketDataForGPUNonPayload) + strlen(strings[i - 1]);
+        ALIGN_UP(packets_offset_buffer[i], sizeof(CUdeviceptr));
+    }
+    packets_buffer_len += packets_offset_buffer[(sizeof(strings)/sizeof(char *)) - 1] +
+        sizeof(SCCudaPBPacketDataForGPUNonPayload) + strlen(strings[(sizeof(strings)/sizeof(char *)) - 1]);
+
+    memset(&p, 0, sizeof(Packet));
+    memset(&dtv, 0, sizeof(DecodeThreadVars));
+    memset(&tv, 0, sizeof(ThreadVars));
+    memset(&tv_cuda_PB, 0, sizeof(ThreadVars));
+
+    FlowInitConfig(FLOW_QUIET);
+    DecodeEthernet(&tv, &dtv, &p, raw_eth, sizeof(raw_eth), NULL);
+
+    de_ctx = DetectEngineCtxInit();
+    if (de_ctx == NULL) {
+        goto end;
+    }
+
+    de_ctx->mpm_matcher = MPM_B2G_CUDA;
+    de_ctx->flags |= DE_QUIET;
+
+    de_ctx->sig_list = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; "
+                               "content:test; sid:1;)");
+    if (de_ctx->sig_list == NULL) {
+        printf("signature parsing failed\n");
+        goto end;
+    }
+    SigGroupBuild(de_ctx);
+
+    result = 1;
+
+    SCCudaPBSetUpQueuesAndBuffers();
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 0);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 10);
+    SCCudaPBThreadInit(&tv_cuda_PB, de_ctx, (void *)&tctx);
+    SCCudaPBSetBufferPacketThreshhold(sizeof(strings)/sizeof(char *));
+
+    p.payload = (uint8_t *)strings[0];
+    p.payload_len = strlen(strings[0]);
+    SCCudaPBBatchPackets(NULL, &p, tctx, NULL);
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 0);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 9);
+
+    p.payload = (uint8_t *)strings[1];
+    p.payload_len = strlen(strings[1]);
+    SCCudaPBBatchPackets(NULL, &p, tctx, NULL);
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 0);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 9);
+
+    p.payload = (uint8_t *)strings[2];
+    p.payload_len = strlen(strings[2]);
+    SCCudaPBBatchPackets(NULL, &p, tctx, NULL);
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 0);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 9);
+
+    p.payload = (uint8_t *)strings[3];
+    p.payload_len = strlen(strings[3]);
+    SCCudaPBBatchPackets(NULL, &p, tctx, NULL);
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 0);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 9);
+
+    p.payload = (uint8_t *)strings[4];
+    p.payload_len = strlen(strings[4]);
+    SCCudaPBBatchPackets(NULL, &p, tctx, NULL);
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 0);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 9);
+
+    p.payload = (uint8_t *)strings[5];
+    p.payload_len = strlen(strings[5]);
+    SCCudaPBBatchPackets(NULL, &p, tctx, NULL);
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 0);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 9);
+
+    p.payload = (uint8_t *)strings[6];
+    p.payload_len = strlen(strings[6]);
+    SCCudaPBBatchPackets(NULL, &p, tctx, NULL);
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 0);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 9);
+
+    p.payload = (uint8_t *)strings[7];
+    p.payload_len = strlen(strings[7]);
+    SCCudaPBBatchPackets(NULL, &p, tctx, NULL);
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 0);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 9);
+
+    p.payload = (uint8_t *)strings[8];
+    p.payload_len = strlen(strings[8]);
+    SCCudaPBBatchPackets(NULL, &p, tctx, NULL);
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 0);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 9);
+
+    p.payload = (uint8_t *)strings[9];
+    p.payload_len = strlen(strings[9]);
+    SCCudaPBBatchPackets(NULL, &p, tctx, NULL);
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 1);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 8);
+
+    dq = &data_queues[tmq_outq->id];
+    pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(dq);
+    if (pb == NULL) {
+        result = 0;
+        goto end;
+    }
+    result &= (dq->len == 0);
+    result &= (pb->nop_in_buffer == 10);
+    if (result == 0)
+        goto end;
+
+    for (i = 0; i < pb->nop_in_buffer; i++) {
+        buff_packet = (SCCudaPBPacketDataForGPU *)(pb->packets_buffer + pb->packets_offset_buffer[i]);
+        result &= (strlen(strings[i]) == buff_packet->payload_len);
+        result &= (memcmp(strings[i], buff_packet->payload, buff_packet->payload_len) == 0);
+        if (result == 0)
+            goto end;
+        result &= (packets_payload_offset_buffer[i] == pb->packets_payload_offset_buffer[i]);
+        result &= (packets_offset_buffer[i] == pb->packets_offset_buffer[i]);
+    }
+    result &= (packets_total_payload_len == pb->packets_total_payload_len);
+    result &= (packets_buffer_len == pb->packets_buffer_len);
+
+ end:
+    SCCudaPBCleanUpQueuesAndBuffers();
+    if (de_ctx) {
+        SigGroupCleanup(de_ctx);
+        SigCleanSignatures(de_ctx);
+        DetectEngineCtxFree(de_ctx);
+    }
+
+    SCCudaPBThreadDeInit(NULL, tctx);
+    return result;
+}
+
+int SCCudaPBTest02(void)
+{
+    uint8_t raw_eth[] = {
+        0x00, 0x25, 0x00, 0x9e, 0xfa, 0xfe, 0x00, 0x02,
+        0xcf, 0x74, 0xfe, 0xe1, 0x08, 0x00, 0x45, 0x00,
+        0x01, 0xcc, 0xcb, 0x91, 0x00, 0x00, 0x34, 0x06,
+        0xdf, 0xa8, 0xd1, 0x55, 0xe3, 0x67, 0xc0, 0xa8,
+        0x64, 0x8c, 0x00, 0x50, 0xc0, 0xb7, 0xd1, 0x11,
+        0xed, 0x63, 0x81, 0xa9, 0x9a, 0x05, 0x80, 0x18,
+        0x00, 0x75, 0x0a, 0xdd, 0x00, 0x00, 0x01, 0x01,
+        0x08, 0x0a, 0x09, 0x8a, 0x06, 0xd0, 0x12, 0x21,
+        0x2a, 0x3b, 0x48, 0x54, 0x54, 0x50, 0x2f, 0x31,
+        0x2e, 0x31, 0x20, 0x33, 0x30, 0x32, 0x20, 0x46,
+        0x6f, 0x75, 0x6e, 0x64, 0x0d, 0x0a, 0x4c, 0x6f,
+        0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20,
+        0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77,
+        0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c,
+        0x65, 0x2e, 0x65, 0x73, 0x2f, 0x0d, 0x0a, 0x43,
+        0x61, 0x63, 0x68, 0x65, 0x2d, 0x43, 0x6f, 0x6e,
+        0x74, 0x72, 0x6f, 0x6c, 0x3a, 0x20, 0x70, 0x72,
+        0x69, 0x76, 0x61, 0x74, 0x65, 0x0d, 0x0a, 0x43,
+        0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54,
+        0x79, 0x70, 0x65, 0x3a, 0x20, 0x74, 0x65, 0x78,
+        0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x20,
+        0x63, 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d,
+        0x55, 0x54, 0x46, 0x2d, 0x38, 0x0d, 0x0a, 0x44,
+        0x61, 0x74, 0x65, 0x3a, 0x20, 0x4d, 0x6f, 0x6e,
+        0x2c, 0x20, 0x31, 0x34, 0x20, 0x53, 0x65, 0x70,
+        0x20, 0x32, 0x30, 0x30, 0x39, 0x20, 0x30, 0x38,
+        0x3a, 0x34, 0x38, 0x3a, 0x33, 0x31, 0x20, 0x47,
+        0x4d, 0x54, 0x0d, 0x0a, 0x53, 0x65, 0x72, 0x76,
+        0x65, 0x72, 0x3a, 0x20, 0x67, 0x77, 0x73, 0x0d,
+        0x0a, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+        0x2d, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3a,
+        0x20, 0x32, 0x31, 0x38, 0x0d, 0x0a, 0x0d, 0x0a,
+        0x3c, 0x48, 0x54, 0x4d, 0x4c, 0x3e, 0x3c, 0x48,
+        0x45, 0x41, 0x44, 0x3e, 0x3c, 0x6d, 0x65, 0x74,
+        0x61, 0x20, 0x68, 0x74, 0x74, 0x70, 0x2d, 0x65,
+        0x71, 0x75, 0x69, 0x76, 0x3d, 0x22, 0x63, 0x6f,
+        0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x74, 0x79,
+        0x70, 0x65, 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74,
+        0x65, 0x6e, 0x74, 0x3d, 0x22, 0x74, 0x65, 0x78,
+        0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x63,
+        0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, 0x75,
+        0x74, 0x66, 0x2d, 0x38, 0x22, 0x3e, 0x0a, 0x3c,
+        0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x33, 0x30,
+        0x32, 0x20, 0x4d, 0x6f, 0x76, 0x65, 0x64, 0x3c,
+        0x2f, 0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x3c,
+        0x2f, 0x48, 0x45, 0x41, 0x44, 0x3e, 0x3c, 0x42,
+        0x4f, 0x44, 0x59, 0x3e, 0x0a, 0x3c, 0x48, 0x31,
+        0x3e, 0x33, 0x30, 0x32, 0x20, 0x4d, 0x6f, 0x76,
+        0x65, 0x64, 0x3c, 0x2f, 0x48, 0x31, 0x3e, 0x0a,
+        0x54, 0x68, 0x65, 0x20, 0x64, 0x6f, 0x63, 0x75,
+        0x6d, 0x65, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x73,
+        0x20, 0x6d, 0x6f, 0x76, 0x65, 0x64, 0x0a, 0x3c,
+        0x41, 0x20, 0x48, 0x52, 0x45, 0x46, 0x3d, 0x22,
+        0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77,
+        0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c,
+        0x65, 0x2e, 0x65, 0x73, 0x2f, 0x22, 0x3e, 0x68,
+        0x65, 0x72, 0x65, 0x3c, 0x2f, 0x41, 0x3e, 0x2e,
+        0x0d, 0x0a, 0x3c, 0x2f, 0x42, 0x4f, 0x44, 0x59,
+        0x3e, 0x3c, 0x2f, 0x48, 0x54, 0x4d, 0x4c, 0x3e,
+        0x0d, 0x0a };
+
+    int result = 0;
+    const char *string = NULL;
+    SCCudaPBThreadCtx *tctx = NULL;
+
+    Packet p;
+    DecodeThreadVars dtv;
+    ThreadVars tv;
+    ThreadVars tv_cuda_PB;
+    DetectEngineCtx *de_ctx = NULL;
+
+    SCCudaPBPacketsBuffer *pb = NULL;
+    SCDQDataQueue *dq = NULL;
+
+
+    memset(&p, 0, sizeof(Packet));
+    memset(&dtv, 0, sizeof(DecodeThreadVars));
+    memset(&tv, 0, sizeof(ThreadVars));
+    memset(&tv_cuda_PB, 0, sizeof(ThreadVars));
+
+    FlowInitConfig(FLOW_QUIET);
+    DecodeEthernet(&tv, &dtv, &p, raw_eth, sizeof(raw_eth), NULL);
+
+    de_ctx = DetectEngineCtxInit();
+    if (de_ctx == NULL) {
+        goto end;
+    }
+
+    de_ctx->mpm_matcher = MPM_B2G_CUDA;
+    de_ctx->flags |= DE_QUIET;
+
+    de_ctx->sig_list = SigInit(de_ctx, "alert tcp any 5555 -> any any (msg:\"Bamboo\"; "
+                               "content:test; sid:1;)");
+    if (de_ctx->sig_list == NULL) {
+        printf("signature parsing failed\n");
+        goto end;
+    }
+    SigGroupBuild(de_ctx);
+
+    SCCudaPBSetUpQueuesAndBuffers();
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 0);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 10);
+    SCCudaPBThreadInit(&tv_cuda_PB, de_ctx, (void *)&tctx);
+
+    result = 1;
+
+    string = "test_one";
+    p.payload = (uint8_t *)string;
+    p.payload_len = strlen(string);
+    SCCudaPBBatchPackets(NULL, &p, tctx, NULL);
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 0);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 9);
+
+    pb = tctx->curr_pb;
+    result &= (pb->nop_in_buffer == 0);
+
+ end:
+    SCCudaPBCleanUpQueuesAndBuffers();
+    if (de_ctx) {
+        SigGroupCleanup(de_ctx);
+        SigCleanSignatures(de_ctx);
+        DetectEngineCtxFree(de_ctx);
+    }
+
+    SCCudaPBThreadDeInit(NULL, tctx);
+    return result;
+}
+
+#endif /* UNITTESTS */
+
+void SCCudaPBRegisterTests(void)
+{
+
+#ifdef UNITTESTS
+    UtRegisterTest("SCCudaPBTest01", SCCudaPBTest01, 1);
+    UtRegisterTest("SCCudaPBTest02", SCCudaPBTest02, 1);
+#endif
+
+    return;
+}
+
+#endif /* __SC_CUDA_SUPPORT__ */
diff --git a/src/cuda-packet-batcher.h b/src/cuda-packet-batcher.h
new file mode 100644
index 0000000000..60586fb6a3
--- /dev/null
+++ b/src/cuda-packet-batcher.h
@@ -0,0 +1,139 @@
+/**
+ * Copyright (c) 2010 Open Information Security Foundation.
+ *
+ * \author Anoop Saldanha <poonaatsoc@gmail.com>
+ */
+
+#ifndef __CUDA_PACKET_BATCHER_H__
+#define __CUDA_PACKET_BATCHER_H__
+
+#include "suricata-common.h"
+
+/* compile in, only if we have a CUDA enabled on this machine */
+#ifdef __SC_CUDA_SUPPORT__
+
+#include "util-cuda.h"
+
+/* The min no of packets that we allot the buffer for.  We will make
+ * this user configurable(yaml) based on the traffic they expect.  Either ways
+ * for a low/medium traffic network with occasional sgh matches, we shouldn't
+ * be enabling cuda.  We will only end up screwing performance */
+#define SC_CUDA_PB_MIN_NO_OF_PACKETS 4000
+
+/**
+ * \brief Implement the template SCDQGenericQData to transfer the cuda
+ *        packet buffer from the cuda batcher thread to the dispatcher
+ *        thread using the queue SCDQDataQueue.
+ */
+typedef struct SCCudaPBPacketsBuffer_ {
+    /* these members from the template SCDQGenericQData that have to be
+     * compulsarily implemented */
+    struct SCDQGenericQData_ *next;
+    struct SCDQGenericQData_ *prev;
+    /* if we want to consider this pointer as the head of a list, this var
+     * holds the no of elements in the list */
+    //uint16_t len;
+    /* in case this data instance is the head of a list, we can refer the
+     * bottomost instance directly using this var */
+    //struct SCDQGenericaQData *bot;
+
+    /* our own members from here on*/
+
+    /* current count of packets held in packets_buffer.  nop = no of packets */
+    uint32_t nop_in_buffer;
+    /* the packets buffer.  We will assign buffer for SC_CUDA_PB_MIN_NO_OF_PACKETS
+     * packets.  Basically the size of this buffer would be
+     * SC_CUDA_PB_MIN_NO_OF_PACKETS * sizeof(SCCudaPBPacketDataForGPU), so that
+     * we can hold mininum SC_CUDA_PB_MIN_NO_OF_PACKETS */
+    uint8_t *packets_buffer;
+    /* length of data buffered so far in packets_buffer, which would be sent
+     * to the GPU.  We will need this to copy the buffered data from the
+     * packets_buffer here on the host, to the buffer on the GPU */
+    uint32_t packets_buffer_len;
+    /* packet offset within the packets_buffer.  Each packet would be stored in
+     * packets buffer at a particular offset.  This buffer would indicate the
+     * offset of a packet inside the packet buffer.  We will allot space to hold
+     * offsets for SC_CUDA_PB_MIN_NO_OF_PACKETS packets
+     * \todo change it to holds offsets for more than SC_CUDA_PB_MIN_NO_OF_PACKETS
+     * when we use the buffer to hold packets based on the remaining size in the
+     * buffer rather than on a fixed limit like SC_CUDA_PB_MIN_NO_OF_PACKETS */
+    uint32_t *packets_offset_buffer;
+
+    /* the total packet payload lengths buffered so far.  We will need this to
+     * transfer the total length of the results buffer that has to be transferred
+     * back from the gpu */
+    uint32_t packets_total_payload_len;
+    /* the payload offsets for the different payload lengths buffered in.  For
+     * example if we buffer 4 packets of lengths 3, 4, 5, 6, we will store four
+     * offsets in the buffer {0, 3, 7, 12, 18} */
+    uint32_t *packets_payload_offset_buffer;
+
+    /* packet addresses for all the packets buffered in the packets_buffer.  We
+     * will allot space to hold packet addresses for SC_CUDA_PB_MIN_NO_OF_PACKETS.
+     * We will need this, so that the cuda mpm b2g dispatcher thread can inform
+     * and store the b2g cuda mpm results for the packet*/
+    Packet **packets_address_buffer;
+} SCCudaPBPacketsBuffer;
+
+/**
+ * \brief Structure for each packet that is being batched to the GPU.
+ */
+typedef struct SCCudaPBPacketDataForGPU_ {
+    /* holds B2gCudaCtx->m */
+    unsigned int m;
+    /* holds B2gCudaCtx->cuda_B2g */
+    CUdeviceptr table;
+    /* holds the length of the payload */
+    unsigned int payload_len;
+    /* holds the payload.  While we actually store the payload in the buffer,
+     * we may not end up using the entire 1480 bytes if the payload is smaller */
+    uint8_t payload[1480];
+} SCCudaPBPacketDataForGPU;
+
+/**
+ * \brief Same as struct SCCudaPBPacketDataForGPU_ except for the payload part.
+ *        We will need this for calculating the size of the non-payload part
+ *        of the packet data to be buffered.
+ */
+typedef struct SCCudaPBPacketDataForGPUNonPayload_ {
+    /* holds B2gCudaCtx->m */
+    unsigned int m;
+    /* holds B2gCudaCtx->cuda_B2g */
+    CUdeviceptr table;
+    /* holds the length of the payload */
+    unsigned int payload_len;
+} SCCudaPBPacketDataForGPUNonPayload;
+
+/**
+ * \brief The cuda packet batcher threading context.
+ */
+typedef struct SCCudaPBThreadCtx_ {
+    /* we need the detection engine context to retrieve the sgh while we start
+     * receiving and batching the packets */
+    DetectEngineCtx *de_ctx;
+
+    /* packets buffer currently in use inside the cuda batcher thread */
+    SCCudaPBPacketsBuffer *curr_pb;
+} SCCudaPBThreadCtx;
+
+SCCudaPBPacketsBuffer *SCCudaPBAllocSCCudaPBPacketsBuffer(void);
+void SCCudaPBDeAllocSCCudaPBPacketsBuffer(SCCudaPBPacketsBuffer *);
+
+void SCCudaPBSetBufferPacketThreshhold(uint32_t);
+void SCCudaPBCleanUpQueuesAndBuffers(void);
+void SCCudaPBSetUpQueuesAndBuffers(void);
+void SCCudaPBKillBatchingPackets(void);
+
+TmEcode SCCudaPBBatchPackets(ThreadVars *, Packet *, void *, PacketQueue *);
+TmEcode SCCudaPBThreadInit(ThreadVars *, void *, void **);
+TmEcode SCCudaPBThreadDeInit(ThreadVars *, void *);
+void SCCudaPBThreadExitStats(ThreadVars *, void *);
+void SCCudaPBRegisterTests(void);
+
+void TmModuleCudaPacketBatcherRegister(void);
+
+void *SCCudaPBTmThreadsSlot1(void *);
+
+#endif /* __SC_CUDA_SUPPORT__ */
+
+#endif /* __CUDA_PACKET_BATCHER_H__ */
diff --git a/src/data-queue.c b/src/data-queue.c
new file mode 100644
index 0000000000..297139b9b0
--- /dev/null
+++ b/src/data-queue.c
@@ -0,0 +1,93 @@
+/**
+ * Copyright (c) 2009, 2010 Open Information Security Foundation.
+ *
+ * \author Anoop Saldanha <poonaatsoc@gmail.com>
+ */
+
+#include "suricata-common.h"
+#include "data-queue.h"
+#include "threads.h"
+
+/**
+ * \brief Enqueues data on the queue.
+ *
+ * \param q    Pointer to the data queue.
+ * \param data Pointer to the data to be queued.  It should be a pointer to a
+ *             structure instance that implements the template structure
+ *             struct SCDQGenericQData_ defined in data-queue.h.
+ */
+void SCDQDataEnqueue(SCDQDataQueue *q, SCDQGenericQData *data)
+{
+    /* we already have some data in queue */
+    if (q->top != NULL) {
+        data->next = q->top;
+        q->top->prev = data;
+        q->top = data;
+
+    /* the queue is empty */
+    } else {
+        q->top = data;
+        q->bot = data;
+    }
+
+    q->len++;
+
+#ifdef DBG_PERF
+    if (q->len > q->dbg_maxlen)
+        q->dbg_maxlen = q->len;
+#endif /* DBG_PERF */
+
+    return;
+}
+
+/**
+ * \brief Dequeues and returns an entry from the queue.
+ *
+ * \param q      Pointer to the data queue.
+ * \param retval Pointer to the data that has been enqueued.  The instance
+ *               returned is/should be a pointer to a structure instance that
+ *               implements the template structure struct SCDQGenericQData_
+ *               defined in data-queue.h.
+ */
+SCDQGenericQData *SCDQDataDequeue(SCDQDataQueue *q)
+{
+    SCDQGenericQData *data = NULL;
+
+    /* if the queue is empty there are is no data left and we return NULL */
+    if (q->len == 0) {
+        return NULL;
+    }
+
+    /* If we are going to get the last packet, set len to 0
+     * before doing anything else (to make the threads to follow
+     * the SCondWait as soon as possible) */
+    q->len--;
+
+    /* pull the bottom packet from the queue */
+    data = q->bot;
+
+#ifdef OS_DARWIN
+    /* Weird issue in OS_DARWIN
+     * Sometimes it looks that two thread arrive here at the same time
+     * so the bot ptr is NULL */
+    if (data == NULL) {
+        printf("No data to dequeue!\n");
+        return NULL;
+    }
+#endif /* OS_DARWIN */
+
+    /* more data in queue */
+    if (q->bot->prev != NULL) {
+        q->bot = q->bot->prev;
+        q->bot->next = NULL;
+    /* just the one we remove, so now empty */
+    } else {
+        q->top = NULL;
+        q->bot = NULL;
+    }
+
+    data->next = NULL;
+    data->prev = NULL;
+
+    return data;
+}
diff --git a/src/data-queue.h b/src/data-queue.h
new file mode 100644
index 0000000000..f3c11fcf74
--- /dev/null
+++ b/src/data-queue.h
@@ -0,0 +1,64 @@
+/**
+ * Copyright (c) 2009, 2010 Open Information Security Foundation.
+ *
+ * \author Anoop Saldanha <poonaatsoc@gmail.com>
+ *
+ * \file Generic queues.  Any instance that wants to get itself on the generic
+ *       queue, would have to implement the template struct SCDQGenericQData_
+ *       defined below.
+ */
+
+#ifndef __DATA_QUEUE_H__
+#define __DATA_QUEUE_H__
+
+#include "threads.h"
+
+/**
+ * \brief Generic template for any data structure that wants to be on the
+ *        queue.  Any other data structure that wants to be on the queue
+ *        needs to use this template and define its own members from
+ *        <your_own_structure_members_from_here_on> onwards.
+ */
+typedef struct SCDQGenericQData_ {
+    /* this is needed when we want to supply a list of data items */
+    struct SCDQGenericQData_ *next;
+    struct SCDQGenericQData_ *prev;
+    /* if we want to consider this pointer as the head of a list, this var
+     * holds the no of elements in the list.  Else it holds a <need_to_think>. */
+    //uint16_t len;
+    /* in case this data instance is the head of a list, we can refer the
+     * bottomost instance directly using this var */
+    //struct SCDQGenericaQData *bot;
+
+
+    /* any other data structure that wants to be on the queue can implement
+     * its own memebers from here on, in its structure definition.  Just note
+     * that the first 2 members should always be next and prev in the same
+     * order */
+    // <your_own_structure_members_from_here_on>
+} SCDQGenericQData;
+
+/**
+ * \brief The data queue to hold instances that implement the template
+ *        SCDQGenericQData.
+ */
+typedef struct SCDQDataQueue_ {
+    /* holds the item at the top of the queue */
+    SCDQGenericQData *top;
+    /* holds the item at the bottom of the queue */
+    SCDQGenericQData *bot;
+    /* no of items currently in the queue */
+    uint16_t len;
+
+    SCMutex mutex_q;
+    SCCondT cond_q;
+
+#ifdef DBG_PERF
+    uint16_t dbg_maxlen;
+#endif /* DBG_PERF */
+} SCDQDataQueue;
+
+void SCDQDataEnqueue(SCDQDataQueue *, SCDQGenericQData *);
+SCDQGenericQData *SCDQDataDequeue(SCDQDataQueue *);
+
+#endif /* __DATA_QUEUE_H__ */
diff --git a/src/decode.h b/src/decode.h
index aa31d49275..40cd8b3005 100644
--- a/src/decode.h
+++ b/src/decode.h
@@ -247,6 +247,10 @@ struct PacketQueue_;
  */
 typedef struct Packet_
 {
+    /* double linked list ptrs */
+    struct Packet_ *next;
+    struct Packet_ *prev;
+
     /* Addresses, Ports and protocol
      * these are on top so we can use
      * the Packet as a hash key */
@@ -342,10 +346,6 @@ typedef struct Packet_
     /** packet number in the pcap file, matches wireshark */
     uint64_t pcap_cnt;
 
-    /* double linked list ptrs */
-    struct Packet_ *next;
-    struct Packet_ *prev;
-
     /* ready to set verdict counter, only set in root */
     uint8_t rtv_cnt;
     /* tunnel packet ref count */
@@ -369,20 +369,22 @@ typedef struct Packet_
 
     /* required for cuda support */
 #ifdef __SC_CUDA_SUPPORT__
-    PatternMatcherQueue *cuda_pmq;
-    MpmCtx *cuda_mpm_ctx;
-    MpmThreadCtx *cuda_mtc;
-
-    /* used to hold the match results.  We can instead use a void *result
-     * instead here.  That way we can make them hold any result. *todo* */
-    uint16_t cuda_matches;
-    /* indicates if the dispatcher should call the search or the scan phase
-     * of the pattern matcher.  We can instead use a void *cuda_data instead.
-     * This way we can send any data across to the dispatcher */
-    uint8_t cuda_search;
-    /* the dispatcher thread would pump the packet into this queue once it has
-     * processed the packet */
-    struct PacketQueue_ *cuda_outq;
+    /* indicates if the cuda mpm would be conducted or a normal cpu mpm would
+     * be conduced on this packet.  If it is set to 0, the cpu mpm; else cuda mpm */
+    uint8_t cuda_mpm_enabled;
+    /* indicates if the cuda mpm has finished running the mpm and processed the
+     * results for this packet, assuming if cuda_mpm_enabled has been set for this
+     * packet */
+    uint16_t cuda_done;
+    /* used by the detect thread and the cuda mpm dispatcher thread.  The detect
+     * thread would wait on this cond var, if the cuda mpm dispatcher thread
+     * still hasn't processed the packet.  The dispatcher would use this cond
+     * to inform the detect thread(in case it is waiting on this packet), once
+     * the dispatcher is done processing the packet results */
+    SCMutex cuda_mutex;
+    SCCondT cuda_cond;
+    /* the extra 1 in the 1481, is to hold the no_of_matches from the mpm run */
+    uint16_t mpm_offsets[1481];
 #endif
 } Packet;
 
@@ -460,23 +462,29 @@ typedef struct DecodeThreadVars_
 /**
  *  \brief Initialize a packet structure for use.
  */
-#define PACKET_INITIALIZE(p) do {               \
-        memset((p), 0x00, sizeof(Packet));      \
-        SCMutexInit(&(p)->mutex_rtv_cnt, NULL); \
-        PACKET_RESET_CHECKSUMS((p));            \
-    } while (0)
+#ifndef __SC_CUDA_SUPPORT__
+#define PACKET_INITIALIZE(p) { \
+    memset((p), 0x00, sizeof(Packet)); \
+    SCMutexInit(&(p)->mutex_rtv_cnt, NULL); \
+    PACKET_RESET_CHECKSUMS((p)); \
+}
+#else
+#define PACKET_INITIALIZE(p) { \
+    memset((p), 0x00, sizeof(Packet)); \
+    SCMutexInit(&(p)->mutex_rtv_cnt, NULL); \
+    PACKET_RESET_CHECKSUMS((p)); \
+    SCMutexInit(&(p)->cuda_mutex, NULL); \
+    SCCondInit(&(p)->cuda_cond, NULL); \
+}
+#endif
+
 
 /**
  *  \brief Recycle a packet structure for reuse.
  *  \todo the mutex destroy & init is necessary because of the memset, reconsider
  */
-#define PACKET_RECYCLE(p) do {                  \
-        CLEAR_ADDR(&(p)->src);                   \
-        CLEAR_ADDR(&(p)->dst);                   \
-        (p)->sp = 0;                            \
-        (p)->dp = 0;                            \
-        (p)->proto = 0;                         \
-        (p)->recursion_level = 0;               \
+#define PACKET_DO_RECYCLE(p) do {                  \
+        (p)->recursion_level = 0; \
         (p)->flags = 0; \
         (p)->flowflags = 0; \
         (p)->flow = NULL; \
@@ -530,15 +538,39 @@ typedef struct DecodeThreadVars_
         PACKET_RESET_CHECKSUMS((p));            \
     } while (0)
 
+#ifndef __SC_CUDA_SUPPORT__
+#define PACKET_RECYCLE(p) PACKET_DO_RECYCLE((p))
+#else
+#define PACKET_RECYCLE(p) do { \
+    PACKET_DO_RECYCLE((p)); \
+    SCMutexDestroy(&(p)->cuda_mutex); \
+    SCCondDestroy(&(p)->cuda_cond); \
+    SCMutexInit(&(p)->cuda_mutex, NULL); \
+    SCCondInit(&(p)->cuda_cond, NULL); \
+    PACKET_RESET_CHECKSUMS((p)); \
+} while(0)
+#endif
+
 /**
  *  \brief Cleanup a packet so that we can free it. No memset needed..
  */
+#ifndef __SC_CUDA_SUPPORT__
 #define PACKET_CLEANUP(p) do {                  \
         if ((p)->pktvar != NULL) {              \
             PktVarFree((p)->pktvar);            \
         }                                       \
         SCMutexDestroy(&(p)->mutex_rtv_cnt);    \
     } while (0)
+#else
+#define PACKET_CLEANUP(p) do { \
+    if ((p)->pktvar != NULL) { \
+        PktVarFree((p)->pktvar); \
+    } \
+    SCMutexDestroy(&(p)->mutex_rtv_cnt); \
+    SCMutexDestroy(&(p)->cuda_mutex); \
+    SCCondDestroy(&(p)->cuda_cond); \
+} while(0)
+#endif
 
 
 /* macro's for setting the action
diff --git a/src/detect-engine-mpm.c b/src/detect-engine-mpm.c
index 66dad79c92..fb123037ae 100644
--- a/src/detect-engine-mpm.c
+++ b/src/detect-engine-mpm.c
@@ -106,31 +106,42 @@ uint16_t PatternMatchDefaultMatcher(void) {
  *  \retval ret number of matches
  */
 uint32_t PacketPatternSearch(ThreadVars *tv, DetectEngineThreadCtx *det_ctx,
-                           Packet *p)
+                             Packet *p)
 {
     SCEnter();
 
     uint32_t ret;
+
 #ifndef __SC_CUDA_SUPPORT__
     ret = mpm_table[det_ctx->sgh->mpm_ctx->mpm_type].Search(det_ctx->sgh->mpm_ctx,
-                                                          &det_ctx->mtc,
-                                                          &det_ctx->pmq,
-                                                          p->payload,
-                                                          p->payload_len);
+                                                            &det_ctx->mtc,
+                                                            &det_ctx->pmq,
+                                                            p->payload,
+                                                            p->payload_len);
 #else
     /* if the user has enabled cuda support, but is not using the cuda mpm
      * algo, then we shouldn't take the path of the dispatcher.  Call the mpm
      * directly */
     if (det_ctx->sgh->mpm_ctx->mpm_type != MPM_B2G_CUDA) {
         ret = mpm_table[det_ctx->sgh->mpm_ctx->mpm_type].Search(det_ctx->sgh->mpm_ctx,
-                                                              &det_ctx->mtc,
-                                                              &det_ctx->pmq,
-                                                              p->payload,
-                                                              p->payload_len);
+                                                                &det_ctx->mtc,
+                                                                &det_ctx->pmq,
+                                                                p->payload,
+                                                                p->payload_len);
         SCReturnInt(ret);
     }
 
-    SCCudaHlProcessPacketWithDispatcher(p, det_ctx, &ret);
+    if (p->cuda_mpm_enabled) {
+        ret = B2gCudaResultsPostProcessing(p, det_ctx->sgh->mpm_ctx,
+                                           &det_ctx->mtc, &det_ctx->pmq);
+    } else {
+        ret = mpm_table[det_ctx->sgh->mpm_ctx->mpm_type].Search(det_ctx->sgh->mpm_ctx,
+                                                                &det_ctx->mtc,
+                                                                &det_ctx->pmq,
+                                                                p->payload,
+                                                                p->payload_len);
+    }
+
 #endif
 
     SCReturnInt(ret);
@@ -154,23 +165,8 @@ uint32_t UriPatternSearch(DetectEngineThreadCtx *det_ctx,
     //PrintRawDataFp(stdout, uri, uri_len);
 
     uint32_t ret;
-#ifndef __SC_CUDA_SUPPORT__
-    ret = mpm_table[det_ctx->sgh->mpm_uri_ctx->mpm_type].Search
-        (det_ctx->sgh->mpm_uri_ctx, &det_ctx->mtcu, &det_ctx->pmq,
-         uri, uri_len);
-#else
-    /* if the user has enabled cuda support, but is not using the cuda mpm
-     * algo, then we shouldn't take the path of the dispatcher.  Call the mpm
-     * directly */
-    if (det_ctx->sgh->mpm_uri_ctx->mpm_type != MPM_B2G_CUDA) {
-        ret = mpm_table[det_ctx->sgh->mpm_uri_ctx->mpm_type].Search
-            (det_ctx->sgh->mpm_uri_ctx, &det_ctx->mtcu, &det_ctx->pmq,
-             uri, uri_len);
-        SCReturnUInt(ret);
-    }
-
-    SCCudaHlProcessUriWithDispatcher(uri, uri_len, det_ctx, &ret);
-#endif
+    ret = mpm_table[det_ctx->sgh->mpm_uri_ctx->mpm_type].Search(det_ctx->sgh->mpm_uri_ctx,
+            &det_ctx->mtcu, &det_ctx->pmq, uri, uri_len);
 
     SCReturnUInt(ret);
 }
diff --git a/src/detect-engine.c b/src/detect-engine.c
index 6eb9ca6eae..5d0ce14e99 100644
--- a/src/detect-engine.c
+++ b/src/detect-engine.c
@@ -378,47 +378,6 @@ TmEcode DetectEngineThreadCtxInit(ThreadVars *tv, void *initdata, void **data) {
 
     *data = (void *)det_ctx;
 
-#ifdef __SC_CUDA_SUPPORT__
-    if (PatternMatchDefaultMatcher() != MPM_B2G_CUDA)
-        return TM_ECODE_OK;
-
-    Tmq *tmq;
-    /* we would prepend this name to the the tv name, to obtain the final unique
-     * detection thread queue name */
-    char *cuda_outq_name = "cuda_mpm_rc_disp_outq";
-    uint8_t disp_outq_name_len = (strlen(tv->name) + strlen(cuda_outq_name) + 1);
-
-    char *disp_outq_name = SCMalloc(disp_outq_name_len * sizeof(char));
-    if (disp_outq_name == NULL)
-        goto error;
-    strcpy(disp_outq_name, tv->name);
-    strcpy(disp_outq_name + strlen(tv->name), cuda_outq_name);
-    disp_outq_name[disp_outq_name_len] = '\0';
-
-    tmq = TmqGetQueueByName(disp_outq_name);
-    if (tmq != NULL) {
-        SCLogError(SC_ERR_TMQ_ALREADY_REGISTERED, "A queue by the name \"%s\" "
-                   "is already registered, which shouldn't be the case.  Queue "
-                   "name is duplicated.  Please check if multiple instances of "
-                   "detection module are given different names ",
-                   disp_outq_name);
-        goto error;
-    }
-    tmq = TmqCreateQueue(disp_outq_name);
-    if (tmq == NULL)
-        goto error;
-
-    /* hold the queue instane we create under this detection thread instance */
-    det_ctx->cuda_mpm_rc_disp_outq = tmq;
-    det_ctx->cuda_mpm_rc_disp_outq->reader_cnt++;
-    det_ctx->cuda_mpm_rc_disp_outq->writer_cnt++;
-
-    return TM_ECODE_OK;
-
- error:
-    return TM_ECODE_FAILED;
-#endif
-
     return TM_ECODE_OK;
 }
 
diff --git a/src/detect.c b/src/detect.c
index fdd862cf4d..8c5d0db7e0 100644
--- a/src/detect.c
+++ b/src/detect.c
@@ -3140,6 +3140,7 @@ int SigGroupBuild (DetectEngineCtx *de_ctx) {
             SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the "
                        "module SC_RULES_CONTENT_B2G_CUDA");
         }
+        SCCudaCtxPushCurrent(dummy_context);
         if (SCCudaMemGetInfo(&cuda_free_before_alloc, &cuda_total) == 0) {
             SCLogInfo("Total Memory available in the CUDA context used for mpm "
                       "with b2g: %.2f MB", cuda_total/(1024.0 * 1024.0));
diff --git a/src/runmodes.c b/src/runmodes.c
index ed25c943d7..4effce00db 100644
--- a/src/runmodes.c
+++ b/src/runmodes.c
@@ -24,6 +24,7 @@
 
 #include "suricata-common.h"
 #include "detect-engine.h"
+#include "detect-engine-mpm.h"
 #include "tm-threads.h"
 #include "util-debug.h"
 #include "util-time.h"
@@ -43,6 +44,8 @@
 
 #include "output.h"
 
+#include "cuda-packet-batcher.h"
+
 /**
  * A list of output modules that will be active for the run mode.
  */
@@ -2305,6 +2308,115 @@ int RunModeFilePcapAuto(DetectEngineCtx *de_ctx, char *file) {
         printf("ERROR: TmThreadSpawn failed\n");
         exit(EXIT_FAILURE);
     }
+
+#if defined(__SC_CUDA_SUPPORT__)
+    if (PatternMatchDefaultMatcher() == MPM_B2G_CUDA) {
+        ThreadVars *tv_decode1 = TmThreadCreatePacketHandler("Decode",
+                                                             "pickup-queue", "simple",
+                                                             "decode-queue1", "simple",
+                                                             "1slot");
+        if (tv_decode1 == NULL) {
+            printf("ERROR: TmThreadsCreate failed for Decode1\n");
+            exit(EXIT_FAILURE);
+        }
+        tm_module = TmModuleGetByName("DecodePcapFile");
+        if (tm_module == NULL) {
+            printf("ERROR: TmModuleGetByName DecodePcap failed\n");
+            exit(EXIT_FAILURE);
+        }
+        Tm1SlotSetFunc(tv_decode1, tm_module, NULL);
+
+        TmThreadSetCPUAffinity(tv_decode1, 0);
+        if (ncpus > 1)
+            TmThreadSetThreadPriority(tv_decode1, PRIO_MEDIUM);
+
+        if (TmThreadSpawn(tv_decode1) != TM_ECODE_OK) {
+            printf("ERROR: TmThreadSpawn failed\n");
+            exit(EXIT_FAILURE);
+        }
+
+        ThreadVars *tv_cuda_PB = TmThreadCreate("CUDA_PB",
+                                                "decode-queue1", "simple",
+                                                "cuda-pb-queue1", "simple",
+                                                "custom", SCCudaPBTmThreadsSlot1, 0);
+        if (tv_cuda_PB == NULL) {
+            printf("ERROR: TmThreadsCreate failed for CUDA_PB\n");
+            exit(EXIT_FAILURE);
+        }
+        tv_cuda_PB->type = TVT_PPT;
+
+        tm_module = TmModuleGetByName("CudaPacketBatcher");
+        if (tm_module == NULL) {
+            printf("ERROR: TmModuleGetByName CudaPacketBatcher failed\n");
+            exit(EXIT_FAILURE);
+        }
+        Tm1SlotSetFunc(tv_cuda_PB, tm_module, (void *)de_ctx);
+
+        TmThreadSetCPUAffinity(tv_cuda_PB, 0);
+        if (ncpus > 1)
+            TmThreadSetThreadPriority(tv_cuda_PB, PRIO_MEDIUM);
+
+        if (TmThreadSpawn(tv_cuda_PB) != TM_ECODE_OK) {
+            printf("ERROR: TmThreadSpawn failed\n");
+            exit(EXIT_FAILURE);
+        }
+
+        ThreadVars *tv_stream1 = TmThreadCreatePacketHandler("Stream1",
+                                                             "cuda-pb-queue1", "simple",
+                                                             "stream-queue1", "simple",
+                                                             "1slot");
+        if (tv_stream1 == NULL) {
+            printf("ERROR: TmThreadsCreate failed for Stream1\n");
+            exit(EXIT_FAILURE);
+        }
+        tm_module = TmModuleGetByName("StreamTcp");
+        if (tm_module == NULL) {
+            printf("ERROR: TmModuleGetByName StreamTcp failed\n");
+            exit(EXIT_FAILURE);
+        }
+        Tm1SlotSetFunc(tv_stream1,tm_module,NULL);
+
+        TmThreadSetCPUAffinity(tv_stream1, 0);
+        if (ncpus > 1)
+            TmThreadSetThreadPriority(tv_stream1, PRIO_MEDIUM);
+
+        if (TmThreadSpawn(tv_stream1) != TM_ECODE_OK) {
+            printf("ERROR: TmThreadSpawn failed\n");
+            exit(EXIT_FAILURE);
+        }
+    } else {
+        ThreadVars *tv_decode1 = TmThreadCreatePacketHandler("Decode & Stream",
+                                                             "pickup-queue", "simple",
+                                                             "stream-queue1", "simple",
+                                                             "varslot");
+        if (tv_decode1 == NULL) {
+            printf("ERROR: TmThreadsCreate failed for Decode1\n");
+            exit(EXIT_FAILURE);
+        }
+        tm_module = TmModuleGetByName("DecodePcapFile");
+        if (tm_module == NULL) {
+            printf("ERROR: TmModuleGetByName DecodePcap failed\n");
+            exit(EXIT_FAILURE);
+        }
+        TmVarSlotSetFuncAppend(tv_decode1,tm_module,NULL);
+
+        tm_module = TmModuleGetByName("StreamTcp");
+        if (tm_module == NULL) {
+            printf("ERROR: TmModuleGetByName StreamTcp failed\n");
+            exit(EXIT_FAILURE);
+        }
+        TmVarSlotSetFuncAppend(tv_decode1,tm_module,NULL);
+
+        TmThreadSetCPUAffinity(tv_decode1, 0);
+        if (ncpus > 1)
+            TmThreadSetThreadPriority(tv_decode1, PRIO_MEDIUM);
+
+        if (TmThreadSpawn(tv_decode1) != TM_ECODE_OK) {
+            printf("ERROR: TmThreadSpawn failed\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+#else
 //#if 0
     //ThreadVars *tv_decode1 = TmThreadCreatePacketHandler("Decode & Stream","pickup-queue","simple","packetpool","packetpool","varslot");
     ThreadVars *tv_decode1 = TmThreadCreatePacketHandler("Decode & Stream","pickup-queue","simple","stream-queue1","simple","varslot");
@@ -2336,7 +2448,7 @@ int RunModeFilePcapAuto(DetectEngineCtx *de_ctx, char *file) {
         printf("ERROR: TmThreadSpawn failed\n");
         exit(EXIT_FAILURE);
     }
-
+#endif
 //#if 0
     /* start with cpu 1 so that if we're creating an odd number of detect
      * threads we're not creating the most on CPU0. */
diff --git a/src/suricata.c b/src/suricata.c
index 1e1f7c1919..2b8030c4e8 100644
--- a/src/suricata.c
+++ b/src/suricata.c
@@ -133,6 +133,7 @@
 /* holds the cuda b2g module */
 #include "util-mpm-b2g-cuda.h"
 #include "util-cuda-handlers.h"
+#include "cuda-packet-batcher.h"
 
 #include "output.h"
 #include "util-privs.h"
@@ -218,6 +219,7 @@ SignalHandlerSetup(int sig, void (*handler)())
 void GlobalInits()
 {
     memset(trans_q, 0, sizeof(trans_q));
+    memset(data_queues, 0, sizeof(data_queues));
 
     /* Initialize the trans_q mutex */
     int blah;
@@ -225,6 +227,9 @@ void GlobalInits()
     for(blah=0;blah<256;blah++) {
         r |= SCMutexInit(&trans_q[blah].mutex_q, NULL);
         r |= SCCondInit(&trans_q[blah].cond_q, NULL);
+
+        r |= SCMutexInit(&data_queues[blah].mutex_q, NULL);
+        r |= SCCondInit(&data_queues[blah].cond_q, NULL);
    }
 
     if (r != 0) {
@@ -793,6 +798,7 @@ int main(int argc, char **argv)
     TmModuleLogHttpLogIPv6Register();
 #ifdef __SC_CUDA_SUPPORT__
     TmModuleCudaMpmB2gRegister();
+    TmModuleCudaPacketBatcherRegister();
 #endif
     TmModuleReceiveErfFileRegister();
     TmModuleDecodeErfFileRegister();
@@ -996,10 +1002,15 @@ int main(int argc, char **argv)
             exit(EXIT_FAILURE);
     }
 
+
 #ifdef PROFILING
     SCProfilingInitRuleCounters(de_ctx);
 #endif /* PROFILING */
 
+#ifdef __SC_CUDA_SUPPORT__
+    SCCudaPBSetUpQueuesAndBuffers();
+#endif /* __SC_CUDA_SUPPORT__ */
+
     AppLayerHtpRegisterExtraCallbacks();
     SCThresholdConfInitContext(de_ctx,NULL);
 
@@ -1126,6 +1137,10 @@ int main(int argc, char **argv)
 
             SCLogInfo("time elapsed %" PRIuMAX "s", (uintmax_t)(end_time.tv_sec - start_time.tv_sec));
 
+#ifdef __SC_CUDA_SUPPORT__
+            SCCudaPBKillBatchingPackets();
+#endif
+
             TmThreadKillThreads();
             SCPerfReleaseResources();
             break;
diff --git a/src/suricata.h b/src/suricata.h
index a4ba160bd9..f4f32fbf22 100644
--- a/src/suricata.h
+++ b/src/suricata.h
@@ -26,6 +26,7 @@
 
 #include "suricata-common.h"
 #include "packet-queue.h"
+#include "data-queue.h"
 
 /* the name of our binary */
 #define PROG_NAME "Suricata"
@@ -54,6 +55,8 @@ enum {
  * XXX move to the TmQueue structure later
  */
 PacketQueue trans_q[256];
+
+SCDQDataQueue data_queues[256];
 /* memset to zeros, and mutex init! */
 void GlobalInits();
 
diff --git a/src/tm-modules.h b/src/tm-modules.h
index cdb394369a..817497a871 100644
--- a/src/tm-modules.h
+++ b/src/tm-modules.h
@@ -78,6 +78,7 @@ enum {
     TMM_RECEIVEIPFW,
 #ifdef __SC_CUDA_SUPPORT__
     TMM_CUDA_MPM_B2G,
+    TMM_CUDA_PACKET_BATCHER,
 #endif
     TMM_RECEIVEERFFILE,
     TMM_DECODEERFFILE,
diff --git a/src/tm-queues.c b/src/tm-queues.c
index 7ca0fc6974..80c392447c 100644
--- a/src/tm-queues.c
+++ b/src/tm-queues.c
@@ -52,6 +52,8 @@ Tmq* TmqCreateQueue(char *name) {
     Tmq *q = &tmqs[tmq_id];
     q->name = name;
     q->id = tmq_id++;
+    /* for cuda purposes */
+    q->q_type = 0;
 
     SCLogDebug("created queue \'%s\', %p", name, q);
     return q;
diff --git a/src/tm-queues.h b/src/tm-queues.h
index 022f221e59..01dbb6e704 100644
--- a/src/tm-queues.h
+++ b/src/tm-queues.h
@@ -29,6 +29,8 @@ typedef struct Tmq_ {
     uint16_t id;
     uint16_t reader_cnt;
     uint16_t writer_cnt;
+    /* 0 for packet-queue and 1 for data-queue */
+    uint8_t q_type;
 } Tmq;
 
 Tmq* TmqCreateQueue(char *name);
diff --git a/src/tm-threads.c b/src/tm-threads.c
index c55adc0748..5f091863b0 100644
--- a/src/tm-threads.c
+++ b/src/tm-threads.c
@@ -68,45 +68,6 @@ SCMutex tv_root_lock = PTHREAD_MUTEX_INITIALIZER;
    thread encounters a failure.  Defaults to restart the failed thread */
 uint8_t tv_aof = THV_RESTART_THREAD;
 
-typedef struct TmSlot_ {
-    /* function pointers */
-    TmEcode (*SlotFunc)(ThreadVars *, Packet *, void *, PacketQueue *, PacketQueue *);
-
-    TmEcode (*SlotThreadInit)(ThreadVars *, void *, void **);
-    void (*SlotThreadExitPrintStats)(ThreadVars *, void *);
-    TmEcode (*SlotThreadDeinit)(ThreadVars *, void *);
-
-    /* data storage */
-    void *slot_initdata;
-    void *slot_data;
-
-    /**< queue filled by the SlotFunc with packets that will
-     *   be processed futher _before_ the current packet.
-     *   The locks in the queue are NOT used */
-    PacketQueue slot_pre_pq;
-
-    /**< queue filled by the SlotFunc with packets that will
-     *   be processed futher _after_ the current packet. The
-     *   locks in the queue are NOT used */
-    PacketQueue slot_post_pq;
-
-    /* linked list, only used by TmVarSlot */
-    struct TmSlot_ *slot_next;
-
-    int id; /**< slot id, only used my TmVarSlot to know what the first
-             *   slot is. */
-} TmSlot;
-
-/* 1 function slot */
-typedef struct Tm1Slot_ {
-    TmSlot s;
-} Tm1Slot;
-
-/* Variable number of function slots */
-typedef struct TmVarSlot_ {
-    TmSlot *s;
-} TmVarSlot;
-
 /**
  *  \brief Check if a thread flag is set
  *
@@ -649,11 +610,12 @@ TmEcode TmThreadSetSlots(ThreadVars *tv, char *name, void *(*fn_p)(void *)) {
         size = sizeof(TmVarSlot);
         tv->tm_func = TmThreadsSlotVar;
     } else if (strcmp(name, "custom") == 0) {
+        /* \todo this needs to be changed to support slots of any size */
+        size = sizeof(Tm1Slot);
         if (fn_p == NULL)
             goto error;
 
         tv->tm_func = fn_p;
-        return TM_ECODE_OK;
     } else {
         printf("Error: Slot \"%s\" not supported\n", name);
         goto error;
@@ -1134,8 +1096,12 @@ void TmThreadKillThreads(void) {
                 if (tv->InShutdownHandler != NULL) {
                     tv->InShutdownHandler(tv);
                 }
-                for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++)
-                    SCCondSignal(&trans_q[tv->inq->id].cond_q);
+                for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++) {
+                    if (tv->inq->q_type == 0)
+                        SCCondSignal(&trans_q[tv->inq->id].cond_q);
+                    else
+                        SCCondSignal(&data_queues[tv->inq->id].cond_q);
+                }
 
                 /* to be sure, signal more */
                 int cnt = 0;
@@ -1151,9 +1117,12 @@ void TmThreadKillThreads(void) {
                         tv->InShutdownHandler(tv);
                     }
 
-                    for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++)
-                        SCCondSignal(&trans_q[tv->inq->id].cond_q);
-
+                    for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++) {
+                        if (tv->inq->q_type == 0)
+                            SCCondSignal(&trans_q[tv->inq->id].cond_q);
+                        else
+                            SCCondSignal(&data_queues[tv->inq->id].cond_q);
+                    }
                     usleep(100);
                 }
 
diff --git a/src/tm-threads.h b/src/tm-threads.h
index 0c30c2b771..20f0b5b879 100644
--- a/src/tm-threads.h
+++ b/src/tm-threads.h
@@ -34,6 +34,34 @@ enum {
     TVT_MAX,
 };
 
+
+typedef struct TmSlot_ {
+    /* function pointers */
+    TmEcode (*SlotFunc)(ThreadVars *, Packet *, void *, PacketQueue *);
+
+    TmEcode (*SlotThreadInit)(ThreadVars *, void *, void **);
+    void (*SlotThreadExitPrintStats)(ThreadVars *, void *);
+    TmEcode (*SlotThreadDeinit)(ThreadVars *, void *);
+
+    /* data storage */
+    void *slot_initdata;
+    void *slot_data;
+    PacketQueue slot_pq;
+
+    /* linked list, only used by TmVarSlot */
+    struct TmSlot_ *slot_next;
+} TmSlot;
+
+/* 1 function slot */
+typedef struct Tm1Slot_ {
+    TmSlot s;
+} Tm1Slot;
+
+/* Variable number of function slots */
+typedef struct TmVarSlot_ {
+    TmSlot *s;
+} TmVarSlot;
+
 extern ThreadVars *tv_root[TVT_MAX];
 
 extern SCMutex tv_root_lock;
diff --git a/src/tmqh-simple.c b/src/tmqh-simple.c
index 461928149e..fee8069daf 100644
--- a/src/tmqh-simple.c
+++ b/src/tmqh-simple.c
@@ -90,19 +90,26 @@ void TmqhOutputSimple(ThreadVars *t, Packet *p)
     SCMutexUnlock(&q->mutex_q);
 }
 
+/*******************************Generic-Q-Handlers*****************************/
+
 /**
  * \brief Public version of TmqhInputSimple from the tmqh-simple queue
  *        handler, except that it is a generic version that is directly
- *        tied to a PacketQueue instance.
+ *        tied to a "SCDQDataQueue" instance(sent as an arg).
+ *
+ *        Retrieves a data_instance from the queue.  If the queue is empty, it
+ *        waits on the queue, till a data_instance is enqueued into the queue
+ *        by some other module.
  *
- *        Retrieves a packet from the queue.  If the queue is empty, it waits
- *        on the queue, till a packet is enqueued into the queue.
+ *        All references to "data_instance" means a reference to a data structure
+ *        instance that implements the template "struct SCDQGenericQData_".
  *
- * \param q The PacketQueue instance to wait on.
+ * \param q The SCDQDataQueue instance to wait on.
  *
  * \retval p The returned packet from the queue.
+ * \retval data The returned data_instance from the queue.
  */
-Packet *TmqhInputSimpleOnQ(PacketQueue *q)
+SCDQGenericQData *TmqhInputSimpleOnQ(SCDQDataQueue *q)
 {
     SCMutexLock(&q->mutex_q);
     if (q->len == 0) {
@@ -111,11 +118,12 @@ Packet *TmqhInputSimpleOnQ(PacketQueue *q)
     }
 
     if (q->len > 0) {
-        Packet *p = PacketDequeue(q);
+        SCDQGenericQData *data = SCDQDataDequeue(q);
         SCMutexUnlock(&q->mutex_q);
-        return p;
+        return data;
     } else {
-        /* return NULL if we have no pkt. Should only happen on signals. */
+        /* return NULL if we have no data in the queue. Should only happen
+         * on signals. */
         SCMutexUnlock(&q->mutex_q);
         return NULL;
     }
@@ -124,17 +132,23 @@ Packet *TmqhInputSimpleOnQ(PacketQueue *q)
 /**
  * \brief Public version of TmqhOutputSimple from the tmqh-simple queue
  *        handler, except that it is a generic version that is directly
- *        tied to a PacketQueue instance.
+ *        tied to a SCDQDataQueue instance(sent as an arg).
+ *
+ *        Pumps out a data_instance into the queue.  If the queue is empty, it
+ *        waits on the queue, till a data_instance is enqueued into the queue.
  *
- *        Enqueues a packet into the packet queue.
+ *        All references to "data_instance" means a reference to a data structure
+ *        instance that implements the template "struct SCDQGenericQData_".
  *
- * \param q The PacketQueue instance to enqueue the packet into.
- * \param p The packet to be enqueued into the above queue.
+ * \param q    The SCDQDataQueue instance to pump the data into.
+ * \param data The data instance to be enqueued.
  */
-void TmqhOutputSimpleOnQ(PacketQueue *q, Packet *p)
+void TmqhOutputSimpleOnQ(SCDQDataQueue *q, SCDQGenericQData *data)
 {
     SCMutexLock(&q->mutex_q);
-    PacketEnqueue(q, p);
+    SCDQDataEnqueue(q, data);
     SCCondSignal(&q->cond_q);
     SCMutexUnlock(&q->mutex_q);
+
+    return;
 }
diff --git a/src/tmqh-simple.h b/src/tmqh-simple.h
index 54f69e69c2..1d4417b4e8 100644
--- a/src/tmqh-simple.h
+++ b/src/tmqh-simple.h
@@ -24,8 +24,11 @@
 #ifndef __TMQH_SIMPLE_H__
 #define __TMQH_SIMPLE_H__
 
-Packet *TmqhInputSimpleOnQ(PacketQueue *);
-void TmqhOutputSimpleOnQ(PacketQueue *, Packet *);
+#include "data-queue.h"
+
+SCDQGenericQData *TmqhInputSimpleOnQ(SCDQDataQueue *);
+void TmqhOutputSimpleOnQ(SCDQDataQueue *, SCDQGenericQData *);
+
 void TmqhSimpleRegister (void);
 
 #endif /* __TMQH_SIMPLE_H__ */
diff --git a/src/util-cuda-handlers.c b/src/util-cuda-handlers.c
index 0789e37d10..e6b51eba87 100644
--- a/src/util-cuda-handlers.c
+++ b/src/util-cuda-handlers.c
@@ -100,6 +100,72 @@ SCCudaHlModuleData *SCCudaHlGetModuleData(uint8_t handle)
     return data;
 }
 
+/**
+ * \brief Returns a cuda_module against the handle in the argument.
+ *
+ *        If a cuda_module is not present for a handle, it is created
+ *        and associated with this handle and the cuda_module is returned
+ *        in the argument.  If a cuda_module is already present for
+ *        a handle, it is returned.
+ *
+ * \param p_context Pointer to a cuda context instance that should be updated
+ *                  with a cuda context.
+ * \param handle    A unique handle which identifies a module.  Obtained from
+ *                  a call to SCCudaHlGetUniqueHandle().
+ *
+ * \retval  0 On success.
+ * \retval -1 On failure.
+ */
+int SCCudaHlGetCudaModuleFromFile(CUmodule *p_module, const char *filename, int handle)
+{
+    SCCudaHlModuleData *data = NULL;
+
+    if (p_module == NULL) {
+        SCLogError(SC_ERR_INVALID_ARGUMENTS, "Error invalid arguments"
+                   "p_module NULL");
+        return -1;
+    }
+
+    /* check if the particular module that wants a CUDA module is already
+     * registered or not.  If it is registered, check if a context has
+     * been associated with the module.  If yes, then we can go ahead and
+     * create a cuda module or return the reference to the cuda module if
+     * we already have a cuda module associated with the module.  If no, "
+     * log warning and get out of here */
+    if ( ((data = SCCudaHlGetModuleData(handle)) == NULL) ||
+         (data->cuda_context == 0)) {
+        SCLogDebug("Module not registered or no cuda context associated with "
+                   "this module.  You can't create a CUDA module without"
+                   "associatin a context with a module first. To use this "
+                   "registration facility, first register a module using "
+                   "context using SCCudaHlRegisterModule(), and then register "
+                   "a cuda context with that module using "
+                   "SCCudaHlGetCudaContext(), after which you can call this "
+                   "function ");
+        return -1;
+    }
+
+    /* we already have a cuda module associated with this module.  Return the
+     * cuda module */
+    if (data->cuda_module != 0) {
+        p_module[0] = data->cuda_module;
+        return 0;
+    }
+
+    /* we don't have a cuda module associated with this module.  Create a
+     * cuda module, update the module with this cuda module reference and
+     * then return the module refernce back to the calling function using
+     * the argument */
+    if (SCCudaModuleLoad(p_module, filename) == -1)
+        goto error;
+    data->cuda_module = p_module[0];
+
+    return 0;
+
+ error:
+    return -1;
+}
+
 /**
  * \internal
  * \brief Get a unique handle for a new module registration.  This new handle
@@ -347,10 +413,12 @@ int SCCudaHlGetCudaDevicePtr(CUdeviceptr *device_ptr, const char *name,
         }
     }
 
+    /* send the newly assigned device pointer back to the caller */
+    device_ptr[0] = new_module_device_ptr->d_ptr;
+
     /* insert it into the device_ptr list for the module instance */
     if (data->device_ptrs == NULL) {
         data->device_ptrs = new_module_device_ptr;
-        device_ptr[0] = new_module_device_ptr->d_ptr;
         return 0;
     }
 
@@ -367,6 +435,76 @@ int SCCudaHlGetCudaDevicePtr(CUdeviceptr *device_ptr, const char *name,
     return -1;
 }
 
+/**
+ * \brief Frees a Cuda Device Pointer.
+ *
+ *        If a device pointer by the name \"name\"  is registered for this
+ *        handle, it is freed.
+ *
+ * \param name       Name of the device pointer by which we have to search
+ *                   module for its existance.
+ * \param handle     A unique handle which identifies a module.  Obtained from
+ *                   a call to SCCudaHlGetUniqueHandle().
+ *
+ * \retval  0 On success.
+ * \retval -1 On failure.
+ */
+int SCCudaHlFreeCudaDevicePtr(const char *name, int handle)
+{
+    SCCudaHlModuleData *data = NULL;
+    SCCudaHlModuleDevicePointer *module_device_ptr = NULL;
+    SCCudaHlModuleDevicePointer *temp_module_device_ptr = NULL;
+
+    if (name == NULL) {
+        SCLogError(SC_ERR_INVALID_ARGUMENTS, "Error invalid arguments"
+                   "device_ptr is NULL or name is NULL");
+        goto error;
+    }
+
+    /* check if the particular module that wants to free device memory is
+     * already registered or not.  If it is registered, check if a context has
+     * been associated with the module.  If yes, then we can go ahead and
+     * free the device memory.
+     */
+    if ( ((data = SCCudaHlGetModuleData(handle)) == NULL) ||
+         (data->cuda_context == 0)) {
+        SCLogDebug("Module not registered or no cuda context associated with "
+                   "this module.  You can't create a CUDA module without"
+                   "associatin a context with a module first. To use this "
+                   "registration facility, first register a module using "
+                   "context using SCCudaHlRegisterModule(), and then register "
+                   "a cuda context with that module using "
+                   "SCCudaHlGetCudaContext(), after which you can call this "
+                   "function ");
+        goto error;
+    }
+
+    /* if we already have a device pointer registered by this name return the
+     * cuda device pointer instance */
+    if ( (module_device_ptr = SCCudaHlCudaDevicePtrAvailable(data, name)) == NULL) {
+        goto error;
+    }
+
+    SCCudaMemFree(module_device_ptr->d_ptr);
+    module_device_ptr->d_ptr = 0;
+    if (module_device_ptr == data->device_ptrs) {
+        data->device_ptrs = data->device_ptrs->next;
+    } else {
+        temp_module_device_ptr = data->device_ptrs;
+        while (strcmp(temp_module_device_ptr->next->name, name) != 0) {
+            temp_module_device_ptr = temp_module_device_ptr->next;
+        }
+        temp_module_device_ptr->next = temp_module_device_ptr->next->next;
+    }
+    free(module_device_ptr->name);
+    free(module_device_ptr);
+
+    return 0;
+
+ error:
+    return -1;
+}
+
 /**
  * \brief Registers a Dispatcher function against this handle.
  *
@@ -441,8 +579,11 @@ int SCCudaHlGetModuleHandle(const char *name)
         data = data->next;
     }
 
-    if (data == NULL)
+    if (data == NULL) {
+        SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "A cuda module by the name \"%s\" "
+                   "hasn't been registered", name);
         return -1;
+    }
 
     return data->handle;
 }
@@ -675,72 +816,4 @@ int SCCudaHlTestEnvCudaContextDeInit(void)
     return 1;
 }
 
-void SCCudaHlProcessPacketWithDispatcher(Packet *p, DetectEngineThreadCtx *det_ctx,
-                                         void *result)
-{
-    Packet *out_p = NULL;
-
-    p->cuda_mpm_ctx = det_ctx->sgh->mpm_ctx;
-    p->cuda_mtc = &det_ctx->mtc;
-    p->cuda_pmq = &det_ctx->pmq;
-    /* this outq is unique to this detection thread instance.  The dispatcher thread
-     * would use this queue to pump the packets back to this detection thread once
-     * it has processed the packet */
-    p->cuda_outq = &trans_q[det_ctx->cuda_mpm_rc_disp_outq->id];
-    /* for now it is hardcoded.  \todo Make the access to the right queue or the
-     * ThreadVars generic */
-
-    /* Push the packet into the dispatcher's input queue */
-    B2gCudaPushPacketTo_tv_CMB2_RC(p);
-
-    /* wait for the dispatcher to process and return the packet we pushed */
-    out_p = TmqhInputSimpleOnQ(&trans_q[det_ctx->cuda_mpm_rc_disp_outq->id]);
-
-    /* todo make this generic, so that if we have more than 2 modules using the
-     * cuda interface, we can call update function for the module that has
-     * queued the packet and retrieve the results */
-    *((uint32_t *)result) = p->cuda_matches;
-
-    return;
-}
-
-void SCCudaHlProcessUriWithDispatcher(uint8_t *uri, uint16_t uri_len,
-                                      DetectEngineThreadCtx *det_ctx,
-                                      void *result)
-{
-    Packet *out_p = NULL;
-
-    Packet *p = SCMalloc(sizeof(Packet));
-    if (p == NULL) {
-        SCLogError(SC_ERR_FATAL, "Fatal error encountered in SCCudaHlProcessUriWithDispatcher. Exiting...");
-        exit(EXIT_FAILURE);
-    }
-    memset(p, 0, sizeof(Packet));
-
-    p->cuda_mpm_ctx = det_ctx->sgh->mpm_uri_ctx;
-    p->cuda_mtc = &det_ctx->mtcu;
-    p->cuda_pmq = &det_ctx->pmq;
-    p->payload = uri;
-    p->payload_len = uri_len;
-    /* this outq is unique to this detection thread instance.  The dispatcher thread
-     * would use this queue to pump the packets back to this detection thread once
-     * it has processed the packet */
-    p->cuda_outq = &trans_q[det_ctx->cuda_mpm_rc_disp_outq->id];
-
-    /* Push the packet into the dispatcher's input queue */
-    B2gCudaPushPacketTo_tv_CMB2_RC(p);
-
-    /* wait for the dispatcher to process and return the packet we pushed */
-    out_p = TmqhInputSimpleOnQ(&trans_q[det_ctx->cuda_mpm_rc_disp_outq->id]);
-
-    /* todo make this generic, so that if we have more than 2 modules using the
-     * cuda interface, we can call update function for the module that has
-     * queued the packet and retrieve the results */
-    *((uint32_t *)result) = p->cuda_matches;
-
-    SCFree(p);
-
-    return;
-}
-
 #endif /* __SC_CUDA_SUPPORT */
diff --git a/src/util-cuda-handlers.h b/src/util-cuda-handlers.h
index f2faebe4af..cae7d595dd 100644
--- a/src/util-cuda-handlers.h
+++ b/src/util-cuda-handlers.h
@@ -63,7 +63,9 @@ typedef struct SCCudaHlModuleData_ {
 
 int SCCudaHlGetCudaContext(CUcontext *, int);
 int SCCudaHlGetCudaModule(CUmodule *, const char *, int);
+int SCCudaHlGetCudaModuleFromFile(CUmodule *, const char *, int);
 int SCCudaHlGetCudaDevicePtr(CUdeviceptr *, const char *, size_t, void *, int);
+int SCCudaHlFreeCudaDevicePtr(const char *, int);
 int SCCudaHlRegisterDispatcherFunc(void *(*SCCudaHlDispFunc)(void *), int);
 
 SCCudaHlModuleData *SCCudaHlGetModuleData(uint8_t);
diff --git a/src/util-mpm-b2g-cuda-kernel.cu b/src/util-mpm-b2g-cuda-kernel.cu
index f86e9998ef..d94cda7d00 100644
--- a/src/util-mpm-b2g-cuda-kernel.cu
+++ b/src/util-mpm-b2g-cuda-kernel.cu
@@ -27,50 +27,59 @@
  */
 
 #define B2G_CUDA_Q 2
-#define CUDA_THREADS 16
+#define CUDA_THREADS 4000
 #define B2G_CUDA_HASHSHIFT 4
 #define B2G_CUDA_TYPE unsigned int
 #define B2G_CUDA_HASH16(a, b) (((a) << B2G_CUDA_HASHSHIFT) | (b))
 #define u8_tolower(c) g_u8_lowercasetable[(c)]
 
+typedef struct SCCudaPBPacketDataForGPU_ {
+    /* holds the value B2gCtx->m */
+    unsigned int m;
+    /* holds B2gCtx->B2g */
+    unsigned int table;
+    /* holds the length of the payload */
+    unsigned int payload_len;
+    /* holds the payload */
+    unsigned char payload;
+} SCCudaPBPacketDataForGPU;
+
 extern "C"
-__global__ void B2gCudaSearchBNDMq(unsigned int *offsets,
-                                   unsigned int *B2G,
-                                   unsigned char *g_u8_lowercasetable,
-                                   unsigned char *buf,
-                                   unsigned short arg_buflen,
-                                   unsigned int m)
-{
+__global__ void B2gCudaSearchBNDMq(unsigned short *results_buffer,
+                                   unsigned char *packets_buffer,
+                                   unsigned int *packets_offset_buffer,
+                                   unsigned int *packets_payload_offset_buffer,
+                                   unsigned int nop,
+                                   unsigned char *g_u8_lowercasetable)
+ {
+    unsigned int tid = blockIdx.x * 32 + threadIdx.x;
+    /* if the thread id is greater than the no of packets sent in the packets
+     * buffer, terminate the thread */
+    //if (tid <= nop)
+    if (tid >= nop)
+        return;
+
+    SCCudaPBPacketDataForGPU *packet = (SCCudaPBPacketDataForGPU *)(packets_buffer + packets_offset_buffer[tid]);
+    unsigned int m = packet->m;
+    unsigned char *buf = &packet->payload;
+    unsigned int buflen = packet->payload_len;
+    unsigned int *B2G = (unsigned int *)packet->table;
     unsigned int pos = m - B2G_CUDA_Q + 1;
     B2G_CUDA_TYPE d;
     unsigned short h;
-    unsigned int j;
     unsigned int first;
-    unsigned int tid = threadIdx.x;
-    unsigned short tid_chunk = arg_buflen / CUDA_THREADS;
-    unsigned short jump;
-    unsigned short buflen;
-
-    if (tid_chunk < m)
-        tid_chunk = m;
-
-    jump = tid_chunk * tid;
-    if ((jump + tid_chunk) > arg_buflen)
-        return;
+    unsigned int j = 0;
 
-    buflen = tid_chunk * 2 - 1;
-    if ((tid == CUDA_THREADS - 1) || ((jump + buflen) > arg_buflen)) {
-        buflen = arg_buflen - jump;
-    }
-
-    j = 0;
-    while (j < buflen) {
-        offsets[jump + j] = 0;
-        j++;
-    }
+    unsigned short *matches_count = results_buffer + packets_payload_offset_buffer[tid] + tid;
+    //unsigned short *matches_count = results_buffer + packets_payload_offset_buffer[1] + 1;
+    //unsigned short *offsets = results_buffer + packets_payload_offset_buffer[1] + 1 + 1;
+    unsigned short *offsets = matches_count + 1;
+    // temporarily hold the results here, before we shift it to matches_count
+    // before returning
+    unsigned short matches = 0;
 
     while (pos <= (buflen - B2G_CUDA_Q + 1)) {
-        h = B2G_CUDA_HASH16(u8_tolower(buf[jump + pos - 1]), u8_tolower(buf[jump + pos]));
+        h = B2G_CUDA_HASH16(u8_tolower(buf[pos - 1]), u8_tolower(buf[pos]));
         d = B2G[h];
 
         if (d != 0) {
@@ -83,19 +92,21 @@ __global__ void B2gCudaSearchBNDMq(unsigned int *offsets,
                     if (j > first) {
                         pos = j;
                     } else {
-                        offsets[j + jump] = 1;
+                        offsets[matches++] = j;
                     }
                 }
 
                 if (j == 0)
                     break;
 
-                h = B2G_CUDA_HASH16(u8_tolower(buf[jump + j - 1]), u8_tolower(buf[jump + j]));
+                h = B2G_CUDA_HASH16(u8_tolower(buf[j - 1]), u8_tolower(buf[j]));
                 d = (d << 1) & B2G[h];
             } while (d != 0);
         }
         pos = pos + m - B2G_CUDA_Q + 1;
     }
 
+    matches_count[0] = matches;
+
     return;
 }
diff --git a/src/util-mpm-b2g-cuda.c b/src/util-mpm-b2g-cuda.c
index 52dcc53a17..24b36651c1 100644
--- a/src/util-mpm-b2g-cuda.c
+++ b/src/util-mpm-b2g-cuda.c
@@ -45,6 +45,13 @@
 #include "threads.h"
 #include "tmqh-simple.h"
 
+#include "detect-engine-address.h"
+#include "detect-engine-port.h"
+#include "detect-engine.h"
+#include "detect-parse.h"
+
+#include "cuda-packet-batcher.h"
+
 /* macros decides if cuda is enabled for the platform or not */
 #ifdef __SC_CUDA_SUPPORT__
 
@@ -63,17 +70,6 @@ static void *b2g_func;
 /* threadvars Cuda(C) Mpm(M) B2G(B) Rules(R) Content(C) */
 ThreadVars *tv_CMB2_RC = NULL;
 
-/**
- * \todo Would break on x86_64 I believe.  We will fix this in a later version.
- */
-#define B2G_CUDA_KERNEL_ARG0_OFFSET     0
-#define B2G_CUDA_KERNEL_ARG1_OFFSET     4
-#define B2G_CUDA_KERNEL_ARG2_OFFSET     8
-#define B2G_CUDA_KERNEL_ARG3_OFFSET    12
-#define B2G_CUDA_KERNEL_ARG4_OFFSET    16
-#define B2G_CUDA_KERNEL_ARG5_OFFSET    20
-#define B2G_CUDA_KERNEL_TOTAL_ARG_SIZE 24
-
 void B2gCudaInitCtx(MpmCtx *, int);
 void B2gCudaThreadInitCtx(MpmCtx *, MpmThreadCtx *, uint32_t);
 void B2gCudaDestroyCtx(MpmCtx *);
@@ -100,201 +96,171 @@ void B2gCudaPrintInfo(MpmCtx *);
 void B2gCudaPrintSearchStats(MpmThreadCtx *);
 void B2gCudaRegisterTests(void);
 
-/* for debugging purposes.  keep it for now */
-int arg0 = 0;
-int arg1 = 0;
-int arg2 = 0;
-int arg3 = 0;
-int arg4 = 0;
-int arg5 = 0;
-int arg_total = 0;
-
 #if defined(__x86_64__) || defined(__ia64__)
 const char *b2g_cuda_ptx_image_64_bit =
-    "        .version 1.4\n"
-    "        .target sm_10, map_f64_to_f32\n"
-    "        .entry B2gCudaSearchBNDMq (\n"
-    "               .param .u64 __cudaparm_B2gCudaSearchBNDMq_offsets,\n"
-    "               .param .u64 __cudaparm_B2gCudaSearchBNDMq_B2G,\n"
-    "               .param .u64 __cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable,\n"
-    "               .param .u64 __cudaparm_B2gCudaSearchBNDMq_buf,\n"
-    "               .param .u16 __cudaparm_B2gCudaSearchBNDMq_arg_buflen,\n"
-    "               .param .u32 __cudaparm_B2gCudaSearchBNDMq_m)\n"
-    "       {\n"
-    "       .reg .u16 %rh<6>;\n"
-    "       .reg .u32 %r<58>;\n"
-    "       .reg .u64 %rd<31>;\n"
-    "       .reg .pred %p<14>;\n"
-    "       .loc    15      25      0\n"
+    "	.version 1.4\n"
+    "	.target sm_10, map_f64_to_f32\n"
+    "	.entry B2gCudaSearchBNDMq (\n"
+    "		.param .u64 __cudaparm_B2gCudaSearchBNDMq_results_buffer,\n"
+    "		.param .u64 __cudaparm_B2gCudaSearchBNDMq_packets_buffer,\n"
+    "		.param .u64 __cudaparm_B2gCudaSearchBNDMq_packets_offset_buffer,\n"
+    "		.param .u64 __cudaparm_B2gCudaSearchBNDMq_packets_payload_offset_buffer,\n"
+    "		.param .u32 __cudaparm_B2gCudaSearchBNDMq_nop,\n"
+    "		.param .u64 __cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable)\n"
+    "	{\n"
+    "	.reg .u16 %rh<7>;\n"
+    "	.reg .u32 %r<38>;\n"
+    "	.reg .u64 %rd<41>;\n"
+    "	.reg .pred %p<10>;\n"
+    "	.loc	3	36	0\n"
     "$LBB1_B2gCudaSearchBNDMq:\n"
-    "       .loc    15      27      0\n"
-    "       ld.param.u32    %r1, [__cudaparm_B2gCudaSearchBNDMq_m];\n"
-    "       sub.u32         %r2, %r1, 1;\n"
-    "       mov.s32         %r3, %r2;\n"
-    "       .loc    15      33      0\n"
-    "       ld.param.u16    %r4, [__cudaparm_B2gCudaSearchBNDMq_arg_buflen];\n"
-    "       shr.u32         %r5, %r4, 4;\n"
-    "       cvt.u16.u32     %r6, %r5;\n"
-    "       mov.s32         %r7, %r6;\n"
-    "       setp.ge.u32     %p1, %r6, %r1;\n"
-    "       @%p1 bra        $Lt_0_8450;\n"
-    "       .loc    15      38      0\n"
-    "       cvt.u16.u32     %r7, %r1;\n"
+    "	mov.u16 	%rh1, %ctaid.x;\n"
+    "	mul.wide.u16 	%r1, %rh1, 32;\n"
+    "	cvt.u32.u16 	%r2, %tid.x;\n"
+    "	add.u32 	%r3, %r2, %r1;\n"
+    "	ld.param.u32 	%r4, [__cudaparm_B2gCudaSearchBNDMq_nop];\n"
+    "	setp.gt.u32 	%p1, %r4, %r3;\n"
+    "	@%p1 bra 	$Lt_0_5634;\n"
+    "	bra.uni 	$LBB17_B2gCudaSearchBNDMq;\n"
+    "$Lt_0_5634:\n"
+    "	.loc	3	45	0\n"
+    "	cvt.u64.u32 	%rd1, %r3;\n"
+    "	mul.lo.u64 	%rd2, %rd1, 4;\n"
+    "	ld.param.u64 	%rd3, [__cudaparm_B2gCudaSearchBNDMq_packets_offset_buffer];\n"
+    "	add.u64 	%rd4, %rd3, %rd2;\n"
+    "	ld.global.u32 	%r5, [%rd4+0];\n"
+    "	cvt.u64.u32 	%rd5, %r5;\n"
+    "	ld.param.u64 	%rd6, [__cudaparm_B2gCudaSearchBNDMq_packets_buffer];\n"
+    "	add.u64 	%rd7, %rd5, %rd6;\n"
+    "	.loc	3	46	0\n"
+    "	ld.global.u32 	%r6, [%rd7+0];\n"
+    "	.loc	3	48	0\n"
+    "	ld.global.u32 	%r7, [%rd7+8];\n"
+    "	.loc	3	49	0\n"
+    "	ld.global.u32 	%r8, [%rd7+4];\n"
+    "	cvt.u64.u32 	%rd8, %r8;\n"
+    "	.loc	3	50	0\n"
+    "	sub.u32 	%r9, %r6, 1;\n"
+    "	mov.s32 	%r10, %r9;\n"
+    "	.loc	3	56	0\n"
+    "	ld.param.u64 	%rd9, [__cudaparm_B2gCudaSearchBNDMq_results_buffer];\n"
+    "	ld.param.u64 	%rd10, [__cudaparm_B2gCudaSearchBNDMq_packets_payload_offset_buffer];\n"
+    "	add.u64 	%rd11, %rd10, %rd2;\n"
+    "	ld.global.u32 	%r11, [%rd11+0];\n"
+    "	cvt.u64.u32 	%rd12, %r11;\n"
+    "	add.u64 	%rd13, %rd12, %rd1;\n"
+    "	mul.lo.u64 	%rd14, %rd13, 2;\n"
+    "	add.u64 	%rd15, %rd9, %rd14;\n"
+    "	sub.u32 	%r12, %r7, 1;\n"
+    "	setp.gt.u32 	%p2, %r9, %r12;\n"
+    "	mov.u32 	%r13, 0;\n"
+    "	@%p2 bra 	$Lt_0_9474;\n"
+    "	add.u64 	%rd16, %rd7, 12;\n"
+    "	add.u64 	%rd17, %rd15, 2;\n"
+    "	ld.param.u64 	%rd18, [__cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable];\n"
+    "$Lt_0_6658:\n"
+    " //<loop> Loop body line 66\n"
+    "	.loc	3	66	0\n"
+    "	cvt.u64.u32 	%rd19, %r10;\n"
+    "	add.u64 	%rd20, %rd19, %rd7;\n"
+    "	ld.global.u8 	%rh2, [%rd20+12];\n"
+    "	cvt.u64.u8 	%rd21, %rh2;\n"
+    "	add.u64 	%rd22, %rd21, %rd18;\n"
+    "	ld.global.u8 	%r14, [%rd22+0];\n"
+    "	ld.global.u8 	%rh3, [%rd20+11];\n"
+    "	cvt.u64.u8 	%rd23, %rh3;\n"
+    "	add.u64 	%rd24, %rd23, %rd18;\n"
+    "	ld.global.u8 	%r15, [%rd24+0];\n"
+    "	shl.b32 	%r16, %r15, 4;\n"
+    "	or.b32 	%r17, %r14, %r16;\n"
+    "	cvt.u64.u32 	%rd25, %r17;\n"
+    "	mul.lo.u64 	%rd26, %rd25, 4;\n"
+    "	add.u64 	%rd27, %rd8, %rd26;\n"
+    "	ld.global.u32 	%r18, [%rd27+0];\n"
+    "	mov.u32 	%r19, 0;\n"
+    "	setp.eq.u32 	%p3, %r18, %r19;\n"
+    "	@%p3 bra 	$Lt_0_258;\n"
+    " //<loop> Part of loop body line 66, head labeled $Lt_0_6658\n"
+    "	.loc	3	69	0\n"
+    "	mov.s32 	%r20, %r10;\n"
+    "	.loc	3	70	0\n"
+    "	sub.u32 	%r21, %r10, %r6;\n"
+    "	add.u32 	%r22, %r21, 1;\n"
+    "	sub.s32 	%r23, %r6, 1;\n"
+    "$Lt_0_7682:\n"
+    " //<loop> Loop body line 73\n"
+    "	.loc	3	73	0\n"
+    "	sub.u32 	%r20, %r20, 1;\n"
+    "	shr.u32 	%r24, %r18, %r23;\n"
+    "	mov.u32 	%r25, 0;\n"
+    "	setp.eq.u32 	%p4, %r24, %r25;\n"
+    "	@%p4 bra 	$Lt_0_8450;\n"
+    " //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
+    "	setp.le.u32 	%p5, %r20, %r22;\n"
+    "	@%p5 bra 	$Lt_0_8706;\n"
+    " //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
+    "	.loc	3	76	0\n"
+    "	mov.s32 	%r10, %r20;\n"
+    "	bra.uni 	$Lt_0_8450;\n"
+    "$Lt_0_8706:\n"
+    " //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
+    "	.loc	3	78	0\n"
+    "	mov.s32 	%r26, %r13;\n"
+    "	add.u32 	%r27, %r26, 1;\n"
+    "	cvt.u16.u32 	%r13, %r27;\n"
+    "	cvt.u64.u32 	%rd28, %r26;\n"
+    "	mul.lo.u64 	%rd29, %rd28, 2;\n"
+    "	add.u64 	%rd30, %rd15, %rd29;\n"
+    "	st.global.u16 	[%rd30+2], %r20;\n"
     "$Lt_0_8450:\n"
-    "       cvt.u32.u16     %r8, %tid.x;\n"
-    "       mul.lo.u32      %r9, %r7, %r8;\n"
-    "       cvt.u16.u32     %r10, %r9;\n"
-    "       add.s32         %r11, %r7, %r10;\n"
-    "       setp.ge.s32     %p2, %r4, %r11;\n"
-    "       @%p2 bra        $Lt_0_8962;\n"
-    "       bra.uni         $LBB23_B2gCudaSearchBNDMq;\n"
-    "$Lt_0_8962:\n"
-    "       .loc    15      44      0\n"
-    "       mul24.lo.s32    %r12, %r7, 2;\n"
-    "       sub.s32         %r13, %r12, 1;\n"
-    "       mov.s32         %r14, %r13;\n"
-    "       cvt.u16.u32     %r15, %r14;\n"
-    "       mov.s32         %r16, %r15;\n"
-    "       add.s32         %r17, %r10, %r15;\n"
-    "       set.lt.u32.s32  %r18, %r4, %r17;\n"
-    "       neg.s32         %r19, %r18;\n"
-    "       mov.u32         %r20, 15;\n"
-    "       set.eq.u32.u32  %r21, %r8, %r20;\n"
-    "       neg.s32         %r22, %r21;\n"
-    "       or.b32  %r23, %r19, %r22;\n"
-    "       mov.u32         %r24, 0;\n"
-    "       setp.eq.s32     %p3, %r23, %r24;\n"
-    "       @%p3 bra        $Lt_0_9474;\n"
-    "       .loc    15      46      0\n"
-    "       sub.u32         %r25, %r4, %r9;\n"
-    "       cvt.u16.u32     %r16, %r25;\n"
-    "$Lt_0_9474:\n"
-    "       mov.u32         %r26, 0;\n"
-    "       setp.eq.u32     %p4, %r16, %r26;\n"
-    "       @%p4 bra        $Lt_0_9986;\n"
-    "       mov.s32         %r27, %r16;\n"
-    "       ld.param.u64    %rd1, [__cudaparm_B2gCudaSearchBNDMq_offsets];\n"
-    "       mov.u32         %r28, 0;\n"
-    "       mov.s32         %r29, %r27;\n"
-    "$Lt_0_10498:\n"
-    " //<loop> Loop body line 46, nesting depth: 1, estimated iterations: unknown\n"
-    "       .loc    15      51      0\n"
-    "       mov.u32         %r30, 0;\n"
-    "       add.u32         %r31, %r10, %r28;\n"
-    "       cvt.u64.u32     %rd2, %r31;\n"
-    "       mul.lo.u64      %rd3, %rd2, 4;\n"
-    "       add.u64         %rd4, %rd1, %rd3;\n"
-    "       st.global.u32   [%rd4+0], %r30;\n"
-    "       add.u32         %r28, %r28, 1;\n"
-    "       setp.ne.u32     %p5, %r16, %r28;\n"
-    "       @%p5 bra        $Lt_0_10498;\n"
-    "$Lt_0_9986:\n"
-    "       sub.u32         %r32, %r16, 1;\n"
-    "       setp.gt.u32     %p6, %r2, %r32;\n"
-    "       @%p6 bra        $LBB23_B2gCudaSearchBNDMq;\n"
-    "       ld.param.u64    %rd5, [__cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable];\n"
-    "       ld.param.u64    %rd6, [__cudaparm_B2gCudaSearchBNDMq_B2G];\n"
-    "       ld.param.u64    %rd7, [__cudaparm_B2gCudaSearchBNDMq_buf];\n"
-    "$Lt_0_11522:\n"
-    " //<loop> Loop body line 57\n"
-    "       .loc    15      57      0\n"
-    "       add.u32         %r33, %r10, %r3;\n"
-    "       cvt.u64.u32     %rd8, %r33;\n"
-    "       add.u64         %rd9, %rd8, %rd7;\n"
-    "       ld.global.u8    %rh1, [%rd9+0];\n"
-    "       cvt.u64.u8      %rd10, %rh1;\n"
-    "       add.u64         %rd11, %rd10, %rd5;\n"
-    "       ld.global.u8    %r34, [%rd11+0];\n"
-    "       ld.global.u8    %rh2, [%rd9+-1];\n"
-    "       cvt.u64.u8      %rd12, %rh2;\n"
-    "       add.u64         %rd13, %rd12, %rd5;\n"
-    "       ld.global.u8    %r35, [%rd13+0];\n"
-    "       shl.b32         %r36, %r35, 4;\n"
-    "       or.b32  %r37, %r34, %r36;\n"
-    "       cvt.u64.u32     %rd14, %r37;\n"
-    "       mul.lo.u64      %rd15, %rd14, 4;\n"
-    "       add.u64         %rd16, %rd6, %rd15;\n"
-    "       ld.global.u32   %r38, [%rd16+0];\n"
-    "       mov.u32         %r39, 0;\n"
-    "       setp.eq.u32     %p7, %r38, %r39;\n"
-    "       @%p7 bra        $Lt_0_258;\n"
-    " //<loop> Part of loop body line 57, head labeled $Lt_0_11522\n"
-    "       .loc    15      60      0\n"
-    "       mov.s32         %r28, %r3;\n"
-    "       .loc    15      61      0\n"
-    "       sub.u32         %r40, %r3, %r1;\n"
-    "       add.u32         %r41, %r40, 1;\n"
-    "       sub.s32         %r42, %r1, 1;\n"
-    "$Lt_0_12546:\n"
-    " //<loop> Loop body line 64\n"
-    "       .loc    15      64      0\n"
-    "       sub.u32         %r28, %r28, 1;\n"
-    "       shr.u32         %r43, %r38, %r42;\n"
-    "       mov.u32         %r44, 0;\n"
-    "       setp.eq.u32     %p8, %r43, %r44;\n"
-    "       @%p8 bra        $Lt_0_13314;\n"
-    " //<loop> Part of loop body line 64, head labeled $Lt_0_12546\n"
-    "       setp.ge.u32     %p9, %r41, %r28;\n"
-    "       @%p9 bra        $Lt_0_13570;\n"
-    " //<loop> Part of loop body line 64, head labeled $Lt_0_12546\n"
-    "       .loc    15      67      0\n"
-    "       mov.s32         %r3, %r28;\n"
-    "       bra.uni         $Lt_0_13314;\n"
-    "$Lt_0_13570:\n"
-    " //<loop> Part of loop body line 64, head labeled $Lt_0_12546\n"
-    "       .loc    15      69      0\n"
-    "       mov.u32         %r45, 1;\n"
-    "       ld.param.u64    %rd17, [__cudaparm_B2gCudaSearchBNDMq_offsets];\n"
-    "       add.u32         %r46, %r10, %r28;\n"
-    "       cvt.u64.u32     %rd18, %r46;\n"
-    "       mul.lo.u64      %rd19, %rd18, 4;\n"
-    "       add.u64         %rd20, %rd17, %rd19;\n"
-    "       st.global.u32   [%rd20+0], %r45;\n"
-    "$Lt_0_13314:\n"
-    "$Lt_0_12802:\n"
-    " //<loop> Part of loop body line 64, head labeled $Lt_0_12546\n"
-    "       .loc    15      74      0\n"
-    "       mov.u32         %r47, 0;\n"
-    "       setp.eq.u32     %p10, %r28, %r47;\n"
-    "       @%p10 bra       $Lt_0_258;\n"
-    "//<loop> Part of loop body line 64, head labeled $Lt_0_12546\n"
-    "       .loc    15      77      0\n"
-    "       add.u32         %r48, %r10, %r28;\n"
-    "       cvt.u64.u32     %rd21, %r48;\n"
-    "       add.u64         %rd22, %rd21, %rd7;\n"
-    "       ld.global.u8    %rh3, [%rd22+0];\n"
-    "       cvt.u64.u8      %rd23, %rh3;\n"
-    "       add.u64         %rd24, %rd23, %rd5;\n"
-    "       ld.global.u8    %r49, [%rd24+0];\n"
-    "       ld.global.u8    %rh4, [%rd22+-1];\n"
-    "       cvt.u64.u8      %rd25, %rh4;\n"
-    "       add.u64         %rd26, %rd25, %rd5;\n"
-    "       ld.global.u8    %r50, [%rd26+0];\n"
-    "       shl.b32         %r51, %r50, 4;\n"
-    "       or.b32  %r52, %r49, %r51;\n"
-    "       cvt.u64.u32     %rd27, %r52;\n"
-    "       mul.lo.u64      %rd28, %rd27, 4;\n"
-    "       add.u64         %rd29, %rd6, %rd28;\n"
-    "       ld.global.u32   %r53, [%rd29+0];\n"
-    "       shl.b32         %r54, %r38, 1;\n"
-    "       and.b32         %r38, %r53, %r54;\n"
-    "       mov.u32         %r55, 0;\n"
-    "       setp.ne.u32     %p11, %r38, %r55;\n"
-    "       @%p11 bra       $Lt_0_12546;\n"
+    "$Lt_0_7938:\n"
+    " //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
+    "	.loc	3	83	0\n"
+    "	mov.u32 	%r28, 0;\n"
+    "	setp.eq.u32 	%p6, %r20, %r28;\n"
+    "	@%p6 bra 	$Lt_0_258;\n"
+    " //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
+    "	.loc	3	86	0\n"
+    "	cvt.u64.u32 	%rd31, %r20;\n"
+    "	add.u64 	%rd32, %rd31, %rd7;\n"
+    "	ld.global.u8 	%rh4, [%rd32+12];\n"
+    "	cvt.u64.u8 	%rd33, %rh4;\n"
+    "	add.u64 	%rd34, %rd33, %rd18;\n"
+    "	ld.global.u8 	%r29, [%rd34+0];\n"
+    "	ld.global.u8 	%rh5, [%rd32+11];\n"
+    "	cvt.u64.u8 	%rd35, %rh5;\n"
+    "	add.u64 	%rd36, %rd35, %rd18;\n"
+    "	ld.global.u8 	%r30, [%rd36+0];\n"
+    "	shl.b32 	%r31, %r30, 4;\n"
+    "	or.b32 	%r32, %r29, %r31;\n"
+    "	cvt.u64.u32 	%rd37, %r32;\n"
+    "	mul.lo.u64 	%rd38, %rd37, 4;\n"
+    "	add.u64 	%rd39, %rd8, %rd38;\n"
+    "	ld.global.u32 	%r33, [%rd39+0];\n"
+    "	shl.b32 	%r34, %r18, 1;\n"
+    "	and.b32 	%r18, %r33, %r34;\n"
+    "	mov.u32 	%r35, 0;\n"
+    "	setp.ne.u32 	%p7, %r18, %r35;\n"
+    "	@%p7 bra 	$Lt_0_7682;\n"
     "$Lt_0_258:\n"
-    "$Lt_0_11778:\n"
-    " //<loop> Part of loop body line 57, head labeled $Lt_0_11522\n"
-    "       .loc    15      80      0\n"
-    "       add.u32         %r56, %r3, %r1;\n"
-    "       sub.u32         %r3, %r56, 1;\n"
-    "       setp.ge.u32     %p12, %r32, %r3;\n"
-    "       @%p12 bra       $Lt_0_11522;\n"
-    "$LBB23_B2gCudaSearchBNDMq:\n"
-    "       .loc    15      83      0\n"
-    "       exit;\n"
+    "$Lt_0_6914:\n"
+    " //<loop> Part of loop body line 66, head labeled $Lt_0_6658\n"
+    "	.loc	3	89	0\n"
+    "	add.u32 	%r36, %r6, %r10;\n"
+    "	sub.u32 	%r10, %r36, 1;\n"
+    "	setp.ge.u32 	%p8, %r12, %r10;\n"
+    "	@%p8 bra 	$Lt_0_6658;\n"
+    "	bra.uni 	$Lt_0_6146;\n"
+    "$Lt_0_9474:\n"
+    "$Lt_0_6146:\n"
+    "	.loc	3	92	0\n"
+    "	st.global.u16 	[%rd15+0], %r13;\n"
+    "$LBB17_B2gCudaSearchBNDMq:\n"
+    "	.loc	3	94	0\n"
+    "	exit;\n"
     "$LDWend_B2gCudaSearchBNDMq:\n"
-    "       } // B2gCudaSearchBNDMq\n"
-    "\n";
+    "	} // B2gCudaSearchBNDMq\n"
+    "";
 #else
 /**
  * \todo Optimize the kernel.  Also explore the options for compiling the
@@ -304,175 +270,152 @@ const char *b2g_cuda_ptx_image_32_bit =
     "	.version 1.4\n"
     "	.target sm_10, map_f64_to_f32\n"
     "	.entry B2gCudaSearchBNDMq (\n"
-    "		.param .u32 __cudaparm_B2gCudaSearchBNDMq_offsets,\n"
-    "		.param .u32 __cudaparm_B2gCudaSearchBNDMq_B2G,\n"
-    "		.param .u32 __cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable,\n"
-    "		.param .u32 __cudaparm_B2gCudaSearchBNDMq_buf,\n"
-    "		.param .u16 __cudaparm_B2gCudaSearchBNDMq_arg_buflen,\n"
-    "		.param .u32 __cudaparm_B2gCudaSearchBNDMq_m)\n"
+    "		.param .u32 __cudaparm_B2gCudaSearchBNDMq_results_buffer,\n"
+    "		.param .u32 __cudaparm_B2gCudaSearchBNDMq_packets_buffer,\n"
+    "		.param .u32 __cudaparm_B2gCudaSearchBNDMq_packets_offset_buffer,\n"
+    "		.param .u32 __cudaparm_B2gCudaSearchBNDMq_packets_payload_offset_buffer,\n"
+    "		.param .u32 __cudaparm_B2gCudaSearchBNDMq_nop,\n"
+    "		.param .u32 __cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable)\n"
     "	{\n"
-    "	.reg .u32 %r<81>;\n"
-    "	.reg .pred %p<14>;\n"
-    "	.loc	15	14	0\n"
+    "	.reg .u16 %rh<6>;\n"
+    "	.reg .u32 %r<65>;\n"
+    "	.reg .pred %p<10>;\n"
+    "	.loc	3	36	0\n"
     "$LBB1_B2gCudaSearchBNDMq:\n"
-    "	.loc	15	16	0\n"
-    "	ld.param.u32 	%r1, [__cudaparm_B2gCudaSearchBNDMq_m];\n"
-    "	sub.u32 	%r2, %r1, 1;\n"
-    "	mov.s32 	%r3, %r2;\n"
-    "	.loc	15	22	0\n"
-    "	ld.param.u16 	%r4, [__cudaparm_B2gCudaSearchBNDMq_arg_buflen];\n"
-    "	shr.u32 	%r5, %r4, 4;\n"
-    "	cvt.u16.u32 	%r6, %r5;\n"
-    "	mov.s32 	%r7, %r6;\n"
-    "	setp.ge.u32 	%p1, %r6, %r1;\n"
-    "	@%p1 bra 	$Lt_0_8450;\n"
-    "	.loc	15	27	0\n"
-    "	cvt.u16.u32 	%r7, %r1;\n"
+    "	mov.u16 	%rh1, %ctaid.x;\n"
+    "	mul.wide.u16 	%r1, %rh1, 32;\n"
+    "	cvt.u32.u16 	%r2, %tid.x;\n"
+    "	add.u32 	%r3, %r2, %r1;\n"
+    "	ld.param.u32 	%r4, [__cudaparm_B2gCudaSearchBNDMq_nop];\n"
+    "	setp.gt.u32 	%p1, %r4, %r3;\n"
+    "	@%p1 bra 	$Lt_0_5634;\n"
+    "	bra.uni 	$LBB17_B2gCudaSearchBNDMq;\n"
+    "$Lt_0_5634:\n"
+    "	.loc	3	45	0\n"
+    "	mul.lo.u32 	%r5, %r3, 4;\n"
+    "	ld.param.u32 	%r6, [__cudaparm_B2gCudaSearchBNDMq_packets_offset_buffer];\n"
+    "	add.u32 	%r7, %r6, %r5;\n"
+    "	ld.global.u32 	%r8, [%r7+0];\n"
+    "	ld.param.u32 	%r9, [__cudaparm_B2gCudaSearchBNDMq_packets_buffer];\n"
+    "	add.u32 	%r10, %r8, %r9;\n"
+    "	.loc	3	46	0\n"
+    "	ld.global.u32 	%r11, [%r10+0];\n"
+    "	.loc	3	48	0\n"
+    "	ld.global.u32 	%r12, [%r10+8];\n"
+    "	.loc	3	49	0\n"
+    "	ld.global.u32 	%r13, [%r10+4];\n"
+    "	.loc	3	50	0\n"
+    "	sub.u32 	%r14, %r11, 1;\n"
+    "	mov.s32 	%r15, %r14;\n"
+    "	.loc	3	56	0\n"
+    "	ld.param.u32 	%r16, [__cudaparm_B2gCudaSearchBNDMq_results_buffer];\n"
+    "	ld.param.u32 	%r17, [__cudaparm_B2gCudaSearchBNDMq_packets_payload_offset_buffer];\n"
+    "	add.u32 	%r18, %r17, %r5;\n"
+    "	ld.global.u32 	%r19, [%r18+0];\n"
+    "	add.u32 	%r20, %r19, %r3;\n"
+    "	mul.lo.u32 	%r21, %r20, 2;\n"
+    "	add.u32 	%r22, %r16, %r21;\n"
+    "	sub.u32 	%r23, %r12, 1;\n"
+    "	setp.gt.u32 	%p2, %r14, %r23;\n"
+    "	mov.u16 	%rh2, 0;\n"
+    "	@%p2 bra 	$Lt_0_9474;\n"
+    "	add.u32 	%r24, %r10, 12;\n"
+    "	add.u32 	%r25, %r22, 2;\n"
+    "	ld.param.u32 	%r26, [__cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable];\n"
+    "$Lt_0_6658:\n"
+    " //<loop> Loop body line 66\n"
+    "	.loc	3	66	0\n"
+    "	add.u32 	%r27, %r10, %r15;\n"
+    "	ld.global.u8 	%r28, [%r27+12];\n"
+    "	add.u32 	%r29, %r28, %r26;\n"
+    "	ld.global.u8 	%r30, [%r29+0];\n"
+    "	ld.global.u8 	%r31, [%r27+11];\n"
+    "	add.u32 	%r32, %r31, %r26;\n"
+    "	ld.global.u8 	%r33, [%r32+0];\n"
+    "	shl.b32 	%r34, %r33, 4;\n"
+    "	or.b32 	%r35, %r30, %r34;\n"
+    "	mul.lo.u32 	%r36, %r35, 4;\n"
+    "	add.u32 	%r37, %r13, %r36;\n"
+    "	ld.global.u32 	%r38, [%r37+0];\n"
+    "	mov.u32 	%r39, 0;\n"
+    "	setp.eq.u32 	%p3, %r38, %r39;\n"
+    "	@%p3 bra 	$Lt_0_258;\n"
+    " //<loop> Part of loop body line 66, head labeled $Lt_0_6658\n"
+    "	.loc	3	69	0\n"
+    "	mov.s32 	%r40, %r15;\n"
+    "	.loc	3	70	0\n"
+    "	sub.u32 	%r41, %r15, %r11;\n"
+    "	add.u32 	%r42, %r41, 1;\n"
+    "	sub.s32 	%r43, %r11, 1;\n"
+    "$Lt_0_7682:\n"
+    " //<loop> Loop body line 73\n"
+    "	.loc	3	73	0\n"
+    "	sub.u32 	%r40, %r40, 1;\n"
+    "	shr.u32 	%r44, %r38, %r43;\n"
+    "	mov.u32 	%r45, 0;\n"
+    "	setp.eq.u32 	%p4, %r44, %r45;\n"
+    "	@%p4 bra 	$Lt_0_8450;\n"
+    " //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
+    "	setp.le.u32 	%p5, %r40, %r42;\n"
+    "	@%p5 bra 	$Lt_0_8706;\n"
+    " //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
+    "	.loc	3	76	0\n"
+    "	mov.s32 	%r15, %r40;\n"
+    "	bra.uni 	$Lt_0_8450;\n"
+    "$Lt_0_8706:\n"
+    " //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
+    "	.loc	3	78	0\n"
+    "	mov.s16 	%rh3, %rh2;\n"
+    "	add.u16 	%rh4, %rh3, 1;\n"
+    "	mov.u16 	%rh2, %rh4;\n"
+    "	mul.wide.u16 	%r46, %rh3, 2;\n"
+    "	add.u32 	%r47, %r22, %r46;\n"
+    "	st.global.u16 	[%r47+2], %r40;\n"
     "$Lt_0_8450:\n"
-    "	cvt.u32.u16 	%r8, %tid.x;\n"
-    "	mul.lo.u32 	%r9, %r7, %r8;\n"
-    "	cvt.u16.u32 	%r10, %r9;\n"
-    "	add.s32 	%r11, %r7, %r10;\n"
-    "	setp.ge.s32 	%p2, %r4, %r11;\n"
-    "	@%p2 bra 	$Lt_0_8962;\n"
-    "	bra.uni 	$LBB23_B2gCudaSearchBNDMq;\n"
-    "$Lt_0_8962:\n"
-    "	.loc	15	33	0\n"
-    "	mul24.lo.s32 	%r12, %r7, 2;\n"
-    "	sub.s32 	%r13, %r12, 1;\n"
-    "	mov.s32 	%r14, %r13;\n"
-    "	cvt.u16.u32 	%r15, %r14;\n"
-    "	mov.s32 	%r16, %r15;\n"
-    "	add.s32 	%r17, %r10, %r15;\n"
-    "	set.lt.u32.s32 	%r18, %r4, %r17;\n"
-    "	neg.s32 	%r19, %r18;\n"
-    "	mov.u32 	%r20, 15;\n"
-    "	set.eq.u32.u32 	%r21, %r8, %r20;\n"
-    "	neg.s32 	%r22, %r21;\n"
-    "	or.b32 	%r23, %r19, %r22;\n"
-    "	mov.u32 	%r24, 0;\n"
-    "	setp.eq.s32 	%p3, %r23, %r24;\n"
-    "	@%p3 bra 	$Lt_0_9474;\n"
-    "	.loc	15	35	0\n"
-    "	sub.u32 	%r25, %r4, %r9;\n"
-    "	cvt.u16.u32 	%r16, %r25;\n"
-    "$Lt_0_9474:\n"
-    "	mov.u32 	%r26, 0;\n"
-    "	setp.eq.u32 	%p4, %r16, %r26;\n"
-    "	@%p4 bra 	$Lt_0_9986;\n"
-    "	mov.s32 	%r27, %r16;\n"
-    "	ld.param.u32 	%r28, [__cudaparm_B2gCudaSearchBNDMq_offsets];\n"
-    "	mov.u32 	%r29, 0;\n"
-    "	mov.s32 	%r30, %r27;\n"
-    "$Lt_0_10498:\n"
-    " //<loop> Loop body line 35, nesting depth: 1, estimated iterations: unknown\n"
-    "	.loc	15	40	0\n"
-    "	mov.u32 	%r31, 0;\n"
-    "	add.u32 	%r32, %r10, %r29;\n"
-    "	mul.lo.u32 	%r33, %r32, 4;\n"
-    "	add.u32 	%r34, %r28, %r33;\n"
-    "	st.global.u32 	[%r34+0], %r31;\n"
-    "	add.u32 	%r29, %r29, 1;\n"
-    "	setp.ne.u32 	%p5, %r16, %r29;\n"
-    "	@%p5 bra 	$Lt_0_10498;\n"
-    "$Lt_0_9986:\n"
-    "	sub.u32 	%r35, %r16, 1;\n"
-    "	setp.gt.u32 	%p6, %r2, %r35;\n"
-    "	@%p6 bra 	$LBB23_B2gCudaSearchBNDMq;\n"
-    "	ld.param.u32 	%r36, [__cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable];\n"
-    "	ld.param.u32 	%r37, [__cudaparm_B2gCudaSearchBNDMq_B2G];\n"
-    "	ld.param.u32 	%r38, [__cudaparm_B2gCudaSearchBNDMq_buf];\n"
-    "$Lt_0_11522:\n"
-    " //<loop> Loop body line 46\n"
-    "	.loc	15	46	0\n"
-    "	add.u32 	%r39, %r10, %r3;\n"
-    "	add.u32 	%r40, %r39, %r38;\n"
-    "	ld.global.u8 	%r41, [%r40+0];\n"
-    "	add.u32 	%r42, %r41, %r36;\n"
-    "	ld.global.u8 	%r43, [%r42+0];\n"
-    "	ld.global.u8 	%r44, [%r40+-1];\n"
-    "	add.u32 	%r45, %r44, %r36;\n"
-    "	ld.global.u8 	%r46, [%r45+0];\n"
-    "	shl.b32 	%r47, %r46, 4;\n"
-    "	or.b32 	%r48, %r43, %r47;\n"
-    "	mul.lo.u32 	%r49, %r48, 4;\n"
-    "	add.u32 	%r50, %r37, %r49;\n"
-    "	ld.global.u32 	%r51, [%r50+0];\n"
-    "	mov.u32 	%r52, 0;\n"
-    "	setp.eq.u32 	%p7, %r51, %r52;\n"
-    "	@%p7 bra 	$Lt_0_258;\n"
-    " //<loop> Part of loop body line 46, head labeled $Lt_0_11522\n"
-    "	.loc	15	49	0\n"
-    "	mov.s32 	%r29, %r3;\n"
-    "	.loc	15	50	0\n"
-    "	sub.u32 	%r53, %r3, %r1;\n"
-    "	add.u32 	%r54, %r53, 1;\n"
-    "	sub.s32 	%r55, %r1, 1;\n"
-    "$Lt_0_12546:\n"
-    " //<loop> Loop body line 53\n"
-    "	.loc	15	53	0\n"
-    "	sub.u32 	%r29, %r29, 1;\n"
-    "	shr.u32 	%r56, %r51, %r55;\n"
-    "	mov.u32 	%r57, 0;\n"
-    "	setp.eq.u32 	%p8, %r56, %r57;\n"
-    "	@%p8 bra 	$Lt_0_13314;\n"
-    " //<loop> Part of loop body line 53, head labeled $Lt_0_12546\n"
-    "	setp.ge.u32 	%p9, %r54, %r29;\n"
-    "	@%p9 bra 	$Lt_0_13570;\n"
-    " //<loop> Part of loop body line 53, head labeled $Lt_0_12546\n"
-    "	.loc	15	56	0\n"
-    "	mov.s32 	%r3, %r29;\n"
-    "	bra.uni 	$Lt_0_13314;\n"
-    "$Lt_0_13570:\n"
-    " //<loop> Part of loop body line 53, head labeled $Lt_0_12546\n"
-    "	.loc	15	58	0\n"
-    "	mov.u32 	%r58, 1;\n"
-    "	ld.param.u32 	%r59, [__cudaparm_B2gCudaSearchBNDMq_offsets];\n"
-    "	add.u32 	%r60, %r10, %r29;\n"
-    "	mul.lo.u32 	%r61, %r60, 4;\n"
-    "	add.u32 	%r62, %r59, %r61;\n"
-    "	st.global.u32 	[%r62+0], %r58;\n"
-    "$Lt_0_13314:\n"
-    "$Lt_0_12802:\n"
-    " //<loop> Part of loop body line 53, head labeled $Lt_0_12546\n"
-    "	.loc	15	63	0\n"
-    "	mov.u32 	%r63, 0;\n"
-    "	setp.eq.u32 	%p10, %r29, %r63;\n"
-    "	@%p10 bra 	$Lt_0_258;\n"
-    " //<loop> Part of loop body line 53, head labeled $Lt_0_12546\n"
-    "	.loc	15	66	0\n"
-    "	add.u32 	%r64, %r10, %r29;\n"
-    "	add.u32 	%r65, %r64, %r38;\n"
-    "	ld.global.u8 	%r66, [%r65+0];\n"
-    "	add.u32 	%r67, %r66, %r36;\n"
-    "	ld.global.u8 	%r68, [%r67+0];\n"
-    "	ld.global.u8 	%r69, [%r65+-1];\n"
-    "	add.u32 	%r70, %r69, %r36;\n"
-    "	ld.global.u8 	%r71, [%r70+0];\n"
-    "	shl.b32 	%r72, %r71, 4;\n"
-    "	or.b32 	%r73, %r68, %r72;\n"
-    "	mul.lo.u32 	%r74, %r73, 4;\n"
-    "	add.u32 	%r75, %r37, %r74;\n"
-    "	ld.global.u32 	%r76, [%r75+0];\n"
-    "	shl.b32 	%r77, %r51, 1;\n"
-    "	and.b32 	%r51, %r76, %r77;\n"
-    "	mov.u32 	%r78, 0;\n"
-    "	setp.ne.u32 	%p11, %r51, %r78;\n"
-    "	@%p11 bra 	$Lt_0_12546;\n"
+    "$Lt_0_7938:\n"
+    " //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
+    "	.loc	3	83	0\n"
+    "	mov.u32 	%r48, 0;\n"
+    "	setp.eq.u32 	%p6, %r40, %r48;\n"
+    "	@%p6 bra 	$Lt_0_258;\n"
+    " //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
+    "	.loc	3	86	0\n"
+    "	add.u32 	%r49, %r10, %r40;\n"
+    "	ld.global.u8 	%r50, [%r49+12];\n"
+    "	add.u32 	%r51, %r50, %r26;\n"
+    "	ld.global.u8 	%r52, [%r51+0];\n"
+    "	ld.global.u8 	%r53, [%r49+11];\n"
+    "	add.u32 	%r54, %r53, %r26;\n"
+    "	ld.global.u8 	%r55, [%r54+0];\n"
+    "	shl.b32 	%r56, %r55, 4;\n"
+    "	or.b32 	%r57, %r52, %r56;\n"
+    "	mul.lo.u32 	%r58, %r57, 4;\n"
+    "	add.u32 	%r59, %r13, %r58;\n"
+    "	ld.global.u32 	%r60, [%r59+0];\n"
+    "	shl.b32 	%r61, %r38, 1;\n"
+    "	and.b32 	%r38, %r60, %r61;\n"
+    "	mov.u32 	%r62, 0;\n"
+    "	setp.ne.u32 	%p7, %r38, %r62;\n"
+    "	@%p7 bra 	$Lt_0_7682;\n"
     "$Lt_0_258:\n"
-    "$Lt_0_11778:\n"
-    " //<loop> Part of loop body line 46, head labeled $Lt_0_11522\n"
-    "	.loc	15	69	0\n"
-    "	add.u32 	%r79, %r3, %r1;\n"
-    "	sub.u32 	%r3, %r79, 1;\n"
-    "	setp.ge.u32 	%p12, %r35, %r3;\n"
-    "	@%p12 bra 	$Lt_0_11522;\n"
-    "$LBB23_B2gCudaSearchBNDMq:\n"
-    "	.loc	15	72	0\n"
+    "$Lt_0_6914:\n"
+    " //<loop> Part of loop body line 66, head labeled $Lt_0_6658\n"
+    "	.loc	3	89	0\n"
+    "	add.u32 	%r63, %r11, %r15;\n"
+    "	sub.u32 	%r15, %r63, 1;\n"
+    "	setp.ge.u32 	%p8, %r23, %r15;\n"
+    "	@%p8 bra 	$Lt_0_6658;\n"
+    "	bra.uni 	$Lt_0_6146;\n"
+    "$Lt_0_9474:\n"
+    "$Lt_0_6146:\n"
+    "	.loc	3	92	0\n"
+    "	st.global.u16 	[%r22+0], %rh2;\n"
+    "$LBB17_B2gCudaSearchBNDMq:\n"
+    "	.loc	3	94	0\n"
     "	exit;\n"
     "$LDWend_B2gCudaSearchBNDMq:\n"
     "	} // B2gCudaSearchBNDMq\n"
-    "\n";
+    "";
 #endif
 
 /**
@@ -482,7 +425,6 @@ void MpmB2gCudaRegister(void)
 {
     mpm_table[MPM_B2G_CUDA].name = "b2g_cuda";
     mpm_table[MPM_B2G_CUDA].max_pattern_length = B2G_CUDA_WORD_SIZE;
-
     mpm_table[MPM_B2G_CUDA].InitCtx = B2gCudaInitCtx;
     mpm_table[MPM_B2G_CUDA].InitThreadCtx = B2gCudaThreadInitCtx;
     mpm_table[MPM_B2G_CUDA].DestroyCtx = B2gCudaDestroyCtx;
@@ -497,37 +439,6 @@ void MpmB2gCudaRegister(void)
     mpm_table[MPM_B2G_CUDA].RegisterUnittests = B2gCudaRegisterTests;
 }
 
-static inline void B2gCudaEndMatchAppend(MpmCtx *mpm_ctx, B2gCudaPattern *p,
-                                         uint16_t offset, uint16_t depth,
-                                         uint32_t pid, uint32_t sid)
-{
-    MpmEndMatch *em = MpmAllocEndMatch(mpm_ctx);
-    if (em == NULL) {
-        SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
-        return;
-    }
-
-    SCLogDebug("em alloced at %p", em);
-
-    em->id = pid;
-    em->sig_id = sid;
-    em->depth = depth;
-    em->offset = offset;
-
-    if (p->em == NULL) {
-        p->em = em;
-        return;
-    }
-
-    MpmEndMatch *m = p->em;
-    while (m->next != NULL) {
-        m = m->next;
-    }
-    m->next = em;
-
-    return;
-}
-
 void B2gCudaPrintInfo(MpmCtx *mpm_ctx)
 {
     B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx->ctx;
@@ -674,9 +585,6 @@ static inline B2gCudaPattern *B2gCudaInitHashLookup(B2gCudaCtx *ctx, uint8_t *pa
 
 void B2gCudaFreePattern(MpmCtx *mpm_ctx, B2gCudaPattern *p)
 {
-    if (p != NULL && p->em != NULL)
-        MpmEndMatchFreeAll(mpm_ctx, p->em);
-
     if (p != NULL && p->cs != NULL && p->cs != p->ci) {
         SCFree(p->cs);
         mpm_ctx->memory_cnt--;
@@ -705,7 +613,7 @@ static inline int B2gCudaAddPattern(MpmCtx *mpm_ctx, uint8_t *pat,
 {
     B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx->ctx;
 
-    SCLogDebug("ctx %p len %" PRIu16 " pid %" PRIu32, ctx, patlen, pid);
+    SCLogDebug("ctx %p len %"PRIu16" pid %" PRIu32, ctx, patlen, pid);
 
     if (patlen == 0)
         return 0;
@@ -721,11 +629,13 @@ static inline int B2gCudaAddPattern(MpmCtx *mpm_ctx, uint8_t *pat,
 
         p->len = patlen;
         p->flags = flags;
+        p->id = pid;
 
         /* setup the case insensitive part of the pattern */
         p->ci = SCMalloc(patlen);
         if (p->ci == NULL)
             goto error;
+
         mpm_ctx->memory_cnt++;
         mpm_ctx->memory_size += patlen;
         memcpy_tolower(p->ci, pat, patlen);
@@ -735,7 +645,7 @@ static inline int B2gCudaAddPattern(MpmCtx *mpm_ctx, uint8_t *pat,
             /* nocase means no difference between cs and ci */
             p->cs = p->ci;
         } else {
-            if (memcmp(p->ci, pat, p->len) == 0) {
+            if (memcmp(p->ci,pat,p->len) == 0) {
                 /* no diff between cs and ci: pat is lowercase */
                 p->cs = p->ci;
             } else {
@@ -745,34 +655,29 @@ static inline int B2gCudaAddPattern(MpmCtx *mpm_ctx, uint8_t *pat,
 
                 mpm_ctx->memory_cnt++;
                 mpm_ctx->memory_size += patlen;
-
                 memcpy(p->cs, pat, patlen);
             }
         }
 
+        //printf("B2gAddPattern: ci \""); prt(p->ci,p->len);
+        //printf("\" cs \""); prt(p->cs,p->len);
+        //printf("\"\n");
+
         /* put in the pattern hash */
         B2gCudaInitHashAdd(ctx, p);
 
         if (mpm_ctx->pattern_cnt == 65535) {
             printf("Max search words reached\n");
-            exit(EXIT_FAILURE);
+            exit(1);
         }
         mpm_ctx->pattern_cnt++;
 
-        if (mpm_ctx->maxlen < patlen)
-            mpm_ctx->maxlen = patlen;
-
-        if (mpm_ctx->minlen == 0)
-            mpm_ctx->minlen = patlen;
-        else if (mpm_ctx->minlen > patlen)
-            mpm_ctx->minlen = patlen;
+        if (mpm_ctx->maxlen < patlen) mpm_ctx->maxlen = patlen;
+        if (mpm_ctx->minlen == 0) mpm_ctx->minlen = patlen;
+        else if (mpm_ctx->minlen > patlen) mpm_ctx->minlen = patlen;
     }
 
-    /* we need a match */
-    B2gCudaEndMatchAppend(mpm_ctx, p, offset, depth, pid, sid);
-
     mpm_ctx->total_pattern_cnt++;
-
     return 0;
 
 error:
@@ -1011,12 +916,6 @@ int B2gCudaSetDeviceBuffers(MpmCtx *mpm_ctx)
 {
     B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx->ctx;
 
-    if (SCCudaHlGetCudaDevicePtr(&ctx->cuda_g_u8_lowercasetable,
-                                 "G_U8_LOWERCASETABLE", 256 * sizeof(char),
-                                 g_u8_lowercasetable, ctx->module_handle) == -1) {
-        goto error;
-    }
-
     /* search kernel */
     if (SCCudaMemAlloc(&ctx->cuda_B2G,
                        sizeof(B2G_CUDA_TYPE) * ctx->hash_size) == -1) {
@@ -1035,19 +934,7 @@ int B2gCudaSetDeviceBuffers(MpmCtx *mpm_ctx)
 
 int B2gCudaSetKernelArgs(MpmCtx *mpm_ctx)
 {
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx->ctx;
-
-    /* search kernel */
-    if (SCCudaParamSetv(ctx->cuda_search_kernel, ctx->cuda_search_kernel_arg2_offset,
-                        (void *)&ctx->cuda_g_u8_lowercasetable,
-                        sizeof(void *)) == -1) {
-        goto error;
-    }
-
     return 0;
-
- error:
-    return -1;
 }
 
 int B2gCudaPreparePatterns(MpmCtx *mpm_ctx)
@@ -1244,6 +1131,9 @@ void B2gCudaInitCtx(MpmCtx *mpm_ctx, int module_handle)
 
     /* initialize the hash we use to speed up pattern insertions */
     B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx->ctx;
+
+    /* hold the cuda module handle against which we are registered.  This is our
+     * only reference to know our place of birth */
     ctx->module_handle = module_handle;
 
     ctx->init_hash = SCMalloc(sizeof(B2gCudaPattern *) * INIT_HASH_SIZE);
@@ -1260,67 +1150,6 @@ void B2gCudaInitCtx(MpmCtx *mpm_ctx, int module_handle)
     /* init defaults search functions */
     ctx->Search = b2g_func;
 
-    if (SCCudaHlGetCudaContext(&ctx->cuda_context, module_handle) == -1) {
-        SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context");
-    }
-
-#if defined(__x86_64__) || defined(__ia64__)
-    if (SCCudaHlGetCudaModule(&ctx->cuda_module, b2g_cuda_ptx_image_64_bit,
-                              module_handle) == -1) {
-        SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda module");
-    }
-#else
-    if (SCCudaHlGetCudaModule(&ctx->cuda_module, b2g_cuda_ptx_image_32_bit,
-                              module_handle) == -1) {
-        SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda module");
-    }
-#endif
-
-
-    if (SCCudaModuleGetFunction(&ctx->cuda_search_kernel, ctx->cuda_module,
-                                B2G_CUDA_SEARCHFUNC_NAME) == -1) {
-        SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda function");
-    }
-
-#define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
-
-    int offset = 0;
-
-    ALIGN_UP(offset, __alignof(void *));
-    ctx->cuda_search_kernel_arg0_offset = offset;
-    offset += sizeof(void *);
-
-    ALIGN_UP(offset, __alignof(void *));
-    ctx->cuda_search_kernel_arg1_offset = offset;
-    offset += sizeof(void *);
-
-    ALIGN_UP(offset, __alignof(void *));
-    ctx->cuda_search_kernel_arg2_offset = offset;
-    offset += sizeof(void *);
-
-    ALIGN_UP(offset, __alignof(void *));
-    ctx->cuda_search_kernel_arg3_offset = offset;
-    offset += sizeof(void *);
-
-    ALIGN_UP(offset, __alignof(unsigned short));
-    ctx->cuda_search_kernel_arg4_offset = offset;
-    offset += sizeof(unsigned short);
-
-    ALIGN_UP(offset, __alignof(unsigned int));
-    ctx->cuda_search_kernel_arg5_offset = offset;
-    offset += sizeof(unsigned int);
-
-    ctx->cuda_search_kernel_arg_total = offset;
-
-    //printf("arg0: %d\n", arg0);
-    //printf("arg1: %d\n", arg1);
-    //printf("arg2: %d\n", arg2);
-    //printf("arg3: %d\n", arg3);
-    //printf("arg4: %d\n", arg4);
-    //printf("arg5: %d\n", arg5);
-
-    //printf("arg_total: %d\n", arg_total);
-
     return;
 }
 
@@ -1395,16 +1224,35 @@ void B2gCudaDestroyCtx(MpmCtx *mpm_ctx)
         mpm_ctx->memory_size -= (sizeof(uint8_t) * ctx->hash_size);
     }
 
+    CUcontext dummy_context;
+    SCCudaHlModuleData *module_data = SCCudaHlGetModuleData(ctx->module_handle);
+    if (module_data == NULL) {
+        SCLogError(SC_ERR_B2G_CUDA_ERROR, "How did we even fail to get a "
+                   "module_data if we are having a module_handle");
+        goto error;
+    }
+    if (SCCudaHlGetCudaContext(&dummy_context, ctx->module_handle) == -1) {
+        SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the "
+                   "module %s", module_data->name);
+        goto error;
+    }
+    SCCudaCtxPushCurrent(dummy_context);
+
     if (ctx->cuda_B2G != 0) {
-        if (SCCudaMemFree(ctx->cuda_B2G) == -1)
-            SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error freeing ctx->cuda_search_B2G ");
+        if (SCCudaMemFree(ctx->cuda_B2G) == -1) {
+            SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error freeing ctx->cuda_B2G ");
+            goto error;
+        }
         ctx->cuda_B2G = 0;
     }
+    SCCudaCtxPopCurrent(&dummy_context);
 
     SCFree(mpm_ctx->ctx);
     mpm_ctx->memory_cnt--;
     mpm_ctx->memory_size -= sizeof(B2gCudaCtx);
 
+
+ error:
     return;
 }
 
@@ -1456,143 +1304,122 @@ uint32_t B2gCudaSearchBNDMq(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
                             PatternMatcherQueue *pmq, uint8_t *buf,
                             uint16_t buflen)
 {
-#define CUDA_THREADS 16
-    CUdeviceptr cuda_buf = 0;
-    CUdeviceptr cuda_offsets = 0;
-    uint32_t matches = 0;
-    B2gCudaCtx *ctx = mpm_ctx->ctx;
-    uint16_t h = 0;
-    int i = 0;
-    int host_offsets[UINT16_MAX];
-
-    if (buflen < ctx->m)
-        return 0;
-
-    if (SCCudaMemAlloc(&cuda_buf, buflen * sizeof(char)) == -1) {
-        goto error;
-    }
-    if (SCCudaMemcpyHtoD(cuda_buf, buf,
-                         buflen * sizeof(char)) == -1) {
-        goto error;
-    }
-
-    if (SCCudaMemAlloc(&cuda_offsets, buflen * sizeof(int)) == -1) {
-        goto error;
-    }
-
-    if (SCCudaParamSetv(ctx->cuda_search_kernel, ctx->cuda_search_kernel_arg0_offset,
-                        (void *)&cuda_offsets, sizeof(void *)) == -1) {
-        goto error;
-    }
-
-    if (SCCudaParamSetv(ctx->cuda_search_kernel, ctx->cuda_search_kernel_arg1_offset,
-                        (void *)&ctx->cuda_B2G, sizeof(void *)) == -1) {
-        goto error;
-    }
-
-    if (SCCudaParamSetv(ctx->cuda_search_kernel, ctx->cuda_search_kernel_arg3_offset,
-                        (void *)&cuda_buf, sizeof(void *)) == -1) {
-        goto error;
-    }
-
-    if (SCCudaParamSeti(ctx->cuda_search_kernel, ctx->cuda_search_kernel_arg4_offset,
-                        buflen) == -1) {
-        goto error;
-    }
-
-    if (SCCudaParamSeti(ctx->cuda_search_kernel, ctx->cuda_search_kernel_arg5_offset,
-                        ctx->m) == -1) {
-        goto error;
-    }
-
-    if (SCCudaParamSetSize(ctx->cuda_search_kernel, ctx->cuda_search_kernel_arg_total) == -1)
-        goto error;
-
-    if (SCCudaFuncSetBlockShape(ctx->cuda_search_kernel, CUDA_THREADS, 1, 1) == -1)
-        goto error;
-
-    if (SCCudaLaunchGrid(ctx->cuda_search_kernel, 1, 1) == -1)
-        goto error;
-
-    if (SCCudaMemcpyDtoH(host_offsets, cuda_offsets, buflen * sizeof(int)) == -1)
-        goto error;
+    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx->ctx;
+#ifdef B2G_COUNTERS
+    B2gCudaThreadCtx *tctx = (B2gCudaThreadCtx *)mpm_thread_ctx->ctx;
+#endif
+    uint32_t pos = ctx->m - B2G_CUDA_Q + 1, matches = 0;
+    B2G_CUDA_TYPE d;
 
-    //printf("Raw matches: ");
-    //for (i = 0; i < buflen; i++) {
-    //    printf("%d",offsets_buffer[i]);
-    //}
     //printf("\n");
+    //PrintRawDataFp(stdout, buf, buflen);
 
-    //printf("Matches: ");
-    for (i = 0; i < buflen; i++) {
-        if (host_offsets[i] == 0)
-            continue;
-
-
-        /* get our patterns from the hash */
-        h = B2G_CUDA_HASH16(u8_tolower(buf[i + ctx->m - 2]),
-                            u8_tolower(buf[i + ctx->m - 1]));
-
-        if (ctx->bloom[h] != NULL) {
-            COUNT(tctx->stat_pminlen_calls++);
-            COUNT(tctx->stat_pminlen_total+=ctx->pminlen[h]);
-
-            if ((buflen - i) < ctx->pminlen[h]) {
-                continue;
-            } else {
-                COUNT(tctx->stat_bloom_calls++);
+    SCLogDebug("buflen %"PRIu16", ctx->m %"PRIu32", pos %"PRIu32"", buflen,
+            ctx->m, pos);
 
-                if (BloomFilterTest(ctx->bloom[h], buf + i, ctx->pminlen[h]) == 0) {
-                    COUNT(tctx->stat_bloom_hits++);
+    COUNT(tctx->stat_calls++);
+    COUNT(tctx->stat_m_total+=ctx->m);
 
-                    continue;
-                }
-            }
-        }
+    if (buflen < ctx->m)
+        return 0;
 
-        B2gCudaHashItem *hi = ctx->hash[h], *thi;
-        for (thi = hi; thi != NULL; thi = thi->nxt) {
-            COUNT(tctx->stat_d0_hashloop++);
-            B2gCudaPattern *p = ctx->parray[thi->idx];
+    while (pos <= (uint32_t)(buflen - B2G_CUDA_Q + 1)) {
+        uint16_t h = B2G_CUDA_HASH16(u8_tolower(buf[pos - 1]),u8_tolower(buf[pos]));
+        d = ctx->B2G[h];
 
-            if (p->flags & MPM_PATTERN_FLAG_NOCASE) {
-                if ((buflen - i) < p->len) {
-                    continue;
+        if (d != 0) {
+            COUNT(tctx->stat_d0++);
+            uint32_t j = pos;
+            uint32_t first = pos - (ctx->m - B2G_CUDA_Q + 1);
+
+            do {
+                j = j - 1;
+
+                if (d >= (uint32_t)(1 << (ctx->m - 1))) {
+                    if (j > first) pos = j;
+                    else {
+                        /* get our patterns from the hash */
+                        h = B2G_CUDA_HASH16(u8_tolower(buf[j + ctx->m - 2]),u8_tolower(buf[j + ctx->m - 1]));
+
+                        if (ctx->bloom[h] != NULL) {
+                            COUNT(tctx->stat_pminlen_calls++);
+                            COUNT(tctx->stat_pminlen_total+=ctx->pminlen[h]);
+
+                            if ((buflen - j) < ctx->pminlen[h]) {
+                                goto skip_loop;
+                            } else {
+                                COUNT(tctx->stat_bloom_calls++);
+
+                                if (BloomFilterTest(ctx->bloom[h], buf+j, ctx->pminlen[h]) == 0) {
+                                    COUNT(tctx->stat_bloom_hits++);
+
+                                    SCLogDebug("Bloom: %p, buflen %" PRIu32 ", pos %" PRIu32 ", p_min_len %" PRIu32 "",
+                                        ctx->bloom[h], buflen, pos, ctx->pminlen[h]);
+                                    goto skip_loop;
+                                }
+                            }
+                        }
+
+                        B2gCudaHashItem *hi = ctx->hash[h], *thi;
+                        for (thi = hi; thi != NULL; thi = thi->nxt) {
+                            COUNT(tctx->stat_d0_hashloop++);
+                            B2gCudaPattern *p = ctx->parray[thi->idx];
+
+                            if (p->flags & MPM_PATTERN_FLAG_NOCASE) {
+                                if ((buflen - j) < p->len) {
+                                    continue;
+                                }
+
+                                if (memcmp_lowercase(p->ci, buf+j, p->len) == 0) {
+#ifdef PRINTMATCH
+                                    printf("CI Exact match: "); prt(p->ci, p->len); printf("\n");
+#endif
+                                    COUNT(tctx->stat_loop_match++);
+
+                                    matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id);
+                                } else {
+                                    COUNT(tctx->stat_loop_no_match++);
+                                }
+                            } else {
+                                if (buflen - j < p->len)
+                                    continue;
+
+                                if (memcmp(p->cs, buf+j, p->len) == 0) {
+#ifdef PRINTMATCH
+                                    printf("CS Exact match: "); prt(p->cs, p->len); printf("\n");
+#endif
+                                    COUNT(tctx->stat_loop_match++);
+
+                                    matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id);
+                                } else {
+                                    COUNT(tctx->stat_loop_no_match++);
+                                }
+                            }
+                        }
+skip_loop:
+                        SCLogDebug("skipped");
+                        //SCLogDebug("output at pos %" PRIu32 ": ", j); prt(buf + (j), ctx->m); printf("\n");
+                        ;
+                    }
                 }
 
-                if (memcmp_lowercase(p->ci, buf + i, p->len) == 0) {
-                    COUNT(tctx->stat_loop_match++);
-
-                    matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->em, i, p->len);
-                } else {
-                    COUNT(tctx->stat_loop_no_match++);
+                if (j == 0) {
+                    break;
                 }
-            } else {
-                if (buflen - i < p->len)
-                    continue;
-
-                if (memcmp(p->cs, buf + i, p->len) == 0) {
-                    COUNT(tctx->stat_loop_match++);
 
-                    matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->em, i, p->len);
-                } else {
-                    COUNT(tctx->stat_loop_no_match++);
-                }
-            }
+                h = B2G_CUDA_HASH16(u8_tolower(buf[j - 1]),u8_tolower(buf[j]));
+                d = (d << 1) & ctx->B2G[h];
+            } while (d != 0);
         }
-    } /* for(i = 0; i < buflen; i++) */
+        COUNT(tctx->stat_num_shift++);
+        COUNT(tctx->stat_total_shift += (ctx->m - B2G_Q + 1));
+        pos = pos + ctx->m - B2G_CUDA_Q + 1;
 
-    SCCudaMemFree(cuda_buf);
-    SCCudaMemFree(cuda_offsets);
+        SCLogDebug("pos %"PRIu32"", pos);
+    }
 
+    SCLogDebug("matches %"PRIu32"", matches);
     return matches;
-
- error:
-    if (cuda_buf != 0)
-        SCCudaMemFree(cuda_buf);
-    if (cuda_offsets != 0)
-        SCCudaMemFree(cuda_offsets);
-    return 0;
 }
 
 uint32_t B2gCudaSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
@@ -1604,7 +1431,7 @@ uint32_t B2gCudaSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
 #endif
     uint32_t pos = 0, matches = 0;
     B2G_CUDA_TYPE d;
-    uint32_t j = 0;
+    uint32_t j;
 
     COUNT(tctx->stat_calls++);
     COUNT(tctx->stat_m_total+=ctx->m);
@@ -1617,8 +1444,7 @@ uint32_t B2gCudaSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
         d = ~0;
 
         do {
-            uint16_t h = B2G_CUDA_HASH16(u8_tolower(buf[pos + j - 1]),
-                                         u8_tolower(buf[pos + j]));
+            uint16_t h = B2G_CUDA_HASH16(u8_tolower(buf[pos + j - 1]),u8_tolower(buf[pos + j]));
             d = ((d << 1) & ctx->B2G[h]);
             j = j - 1;
         } while (d != 0 && j != 0);
@@ -1626,10 +1452,10 @@ uint32_t B2gCudaSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
         /* (partial) match, move on to verification */
         if (d != 0) {
             COUNT(tctx->stat_d0++);
+            //printf("output at pos %" PRIu32 ": ", pos); prt(buf + pos, ctx->m); printf("\n");
 
             /* get our patterns from the hash */
-            uint16_t h = B2G_CUDA_HASH16(u8_tolower(buf[pos + ctx->m - 2]),
-                                         u8_tolower(buf[pos + ctx->m - 1]));
+            uint16_t h = B2G_CUDA_HASH16(u8_tolower(buf[pos + ctx->m - 2]),u8_tolower(buf[pos + ctx->m - 1]));
 
             if (ctx->bloom[h] != NULL) {
                 COUNT(tctx->stat_pminlen_calls++);
@@ -1643,13 +1469,13 @@ uint32_t B2gCudaSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
                     if (BloomFilterTest(ctx->bloom[h], buf+pos, ctx->pminlen[h]) == 0) {
                         COUNT(tctx->stat_bloom_hits++);
 
+                        //printf("Bloom: %p, buflen %" PRIu32 ", pos %" PRIu32 ", p_min_len %" PRIu32 "\n", ctx->bloom[h], buflen, pos, ctx->pminlen[h]);
                         goto skip_loop;
                     }
                 }
             }
 
-            B2gCudaHashItem *hi = ctx->hash[h];
-            B2gCudaHashItem *thi = NULL;
+            B2gCudaHashItem *hi = ctx->hash[h], *thi;
             for (thi = hi; thi != NULL; thi = thi->nxt) {
                 COUNT(tctx->stat_d0_hashloop++);
                 B2gCudaPattern *p = ctx->parray[thi->idx];
@@ -1661,7 +1487,7 @@ uint32_t B2gCudaSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
                     if (memcmp_lowercase(p->ci, buf+pos, p->len) == 0) {
                         COUNT(tctx->stat_loop_match++);
 
-                        matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->em, pos, p->len);
+                        matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id);
                     } else {
                         COUNT(tctx->stat_loop_no_match++);
                     }
@@ -1672,13 +1498,14 @@ uint32_t B2gCudaSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
                     if (memcmp(p->cs, buf+pos, p->len) == 0) {
                         COUNT(tctx->stat_loop_match++);
 
-                        matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->em, pos, p->len);
+                        matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id);
                     } else {
                         COUNT(tctx->stat_loop_no_match++);
                     }
                 }
             }
 skip_loop:
+            //pos = pos + ctx->s0;
             pos = pos + 1;
         } else {
             COUNT(tctx->stat_num_shift++);
@@ -1688,6 +1515,7 @@ skip_loop:
         }
     }
 
+    //printf("Total matches %" PRIu32 "\n", matches);
     return matches;
 }
 
@@ -1700,12 +1528,13 @@ uint32_t B2gCudaSearch2(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
     uint8_t *bufend = buf + buflen - 1;
     uint32_t cnt = 0;
     B2gCudaPattern *p;
-    MpmEndMatch *em;
     B2gCudaHashItem *thi, *hi;
 
     if (buflen < 2)
         return 0;
 
+    //printf("BUF "); prt(buf,buflen); printf("\n");
+
     while (buf <= bufend) {
         uint8_t h8 = u8_tolower(*buf);
         hi = &ctx->hash1[h8];
@@ -1716,11 +1545,11 @@ uint32_t B2gCudaSearch2(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
 
                 if (p->flags & MPM_PATTERN_FLAG_NOCASE) {
                     if (h8 == p->ci[0]) {
-                        cnt += MpmVerifyMatch(mpm_thread_ctx, pmq, p->em, (buf+1 - bufmin), p->len);
+                        cnt += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id);
                     }
                 } else {
                     if (*buf == p->cs[0]) {
-                        cnt += MpmVerifyMatch(mpm_thread_ctx, pmq, p->em, (buf+1 - bufmin), p->len);
+                        cnt += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id);
                     }
                 }
             }
@@ -1735,29 +1564,32 @@ uint32_t B2gCudaSearch2(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
 
             if (p->flags & MPM_PATTERN_FLAG_NOCASE) {
                 if (h8 == p->ci[0] && u8_tolower(*(buf+1)) == p->ci[1]) {
-                    for (em = p->em; em; em = em->next) {
-                        if (MpmVerifyMatch(mpm_thread_ctx, pmq, em, (buf+1 - bufmin), p->len))
+                    //printf("CI Exact match: "); prt(p->ci, p->len); printf(" in buf "); prt(buf, p->len);printf(" (B2gSearch1)\n");
+//                    for (em = p->em; em; em = em->next) {
+                        if (MpmVerifyMatch(mpm_thread_ctx, pmq, p->id))
                             cnt++;
-                    }
+//                    }
                 }
             } else {
                 if (*buf == p->cs[0] && *(buf+1) == p->cs[1]) {
-                    for (em = p->em; em; em = em->next) {
-                        if (MpmVerifyMatch(mpm_thread_ctx, pmq, em, (buf+1 - bufmin), p->len))
+                    //printf("CS Exact match: "); prt(p->cs, p->len); printf(" in buf "); prt(buf, p->len);printf(" (B2gSearch1)\n");
+//                    for (em = p->em; em; em = em->next) {
+                        if (MpmVerifyMatch(mpm_thread_ctx, pmq, p->id))
                             cnt++;
-                    }
+//                    }
                 }
             }
         }
         buf += 1;
     }
 
+    //printf("B2gSearch2: after 2byte cnt %" PRIu32 "\n", cnt);
     if (ctx->pat_x_cnt > 0) {
         /* Pass bufmin on because buf no longer points to the
          * start of the buffer. */
         cnt += ctx->MBSearch(mpm_ctx, mpm_thread_ctx, pmq, bufmin, buflen);
+        //printf("B2gSearch1: after 2+byte cnt %" PRIu32 "\n", cnt);
     }
-
     return cnt;
 }
 #endif
@@ -1777,6 +1609,8 @@ uint32_t B2gCudaSearch1(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
     if (buflen == 0)
         SCReturnUInt(0);
 
+    //printf("BUF "); prt(buf,buflen); printf("\n");
+
     while (buf <= bufend) {
         uint8_t h = u8_tolower(*buf);
         hi = &ctx->hash1[h];
@@ -1790,11 +1624,11 @@ uint32_t B2gCudaSearch1(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
 
                 if (p->flags & MPM_PATTERN_FLAG_NOCASE) {
                     if (u8_tolower(*buf) == p->ci[0]) {
-                        cnt += MpmVerifyMatch(mpm_thread_ctx, pmq, p->em, (buf+1 - bufmin), p->len);
+                        cnt += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id);
                     }
                 } else {
                     if (*buf == p->cs[0]) {
-                        cnt += MpmVerifyMatch(mpm_thread_ctx, pmq, p->em, (buf+1 - bufmin), p->len);
+                        cnt += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id);
                     }
                 }
             }
@@ -1802,22 +1636,57 @@ uint32_t B2gCudaSearch1(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx,
         buf += 1;
     }
 
+    //printf("B2gSearch1: after 1byte cnt %" PRIu32 "\n", cnt);
 #ifdef B2G_CUDA_SEARCH2
     if (ctx->pat_2_cnt) {
         /* Pass bufmin on because buf no longer points to the
          * start of the buffer. */
         cnt += ctx->MBSearch2(mpm_ctx, mpm_thread_ctx, pmq, bufmin, buflen);
+        //printf("B2gSearch1: after 2+byte cnt %" PRIu32 "\n", cnt);
     } else
 #endif
     if (ctx->pat_x_cnt) {
         cnt += ctx->MBSearch(mpm_ctx, mpm_thread_ctx, pmq, bufmin, buflen);
     }
-
     SCReturnUInt(cnt);
 }
 
 /*********************Cuda_Specific_Mgmt_Code_Starts_Here**********************/
 
+typedef struct B2gCudaMpmThreadCtxData_ {
+    int b2g_cuda_module_handle;
+
+    CUcontext b2g_cuda_context;
+    CUmodule b2g_cuda_module;
+
+    /* the search kernel */
+    CUfunction b2g_cuda_search_kernel;
+
+    /* the cuda_search_kernel argument offsets */
+    uint8_t b2g_cuda_search_kernel_arg0_offset;
+    uint8_t b2g_cuda_search_kernel_arg1_offset;
+    uint8_t b2g_cuda_search_kernel_arg2_offset;
+    uint8_t b2g_cuda_search_kernel_arg3_offset;
+    uint8_t b2g_cuda_search_kernel_arg4_offset;
+    uint8_t b2g_cuda_search_kernel_arg5_offset;
+    uint8_t b2g_cuda_search_kernel_arg_total;
+
+    /* the results buffer to hold the match offsets for the packets */
+    uint16_t *results_buffer;
+    /* gpu buffer corresponding to the above buffer */
+    CUdeviceptr cuda_results_buffer;
+
+    /* gpu buffer corresponding to SCCudaPBPacketsBuffer->packets_buffer */
+    CUdeviceptr cuda_packets_buffer;
+    /* gpu buffer corresponding to SCCudaPBPacketsBuffer->packets_offset_buffer */
+    CUdeviceptr cuda_packets_offset_buffer;
+    /* gpu buffer corresponding to SCCudaPBPacketsBuffer->packets_payload_offset_buffer */
+    CUdeviceptr cuda_packets_payload_offset_buffer;
+    /* gpu buffer corresponding to the global symbol g_u8_lowercasetable
+     * XXX Remove this.  Store it as a constant buffer inside the kernel*/
+    CUdeviceptr cuda_g_u8_lowercasetable;
+} B2gCudaMpmThreadCtxData;
+
 /**
  * \brief The Cuda MPM B2G module's thread init function.
  *
@@ -1839,63 +1708,415 @@ TmEcode B2gCudaMpmDispThreadInit(ThreadVars *tv, void *initdata, void **data)
         SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error pushing cuda context");
     }
 
-    return TM_ECODE_OK;
-}
+    B2gCudaMpmThreadCtxData *tctx = malloc(sizeof(B2gCudaMpmThreadCtxData));
+    if (tctx == NULL) {
+        SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
+        exit(EXIT_FAILURE);
+    }
+    memset(tctx, 0, sizeof(B2gCudaMpmThreadCtxData));
 
-/**
- * \brief The Cuda MPM B2G module's thread de-init function.
- *
- * \param tv   Pointer to the ThreadVars which has invoked this function.
- * \param data Pointer to the slot data if anything had been attached in
- *             the thread init function.
- *
- * \retval TM_ECODE_OK Always.
- */
-TmEcode B2gCudaMpmDispThreadDeInit(ThreadVars *tv, void *data)
-{
-    if (PatternMatchDefaultMatcher() != MPM_B2G_CUDA)
-        return TM_ECODE_OK;
+    tctx->b2g_cuda_module_handle = module_data->handle;
 
-    if (SCCudaCtxPopCurrent(NULL) == -1) {
-        SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error popping cuda context");
+    if (SCCudaHlGetCudaContext(&tctx->b2g_cuda_context, module_data->handle) == -1) {
+        SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context");
+        goto error;
     }
 
-    return TM_ECODE_OK;
-}
+#if defined(__x86_64__) || defined(__ia64__)
+    if (SCCudaHlGetCudaModule(&tctx->b2g_cuda_module, b2g_cuda_ptx_image_64_bit,
+                              module_data->handle) == -1) {
+        SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda module");
+    }
+#else
+    if (SCCudaHlGetCudaModule(&tctx->b2g_cuda_module, b2g_cuda_ptx_image_32_bit,
+                              module_data->handle) == -1) {
+        SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda module");
+    }
+#endif
 
-/**
- * \brief The dispatcher function for the cuda mpm.  Takes a packet, feeds
- *        it to the gpu and informs the calling client when it has the
- *        results ready.
- *
- * \param tv   We don't need this.
- * \param p    Pointer to the Packet which contains all the relevant data,
- *             like the bufffer, buflen, the contexts.
- * \param data Pointer to the slot data if anything had been attached in
- *             the thread init function.
- * \param pq   We don't need this.
- *
- * \retval TM_ECODE_OK Always.
- */
-TmEcode B2gCudaMpmDispatcher(ThreadVars *tv, Packet *p, void *data,
-                             PacketQueue *pq)
-{
-    if (p == NULL)
-        return TM_ECODE_OK;
+    if (SCCudaModuleGetFunction(&tctx->b2g_cuda_search_kernel,
+                                tctx->b2g_cuda_module,
+                                B2G_CUDA_SEARCHFUNC_NAME) == -1) {
+        SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda function");
+        goto error;
+    }
 
-    p->cuda_matches = mpm_table[p->cuda_mpm_ctx->mpm_type].Search(p->cuda_mpm_ctx,
-                                                                  p->cuda_mtc,
-                                                                  p->cuda_pmq,
-                                                                  p->payload,
-                                                                  p->payload_len);
-    TmqhOutputSimpleOnQ(p->cuda_outq, p);
+    if (SCCudaFuncSetBlockShape(tctx->b2g_cuda_search_kernel, 32, 1, 1) == -1) {
+        SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error setting function block shape");
+        goto error;
+    }
 
-    return TM_ECODE_OK;
-}
+#define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
 
-/**
- * \brief Registers the Cuda B2G MPM Module.
- */
+    int offset = 0;
+
+    ALIGN_UP(offset, __alignof(void *));
+    tctx->b2g_cuda_search_kernel_arg0_offset = offset;
+    offset += sizeof(void *);
+
+    ALIGN_UP(offset, __alignof(void *));
+    tctx->b2g_cuda_search_kernel_arg1_offset = offset;
+    offset += sizeof(void *);
+
+    ALIGN_UP(offset, __alignof(void *));
+    tctx->b2g_cuda_search_kernel_arg2_offset = offset;
+    offset += sizeof(void *);
+
+    ALIGN_UP(offset, __alignof(void *));
+    tctx->b2g_cuda_search_kernel_arg3_offset = offset;
+    offset += sizeof(void *);
+
+    ALIGN_UP(offset, __alignof(uint16_t));
+    tctx->b2g_cuda_search_kernel_arg4_offset = offset;
+    offset += sizeof(void *);
+
+    ALIGN_UP(offset, __alignof(void *));
+    tctx->b2g_cuda_search_kernel_arg5_offset = offset;
+    offset += sizeof(void *);
+
+    tctx->b2g_cuda_search_kernel_arg_total = offset;
+
+    /* buffer to hold the b2g cuda mpm match results for 4000 packets.  The
+     * extra 2 bytes(the 1 in 1481 instead of 1480) is to hold the no of
+     * matches for the payload.  The remaining 1480 positions in the buffer
+     * is to hold the match offsets */
+    tctx->results_buffer = malloc(sizeof(uint16_t) * 1481 * SC_CUDA_PB_MIN_NO_OF_PACKETS);
+    if (tctx->results_buffer == NULL) {
+        SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
+        exit(EXIT_FAILURE);
+    }
+
+    if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_results_buffer,
+                                 "MPM_B2G_RESULTS",
+                                 sizeof(uint16_t) * 1481 * SC_CUDA_PB_MIN_NO_OF_PACKETS,
+                                 NULL, module_data->handle) == -1) {
+        goto error;
+    }
+
+    if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_g_u8_lowercasetable,
+                                 "G_U8_LOWERCASETABLE", 256 * sizeof(char),
+                                 g_u8_lowercasetable, module_data->handle) == -1) {
+        goto error;
+    }
+
+    if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_packets_buffer,
+                                 "MPM_B2G_PACKETS_BUFFER",
+                                 (sizeof(SCCudaPBPacketDataForGPU) *
+                                  SC_CUDA_PB_MIN_NO_OF_PACKETS),
+                                 NULL, module_data->handle) == -1) {
+        goto error;
+    }
+
+    if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_packets_offset_buffer,
+                                 "MPM_B2G_PACKETS_BUFFER_OFFSETS",
+                                 sizeof(uint32_t) * SC_CUDA_PB_MIN_NO_OF_PACKETS,
+                                 NULL, module_data->handle) == -1) {
+        goto error;
+    }
+
+    if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_packets_payload_offset_buffer,
+                                 "MPM_B2G_PACKETS_PAYLOAD_BUFFER_OFFSETS",
+                                 sizeof(uint32_t) * SC_CUDA_PB_MIN_NO_OF_PACKETS,
+                                 NULL, module_data->handle) == -1) {
+        goto error;
+    }
+
+    if (SCCudaParamSetv(tctx->b2g_cuda_search_kernel,
+                        tctx->b2g_cuda_search_kernel_arg0_offset,
+                        (void *)&tctx->cuda_results_buffer,
+                        sizeof(void *)) == -1) {
+        goto error;
+    }
+
+    if (SCCudaParamSetv(tctx->b2g_cuda_search_kernel,
+                        tctx->b2g_cuda_search_kernel_arg1_offset,
+                        (void *)&tctx->cuda_packets_buffer,
+                        sizeof(void *)) == -1) {
+        goto error;
+    }
+
+    if (SCCudaParamSetv(tctx->b2g_cuda_search_kernel,
+                        tctx->b2g_cuda_search_kernel_arg2_offset,
+                        (void *)&tctx->cuda_packets_offset_buffer,
+                        sizeof(void *)) == -1) {
+        goto error;
+    }
+
+    if (SCCudaParamSetv(tctx->b2g_cuda_search_kernel,
+                        tctx->b2g_cuda_search_kernel_arg3_offset,
+                        (void *)&tctx->cuda_packets_payload_offset_buffer,
+                        sizeof(void *)) == -1) {
+        goto error;
+    }
+
+    if (SCCudaParamSetv(tctx->b2g_cuda_search_kernel,
+                        tctx->b2g_cuda_search_kernel_arg5_offset,
+                        (void *)&tctx->cuda_g_u8_lowercasetable,
+                        sizeof(void *)) == -1) {
+        goto error;
+    }
+
+    if (SCCudaParamSetSize(tctx->b2g_cuda_search_kernel,
+                           tctx->b2g_cuda_search_kernel_arg_total) == -1) {
+        goto error;
+    }
+
+    *data = tctx;
+
+     return TM_ECODE_OK;
+
+ error:
+    return TM_ECODE_FAILED;
+}
+
+/**
+ * \brief The Cuda MPM B2G module's thread de-init function.
+ *
+ * \param tv   Pointer to the ThreadVars which has invoked this function.
+ * \param data Pointer to the slot data if anything had been attached in
+ *             the thread init function.
+ *
+ * \retval TM_ECODE_OK Always.
+ */
+TmEcode B2gCudaMpmDispThreadDeInit(ThreadVars *tv, void *data)
+{
+    B2gCudaMpmThreadCtxData *tctx = data;
+
+    if (tctx == NULL) {
+        SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid arguments.  data NULL\n");
+        return TM_ECODE_OK;
+    }
+
+    if (PatternMatchDefaultMatcher() != MPM_B2G_CUDA)
+        return TM_ECODE_OK;
+
+    CUcontext dummy_context;
+    SCCudaHlModuleData *module_data = SCCudaHlGetModuleData(tctx->b2g_cuda_module_handle);
+    if (module_data == NULL) {
+        SCLogError(SC_ERR_B2G_CUDA_ERROR, "How did we even fail to get a "
+                   "module_data if we are having a module_handle");
+        goto error;
+    }
+    if (SCCudaHlGetCudaContext(&dummy_context, tctx->b2g_cuda_module_handle) == -1) {
+        SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the "
+                   "module %s", module_data->name);
+        goto error;
+    }
+    SCCudaCtxPushCurrent(dummy_context);
+
+    free(tctx->results_buffer);
+    SCCudaHlFreeCudaDevicePtr("MPM_B2G_RESULTS", tctx->b2g_cuda_module_handle);
+    SCCudaHlFreeCudaDevicePtr("MPM_B2G_PACKETS_BUFFER", tctx->b2g_cuda_module_handle);
+    SCCudaHlFreeCudaDevicePtr("MPM_B2G_PACKETS_BUFFER_OFFSETS",
+                              tctx->b2g_cuda_module_handle);
+    SCCudaHlFreeCudaDevicePtr("MPM_B2G_PACKETS_PAYLOAD_BUFFER_OFFSETS",
+                              tctx->b2g_cuda_module_handle);
+    SCCudaHlFreeCudaDevicePtr("G_U8_LOWERCASETABLE", tctx->b2g_cuda_module_handle);
+
+    free(tctx);
+
+    if (SCCudaCtxPopCurrent(NULL) == -1) {
+        SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error popping cuda context");
+    }
+
+    return TM_ECODE_OK;
+
+ error:
+    return TM_ECODE_FAILED;
+}
+
+/**
+ * \brief The dispatcher function for the cuda mpm.  Takes a packet, feeds
+ *        it to the gpu and informs the calling client when it has the
+ *        results ready.
+ *
+ * \param tv   We don't need this.
+ * \param p    Pointer to the Packet which contains all the relevant data,
+ *             like the bufffer, buflen, the contexts.
+ * \param data Pointer to the slot data if anything had been attached in
+ *             the thread init function.
+ * \param pq   We don't need this.
+ *
+ * \retval TM_ECODE_OK Always.
+ */
+TmEcode B2gCudaMpmDispatcher(ThreadVars *tv, Packet *incoming_buffer,
+                             void *data, PacketQueue *pq)
+{
+    SCCudaPBPacketsBuffer *pb = (SCCudaPBPacketsBuffer *)incoming_buffer;
+    B2gCudaMpmThreadCtxData *tctx = data;
+    uint32_t i = 0;
+
+    SCLogDebug("Running the B2g CUDA mpm dispatcher");
+
+    if (pb == NULL) {
+        SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument.  pb is NULL!!");
+        return TM_ECODE_OK;
+    }
+
+    if (SCCudaMemcpyHtoD(tctx->cuda_packets_buffer, pb->packets_buffer,
+                         pb->packets_buffer_len) == -1) {
+        goto error;
+    }
+
+    if (SCCudaMemcpyHtoD(tctx->cuda_packets_offset_buffer,
+                         pb->packets_offset_buffer,
+                         sizeof(uint32_t) * pb->nop_in_buffer) == -1) {
+        goto error;
+    }
+
+    if (SCCudaMemcpyHtoD(tctx->cuda_packets_payload_offset_buffer,
+                         pb->packets_payload_offset_buffer,
+                         sizeof(uint32_t) * pb->nop_in_buffer) == -1) {
+        goto error;
+    }
+
+    if (SCCudaParamSeti(tctx->b2g_cuda_search_kernel, tctx->b2g_cuda_search_kernel_arg4_offset,
+                        pb->nop_in_buffer) == -1) {
+        goto error;
+    }
+
+    /* the no of threads per block has already been set to 32
+     * \todo if we are very sure we are allocating a multiple of block_size
+     * buffer_threshold, then we can remove this + 1 here below */
+    int no_of_cuda_blocks = (pb->nop_in_buffer / 32) + 1;
+    if (SCCudaLaunchGrid(tctx->b2g_cuda_search_kernel, no_of_cuda_blocks, 1) == -1) {
+        goto error;
+    }
+
+    if (SCCudaMemcpyDtoH(tctx->results_buffer,
+                         tctx->cuda_results_buffer,
+                         sizeof(uint16_t) * (pb->nop_in_buffer + pb->packets_total_payload_len)) == -1) {
+        goto error;
+    }
+
+    i = 0;
+    for (i = 0; i < pb->nop_in_buffer; i++) {
+        memcpy(pb->packets_address_buffer[i]->mpm_offsets,
+               (tctx->results_buffer + i +
+                pb->packets_payload_offset_buffer[i]),
+               (pb->packets_address_buffer[i]->payload_len + 1) * sizeof(uint16_t));
+        SCMutexLock(&pb->packets_address_buffer[i]->cuda_mutex);
+        pb->packets_address_buffer[i]->cuda_done = 1;
+        SCMutexUnlock(&pb->packets_address_buffer[i]->cuda_mutex);
+        SCCondSignal(&pb->packets_address_buffer[i]->cuda_cond);
+    }
+
+    SCLogDebug("B2g Cuda mpm dispatcher returning");
+    return TM_ECODE_OK;
+
+ error:
+    for (i = 0; i < pb->nop_in_buffer; i++) {
+        SCMutexLock(&pb->packets_address_buffer[i]->cuda_mutex);
+        pb->packets_address_buffer[i]->cuda_done = 1;
+        SCMutexUnlock(&pb->packets_address_buffer[i]->cuda_mutex);
+        SCCondSignal(&pb->packets_address_buffer[i]->cuda_cond);
+    }
+    SCLogError(SC_ERR_B2G_CUDA_ERROR, "B2g Cuda mpm dispatcher returning with error");
+    return TM_ECODE_OK;
+}
+
+/**
+ * \brief The post processing of cuda mpm b2g results for a packet
+ *        is done here.  Will be used by the detection thread.  We basically
+ *        obtain the match offsets from the cuda mpm search and carry out
+ *        further matches on those offsets.  Also if the results are not
+ *        read for a packet, we wait on the conditional, which will then
+ *        be signalled by the cuda mpm dispatcher thread, once the results
+ *        for the packet are ready.
+ *
+ * \param p              Pointer to the packet whose mpm cuda results are
+ *                       to be further processed.
+ * \param mpm_ctx        Pointer to the mpm context for this packet.
+ * \param mpm_thread_ctx Pointer to the mpm thread context.
+ * \param pmq            Pointer to the patter matcher queue.
+ *
+ * \retval matches Holds the no of matches.
+ */
+int B2gCudaResultsPostProcessing(Packet *p, MpmCtx *mpm_ctx,
+                                 MpmThreadCtx *mpm_thread_ctx,
+                                 PatternMatcherQueue *pmq)
+{
+    B2gCudaCtx *ctx = mpm_ctx->ctx;
+
+    while (p->cuda_done == 0) {
+        SCMutexLock(&p->cuda_mutex);
+        if (p->cuda_done == 1) {
+            SCMutexUnlock(&p->cuda_mutex);
+            break;
+        } else {
+            SCondWait(&p->cuda_cond, &p->cuda_mutex);
+            SCMutexUnlock(&p->cuda_mutex);
+        }
+    }
+
+    /* reset this flag for the packet */
+    p->cuda_done = 0;
+
+    uint16_t *no_of_matches = p->mpm_offsets;
+    uint16_t *host_offsets = p->mpm_offsets + 1;
+    int i = 0, h = 0;
+    uint8_t *buf = p->payload;
+    uint16_t buflen = p->payload_len;
+    int matches = 0;
+    for (i = 0; i < no_of_matches[0]; i++) {
+        h = B2G_CUDA_HASH16(u8_tolower(buf[host_offsets[i] + ctx->m - 2]),
+                            u8_tolower(buf[host_offsets[i] + ctx->m - 1]));
+
+        if (ctx->bloom[h] != NULL) {
+            COUNT(tctx->stat_pminlen_calls++);
+            COUNT(tctx->stat_pminlen_total+=ctx->pminlen[h]);
+
+            if ((buflen - host_offsets[i]) < ctx->pminlen[h]) {
+                continue;
+            } else {
+                COUNT(tctx->stat_bloom_calls++);
+
+                if (BloomFilterTest(ctx->bloom[h], buf + host_offsets[i], ctx->pminlen[h]) == 0) {
+                    COUNT(tctx->stat_bloom_hits++);
+
+                    continue;
+                }
+            }
+        }
+
+        B2gCudaHashItem *hi = ctx->hash[h], *thi;
+        for (thi = hi; thi != NULL; thi = thi->nxt) {
+            COUNT(tctx->stat_d0_hashloop++);
+            B2gCudaPattern *p = ctx->parray[thi->idx];
+
+            if (p->flags & MPM_PATTERN_FLAG_NOCASE) {
+                if ((buflen - host_offsets[i]) < p->len) {
+                    continue;
+                }
+
+                if (memcmp_lowercase(p->ci, buf + host_offsets[i], p->len) == 0) {
+                    COUNT(tctx->stat_loop_match++);
+
+                    matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id);
+                } else {
+                    COUNT(tctx->stat_loop_no_match++);
+                }
+            } else {
+                if (buflen - host_offsets[i] < p->len)
+                    continue;
+
+                if (memcmp(p->cs, buf +  host_offsets[i], p->len) == 0) {
+                    COUNT(tctx->stat_loop_match++);
+
+                    matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id);
+                } else {
+                    COUNT(tctx->stat_loop_no_match++);
+                }
+            }
+        }
+    }
+
+    return matches;
+}
+
+/**
+ * \brief Registers the Cuda B2G MPM Module.
+ */
 void TmModuleCudaMpmB2gRegister(void)
 {
     tmm_modules[TMM_CUDA_MPM_B2G].name = "Cuda_Mpm_B2g";
@@ -1908,6 +2129,72 @@ void TmModuleCudaMpmB2gRegister(void)
 
 /***************************Code_Specific_To_Mpm_B2g***************************/
 
+void *CudaMpmB2gThreadsSlot1(void *td)
+{
+    ThreadVars *tv = (ThreadVars *)td;
+    Tm1Slot *s = (Tm1Slot *)tv->tm_slots;
+    SCCudaPBPacketsBuffer *data = NULL;
+    char run = 1;
+    TmEcode r = TM_ECODE_OK;
+
+    /* Set the thread name */
+    SCSetThreadName(tv->name);
+
+    if (tv->thread_setup_flags != 0)
+        TmThreadSetupOptions(tv);
+
+    SCLogDebug("%s starting", tv->name);
+
+    if (s->s.SlotThreadInit != NULL) {
+        r = s->s.SlotThreadInit(tv, s->s.slot_initdata, &s->s.slot_data);
+        if (r != TM_ECODE_OK) {
+            EngineKill();
+
+            TmThreadsSetFlag(tv, THV_CLOSED);
+            pthread_exit((void *) -1);
+        }
+    }
+    memset(&s->s.slot_pq, 0, sizeof(PacketQueue));
+
+    TmThreadsSetFlag(tv, THV_INIT_DONE);
+    while(run) {
+        TmThreadTestThreadUnPaused(tv);
+
+        /* input data */
+        data = (SCCudaPBPacketsBuffer *)TmqhInputSimpleOnQ(&data_queues[tv->inq->id]);
+
+        if (data == NULL) {
+            //printf("%s: TmThreadsSlot1: p == NULL\n", tv->name);
+        } else {
+            r = s->s.SlotFunc(tv, (Packet *)data, s->s.slot_data, &s->s.slot_pq);
+            /* handle error */
+
+            /* output the packet */
+            TmqhOutputSimpleOnQ(&data_queues[tv->outq->id], (SCDQGenericQData *)data);
+        }
+
+        if (TmThreadsCheckFlag(tv, THV_KILL)) {
+            run = 0;
+        }
+    }
+
+    if (s->s.SlotThreadExitPrintStats != NULL) {
+        s->s.SlotThreadExitPrintStats(tv, s->s.slot_data);
+    }
+
+    if (s->s.SlotThreadDeinit != NULL) {
+        r = s->s.SlotThreadDeinit(tv, s->s.slot_data);
+        if (r != TM_ECODE_OK) {
+            TmThreadsSetFlag(tv, THV_CLOSED);
+            pthread_exit((void *) -1);
+        }
+    }
+
+    SCLogDebug("%s ending", tv->name);
+    TmThreadsSetFlag(tv, THV_CLOSED);
+    pthread_exit((void *) 0);
+}
+
 int B2gCudaStartDispatcherThreadRC(const char *name)
 {
     SCCudaHlModuleData *data = NULL;
@@ -1935,15 +2222,17 @@ int B2gCudaStartDispatcherThreadRC(const char *name)
     }
 
     /* create the threads */
-    tv_CMB2_RC = TmThreadCreatePacketHandler("Cuda_Mpm_B2g_RC",
-                                             "rules_content_mpm_inqueue", "simple",
-                                             NULL, NULL,
-                                             "1slot_noout");
+    tv_CMB2_RC = TmThreadCreate("Cuda_Mpm_B2g_RC",
+                                "cuda_batcher_mpm_outqueue", "simple",
+                                "cuda_batcher_mpm_inqueue", "simple",
+                                "custom", CudaMpmB2gThreadsSlot1, 0);
     if (tv_CMB2_RC == NULL) {
         SCLogError(SC_ERR_TM_THREADS_ERROR, "ERROR: TmThreadsCreate failed");
         exit(EXIT_FAILURE);
     }
-    tv_CMB2_RC->inq->writer_cnt++;
+    tv_CMB2_RC->type = TVT_PPT;
+    tv_CMB2_RC->inq->q_type = 1;
+    tv_CMB2_RC->outq->q_type = 1;
 
     tm_module = TmModuleGetByName("Cuda_Mpm_B2g");
     if (tm_module == NULL) {
@@ -1981,28 +2270,10 @@ void B2gCudaKillDispatcherThreadRC(void)
     return;
 }
 
-void B2gCudaPushPacketTo_tv_CMB2_RC(Packet *p)
-{
-    PacketQueue *q = &trans_q[tv_CMB2_RC->inq->id];
-
-    SCMutexLock(&q->mutex_q);
-    PacketEnqueue(q, p);
-    SCCondSignal(&q->cond_q);
-    SCMutexUnlock(&q->mutex_q);
-
-    return;
-}
-
 /*********************************Unittests************************************/
 
 #ifdef UNITTESTS
 
-static int B2gCudaTestInitTestEnv(void)
-{
-    SCCudaHlRegisterModule("B2G_CUDA_TEST");
-
-    return 1;
-}
 
 static int B2gCudaTest01(void)
 {
@@ -2010,1207 +2281,621 @@ static int B2gCudaTest01(void)
     MpmThreadCtx mpm_thread_ctx;
     B2gCudaCtx *ctx = NULL;
     int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
+    int module_handle = SCCudaHlRegisterModule("B2G_CUDA_TEST");
+    SCCudaHlModuleData *module_data = SCCudaHlGetModuleData(module_handle);
+    SCCudaPBPacketsBuffer *pb = NULL;
+
+    /* get the cuda context and push it */
+    CUcontext dummy_context;
+    if (SCCudaHlGetCudaContext(&dummy_context, module_handle) == -1) {
+        SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the "
+                   "module SC_RULES_CONTENT_B2G_CUDA");
+    }
+    SCCudaCtxPushCurrent(dummy_context);
 
     memset(&mpm_ctx, 0, sizeof(MpmCtx));
     B2gCudaInitCtx(&mpm_ctx, module_handle);
+    /* pop the context before we make further calls to the mpm cuda dispatcher */
+    SCCudaCtxPopCurrent(NULL);
+
+    B2gCudaMpmThreadCtxData *tctx = NULL;
+    B2gCudaMpmDispThreadInit(NULL, module_data, (void *)&tctx);
 
     ctx = mpm_ctx.ctx;
 
-    if (ctx->cuda_context == 0)
+    if (tctx->b2g_cuda_context == 0)
         goto end;
-    if (ctx->cuda_module == 0)
+    if (tctx->b2g_cuda_module == 0)
         goto end;
-    if (ctx->cuda_search_kernel == 0)
+    if (tctx->b2g_cuda_search_kernel == 0)
         goto end;
 
     if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"one", 3, 0, 0, 1, 1, 0) == -1)
         goto end;
-    if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"two", 3, 0, 0, 2, 1, 0) == -1)
-        goto end;
-    if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"three", 5, 0, 0, 3, 1, 0) == -1)
-        goto end;
-    if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"four", 4, 0, 0, 4, 1, 0) == -1)
-        goto end;
 
     if (B2gCudaPreparePatterns(&mpm_ctx) == -1)
         goto end;
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 4 /* 4 patterns */);
+    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
 
-    char *string = "onetwothreeaaaaoneaatwobbbthrbsonwehowvonwoonsldffoursadnothreewtowoneowtwo";
-    result = (B2gCudaSearchBNDMq(&mpm_ctx, &mpm_thread_ctx, NULL,
-                                 (uint8_t *)string, strlen(string)) == 9);
+    result = 1;
 
- end:
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
+    pb = SCCudaPBAllocSCCudaPBPacketsBuffer();
+    SCCudaPBPacketDataForGPU *curr_packet = (SCCudaPBPacketDataForGPU *)pb->packets_buffer;
 
-static int B2gCudaTest02(void)
-{
-    MpmCtx mpm_ctx;
-    MpmThreadCtx mpm_thread_ctx;
-    B2gCudaCtx *ctx = NULL;
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-
-    memset(&mpm_ctx, 0, sizeof(MpmCtx));
-    B2gCudaInitCtx(&mpm_ctx, module_handle);
-
-    ctx = mpm_ctx.ctx;
+    char *string = "tone_one_one_one";
+    curr_packet->m = ctx->m;
+    curr_packet->table = ctx->cuda_B2G;
+    curr_packet->payload_len = strlen(string);
+    memcpy(curr_packet->payload, string, strlen(string));
 
-    if (ctx->cuda_context == 0)
-        goto end;
-    if (ctx->cuda_module == 0)
-        goto end;
-    if (ctx->cuda_search_kernel == 0)
-        goto end;
+    pb->nop_in_buffer = 1;
+    pb->packets_buffer_len = sizeof(SCCudaPBPacketDataForGPUNonPayload) + strlen(string);
+    pb->packets_total_payload_len = strlen(string);
+    pb->packets_offset_buffer[0] = 0;
+    pb->packets_payload_offset_buffer[0] = 0;
 
-    if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"one", 3, 0, 0, 1, 1, 0) == -1)
-        goto end;
-    if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"two", 3, 0, 0, 2, 1, 0) == -1)
-        goto end;
-    if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"three", 5, 0, 0, 3, 1, 0) == -1)
-        goto end;
-    if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"four", 4, 0, 0, 4, 1, 0) == -1)
-        goto end;
+    Packet p;
+    memset(&p, 0, sizeof(Packet));
+    pb->packets_address_buffer[0] = &p;
+    p.payload_len = strlen(string);
 
-    if (B2gCudaPreparePatterns(&mpm_ctx) == -1)
-        goto end;
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 4 /* 4 patterns */);
+    B2gCudaMpmDispatcher(NULL, (Packet *)pb, tctx, NULL);
 
-    char *string = "onetwothreeaaaaoneaatwobbbthrbsonwehowvonwoonsldffoursadnothreewtowoneowtwo";
-    result = (B2gCudaSearchBNDMq(&mpm_ctx, &mpm_thread_ctx, NULL,
-                                 (uint8_t *)string, strlen(string)) == 9);
+    result &= (p.mpm_offsets[0] == 4);
+    result &= (p.mpm_offsets[1] == 1);
+    result &= (p.mpm_offsets[2] == 5);
+    result &= (p.mpm_offsets[3] == 9);
+    result &= (p.mpm_offsets[4] == 13);
 
  end:
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
+    SCCudaPBDeAllocSCCudaPBPacketsBuffer(pb);
+    B2gCudaMpmDispThreadDeInit(NULL, (void *)tctx);
     B2gCudaDestroyCtx(&mpm_ctx);
+    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
     return result;
 }
 
-/**
- * \test Test that the *AddPattern* functions work as expected.
- */
-static int B2gCudaTest03(void)
+static int B2gCudaTest02(void)
 {
-    MpmCtx mpm_ctx;
-    B2gCudaCtx *ctx = NULL;
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
+    uint8_t raw_eth[] = {
+        0x00, 0x25, 0x00, 0x9e, 0xfa, 0xfe, 0x00, 0x02,
+        0xcf, 0x74, 0xfe, 0xe1, 0x08, 0x00, 0x45, 0x00,
+        0x01, 0xcc, 0xcb, 0x91, 0x00, 0x00, 0x34, 0x06,
+        0xdf, 0xa8, 0xd1, 0x55, 0xe3, 0x67, 0xc0, 0xa8,
+        0x64, 0x8c, 0x00, 0x50, 0xc0, 0xb7, 0xd1, 0x11,
+        0xed, 0x63, 0x81, 0xa9, 0x9a, 0x05, 0x80, 0x18,
+        0x00, 0x75, 0x0a, 0xdd, 0x00, 0x00, 0x01, 0x01,
+        0x08, 0x0a, 0x09, 0x8a, 0x06, 0xd0, 0x12, 0x21,
+        0x2a, 0x3b, 0x48, 0x54, 0x54, 0x50, 0x2f, 0x31,
+        0x2e, 0x31, 0x20, 0x33, 0x30, 0x32, 0x20, 0x46,
+        0x6f, 0x75, 0x6e, 0x64, 0x0d, 0x0a, 0x4c, 0x6f,
+        0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20,
+        0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77,
+        0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c,
+        0x65, 0x2e, 0x65, 0x73, 0x2f, 0x0d, 0x0a, 0x43,
+        0x61, 0x63, 0x68, 0x65, 0x2d, 0x43, 0x6f, 0x6e,
+        0x74, 0x72, 0x6f, 0x6c, 0x3a, 0x20, 0x70, 0x72,
+        0x69, 0x76, 0x61, 0x74, 0x65, 0x0d, 0x0a, 0x43,
+        0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54,
+        0x79, 0x70, 0x65, 0x3a, 0x20, 0x74, 0x65, 0x78,
+        0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x20,
+        0x63, 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d,
+        0x55, 0x54, 0x46, 0x2d, 0x38, 0x0d, 0x0a, 0x44,
+        0x61, 0x74, 0x65, 0x3a, 0x20, 0x4d, 0x6f, 0x6e,
+        0x2c, 0x20, 0x31, 0x34, 0x20, 0x53, 0x65, 0x70,
+        0x20, 0x32, 0x30, 0x30, 0x39, 0x20, 0x30, 0x38,
+        0x3a, 0x34, 0x38, 0x3a, 0x33, 0x31, 0x20, 0x47,
+        0x4d, 0x54, 0x0d, 0x0a, 0x53, 0x65, 0x72, 0x76,
+        0x65, 0x72, 0x3a, 0x20, 0x67, 0x77, 0x73, 0x0d,
+        0x0a, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+        0x2d, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3a,
+        0x20, 0x32, 0x31, 0x38, 0x0d, 0x0a, 0x0d, 0x0a,
+        0x3c, 0x48, 0x54, 0x4d, 0x4c, 0x3e, 0x3c, 0x48,
+        0x45, 0x41, 0x44, 0x3e, 0x3c, 0x6d, 0x65, 0x74,
+        0x61, 0x20, 0x68, 0x74, 0x74, 0x70, 0x2d, 0x65,
+        0x71, 0x75, 0x69, 0x76, 0x3d, 0x22, 0x63, 0x6f,
+        0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x74, 0x79,
+        0x70, 0x65, 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74,
+        0x65, 0x6e, 0x74, 0x3d, 0x22, 0x74, 0x65, 0x78,
+        0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x63,
+        0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, 0x75,
+        0x74, 0x66, 0x2d, 0x38, 0x22, 0x3e, 0x0a, 0x3c,
+        0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x33, 0x30,
+        0x32, 0x20, 0x4d, 0x6f, 0x76, 0x65, 0x64, 0x3c,
+        0x2f, 0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x3c,
+        0x2f, 0x48, 0x45, 0x41, 0x44, 0x3e, 0x3c, 0x42,
+        0x4f, 0x44, 0x59, 0x3e, 0x0a, 0x3c, 0x48, 0x31,
+        0x3e, 0x33, 0x30, 0x32, 0x20, 0x4d, 0x6f, 0x76,
+        0x65, 0x64, 0x3c, 0x2f, 0x48, 0x31, 0x3e, 0x0a,
+        0x54, 0x68, 0x65, 0x20, 0x64, 0x6f, 0x63, 0x75,
+        0x6d, 0x65, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x73,
+        0x20, 0x6d, 0x6f, 0x76, 0x65, 0x64, 0x0a, 0x3c,
+        0x41, 0x20, 0x48, 0x52, 0x45, 0x46, 0x3d, 0x22,
+        0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77,
+        0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c,
+        0x65, 0x2e, 0x65, 0x73, 0x2f, 0x22, 0x3e, 0x68,
+        0x65, 0x72, 0x65, 0x3c, 0x2f, 0x41, 0x3e, 0x2e,
+        0x0d, 0x0a, 0x3c, 0x2f, 0x42, 0x4f, 0x44, 0x59,
+        0x3e, 0x3c, 0x2f, 0x48, 0x54, 0x4d, 0x4c, 0x3e,
+        0x0d, 0x0a };
 
-    memset(&mpm_ctx, 0, sizeof(MpmCtx));
-    B2gCudaInitCtx(&mpm_ctx, module_handle);
+    int result = 0;
+    const char *strings[10] = {
+        "test_test_one",
+        "test_two_test",
+        "test_three_test",
+        "test_four_test",
+        "test_five_test",
+        "test_six_test",
+        "test_seven_test",
+        "test_eight_test",
+        "test_nine_test",
+        "test_ten_test"};
+    /* don't shoot me for hardcoding the results.  We will change this in
+     * sometime, by running a separate mpm on the cpu, and then hold
+     * the results in this temp buffer */
+    int results[10][2] = { {0, 5},
+                           {0, 9},
+                           {0, 11},
+                           {0, 10},
+                           {0, 10},
+                           {0, 9},
+                           {0, 11},
+                           {0, 11},
+                           {0, 10},
+                           {0, 9} };
+    Packet *p[10];
+    SCCudaPBThreadCtx *pb_tctx = NULL;
+
+    DecodeThreadVars dtv;
+    ThreadVars tv;
+    DetectEngineCtx *de_ctx = NULL;
+
+    SCCudaPBPacketsBuffer *pb = NULL;
+    SCDQDataQueue *dq = NULL;
+
+    char *inq_name = "cuda_batcher_mpm_inqueue";
+    char *outq_name = "cuda_batcher_mpm_outqueue";
+
+    Tmq *tmq_outq = NULL;
+    Tmq *tmq_inq = NULL;
+
+    uint32_t i = 0, j = 0;
+
+    uint8_t no_of_pkts = 10;
+
+    memset(&dtv, 0, sizeof(DecodeThreadVars));
+    memset(&tv, 0, sizeof(ThreadVars));
+
+    FlowInitConfig(FLOW_QUIET);
+
+    memset(p, 0, sizeof(p));
+    for (i = 0; i < no_of_pkts; i++) {
+        p[i] = malloc(sizeof(Packet));
+        if (p[i] == NULL) {
+            printf("error allocating memory\n");
+            exit(EXIT_FAILURE);
+        }
+        memset(p[i], 0, sizeof(Packet));
+        DecodeEthernet(&tv, &dtv, p[i], raw_eth, sizeof(raw_eth), NULL);
+    }
 
-    ctx = mpm_ctx.ctx;
-    if (ctx->cuda_context == 0)
-        goto end;
-    if (ctx->cuda_module == 0)
-        goto end;
-    if (ctx->cuda_search_kernel == 0)
+    de_ctx = DetectEngineCtxInit();
+    if (de_ctx == NULL) {
         goto end;
+    }
 
-    if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"onee", 4, 0, 0, 1, 1, 0) == -1)
-        goto end;
-    if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"twoo", 4, 0, 0, 2, 1, 0) == -1)
-        goto end;
-    if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"three", 5, 0, 0, 3, 1, 0) == -1)
-        goto end;
-    if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"four", 4, 0, 0, 4, 1, 0) == -1)
-        goto end;
-    if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"onee", 4, 0, 0, 1, 2, 0) == -1)
-        goto end;
+    de_ctx->mpm_matcher = MPM_B2G_CUDA;
+    de_ctx->flags |= DE_QUIET;
 
-    if (B2gCudaPreparePatterns(&mpm_ctx) == -1)
+    de_ctx->sig_list = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; "
+                               "content:test; sid:1;)");
+    if (de_ctx->sig_list == NULL) {
+        printf("signature parsing failed\n");
         goto end;
+    }
+    SigGroupBuild(de_ctx);
 
-    char *string = "one";
-    result = (B2gCudaSearchBNDMq(&mpm_ctx, NULL, NULL, (uint8_t *)string, strlen(string)) == 0);
-
- end:
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-/**
- * \test Test that the *AddPattern* functions work as expected.
- */
-static int B2gCudaTest04(void)
-{
-    MpmCtx mpm_ctx;
-    MpmThreadCtx mpm_thread_ctx;
-    B2gCudaCtx *ctx = NULL;
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-
-    memset(&mpm_ctx, 0, sizeof(MpmCtx));
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-
-    ctx = mpm_ctx.ctx;
-    if (ctx->cuda_context == 0)
-        goto end;
-    if (ctx->cuda_module == 0)
-        goto end;
-    if (ctx->cuda_search_kernel == 0)
-        goto end;
+    SCCudaPBSetUpQueuesAndBuffers();
 
-    if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"one", 3, 0, 0, 1, 1, 0) == -1)
+    /* get the queues used by the batcher thread */
+    tmq_inq = TmqGetQueueByName(inq_name);
+    if (tmq_inq == NULL) {
+        printf("tmq_inq NULL\n");
         goto end;
-    if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"two", 3, 0, 0, 2, 1, 0) == -1)
-        goto end;
-    if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"three", 5, 0, 0, 3, 1, 0) == -1)
-        goto end;
-    if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"four", 4, 0, 0, 4, 1, 0) == -1)
-        goto end;
-
-    if (B2gCudaPreparePatterns(&mpm_ctx) == -1)
+    }
+    tmq_outq = TmqGetQueueByName(outq_name);
+    if (tmq_outq == NULL) {
+        printf("tmq_outq NULL\n");
         goto end;
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 4 /* 4 patterns */);
-
-    char *string = "onetwothreeaaaaoneaatwobbbthrbsonwehowvonwfouoonsldffoursadnothreewtowoneowtwo";
-    result = (B2gCudaSearchBNDMq(&mpm_ctx, &mpm_thread_ctx,
-                                 NULL, (uint8_t *)string, strlen(string)) == 9);
+    }
 
     result = 1;
 
- end:
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch01(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-    char *buf = "abcdefghjiklmnopqrstuvwxyz";
-
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf, strlen(buf));
-
-    if (cnt == 1)
-        result = 1;
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-
-    return result;
-}
-
-static int B2gCudaTestSearch02(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-    char *buf = "abcdefghjiklmnopqrstuvwxyz";
-
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abce", 4, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf, strlen(buf));
-
-    if (cnt == 0)
-        result = 1;
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-
-    return result;
-}
-
-static int B2gCudaTestSearch03(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-    char *buf = "abcdefghjiklmnopqrstuvwxyz";
-
-    /* a match each for these strings */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0);
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"bcde", 4, 0, 0, 1, 0, 0);
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"fghj", 4, 0, 0, 2, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 3 /* 3 patterns */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf, strlen(buf));
-
-    if (cnt == 3)
-        result = 1;
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-
-    return result;
-}
+    /* queue state before calling the thread init function */
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 0);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 10);
 
-/**
- * \test Test patterns longer than 'm'. M is 4 here.
- */
-static int B2gCudaTestSearch04(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-    char *buf = "abcdefghjiklmnopqrstuvwxyz";
+    /* init the TM thread */
+    SCCudaPBThreadInit(&tv, de_ctx, (void *)&pb_tctx);
+    SCCudaPBSetBufferPacketThreshhold(no_of_pkts);
 
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0);
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"bcdegh", 6, 0, 0, 1, 0, 0);
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"fghjxyz", 7, 0, 0, 2, 0, 0);
+    /* queue state after calling the thread init function */
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 0);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 9);
 
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 3 /* 3 patterns */);
+    pb = pb_tctx->curr_pb;
 
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf, strlen(buf));
+    for (i = 0; i < no_of_pkts; i++) {
+        p[i]->payload = (uint8_t *)strings[i];
+        p[i]->payload_len = strlen(strings[i]);
+        SCCudaPBBatchPackets(NULL, p[i], pb_tctx, NULL);
+    }
 
-    if (cnt == 1)
-        result = 1;
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 1);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 8);
 
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
+    result &= (pb->nop_in_buffer == no_of_pkts);
 
-/**
- * \test Case insensitive test patterns longer than 'm'. M is 4 here.
- */
-static int B2gCudaTestSearch05(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-    char *buf = "abcdefghjiklmnopqrstuvwxyz";
+    int module_handle = SCCudaHlRegisterModule("SC_RULES_CONTENT_B2G_CUDA");
+    SCCudaHlModuleData *module_data = SCCudaHlGetModuleData(module_handle);
 
-    B2gCudaAddPatternCI(&mpm_ctx, (uint8_t *)"ABCD", 4, 0, 0, 0, 0, 0);
-    B2gCudaAddPatternCI(&mpm_ctx, (uint8_t *)"bCdEfG", 6, 0, 0, 1, 0, 0);
-    B2gCudaAddPatternCI(&mpm_ctx, (uint8_t *)"fghJikl", 7, 0, 0, 2, 0, 0);
+    B2gCudaMpmThreadCtxData *b2g_tctx = NULL;
+    B2gCudaMpmDispThreadInit(NULL, module_data, (void *)&b2g_tctx);
 
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 3 /* 3 patterns */);
+    if (b2g_tctx->b2g_cuda_context == 0)
+        goto end;
+    if (b2g_tctx->b2g_cuda_module == 0)
+        goto end;
+    if (b2g_tctx->b2g_cuda_search_kernel == 0)
+        goto end;
 
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf, strlen(buf));
+    B2gCudaMpmDispatcher(NULL, (Packet *)pb, b2g_tctx, NULL);
 
-    if (cnt == 3)
-        result = 1;
+    for (i = 0; i < no_of_pkts; i++) {
+        for (j = 0; j < p[i]->mpm_offsets[0]; j++)
+            result &= (results[i][j] == p[i]->mpm_offsets[j + 1]);
+    }
 
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
+ end:
+    for (i = 0; i < no_of_pkts; i++) {
+        free(p[i]);
+    }
+    SCCudaPBCleanUpQueuesAndBuffers();
+    if (de_ctx != NULL) {
+        SigGroupCleanup(de_ctx);
+        SigCleanSignatures(de_ctx);
+        DetectEngineCtxFree(de_ctx);
+    }
+    SCCudaPBThreadDeInit(NULL, (void *)pb_tctx);
+    B2gCudaMpmDispThreadDeInit(NULL, (void *)b2g_tctx);
 
     return result;
 }
 
-static int B2gCudaTestSearch06(void)
+static int B2gCudaTest03(void)
 {
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-    char *buf = "abcd";
-
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf, strlen(buf));
-
-    if (cnt == 1)
-        result = 1;
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-
-    return result;
-}
+    uint8_t raw_eth[] = {
+        0x00, 0x25, 0x00, 0x9e, 0xfa, 0xfe, 0x00, 0x02,
+        0xcf, 0x74, 0xfe, 0xe1, 0x08, 0x00, 0x45, 0x00,
+        0x01, 0xcc, 0xcb, 0x91, 0x00, 0x00, 0x34, 0x06,
+        0xdf, 0xa8, 0xd1, 0x55, 0xe3, 0x67, 0xc0, 0xa8,
+        0x64, 0x8c, 0x00, 0x50, 0xc0, 0xb7, 0xd1, 0x11,
+        0xed, 0x63, 0x81, 0xa9, 0x9a, 0x05, 0x80, 0x18,
+        0x00, 0x75, 0x0a, 0xdd, 0x00, 0x00, 0x01, 0x01,
+        0x08, 0x0a, 0x09, 0x8a, 0x06, 0xd0, 0x12, 0x21,
+        0x2a, 0x3b, 0x48, 0x54, 0x54, 0x50, 0x2f, 0x31,
+        0x2e, 0x31, 0x20, 0x33, 0x30, 0x32, 0x20, 0x46,
+        0x6f, 0x75, 0x6e, 0x64, 0x0d, 0x0a, 0x4c, 0x6f,
+        0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20,
+        0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77,
+        0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c,
+        0x65, 0x2e, 0x65, 0x73, 0x2f, 0x0d, 0x0a, 0x43,
+        0x61, 0x63, 0x68, 0x65, 0x2d, 0x43, 0x6f, 0x6e,
+        0x74, 0x72, 0x6f, 0x6c, 0x3a, 0x20, 0x70, 0x72,
+        0x69, 0x76, 0x61, 0x74, 0x65, 0x0d, 0x0a, 0x43,
+        0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54,
+        0x79, 0x70, 0x65, 0x3a, 0x20, 0x74, 0x65, 0x78,
+        0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x20,
+        0x63, 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d,
+        0x55, 0x54, 0x46, 0x2d, 0x38, 0x0d, 0x0a, 0x44,
+        0x61, 0x74, 0x65, 0x3a, 0x20, 0x4d, 0x6f, 0x6e,
+        0x2c, 0x20, 0x31, 0x34, 0x20, 0x53, 0x65, 0x70,
+        0x20, 0x32, 0x30, 0x30, 0x39, 0x20, 0x30, 0x38,
+        0x3a, 0x34, 0x38, 0x3a, 0x33, 0x31, 0x20, 0x47,
+        0x4d, 0x54, 0x0d, 0x0a, 0x53, 0x65, 0x72, 0x76,
+        0x65, 0x72, 0x3a, 0x20, 0x67, 0x77, 0x73, 0x0d,
+        0x0a, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+        0x2d, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3a,
+        0x20, 0x32, 0x31, 0x38, 0x0d, 0x0a, 0x0d, 0x0a,
+        0x3c, 0x48, 0x54, 0x4d, 0x4c, 0x3e, 0x3c, 0x48,
+        0x45, 0x41, 0x44, 0x3e, 0x3c, 0x6d, 0x65, 0x74,
+        0x61, 0x20, 0x68, 0x74, 0x74, 0x70, 0x2d, 0x65,
+        0x71, 0x75, 0x69, 0x76, 0x3d, 0x22, 0x63, 0x6f,
+        0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x74, 0x79,
+        0x70, 0x65, 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74,
+        0x65, 0x6e, 0x74, 0x3d, 0x22, 0x74, 0x65, 0x78,
+        0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x63,
+        0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, 0x75,
+        0x74, 0x66, 0x2d, 0x38, 0x22, 0x3e, 0x0a, 0x3c,
+        0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x33, 0x30,
+        0x32, 0x20, 0x4d, 0x6f, 0x76, 0x65, 0x64, 0x3c,
+        0x2f, 0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x3c,
+        0x2f, 0x48, 0x45, 0x41, 0x44, 0x3e, 0x3c, 0x42,
+        0x4f, 0x44, 0x59, 0x3e, 0x0a, 0x3c, 0x48, 0x31,
+        0x3e, 0x33, 0x30, 0x32, 0x20, 0x4d, 0x6f, 0x76,
+        0x65, 0x64, 0x3c, 0x2f, 0x48, 0x31, 0x3e, 0x0a,
+        0x54, 0x68, 0x65, 0x20, 0x64, 0x6f, 0x63, 0x75,
+        0x6d, 0x65, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x73,
+        0x20, 0x6d, 0x6f, 0x76, 0x65, 0x64, 0x0a, 0x3c,
+        0x41, 0x20, 0x48, 0x52, 0x45, 0x46, 0x3d, 0x22,
+        0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77,
+        0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c,
+        0x65, 0x2e, 0x65, 0x73, 0x2f, 0x22, 0x3e, 0x68,
+        0x65, 0x72, 0x65, 0x3c, 0x2f, 0x41, 0x3e, 0x2e,
+        0x0d, 0x0a, 0x3c, 0x2f, 0x42, 0x4f, 0x44, 0x59,
+        0x3e, 0x3c, 0x2f, 0x48, 0x54, 0x4d, 0x4c, 0x3e,
+        0x0d, 0x0a };
 
-static int B2gCudaTestSearch07(void)
-{
     int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-    char *buf = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
+    const char *strings[10] = {
+        "test_test_one",
+        "test_two_test",
+        "test_three_test",
+        "test_four_test",
+        "test_five_test",
+        "test_six_test",
+        "test_seven_test",
+        "test_eight_test",
+        "test_nine_test",
+        "test_ten_test"};
+    /* don't shoot me for hardcoding the results.  We will change this in
+     * sometime, by having run a separate mpm on the cpu and then hold
+     * the results in a temp buffer */
+    Packet *p[10];
+    SCCudaPBThreadCtx *pb_tctx = NULL;
+
+    DecodeThreadVars dtv;
+    ThreadVars tv;
+    DetectEngineCtx *de_ctx = NULL;
+    DetectEngineThreadCtx *det_ctx;
+    ThreadVars de_tv;
+
+    SCCudaPBPacketsBuffer *pb = NULL;
+    SCDQDataQueue *dq = NULL;
+
+    char *inq_name = "cuda_batcher_mpm_inqueue";
+    char *outq_name = "cuda_batcher_mpm_outqueue";
+
+    Tmq *tmq_outq = NULL;
+    Tmq *tmq_inq = NULL;
+
+    uint32_t i = 0, j = 0;
+
+    uint8_t no_of_pkts = 10;
+
+    Signature *sig = NULL;
+
+    memset(&dtv, 0, sizeof(DecodeThreadVars));
+    memset(&tv, 0, sizeof(ThreadVars));
+    memset(&de_tv, 0, sizeof(ThreadVars));
+
+    FlowInitConfig(FLOW_QUIET);
+    for (i = 0; i < no_of_pkts; i++) {
+        p[i] = malloc(sizeof(Packet));
+        if (p[i] == NULL) {
+            printf("error allocating memory\n");
+            exit(EXIT_FAILURE);
+        }
+        memset(p[i], 0, sizeof(Packet));
+        DecodeEthernet(&tv, &dtv, p[i], raw_eth, sizeof(raw_eth), NULL);
+    }
 
-    /* total matches: 135 */
-    /* should match 30 times */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"A", 1, 0, 0, 0, 0, 0);
-    /* should match 29 times */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AA", 2, 0, 0, 1, 0, 0);
-    /* should match 28 times */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAA", 3, 0, 0, 2, 0, 0);
-    /* should match 26 times */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAAAA", 5, 0, 0, 3, 0, 0);
-    /* should match 21 times */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAAAAAAAAA", 10, 0, 0, 4, 0, 0);
-    /* should match 1 time */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 30, 0, 0, 5, 0, 0);
+    de_ctx = DetectEngineCtxInit();
+    if (de_ctx == NULL) {
+        goto end;
+    }
+    de_ctx->mpm_matcher = MPM_B2G_CUDA;
+    de_ctx->flags |= DE_QUIET;
 
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 6 /* 6 patterns */);
+    de_ctx->sig_list = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; "
+                               "content:test; sid:0;)");
+    if (de_ctx->sig_list == NULL) {
+        printf("signature parsing failed\n");
+        goto end;
+    }
+    sig = de_ctx->sig_list;
 
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf, strlen(buf));
+    sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; "
+                        "content:one; sid:1;)");
+    if (sig->next == NULL) {
+        printf("signature parsing failed\n");
+        goto end;
+    }
+    sig = sig->next;
 
+    sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; "
+                        "content:two; sid:2;)");
+    if (sig->next == NULL) {
+        printf("signature parsing failed\n");
+        goto end;
+    }
+    sig = sig->next;
 
-    if (cnt == 135)
-        result = 1;
+    sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; "
+                        "content:three; sid:3;)");
+    if (sig->next == NULL) {
+        printf("signature parsing failed\n");
+        goto end;
+    }
+    sig = sig->next;
 
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
+    sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; "
+                        "content:four; sid:4;)");
+    if (sig->next == NULL) {
+        printf("signature parsing failed\n");
+        goto end;
+    }
+    sig = sig->next;
 
-    return result;
-}
+    sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; "
+                        "content:five; sid:5;)");
+    if (sig->next == NULL) {
+        printf("signature parsing failed\n");
+        goto end;
+    }
+    sig = sig->next;
 
-static int B2gCudaTestSearch08(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
+    sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; "
+                        "content:six; sid:6;)");
+    if (sig->next == NULL) {
+        printf("signature parsing failed\n");
+        goto end;
+    }
+    sig = sig->next;
 
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0); /* 1 match */
+    sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; "
+                        "content:seven; sid:7;)");
+    if (sig->next == NULL) {
+        printf("signature parsing failed\n");
+        goto end;
+    }
+    sig = sig->next;
 
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
+    sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; "
+                        "content:eight; sid:8;)");
+    if (sig->next == NULL) {
+        printf("signature parsing failed\n");
+        goto end;
+    }
+    sig = sig->next;
 
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)"a", 1);
+    sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; "
+                        "content:nine; sid:9;)");
+    if (sig->next == NULL) {
+        printf("signature parsing failed\n");
+        goto end;
+    }
+    sig = sig->next;
 
-    if (cnt == 0)
-        result = 1;
-    else
-        printf("0 != %" PRIu32 " ",cnt);
+    sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; "
+                        "content:ten; sid:10;)");
+    if (sig->next == NULL) {
+        printf("signature parsing failed\n");
+        goto end;
+    }
 
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
+    /* build the signatures */
+    SigGroupBuild(de_ctx);
+    DetectEngineThreadCtxInit(&de_tv, (void *)de_ctx, (void *)&det_ctx);
 
-/* we segfault with this test */
-static int B2gCudaTestSearch09(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
+    SCCudaPBSetUpQueuesAndBuffers();
 
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"ab", 2, 0, 0, 0, 0, 0);
+    /* get the queues used by the batcher thread */
+    tmq_inq = TmqGetQueueByName(inq_name);
+    if (tmq_inq == NULL) {
+        printf("tmq_inq NULL\n");
+        goto end;
+    }
+    tmq_outq = TmqGetQueueByName(outq_name);
+    if (tmq_outq == NULL) {
+        printf("tmq_outq NULL\n");
+        goto end;
+    }
 
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
+    result = 1;
 
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)"ab", 2);
+    /* queue state before calling the thread init function */
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 0);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 10);
 
-    if (cnt == 1)
-        result = 1;
-    else
-        printf("1 != %" PRIu32 " ",cnt);
+    /* init the TM thread */
+    SCCudaPBThreadInit(&tv, de_ctx, (void *)&pb_tctx);
+    SCCudaPBSetBufferPacketThreshhold(no_of_pkts);
 
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
+    /* queue state after calling the thread init function */
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 0);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 9);
 
-static int B2gCudaTestSearch10(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
+    pb = pb_tctx->curr_pb;
 
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcdefgh", 8, 0, 0, 0, 0, 0); /* 1 match */
+    for (i = 0; i < no_of_pkts; i++) {
+        p[i]->payload = (uint8_t *)strings[i];
+        p[i]->payload_len = strlen(strings[i]);
+        SCCudaPBBatchPackets(NULL, p[i], pb_tctx, NULL);
+    }
 
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
+    dq = &data_queues[tmq_outq->id];
+    result &= (dq->len == 1);
+    dq = &data_queues[tmq_inq->id];
+    result &= (dq->len == 8);
 
-    char *buf = "01234567890123456789012345678901234567890123456789"
-                "01234567890123456789012345678901234567890123456789"
-                "abcdefgh"
-                "01234567890123456789012345678901234567890123456789"
-                "01234567890123456789012345678901234567890123456789";
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf, strlen(buf));
+    result &= (pb->nop_in_buffer == no_of_pkts);
 
-    if (cnt == 1)
-        result = 1;
-    else
-        printf("1 != %" PRIu32 " ",cnt);
+    int module_handle = SCCudaHlRegisterModule("SC_RULES_CONTENT_B2G_CUDA");
+    SCCudaHlModuleData *module_data = SCCudaHlGetModuleData(module_handle);
 
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
+    B2gCudaMpmThreadCtxData *b2g_tctx = NULL;
+    B2gCudaMpmDispThreadInit(NULL, module_data, (void *)&b2g_tctx);
 
-static int B2gCudaTestSearch11(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
+    if (b2g_tctx->b2g_cuda_context == 0)
+        goto end;
+    if (b2g_tctx->b2g_cuda_module == 0)
+        goto end;
+    if (b2g_tctx->b2g_cuda_search_kernel == 0)
+        goto end;
 
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0);
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcde", 5, 0, 0, 0, 0, 0);
+    B2gCudaMpmDispatcher(NULL, (Packet *)pb, b2g_tctx, NULL);
 
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 2 /* 2 patterns */);
+    for (i = 0; i < 10; i++)
+        SigMatchSignatures(&de_tv, de_ctx, det_ctx, p[i]);
 
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                             (uint8_t *)"abcdefghijklmnopqrstuvwxyz", 26);
+    for (i = 0; i < 10; i++) {
+        if (!PacketAlertCheck(p[i], 0)) {
+            result = 0;
+            goto end;
+        }
+        for (j = 1; j <= 10; j++) {
+            if (j == i + 1) {
+                if (!PacketAlertCheck(p[i], j)) {
+                    result = 0;
+                    goto end;
+                }
+            } else {
+                if (PacketAlertCheck(p[i], j)) {
+                    result = 0;
+                    goto end;
+                }
+            }
+        }
+    }
 
-    if (cnt == 2)
-        result = 1;
-    else
-        printf("2 != %" PRIu32 " ",cnt);
+ end:
+    for (i = 0; i < no_of_pkts; i++) {
+        free(p[i]);
+    }
+    SCCudaPBCleanUpQueuesAndBuffers();
+    if (de_ctx) {
+        SigGroupCleanup(de_ctx);
+        SigCleanSignatures(de_ctx);
+        DetectEngineCtxFree(de_ctx);
+    }
+    SCCudaPBThreadDeInit(NULL, (void *)pb_tctx);
+    B2gCudaMpmDispThreadDeInit(NULL, (void *)b2g_tctx);
 
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
     return result;
 }
 
-static int B2gCudaTestSearch12(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"wxyz", 4, 0, 0, 0, 0, 0); /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"vwxyz", 5, 0, 0, 0, 0, 0); /* 1 match */
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 2 /* 2 patterns */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                             (uint8_t *)"abcdefghijklmnopqrstuvwxyz", 26);
-
-    if (cnt == 2)
-        result = 1;
-    else
-        printf("2 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch13(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcdefghijklmnopqrstuvwxyzABCD",
-                            30, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                             (uint8_t *)"abcdefghijklmnopqrstuvwxyzABCD", 30);
-
-    if (cnt == 1)
-        result = 1;
-    else
-        printf("1 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch14(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcdefghijklmnopqrstuvwxyzABCDE",
-                            31, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                             (uint8_t *)"abcdefghijklmnopqrstuvwxyzABCDE", 31);
-
-    if (cnt == 1)
-        result = 1;
-    else
-        printf("1 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch15(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcdefghijklmnopqrstuvwxyzABCDEF",
-                            32, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                             (uint8_t *)"abcdefghijklmnopqrstuvwxyzABCDEF", 32);
-
-    if (cnt == 1)
-        result = 1;
-    else
-        printf("1 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch16(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcdefghijklmnopqrstuvwxyzABC",
-                            29, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                             (uint8_t *)"abcdefghijklmnopqrstuvwxyzABC", 29);
-
-    if (cnt == 1)
-        result = 1;
-    else
-        printf("1 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch17(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcdefghijklmnopqrstuvwxyzAB",
-                            28, 0, 0, 0, 0, 0); /* 1 match */
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                             (uint8_t *)"abcdefghijklmnopqrstuvwxyzAB", 28);
-
-    if (cnt == 1)
-        result = 1;
-    else
-        printf("1 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch18(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx,
-                            (uint8_t *)"abcde""fghij""klmno""pqrst""uvwxy""z",
-                            26, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                             (uint8_t *)"abcde""fghij""klmno""pqrst""uvwxy""z",
-                             26);
-
-    if (cnt == 1)
-        result = 1;
-    else
-        printf("1 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch19(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
-                            30, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 patterns */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                             (uint8_t *)"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 30);
-
-    if (cnt == 1)
-        result = 1;
-    else
-        printf("1 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch20(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 */
-    B2gCudaAddPatternCS(&mpm_ctx,
-                            (uint8_t *)"AAAAA""AAAAA""AAAAA""AAAAA""AAAAA""AAAAA""AA",
-                            32, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 patterns */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                             (uint8_t *)"AAAAA""AAAAA""AAAAA""AAAAA""AAAAA""AAAAA""AA",
-                             32);
-
-    if (cnt == 1)
-        result = 1;
-    else
-        printf("1 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch21(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AA", 2, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 patterns */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)"AA", 2);
-
-    if (cnt == 1)
-        result = 1;
-    else
-        printf("1 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-
-
-static int B2gCudaTestSearch22(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                               (uint8_t *)"abcdefghjiklmnopqrstuvwxyz", 26);
-
-    if (cnt == 1)
-        result = 1;
-    else
-        printf("1 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch23(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abce", 4, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                               (uint8_t *)"abcdefghjiklmnopqrstuvwxyz", 26);
-
-    if (cnt == 0)
-        result = 1;
-    else
-        printf("0 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch24(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0);
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"bcde", 4, 0, 0, 1, 0, 0);
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"fghj", 4, 0, 0, 2, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 3 /* 3 patterns */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                               (uint8_t *)"abcdefghjiklmnopqrstuvwxyz", 26);
-
-    if (cnt == 3)
-        result = 1;
-    else
-        printf("3 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-/**
- * \test test patterns longer than 'm'. M is 4 here.
- */
-static int B2gCudaTestSearch25(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0);
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"bcdegh", 6, 0, 0, 1, 0, 0);
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"fghjxyz", 7, 0, 0, 2, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 3 /* 3 patterns */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                               (uint8_t *)"abcdefghjiklmnopqrstuvwxyz", 26);
-
-    if (cnt == 1)
-        result = 1;
-    else
-        printf("1 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-/**
- * \test case insensitive test patterns longer than 'm'. M is 4 here.
- */
-static int B2gCudaTestSearch26(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 match */
-    B2gCudaAddPatternCI(&mpm_ctx, (uint8_t *)"ABCD", 4, 0, 0, 0, 0, 0);
-    /* 1 match */
-    B2gCudaAddPatternCI(&mpm_ctx, (uint8_t *)"bCdEfG", 6, 0, 0, 1, 0, 0);
-    /* 1 match */
-    B2gCudaAddPatternCI(&mpm_ctx, (uint8_t *)"fghJikl", 7, 0, 0, 2, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 3 /* 3 patterns */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                               (uint8_t *)"abcdefghjiklmnopqrstuvwxyz", 26);
-
-    if (cnt == 3)
-        result = 1;
-    else
-        printf("3 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch27(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)"abcd", 4);
-
-    if (cnt == 1)
-        result = 1;
-    else
-        printf("1 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch28(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* should match 30 times */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"A", 1, 0, 0, 0, 0, 0);
-    /* should match 29 times */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AA", 2, 0, 0, 1, 0, 0);
-    /* should match 28 times */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAA", 3, 0, 0, 2, 0, 0);
-    /* 26 */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAAAA", 5, 0, 0, 3, 0, 0);
-    /* 21 */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAAAAAAAAA", 10, 0, 0, 4, 0, 0);
-    /* 1 */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
-                        30, 0, 0, 5, 0, 0);
-    /* total matches: 135 */
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 6 /* 6 patterns */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                               (uint8_t *)"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 30);
-
-    if (cnt == 135)
-        result = 1;
-    else
-        printf("135 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch29(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)"a", 1);
-
-    if (cnt == 0)
-        result = 1;
-    else
-        printf("0 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch30(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"ab", 2, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)"ab", 2);
-
-    if (cnt == 1)
-        result = 1;
-    else
-        printf("1 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-/* 1 match */
-static int B2gCudaTestSearch31(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcdefgh", 8, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */);
-
-    char *buf = "01234567890123456789012345678901234567890123456789"
-                "01234567890123456789012345678901234567890123456789"
-                "abcdefgh"
-                "01234567890123456789012345678901234567890123456789"
-                "01234567890123456789012345678901234567890123456789";
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf,
-                               strlen(buf));
-
-    if (cnt == 1)
-        result = 1;
-    else
-        printf("1 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch32(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0);
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcde", 5, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 2 /* 2 patterns */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                               (uint8_t *)"abcdefghjiklmnopqrstuvwxyz", 26);
-
-    if (cnt == 2)
-        result = 1;
-    else
-        printf("2 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestSearch33(void)
-{
-    int result = 0;
-    int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST");
-    MpmCtx mpm_ctx;
-    memset(&mpm_ctx, 0x00, sizeof(MpmCtx));
-    MpmThreadCtx mpm_thread_ctx;
-    MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle);
-    B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx;
-
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"wxyz", 4, 0, 0, 0, 0, 0);
-    /* 1 match */
-    B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"vwxyz", 5, 0, 0, 0, 0, 0);
-
-    B2gCudaPreparePatterns(&mpm_ctx);
-    B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 2 /* 2 patterns */);
-
-    uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL,
-                               (uint8_t *)"abcdefghjiklmnopqrstuvwxyz", 26);
-
-    if (cnt == 2)
-        result = 1;
-    else
-        printf("2 != %" PRIu32 " ",cnt);
-
-    B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx);
-    B2gCudaDestroyCtx(&mpm_ctx);
-    return result;
-}
-
-static int B2gCudaTestDeInitTestEnv(void)
-{
-    CUcontext context;
-    if (SCCudaCtxPopCurrent(&context) == -1)
-        exit(EXIT_FAILURE);
-    SCCudaHlDeRegisterModule("B2G_CUDA_TEST");
-
-    return 1;
-}
-
 #endif /* UNITTESTS */
 
 /*********************************Unittests************************************/
@@ -3218,49 +2903,9 @@ static int B2gCudaTestDeInitTestEnv(void)
 void B2gCudaRegisterTests(void)
 {
 #ifdef UNITTESTS
-    UtRegisterTest("B2gCudaTestInitTestEnv", B2gCudaTestInitTestEnv, 1);
     UtRegisterTest("B2gCudaTest01", B2gCudaTest01, 1);
     UtRegisterTest("B2gCudaTest02", B2gCudaTest02, 1);
     UtRegisterTest("B2gCudaTest03", B2gCudaTest03, 1);
-    UtRegisterTest("B2gCudaTest04", B2gCudaTest04, 1);
-    UtRegisterTest("B2gCudaTestSearch01", B2gCudaTestSearch01, 1);
-    UtRegisterTest("B2gCudaTestSearch02", B2gCudaTestSearch02, 1);
-    UtRegisterTest("B2gCudaTestSearch03", B2gCudaTestSearch03, 1);
-    UtRegisterTest("B2gCudaTestSearch04", B2gCudaTestSearch04, 1);
-    UtRegisterTest("B2gCudaTestSearch05", B2gCudaTestSearch05, 1);
-    UtRegisterTest("B2gCudaTestSearch06", B2gCudaTestSearch06, 1);
-    UtRegisterTest("B2gCudaTestSearch07", B2gCudaTestSearch07, 1);
-    UtRegisterTest("B2gCudaTestSearch08", B2gCudaTestSearch08, 1);
-    UtRegisterTest("B2gCudaTestSearch09", B2gCudaTestSearch09, 1);
-    UtRegisterTest("B2gCudaTestSearch10", B2gCudaTestSearch10, 1);
-    UtRegisterTest("B2gCudaTestSearch11", B2gCudaTestSearch11, 1);
-    UtRegisterTest("B2gCudaTestSearch12", B2gCudaTestSearch12, 1);
-    UtRegisterTest("B2gCudaTestSearch13", B2gCudaTestSearch13, 1);
-
-    UtRegisterTest("B2gCudaTestSearch14", B2gCudaTestSearch14, 1);
-    UtRegisterTest("B2gCudaTestSearch15", B2gCudaTestSearch15, 1);
-    UtRegisterTest("B2gCudaTestSearch16", B2gCudaTestSearch16, 1);
-    UtRegisterTest("B2gCudaTestSearch17", B2gCudaTestSearch17, 1);
-    UtRegisterTest("B2gCudaTestSearch18", B2gCudaTestSearch18, 1);
-    UtRegisterTest("B2gCudaTestSearch19", B2gCudaTestSearch19, 1);
-    UtRegisterTest("B2gCudaTestSearch20", B2gCudaTestSearch20, 1);
-    UtRegisterTest("B2gCudaTestSearch21", B2gCudaTestSearch21, 1);
-
-    UtRegisterTest("B2gCudaTestSearch22", B2gCudaTestSearch22, 1);
-    UtRegisterTest("B2gCudaTestSearch23", B2gCudaTestSearch23, 1);
-    UtRegisterTest("B2gCudaTestSearch24", B2gCudaTestSearch24, 1);
-    UtRegisterTest("B2gCudaTestSearch25", B2gCudaTestSearch25, 1);
-    UtRegisterTest("B2gCudaTestSearch26", B2gCudaTestSearch26, 1);
-    UtRegisterTest("B2gCudaTestSearch27", B2gCudaTestSearch27, 1);
-    UtRegisterTest("B2gCudaTestSearch28", B2gCudaTestSearch28, 1);
-    UtRegisterTest("B2gCudaTestSearch29", B2gCudaTestSearch29, 1);
-    UtRegisterTest("B2gCudaTestSearch30", B2gCudaTestSearch30, 1);
-    UtRegisterTest("B2gCudaTestSearch31", B2gCudaTestSearch31, 1);
-    UtRegisterTest("B2gCudaTestSearch32", B2gCudaTestSearch32, 1);
-    UtRegisterTest("B2gCudaTestSearch33", B2gCudaTestSearch33, 1);
-    /* we actually need to call this.  right now we don't need this.  we will
-     * change this in the next patch for cuda batching */
-    UtRegisterTest("B2gCudaTestDeInitTestEnv", B2gCudaTestDeInitTestEnv, 1);
 #endif /* UNITTESTS */
 }
 
diff --git a/src/util-mpm-b2g-cuda.h b/src/util-mpm-b2g-cuda.h
index 751e9d5798..a30e155ea3 100644
--- a/src/util-mpm-b2g-cuda.h
+++ b/src/util-mpm-b2g-cuda.h
@@ -51,7 +51,7 @@ typedef struct B2gCudaPattern_ {
     /* case INsensitive */
     uint8_t *ci;
     struct B2gCudaPattern_ *next;
-    MpmEndMatch *em;
+    uint32_t id;
 } B2gCudaPattern;
 
 typedef struct B2gCudaHashItem_ {
@@ -65,23 +65,6 @@ typedef struct B2gCudaCtx_ {
      * in the engine that is holding this B2g_Cuda_Ctx */
     int module_handle;
 
-    CUcontext cuda_context;
-    CUmodule cuda_module;
-
-    /* the search kernel */
-    CUfunction cuda_search_kernel;
-
-    /* the cuda_search_kernel argument offsets */
-    uint8_t cuda_search_kernel_arg0_offset;
-    uint8_t cuda_search_kernel_arg1_offset;
-    uint8_t cuda_search_kernel_arg2_offset;
-    uint8_t cuda_search_kernel_arg3_offset;
-    uint8_t cuda_search_kernel_arg4_offset;
-    uint8_t cuda_search_kernel_arg5_offset;
-    uint8_t cuda_search_kernel_arg_total;
-
-    /* cuda device pointer to thelower case table g_u8_lowercasetable */
-    CUdeviceptr cuda_g_u8_lowercasetable;
     /* cuda device pointer to B2gCudaCtx->B2G */
     CUdeviceptr cuda_B2G;
 
@@ -141,17 +124,18 @@ typedef struct B2gCudaThreadCtx_ {
 } B2gCudaThreadCtx;
 
 void MpmB2gCudaRegister(void);
-
 void TmModuleCudaMpmB2gRegister(void);
 
 int B2gCudaStartDispatcherThreadRC(const char *);
-int B2gCudaStartDispatcherThreadAPC(const char *);
-
 void B2gCudaKillDispatcherThreadRC(void);
-void B2gCudaKillDispatcherThreadAPC(void);
-
-void B2gCudaPushPacketTo_tv_CMB2_RC(Packet *);
-void B2gCudaPushPacketTo_tv_CMB2_APC(Packet *);
+int B2gCudaResultsPostProcessing(Packet *, MpmCtx *, MpmThreadCtx *,
+                                 PatternMatcherQueue *);
+uint32_t B2gCudaSearch1(MpmCtx *, MpmThreadCtx *, PatternMatcherQueue *,
+                        uint8_t *, uint16_t);
+#ifdef B2G_CUDA_SEARCH2
+uint32_t B2gCudaSearch2(MpmCtx *, MpmThreadCtx *, PatternMatcherQueue *,
+                        uint8_t *, uint16_t);
+#endif
 
 #endif /* __SC_CUDA_SUPPORT__ */