Batching of packets support for the cuda b2g mpm. Supported on both 32 and 64 bit platforms

remotes/origin/master-1.0.x
Anoop Saldanha 16 years ago committed by Victor Julien
parent b3c22cd512
commit 33f4beb0bc

@ -8,6 +8,7 @@ bin_PROGRAMS = suricata
suricata_SOURCES = suricata.c suricata.h \
runmodes.c runmodes.h \
packet-queue.c packet-queue.h \
data-queue.c data-queue.h \
threads.c threads.h \
source-nfq.c source-nfq.h \
source-pcap.c source-pcap.h \
@ -211,7 +212,8 @@ win32-misc.c win32-misc.h \
win32-service.c win32-service.h \
util-action.c util-action.h \
win32-syslog.h \
util-profiling.c util-profiling.h
util-profiling.c util-profiling.h \
cuda-packet-batcher.c cuda-packet-batcher.h
# set the include path found by configure
INCLUDES= $(all_includes)

File diff suppressed because it is too large.

@ -0,0 +1,139 @@
/**
* Copyright (c) 2010 Open Information Security Foundation.
*
* \author Anoop Saldanha <poonaatsoc@gmail.com>
*/
#ifndef __CUDA_PACKET_BATCHER_H__
#define __CUDA_PACKET_BATCHER_H__
#include "suricata-common.h"
/* compile in, only if we have CUDA enabled on this machine */
#ifdef __SC_CUDA_SUPPORT__
#include "util-cuda.h"
/* The min no of packets that we allot the buffer for. We will make
* this user configurable (yaml) based on the traffic they expect. Either way,
* for a low/medium traffic network with occasional sgh matches, we shouldn't
* be enabling cuda, since we will only end up screwing up performance */
#define SC_CUDA_PB_MIN_NO_OF_PACKETS 4000
/**
* \brief Implement the template SCDQGenericQData to transfer the cuda
* packet buffer from the cuda batcher thread to the dispatcher
* thread using the queue SCDQDataQueue.
*/
typedef struct SCCudaPBPacketsBuffer_ {
/* these members from the template SCDQGenericQData have to be
* compulsorily implemented */
struct SCDQGenericQData_ *next;
struct SCDQGenericQData_ *prev;
/* if we want to consider this pointer as the head of a list, this var
* holds the no of elements in the list */
//uint16_t len;
/* in case this data instance is the head of a list, we can refer to the
* bottommost instance directly using this var */
//struct SCDQGenericaQData *bot;
/* our own members from here on*/
/* current count of packets held in packets_buffer. nop = no of packets */
uint32_t nop_in_buffer;
/* the packets buffer. We will allot the buffer for SC_CUDA_PB_MIN_NO_OF_PACKETS
* packets. Basically the size of this buffer would be
* SC_CUDA_PB_MIN_NO_OF_PACKETS * sizeof(SCCudaPBPacketDataForGPU), so that
* we can hold a minimum of SC_CUDA_PB_MIN_NO_OF_PACKETS packets */
uint8_t *packets_buffer;
/* length of data buffered so far in packets_buffer, which would be sent
* to the GPU. We will need this to copy the buffered data from the
* packets_buffer here on the host, to the buffer on the GPU */
uint32_t packets_buffer_len;
/* packet offset within the packets_buffer. Each packet would be stored in
* the packets buffer at a particular offset. This buffer holds the
* offset of each packet inside the packets buffer. We will allot space to hold
* offsets for SC_CUDA_PB_MIN_NO_OF_PACKETS packets
* \todo change it to hold offsets for more than SC_CUDA_PB_MIN_NO_OF_PACKETS
* when we use the buffer to hold packets based on the remaining size in the
* buffer rather than on a fixed limit like SC_CUDA_PB_MIN_NO_OF_PACKETS */
uint32_t *packets_offset_buffer;
/* the total packet payload lengths buffered so far. We will need this to
* transfer the total length of the results buffer that has to be transferred
* back from the gpu */
uint32_t packets_total_payload_len;
/* the payload offsets for the different payload lengths buffered in. For
* example if we buffer 4 packets of lengths 3, 4, 5, 6, we will store the
* offsets {0, 3, 7, 12, 18} in the buffer (the last entry being the total
* payload length) */
/* packet addresses for all the packets buffered in the packets_buffer. We
* will allot space to hold packet addresses for SC_CUDA_PB_MIN_NO_OF_PACKETS.
* We will need this, so that the cuda mpm b2g dispatcher thread can store
* the b2g cuda mpm results back into the right packet */
Packet **packets_address_buffer;
} SCCudaPBPacketsBuffer;
/**
* \brief Structure for each packet that is being batched to the GPU.
*/
typedef struct SCCudaPBPacketDataForGPU_ {
/* holds B2gCudaCtx->m */
unsigned int m;
/* holds B2gCudaCtx->cuda_B2g */
CUdeviceptr table;
/* holds the length of the payload */
unsigned int payload_len;
/* holds the payload. The struct declares 1480 bytes for it, but we may
* not end up using all of them if the payload is smaller */
uint8_t payload[1480];
} SCCudaPBPacketDataForGPU;
/**
* \brief Same as struct SCCudaPBPacketDataForGPU_ except for the payload part.
* We will need this for calculating the size of the non-payload part
* of the packet data to be buffered.
*/
typedef struct SCCudaPBPacketDataForGPUNonPayload_ {
/* holds B2gCudaCtx->m */
unsigned int m;
/* holds B2gCudaCtx->cuda_B2g */
CUdeviceptr table;
/* holds the length of the payload */
unsigned int payload_len;
} SCCudaPBPacketDataForGPUNonPayload;
/**
* \brief The cuda packet batcher threading context.
*/
typedef struct SCCudaPBThreadCtx_ {
/* we need the detection engine context to retrieve the sgh while we start
* receiving and batching the packets */
DetectEngineCtx *de_ctx;
/* packets buffer currently in use inside the cuda batcher thread */
SCCudaPBPacketsBuffer *curr_pb;
} SCCudaPBThreadCtx;
SCCudaPBPacketsBuffer *SCCudaPBAllocSCCudaPBPacketsBuffer(void);
void SCCudaPBDeAllocSCCudaPBPacketsBuffer(SCCudaPBPacketsBuffer *);
void SCCudaPBSetBufferPacketThreshhold(uint32_t);
void SCCudaPBCleanUpQueuesAndBuffers(void);
void SCCudaPBSetUpQueuesAndBuffers(void);
void SCCudaPBKillBatchingPackets(void);
TmEcode SCCudaPBBatchPackets(ThreadVars *, Packet *, void *, PacketQueue *);
TmEcode SCCudaPBThreadInit(ThreadVars *, void *, void **);
TmEcode SCCudaPBThreadDeInit(ThreadVars *, void *);
void SCCudaPBThreadExitStats(ThreadVars *, void *);
void SCCudaPBRegisterTests(void);
void TmModuleCudaPacketBatcherRegister(void);
void *SCCudaPBTmThreadsSlot1(void *);
#endif /* __SC_CUDA_SUPPORT__ */
#endif /* __CUDA_PACKET_BATCHER_H__ */
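
Editor's note: the comments in the header above describe a variable-length record layout inside packets_buffer, with per-packet offsets and cumulative payload offsets tracked in the side buffers. The following is a minimal sketch of that bookkeeping only; AppendPacketSketch is a hypothetical name, alignment of the records is ignored, and this is not the batcher's actual append routine.

/* Editor's sketch: illustrates the buffer bookkeeping described above.
 * AppendPacketSketch is a hypothetical helper, not part of the batcher API. */
static int AppendPacketSketch(SCCudaPBPacketsBuffer *pb, Packet *p,
                              unsigned int m, CUdeviceptr table)
{
    if (pb->nop_in_buffer >= SC_CUDA_PB_MIN_NO_OF_PACKETS)
        return -1;

    /* the record starts at the current end of the packets buffer */
    SCCudaPBPacketDataForGPU *rec =
        (SCCudaPBPacketDataForGPU *)(pb->packets_buffer + pb->packets_buffer_len);
    rec->m = m;
    rec->table = table;
    rec->payload_len = p->payload_len;
    memcpy(rec->payload, p->payload, p->payload_len);

    /* remember where this packet's record and payload start */
    pb->packets_offset_buffer[pb->nop_in_buffer] = pb->packets_buffer_len;
    pb->packets_payload_offset_buffer[pb->nop_in_buffer] = pb->packets_total_payload_len;
    pb->packets_address_buffer[pb->nop_in_buffer] = p;

    /* only the non-payload header plus the actual payload is consumed */
    pb->packets_buffer_len += sizeof(SCCudaPBPacketDataForGPUNonPayload) + p->payload_len;
    pb->packets_total_payload_len += p->payload_len;
    pb->nop_in_buffer++;

    return 0;
}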

@ -0,0 +1,93 @@
/**
* Copyright (c) 2009, 2010 Open Information Security Foundation.
*
* \author Anoop Saldanha <poonaatsoc@gmail.com>
*/
#include "suricata-common.h"
#include "data-queue.h"
#include "threads.h"
/**
* \brief Enqueues data on the queue.
*
* \param q Pointer to the data queue.
* \param data Pointer to the data to be queued. It should be a pointer to a
* structure instance that implements the template structure
* struct SCDQGenericQData_ defined in data-queue.h.
*/
void SCDQDataEnqueue(SCDQDataQueue *q, SCDQGenericQData *data)
{
/* we already have some data in queue */
if (q->top != NULL) {
data->next = q->top;
q->top->prev = data;
q->top = data;
/* the queue is empty */
} else {
q->top = data;
q->bot = data;
}
q->len++;
#ifdef DBG_PERF
if (q->len > q->dbg_maxlen)
q->dbg_maxlen = q->len;
#endif /* DBG_PERF */
return;
}
/**
* \brief Dequeues and returns an entry from the queue.
*
* \param q Pointer to the data queue.
* \retval Pointer to the data that has been dequeued. The instance
* returned is/should be a pointer to a structure instance that
* implements the template structure struct SCDQGenericQData_
* defined in data-queue.h.
*/
SCDQGenericQData *SCDQDataDequeue(SCDQDataQueue *q)
{
SCDQGenericQData *data = NULL;
/* if the queue is empty there is no data left and we return NULL */
if (q->len == 0) {
return NULL;
}
/* if we are about to take the last entry, decrement len before doing
* anything else (so that threads waiting in SCCondWait can move on as
* soon as possible) */
q->len--;
/* pull the bottom data instance from the queue */
data = q->bot;
#ifdef OS_DARWIN
/* Weird issue on OS_DARWIN:
* sometimes it looks like two threads arrive here at the same time,
* so the bot ptr is NULL */
if (data == NULL) {
printf("No data to dequeue!\n");
return NULL;
}
#endif /* OS_DARWIN */
/* more data in queue */
if (q->bot->prev != NULL) {
q->bot = q->bot->prev;
q->bot->next = NULL;
/* just the one we remove, so now empty */
} else {
q->top = NULL;
q->bot = NULL;
}
data->next = NULL;
data->prev = NULL;
return data;
}

@ -0,0 +1,64 @@
/**
* Copyright (c) 2009, 2010 Open Information Security Foundation.
*
* \author Anoop Saldanha <poonaatsoc@gmail.com>
*
* \file Generic queues. Any instance that wants to get itself onto the generic
* queue has to implement the template struct SCDQGenericQData_
* defined below.
*/
#ifndef __DATA_QUEUE_H__
#define __DATA_QUEUE_H__
#include "threads.h"
/**
* \brief Generic template for any data structure that wants to be on the
* queue. Such a structure needs to embed this template and define
* its own members from <your_own_structure_members_from_here_on>
* onwards.
*/
typedef struct SCDQGenericQData_ {
/* this is needed when we want to supply a list of data items */
struct SCDQGenericQData_ *next;
struct SCDQGenericQData_ *prev;
/* if we want to consider this pointer as the head of a list, this var
* holds the no of elements in the list. Else it holds a <need_to_think>. */
//uint16_t len;
/* in case this data instance is the head of a list, we can refer to the
* bottommost instance directly using this var */
//struct SCDQGenericaQData *bot;
/* any other data structure that wants to be on the queue can implement
* its own members from here on, in its structure definition. Just note
* that the first 2 members should always be next and prev, in that
* order */
// <your_own_structure_members_from_here_on>
} SCDQGenericQData;
/**
* \brief The data queue to hold instances that implement the template
* SCDQGenericQData.
*/
typedef struct SCDQDataQueue_ {
/* holds the item at the top of the queue */
SCDQGenericQData *top;
/* holds the item at the bottom of the queue */
SCDQGenericQData *bot;
/* no of items currently in the queue */
uint16_t len;
SCMutex mutex_q;
SCCondT cond_q;
#ifdef DBG_PERF
uint16_t dbg_maxlen;
#endif /* DBG_PERF */
} SCDQDataQueue;
void SCDQDataEnqueue(SCDQDataQueue *, SCDQGenericQData *);
SCDQGenericQData *SCDQDataDequeue(SCDQDataQueue *);
#endif /* __DATA_QUEUE_H__ */
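
Editor's note: to show how the template above is meant to be used, here is a minimal sketch of a structure implementing SCDQGenericQData_ and a push helper; MyWorkItem and MyWorkItemPush are hypothetical names. SCDQDataEnqueue()/SCDQDataDequeue() do not lock by themselves, so the caller takes mutex_q and signals cond_q, as the generic tmqh-simple handlers later in this commit do.

/* Editor's sketch: a hypothetical user of the SCDQGenericQData template. */
typedef struct MyWorkItem_ {
    /* the first 2 members must be next and prev, in that order */
    struct SCDQGenericQData_ *next;
    struct SCDQGenericQData_ *prev;
    /* own members from here on */
    uint32_t job_id;
} MyWorkItem;

static void MyWorkItemPush(SCDQDataQueue *q, MyWorkItem *wi)
{
    SCMutexLock(&q->mutex_q);
    SCDQDataEnqueue(q, (SCDQGenericQData *)wi);
    /* wake up a consumer waiting on the queue */
    SCCondSignal(&q->cond_q);
    SCMutexUnlock(&q->mutex_q);
}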

@ -247,6 +247,10 @@ struct PacketQueue_;
*/
typedef struct Packet_
{
/* double linked list ptrs */
struct Packet_ *next;
struct Packet_ *prev;
/* Addresses, Ports and protocol
* these are on top so we can use
* the Packet as a hash key */
@ -342,10 +346,6 @@ typedef struct Packet_
/** packet number in the pcap file, matches wireshark */
uint64_t pcap_cnt;
/* double linked list ptrs */
struct Packet_ *next;
struct Packet_ *prev;
/* ready to set verdict counter, only set in root */
uint8_t rtv_cnt;
/* tunnel packet ref count */
@ -369,20 +369,22 @@ typedef struct Packet_
/* required for cuda support */
#ifdef __SC_CUDA_SUPPORT__
PatternMatcherQueue *cuda_pmq;
MpmCtx *cuda_mpm_ctx;
MpmThreadCtx *cuda_mtc;
/* used to hold the match results. We could use a void *result
* here instead. That way we can make it hold any result. *todo* */
uint16_t cuda_matches;
/* indicates if the dispatcher should call the search or the scan phase
* of the pattern matcher. We could use a void *cuda_data here instead.
* That way we can send any data across to the dispatcher */
uint8_t cuda_search;
/* the dispatcher thread would pump the packet into this queue once it has
* processed the packet */
struct PacketQueue_ *cuda_outq;
/* indicates if the cuda mpm or a normal cpu mpm would be conducted on
* this packet. If it is set to 0, the cpu mpm; else the cuda mpm */
uint8_t cuda_mpm_enabled;
/* indicates if the cuda mpm has finished running and processed the
* results for this packet, assuming cuda_mpm_enabled has been set for this
* packet */
uint16_t cuda_done;
/* used by the detect thread and the cuda mpm dispatcher thread. The detect
* thread would wait on this cond var if the cuda mpm dispatcher thread
* still hasn't processed the packet. The dispatcher would use this cond
* to inform the detect thread (in case it is waiting on this packet), once
* the dispatcher is done processing the packet results */
SCMutex cuda_mutex;
SCCondT cuda_cond;
/* the extra 1 in the 1481 is to hold the no_of_matches from the mpm run */
uint16_t mpm_offsets[1481];
#endif
} Packet;
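
Editor's note: the cuda_mutex/cuda_cond pair above implements a per-packet handshake between the detect thread and the cuda mpm dispatcher. The following is a minimal sketch of that pattern, assuming hypothetical helper names and that cuda_done is the flag being waited on; it is illustrative only, not code this commit adds elsewhere.

/* Editor's sketch of the per-packet handshake; helper names are hypothetical. */

/* detect thread side: block until the dispatcher has published the results */
static void CudaWaitForPacketSketch(Packet *p)
{
    SCMutexLock(&p->cuda_mutex);
    while (p->cuda_done == 0)
        SCCondWait(&p->cuda_cond, &p->cuda_mutex);
    SCMutexUnlock(&p->cuda_mutex);
}

/* dispatcher side: mark the packet done and wake a waiting detect thread */
static void CudaMarkPacketDoneSketch(Packet *p)
{
    SCMutexLock(&p->cuda_mutex);
    p->cuda_done = 1;
    SCCondSignal(&p->cuda_cond);
    SCMutexUnlock(&p->cuda_mutex);
}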
@ -460,22 +462,28 @@ typedef struct DecodeThreadVars_
/**
* \brief Initialize a packet structure for use.
*/
#define PACKET_INITIALIZE(p) do { \
#ifndef __SC_CUDA_SUPPORT__
#define PACKET_INITIALIZE(p) { \
memset((p), 0x00, sizeof(Packet)); \
SCMutexInit(&(p)->mutex_rtv_cnt, NULL); \
PACKET_RESET_CHECKSUMS((p)); \
} while (0)
}
#else
#define PACKET_INITIALIZE(p) { \
memset((p), 0x00, sizeof(Packet)); \
SCMutexInit(&(p)->mutex_rtv_cnt, NULL); \
PACKET_RESET_CHECKSUMS((p)); \
SCMutexInit(&(p)->cuda_mutex, NULL); \
SCCondInit(&(p)->cuda_cond, NULL); \
}
#endif
/**
* \brief Recycle a packet structure for reuse.
* \todo the mutex destroy & init is necessary because of the memset, reconsider
*/
#define PACKET_RECYCLE(p) do { \
CLEAR_ADDR(&(p)->src); \
CLEAR_ADDR(&(p)->dst); \
(p)->sp = 0; \
(p)->dp = 0; \
(p)->proto = 0; \
#define PACKET_DO_RECYCLE(p) do { \
(p)->recursion_level = 0; \
(p)->flags = 0; \
(p)->flowflags = 0; \
@ -530,15 +538,39 @@ typedef struct DecodeThreadVars_
PACKET_RESET_CHECKSUMS((p)); \
} while (0)
#ifndef __SC_CUDA_SUPPORT__
#define PACKET_RECYCLE(p) PACKET_DO_RECYCLE((p))
#else
#define PACKET_RECYCLE(p) do { \
PACKET_DO_RECYCLE((p)); \
SCMutexDestroy(&(p)->cuda_mutex); \
SCCondDestroy(&(p)->cuda_cond); \
SCMutexInit(&(p)->cuda_mutex, NULL); \
SCCondInit(&(p)->cuda_cond, NULL); \
PACKET_RESET_CHECKSUMS((p)); \
} while(0)
#endif
/**
* \brief Cleanup a packet so that we can free it. No memset needed.
*/
#ifndef __SC_CUDA_SUPPORT__
#define PACKET_CLEANUP(p) do { \
if ((p)->pktvar != NULL) { \
PktVarFree((p)->pktvar); \
} \
SCMutexDestroy(&(p)->mutex_rtv_cnt); \
} while (0)
#else
#define PACKET_CLEANUP(p) do { \
if ((p)->pktvar != NULL) { \
PktVarFree((p)->pktvar); \
} \
SCMutexDestroy(&(p)->mutex_rtv_cnt); \
SCMutexDestroy(&(p)->cuda_mutex); \
SCCondDestroy(&(p)->cuda_cond); \
} while(0)
#endif
/* macro's for setting the action

@ -111,6 +111,7 @@ uint32_t PacketPatternSearch(ThreadVars *tv, DetectEngineThreadCtx *det_ctx,
SCEnter();
uint32_t ret;
#ifndef __SC_CUDA_SUPPORT__
ret = mpm_table[det_ctx->sgh->mpm_ctx->mpm_type].Search(det_ctx->sgh->mpm_ctx,
&det_ctx->mtc,
@ -130,7 +131,17 @@ uint32_t PacketPatternSearch(ThreadVars *tv, DetectEngineThreadCtx *det_ctx,
SCReturnInt(ret);
}
SCCudaHlProcessPacketWithDispatcher(p, det_ctx, &ret);
if (p->cuda_mpm_enabled) {
ret = B2gCudaResultsPostProcessing(p, det_ctx->sgh->mpm_ctx,
&det_ctx->mtc, &det_ctx->pmq);
} else {
ret = mpm_table[det_ctx->sgh->mpm_ctx->mpm_type].Search(det_ctx->sgh->mpm_ctx,
&det_ctx->mtc,
&det_ctx->pmq,
p->payload,
p->payload_len);
}
#endif
SCReturnInt(ret);
@ -154,23 +165,8 @@ uint32_t UriPatternSearch(DetectEngineThreadCtx *det_ctx,
//PrintRawDataFp(stdout, uri, uri_len);
uint32_t ret;
#ifndef __SC_CUDA_SUPPORT__
ret = mpm_table[det_ctx->sgh->mpm_uri_ctx->mpm_type].Search
(det_ctx->sgh->mpm_uri_ctx, &det_ctx->mtcu, &det_ctx->pmq,
uri, uri_len);
#else
/* if the user has enabled cuda support, but is not using the cuda mpm
* algo, then we shouldn't take the path of the dispatcher. Call the mpm
* directly */
if (det_ctx->sgh->mpm_uri_ctx->mpm_type != MPM_B2G_CUDA) {
ret = mpm_table[det_ctx->sgh->mpm_uri_ctx->mpm_type].Search
(det_ctx->sgh->mpm_uri_ctx, &det_ctx->mtcu, &det_ctx->pmq,
uri, uri_len);
SCReturnUInt(ret);
}
SCCudaHlProcessUriWithDispatcher(uri, uri_len, det_ctx, &ret);
#endif
ret = mpm_table[det_ctx->sgh->mpm_uri_ctx->mpm_type].Search(det_ctx->sgh->mpm_uri_ctx,
&det_ctx->mtcu, &det_ctx->pmq, uri, uri_len);
SCReturnUInt(ret);
}

@ -378,47 +378,6 @@ TmEcode DetectEngineThreadCtxInit(ThreadVars *tv, void *initdata, void **data) {
*data = (void *)det_ctx;
#ifdef __SC_CUDA_SUPPORT__
if (PatternMatchDefaultMatcher() != MPM_B2G_CUDA)
return TM_ECODE_OK;
Tmq *tmq;
/* we append this name to the tv name, to obtain the final unique
* detection thread queue name */
char *cuda_outq_name = "cuda_mpm_rc_disp_outq";
uint8_t disp_outq_name_len = (strlen(tv->name) + strlen(cuda_outq_name) + 1);
char *disp_outq_name = SCMalloc(disp_outq_name_len * sizeof(char));
if (disp_outq_name == NULL)
goto error;
strcpy(disp_outq_name, tv->name);
strcpy(disp_outq_name + strlen(tv->name), cuda_outq_name);
disp_outq_name[disp_outq_name_len] = '\0';
tmq = TmqGetQueueByName(disp_outq_name);
if (tmq != NULL) {
SCLogError(SC_ERR_TMQ_ALREADY_REGISTERED, "A queue by the name \"%s\" "
"is already registered, which shouldn't be the case. Queue "
"name is duplicated. Please check if multiple instances of "
"detection module are given different names ",
disp_outq_name);
goto error;
}
tmq = TmqCreateQueue(disp_outq_name);
if (tmq == NULL)
goto error;
/* hold the queue instance we create under this detection thread instance */
det_ctx->cuda_mpm_rc_disp_outq = tmq;
det_ctx->cuda_mpm_rc_disp_outq->reader_cnt++;
det_ctx->cuda_mpm_rc_disp_outq->writer_cnt++;
return TM_ECODE_OK;
error:
return TM_ECODE_FAILED;
#endif
return TM_ECODE_OK;
}

@ -3140,6 +3140,7 @@ int SigGroupBuild (DetectEngineCtx *de_ctx) {
SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the "
"module SC_RULES_CONTENT_B2G_CUDA");
}
SCCudaCtxPushCurrent(dummy_context);
if (SCCudaMemGetInfo(&cuda_free_before_alloc, &cuda_total) == 0) {
SCLogInfo("Total Memory available in the CUDA context used for mpm "
"with b2g: %.2f MB", cuda_total/(1024.0 * 1024.0));

@ -24,6 +24,7 @@
#include "suricata-common.h"
#include "detect-engine.h"
#include "detect-engine-mpm.h"
#include "tm-threads.h"
#include "util-debug.h"
#include "util-time.h"
@ -43,6 +44,8 @@
#include "output.h"
#include "cuda-packet-batcher.h"
/**
* A list of output modules that will be active for the run mode.
*/
@ -2305,6 +2308,115 @@ int RunModeFilePcapAuto(DetectEngineCtx *de_ctx, char *file) {
printf("ERROR: TmThreadSpawn failed\n");
exit(EXIT_FAILURE);
}
#if defined(__SC_CUDA_SUPPORT__)
if (PatternMatchDefaultMatcher() == MPM_B2G_CUDA) {
ThreadVars *tv_decode1 = TmThreadCreatePacketHandler("Decode",
"pickup-queue", "simple",
"decode-queue1", "simple",
"1slot");
if (tv_decode1 == NULL) {
printf("ERROR: TmThreadsCreate failed for Decode1\n");
exit(EXIT_FAILURE);
}
tm_module = TmModuleGetByName("DecodePcapFile");
if (tm_module == NULL) {
printf("ERROR: TmModuleGetByName DecodePcap failed\n");
exit(EXIT_FAILURE);
}
Tm1SlotSetFunc(tv_decode1, tm_module, NULL);
TmThreadSetCPUAffinity(tv_decode1, 0);
if (ncpus > 1)
TmThreadSetThreadPriority(tv_decode1, PRIO_MEDIUM);
if (TmThreadSpawn(tv_decode1) != TM_ECODE_OK) {
printf("ERROR: TmThreadSpawn failed\n");
exit(EXIT_FAILURE);
}
ThreadVars *tv_cuda_PB = TmThreadCreate("CUDA_PB",
"decode-queue1", "simple",
"cuda-pb-queue1", "simple",
"custom", SCCudaPBTmThreadsSlot1, 0);
if (tv_cuda_PB == NULL) {
printf("ERROR: TmThreadsCreate failed for CUDA_PB\n");
exit(EXIT_FAILURE);
}
tv_cuda_PB->type = TVT_PPT;
tm_module = TmModuleGetByName("CudaPacketBatcher");
if (tm_module == NULL) {
printf("ERROR: TmModuleGetByName CudaPacketBatcher failed\n");
exit(EXIT_FAILURE);
}
Tm1SlotSetFunc(tv_cuda_PB, tm_module, (void *)de_ctx);
TmThreadSetCPUAffinity(tv_cuda_PB, 0);
if (ncpus > 1)
TmThreadSetThreadPriority(tv_cuda_PB, PRIO_MEDIUM);
if (TmThreadSpawn(tv_cuda_PB) != TM_ECODE_OK) {
printf("ERROR: TmThreadSpawn failed\n");
exit(EXIT_FAILURE);
}
ThreadVars *tv_stream1 = TmThreadCreatePacketHandler("Stream1",
"cuda-pb-queue1", "simple",
"stream-queue1", "simple",
"1slot");
if (tv_stream1 == NULL) {
printf("ERROR: TmThreadsCreate failed for Stream1\n");
exit(EXIT_FAILURE);
}
tm_module = TmModuleGetByName("StreamTcp");
if (tm_module == NULL) {
printf("ERROR: TmModuleGetByName StreamTcp failed\n");
exit(EXIT_FAILURE);
}
Tm1SlotSetFunc(tv_stream1,tm_module,NULL);
TmThreadSetCPUAffinity(tv_stream1, 0);
if (ncpus > 1)
TmThreadSetThreadPriority(tv_stream1, PRIO_MEDIUM);
if (TmThreadSpawn(tv_stream1) != TM_ECODE_OK) {
printf("ERROR: TmThreadSpawn failed\n");
exit(EXIT_FAILURE);
}
} else {
ThreadVars *tv_decode1 = TmThreadCreatePacketHandler("Decode & Stream",
"pickup-queue", "simple",
"stream-queue1", "simple",
"varslot");
if (tv_decode1 == NULL) {
printf("ERROR: TmThreadsCreate failed for Decode1\n");
exit(EXIT_FAILURE);
}
tm_module = TmModuleGetByName("DecodePcapFile");
if (tm_module == NULL) {
printf("ERROR: TmModuleGetByName DecodePcap failed\n");
exit(EXIT_FAILURE);
}
TmVarSlotSetFuncAppend(tv_decode1,tm_module,NULL);
tm_module = TmModuleGetByName("StreamTcp");
if (tm_module == NULL) {
printf("ERROR: TmModuleGetByName StreamTcp failed\n");
exit(EXIT_FAILURE);
}
TmVarSlotSetFuncAppend(tv_decode1,tm_module,NULL);
TmThreadSetCPUAffinity(tv_decode1, 0);
if (ncpus > 1)
TmThreadSetThreadPriority(tv_decode1, PRIO_MEDIUM);
if (TmThreadSpawn(tv_decode1) != TM_ECODE_OK) {
printf("ERROR: TmThreadSpawn failed\n");
exit(EXIT_FAILURE);
}
}
#else
//#if 0
//ThreadVars *tv_decode1 = TmThreadCreatePacketHandler("Decode & Stream","pickup-queue","simple","packetpool","packetpool","varslot");
ThreadVars *tv_decode1 = TmThreadCreatePacketHandler("Decode & Stream","pickup-queue","simple","stream-queue1","simple","varslot");
@ -2336,7 +2448,7 @@ int RunModeFilePcapAuto(DetectEngineCtx *de_ctx, char *file) {
printf("ERROR: TmThreadSpawn failed\n");
exit(EXIT_FAILURE);
}
#endif
//#if 0
/* start with cpu 1 so that if we're creating an odd number of detect
* threads we're not creating the most on CPU0. */
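
Editor's note: for orientation, the cuda branch of RunModeFilePcapAuto() above chains the threads and queues roughly as sketched below; the receive thread that feeds pickup-queue is set up earlier in the function and is not visible in this hunk.

/*
 * Editor's sketch of the pipeline set up above when MPM_B2G_CUDA is in use:
 *
 *   (receive)  --pickup-queue-->   Decode   (DecodePcapFile)
 *   Decode     --decode-queue1-->  CUDA_PB  (CudaPacketBatcher, custom slot)
 *   CUDA_PB    --cuda-pb-queue1--> Stream1  (StreamTcp)
 *   Stream1    --stream-queue1-->  detect threads ...
 *
 * Without the cuda mpm, a single "Decode & Stream" varslot thread runs
 * DecodePcapFile and StreamTcp back to back between pickup-queue and
 * stream-queue1.
 */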

@ -133,6 +133,7 @@
/* holds the cuda b2g module */
#include "util-mpm-b2g-cuda.h"
#include "util-cuda-handlers.h"
#include "cuda-packet-batcher.h"
#include "output.h"
#include "util-privs.h"
@ -218,6 +219,7 @@ SignalHandlerSetup(int sig, void (*handler)())
void GlobalInits()
{
memset(trans_q, 0, sizeof(trans_q));
memset(data_queues, 0, sizeof(data_queues));
/* Initialize the trans_q mutex */
int blah;
@ -225,6 +227,9 @@ void GlobalInits()
for(blah=0;blah<256;blah++) {
r |= SCMutexInit(&trans_q[blah].mutex_q, NULL);
r |= SCCondInit(&trans_q[blah].cond_q, NULL);
r |= SCMutexInit(&data_queues[blah].mutex_q, NULL);
r |= SCCondInit(&data_queues[blah].cond_q, NULL);
}
if (r != 0) {
@ -793,6 +798,7 @@ int main(int argc, char **argv)
TmModuleLogHttpLogIPv6Register();
#ifdef __SC_CUDA_SUPPORT__
TmModuleCudaMpmB2gRegister();
TmModuleCudaPacketBatcherRegister();
#endif
TmModuleReceiveErfFileRegister();
TmModuleDecodeErfFileRegister();
@ -996,10 +1002,15 @@ int main(int argc, char **argv)
exit(EXIT_FAILURE);
}
#ifdef PROFILING
SCProfilingInitRuleCounters(de_ctx);
#endif /* PROFILING */
#ifdef __SC_CUDA_SUPPORT__
SCCudaPBSetUpQueuesAndBuffers();
#endif /* __SC_CUDA_SUPPORT__ */
AppLayerHtpRegisterExtraCallbacks();
SCThresholdConfInitContext(de_ctx,NULL);
@ -1126,6 +1137,10 @@ int main(int argc, char **argv)
SCLogInfo("time elapsed %" PRIuMAX "s", (uintmax_t)(end_time.tv_sec - start_time.tv_sec));
#ifdef __SC_CUDA_SUPPORT__
SCCudaPBKillBatchingPackets();
#endif
TmThreadKillThreads();
SCPerfReleaseResources();
break;

@ -26,6 +26,7 @@
#include "suricata-common.h"
#include "packet-queue.h"
#include "data-queue.h"
/* the name of our binary */
#define PROG_NAME "Suricata"
@ -54,6 +55,8 @@ enum {
* XXX move to the TmQueue structure later
*/
PacketQueue trans_q[256];
SCDQDataQueue data_queues[256];
/* memset to zeros, and mutex init! */
void GlobalInits();

@ -78,6 +78,7 @@ enum {
TMM_RECEIVEIPFW,
#ifdef __SC_CUDA_SUPPORT__
TMM_CUDA_MPM_B2G,
TMM_CUDA_PACKET_BATCHER,
#endif
TMM_RECEIVEERFFILE,
TMM_DECODEERFFILE,

@ -52,6 +52,8 @@ Tmq* TmqCreateQueue(char *name) {
Tmq *q = &tmqs[tmq_id];
q->name = name;
q->id = tmq_id++;
/* for cuda purposes */
q->q_type = 0;
SCLogDebug("created queue \'%s\', %p", name, q);
return q;

@ -29,6 +29,8 @@ typedef struct Tmq_ {
uint16_t id;
uint16_t reader_cnt;
uint16_t writer_cnt;
/* 0 for packet-queue and 1 for data-queue */
uint8_t q_type;
} Tmq;
Tmq* TmqCreateQueue(char *name);

@ -68,45 +68,6 @@ SCMutex tv_root_lock = PTHREAD_MUTEX_INITIALIZER;
thread encounters a failure. Defaults to restart the failed thread */
uint8_t tv_aof = THV_RESTART_THREAD;
typedef struct TmSlot_ {
/* function pointers */
TmEcode (*SlotFunc)(ThreadVars *, Packet *, void *, PacketQueue *, PacketQueue *);
TmEcode (*SlotThreadInit)(ThreadVars *, void *, void **);
void (*SlotThreadExitPrintStats)(ThreadVars *, void *);
TmEcode (*SlotThreadDeinit)(ThreadVars *, void *);
/* data storage */
void *slot_initdata;
void *slot_data;
/**< queue filled by the SlotFunc with packets that will
* be processed further _before_ the current packet.
* The locks in the queue are NOT used */
PacketQueue slot_pre_pq;
/**< queue filled by the SlotFunc with packets that will
* be processed further _after_ the current packet. The
* locks in the queue are NOT used */
PacketQueue slot_post_pq;
/* linked list, only used by TmVarSlot */
struct TmSlot_ *slot_next;
int id; /**< slot id, only used by TmVarSlot to know what the first
* slot is. */
} TmSlot;
/* 1 function slot */
typedef struct Tm1Slot_ {
TmSlot s;
} Tm1Slot;
/* Variable number of function slots */
typedef struct TmVarSlot_ {
TmSlot *s;
} TmVarSlot;
/**
* \brief Check if a thread flag is set
*
@ -649,11 +610,12 @@ TmEcode TmThreadSetSlots(ThreadVars *tv, char *name, void *(*fn_p)(void *)) {
size = sizeof(TmVarSlot);
tv->tm_func = TmThreadsSlotVar;
} else if (strcmp(name, "custom") == 0) {
/* \todo this needs to be changed to support slots of any size */
size = sizeof(Tm1Slot);
if (fn_p == NULL)
goto error;
tv->tm_func = fn_p;
return TM_ECODE_OK;
} else {
printf("Error: Slot \"%s\" not supported\n", name);
goto error;
@ -1134,8 +1096,12 @@ void TmThreadKillThreads(void) {
if (tv->InShutdownHandler != NULL) {
tv->InShutdownHandler(tv);
}
for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++)
for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++) {
if (tv->inq->q_type == 0)
SCCondSignal(&trans_q[tv->inq->id].cond_q);
else
SCCondSignal(&data_queues[tv->inq->id].cond_q);
}
/* to be sure, signal more */
int cnt = 0;
@ -1151,9 +1117,12 @@ void TmThreadKillThreads(void) {
tv->InShutdownHandler(tv);
}
for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++)
for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++) {
if (tv->inq->q_type == 0)
SCCondSignal(&trans_q[tv->inq->id].cond_q);
else
SCCondSignal(&data_queues[tv->inq->id].cond_q);
}
usleep(100);
}

@ -34,6 +34,34 @@ enum {
TVT_MAX,
};
typedef struct TmSlot_ {
/* function pointers */
TmEcode (*SlotFunc)(ThreadVars *, Packet *, void *, PacketQueue *);
TmEcode (*SlotThreadInit)(ThreadVars *, void *, void **);
void (*SlotThreadExitPrintStats)(ThreadVars *, void *);
TmEcode (*SlotThreadDeinit)(ThreadVars *, void *);
/* data storage */
void *slot_initdata;
void *slot_data;
PacketQueue slot_pq;
/* linked list, only used by TmVarSlot */
struct TmSlot_ *slot_next;
} TmSlot;
/* 1 function slot */
typedef struct Tm1Slot_ {
TmSlot s;
} Tm1Slot;
/* Variable number of function slots */
typedef struct TmVarSlot_ {
TmSlot *s;
} TmVarSlot;
extern ThreadVars *tv_root[TVT_MAX];
extern SCMutex tv_root_lock;

@ -90,19 +90,26 @@ void TmqhOutputSimple(ThreadVars *t, Packet *p)
SCMutexUnlock(&q->mutex_q);
}
/*******************************Generic-Q-Handlers*****************************/
/**
* \brief Public version of TmqhInputSimple from the tmqh-simple queue
* handler, except that it is a generic version that is directly
* tied to a PacketQueue instance.
* tied to a "SCDQDataQueue" instance(sent as an arg).
*
* Retrieves a data_instance from the queue. If the queue is empty, it
* waits on the queue, till a data_instance is enqueued into the queue
* by some other module.
*
* Retrieves a packet from the queue. If the queue is empty, it waits
* on the queue, till a packet is enqueued into the queue.
* All references to "data_instance" means a reference to a data structure
* instance that implements the template "struct SCDQGenericQData_".
*
* \param q The PacketQueue instance to wait on.
* \param q The SCDQDataQueue instance to wait on.
*
* \retval p The returned packet from the queue.
* \retval data The returned data_instance from the queue.
*/
Packet *TmqhInputSimpleOnQ(PacketQueue *q)
SCDQGenericQData *TmqhInputSimpleOnQ(SCDQDataQueue *q)
{
SCMutexLock(&q->mutex_q);
if (q->len == 0) {
@ -111,11 +118,12 @@ Packet *TmqhInputSimpleOnQ(PacketQueue *q)
}
if (q->len > 0) {
Packet *p = PacketDequeue(q);
SCDQGenericQData *data = SCDQDataDequeue(q);
SCMutexUnlock(&q->mutex_q);
return p;
return data;
} else {
/* return NULL if we have no pkt. Should only happen on signals. */
/* return NULL if we have no data in the queue. Should only happen
* on signals. */
SCMutexUnlock(&q->mutex_q);
return NULL;
}
@ -124,17 +132,23 @@ Packet *TmqhInputSimpleOnQ(PacketQueue *q)
/**
* \brief Public version of TmqhOutputSimple from the tmqh-simple queue
* handler, except that it is a generic version that is directly
* tied to a PacketQueue instance.
* tied to a SCDQDataQueue instance (sent as an arg).
*
* Enqueues a packet into the packet queue.
* Enqueues a data_instance into the queue and signals any thread
* waiting on the queue for data.
*
* \param q The PacketQueue instance to enqueue the packet into.
* \param p The packet to be enqueued into the above queue.
* All references to "data_instance" means a reference to a data structure
* instance that implements the template "struct SCDQGenericQData_".
*
* \param q The SCDQDataQueue instance to pump the data into.
* \param data The data instance to be enqueued.
*/
void TmqhOutputSimpleOnQ(PacketQueue *q, Packet *p)
void TmqhOutputSimpleOnQ(SCDQDataQueue *q, SCDQGenericQData *data)
{
SCMutexLock(&q->mutex_q);
PacketEnqueue(q, p);
SCDQDataEnqueue(q, data);
SCCondSignal(&q->cond_q);
SCMutexUnlock(&q->mutex_q);
return;
}
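
Editor's note: a consumer built on the generic handlers above simply loops on TmqhInputSimpleOnQ(), which blocks on cond_q until a producer calls TmqhOutputSimpleOnQ(). The sketch below is illustrative; the cast to SCCudaPBPacketsBuffer reflects how the cuda batcher and dispatcher are expected to exchange buffers over data_queues, but the loop and its name are hypothetical.

/* Editor's sketch of a consumer loop over a SCDQDataQueue. */
static void DrainDataQueueSketch(SCDQDataQueue *q)
{
    while (1) {
        /* blocks until a producer enqueues and signals; may return NULL
         * on signals, in which case we simply retry */
        SCDQGenericQData *data = TmqhInputSimpleOnQ(q);
        if (data == NULL)
            continue;

        SCCudaPBPacketsBuffer *pb = (SCCudaPBPacketsBuffer *)data;
        /* ... hand pb->packets_buffer over to the GPU here ... */
        (void)pb;
    }
}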

@ -24,8 +24,11 @@
#ifndef __TMQH_SIMPLE_H__
#define __TMQH_SIMPLE_H__
Packet *TmqhInputSimpleOnQ(PacketQueue *);
void TmqhOutputSimpleOnQ(PacketQueue *, Packet *);
#include "data-queue.h"
SCDQGenericQData *TmqhInputSimpleOnQ(SCDQDataQueue *);
void TmqhOutputSimpleOnQ(SCDQDataQueue *, SCDQGenericQData *);
void TmqhSimpleRegister (void);
#endif /* __TMQH_SIMPLE_H__ */

@ -100,6 +100,72 @@ SCCudaHlModuleData *SCCudaHlGetModuleData(uint8_t handle)
return data;
}
/**
* \brief Returns a cuda_module against the handle in the argument.
*
* If a cuda_module is not present for a handle, it is created
* and associated with this handle and the cuda_module is returned
* in the argument. If a cuda_module is already present for
* a handle, it is returned.
*
* \param p_module Pointer to a CUmodule instance that should be updated
* with the loaded cuda module.
* \param filename Name of the file from which the cuda module is loaded.
* \param handle A unique handle which identifies a module. Obtained from
* a call to SCCudaHlGetUniqueHandle().
*
* \retval 0 On success.
* \retval -1 On failure.
*/
int SCCudaHlGetCudaModuleFromFile(CUmodule *p_module, const char *filename, int handle)
{
SCCudaHlModuleData *data = NULL;
if (p_module == NULL) {
SCLogError(SC_ERR_INVALID_ARGUMENTS, "Error invalid arguments"
"p_module NULL");
return -1;
}
/* check if the particular module that wants a CUDA module is already
* registered or not. If it is registered, check if a context has
* been associated with the module. If yes, then we can go ahead and
* create a cuda module or return the reference to the cuda module if
* we already have a cuda module associated with the module. If no,
* log a warning and get out of here */
if ( ((data = SCCudaHlGetModuleData(handle)) == NULL) ||
(data->cuda_context == 0)) {
SCLogDebug("Module not registered or no cuda context associated with "
"this module. You can't create a CUDA module without"
"associatin a context with a module first. To use this "
"registration facility, first register a module using "
"context using SCCudaHlRegisterModule(), and then register "
"a cuda context with that module using "
"SCCudaHlGetCudaContext(), after which you can call this "
"function ");
return -1;
}
/* we already have a cuda module associated with this module. Return the
* cuda module */
if (data->cuda_module != 0) {
p_module[0] = data->cuda_module;
return 0;
}
/* we don't have a cuda module associated with this module. Create a
* cuda module, update the module with this cuda module reference and
* then return the module reference back to the calling function using
* the argument */
if (SCCudaModuleLoad(p_module, filename) == -1)
goto error;
data->cuda_module = p_module[0];
return 0;
error:
return -1;
}
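
Editor's note: the log message above spells out the expected call order for the cuda handlers API: register the module, associate a cuda context with it, then ask for the cuda module. A minimal sketch of that sequence follows, assuming SCCudaHlRegisterModule() returns the module handle (as the message implies); the module name and .cubin path are placeholders.

/* Editor's sketch of the registration order; names and paths are placeholders. */
static int LoadMyCudaModuleSketch(void)
{
    CUcontext context = 0;
    CUmodule module = 0;

    /* 1. register the module with the cuda handlers and get a handle */
    int handle = SCCudaHlRegisterModule("my_module");

    /* 2. associate a cuda context with that handle */
    if (SCCudaHlGetCudaContext(&context, handle) == -1)
        return -1;

    /* 3. only now can a cuda module be loaded for the handle */
    if (SCCudaHlGetCudaModuleFromFile(&module, "my_kernel.cubin", handle) == -1)
        return -1;

    return 0;
}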
/**
* \internal
* \brief Get a unique handle for a new module registration. This new handle
@ -347,10 +413,12 @@ int SCCudaHlGetCudaDevicePtr(CUdeviceptr *device_ptr, const char *name,
}
}
/* send the newly assigned device pointer back to the caller */
device_ptr[0] = new_module_device_ptr->d_ptr;
/* insert it into the device_ptr list for the module instance */
if (data->device_ptrs == NULL) {
data->device_ptrs = new_module_device_ptr;
device_ptr[0] = new_module_device_ptr->d_ptr;
return 0;
}
@ -367,6 +435,76 @@ int SCCudaHlGetCudaDevicePtr(CUdeviceptr *device_ptr, const char *name,
return -1;
}
/**
* \brief Frees a Cuda Device Pointer.
*
* If a device pointer by the name \"name\" is registered for this
* handle, it is freed.
*
* \param name Name of the device pointer to search the module for.
* \param handle A unique handle which identifies a module. Obtained from
* a call to SCCudaHlGetUniqueHandle().
*
* \retval 0 On success.
* \retval -1 On failure.
*/
int SCCudaHlFreeCudaDevicePtr(const char *name, int handle)
{
SCCudaHlModuleData *data = NULL;
SCCudaHlModuleDevicePointer *module_device_ptr = NULL;
SCCudaHlModuleDevicePointer *temp_module_device_ptr = NULL;
if (name == NULL) {
SCLogError(SC_ERR_INVALID_ARGUMENTS, "Error invalid arguments. "
"name is NULL");
goto error;
}
/* check if the particular module that wants to free device memory is
* already registered or not. If it is registered, check if a context has
* been associated with the module. If yes, then we can go ahead and
* free the device memory.
*/
if ( ((data = SCCudaHlGetModuleData(handle)) == NULL) ||
(data->cuda_context == 0)) {
SCLogDebug("Module not registered or no cuda context associated with "
"this module. You can't create a CUDA module without"
"associatin a context with a module first. To use this "
"registration facility, first register a module using "
"context using SCCudaHlRegisterModule(), and then register "
"a cuda context with that module using "
"SCCudaHlGetCudaContext(), after which you can call this "
"function ");
goto error;
}
/* get the device pointer registered by this name. If there isn't one,
* error out */
if ( (module_device_ptr = SCCudaHlCudaDevicePtrAvailable(data, name)) == NULL) {
goto error;
}
SCCudaMemFree(module_device_ptr->d_ptr);
module_device_ptr->d_ptr = 0;
if (module_device_ptr == data->device_ptrs) {
data->device_ptrs = data->device_ptrs->next;
} else {
temp_module_device_ptr = data->device_ptrs;
while (strcmp(temp_module_device_ptr->next->name, name) != 0) {
temp_module_device_ptr = temp_module_device_ptr->next;
}
temp_module_device_ptr->next = temp_module_device_ptr->next->next;
}
free(module_device_ptr->name);
free(module_device_ptr);
return 0;
error:
return -1;
}
/**
* \brief Registers a Dispatcher function against this handle.
*
@ -441,8 +579,11 @@ int SCCudaHlGetModuleHandle(const char *name)
data = data->next;
}
if (data == NULL)
if (data == NULL) {
SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "A cuda module by the name \"%s\" "
"hasn't been registered", name);
return -1;
}
return data->handle;
}
@ -675,72 +816,4 @@ int SCCudaHlTestEnvCudaContextDeInit(void)
return 1;
}
void SCCudaHlProcessPacketWithDispatcher(Packet *p, DetectEngineThreadCtx *det_ctx,
void *result)
{
Packet *out_p = NULL;
p->cuda_mpm_ctx = det_ctx->sgh->mpm_ctx;
p->cuda_mtc = &det_ctx->mtc;
p->cuda_pmq = &det_ctx->pmq;
/* this outq is unique to this detection thread instance. The dispatcher thread
* would use this queue to pump the packets back to this detection thread once
* it has processed the packet */
p->cuda_outq = &trans_q[det_ctx->cuda_mpm_rc_disp_outq->id];
/* for now it is hardcoded. \todo Make the access to the right queue or the
* ThreadVars generic */
/* Push the packet into the dispatcher's input queue */
B2gCudaPushPacketTo_tv_CMB2_RC(p);
/* wait for the dispatcher to process and return the packet we pushed */
out_p = TmqhInputSimpleOnQ(&trans_q[det_ctx->cuda_mpm_rc_disp_outq->id]);
/* todo make this generic, so that if we have more than 2 modules using the
* cuda interface, we can call update function for the module that has
* queued the packet and retrieve the results */
*((uint32_t *)result) = p->cuda_matches;
return;
}
void SCCudaHlProcessUriWithDispatcher(uint8_t *uri, uint16_t uri_len,
DetectEngineThreadCtx *det_ctx,
void *result)
{
Packet *out_p = NULL;
Packet *p = SCMalloc(sizeof(Packet));
if (p == NULL) {
SCLogError(SC_ERR_FATAL, "Fatal error encountered in SCCudaHlProcessUriWithDispatcher. Exiting...");
exit(EXIT_FAILURE);
}
memset(p, 0, sizeof(Packet));
p->cuda_mpm_ctx = det_ctx->sgh->mpm_uri_ctx;
p->cuda_mtc = &det_ctx->mtcu;
p->cuda_pmq = &det_ctx->pmq;
p->payload = uri;
p->payload_len = uri_len;
/* this outq is unique to this detection thread instance. The dispatcher thread
* would use this queue to pump the packets back to this detection thread once
* it has processed the packet */
p->cuda_outq = &trans_q[det_ctx->cuda_mpm_rc_disp_outq->id];
/* Push the packet into the dispatcher's input queue */
B2gCudaPushPacketTo_tv_CMB2_RC(p);
/* wait for the dispatcher to process and return the packet we pushed */
out_p = TmqhInputSimpleOnQ(&trans_q[det_ctx->cuda_mpm_rc_disp_outq->id]);
/* todo make this generic, so that if we have more than 2 modules using the
* cuda interface, we can call update function for the module that has
* queued the packet and retrieve the results */
*((uint32_t *)result) = p->cuda_matches;
SCFree(p);
return;
}
#endif /* __SC_CUDA_SUPPORT */

@ -63,7 +63,9 @@ typedef struct SCCudaHlModuleData_ {
int SCCudaHlGetCudaContext(CUcontext *, int);
int SCCudaHlGetCudaModule(CUmodule *, const char *, int);
int SCCudaHlGetCudaModuleFromFile(CUmodule *, const char *, int);
int SCCudaHlGetCudaDevicePtr(CUdeviceptr *, const char *, size_t, void *, int);
int SCCudaHlFreeCudaDevicePtr(const char *, int);
int SCCudaHlRegisterDispatcherFunc(void *(*SCCudaHlDispFunc)(void *), int);
SCCudaHlModuleData *SCCudaHlGetModuleData(uint8_t);

@ -27,50 +27,59 @@
*/
#define B2G_CUDA_Q 2
#define CUDA_THREADS 16
#define CUDA_THREADS 4000
#define B2G_CUDA_HASHSHIFT 4
#define B2G_CUDA_TYPE unsigned int
#define B2G_CUDA_HASH16(a, b) (((a) << B2G_CUDA_HASHSHIFT) | (b))
#define u8_tolower(c) g_u8_lowercasetable[(c)]
typedef struct SCCudaPBPacketDataForGPU_ {
/* holds the value B2gCtx->m */
unsigned int m;
/* holds B2gCtx->B2g */
unsigned int table;
/* holds the length of the payload */
unsigned int payload_len;
/* holds the payload */
unsigned char payload;
} SCCudaPBPacketDataForGPU;
extern "C"
__global__ void B2gCudaSearchBNDMq(unsigned int *offsets,
unsigned int *B2G,
unsigned char *g_u8_lowercasetable,
unsigned char *buf,
unsigned short arg_buflen,
unsigned int m)
{
__global__ void B2gCudaSearchBNDMq(unsigned short *results_buffer,
unsigned char *packets_buffer,
unsigned int *packets_offset_buffer,
unsigned int *packets_payload_offset_buffer,
unsigned int nop,
unsigned char *g_u8_lowercasetable)
{
unsigned int tid = blockIdx.x * 32 + threadIdx.x;
/* if the thread id is greater than the no of packets sent in the packets
* buffer, terminate the thread */
//if (tid <= nop)
if (tid >= nop)
return;
SCCudaPBPacketDataForGPU *packet = (SCCudaPBPacketDataForGPU *)(packets_buffer + packets_offset_buffer[tid]);
unsigned int m = packet->m;
unsigned char *buf = &packet->payload;
unsigned int buflen = packet->payload_len;
unsigned int *B2G = (unsigned int *)packet->table;
unsigned int pos = m - B2G_CUDA_Q + 1;
B2G_CUDA_TYPE d;
unsigned short h;
unsigned int j;
unsigned int first;
unsigned int tid = threadIdx.x;
unsigned short tid_chunk = arg_buflen / CUDA_THREADS;
unsigned short jump;
unsigned short buflen;
if (tid_chunk < m)
tid_chunk = m;
jump = tid_chunk * tid;
if ((jump + tid_chunk) > arg_buflen)
return;
unsigned int j = 0;
buflen = tid_chunk * 2 - 1;
if ((tid == CUDA_THREADS - 1) || ((jump + buflen) > arg_buflen)) {
buflen = arg_buflen - jump;
}
j = 0;
while (j < buflen) {
offsets[jump + j] = 0;
j++;
}
unsigned short *matches_count = results_buffer + packets_payload_offset_buffer[tid] + tid;
//unsigned short *matches_count = results_buffer + packets_payload_offset_buffer[1] + 1;
//unsigned short *offsets = results_buffer + packets_payload_offset_buffer[1] + 1 + 1;
unsigned short *offsets = matches_count + 1;
// temporarily hold the results here, before we shift it to matches_count
// before returning
unsigned short matches = 0;
while (pos <= (buflen - B2G_CUDA_Q + 1)) {
h = B2G_CUDA_HASH16(u8_tolower(buf[jump + pos - 1]), u8_tolower(buf[jump + pos]));
h = B2G_CUDA_HASH16(u8_tolower(buf[pos - 1]), u8_tolower(buf[pos]));
d = B2G[h];
if (d != 0) {
@ -83,19 +92,21 @@ __global__ void B2gCudaSearchBNDMq(unsigned int *offsets,
if (j > first) {
pos = j;
} else {
offsets[j + jump] = 1;
offsets[matches++] = j;
}
}
if (j == 0)
break;
h = B2G_CUDA_HASH16(u8_tolower(buf[jump + j - 1]), u8_tolower(buf[jump + j]));
h = B2G_CUDA_HASH16(u8_tolower(buf[j - 1]), u8_tolower(buf[j]));
d = (d << 1) & B2G[h];
} while (d != 0);
}
pos = pos + m - B2G_CUDA_Q + 1;
}
matches_count[0] = matches;
return;
}
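
Editor's note: given the indexing above, each thread writes its packet's match count at results_buffer[packets_payload_offset_buffer[tid] + tid], followed by that many match offsets; the extra "+ tid" accounts for one count slot per preceding packet, the same convention as the "extra 1 in the 1481" comment on Packet::mpm_offsets earlier in this commit. A hedged host-side sketch of reading those results back, with hypothetical names, could look like this.

/* Editor's sketch: host-side walk over the results written by the kernel. */
static void ReadResultsSketch(unsigned short *results_buffer,
                              uint32_t *packets_payload_offset_buffer,
                              uint32_t nop)
{
    uint32_t i;
    for (i = 0; i < nop; i++) {
        /* each packet's slice starts with its match count... */
        unsigned short *slice =
            results_buffer + packets_payload_offset_buffer[i] + i;
        unsigned short matches = slice[0];

        /* ...followed by 'matches' offsets into that packet's payload */
        unsigned short j;
        for (j = 0; j < matches; j++) {
            unsigned short offset = slice[1 + j];
            (void)offset; /* e.g. feed into B2gCudaResultsPostProcessing() */
        }
    }
}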

File diff suppressed because it is too large.

@ -51,7 +51,7 @@ typedef struct B2gCudaPattern_ {
/* case INsensitive */
uint8_t *ci;
struct B2gCudaPattern_ *next;
MpmEndMatch *em;
uint32_t id;
} B2gCudaPattern;
typedef struct B2gCudaHashItem_ {
@ -65,23 +65,6 @@ typedef struct B2gCudaCtx_ {
* in the engine that is holding this B2g_Cuda_Ctx */
int module_handle;
CUcontext cuda_context;
CUmodule cuda_module;
/* the search kernel */
CUfunction cuda_search_kernel;
/* the cuda_search_kernel argument offsets */
uint8_t cuda_search_kernel_arg0_offset;
uint8_t cuda_search_kernel_arg1_offset;
uint8_t cuda_search_kernel_arg2_offset;
uint8_t cuda_search_kernel_arg3_offset;
uint8_t cuda_search_kernel_arg4_offset;
uint8_t cuda_search_kernel_arg5_offset;
uint8_t cuda_search_kernel_arg_total;
/* cuda device pointer to the lower case table g_u8_lowercasetable */
CUdeviceptr cuda_g_u8_lowercasetable;
/* cuda device pointer to B2gCudaCtx->B2G */
CUdeviceptr cuda_B2G;
@ -141,17 +124,18 @@ typedef struct B2gCudaThreadCtx_ {
} B2gCudaThreadCtx;
void MpmB2gCudaRegister(void);
void TmModuleCudaMpmB2gRegister(void);
int B2gCudaStartDispatcherThreadRC(const char *);
int B2gCudaStartDispatcherThreadAPC(const char *);
void B2gCudaKillDispatcherThreadRC(void);
void B2gCudaKillDispatcherThreadAPC(void);
void B2gCudaPushPacketTo_tv_CMB2_RC(Packet *);
void B2gCudaPushPacketTo_tv_CMB2_APC(Packet *);
int B2gCudaResultsPostProcessing(Packet *, MpmCtx *, MpmThreadCtx *,
PatternMatcherQueue *);
uint32_t B2gCudaSearch1(MpmCtx *, MpmThreadCtx *, PatternMatcherQueue *,
uint8_t *, uint16_t);
#ifdef B2G_CUDA_SEARCH2
uint32_t B2gCudaSearch2(MpmCtx *, MpmThreadCtx *, PatternMatcherQueue *,
uint8_t *, uint16_t);
#endif
#endif /* __SC_CUDA_SUPPORT__ */
