Version 1 of AC Cuda.

pull/400/head
Author: Anoop Saldanha, committed by Victor Julien
parent 2de59fc235
commit 17c763f855

@@ -257,6 +257,7 @@ util-cpu.c util-cpu.h \
util-crypt.c util-crypt.h \
util-cuda.c util-cuda.h \
util-cuda-buffer.c util-cuda-buffer.h \
util-cuda-handlers.c util-cuda-handlers.h \
util-daemon.c util-daemon.h \
util-debug.c util-debug.h \
util-debug-filters.c util-debug-filters.h \
@@ -325,7 +326,7 @@ win32-misc.c win32-misc.h \
win32-service.c win32-service.h \
win32-syslog.h
-EXTRA_DIST = util-mpm-b2g-cuda-kernel.cu ptxdump.py
+EXTRA_DIST = util-mpm-ac-cuda-kernel.cu ptxdump.py
# set the include path found by configure
INCLUDES= $(all_includes)
@@ -345,7 +346,7 @@ BUILT_SOURCES = cuda-ptxdump.h
suricata_SOURCES += cuda-ptxdump.h
suricata_CUDA_KERNELS = \
-util-mpm-b2g-cuda-kernel.cu
+util-mpm-ac-cuda-kernel.cu
NVCCFLAGS=-O2
@@ -355,7 +356,8 @@ SUFFIXES = \
.ptx_sm_12 \
.ptx_sm_13 \
.ptx_sm_20 \
-.ptx_sm_21
+.ptx_sm_21 \
+.ptx_sm_30
PTXS = $(suricata_CUDA_KERNELS:.cu=.ptx_sm_10)
PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_11)
@@ -363,6 +365,7 @@ PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_12)
PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_13)
PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_20)
PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_21)
PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_30)
.cu.ptx_sm_10:
$(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_10 -ptx $<
@@ -382,6 +385,9 @@ PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_21)
.cu.ptx_sm_21:
$(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_21 -ptx $<
.cu.ptx_sm_30:
$(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_30 -ptx $<
cuda-ptxdump.h: $(PTXS)
$(PYTHON) ptxdump.py cuda-ptxdump $(PTXS)
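The rules above compile each kernel in suricata_CUDA_KERNELS once per target architecture (sm_10 through the newly added sm_30), and ptxdump.py then embeds every generated PTX file into cuda-ptxdump.h as C string data, which SCCudaPtxDumpGetModule() later looks up under a "<kernel>_sm_<major><minor>" name. A hedged sketch of the shape of the generated header (the symbol naming is illustrative, not ptxdump.py's confirmed output):

    /* sketch only: roughly what ptxdump.py generates into cuda-ptxdump.h */
    static const char util_mpm_ac_cuda_kernel_ptx_sm_30[] =
        ".version 3.0\n"
        ".target sm_30\n"
        /* ... the rest of the PTX module, escaped as a C string ... */;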

@@ -28,9 +28,12 @@
#define COUNTERS
#include "suricata-common.h"
#include "threadvars.h"
#ifdef __SC_CUDA_SUPPORT__
#include "util-cuda-buffer.h"
#endif /* __SC_CUDA_SUPPORT__ */
typedef enum {
CHECKSUM_VALIDATION_DISABLE,
CHECKSUM_VALIDATION_ENABLE,
@@ -487,6 +490,14 @@ typedef struct Packet_
#ifdef PROFILING
PktProfiling profile;
#endif
#ifdef __SC_CUDA_SUPPORT__
uint8_t cuda_mpm_enabled;
uint8_t cuda_done;
uint16_t cuda_gpu_matches;
SCMutex cuda_mutex;
SCCondT cuda_cond;
uint32_t cuda_results[(UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT * 2) + 1];
#endif
} Packet;
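The CUDA fields above act as a per-packet mailbox between the decode path and the dispatcher thread: cuda_mutex/cuda_cond guard the cuda_done flag, and cuda_results receives the GPU output as a count word followed by (offset, state) pairs, which is why it is sized at twice the maximum buffered payload length plus one. A minimal reader sketch, inferred from the SCACCudaSearch kernels and SCACCudaPacketResultsProcessing later in this commit:

    /* sketch: walking one packet's GPU results (layout inferred from the kernel) */
    uint32_t n_words = p->cuda_results[0];            /* words written; 2 per match */
    for (uint32_t i = 0; i < n_words; i += 2) {
        uint32_t offset = p->cuda_results[1 + i];     /* match end offset in payload */
        uint32_t state  = p->cuda_results[2 + i];     /* matching state, flag byte cleared */
        /* ... look up output_table[state] and confirm case-sensitive patterns ... */
    }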
#define DEFAULT_PACKET_SIZE (1500 + ETHERNET_HEADER_LEN)
@@ -570,6 +581,24 @@ typedef struct DecodeThreadVars_
uint16_t counter_defrag_ipv6_reassembled;
uint16_t counter_defrag_ipv6_timeouts;
uint16_t counter_defrag_max_hit;
#ifdef __SC_CUDA_SUPPORT__
/* cb - CudaBuffer */
CudaBufferData *cuda_ac_cb;
MpmCtx *mpm_proto_other_ctx;
MpmCtx *mpm_proto_tcp_ctx_ts;
MpmCtx *mpm_proto_udp_ctx_ts;
MpmCtx *mpm_proto_tcp_ctx_tc;
MpmCtx *mpm_proto_udp_ctx_tc;
uint16_t data_buffer_size_max_limit;
uint16_t data_buffer_size_min_limit;
uint8_t mpm_is_cuda;
#endif
} DecodeThreadVars;
/**
@@ -586,12 +615,27 @@ typedef struct DecodeThreadVars_
/**
* \brief Initialize a packet structure for use.
*/
-#define PACKET_INITIALIZE(p) { \
#ifdef __SC_CUDA_SUPPORT__
#include "util-cuda-handlers.h"
#include "util-mpm.h"
#define PACKET_INITIALIZE(p) do { \
memset((p), 0x00, SIZE_OF_PACKET); \
SCMutexInit(&(p)->tunnel_mutex, NULL); \
PACKET_RESET_CHECKSUMS((p)); \
(p)->pkt = ((uint8_t *)(p)) + sizeof(Packet); \
(p)->livedev = NULL; \
SCMutexInit(&(p)->cuda_mutex, NULL); \
SCCondInit(&(p)->cuda_cond, NULL); \
} while (0)
#else
#define PACKET_INITIALIZE(p) { \
SCMutexInit(&(p)->tunnel_mutex, NULL); \
PACKET_RESET_CHECKSUMS((p)); \
(p)->pkt = ((uint8_t *)(p)) + sizeof(Packet); \
(p)->livedev = NULL; \
}
#endif
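One detail worth calling out in the hunk above: the CUDA variant wraps its body in do { ... } while (0) so the macro expands to a single statement; the bare-brace form breaks under a dangling if/else because the caller's trailing semicolon orphans the else. A minimal illustration (the caller is hypothetical):

    if (reuse)
        PACKET_INITIALIZE(p);   /* a bare-brace macro would expand to "{ ... };" */
    else                        /* and the stray ';' would orphan this else,     */
        PacketFree(p);          /* turning it into a compile error               */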
/**
* \brief Recycle a packet structure for reuse.

@@ -53,6 +53,9 @@
#include "util-debug.h"
#include "util-print.h"
#include "util-memcmp.h"
#ifdef __SC_CUDA_SUPPORT__
#include "util-mpm-ac.h"
#endif
/** \todo make it possible to use multiple pattern matcher algorithms next to
each other. */
@@ -221,11 +224,23 @@ uint32_t PacketPatternSearch(DetectEngineThreadCtx *det_ctx, Packet *p)
if (mpm_ctx == NULL)
SCReturnInt(0);
#ifdef __SC_CUDA_SUPPORT__
if (p->cuda_mpm_enabled && p->pkt_src == PKT_SRC_WIRE) {
ret = SCACCudaPacketResultsProcessing(p, mpm_ctx, &det_ctx->pmq);
} else {
ret = mpm_table[mpm_ctx->mpm_type].Search(mpm_ctx,
&det_ctx->mtc,
&det_ctx->pmq,
p->payload,
p->payload_len);
}
#else
ret = mpm_table[mpm_ctx->mpm_type].Search(mpm_ctx,
&det_ctx->mtc,
&det_ctx->pmq,
p->payload,
p->payload_len);
#endif
SCReturnInt(ret);
}

@@ -845,7 +845,11 @@ static uint8_t DetectEngineCtxLoadConf(DetectEngineCtx *de_ctx) {
/* for now, since we still haven't implemented any intelligence into
* understanding the patterns and distributing mpm_ctx across sgh */
if (de_ctx->mpm_matcher == MPM_AC || de_ctx->mpm_matcher == MPM_AC_GFBS ||
#ifdef __SC_CUDA_SUPPORT__
de_ctx->mpm_matcher == MPM_AC_BS || de_ctx->mpm_matcher == MPM_AC_CUDA) {
#else
de_ctx->mpm_matcher == MPM_AC_BS) {
#endif
de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE;
} else {
de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_FULL;
@@ -854,6 +858,15 @@ static uint8_t DetectEngineCtxLoadConf(DetectEngineCtx *de_ctx) {
if (strcmp(sgh_mpm_context, "single") == 0) {
de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE;
} else if (strcmp(sgh_mpm_context, "full") == 0) {
#ifdef __SC_CUDA_SUPPORT__
if (de_ctx->mpm_matcher == MPM_AC_CUDA) {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "You can't use "
"the cuda version of our mpm ac, i.e. \"ac-cuda\" "
"along with \"full\" \"sgh-mpm-context\". "
"Allowed values are \"single\" and \"auto\".");
exit(EXIT_FAILURE);
}
#endif
de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_FULL;
} else {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "You have supplied an "

@@ -190,6 +190,7 @@
#include "util-optimize.h"
#include "util-vector.h"
#include "util-path.h"
#include "util-mpm-ac.h"
#include "runmodes.h"
@@ -1085,6 +1086,12 @@ static inline void DetectMpmPrefilter(DetectEngineCtx *de_ctx,
}
if (p->payload_len > 0 && (!(p->flags & PKT_NOPAYLOAD_INSPECTION))) {
+if (!(p->flags & PKT_STREAM_ADD) && (det_ctx->sgh->flags & SIG_GROUP_HEAD_MPM_STREAM)) {
+*sms_runflags |= SMS_USED_PM;
+PACKET_PROFILING_DETECT_START(p, PROF_DETECT_MPM_PKT_STREAM);
+PacketPatternSearchWithStreamCtx(det_ctx, p);
+PACKET_PROFILING_DETECT_END(p, PROF_DETECT_MPM_PKT_STREAM);
+}
if (det_ctx->sgh->flags & SIG_GROUP_HEAD_MPM_PACKET) {
/* run the multi packet matcher against the payload of the packet */
SCLogDebug("search: (%p, maxlen %" PRIu32 ", sgh->sig_cnt %" PRIu32 ")",
@@ -1096,12 +1103,6 @@ static inline void DetectMpmPrefilter(DetectEngineCtx *de_ctx,
*sms_runflags |= SMS_USED_PM;
}
-if (!(p->flags & PKT_STREAM_ADD) && (det_ctx->sgh->flags & SIG_GROUP_HEAD_MPM_STREAM)) {
-*sms_runflags |= SMS_USED_PM;
-PACKET_PROFILING_DETECT_START(p, PROF_DETECT_MPM_PKT_STREAM);
-PacketPatternSearchWithStreamCtx(det_ctx, p);
-PACKET_PROFILING_DETECT_END(p, PROF_DETECT_MPM_PKT_STREAM);
-}
}
}
@@ -4387,6 +4388,255 @@ int SigAddressPrepareStage5(DetectEngineCtx *de_ctx) {
return 0;
}
#ifdef __SC_CUDA_SUPPORT__
static void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx)
{
MpmCtx *mpm_ctx = NULL;
int ac_16_tables = 0;
int ac_32_tables = 0;
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_other_packet, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_uri, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_uri, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcbd, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcbd, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hhd, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hhd, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrhd, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrhd, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hmd, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hmd, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcd, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcd, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrud, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrud, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_stream, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_stream, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
if (ac_16_tables > 0 && ac_32_tables > 0)
SCACConstructBoth16and32StateTables();
SCLogDebug("Total mpm ac 16 bit state tables - %d\n", ac_16_tables);
SCLogDebug("Total mpm ac 32 bit state tables - %d\n", ac_32_tables);
}
#endif
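DetermineCudaStateTableSize above repeats an identical probe for every mpm context the factory can hand out. The same logic compresses into a table-driven loop; a sketch under the assumption that the probes array simply lists the context-id/direction pairs probed above:

    static void CountCudaStateTables(DetectEngineCtx *de_ctx, int *ac16, int *ac32)
    {
        const struct { int32_t id; int dir; } probes[] = {
            { de_ctx->sgh_mpm_context_proto_tcp_packet, 0 },
            { de_ctx->sgh_mpm_context_proto_tcp_packet, 1 },
            /* ... one entry per context/direction pair probed above ... */
        };
        for (size_t i = 0; i < sizeof(probes) / sizeof(probes[0]); i++) {
            MpmCtx *mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx,
                                                            probes[i].id, probes[i].dir);
            if (mpm_ctx->mpm_type != MPM_AC_CUDA)
                continue;
            if (((SCACCtx *)mpm_ctx->ctx)->state_count < 32767)
                (*ac16)++;
            else
                (*ac32)++;
        }
    }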
/**
* \brief Convert the signature list into the runtime match structure.
*
@@ -4441,6 +4691,26 @@ int SigGroupBuild(DetectEngineCtx *de_ctx)
if (de_ctx->sgh_mpm_context == ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE) {
MpmCtx *mpm_ctx = NULL;
#ifdef __SC_CUDA_SUPPORT__
if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) {
/* set to the default for now; remove this once the state table sizing below is fixed */
SCACConstructBoth16and32StateTables();
MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id);
if (cuda_context == 0) {
SCLogError(SC_ERR_FATAL, "cuda context is NULL.");
exit(EXIT_FAILURE);
}
int r = SCCudaCtxPushCurrent(cuda_context);
if (r < 0) {
SCLogError(SC_ERR_FATAL, "context push failed.");
exit(EXIT_FAILURE);
}
}
#endif
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 0);
if (mpm_table[de_ctx->mpm_matcher].Prepare != NULL) {
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
@@ -4562,7 +4832,6 @@ int SigGroupBuild(DetectEngineCtx *de_ctx)
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
}
//printf("hsmd- %d\n", mpm_ctx->pattern_cnt);
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 1);
if (mpm_table[de_ctx->mpm_matcher].Prepare != NULL) {
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
@@ -4574,7 +4843,6 @@ int SigGroupBuild(DetectEngineCtx *de_ctx)
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
}
//printf("hscd- %d\n", mpm_ctx->pattern_cnt);
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 1);
if (mpm_table[de_ctx->mpm_matcher].Prepare != NULL) {
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
@@ -4586,7 +4854,6 @@ int SigGroupBuild(DetectEngineCtx *de_ctx)
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
}
//printf("huad- %d\n", mpm_ctx->pattern_cnt);
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 1);
if (mpm_table[de_ctx->mpm_matcher].Prepare != NULL) {
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
@@ -4616,6 +4883,21 @@ int SigGroupBuild(DetectEngineCtx *de_ctx)
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
}
//printf("hrhhd- %d\n", mpm_ctx->pattern_cnt);
#ifdef __SC_CUDA_SUPPORT__
if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) {
int r = SCCudaCtxPopCurrent(NULL);
if (r < 0) {
SCLogError(SC_ERR_FATAL, "cuda context pop failure.");
exit(EXIT_FAILURE);
}
}
/* too late to call this either way; it should be called after the ac goto
* table is built. \todo Support this. */
DetermineCudaStateTableSize(de_ctx);
#endif
}
// SigAddressPrepareStage5(de_ctx);

@@ -43,6 +43,19 @@
#include "util-profiling.h"
#include "runmode-unix-socket.h"
#ifdef __SC_CUDA_SUPPORT__
#include "util-cuda.h"
#include "util-cuda-buffer.h"
#include "util-mpm-ac.h"
#include "util-cuda-handlers.h"
#include "detect-engine.h"
#include "detect-engine-mpm.h"
static DetectEngineCtx *cuda_de_ctx = NULL;
#endif /* __SC_CUDA_SUPPORT__ */
extern uint8_t suricata_ctl_flags;
extern int max_pending_packets;
@@ -111,6 +124,15 @@ void TmModuleDecodePcapFileRegister (void) {
tmm_modules[TMM_DECODEPCAPFILE].flags = TM_FLAG_DECODE_TM;
}
#ifdef __SC_CUDA_SUPPORT__
void DecodePcapFileSetCudaDeCtx(DetectEngineCtx *de_ctx)
{
cuda_de_ctx = de_ctx;
return;
}
#endif
void PcapFileCallbackLoop(char *user, struct pcap_pkthdr *h, u_char *pkt) {
SCEnter();
@@ -320,6 +342,90 @@ TmEcode ReceivePcapFileThreadDeinit(ThreadVars *tv, void *data) {
SCReturnInt(TM_ECODE_OK);
}
#ifdef __SC_CUDA_SUPPORT__
static inline void DecodePcapFileBufferPacket(DecodeThreadVars *dtv, Packet *p)
{
if (p->cuda_mpm_enabled) {
while (!p->cuda_done) {
SCMutexLock(&p->cuda_mutex);
if (p->cuda_done) {
SCMutexUnlock(&p->cuda_mutex);
break;
} else {
SCCondWait(&p->cuda_cond, &p->cuda_mutex);
SCMutexUnlock(&p->cuda_mutex);
}
}
}
p->cuda_done = 0;
if (p->payload_len == 0 ||
(p->flags & (PKT_NOPAYLOAD_INSPECTION | PKT_NOPACKET_INSPECTION)) ||
(p->flags & PKT_ALLOC) ||
(dtv->data_buffer_size_min_limit != 0 && p->payload_len < dtv->data_buffer_size_min_limit) ||
(p->payload_len > dtv->data_buffer_size_max_limit && dtv->data_buffer_size_max_limit != 0) ) {
p->cuda_mpm_enabled = 0;
return;
}
MpmCtx *mpm_ctx = NULL;
if (p->proto == IPPROTO_TCP) {
if (p->flowflags & FLOW_PKT_TOSERVER)
mpm_ctx = dtv->mpm_proto_tcp_ctx_ts;
else
mpm_ctx = dtv->mpm_proto_tcp_ctx_tc;
} else if (p->proto == IPPROTO_UDP) {
if (p->flowflags & FLOW_PKT_TOSERVER)
mpm_ctx = dtv->mpm_proto_udp_ctx_ts;
else
mpm_ctx = dtv->mpm_proto_udp_ctx_tc;
} else {
mpm_ctx = dtv->mpm_proto_other_ctx;
}
if (mpm_ctx == NULL || mpm_ctx->pattern_cnt == 0) {
p->cuda_mpm_enabled = 0;
return;
}
#if __WORDSIZE==64
CudaBufferSlice *slice = CudaBufferGetSlice(dtv->cuda_ac_cb,
p->payload_len + sizeof(uint64_t) + sizeof(CUdeviceptr),
(void *)p);
if (slice == NULL) {
SCLogError(SC_ERR_FATAL, "Error retrieving slice. Please report "
"this to dev.");
p->cuda_mpm_enabled = 0;
return;
}
*((uint64_t *)(slice->buffer + slice->start_offset)) = p->payload_len;
*((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint64_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda;
memcpy(slice->buffer + slice->start_offset + sizeof(uint64_t) + sizeof(CUdeviceptr), p->payload, p->payload_len);
#else
CudaBufferSlice *slice = CudaBufferGetSlice(dtv->cuda_ac_cb,
p->payload_len + sizeof(uint32_t) + sizeof(CUdeviceptr),
(void *)p);
if (slice == NULL) {
SCLogError(SC_ERR_FATAL, "Error retrieving slice. Please report "
"this to dev.");
p->cuda_mpm_enabled = 0;
return;
}
*((uint32_t *)(slice->buffer + slice->start_offset)) = p->payload_len;
*((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint32_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda;
memcpy(slice->buffer + slice->start_offset + sizeof(uint32_t) + sizeof(CUdeviceptr), p->payload, p->payload_len);
#endif
p->cuda_mpm_enabled = 1;
SC_ATOMIC_SET(slice->done, 1);
SCLogDebug("cuda ac buffering packet %p, payload_len - %"PRIu16" and deviceptr - %"PRIu64"\n",
p, p->payload_len, (unsigned long)((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda);
return;
}
#endif /* __SC_CUDA_SUPPORT__ */
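Each record DecodePcapFileBufferPacket writes into the shared CudaBuffer slice is self-describing, so the kernel needs no side channel: a payload length word, the device pointer of the state table for this packet's mpm context, then the raw payload. Schematically, for a 64-bit build (the offsets are exactly the ones SCACCudaSearch64 reads back):

    /* record layout inside a CudaBuffer slice, 64-bit build
     *
     *   +0    uint64_t     payload_len
     *   +8    CUdeviceptr  state_table_u32_cuda  (this context's table on the GPU)
     *   +16   uint8_t[]    payload bytes, payload_len of them
     *
     * 32-bit builds use a uint32_t length at +0, the pointer at +4, payload at +8 */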
double prev_signaled_ts = 0;
TmEcode DecodePcapFile(ThreadVars *tv, Packet *p, void *data, PacketQueue *pq, PacketQueue *postpq)
@@ -351,11 +457,49 @@ TmEcode DecodePcapFile(ThreadVars *tv, Packet *p, void *data, PacketQueue *pq, P
TimeSet(&p->ts);
/* call the decoder */
pcap_g.Decoder(tv, dtv, p, GET_PKT_DATA(p), GET_PKT_LEN(p), pq);
#ifdef DEBUG
BUG_ON(p->pkt_src != PKT_SRC_WIRE && p->pkt_src != PKT_SRC_FFR_V2);
#endif
#ifdef __SC_CUDA_SUPPORT__
if (dtv->mpm_is_cuda)
DecodePcapFileBufferPacket(dtv, p);
#endif
SCReturnInt(TM_ECODE_OK);
}
#ifdef __SC_CUDA_SUPPORT__
static int DecodePcapFileThreadInitCuda(DecodeThreadVars *dtv)
{
if (PatternMatchDefaultMatcher() != MPM_AC_CUDA)
return 0;
MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
if (conf == NULL) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile.");
return -1;
}
dtv->mpm_is_cuda = 1;
dtv->cuda_ac_cb = CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME);
dtv->data_buffer_size_max_limit = conf->data_buffer_size_max_limit;
dtv->data_buffer_size_min_limit = conf->data_buffer_size_min_limit;
dtv->mpm_proto_tcp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 0);
dtv->mpm_proto_tcp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 1);
dtv->mpm_proto_udp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 0);
dtv->mpm_proto_udp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 1);
dtv->mpm_proto_other_ctx = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_other_packet, 0);
return 0;
}
#endif /* __SC_CUDA_SUPPORT__ */
TmEcode DecodePcapFileThreadInit(ThreadVars *tv, void *initdata, void **data)
{
SCEnter();
@@ -367,6 +511,11 @@ TmEcode DecodePcapFileThreadInit(ThreadVars *tv, void *initdata, void **data)
DecodeRegisterPerfCounters(dtv, tv);
#ifdef __SC_CUDA_SUPPORT__
if (DecodePcapFileThreadInitCuda(dtv) < 0)
SCReturnInt(TM_ECODE_FAILED);
#endif
*data = (void *)dtv;
SCReturnInt(TM_ECODE_OK);

@@ -26,6 +26,9 @@
void TmModuleReceivePcapFileRegister (void);
void TmModuleDecodePcapFileRegister (void);
#ifdef __SC_CUDA_SUPPORT__
void DecodePcapFileSetCudaDeCtx(DetectEngineCtx *de_ctx);
#endif
#endif /* __SOURCE_PCAP_FILE_H__ */

@@ -188,6 +188,10 @@
#include "util-memcmp.h"
#include "util-proto-name.h"
#include "util-spm-bm.h"
#ifdef __SC_CUDA_SUPPORT__
#include "util-cuda-buffer.h"
#include "util-mpm-ac.h"
#endif
/*
* we put this here, because we only use it here in main.
@@ -1302,6 +1306,7 @@ int main(int argc, char **argv)
SCCudaListCards();
exit(EXIT_SUCCESS);
}
CudaBufferInit();
#endif
if (!CheckValidDaemonModes(daemon, run_mode)) {
@@ -1313,9 +1318,6 @@ int main(int argc, char **argv)
TimeInit();
SupportFastPatternForSigMatchTypes();
-/* load the pattern matchers */
-MpmTableSetup();
if (run_mode != RUNMODE_UNITTEST &&
!list_keywords &&
!list_app_layer_protocols) {
@@ -1363,6 +1365,9 @@ int main(int argc, char **argv)
}
}
+/* load the pattern matchers */
+MpmTableSetup();
AppLayerDetectProtoThreadInit();
if (list_app_layer_protocols) {
AppLayerListSupportedProtocols();
@@ -1703,6 +1708,9 @@ int main(int argc, char **argv)
DetectProtoTests();
DetectPortTests();
SCAtomicRegisterTests();
#ifdef __SC_CUDA_SUPPORT__
CudaBufferRegisterUnittests();
#endif
if (list_unittests) {
UtListTests(regex_arg);
}
@@ -1819,6 +1827,10 @@ int main(int argc, char **argv)
"context failed.");
exit(EXIT_FAILURE);
}
#ifdef __SC_CUDA_SUPPORT__
if (PatternMatchDefaultMatcher() == MPM_AC_CUDA)
DecodePcapFileSetCudaDeCtx(de_ctx);
#endif /* __SC_CUDA_SUPPORT__ */
SCClassConfLoadClassficationConfigFile(de_ctx);
SCRConfLoadReferenceConfigFile(de_ctx);
@@ -1961,6 +1973,11 @@ int main(int argc, char **argv)
SCPerfSpawnThreads();
}
#ifdef __SC_CUDA_SUPPORT__
if (PatternMatchDefaultMatcher() == MPM_AC_CUDA)
SCACCudaStartDispatcher();
#endif
/* Check if the alloted queues have at least 1 reader and writer */
TmValidateQueueState();
@@ -2118,5 +2135,11 @@ int main(int argc, char **argv)
SC_ATOMIC_DESTROY(engine_stage);
#ifdef __SC_CUDA_SUPPORT__
if (PatternMatchDefaultMatcher() == MPM_AC_CUDA)
MpmCudaBufferDeSetup();
CudaHandlerFreeProfiles();
#endif
exit(engine_retval);
}

@@ -0,0 +1,357 @@
/* Copyright (C) 2007-2012 Open Information Security Foundation
*
* You can copy, redistribute or modify this Program under the terms of
* the GNU General Public License version 2 as published by the Free
* Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* version 2 along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
/**
* \file
*
* \author Anoop Saldanha <anoopsaldanha@gmail.com>
*/
/* compiled in only if we have a CUDA-enabled device on the machine, with the
* toolkit and the driver installed */
#ifdef __SC_CUDA_SUPPORT__
#include "suricata-common.h"
#include "util-error.h"
#include "util-debug.h"
#include "conf.h"
#include "util-cuda.h"
#include "util-cuda-handlers.h"
/* file only exists if cuda is enabled */
#include "cuda-ptxdump.h"
/************************conf file profile section**********************/
typedef struct CudaHandlerConfProfile_ {
char *name;
void *ctx;
void (*Free)(void *);
struct CudaHandlerConfProfile_ *next;
} CudaHandlerConfProfile;
static CudaHandlerConfProfile *conf_profiles = NULL;
/* protects above var */
static SCMutex mutex = PTHREAD_MUTEX_INITIALIZER;
void CudaHandlerAddCudaProfileFromConf(const char *name,
void *(*Callback)(ConfNode *node),
void (*Free)(void *))
{
/* we don't do data validation */
SCMutexLock(&mutex);
CudaHandlerConfProfile *tmp_cp = conf_profiles;
while (tmp_cp != NULL && strcasecmp(name, tmp_cp->name) != 0)
tmp_cp = tmp_cp->next;
if (tmp_cp != NULL) {
SCLogError(SC_ERR_INVALID_ARGUMENT, "We already have a cuda conf "
"profile by the name \"%s\" registered.", name);
exit(EXIT_FAILURE);
}
char tmp[200];
int r = snprintf(tmp, sizeof(tmp), "%s%s", "cuda.", name);
if (r < 0) {
SCLogError(SC_ERR_FATAL, "snprintf failure.");
exit(EXIT_FAILURE);
} else if (r >= (int)sizeof(tmp)) {
SCLogError(SC_ERR_FATAL, "buffer not big enough to write param.");
exit(EXIT_FAILURE);
}
void *ctx = Callback(ConfGetNode(tmp));
if (ctx == NULL) {
SCMutexUnlock(&mutex);
return;
}
CudaHandlerConfProfile *new_cp = SCMalloc(sizeof(CudaHandlerConfProfile));
if (new_cp == NULL)
exit(EXIT_FAILURE);
memset(new_cp, 0, sizeof(CudaHandlerConfProfile));
new_cp->name = SCStrdup(name);
if (new_cp->name == NULL)
exit(EXIT_FAILURE);
new_cp->ctx = ctx;
new_cp->Free = Free;
if (conf_profiles == NULL) {
conf_profiles = new_cp;
} else {
new_cp->next = conf_profiles;
conf_profiles = new_cp;
}
SCMutexUnlock(&mutex);
return;
}
void *CudaHandlerGetCudaProfile(const char *name)
{
SCMutexLock(&mutex);
CudaHandlerConfProfile *tmp_cp = conf_profiles;
while (tmp_cp != NULL && strcasecmp(name, tmp_cp->name) != 0)
tmp_cp = tmp_cp->next;
if (tmp_cp == NULL) {
SCMutexUnlock(&mutex);
return NULL;
}
SCMutexUnlock(&mutex);
return tmp_cp->ctx;
}
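A hedged usage sketch of the profile API above: a subsystem registers a parser for its cuda.<name> conf subtree once at startup, after which any thread can fetch the parsed profile by name. The callback names here are stand-ins, not the actual mpm code:

    /* registration at startup; MpmCudaConfParse/MpmCudaConfFree are illustrative */
    CudaHandlerAddCudaProfileFromConf("mpm", MpmCudaConfParse, MpmCudaConfFree);

    /* later, from any thread */
    MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
    if (conf == NULL) {
        /* no cuda.mpm node was present, or the parse callback returned NULL */
    }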
void CudaHandlerFreeProfiles(void)
{
SCMutexLock(&mutex);
CudaHandlerConfProfile *tmp = conf_profiles;
while (tmp != NULL) {
CudaHandlerConfProfile *curr = tmp;
tmp = tmp->next;
SCFree(curr->name);
if (curr->Free != NULL)
curr->Free(curr->ctx);
SCFree(curr);
}
SCMutexUnlock(&mutex);
return;
}
/*******************cuda context related data section*******************/
/* we use a concept where every cuda device has only 1 context. If
* a section in the engine wants to use a device and tries to open a context
* on it, we first check if a context is already created for the device and if
* so we return it. If not we create a new one and update with the entry */
static CUcontext *cuda_contexts = NULL;
static int no_of_cuda_contexts = 0;
typedef struct CudaHandlerModuleData_ {
char *name;
void *data;
struct CudaHandlerModuleData_ *next;
} CudaHandlerModuleData;
typedef struct CudaHandlerModule_ {
char *name;
/* the context used by this module */
CUcontext context;
/* the device on which the above context was created */
int device_id;
CudaHandlerModuleData *module_data;
struct CudaHandlerModule_ *next;
} CudaHandlerModule;
static CudaHandlerModule *cudahl_modules = NULL;
CUcontext CudaHandlerModuleGetContext(const char *name, int device_id)
{
SCMutexLock(&mutex);
CudaHandlerModule *module = cudahl_modules;
while (module != NULL && strcasecmp(module->name, name) != 0)
module = module->next;
if (module != NULL) {
if (module->device_id != device_id) {
SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Module already "
"registered, but the new device_id is different "
"from the already registered device_id.");
exit(EXIT_FAILURE);
}
SCMutexUnlock(&mutex);
return module->context;
}
CudaHandlerModule *new_module = SCMalloc(sizeof(CudaHandlerModule));
if (new_module == NULL)
exit(EXIT_FAILURE);
memset(new_module, 0, sizeof(CudaHandlerModule));
new_module->device_id = device_id;
new_module->name = SCStrdup(name);
if (new_module->name == NULL)
exit(EXIT_FAILURE);
if (cudahl_modules == NULL) {
cudahl_modules = new_module;
} else {
new_module->next = cudahl_modules;
cudahl_modules = new_module;
}
if (no_of_cuda_contexts <= device_id) {
cuda_contexts = SCRealloc(cuda_contexts, sizeof(CUcontext) * (device_id + 1));
if (cuda_contexts == NULL)
exit(EXIT_FAILURE);
memset(cuda_contexts + no_of_cuda_contexts, 0,
sizeof(CUcontext) * ((device_id + 1) - no_of_cuda_contexts));
no_of_cuda_contexts = device_id + 1;
}
if (cuda_contexts[device_id] == 0) {
SCCudaDevices *devices = SCCudaGetDeviceList();
if (SCCudaCtxCreate(&cuda_contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
devices->devices[device_id]->device) == -1) {
SCLogDebug("ctxcreate failure.");
exit(EXIT_FAILURE);
}
}
new_module->context = cuda_contexts[device_id];
SCMutexUnlock(&mutex);
return cuda_contexts[device_id];
}
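The invariant CudaHandlerModuleGetContext maintains: one CUcontext per physical device, shared by every module that asks for that device, so the engine never holds duplicate contexts. For example (the second module name is hypothetical):

    CUcontext c1 = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, 0);
    CUcontext c2 = CudaHandlerModuleGetContext("another_module", 0);
    /* c1 == c2 -- only the first call actually created a context on device 0 */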
void CudaHandlerModuleStoreData(const char *module_name,
const char *data_name, void *data_ptr)
{
SCMutexLock(&mutex);
CudaHandlerModule *module = cudahl_modules;
while (module != NULL && strcasecmp(module->name, module_name) != 0)
module = module->next;
if (module == NULL) {
SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Trying to retrieve data "
"\"%s\" from module \"%s\" that hasn't been registered "
"yet.", module_name, data_name);
exit(EXIT_FAILURE);
}
CudaHandlerModuleData *data = module->module_data;
while (data != NULL && (strcasecmp(data_name, data->name) != 0)) {
data = data->next;
}
if (data != NULL) {
SCLogWarning(SC_ERR_CUDA_HANDLER_ERROR, "Data \"%s\" already "
"registered for this module \"%s\".", data_name,
module_name);
SCMutexUnlock(&mutex);
goto end;
}
CudaHandlerModuleData *new_data = SCMalloc(sizeof(CudaHandlerModuleData));
if (new_data == NULL)
exit(EXIT_FAILURE);
memset(new_data, 0, sizeof(CudaHandlerModuleData));
new_data->name = SCStrdup(data_name);
if (new_data->name == NULL)
exit(EXIT_FAILURE);
new_data->data = data_ptr;
if (module->module_data == NULL) {
module->module_data = new_data;
} else {
new_data->next = module->module_data;
module->module_data = new_data;
}
SCMutexUnlock(&mutex);
end:
return;
}
void *CudaHandlerModuleGetData(const char *module_name, const char *data_name)
{
SCMutexLock(&mutex);
CudaHandlerModule *module = cudahl_modules;
while (module != NULL && strcasecmp(module->name, module_name) != 0)
module = module->next;
if (module == NULL) {
SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Trying to retrieve data "
"\"%s\" from module \"%s\" that hasn't been registered "
"yet.", module_name, data_name);
SCMutexUnlock(&mutex);
return NULL;
}
CudaHandlerModuleData *data = module->module_data;
while (data != NULL && (strcasecmp(data_name, data->name) != 0)) {
data = data->next;
}
if (data == NULL) {
SCLogInfo("Data \"%s\" already registered for this module \"%s\". "
"Returning it.", data_name, module_name);
SCMutexUnlock(&mutex);
return NULL;
}
SCMutexUnlock(&mutex);
return data->data;
}
int CudaHandlerGetCudaModule(CUmodule *p_module, const char *ptx_image)
{
#define CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE 15
int i = 0;
/* select the ptx image based on the compute capability supported by all
* devices (i.e. the lowest) */
char *image = SCMalloc(strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE);
if (unlikely(image == NULL)) {
exit(EXIT_FAILURE);
}
memset(image, 0x00, strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE);
int major = INT_MAX;
int minor = INT_MAX;
SCCudaDevices *devices = SCCudaGetDeviceList();
for (i = 0; i < devices->count; i++){
if (devices->devices[i]->major_rev < major){
major = devices->devices[i]->major_rev;
minor = devices->devices[i]->minor_rev;
}
if (devices->devices[i]->major_rev == major &&
devices->devices[i]->minor_rev < minor){
minor = devices->devices[i]->minor_rev;
}
}
snprintf(image,
strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE,
"%s_sm_%u%u",
ptx_image, major, minor);
/* we don't have a cuda module associated with this module. Create a
* cuda module, update the module with this cuda module reference and
* then return the module reference back to the calling function using
* the argument */
SCLogDebug("Loading kernel module: %s\n", image);
if (SCCudaModuleLoadData(p_module, (void *)SCCudaPtxDumpGetModule(image)) == -1)
goto error;
SCFree(image);
return 0;
error:
SCFree(image);
return -1;
#undef CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE
}
#endif /* __SC_CUDA_SUPPORT__ */

@@ -0,0 +1,50 @@
/* Copyright (C) 2007-2012 Open Information Security Foundation
*
* You can copy, redistribute or modify this Program under the terms of
* the GNU General Public License version 2 as published by the Free
* Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* version 2 along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
/**
* \file
*
* \author Anoop Saldanha <anoopsaldanha@gmail.com>
*/
#ifndef __UTIL_CUDA_HANDLERS__H__
#define __UTIL_CUDA_HANDLERS__H__
#include "conf.h"
#include "util-cuda.h"
/************************conf file profile section**********************/
void CudaHandlerAddCudaProfileFromConf(const char *name,
void *(*Callback)(ConfNode *node),
void (*Free)(void *));
void *CudaHandlerGetCudaProfile(const char *name);
void CudaHandlerFreeProfiles(void);
/*******************cuda context related data section*******************/
#define CUDA_HANDLER_MODULE_DATA_TYPE_MEMORY_HOST 0
#define CUDA_HANDLER_MODULE_DATA_TYPE_MEMORY_DEVICE 1
#define CUDA_HANDLER_MODULE_DATA_TYPE_CUDA_BUFFER 2
CUcontext CudaHandlerModuleGetContext(const char *module_name, int device_id);
void CudaHandlerModuleStoreData(const char *module_name,
const char *data_name, void *data_ptr);
void *CudaHandlerModuleGetData(const char *module_name, const char *data_name);
int CudaHandlerGetCudaModule(CUmodule *p_module, const char *ptx_image);
#endif /* __UTIL_CUDA_HANDLERS__H__ */

@@ -2346,7 +2346,7 @@ int SCCudaMemAllocHost(void **pp, size_t byte_size)
}
result = cuMemAllocHost(pp, byte_size);
-if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC) == -1)
+if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC_HOST) == -1)
goto error;
return 0;

@@ -161,7 +161,7 @@ const char * SCErrorToString(SCError err)
CASE_CODE (SC_ERR_CUDA_HANDLER_ERROR);
CASE_CODE (SC_ERR_TM_THREADS_ERROR);
CASE_CODE (SC_ERR_TM_MODULES_ERROR);
-CASE_CODE (SC_ERR_B2G_CUDA_ERROR);
+CASE_CODE (SC_ERR_AC_CUDA_ERROR);
CASE_CODE (SC_ERR_INVALID_YAML_CONF_ENTRY);
CASE_CODE (SC_ERR_TMQ_ALREADY_REGISTERED);
CASE_CODE (SC_ERR_CONFLICTING_RULE_KEYWORDS);

@@ -165,7 +165,7 @@ typedef enum {
SC_ERR_CUDA_HANDLER_ERROR,
SC_ERR_TM_THREADS_ERROR,
SC_ERR_TM_MODULES_ERROR,
-SC_ERR_B2G_CUDA_ERROR,
+SC_ERR_AC_CUDA_ERROR,
SC_ERR_INVALID_YAML_CONF_ENTRY,
SC_ERR_TMQ_ALREADY_REGISTERED,
SC_ERR_CONFLICTING_RULE_KEYWORDS,

@@ -0,0 +1,96 @@
/* Copyright (C) 2007-2012 Open Information Security Foundation
*
* You can copy, redistribute or modify this Program under the terms of
* the GNU General Public License version 2 as published by the Free
* Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* version 2 along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
/**
* \file
*
* \author Anoop Saldanha <anoopsaldanha@gmail.com>
*
* The Cuda kernel for MPM AC.
*
* \todo - This is a basic version of the kernel.
* - Support 16 bit state tables.
* - Texture memory.
* - Multiple threads per block. Make use of
* shared memory/texture memory.
*/
extern "C"
__global__ void SCACCudaSearch64(unsigned char *d_buffer,
unsigned int d_buffer_start_offset,
unsigned int *o_buffer,
unsigned int *results_buffer,
unsigned int nop,
unsigned char *tolower)
{
unsigned int u = 0;
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= nop)
return;
unsigned int buflen = *((unsigned long *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset)));
unsigned int (*state_table_u32)[256] =
(unsigned int (*)[256])*((unsigned long *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 8));
unsigned char *buf = (d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 16);
unsigned int state = 0;
unsigned int matches = 0;
unsigned int *results = (results_buffer + ((o_buffer[tid] - d_buffer_start_offset) * 2) + 1);
for (u = 0; u < buflen; u++) {
state = state_table_u32[state & 0x00FFFFFF][tolower[buf[u]]];
if (state & 0xFF000000) {
results[matches++] = u;
results[matches++] = state & 0x00FFFFFF;
}
}
*(results - 1) = matches;
return;
}
extern "C"
__global__ void SCACCudaSearch32(unsigned char *d_buffer,
unsigned int d_buffer_start_offset,
unsigned int *o_buffer,
unsigned int *results_buffer,
unsigned int nop,
unsigned char *tolower)
{
unsigned int u = 0;
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= nop)
return;
unsigned int buflen = *((unsigned int *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset)));
unsigned int (*state_table_u32)[256] =
(unsigned int (*)[256])*((unsigned int *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 4));
unsigned char *buf = (d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 8);
unsigned int state = 0;
unsigned int matches = 0;
unsigned int *results = (results_buffer + ((o_buffer[tid] - d_buffer_start_offset) * 2) + 1);
for (u = 0; u < buflen; u++) {
state = state_table_u32[state & 0x00FFFFFF][tolower[buf[u]]];
if (state & 0xFF000000) {
results[matches++] = u;
results[matches++] = state & 0x00FFFFFF;
}
}
*(results - 1) = matches;
return;
}
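Both kernels rely on the state-table encoding prepared on the host: each 32-bit entry keeps the next-state index in its low 24 bits, and a non-zero top byte flags "a pattern ends in this state" (the 16-bit tables use bit 15 the same way; see SCACClubOutputStatePresenceWithDeltaTable below). One transition, spelled out; tolower_tbl stands for the 256-entry lowercase table the dispatcher uploads so the GPU never calls into libc:

    /* one AC step, exactly as the kernels perform it */
    state = state_table_u32[state & 0x00FFFFFF][tolower_tbl[buf[u]]];
    if (state & 0xFF000000) {                      /* flag byte set: output state */
        results[matches++] = u;                    /* payload offset of the match */
        results[matches++] = state & 0x00FFFFFF;   /* the state, flag stripped    */
    }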

@@ -49,12 +49,22 @@
#include "suricata.h"
#include "detect.h"
#include "util-mpm-ac.h"
#include "conf.h"
#include "util-debug.h"
#include "util-unittest.h"
#include "util-memcmp.h"
#include "util-mpm-ac.h"
#ifdef __SC_CUDA_SUPPORT__
#include "util-mpm.h"
#include "tm-threads.h"
#include "util-mpm.h"
#include "detect-engine-mpm.h"
#include "util-cuda.h"
#include "util-cuda-handlers.h"
#endif /* __SC_CUDA_SUPPORT__ */
void SCACInitCtx(MpmCtx *, int);
void SCACInitThreadCtx(MpmCtx *, MpmThreadCtx *, uint32_t);
@@ -78,6 +88,8 @@ void SCACRegisterTests(void);
#define STATE_QUEUE_CONTAINER_SIZE 65536
static int construct_both_16_and_32_state_tables = 0;
/**
* \brief Helper structure used by AC during state table creation
*/
@@ -87,31 +99,6 @@ typedef struct StateQueue_ {
int bot;
} StateQueue;
-/**
- * \brief Register the aho-corasick mpm.
- */
-void MpmACRegister(void)
-{
-mpm_table[MPM_AC].name = "ac";
-/* don't need this. isn't that awesome? no more chopping and blah blah */
-mpm_table[MPM_AC].max_pattern_length = 0;
-mpm_table[MPM_AC].InitCtx = SCACInitCtx;
-mpm_table[MPM_AC].InitThreadCtx = SCACInitThreadCtx;
-mpm_table[MPM_AC].DestroyCtx = SCACDestroyCtx;
-mpm_table[MPM_AC].DestroyThreadCtx = SCACDestroyThreadCtx;
-mpm_table[MPM_AC].AddPattern = SCACAddPatternCS;
-mpm_table[MPM_AC].AddPatternNocase = SCACAddPatternCI;
-mpm_table[MPM_AC].Prepare = SCACPreparePatterns;
-mpm_table[MPM_AC].Search = SCACSearch;
-mpm_table[MPM_AC].Cleanup = NULL;
-mpm_table[MPM_AC].PrintCtx = SCACPrintInfo;
-mpm_table[MPM_AC].PrintThreadCtx = SCACPrintSearchStats;
-mpm_table[MPM_AC].RegisterUnittests = SCACRegisterTests;
-return;
-}
/**
* \internal
* \brief Initialize the AC context with user specified conf parameters. We
@@ -759,7 +746,7 @@ static inline void SCACCreateDeltaTable(MpmCtx *mpm_ctx)
int ascii_code = 0;
int32_t r_state = 0;
-if (ctx->state_count < 32767) {
+if ((ctx->state_count < 32767) || construct_both_16_and_32_state_tables) {
ctx->state_table_u16 = SCMalloc(ctx->state_count *
sizeof(SC_AC_STATE_TYPE_U16) * 256);
if (ctx->state_table_u16 == NULL) {
@@ -797,7 +784,9 @@ static inline void SCACCreateDeltaTable(MpmCtx *mpm_ctx)
}
}
}
-} else {
+}
+if (!(ctx->state_count < 32767) || construct_both_16_and_32_state_tables) {
/* create space for the state table. We could have used the existing goto
* table, but since we have it set to hold 32 bit state values, we will create
* a new state table here of type SC_AC_STATE_TYPE(current set to uint16_t) */
@@ -850,7 +839,7 @@ static inline void SCACClubOutputStatePresenceWithDeltaTable(MpmCtx *mpm_ctx)
uint32_t state = 0;
uint32_t temp_state = 0;
-if (ctx->state_count < 32767) {
+if ((ctx->state_count < 32767) || construct_both_16_and_32_state_tables) {
for (state = 0; state < ctx->state_count; state++) {
for (ascii_code = 0; ascii_code < 256; ascii_code++) {
temp_state = ctx->state_table_u16[state & 0x7FFF][ascii_code];
@@ -858,7 +847,9 @@ static inline void SCACClubOutputStatePresenceWithDeltaTable(MpmCtx *mpm_ctx)
ctx->state_table_u16[state & 0x7FFF][ascii_code] |= (1 << 15);
}
}
-} else {
+}
+if (!(ctx->state_count < 32767) || construct_both_16_and_32_state_tables) {
for (state = 0; state < ctx->state_count; state++) {
for (ascii_code = 0; ascii_code < 256; ascii_code++) {
temp_state = ctx->state_table_u32[state & 0x00FFFFFF][ascii_code];
@@ -1032,6 +1023,25 @@ int SCACPreparePatterns(MpmCtx *mpm_ctx)
/* prepare the state table required by AC */
SCACPrepareStateTable(mpm_ctx);
#ifdef __SC_CUDA_SUPPORT__
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
int r = SCCudaMemAlloc(&ctx->state_table_u32_cuda,
ctx->state_count * sizeof(unsigned int) * 256);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
exit(EXIT_FAILURE);
}
r = SCCudaMemcpyHtoD(ctx->state_table_u32_cuda,
ctx->state_table_u32,
ctx->state_count * sizeof(unsigned int) * 256);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure.");
exit(EXIT_FAILURE);
}
}
#endif
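/* for scale: the device table just copied is state_count * 256 *
 * sizeof(unsigned int) bytes, so e.g. a 10,000-state table costs
 * 10,000 * 256 * 4 = 10,240,000 bytes (~10 MB) of GPU memory --
 * and that is per MPM_AC_CUDA context */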
/* free all the stored patterns. Should save us a good 100-200 mbs */
for (i = 0; i < mpm_ctx->pattern_cnt; i++) {
if (ctx->parray[i] != NULL) {
@@ -1166,7 +1176,8 @@ void SCACDestroyCtx(MpmCtx *mpm_ctx)
mpm_ctx->memory_cnt++;
mpm_ctx->memory_size -= (ctx->state_count *
sizeof(SC_AC_STATE_TYPE_U16) * 256);
-} else if (ctx->state_table_u32 != NULL) {
+}
+if (ctx->state_table_u32 != NULL) {
SCFree(ctx->state_table_u32);
ctx->state_table_u32 = NULL;
@@ -1394,6 +1405,533 @@ void SCACPrintInfo(MpmCtx *mpm_ctx)
return;
}
/****************************Cuda side of things****************************/
#ifdef __SC_CUDA_SUPPORT__
/* \todos
* - Use texture memory - Can we fit all the arrays into a 3d texture.
* Texture memory definitely offers slightly better performance even
* on gpus that offer cache for global memory.
* - Packetpool - modify to support > 65k max pending packets. We are
* hitting packetpool limit currently even with 65k packets.
* - Use streams. We have tried overlapping parsing results from the
* previous call with invoking the next call.
* - Offer higher priority to decode threads.
* - Modify pcap file mode to support reading from multiple pcap files
* and hence we will have multiple receive threads.
* - Split state table into many small pieces and have multiple threads
* run each small state table on the same payload.
* - Used a config preference of l1 over shared memory with no noticeable
* perf increase. Explore it in detail over cards/architectures.
* - Constant memory performance sucked. Explore it in detail.
* - Currently all our state tables are small. Implement 16 bit state
* tables on priority.
* - Introduce profiling.
* - Retrieve sgh before buffer packet.
* - Buffer smsgs too.
*/
void SCACConstructBoth16and32StateTables(void)
{
construct_both_16_and_32_state_tables = 1;
return;
}
/* \todo Reduce offset buffer size. Probably 100,000 entries would be sufficient. */
static void *SCACCudaDispatcher(void *arg)
{
#define BLOCK_SIZE 32
int r = 0;
ThreadVars *tv = (ThreadVars *)arg;
MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
uint32_t sleep_interval_ms = conf->batching_timeout;
SCLogInfo("AC Cuda Mpm Dispatcher using a timeout of "
"\"%"PRIu32"\" micro-seconds", sleep_interval_ms);
CudaBufferData *cb_data =
CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME,
MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME);
CUcontext cuda_context =
CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id);
if (cuda_context == 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "context is NULL.");
exit(EXIT_FAILURE);
}
r = SCCudaCtxPushCurrent(cuda_context);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "context push failed.");
exit(EXIT_FAILURE);
}
CUmodule cuda_module = 0;
if (CudaHandlerGetCudaModule(&cuda_module, "util-mpm-ac-cuda-kernel") < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda module.");
exit(EXIT_FAILURE);
}
CUfunction kernel = 0;
#if __WORDSIZE==64
if (SCCudaModuleGetFunction(&kernel, cuda_module, "SCACCudaSearch64") == -1) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving kernel");
exit(EXIT_FAILURE);
}
#else
if (SCCudaModuleGetFunction(&kernel, cuda_module, "SCACCudaSearch32") == -1) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving kernel");
exit(EXIT_FAILURE);
}
#endif
uint8_t g_u8_lowercasetable[256];
for (int c = 0; c < 256; c++)
g_u8_lowercasetable[c] = tolower(c);
CUdeviceptr cuda_g_u8_lowercasetable_d = 0;
CUdeviceptr cuda_packets_buffer_d = 0;
CUdeviceptr cuda_offset_buffer_d = 0;
CUdeviceptr cuda_results_buffer_d = 0;
uint32_t *cuda_results_buffer_h = NULL;
r = SCCudaMemAlloc(&cuda_g_u8_lowercasetable_d, sizeof(g_u8_lowercasetable));
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
exit(EXIT_FAILURE);
}
r = SCCudaMemcpyHtoD(cuda_g_u8_lowercasetable_d, g_u8_lowercasetable, sizeof(g_u8_lowercasetable));
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure.");
exit(EXIT_FAILURE);
}
r = SCCudaMemAlloc(&cuda_packets_buffer_d, conf->gpu_transfer_size);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
exit(EXIT_FAILURE);
}
r = SCCudaMemAlloc(&cuda_offset_buffer_d, conf->gpu_transfer_size * 4);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
exit(EXIT_FAILURE);
}
r = SCCudaMemAlloc(&cuda_results_buffer_d, conf->gpu_transfer_size * 8);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
exit(EXIT_FAILURE);
}
r = SCCudaMemAllocHost((void **)&cuda_results_buffer_h, conf->gpu_transfer_size * 8);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
exit(EXIT_FAILURE);
}
CudaBufferCulledInfo cb_culled_info;
memset(&cb_culled_info, 0, sizeof(cb_culled_info));
TmThreadsSetFlag(tv, THV_INIT_DONE);
while (1) {
if (TmThreadsCheckFlag(tv, THV_KILL))
break;
usleep(sleep_interval_ms);
/**************** 1 SEND ****************/
CudaBufferCullCompletedSlices(cb_data, &cb_culled_info);
if (cb_culled_info.no_of_items == 0)
continue;
#if 0
SCLogInfo("1 - cb_culled_info.no_of_items-%"PRIu32" "
"cb_culled_info.buffer_len - %"PRIu32" "
"cb_culled_info.average size - %f "
"cb_culled_info.d_buffer_start_offset - %"PRIu32" "
"cb_culled_info.op_buffer_start_offset - %"PRIu32" "
"cb_data.no_of_items - %"PRIu32" "
"cb_data.d_buffer_read - %"PRIu32" "
"cb_data.d_buffer_write - %"PRIu32" "
"cb_data.op_buffer_read - %"PRIu32" "
"cb_data.op_buffer_write - %"PRIu32"\n",
cb_culled_info.no_of_items,
cb_culled_info.d_buffer_len,
cb_culled_info.d_buffer_len / (float)cb_culled_info.no_of_items,
cb_culled_info.d_buffer_start_offset,
cb_culled_info.op_buffer_start_offset,
cb_data->no_of_items,
cb_data->d_buffer_read,
cb_data->d_buffer_write,
cb_data->op_buffer_read,
cb_data->op_buffer_write);
#endif
r = SCCudaMemcpyHtoD(cuda_packets_buffer_d, (cb_data->d_buffer + cb_culled_info.d_buffer_start_offset), cb_culled_info.d_buffer_len);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure.");
exit(EXIT_FAILURE);
}
r = SCCudaMemcpyHtoD(cuda_offset_buffer_d, (cb_data->o_buffer + cb_culled_info.op_buffer_start_offset), sizeof(uint32_t) * cb_culled_info.no_of_items);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure.");
exit(EXIT_FAILURE);
}
void *args[] = { &cuda_packets_buffer_d,
&cb_culled_info.d_buffer_start_offset,
&cuda_offset_buffer_d,
&cuda_results_buffer_d,
&cb_culled_info.no_of_items,
&cuda_g_u8_lowercasetable_d };
r = SCCudaLaunchKernel(kernel,
(cb_culled_info.no_of_items / BLOCK_SIZE) + 1, 1, 1,
BLOCK_SIZE, 1, 1,
0, 0,
args, NULL);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaLaunchKernel failure.");
exit(EXIT_FAILURE);
}
r = SCCudaMemcpyDtoH(cuda_results_buffer_h, cuda_results_buffer_d, sizeof(uint32_t) * (cb_culled_info.d_buffer_len * 2));
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyDtoH failure.");
exit(EXIT_FAILURE);
}
/**************** 1 SYNCHRO ****************/
r = SCCudaCtxSynchronize();
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaCtxSynchronize failure.");
exit(EXIT_FAILURE);
}
/************* 1 Parse Results ************/
uint32_t i_op_start_offset = cb_culled_info.op_buffer_start_offset;
uint32_t no_of_items = cb_culled_info.no_of_items;
uint32_t *o_buffer = cb_data->o_buffer;
uint32_t d_buffer_start_offset = cb_culled_info.d_buffer_start_offset;
for (uint32_t i = 0; i < no_of_items; i++, i_op_start_offset++) {
Packet *p = (Packet *)cb_data->p_buffer[i_op_start_offset];
p->cuda_gpu_matches =
cuda_results_buffer_h[((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2)];
if (p->cuda_gpu_matches != 0) {
memcpy(p->cuda_results,
cuda_results_buffer_h +
((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2),
(cuda_results_buffer_h[((o_buffer[i_op_start_offset] -
d_buffer_start_offset) * 2)] * sizeof(uint32_t)) + 4);
}
SCMutexLock(&p->cuda_mutex);
p->cuda_done = 1;
SCMutexUnlock(&p->cuda_mutex);
SCCondSignal(&p->cuda_cond);
}
if (no_of_items != 0)
CudaBufferReportCulledConsumption(cb_data, &cb_culled_info);
} /* while (1) */
r = SCCudaModuleUnload(cuda_module);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error unloading cuda module.");
exit(EXIT_FAILURE);
}
r = SCCudaMemFree(cuda_packets_buffer_d);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory.");
exit(EXIT_FAILURE);
}
r = SCCudaMemFree(cuda_offset_buffer_d);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory.");
exit(EXIT_FAILURE);
}
r = SCCudaMemFree(cuda_results_buffer_d);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory.");
exit(EXIT_FAILURE);
}
r = SCCudaMemFreeHost(cuda_results_buffer_h);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory.");
exit(EXIT_FAILURE);
}
TmThreadsSetFlag(tv, THV_RUNNING_DONE);
TmThreadWaitForFlag(tv, THV_DEINIT);
TmThreadsSetFlag(tv, THV_CLOSED);
return NULL;
#undef BLOCK_SIZE
}
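Two numbers in the dispatcher above deserve a gloss. The launch is one GPU thread per culled packet in blocks of BLOCK_SIZE (32): a batch of, say, 1000 packets becomes (1000 / 32) + 1 = 32 blocks of 32 threads, with the kernel's tid >= nop guard idling the surplus threads of the last block. And the batching window is simply the usleep of conf->batching_timeout microseconds between culls, trading a little per-packet latency for larger, better-amortized host-to-device transfers.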
uint32_t SCACCudaPacketResultsProcessing(Packet *p, MpmCtx *mpm_ctx,
PatternMatcherQueue *pmq)
{
uint32_t u = 0;
while (!p->cuda_done) {
SCMutexLock(&p->cuda_mutex);
if (p->cuda_done) {
SCMutexUnlock(&p->cuda_mutex);
break;
} else {
SCCondWait(&p->cuda_cond, &p->cuda_mutex);
SCMutexUnlock(&p->cuda_mutex);
}
} /* while */
p->cuda_done = 0;
p->cuda_mpm_enabled = 0;
uint32_t cuda_matches = p->cuda_gpu_matches;
if (cuda_matches == 0)
return 0;
uint32_t matches = 0;
uint32_t *results = p->cuda_results + 1;
uint8_t *buf = p->payload;
SCACCtx *ctx = mpm_ctx->ctx;
SCACOutputTable *output_table = ctx->output_table;
SCACPatternList *pid_pat_list = ctx->pid_pat_list;
for (u = 0; u < cuda_matches; u += 2) {
uint32_t offset = results[u];
uint32_t state = results[u + 1];
/* we should technically be doing state & 0x00FFFFFF, but we don't
* since the cuda kernel does that for us */
uint32_t no_of_entries = output_table[state].no_of_entries;
/* we should technically be doing state & 0x00FFFFFF, but we don't
* since the cuda kernel does that for us */
uint32_t *pids = output_table[state].pids;
uint32_t k;
/* note that this is not a verbatim copy from SCACSearch(). We
* don't copy the pattern id into the pattern_id_array. That's
* the only change */
for (k = 0; k < no_of_entries; k++) {
if (pids[k] & 0xFFFF0000) {
if (SCMemcmp(pid_pat_list[pids[k] & 0x0000FFFF].cs,
buf + offset - pid_pat_list[pids[k] & 0x0000FFFF].patlen + 1,
pid_pat_list[pids[k] & 0x0000FFFF].patlen) != 0) {
/* inside loop */
if (pid_pat_list[pids[k] & 0x0000FFFF].case_state != 3) {
continue;
}
}
if (pmq->pattern_id_bitarray[(pids[k] & 0x0000FFFF) / 8] & (1 << ((pids[k] & 0x0000FFFF) % 8))) {
;
} else {
pmq->pattern_id_bitarray[(pids[k] & 0x0000FFFF) / 8] |= (1 << ((pids[k] & 0x0000FFFF) % 8));
}
matches++;
} else {
if (pmq->pattern_id_bitarray[pids[k] / 8] & (1 << (pids[k] % 8))) {
;
} else {
pmq->pattern_id_bitarray[pids[k] / 8] |= (1 << (pids[k] % 8));
}
matches++;
}
}
}
return matches;
}
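The handoff between the dispatcher and this function (and DecodePcapFileBufferPacket in source-pcap-file.c) is a standard condition-variable protocol: the waiter re-checks cuda_done under the mutex before sleeping, and the dispatcher sets the flag under that same mutex before signalling, so a wakeup cannot slip between the check and the wait. In miniature:

    /* waiter (detect thread)                  dispatcher (cuda thread)
     *
     *   lock(cuda_mutex)                        lock(cuda_mutex)
     *   while (!cuda_done)                      cuda_done = 1
     *       cond_wait(cuda_cond, cuda_mutex)    unlock(cuda_mutex)
     *   unlock(cuda_mutex)                      cond_signal(cuda_cond)
     */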
void SCACCudaStartDispatcher(void)
{
/* create the threads */
ThreadVars *tv = TmThreadCreate("Cuda_Mpm_AC_Dispatcher",
NULL, NULL,
NULL, NULL,
"custom", SCACCudaDispatcher, 0);
if (tv == NULL) {
SCLogError(SC_ERR_THREAD_CREATE, "Error creating a thread for "
"ac cuda dispatcher. Killing engine.");
exit(EXIT_FAILURE);
}
if (TmThreadSpawn(tv) != 0) {
SCLogError(SC_ERR_THREAD_SPAWN, "Failed to spawn thread for "
"ac cuda dispatcher. Killing engine.");
exit(EXIT_FAILURE);
}
return;
}
int MpmCudaBufferSetup(void)
{
int r = 0;
MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
if (conf == NULL) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile.");
return -1;
}
CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id);
if (cuda_context == 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda context.");
return -1;
}
r = SCCudaCtxPushCurrent(cuda_context);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error pushing cuda context.");
return -1;
}
uint8_t *d_buffer = NULL;
uint32_t *o_buffer = NULL;
void **p_buffer = NULL;
r = SCCudaMemAllocHost((void *)&d_buffer, conf->cb_buffer_size);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failure.");
return -1;
}
SCLogInfo("Allocated a cuda d_buffer - %"PRIu32" bytes", conf->cb_buffer_size);
r = SCCudaMemAllocHost((void *)&o_buffer, sizeof(uint32_t) * UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failue.");
return -1;
}
r = SCCudaMemAllocHost((void *)&p_buffer, sizeof(void *) * UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failure.");
return -1;
}
r = SCCudaCtxPopCurrent(NULL);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "cuda context pop failure.");
return -1;
}
CudaBufferData *cb = CudaBufferRegisterNew(d_buffer, conf->cb_buffer_size, o_buffer, p_buffer, UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT);
if (cb == NULL) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error registering new cb instance.");
return -1;
}
CudaHandlerModuleStoreData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME, cb);
return 0;
}
int MpmCudaBufferDeSetup(void)
{
int r = 0;
MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
if (conf == NULL) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile.");
return -1;
}
CudaBufferData *cb_data = CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME);
BUG_ON(cb_data == NULL);
CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id);
if (cuda_context == 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda context.");
return -1;
}
r = SCCudaCtxPushCurrent(cuda_context);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error pushing cuda context.");
return -1;
}
r = SCCudaMemFreeHost(cb_data->d_buffer);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory.");
return -1;
}
r = SCCudaMemFreeHost(cb_data->o_buffer);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory.");
return -1;
}
r = SCCudaMemFreeHost(cb_data->p_buffer);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory.");
return -1;
}
r = SCCudaCtxPopCurrent(NULL);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "cuda context pop failure.");
return -1;
}
CudaBufferDeRegister(cb_data);
return 0;
}
#endif /* __SC_CUDA_SUPPORT__ */
/************************** Mpm Registration ***************************/
/**
* \brief Register the aho-corasick mpm.
*/
void MpmACRegister(void)
{
mpm_table[MPM_AC].name = "ac";
/* a max pattern length of 0 means no limit - no pattern chopping needed */
mpm_table[MPM_AC].max_pattern_length = 0;
mpm_table[MPM_AC].InitCtx = SCACInitCtx;
mpm_table[MPM_AC].InitThreadCtx = SCACInitThreadCtx;
mpm_table[MPM_AC].DestroyCtx = SCACDestroyCtx;
mpm_table[MPM_AC].DestroyThreadCtx = SCACDestroyThreadCtx;
mpm_table[MPM_AC].AddPattern = SCACAddPatternCS;
mpm_table[MPM_AC].AddPatternNocase = SCACAddPatternCI;
mpm_table[MPM_AC].Prepare = SCACPreparePatterns;
mpm_table[MPM_AC].Search = SCACSearch;
mpm_table[MPM_AC].Cleanup = NULL;
mpm_table[MPM_AC].PrintCtx = SCACPrintInfo;
mpm_table[MPM_AC].PrintThreadCtx = SCACPrintSearchStats;
mpm_table[MPM_AC].RegisterUnittests = SCACRegisterTests;
return;
}
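/* Illustrative only: once registered, the engine invokes the matcher
 * through the table rather than calling SCACSearch() directly, e.g.
 * (ctx/pmq setup elided):
 *
 *   uint32_t cnt = mpm_table[MPM_AC].Search(mpm_ctx, mpm_thread_ctx,
 *                                           pmq, buf, buflen);
 */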
#ifdef __SC_CUDA_SUPPORT__
/**
* \brief Register the aho-corasick cuda mpm.
*/
void MpmACCudaRegister(void)
{
mpm_table[MPM_AC_CUDA].name = "ac-cuda";
/* a max pattern length of 0 means no limit - no pattern chopping needed */
mpm_table[MPM_AC_CUDA].max_pattern_length = 0;
mpm_table[MPM_AC_CUDA].InitCtx = SCACInitCtx;
mpm_table[MPM_AC_CUDA].InitThreadCtx = SCACInitThreadCtx;
mpm_table[MPM_AC_CUDA].DestroyCtx = SCACDestroyCtx;
mpm_table[MPM_AC_CUDA].DestroyThreadCtx = SCACDestroyThreadCtx;
mpm_table[MPM_AC_CUDA].AddPattern = SCACAddPatternCS;
mpm_table[MPM_AC_CUDA].AddPatternNocase = SCACAddPatternCI;
mpm_table[MPM_AC_CUDA].Prepare = SCACPreparePatterns;
mpm_table[MPM_AC_CUDA].Search = SCACSearch;
mpm_table[MPM_AC_CUDA].Cleanup = NULL;
mpm_table[MPM_AC_CUDA].PrintCtx = SCACPrintInfo;
mpm_table[MPM_AC_CUDA].PrintThreadCtx = SCACPrintSearchStats;
mpm_table[MPM_AC_CUDA].RegisterUnittests = SCACRegisterTests;
if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) {
MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
if (conf == NULL) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile.");
exit(EXIT_FAILURE);
}
if (MpmCudaBufferSetup() < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error setting up env for ac cuda");
exit(EXIT_FAILURE);
}
}
return;
}
#endif /* __SC_CUDA_SUPPORT__ */
/*************************************Unittests********************************/
#ifdef UNITTESTS

@ -22,9 +22,16 @@
*
*/
#ifndef __UTIL_MPM_AC__H__
#define __UTIL_MPM_AC__H__
#define SC_AC_STATE_TYPE_U16 uint16_t
#define SC_AC_STATE_TYPE_U32 uint32_t
#ifdef __SC_CUDA_SUPPORT__
#include "util-cuda.h"
#endif /* __SC_CUDA_SUPPORT__ */
typedef struct SCACPattern_ {
/* length of the pattern */
uint16_t len;
@ -79,6 +86,11 @@ typedef struct SCACCtx_ {
/* the size of each state */
uint16_t single_state_size;
uint16_t max_pat_id;
#ifdef __SC_CUDA_SUPPORT__
CUdeviceptr state_table_u16_cuda;
CUdeviceptr state_table_u32_cuda;
#endif /* __SC_CUDA_SUPPORT__ */
} SCACCtx;
typedef struct SCACThreadCtx_ {
@ -89,3 +101,24 @@ typedef struct SCACThreadCtx_ {
} SCACThreadCtx;
void MpmACRegister(void);
#ifdef __SC_CUDA_SUPPORT__
#define MPM_AC_CUDA_MODULE_NAME "ac_cuda"
#define MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME "ac_cuda_cb"
void MpmACCudaRegister(void);
void SCACConstructBoth16and32StateTables(void);
int MpmCudaBufferSetup(void);
int MpmCudaBufferDeSetup(void);
void SCACCudaStartDispatcher(void);
void SCACCudaKillDispatcher(void);
uint32_t SCACCudaPacketResultsProcessing(Packet *p, MpmCtx *mpm_ctx,
PatternMatcherQueue *pmq);
#endif /* __SC_CUDA_SUPPORT__ */
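/* A sketch of the intended call order for the cuda path, inferred from the
 * declarations above (not normative):
 *
 *   MpmACCudaRegister();        // at table setup; also runs
 *                               // MpmCudaBufferSetup() when ac-cuda is
 *                               // the default matcher
 *   SCACCudaStartDispatcher();  // spawn the gpu dispatcher thread
 *   ...
 *   SCACCudaPacketResultsProcessing(p, mpm_ctx, pmq);  // per packet
 *   ...
 *   SCACCudaKillDispatcher();   // at shutdown
 *   MpmCudaBufferDeSetup();     // free the page-locked buffers
 */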
#endif /* __UTIL_MPM_AC__H__ */

@ -1,112 +0,0 @@
/* Copyright (C) 2007-2010 Open Information Security Foundation
*
* You can copy, redistribute or modify this Program under the terms of
* the GNU General Public License version 2 as published by the Free
* Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* version 2 along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
/**
* \file
*
* \author Anoop Saldanha <anoopsaldanha@gmail.com>
*
* The Cuda kernel for MPM B2G.
*
* \todo This is a basic version of the kernel. Modify it to support multiple
* blocks of threads. Make use of shared memory/texture memory.
*/
#define B2G_CUDA_Q 2
#define CUDA_THREADS 4000
#define B2G_CUDA_HASHSHIFT 4
#define B2G_CUDA_TYPE unsigned int
#define B2G_CUDA_HASH16(a, b) (((a) << B2G_CUDA_HASHSHIFT) | (b))
#define u8_tolower(c) g_u8_lowercasetable[(c)]
typedef struct SCCudaPBPacketDataForGPU_ {
/* holds the value B2gCtx->m */
unsigned int m;
/* holds B2gCtx->B2g */
unsigned int table;
/* holds the length of the payload */
unsigned int payload_len;
/* holds the payload */
unsigned char payload;
} SCCudaPBPacketDataForGPU;
extern "C"
__global__ void B2gCudaSearchBNDMq(unsigned short *results_buffer,
unsigned char *packets_buffer,
unsigned int *packets_offset_buffer,
unsigned int *packets_payload_offset_buffer,
unsigned int nop,
unsigned char *g_u8_lowercasetable)
{
unsigned int tid = blockIdx.x * 32 + threadIdx.x;
/* if the thread id is greater than the no of packets sent in the packets
* buffer, terminate the thread */
//if (tid <= nop)
if (tid >= nop)
return;
SCCudaPBPacketDataForGPU *packet = (SCCudaPBPacketDataForGPU *)(packets_buffer + packets_offset_buffer[tid]);
unsigned int m = packet->m;
unsigned char *buf = &packet->payload;
unsigned int buflen = packet->payload_len;
unsigned int *B2G = (unsigned int *)packet->table;
unsigned int pos = m - B2G_CUDA_Q + 1;
B2G_CUDA_TYPE d;
unsigned short h;
unsigned int first;
unsigned int j = 0;
unsigned short *matches_count = results_buffer + packets_payload_offset_buffer[tid] + tid;
//unsigned short *matches_count = results_buffer + packets_payload_offset_buffer[1] + 1;
//unsigned short *offsets = results_buffer + packets_payload_offset_buffer[1] + 1 + 1;
unsigned short *offsets = matches_count + 1;
// temporarily hold the results here, before we shift it to matches_count
// before returning
unsigned short matches = 0;
while (pos <= (buflen - B2G_CUDA_Q + 1)) {
h = B2G_CUDA_HASH16(u8_tolower(buf[pos - 1]), u8_tolower(buf[pos]));
d = B2G[h];
if (d != 0) {
j = pos;
first = pos - (m - B2G_CUDA_Q + 1);
do {
j = j - 1;
if (d >= (1 << (m - 1))) {
if (j > first) {
pos = j;
} else {
offsets[matches++] = j;
}
}
if (j == 0)
break;
h = B2G_CUDA_HASH16(u8_tolower(buf[j - 1]), u8_tolower(buf[j]));
d = (d << 1) & B2G[h];
} while (d != 0);
}
pos = pos + m - B2G_CUDA_Q + 1;
}
matches_count[0] = matches;
return;
}

@ -45,6 +45,9 @@
#include "conf-yaml-loader.h"
#include "queue.h"
#include "util-unittest.h"
#ifdef __SC_CUDA_SUPPORT__
#include "util-cuda-handlers.h"
#endif
/**
* \brief Register a new Mpm Context.
@ -265,6 +268,118 @@ void MpmFactoryDeRegisterAllMpmCtxProfiles(DetectEngineCtx *de_ctx)
return;
}
#ifdef __SC_CUDA_SUPPORT__
static void MpmCudaConfFree(void *conf)
{
SCFree(conf);
return;
}
static void *MpmCudaConfParse(ConfNode *node)
{
const char *value;
MpmCudaConf *conf = SCMalloc(sizeof(MpmCudaConf));
if (conf == NULL)
exit(EXIT_FAILURE);
memset(conf, 0, sizeof(*conf));
if (node != NULL)
value = ConfNodeLookupChildValue(node, "data-buffer-size-min-limit");
else
value = NULL;
if (value == NULL) {
/* default */
conf->data_buffer_size_min_limit = UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MIN_LIMIT_DEFAULT;
} else if (ParseSizeStringU16(value, &conf->data_buffer_size_min_limit) < 0) {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
"data-buffer-size-min-limit - \"%s\"", node->name, value);
exit(EXIT_FAILURE);
}
if (node != NULL)
value = ConfNodeLookupChildValue(node, "data-buffer-size-max-limit");
else
value = NULL;
if (value == NULL) {
/* default */
conf->data_buffer_size_max_limit = UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT;
} else if (ParseSizeStringU16(value, &conf->data_buffer_size_max_limit) < 0) {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
"data-buffer-size-max-limit - \"%s\"", node->name, value);
exit(EXIT_FAILURE);
}
if (node != NULL)
value = ConfNodeLookupChildValue(node, "cudabuffer-buffer-size");
else
value = NULL;
if (value == NULL) {
/* default */
conf->cb_buffer_size = UTIL_MPM_CUDA_CUDA_BUFFER_DBUFFER_SIZE_DEFAULT;
} else if (ParseSizeStringU32(value, &conf->cb_buffer_size) < 0) {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
"cb-buffer-size - \"%s\"", node->name, value);
exit(EXIT_FAILURE);
}
if (node != NULL)
value = ConfNodeLookupChildValue(node, "gpu-transfer-size");
else
value = NULL;
if (value == NULL) {
/* default */
conf->gpu_transfer_size = UTIL_MPM_CUDA_GPU_TRANSFER_SIZE;
} else if (ParseSizeStringU32(value, &conf->gpu_transfer_size) < 0) {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
"gpu-transfer-size - \"%s\"", node->name, value);
exit(EXIT_FAILURE);
}
if (node != NULL)
value = ConfNodeLookupChildValue(node, "batching-timeout");
else
value = NULL;
if (value == NULL) {
/* default */
conf->batching_timeout = UTIL_MPM_CUDA_BATCHING_TIMEOUT_DEFAULT;
} else if ((conf->batching_timeout = atoi(value)) < 0) {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
"batching-timeout - \"%s\"", node->name, value);
exit(EXIT_FAILURE);
}
if (node != NULL)
value = ConfNodeLookupChildValue(node, "device-id");
else
value = NULL;
if (value == NULL) {
/* default */
conf->device_id = UTIL_MPM_CUDA_DEVICE_ID_DEFAULT;
} else if ((conf->device_id = atoi(value)) < 0) {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
"device-id - \"%s\"", node->name, value);
exit(EXIT_FAILURE);
}
if (node != NULL)
value = ConfNodeLookupChildValue(node, "cuda-streams");
else
value = NULL;
if (value == NULL) {
/* default */
conf->cuda_streams = UTIL_MPM_CUDA_CUDA_STREAMS_DEFAULT;
} else if ((conf->cuda_streams = atoi(value)) < 0) {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
"cuda-streams - \"%s\"", node->name, value);
exit(EXIT_FAILURE);
}
return conf;
}
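/* These two callbacks are registered with the cuda handler in
 * MpmTableSetup() below:
 *
 *   CudaHandlerAddCudaProfileFromConf("mpm", MpmCudaConfParse,
 *                                     MpmCudaConfFree);
 *
 * after which any module can fetch the parsed profile via
 * CudaHandlerGetCudaProfile("mpm"), as the ac-cuda setup code does. */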
#endif
/**
* \brief Setup a pmq
*
@ -447,6 +562,10 @@ void MpmTableSetup(void) {
MpmB2gcRegister();
MpmB2gmRegister();
MpmACRegister();
#ifdef __SC_CUDA_SUPPORT__
CudaHandlerAddCudaProfileFromConf("mpm", MpmCudaConfParse, MpmCudaConfFree);
MpmACCudaRegister();
#endif /* __SC_CUDA_SUPPORT__ */
MpmACBSRegister();
MpmACGfbsRegister();
}

@ -65,6 +65,9 @@ enum {
/* aho-corasick */
MPM_AC,
#ifdef __SC_CUDA_SUPPORT__
MPM_AC_CUDA,
#endif
/* aho-corasick-goto-failure state based */
MPM_AC_GFBS,
MPM_AC_BS,
@ -81,6 +84,7 @@ typedef struct MpmThreadCtx_ {
uint32_t memory_cnt;
uint32_t memory_size;
} MpmThreadCtx;
/** \brief helper structure for the pattern matcher engine. The Pattern Matcher
@ -176,6 +180,40 @@ typedef struct MpmTableElmt_ {
MpmTableElmt mpm_table[MPM_TABLE_SIZE];
/* this macro decides whether cuda is enabled for the platform */
#ifdef __SC_CUDA_SUPPORT__
/* the min size limit of a payload (or any other data) to be buffered */
#define UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MIN_LIMIT_DEFAULT 0
/* the max size limit of a payload (or any other data) to be buffered */
#define UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT 1500
/* default size of the data buffer used by the cuda mpm engine for
 * CudaBuffer registration */
#define UTIL_MPM_CUDA_CUDA_BUFFER_DBUFFER_SIZE_DEFAULT (500 * 1024 * 1024)
/* default max size of a data chunk sent to the gpu in one transfer */
#define UTIL_MPM_CUDA_GPU_TRANSFER_SIZE (50 * 1024 * 1024)
/* Default value for offset/pointer buffer to be used by cuda mpm
* engine for CudaBuffer reg */
#define UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT 500000
#define UTIL_MPM_CUDA_BATCHING_TIMEOUT_DEFAULT 2000
#define UTIL_MPM_CUDA_CUDA_STREAMS_DEFAULT 2
#define UTIL_MPM_CUDA_DEVICE_ID_DEFAULT 0
/**
* \brief Cuda configuration for "mpm" profile. We can further extend this
* to have a conf for specific mpms. For now it's common to all mpms.
*/
typedef struct MpmCudaConf_ {
uint16_t data_buffer_size_min_limit;
uint16_t data_buffer_size_max_limit;
uint32_t cb_buffer_size;
uint32_t gpu_transfer_size;
int batching_timeout;
int device_id;
int cuda_streams;
} MpmCudaConf;
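/* Illustrative only: a profile populated with the defaults above, i.e.
 * what MpmCudaConfParse() produces when the yaml carries no "mpm" node. */
static const MpmCudaConf mpm_cuda_conf_defaults = {
    .data_buffer_size_min_limit = UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MIN_LIMIT_DEFAULT,
    .data_buffer_size_max_limit = UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT,
    .cb_buffer_size = UTIL_MPM_CUDA_CUDA_BUFFER_DBUFFER_SIZE_DEFAULT,
    .gpu_transfer_size = UTIL_MPM_CUDA_GPU_TRANSFER_SIZE,
    .batching_timeout = UTIL_MPM_CUDA_BATCHING_TIMEOUT_DEFAULT,
    .device_id = UTIL_MPM_CUDA_DEVICE_ID_DEFAULT,
    .cuda_streams = UTIL_MPM_CUDA_CUDA_STREAMS_DEFAULT,
};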
#endif /* __SC_CUDA_SUPPORT__ */
struct DetectEngineCtx_;
int32_t MpmFactoryRegisterMpmCtxProfile(struct DetectEngineCtx_ *, const char *, uint8_t);

@ -404,34 +404,33 @@ threading:
# Cuda configuration.
cuda:
# The "mpm" profile. On not specifying any of these parameters, the engine's
# internal default values are used, which are same as the ones specified here.
- mpm:
# Threshold limit for no of packets buffered to the GPU. Once we hit this
# limit, we pass the buffer to the gpu.
packet-buffer-limit: 2400
# The maximum length for a packet that we would buffer to the gpu.
# Anything over this is MPM'ed on the CPU. All entries > 0 are valid.
# Can be specified in kb, mb, gb. Just a number indicates it's in bytes.
packet-size-limit: 1500
# No of packet buffers we initialize. All entries > 0 are valid.
packet-buffers: 10
# The timeout limit for batching of packets in secs. If we don't fill the
# buffer within this timeout limit, we pass the currently filled buffer to the gpu.
# All entries > 0 are valid.
batching-timeout: 1
# Specifies whether to use page-locked memory whereever possible. Accepted values
# are "enabled" and "disabled".
page-locked: enabled
# The device to use for the mpm. Currently we don't support load balancing
# on multiple gpus. In case you have multiple devices on your system, you
# can specify the device to use, using this conf. By default we hold 0, to
# specify the first device cuda sees. To find out device-id associated with
# the card(s) on the system run "suricata --list-cuda-cards".
device-id: 0
# No of Cuda streams used for asynchronous processing. All values > 0 are valid.
# For this option you need a device with Compute Capability > 1.0 and
# page-locked enabled to have any effect.
cuda-streams: 2
# internal default values are used, which are the same as the ones specified
# in the default conf file.
mpm:
# The minimum length required to buffer data to the gpu.
# Anything below this is MPM'ed on the CPU.
# Can be specified in kb, mb, gb. Just a number indicates it's in bytes.
# A value of 0 indicates there's no limit.
data-buffer-size-min-limit: 0
# The maximum length for data that we would buffer to the gpu.
# Anything over this is MPM'ed on the CPU.
# Can be specified in kb, mb, gb. Just a number indicates it's in bytes.
data-buffer-size-max-limit: 1500
# The ring buffer size used by the CudaBuffer API to buffer data.
cudabuffer-buffer-size: 500mb
# The max chunk size that can be sent to the gpu in a single go.
gpu-transfer-size: 50mb
# The timeout limit for batching of packets in microseconds.
batching-timeout: 2000
# The device to use for the mpm. Currently we don't support load balancing
# across multiple gpus. If you have multiple devices on your system, use
# this option to select one. The default is 0, the first device cuda sees.
# To find the device-id associated with the card(s) on the system, run
# "suricata --list-cuda-cards".
device-id: 0
# No of Cuda streams used for asynchronous processing. All values > 0 are valid.
# For this option you need a device with Compute Capability > 1.0.
cuda-streams: 2
# Select the multi pattern algorithm you want to run for scan/search in
# the engine. The supported algorithms are b2g, b2gc, b2gm, b3g, wumanber,
