Minor cosmetic changes to the CUDA code.

Moved a couple of functions to more CUDA-relevant files; restructured some data types.
11 years ago · 602c91ed41
parent c9f076def3
commit 602c91ed41
14 changed files with 521 additions and 432 deletions
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -258,6 +258,7 @@ util-crypt.c util-crypt.h \
 util-cuda.c util-cuda.h \
 util-cuda-buffer.c util-cuda-buffer.h \
 util-cuda-handlers.c util-cuda-handlers.h \
+util-cuda-vars.c util-cuda-vars.h \
 util-daemon.c util-daemon.h \
 util-debug.c util-debug.h \
 util-debug-filters.c util-debug-filters.h \
--- a/src/decode.h
+++ b/src/decode.h
@ -32,6 +32,7 @@

 #ifdef __SC_CUDA_SUPPORT__
 #include "util-cuda-buffer.h"
+#include "util-cuda-vars.h"
 #endif /* __SC_CUDA_SUPPORT__ */

 typedef enum {
@ -491,12 +492,7 @@ typedef struct Packet_
    PktProfiling profile;
 #endif
 #ifdef __SC_CUDA_SUPPORT__
-    uint8_t cuda_mpm_enabled;
-    uint8_t cuda_done;
-    uint16_t cuda_gpu_matches;
-    SCMutex cuda_mutex;
-    SCCondT cuda_cond;
-    uint32_t cuda_results[(UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT * 2) + 1];
+    CudaPacketVars cuda_pkt_vars;
 #endif
 } Packet;

@ -583,21 +579,7 @@ typedef struct DecodeThreadVars_
    uint16_t counter_defrag_max_hit;

 #ifdef __SC_CUDA_SUPPORT__
-    /* cb - CudaBuffer */
-    CudaBufferData *cuda_ac_cb;
-
-    MpmCtx *mpm_proto_other_ctx;
-
-    MpmCtx *mpm_proto_tcp_ctx_ts;
-    MpmCtx *mpm_proto_udp_ctx_ts;
-
-    MpmCtx *mpm_proto_tcp_ctx_tc;
-    MpmCtx *mpm_proto_udp_ctx_tc;
-
-    uint16_t data_buffer_size_max_limit;
-    uint16_t data_buffer_size_min_limit;
-
-    uint8_t mpm_is_cuda;
+    CudaThreadVars cuda_vars;
 #endif
 } DecodeThreadVars;

@ -625,8 +607,8 @@ typedef struct DecodeThreadVars_
        PACKET_RESET_CHECKSUMS((p));                                    \
        (p)->pkt = ((uint8_t *)(p)) + sizeof(Packet);                   \
        (p)->livedev = NULL;                                            \
-        SCMutexInit(&(p)->cuda_mutex, NULL);                            \
-        SCCondInit(&(p)->cuda_cond, NULL);                              \
+        SCMutexInit(&(p)->cuda_pkt_vars.cuda_mutex, NULL);            \
+        SCCondInit(&(p)->cuda_pkt_vars.cuda_cond, NULL);                \
    } while (0)
 #else
 #define PACKET_INITIALIZE(p) {         \
--- a/src/detect-engine-mpm.c
+++ b/src/detect-engine-mpm.c
@ -225,7 +225,7 @@ uint32_t PacketPatternSearch(DetectEngineThreadCtx *det_ctx, Packet *p)
        SCReturnInt(0);

 #ifdef __SC_CUDA_SUPPORT__
-    if (p->cuda_mpm_enabled && p->pkt_src == PKT_SRC_WIRE) {
+    if (p->cuda_pkt_vars.cuda_mpm_enabled && p->pkt_src == PKT_SRC_WIRE) {
        ret = SCACCudaPacketResultsProcessing(p, mpm_ctx, &det_ctx->pmq);
    } else {
        ret = mpm_table[mpm_ctx->mpm_type].Search(mpm_ctx,
--- a/src/detect.c
+++ b/src/detect.c
@ -4388,255 +4388,6 @@ int SigAddressPrepareStage5(DetectEngineCtx *de_ctx) {
    return 0;
 }

-#ifdef __SC_CUDA_SUPPORT__
-
-static void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx)
-{
-    MpmCtx *mpm_ctx = NULL;
-
-    int ac_16_tables = 0;
-    int ac_32_tables = 0;
-
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 0);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 1);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 0);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 1);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_other_packet, 0);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_uri, 0);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_uri, 1);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcbd, 0);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcbd, 1);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hhd, 0);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hhd, 1);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrhd, 0);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrhd, 1);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hmd, 0);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hmd, 1);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcd, 0);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcd, 1);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrud, 0);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrud, 1);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_stream, 0);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_stream, 1);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 0);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 1);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 0);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 1);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 0);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-    mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 1);
-    if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
-        SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
-        if (ctx->state_count < 32767)
-            ac_16_tables++;
-        else
-            ac_32_tables++;
-    }
-
-    if (ac_16_tables > 0 && ac_32_tables > 0)
-        SCACConstructBoth16and32StateTables();
-
-
-    SCLogDebug("Total mpm ac 16 bit state tables - %d\n", ac_16_tables);
-    SCLogDebug("Total mpm ac 32 bit state tables - %d\n", ac_32_tables);
-
-}
-#endif
-
 /**
 * \brief Convert the signature list into the runtime match structure.
 *
--- a/src/runmodes.c
+++ b/src/runmodes.c
@ -301,6 +301,15 @@ void RunModeDispatch(int runmode, const char *custom_mode, DetectEngineCtx *de_c
        }
    }

+#ifdef __SC_CUDA_SUPPORT__
+    if (PatternMatchDefaultMatcher() == MPM_AC_CUDA &&
+        strcasecmp(custom_mode, "autofp") != 0) {
+        SCLogError(SC_ERR_RUNMODE, "When using a cuda mpm, the only runmode we "
+                   "support is autofp.");
+        exit(EXIT_FAILURE);
+    }
+#endif
+
    RunMode *mode = RunModeGetCustomMode(runmode, custom_mode);
    if (mode == NULL) {
        SCLogError(SC_ERR_RUNMODE, "The custom type \"%s\" doesn't exist "
--- a/src/source-pcap-file.c
+++ b/src/source-pcap-file.c
@ -51,8 +51,7 @@
 #include "util-cuda-handlers.h"
 #include "detect-engine.h"
 #include "detect-engine-mpm.h"
-
-static DetectEngineCtx *cuda_de_ctx = NULL;
+#include "util-cuda-vars.h"

 #endif /* __SC_CUDA_SUPPORT__ */

@ -124,15 +123,6 @@ void TmModuleDecodePcapFileRegister (void) {
    tmm_modules[TMM_DECODEPCAPFILE].flags = TM_FLAG_DECODE_TM;
 }

-#ifdef __SC_CUDA_SUPPORT__
-void DecodePcapFileSetCudaDeCtx(DetectEngineCtx *de_ctx)
-{
-    cuda_de_ctx = de_ctx;
-
-    return;
-}
-#endif
-
 void PcapFileCallbackLoop(char *user, struct pcap_pkthdr *h, u_char *pkt) {
    SCEnter();

@ -342,90 +332,6 @@ TmEcode ReceivePcapFileThreadDeinit(ThreadVars *tv, void *data) {
    SCReturnInt(TM_ECODE_OK);
 }

-#ifdef __SC_CUDA_SUPPORT__
-
-static inline void DecodePcapFileBufferPacket(DecodeThreadVars *dtv, Packet *p)
-{
-    if (p->cuda_mpm_enabled) {
-        while (!p->cuda_done) {
-            SCMutexLock(&p->cuda_mutex);
-            if (p->cuda_done) {
-                SCMutexUnlock(&p->cuda_mutex);
-                break;
-            } else {
-                SCCondWait(&p->cuda_cond, &p->cuda_mutex);
-                SCMutexUnlock(&p->cuda_mutex);
-            }
-        }
-    }
-    p->cuda_done = 0;
-
-    if (p->payload_len == 0 ||
-        (p->flags & (PKT_NOPAYLOAD_INSPECTION & PKT_NOPACKET_INSPECTION)) ||
-        (p->flags & PKT_ALLOC) ||
-        (dtv->data_buffer_size_min_limit != 0 && p->payload_len < dtv->data_buffer_size_min_limit) ||
-        (p->payload_len > dtv->data_buffer_size_max_limit && dtv->data_buffer_size_max_limit != 0) ) {
-        p->cuda_mpm_enabled = 0;
-        return;
-    }
-
-    MpmCtx *mpm_ctx = NULL;
-    if (p->proto == IPPROTO_TCP) {
-        if (p->flowflags & FLOW_PKT_TOSERVER)
-            mpm_ctx = dtv->mpm_proto_tcp_ctx_ts;
-        else
-            mpm_ctx = dtv->mpm_proto_tcp_ctx_tc;
-    } else if (p->proto == IPPROTO_UDP) {
-        if (p->flowflags & FLOW_PKT_TOSERVER)
-            mpm_ctx = dtv->mpm_proto_udp_ctx_ts;
-        else
-            mpm_ctx = dtv->mpm_proto_udp_ctx_tc;
-    } else {
-        mpm_ctx = dtv->mpm_proto_other_ctx;
-    }
-    if (mpm_ctx == NULL || mpm_ctx->pattern_cnt == 0) {
-        p->cuda_mpm_enabled = 0;
-        return;
-    }
-
-#if __WORDSIZE==64
-    CudaBufferSlice *slice = CudaBufferGetSlice(dtv->cuda_ac_cb,
-                                                p->payload_len + sizeof(uint64_t) + sizeof(CUdeviceptr),
-                                                (void *)p);
-    if (slice == NULL) {
-        SCLogError(SC_ERR_FATAL, "Error retrieving slice.  Please report "
-                   "this to dev.");
-        p->cuda_mpm_enabled = 0;
-        return;
-    }
-    *((uint64_t *)(slice->buffer + slice->start_offset)) = p->payload_len;
-    *((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint64_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda;
-    memcpy(slice->buffer + slice->start_offset + sizeof(uint64_t) + sizeof(CUdeviceptr), p->payload, p->payload_len);
-#else
-    CudaBufferSlice *slice = CudaBufferGetSlice(dtv->cuda_ac_cb,
-                                                p->payload_len + sizeof(uint32_t) + sizeof(CUdeviceptr),
-                                                (void *)p);
-    if (slice == NULL) {
-        SCLogError(SC_ERR_FATAL, "Error retrieving slice.  Please report "
-                   "this to dev.");
-        p->cuda_mpm_enabled = 0;
-        return;
-    }
-    *((uint32_t *)(slice->buffer + slice->start_offset)) = p->payload_len;
-    *((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint32_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda;
-    memcpy(slice->buffer + slice->start_offset + sizeof(uint32_t) + sizeof(CUdeviceptr), p->payload, p->payload_len);
-#endif
-    p->cuda_mpm_enabled = 1;
-    SC_ATOMIC_SET(slice->done, 1);
-
-    SCLogDebug("cuda ac buffering packet %p, payload_len - %"PRIu16" and deviceptr - %"PRIu64"\n",
-               p, p->payload_len, (unsigned long)((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda);
-
-    return;
-}
-
-#endif /* __SC_CUDA_SUPPORT__ */
-
 double prev_signaled_ts = 0;

 TmEcode DecodePcapFile(ThreadVars *tv, Packet *p, void *data, PacketQueue *pq, PacketQueue *postpq)
@ -457,7 +363,6 @@ TmEcode DecodePcapFile(ThreadVars *tv, Packet *p, void *data, PacketQueue *pq, P
    TimeSet(&p->ts);

    /* call the decoder */
-
    pcap_g.Decoder(tv, dtv, p, GET_PKT_DATA(p), GET_PKT_LEN(p), pq);

 #ifdef DEBUG
@ -465,41 +370,13 @@ TmEcode DecodePcapFile(ThreadVars *tv, Packet *p, void *data, PacketQueue *pq, P
 #endif

 #ifdef __SC_CUDA_SUPPORT__
-    if (dtv->mpm_is_cuda)
-        DecodePcapFileBufferPacket(dtv, p);
+    if (dtv->cuda_vars.mpm_is_cuda)
+        CudaBufferPacket(&dtv->cuda_vars, p);
 #endif

    SCReturnInt(TM_ECODE_OK);
 }

-#ifdef __SC_CUDA_SUPPORT__
-
-static int DecodePcapFileThreadInitCuda(DecodeThreadVars *dtv)
-{
-    if (PatternMatchDefaultMatcher() != MPM_AC_CUDA)
-        return 0;
-
-    MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
-    if (conf == NULL) {
-        SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile.");
-        return -1;
-    }
-
-    dtv->mpm_is_cuda = 1;
-    dtv->cuda_ac_cb = CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME);
-    dtv->data_buffer_size_max_limit = conf->data_buffer_size_max_limit;
-    dtv->data_buffer_size_min_limit = conf->data_buffer_size_min_limit;
-    dtv->mpm_proto_tcp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 0);
-    dtv->mpm_proto_tcp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 1);
-    dtv->mpm_proto_udp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 0);
-    dtv->mpm_proto_udp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 1);
-    dtv->mpm_proto_other_ctx = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_other_packet, 0);
-
-    return 0;
-}
-
-#endif /* __SC_CUDA_SUPPORT__ */
-
 TmEcode DecodePcapFileThreadInit(ThreadVars *tv, void *initdata, void **data)
 {
    SCEnter();
@ -512,7 +389,7 @@ TmEcode DecodePcapFileThreadInit(ThreadVars *tv, void *initdata, void **data)
    DecodeRegisterPerfCounters(dtv, tv);

 #ifdef __SC_CUDA_SUPPORT__
-    if (DecodePcapFileThreadInitCuda(dtv) < 0)
+    if (CudaThreadVarsInit(&dtv->cuda_vars) < 0)
        SCReturnInt(TM_ECODE_FAILED);
 #endif

--- a/src/source-pcap-file.h
+++ b/src/source-pcap-file.h
@ -26,9 +26,6 @@

 void TmModuleReceivePcapFileRegister (void);
 void TmModuleDecodePcapFileRegister (void);
-#ifdef __SC_CUDA_SUPPORT__
-void DecodePcapFileSetCudaDeCtx(DetectEngineCtx *de_ctx);
-#endif

 #endif /* __SOURCE_PCAP_FILE_H__ */

--- a/src/suricata.c
+++ b/src/suricata.c
@ -1829,7 +1829,7 @@ int main(int argc, char **argv)
    }
 #ifdef __SC_CUDA_SUPPORT__
    if (PatternMatchDefaultMatcher() == MPM_AC_CUDA)
-        DecodePcapFileSetCudaDeCtx(de_ctx);
+        CudaVarsSetDeCtx(de_ctx);
 #endif /* __SC_CUDA_SUPPORT__ */

    SCClassConfLoadClassficationConfigFile(de_ctx);
--- a/src/util-cuda-buffer.c
+++ b/src/util-cuda-buffer.c
@ -272,7 +272,7 @@ CudaBufferSlice *CudaBufferGetSlice(CudaBufferData *cb_data, uint32_t len, void

    if (cb_data->d_buffer_write < cb_data->d_buffer_read) {
        if (cb_data->d_buffer_write + len >= cb_data->d_buffer_read) {
-            SCLogInfo("d_buffer full");
+            SCLogDebug("d_buffer full");
            SCMutexUnlock(&cb_data->m);

            SCMutexLock(&slice_pool_mutex);
@ -282,7 +282,7 @@ CudaBufferSlice *CudaBufferGetSlice(CudaBufferData *cb_data, uint32_t len, void
        }
    } else {
        if (cb_data->d_buffer_write + len > cb_data->d_buffer_len) {
-            SCLogInfo("d_buffer limit hit - buffer_len - %"PRIu32,
+            SCLogDebug("d_buffer limit hit - buffer_len - %"PRIu32,
                      cb_data->d_buffer_len);
            SCMutexUnlock(&cb_data->m);

@ -295,7 +295,7 @@ CudaBufferSlice *CudaBufferGetSlice(CudaBufferData *cb_data, uint32_t len, void

    if (cb_data->op_buffer_write < cb_data->op_buffer_read) {
        if (cb_data->op_buffer_write + 1 >= cb_data->op_buffer_read) {
-            SCLogInfo("op_buffer full");
+            SCLogDebug("op_buffer full");
            SCMutexUnlock(&cb_data->m);

            SCMutexLock(&slice_pool_mutex);
@ -305,7 +305,7 @@ CudaBufferSlice *CudaBufferGetSlice(CudaBufferData *cb_data, uint32_t len, void
        }
    } else {
        if (cb_data->op_buffer_write + 1 > cb_data->op_buffer_len) {
-            SCLogInfo("op_buffer limit hit - buffer_len - %"PRIu32,
+            SCLogDebug("op_buffer limit hit - buffer_len - %"PRIu32,
                      cb_data->op_buffer_len);
            SCMutexUnlock(&cb_data->m);

@ -866,7 +866,7 @@ int CudaBufferTest02(void)

 int CudaBufferTest03(void)
 {
-    CudaBufferSlice *slice1, *slice2, *slice3, *slice_temp;
+    CudaBufferSlice *slice, *slice_temp;
    int result = 0;

    uint8_t *d_buffer = SCMalloc(sizeof(uint8_t) * 64);
@ -886,9 +886,9 @@ int CudaBufferTest03(void)
        goto end;
    }

-    slice1 = CudaBufferGetSlice(data, 16, NULL);
-    slice2 = CudaBufferGetSlice(data, 16, NULL);
-    slice3 = CudaBufferGetSlice(data, 24, NULL);
+    slice = CudaBufferGetSlice(data, 16, NULL);
+    slice = CudaBufferGetSlice(data, 16, NULL);
+    slice = CudaBufferGetSlice(data, 24, NULL);

    /* culling */
    CudaBufferCulledInfo culled_info;
--- a/src/util-cuda-vars.c
+++ b/src/util-cuda-vars.c
@ -0,0 +1,74 @@
+/* Copyright (C) 2007-2010 Open Information Security Foundation
+ *
+ * You can copy, redistribute or modify this Program under the terms of
+ * the GNU General Public License version 2 as published by the Free
+ * Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+/**
+ * \file
+ *
+ * \author Anoop Saldanha <anoopsaldanha@gmail.com>
+ */
+
+#ifdef __SC_CUDA_SUPPORT__
+
+#include "suricata.h"
+#include "util-mpm.h"
+#include "util-cuda-handlers.h"
+#include "util-cuda-vars.h"
+#include "detect-engine-mpm.h"
+#include "util-debug.h"
+#include "util-mpm-ac.h"
+
+static DetectEngineCtx *cuda_de_ctx = NULL;
+
+/**
+ * \brief Stores the detection engine context in the file-level cuda_de_ctx
+ *        pointer, from which CudaThreadVarsInit() later looks up the mpm
+ *        contexts.
+ *
+ *        If a context has already been stored, an error is logged and the
+ *        process is terminated through exit(EXIT_FAILURE).
+ *
+ * \param de_ctx Detection engine context to store.
+ */
+void CudaVarsSetDeCtx(DetectEngineCtx *de_ctx)
+{
+    /* nothing stored so far - remember the ctx and we are done */
+    if (cuda_de_ctx == NULL) {
+        cuda_de_ctx = de_ctx;
+        return;
+    }
+
+    /* a ctx is already in place, which the engine treats as fatal */
+    SCLogError(SC_ERR_FATAL, "CudaVarsSetDeCtx() called more than once.  "
+               "This function should be called only once during the "
+               "lifetime of the engine.");
+    exit(EXIT_FAILURE);
+}
+
+/**
+ * \brief Initialises the per-thread cuda variables.
+ *
+ *        Does nothing if the default pattern matcher isn't MPM_AC_CUDA.
+ *        Otherwise it copies the buffer size limits from the "mpm" cuda
+ *        profile, fetches the ac cuda buffer registered with the cuda
+ *        handler, and looks up the tcp/udp/other packet mpm contexts from
+ *        the detection engine context stored by CudaVarsSetDeCtx().
+ *
+ * \param ctv Thread vars to fill in.  Left untouched unless 0 is returned
+ *            with the cuda matcher in use.
+ *
+ * \retval  0 On success, or if the default matcher isn't MPM_AC_CUDA.
+ * \retval -1 If the detection engine context hasn't been set, the "mpm"
+ *            cuda profile can't be obtained, or the cuda buffer lookup
+ *            returns NULL.
+ */
+int CudaThreadVarsInit(CudaThreadVars *ctv)
+{
+    if (PatternMatchDefaultMatcher() != MPM_AC_CUDA)
+        return 0;
+
+    /* every mpm ctx lookup below dereferences cuda_de_ctx, so fail cleanly
+     * rather than crash if CudaVarsSetDeCtx() hasn't been called yet */
+    if (cuda_de_ctx == NULL) {
+        SCLogError(SC_ERR_AC_CUDA_ERROR, "CudaThreadVarsInit() called before "
+                   "the detection engine ctx was set through "
+                   "CudaVarsSetDeCtx().");
+        return -1;
+    }
+
+    MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
+    if (conf == NULL) {
+        SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile.");
+        return -1;
+    }
+
+    /* NOTE(review): assumes a NULL return means the module data couldn't be
+     * retrieved - confirm against CudaHandlerModuleGetData() */
+    CudaBufferData *cb = CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME,
+                                                  MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME);
+    if (cb == NULL) {
+        SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining the ac cuda buffer.");
+        return -1;
+    }
+
+    ctv->cuda_ac_cb = cb;
+    ctv->data_buffer_size_max_limit = conf->data_buffer_size_max_limit;
+    ctv->data_buffer_size_min_limit = conf->data_buffer_size_min_limit;
+    ctv->mpm_proto_tcp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 0);
+    ctv->mpm_proto_tcp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 1);
+    ctv->mpm_proto_udp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 0);
+    ctv->mpm_proto_udp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 1);
+    ctv->mpm_proto_other_ctx = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_other_packet, 0);
+
+    /* flag the thread as cuda enabled only once everything is in place */
+    ctv->mpm_is_cuda = 1;
+
+    return 0;
+}
+
+#endif
--- a/src/util-cuda-vars.h
+++ b/src/util-cuda-vars.h
@ -0,0 +1,65 @@
+/* Copyright (C) 2007-2010 Open Information Security Foundation
+ *
+ * You can copy, redistribute or modify this Program under the terms of
+ * the GNU General Public License version 2 as published by the Free
+ * Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+/**
+ * \file
+ *
+ * \author Anoop Saldanha <anoopsaldanha@gmail.com>
+ */
+
+#ifdef __SC_CUDA_SUPPORT__
+
+#ifndef __UTIL_CUDA_VARS__H__
+#define __UTIL_CUDA_VARS__H__
+
+#include "util-cuda-buffer.h"
+#include "util-mpm.h"
+#include "threads.h"
+
+/* Per-thread cuda state.  Embedded in DecodeThreadVars as "cuda_vars" and
+ * filled in by CudaThreadVarsInit(). */
+typedef struct CudaThreadVars_ {
+    /* cb - CudaBuffer.  Retrieved from the cuda handler for the ac cuda
+     * module in CudaThreadVarsInit(). */
+    CudaBufferData *cuda_ac_cb;
+
+    /* mpm ctx for the "proto other" packet profile (looked up with
+     * direction 0) */
+    MpmCtx *mpm_proto_other_ctx;
+
+    /* tcp/udp packet mpm ctxs looked up with direction 0
+     * (ts - presumably "to server"; confirm) */
+    MpmCtx *mpm_proto_tcp_ctx_ts;
+    MpmCtx *mpm_proto_udp_ctx_ts;
+
+    /* tcp/udp packet mpm ctxs looked up with direction 1
+     * (tc - presumably "to client"; confirm) */
+    MpmCtx *mpm_proto_tcp_ctx_tc;
+    MpmCtx *mpm_proto_udp_ctx_tc;
+
+    /* payload size limits, copied from the "mpm" cuda profile */
+    uint16_t data_buffer_size_max_limit;
+    uint16_t data_buffer_size_min_limit;
+
+    /* set to 1 by CudaThreadVarsInit() when the default matcher is
+     * MPM_AC_CUDA; the decoder checks it before buffering a packet */
+    uint8_t mpm_is_cuda;
+} CudaThreadVars;
+
+/* Per-packet cuda state.  Embedded in Packet as "cuda_pkt_vars". */
+typedef struct CudaPacketVars_ {
+    /* when set, and the packet source is PKT_SRC_WIRE, the packet pattern
+     * search takes the cuda results path instead of the regular mpm search */
+    uint8_t cuda_mpm_enabled;
+    /* set to 1 by the cuda dispatcher, with cuda_mutex held, once it has
+     * stored this packet's results; consumers wait on cuda_cond for it */
+    uint8_t cuda_done;
+    /* first value of this packet's region in the gpu results buffer, as
+     * stored by the dispatcher (presumably the match count; confirm) */
+    uint16_t cuda_gpu_matches;
+    /* mutex/cond pair used around cuda_done; both are initialised in
+     * PACKET_INITIALIZE */
+    SCMutex cuda_mutex;
+    SCCondT cuda_cond;
+    /* filled by the dispatcher from the gpu results buffer, only when
+     * cuda_gpu_matches is non-zero */
+    uint32_t cuda_results[(UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT * 2) + 1];
+} CudaPacketVars;
+
+void CudaVarsSetDeCtx(struct DetectEngineCtx_ *de_ctx);
+int CudaThreadVarsInit(CudaThreadVars *ctv);
+
+#endif /* __UTIL_CUDA_VARS__H__ */
+
+#endif /* __SC_CUDA_SUPPORT__ */
--- a/src/util-mpm-ac.c
+++ b/src/util-mpm-ac.c
@ -1408,6 +1408,257 @@ void SCACPrintInfo(MpmCtx *mpm_ctx)
 /****************************Cuda side of things****************************/

 #ifdef __SC_CUDA_SUPPORT__
+
+/**
+ * \brief Tallies a single mpm context as either a "16 bit" or a "32 bit"
+ *        ac cuda state table.
+ *
+ *        Contexts whose type isn't MPM_AC_CUDA are ignored.  A NULL context
+ *        is ignored as well, instead of being dereferenced.
+ *
+ * \param mpm_ctx      Mpm context to inspect.
+ * \param ac_16_tables Incremented if the context's state_count is < 32767.
+ * \param ac_32_tables Incremented otherwise.
+ */
+static void SCACCudaTallyStateTable(const MpmCtx *mpm_ctx, int *ac_16_tables,
+                                    int *ac_32_tables)
+{
+    if (mpm_ctx == NULL || mpm_ctx->mpm_type != MPM_AC_CUDA)
+        return;
+
+    const SCACCtx *ctx = (const SCACCtx *)mpm_ctx->ctx;
+    /* NOTE(review): 32767 presumably mirrors the limit used when picking the
+     * 16 bit state table while building the ac ctx - confirm */
+    if (ctx->state_count < 32767)
+        (*ac_16_tables)++;
+    else
+        (*ac_32_tables)++;
+}
+
+/**
+ * \brief Counts how many of the engine's ac cuda mpm contexts fall into the
+ *        16 bit and how many into the 32 bit state table category, and calls
+ *        SCACConstructBoth16and32StateTables() if both kinds are present.
+ *
+ *        The contexts inspected are the tcp, udp and other packet contexts,
+ *        followed by the uri, hcbd, hhd, hrhd, hmd, hcd, hrud, stream, hsmd,
+ *        hscd and huad contexts.  All of them are looked up for directions 0
+ *        and 1, except for the "other" packet context (direction 0 only).
+ *
+ * \todo Technically it's generic to all mpms, but since we use ac only, the
+ *       code internally directly references ac and hence it has found its
+ *       home in this file, instead of util-mpm.c
+ *
+ * \param de_ctx Detection engine context holding the mpm context ids.
+ */
+void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx)
+{
+    int ac_16_tables = 0;
+    int ac_32_tables = 0;
+
+    /* packet contexts */
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 0),
+                            &ac_16_tables, &ac_32_tables);
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 1),
+                            &ac_16_tables, &ac_32_tables);
+
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 0),
+                            &ac_16_tables, &ac_32_tables);
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 1),
+                            &ac_16_tables, &ac_32_tables);
+
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_other_packet, 0),
+                            &ac_16_tables, &ac_32_tables);
+
+    /* remaining contexts, each in both directions */
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_uri, 0),
+                            &ac_16_tables, &ac_32_tables);
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_uri, 1),
+                            &ac_16_tables, &ac_32_tables);
+
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcbd, 0),
+                            &ac_16_tables, &ac_32_tables);
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcbd, 1),
+                            &ac_16_tables, &ac_32_tables);
+
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hhd, 0),
+                            &ac_16_tables, &ac_32_tables);
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hhd, 1),
+                            &ac_16_tables, &ac_32_tables);
+
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrhd, 0),
+                            &ac_16_tables, &ac_32_tables);
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrhd, 1),
+                            &ac_16_tables, &ac_32_tables);
+
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hmd, 0),
+                            &ac_16_tables, &ac_32_tables);
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hmd, 1),
+                            &ac_16_tables, &ac_32_tables);
+
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcd, 0),
+                            &ac_16_tables, &ac_32_tables);
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcd, 1),
+                            &ac_16_tables, &ac_32_tables);
+
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrud, 0),
+                            &ac_16_tables, &ac_32_tables);
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrud, 1),
+                            &ac_16_tables, &ac_32_tables);
+
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_stream, 0),
+                            &ac_16_tables, &ac_32_tables);
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_stream, 1),
+                            &ac_16_tables, &ac_32_tables);
+
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 0),
+                            &ac_16_tables, &ac_32_tables);
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 1),
+                            &ac_16_tables, &ac_32_tables);
+
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 0),
+                            &ac_16_tables, &ac_32_tables);
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 1),
+                            &ac_16_tables, &ac_32_tables);
+
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 0),
+                            &ac_16_tables, &ac_32_tables);
+    SCACCudaTallyStateTable(MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 1),
+                            &ac_16_tables, &ac_32_tables);
+
+    /* a mix of both table widths is in use */
+    if (ac_16_tables > 0 && ac_32_tables > 0)
+        SCACConstructBoth16and32StateTables();
+
+    SCLogDebug("Total mpm ac 16 bit state tables - %d\n", ac_16_tables);
+    SCLogDebug("Total mpm ac 32 bit state tables - %d\n", ac_32_tables);
+}
+
 /* \todos
 * - Use texture memory - Can we fit all the arrays into a 3d texture.
 *   Texture memory definitely offers slightly better performance even
@ -1607,20 +1858,20 @@ static void *SCACCudaDispatcher(void *arg)
        for (uint32_t i = 0; i < no_of_items; i++, i_op_start_offset++) {
            Packet *p = (Packet *)cb_data->p_buffer[i_op_start_offset];

-            p->cuda_gpu_matches =
+            p->cuda_pkt_vars.cuda_gpu_matches =
                cuda_results_buffer_h[((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2)];
-            if (p->cuda_gpu_matches != 0) {
-                memcpy(p->cuda_results,
+            if (p->cuda_pkt_vars.cuda_gpu_matches != 0) {
+                memcpy(p->cuda_pkt_vars.cuda_results,
                       cuda_results_buffer_h +
                       ((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2),
                       (cuda_results_buffer_h[((o_buffer[i_op_start_offset] -
                                                d_buffer_start_offset) * 2)] * sizeof(uint32_t)) + 4);
            }

-            SCMutexLock(&p->cuda_mutex);
-            p->cuda_done = 1;
-            SCMutexUnlock(&p->cuda_mutex);
-            SCCondSignal(&p->cuda_cond);
+            SCMutexLock(&p->cuda_pkt_vars.cuda_mutex);
+            p->cuda_pkt_vars.cuda_done = 1;
+            SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex);
+            SCCondSignal(&p->cuda_pkt_vars.cuda_cond);
        }
        if (no_of_items != 0)
            CudaBufferReportCulledConsumption(cb_data, &cb_culled_info);
@ -1666,25 +1917,25 @@ uint32_t SCACCudaPacketResultsProcessing(Packet *p, MpmCtx *mpm_ctx,
 {
    uint32_t u = 0;

-    while (!p->cuda_done) {
-        SCMutexLock(&p->cuda_mutex);
-        if (p->cuda_done) {
-            SCMutexUnlock(&p->cuda_mutex);
+    while (!p->cuda_pkt_vars.cuda_done) {
+        SCMutexLock(&p->cuda_pkt_vars.cuda_mutex);
+        if (p->cuda_pkt_vars.cuda_done) {
+            SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex);
            break;
        } else {
-            SCCondWait(&p->cuda_cond, &p->cuda_mutex);
-            SCMutexUnlock(&p->cuda_mutex);
+            SCCondWait(&p->cuda_pkt_vars.cuda_cond, &p->cuda_pkt_vars.cuda_mutex);
+            SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex);
        }
    } /* while */
-    p->cuda_done = 0;
-    p->cuda_mpm_enabled = 0;
+    p->cuda_pkt_vars.cuda_done = 0;
+    p->cuda_pkt_vars.cuda_mpm_enabled = 0;

-    uint32_t cuda_matches = p->cuda_gpu_matches;
+    uint32_t cuda_matches = p->cuda_pkt_vars.cuda_gpu_matches;
    if (cuda_matches == 0)
        return 0;

    uint32_t matches = 0;
-    uint32_t *results = p->cuda_results + 1;
+    uint32_t *results = p->cuda_pkt_vars.cuda_results + 1;
    uint8_t *buf = p->payload;
    SCACCtx *ctx = mpm_ctx->ctx;
    SCACOutputTable *output_table = ctx->output_table;
--- a/src/util-mpm-ac.h
+++ b/src/util-mpm-ac.h
@ -30,6 +30,8 @@

 #ifdef __SC_CUDA_SUPPORT__
 #include "util-cuda.h"
+#include "util-cuda-vars.h"
+#include "decode.h"
 #endif /* __SC_CUDA_SUPPORT__ */

 typedef struct SCACPattern_ {
@ -108,6 +110,85 @@ void MpmACRegister(void);
 #define MPM_AC_CUDA_MODULE_NAME "ac_cuda"
 #define MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME "ac_cuda_cb"

+/**
+ * \brief Queue a packet's payload on the CUDA AC buffer so the dispatcher
+ *        thread can run pattern matching on it.
+ *
+ *        If the packet was handed to the GPU earlier
+ *        (cuda_pkt_vars.cuda_mpm_enabled is set), this first blocks until
+ *        cuda_pkt_vars.cuda_done has been raised for it, then clears
+ *        cuda_done again.
+ *
+ *        The packet is not queued (and cuda_mpm_enabled is cleared) when:
+ *        - it has no payload,
+ *        - it carries PKT_NOPAYLOAD_INSPECTION or PKT_NOPACKET_INSPECTION,
+ *        - it carries PKT_ALLOC,
+ *        - its payload length is outside the configured min/max limits
+ *          (a limit of 0 disables that bound),
+ *        - no mpm context with patterns exists for its proto/direction,
+ *        - no slice could be obtained from the cuda buffer.
+ *
+ *        Slice layout written here: payload length (uint64_t when
+ *        __WORDSIZE is 64, uint32_t otherwise), the CUdeviceptr of the
+ *        state table, then the raw payload bytes.
+ *
+ * \param ctv Cuda thread vars holding the cuda buffer, the per proto and
+ *            direction mpm contexts and the payload size limits.
+ * \param p   Packet whose payload is to be buffered.
+ */
+static inline void CudaBufferPacket(CudaThreadVars *ctv, Packet *p)
+{
+    /* a previous gpu run on this packet may still be pending; wait for
+     * cuda_done to be set before the packet vars are reused */
+    if (p->cuda_pkt_vars.cuda_mpm_enabled) {
+        while (!p->cuda_pkt_vars.cuda_done) {
+            SCMutexLock(&p->cuda_pkt_vars.cuda_mutex);
+            /* re-check under the lock so a signal sent between the
+             * unlocked test above and the lock is not missed */
+            if (p->cuda_pkt_vars.cuda_done) {
+                SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex);
+                break;
+            } else {
+                SCCondWait(&p->cuda_pkt_vars.cuda_cond, &p->cuda_pkt_vars.cuda_mutex);
+                SCMutexUnlock(&p->cuda_pkt_vars.cuda_mutex);
+            }
+        }
+    }
+    p->cuda_pkt_vars.cuda_done = 0;
+
+    /* the two "no inspection" flags have to be OR'ed into the mask.
+     * AND'ing two different flag bits gives an empty mask, which would
+     * make this check a no-op */
+    if (p->payload_len == 0 ||
+        (p->flags & (PKT_NOPAYLOAD_INSPECTION | PKT_NOPACKET_INSPECTION)) ||
+        (p->flags & PKT_ALLOC) ||
+        (ctv->data_buffer_size_min_limit != 0 && p->payload_len < ctv->data_buffer_size_min_limit) ||
+        (p->payload_len > ctv->data_buffer_size_max_limit && ctv->data_buffer_size_max_limit != 0) ) {
+        p->cuda_pkt_vars.cuda_mpm_enabled = 0;
+        return;
+    }
+
+    /* pick the mpm context matching the packet's protocol and direction */
+    MpmCtx *mpm_ctx = NULL;
+    if (p->proto == IPPROTO_TCP) {
+        if (p->flowflags & FLOW_PKT_TOSERVER)
+            mpm_ctx = ctv->mpm_proto_tcp_ctx_ts;
+        else
+            mpm_ctx = ctv->mpm_proto_tcp_ctx_tc;
+    } else if (p->proto == IPPROTO_UDP) {
+        if (p->flowflags & FLOW_PKT_TOSERVER)
+            mpm_ctx = ctv->mpm_proto_udp_ctx_ts;
+        else
+            mpm_ctx = ctv->mpm_proto_udp_ctx_tc;
+    } else {
+        mpm_ctx = ctv->mpm_proto_other_ctx;
+    }
+    if (mpm_ctx == NULL || mpm_ctx->pattern_cnt == 0) {
+        p->cuda_pkt_vars.cuda_mpm_enabled = 0;
+        return;
+    }
+
+    /* the width of the length header follows the host word size */
+#if __WORDSIZE==64
+    CudaBufferSlice *slice = CudaBufferGetSlice(ctv->cuda_ac_cb,
+                                                p->payload_len + sizeof(uint64_t) + sizeof(CUdeviceptr),
+                                                (void *)p);
+    if (slice == NULL) {
+        SCLogError(SC_ERR_FATAL, "Error retrieving slice.  Please report "
+                   "this to dev.");
+        p->cuda_pkt_vars.cuda_mpm_enabled = 0;
+        return;
+    }
+    *((uint64_t *)(slice->buffer + slice->start_offset)) = p->payload_len;
+    *((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint64_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda;
+    memcpy(slice->buffer + slice->start_offset + sizeof(uint64_t) + sizeof(CUdeviceptr), p->payload, p->payload_len);
+#else
+    CudaBufferSlice *slice = CudaBufferGetSlice(ctv->cuda_ac_cb,
+                                                p->payload_len + sizeof(uint32_t) + sizeof(CUdeviceptr),
+                                                (void *)p);
+    if (slice == NULL) {
+        SCLogError(SC_ERR_FATAL, "Error retrieving slice.  Please report "
+                   "this to dev.");
+        p->cuda_pkt_vars.cuda_mpm_enabled = 0;
+        return;
+    }
+    *((uint32_t *)(slice->buffer + slice->start_offset)) = p->payload_len;
+    *((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint32_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda;
+    memcpy(slice->buffer + slice->start_offset + sizeof(uint32_t) + sizeof(CUdeviceptr), p->payload, p->payload_len);
+#endif
+    /* flag the packet as gpu-bound before the slice is marked done */
+    p->cuda_pkt_vars.cuda_mpm_enabled = 1;
+    SC_ATOMIC_SET(slice->done, 1);
+
+    /* cast to uint64_t so the argument width matches PRIu64 on 32 bit
+     * builds as well */
+    SCLogDebug("cuda ac buffering packet %p, payload_len - %"PRIu16" and deviceptr - %"PRIu64"\n",
+               p, p->payload_len, (uint64_t)((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda);
+
+    return;
+}

 void MpmACCudaRegister(void);
 void SCACConstructBoth16and32StateTables(void);
@ -117,6 +198,7 @@ void SCACCudaStartDispatcher(void);
 void SCACCudaKillDispatcher(void);
 uint32_t  SCACCudaPacketResultsProcessing(Packet *p, MpmCtx *mpm_ctx,
                                          PatternMatcherQueue *pmq);
+void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx);

 #endif /* __SC_CUDA_SUPPORT__ */

--- a/suricata.yaml.in
+++ b/suricata.yaml.in
@ -10,9 +10,9 @@
 # conservative 1024. A higher number will make sure CPU's/CPU cores will be
 # more easily kept busy, but may negatively impact caching.
 #
-# If you are using the CUDA pattern matcher (b2g_cuda below), different rules
-# apply. In that case try something like 4000 or more. This is because the CUDA
-# pattern matcher scans many packets in parallel.
+# If you are using the CUDA pattern matcher (mpm-algo: ac-cuda), different rules
+# apply. In that case try something like 60000 or more. This is because the CUDA
+# pattern matcher buffers and scans as many packets as possible in parallel.
 #max-pending-packets: 1024

 # Runmode the engine should use. Please check --list-runmodes to get the available