Version 1 of AC Cuda.

pull/400/head
Author: Anoop Saldanha, committed by Victor Julien
parent 2de59fc235
commit 17c763f855

@@ -257,6 +257,7 @@ util-cpu.c util-cpu.h \
util-crypt.c util-crypt.h \
util-cuda.c util-cuda.h \
util-cuda-buffer.c util-cuda-buffer.h \
util-cuda-handlers.c util-cuda-handlers.h \
util-daemon.c util-daemon.h \
util-debug.c util-debug.h \
util-debug-filters.c util-debug-filters.h \
@@ -325,7 +326,7 @@ win32-misc.c win32-misc.h \
win32-service.c win32-service.h \
win32-syslog.h
-EXTRA_DIST = util-mpm-b2g-cuda-kernel.cu ptxdump.py
+EXTRA_DIST = util-mpm-ac-cuda-kernel.cu ptxdump.py
# set the include path found by configure
INCLUDES= $(all_includes)
@@ -345,7 +346,7 @@ BUILT_SOURCES = cuda-ptxdump.h
suricata_SOURCES += cuda-ptxdump.h
suricata_CUDA_KERNELS = \
-util-mpm-b2g-cuda-kernel.cu
+util-mpm-ac-cuda-kernel.cu
NVCCFLAGS=-O2
@@ -355,7 +356,8 @@ SUFFIXES = \
.ptx_sm_12 \
.ptx_sm_13 \
.ptx_sm_20 \
-.ptx_sm_21
+.ptx_sm_21 \
+.ptx_sm_30
PTXS = $(suricata_CUDA_KERNELS:.cu=.ptx_sm_10)
PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_11)
@@ -363,6 +365,7 @@ PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_12)
PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_13)
PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_20)
PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_21)
PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_30)
.cu.ptx_sm_10:
$(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_10 -ptx $<
@@ -382,6 +385,9 @@ PTXS += $(suricata_CUDA_KERNELS:.cu=.ptx_sm_21)
.cu.ptx_sm_21:
$(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_21 -ptx $<
.cu.ptx_sm_30:
$(NVCC) $(NVCCFLAGS) -o $@ -arch=sm_30 -ptx $<
cuda-ptxdump.h: $(PTXS)
$(PYTHON) ptxdump.py cuda-ptxdump $(PTXS)
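The rules above compile each kernel in suricata_CUDA_KERNELS once per target architecture (sm_10 through the newly added sm_30), and ptxdump.py then embeds every generated PTX file into cuda-ptxdump.h as C string data, which SCCudaPtxDumpGetModule() later looks up under a "<kernel>_sm_<major><minor>" name. A hedged sketch of the shape of the generated header (the symbol naming is illustrative, not ptxdump.py's confirmed output):

    /* sketch only: roughly what ptxdump.py generates into cuda-ptxdump.h */
    static const char util_mpm_ac_cuda_kernel_ptx_sm_30[] =
        ".version 3.0\n"
        ".target sm_30\n"
        /* ... the rest of the PTX module, escaped as a C string ... */;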

@@ -28,9 +28,12 @@
#define COUNTERS
#include "suricata-common.h"
#include "threadvars.h"
#ifdef __SC_CUDA_SUPPORT__
#include "util-cuda-buffer.h"
#endif /* __SC_CUDA_SUPPORT__ */
typedef enum {
CHECKSUM_VALIDATION_DISABLE,
CHECKSUM_VALIDATION_ENABLE,
@@ -487,6 +490,14 @@ typedef struct Packet_
#ifdef PROFILING
PktProfiling profile;
#endif
#ifdef __SC_CUDA_SUPPORT__
uint8_t cuda_mpm_enabled;
uint8_t cuda_done;
uint16_t cuda_gpu_matches;
SCMutex cuda_mutex;
SCCondT cuda_cond;
uint32_t cuda_results[(UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT * 2) + 1];
#endif
} Packet;
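The CUDA fields above act as a per-packet mailbox between the decode path and the dispatcher thread: cuda_mutex/cuda_cond guard the cuda_done flag, and cuda_results receives the GPU output as a count word followed by (offset, state) pairs, which is why it is sized at twice the maximum buffered payload length plus one. A minimal reader sketch, inferred from the SCACCudaSearch kernels and SCACCudaPacketResultsProcessing later in this commit:

    /* sketch: walking one packet's GPU results (layout inferred from the kernel) */
    uint32_t n_words = p->cuda_results[0];            /* words written; 2 per match */
    for (uint32_t i = 0; i < n_words; i += 2) {
        uint32_t offset = p->cuda_results[1 + i];     /* match end offset in payload */
        uint32_t state  = p->cuda_results[2 + i];     /* matching state, flag byte cleared */
        /* ... look up output_table[state] and confirm case-sensitive patterns ... */
    }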
#define DEFAULT_PACKET_SIZE (1500 + ETHERNET_HEADER_LEN)
@@ -570,6 +581,24 @@ typedef struct DecodeThreadVars_
uint16_t counter_defrag_ipv6_reassembled;
uint16_t counter_defrag_ipv6_timeouts;
uint16_t counter_defrag_max_hit;
#ifdef __SC_CUDA_SUPPORT__
/* cb - CudaBuffer */
CudaBufferData *cuda_ac_cb;
MpmCtx *mpm_proto_other_ctx;
MpmCtx *mpm_proto_tcp_ctx_ts;
MpmCtx *mpm_proto_udp_ctx_ts;
MpmCtx *mpm_proto_tcp_ctx_tc;
MpmCtx *mpm_proto_udp_ctx_tc;
uint16_t data_buffer_size_max_limit;
uint16_t data_buffer_size_min_limit;
uint8_t mpm_is_cuda;
#endif
} DecodeThreadVars;
/**
@@ -586,12 +615,27 @@ typedef struct DecodeThreadVars_
/**
* \brief Initialize a packet structure for use.
*/
-#define PACKET_INITIALIZE(p) { \
#ifdef __SC_CUDA_SUPPORT__
#include "util-cuda-handlers.h"
#include "util-mpm.h"
#define PACKET_INITIALIZE(p) do { \
memset((p), 0x00, SIZE_OF_PACKET); \
SCMutexInit(&(p)->tunnel_mutex, NULL); \
PACKET_RESET_CHECKSUMS((p)); \
(p)->pkt = ((uint8_t *)(p)) + sizeof(Packet); \
(p)->livedev = NULL; \
SCMutexInit(&(p)->cuda_mutex, NULL); \
SCCondInit(&(p)->cuda_cond, NULL); \
} while (0)
#else
#define PACKET_INITIALIZE(p) { \
SCMutexInit(&(p)->tunnel_mutex, NULL); \
PACKET_RESET_CHECKSUMS((p)); \
(p)->pkt = ((uint8_t *)(p)) + sizeof(Packet); \
(p)->livedev = NULL; \
}
#endif
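One detail worth calling out in the hunk above: the CUDA variant wraps its body in do { ... } while (0) so the macro expands to a single statement; the bare-brace form breaks under a dangling if/else because the caller's trailing semicolon orphans the else. A minimal illustration (the caller is hypothetical):

    if (reuse)
        PACKET_INITIALIZE(p);   /* a bare-brace macro would expand to "{ ... };" */
    else                        /* and the stray ';' would orphan this else,     */
        PacketFree(p);          /* turning it into a compile error               */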
/**
* \brief Recycle a packet structure for reuse.

@@ -53,6 +53,9 @@
#include "util-debug.h"
#include "util-print.h"
#include "util-memcmp.h"
#ifdef __SC_CUDA_SUPPORT__
#include "util-mpm-ac.h"
#endif
/** \todo make it possible to use multiple pattern matcher algorithms next to
each other. */
@@ -221,11 +224,23 @@ uint32_t PacketPatternSearch(DetectEngineThreadCtx *det_ctx, Packet *p)
if (mpm_ctx == NULL)
SCReturnInt(0);
#ifdef __SC_CUDA_SUPPORT__
if (p->cuda_mpm_enabled && p->pkt_src == PKT_SRC_WIRE) {
ret = SCACCudaPacketResultsProcessing(p, mpm_ctx, &det_ctx->pmq);
} else {
ret = mpm_table[mpm_ctx->mpm_type].Search(mpm_ctx,
&det_ctx->mtc,
&det_ctx->pmq,
p->payload,
p->payload_len);
}
#else
ret = mpm_table[mpm_ctx->mpm_type].Search(mpm_ctx,
&det_ctx->mtc,
&det_ctx->pmq,
p->payload,
p->payload_len);
#endif
SCReturnInt(ret);
}

@@ -845,7 +845,11 @@ static uint8_t DetectEngineCtxLoadConf(DetectEngineCtx *de_ctx) {
/* for now, since we still haven't implemented any intelligence into
* understanding the patterns and distributing mpm_ctx across sgh */
if (de_ctx->mpm_matcher == MPM_AC || de_ctx->mpm_matcher == MPM_AC_GFBS ||
#ifdef __SC_CUDA_SUPPORT__
de_ctx->mpm_matcher == MPM_AC_BS || de_ctx->mpm_matcher == MPM_AC_CUDA) {
#else
de_ctx->mpm_matcher == MPM_AC_BS) {
#endif
de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE;
} else {
de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_FULL;
@@ -854,6 +858,15 @@ static uint8_t DetectEngineCtxLoadConf(DetectEngineCtx *de_ctx) {
if (strcmp(sgh_mpm_context, "single") == 0) {
de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE;
} else if (strcmp(sgh_mpm_context, "full") == 0) {
#ifdef __SC_CUDA_SUPPORT__
if (de_ctx->mpm_matcher == MPM_AC_CUDA) {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "You can't use "
"the cuda version of our mpm ac, i.e. \"ac-cuda\" "
"along with \"full\" \"sgh-mpm-context\". "
"Allowed values are \"single\" and \"auto\".");
exit(EXIT_FAILURE);
}
#endif
de_ctx->sgh_mpm_context = ENGINE_SGH_MPM_FACTORY_CONTEXT_FULL;
} else {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "You have supplied an "

@@ -190,6 +190,7 @@
#include "util-optimize.h"
#include "util-vector.h"
#include "util-path.h"
#include "util-mpm-ac.h"
#include "runmodes.h"
@@ -1085,6 +1086,12 @@ static inline void DetectMpmPrefilter(DetectEngineCtx *de_ctx,
}
if (p->payload_len > 0 && (!(p->flags & PKT_NOPAYLOAD_INSPECTION))) {
+if (!(p->flags & PKT_STREAM_ADD) && (det_ctx->sgh->flags & SIG_GROUP_HEAD_MPM_STREAM)) {
+*sms_runflags |= SMS_USED_PM;
+PACKET_PROFILING_DETECT_START(p, PROF_DETECT_MPM_PKT_STREAM);
+PacketPatternSearchWithStreamCtx(det_ctx, p);
+PACKET_PROFILING_DETECT_END(p, PROF_DETECT_MPM_PKT_STREAM);
+}
if (det_ctx->sgh->flags & SIG_GROUP_HEAD_MPM_PACKET) {
/* run the multi packet matcher against the payload of the packet */
SCLogDebug("search: (%p, maxlen %" PRIu32 ", sgh->sig_cnt %" PRIu32 ")",
@@ -1096,12 +1103,6 @@ static inline void DetectMpmPrefilter(DetectEngineCtx *de_ctx,
*sms_runflags |= SMS_USED_PM;
}
-if (!(p->flags & PKT_STREAM_ADD) && (det_ctx->sgh->flags & SIG_GROUP_HEAD_MPM_STREAM)) {
-*sms_runflags |= SMS_USED_PM;
-PACKET_PROFILING_DETECT_START(p, PROF_DETECT_MPM_PKT_STREAM);
-PacketPatternSearchWithStreamCtx(det_ctx, p);
-PACKET_PROFILING_DETECT_END(p, PROF_DETECT_MPM_PKT_STREAM);
-}
}
}
@@ -4387,6 +4388,255 @@ int SigAddressPrepareStage5(DetectEngineCtx *de_ctx) {
return 0;
}
#ifdef __SC_CUDA_SUPPORT__
static void DetermineCudaStateTableSize(DetectEngineCtx *de_ctx)
{
MpmCtx *mpm_ctx = NULL;
int ac_16_tables = 0;
int ac_32_tables = 0;
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_udp_packet, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_other_packet, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_uri, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_uri, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcbd, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcbd, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hhd, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hhd, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrhd, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrhd, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hmd, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hmd, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcd, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hcd, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrud, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hrud, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_stream, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_stream, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 0);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 1);
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
SCACCtx *ctx = (SCACCtx *)mpm_ctx->ctx;
if (ctx->state_count < 32767)
ac_16_tables++;
else
ac_32_tables++;
}
if (ac_16_tables > 0 && ac_32_tables > 0)
SCACConstructBoth16and32StateTables();
SCLogDebug("Total mpm ac 16 bit state tables - %d\n", ac_16_tables);
SCLogDebug("Total mpm ac 32 bit state tables - %d\n", ac_32_tables);
}
#endif
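DetermineCudaStateTableSize above repeats an identical probe for every mpm context the factory can hand out. The same logic compresses into a table-driven loop; a sketch under the assumption that the probes array simply lists the context-id/direction pairs probed above:

    static void CountCudaStateTables(DetectEngineCtx *de_ctx, int *ac16, int *ac32)
    {
        const struct { int32_t id; int dir; } probes[] = {
            { de_ctx->sgh_mpm_context_proto_tcp_packet, 0 },
            { de_ctx->sgh_mpm_context_proto_tcp_packet, 1 },
            /* ... one entry per context/direction pair probed above ... */
        };
        for (size_t i = 0; i < sizeof(probes) / sizeof(probes[0]); i++) {
            MpmCtx *mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx,
                                                            probes[i].id, probes[i].dir);
            if (mpm_ctx->mpm_type != MPM_AC_CUDA)
                continue;
            if (((SCACCtx *)mpm_ctx->ctx)->state_count < 32767)
                (*ac16)++;
            else
                (*ac32)++;
        }
    }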
/**
* \brief Convert the signature list into the runtime match structure.
*
@@ -4441,6 +4691,26 @@ int SigGroupBuild(DetectEngineCtx *de_ctx)
if (de_ctx->sgh_mpm_context == ENGINE_SGH_MPM_FACTORY_CONTEXT_SINGLE) {
MpmCtx *mpm_ctx = NULL;
#ifdef __SC_CUDA_SUPPORT__
if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) {
/* set to the default for now; remove this once the state table sizing below is fixed */
SCACConstructBoth16and32StateTables();
MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id);
if (cuda_context == 0) {
SCLogError(SC_ERR_FATAL, "cuda context is NULL.");
exit(EXIT_FAILURE);
}
int r = SCCudaCtxPushCurrent(cuda_context);
if (r < 0) {
SCLogError(SC_ERR_FATAL, "context push failed.");
exit(EXIT_FAILURE);
}
}
#endif
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_proto_tcp_packet, 0);
if (mpm_table[de_ctx->mpm_matcher].Prepare != NULL) {
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
@@ -4562,7 +4832,6 @@ int SigGroupBuild(DetectEngineCtx *de_ctx)
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
}
//printf("hsmd- %d\n", mpm_ctx->pattern_cnt);
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hsmd, 1);
if (mpm_table[de_ctx->mpm_matcher].Prepare != NULL) {
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
@@ -4574,7 +4843,6 @@ int SigGroupBuild(DetectEngineCtx *de_ctx)
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
}
//printf("hscd- %d\n", mpm_ctx->pattern_cnt);
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_hscd, 1);
if (mpm_table[de_ctx->mpm_matcher].Prepare != NULL) {
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
@@ -4586,7 +4854,6 @@ int SigGroupBuild(DetectEngineCtx *de_ctx)
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
}
//printf("huad- %d\n", mpm_ctx->pattern_cnt);
mpm_ctx = MpmFactoryGetMpmCtxForProfile(de_ctx, de_ctx->sgh_mpm_context_huad, 1);
if (mpm_table[de_ctx->mpm_matcher].Prepare != NULL) {
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
@@ -4616,6 +4883,21 @@ int SigGroupBuild(DetectEngineCtx *de_ctx)
mpm_table[de_ctx->mpm_matcher].Prepare(mpm_ctx);
}
//printf("hrhhd- %d\n", mpm_ctx->pattern_cnt);
#ifdef __SC_CUDA_SUPPORT__
if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) {
int r = SCCudaCtxPopCurrent(NULL);
if (r < 0) {
SCLogError(SC_ERR_FATAL, "cuda context pop failure.");
exit(EXIT_FAILURE);
}
}
/* too late to call this either way; it should be called after the ac goto
* table is built. \todo Support this. */
DetermineCudaStateTableSize(de_ctx);
#endif
}
// SigAddressPrepareStage5(de_ctx);

@@ -43,6 +43,19 @@
#include "util-profiling.h"
#include "runmode-unix-socket.h"
#ifdef __SC_CUDA_SUPPORT__
#include "util-cuda.h"
#include "util-cuda-buffer.h"
#include "util-mpm-ac.h"
#include "util-cuda-handlers.h"
#include "detect-engine.h"
#include "detect-engine-mpm.h"
static DetectEngineCtx *cuda_de_ctx = NULL;
#endif /* __SC_CUDA_SUPPORT__ */
extern uint8_t suricata_ctl_flags;
extern int max_pending_packets;
@@ -111,6 +124,15 @@ void TmModuleDecodePcapFileRegister (void) {
tmm_modules[TMM_DECODEPCAPFILE].flags = TM_FLAG_DECODE_TM;
}
#ifdef __SC_CUDA_SUPPORT__
void DecodePcapFileSetCudaDeCtx(DetectEngineCtx *de_ctx)
{
cuda_de_ctx = de_ctx;
return;
}
#endif
void PcapFileCallbackLoop(char *user, struct pcap_pkthdr *h, u_char *pkt) {
SCEnter();
@@ -320,6 +342,90 @@ TmEcode ReceivePcapFileThreadDeinit(ThreadVars *tv, void *data) {
SCReturnInt(TM_ECODE_OK);
}
#ifdef __SC_CUDA_SUPPORT__
static inline void DecodePcapFileBufferPacket(DecodeThreadVars *dtv, Packet *p)
{
if (p->cuda_mpm_enabled) {
while (!p->cuda_done) {
SCMutexLock(&p->cuda_mutex);
if (p->cuda_done) {
SCMutexUnlock(&p->cuda_mutex);
break;
} else {
SCCondWait(&p->cuda_cond, &p->cuda_mutex);
SCMutexUnlock(&p->cuda_mutex);
}
}
}
p->cuda_done = 0;
if (p->payload_len == 0 ||
(p->flags & (PKT_NOPAYLOAD_INSPECTION | PKT_NOPACKET_INSPECTION)) ||
(p->flags & PKT_ALLOC) ||
(dtv->data_buffer_size_min_limit != 0 && p->payload_len < dtv->data_buffer_size_min_limit) ||
(p->payload_len > dtv->data_buffer_size_max_limit && dtv->data_buffer_size_max_limit != 0) ) {
p->cuda_mpm_enabled = 0;
return;
}
MpmCtx *mpm_ctx = NULL;
if (p->proto == IPPROTO_TCP) {
if (p->flowflags & FLOW_PKT_TOSERVER)
mpm_ctx = dtv->mpm_proto_tcp_ctx_ts;
else
mpm_ctx = dtv->mpm_proto_tcp_ctx_tc;
} else if (p->proto == IPPROTO_UDP) {
if (p->flowflags & FLOW_PKT_TOSERVER)
mpm_ctx = dtv->mpm_proto_udp_ctx_ts;
else
mpm_ctx = dtv->mpm_proto_udp_ctx_tc;
} else {
mpm_ctx = dtv->mpm_proto_other_ctx;
}
if (mpm_ctx == NULL || mpm_ctx->pattern_cnt == 0) {
p->cuda_mpm_enabled = 0;
return;
}
#if __WORDSIZE==64
CudaBufferSlice *slice = CudaBufferGetSlice(dtv->cuda_ac_cb,
p->payload_len + sizeof(uint64_t) + sizeof(CUdeviceptr),
(void *)p);
if (slice == NULL) {
SCLogError(SC_ERR_FATAL, "Error retrieving slice. Please report "
"this to dev.");
p->cuda_mpm_enabled = 0;
return;
}
*((uint64_t *)(slice->buffer + slice->start_offset)) = p->payload_len;
*((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint64_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda;
memcpy(slice->buffer + slice->start_offset + sizeof(uint64_t) + sizeof(CUdeviceptr), p->payload, p->payload_len);
#else
CudaBufferSlice *slice = CudaBufferGetSlice(dtv->cuda_ac_cb,
p->payload_len + sizeof(uint32_t) + sizeof(CUdeviceptr),
(void *)p);
if (slice == NULL) {
SCLogError(SC_ERR_FATAL, "Error retrieving slice. Please report "
"this to dev.");
p->cuda_mpm_enabled = 0;
return;
}
*((uint32_t *)(slice->buffer + slice->start_offset)) = p->payload_len;
*((CUdeviceptr *)(slice->buffer + slice->start_offset + sizeof(uint32_t))) = ((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda;
memcpy(slice->buffer + slice->start_offset + sizeof(uint32_t) + sizeof(CUdeviceptr), p->payload, p->payload_len);
#endif
p->cuda_mpm_enabled = 1;
SC_ATOMIC_SET(slice->done, 1);
SCLogDebug("cuda ac buffering packet %p, payload_len - %"PRIu16" and deviceptr - %"PRIu64"\n",
p, p->payload_len, (unsigned long)((SCACCtx *)(mpm_ctx->ctx))->state_table_u32_cuda);
return;
}
#endif /* __SC_CUDA_SUPPORT__ */
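Each record DecodePcapFileBufferPacket writes into the shared CudaBuffer slice is self-describing, so the kernel needs no side channel: a payload length word, the device pointer of the state table for this packet's mpm context, then the raw payload. Schematically, for a 64-bit build (the offsets are exactly the ones SCACCudaSearch64 reads back):

    /* record layout inside a CudaBuffer slice, 64-bit build
     *
     *   +0    uint64_t     payload_len
     *   +8    CUdeviceptr  state_table_u32_cuda  (this context's table on the GPU)
     *   +16   uint8_t[]    payload bytes, payload_len of them
     *
     * 32-bit builds use a uint32_t length at +0, the pointer at +4, payload at +8 */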
double prev_signaled_ts = 0;
TmEcode DecodePcapFile(ThreadVars *tv, Packet *p, void *data, PacketQueue *pq, PacketQueue *postpq)
@@ -351,11 +457,49 @@ TmEcode DecodePcapFile(ThreadVars *tv, Packet *p, void *data, PacketQueue *pq, P
TimeSet(&p->ts);
/* call the decoder */
pcap_g.Decoder(tv, dtv, p, GET_PKT_DATA(p), GET_PKT_LEN(p), pq);
#ifdef DEBUG
BUG_ON(p->pkt_src != PKT_SRC_WIRE && p->pkt_src != PKT_SRC_FFR_V2);
#endif
#ifdef __SC_CUDA_SUPPORT__
if (dtv->mpm_is_cuda)
DecodePcapFileBufferPacket(dtv, p);
#endif
SCReturnInt(TM_ECODE_OK);
}
#ifdef __SC_CUDA_SUPPORT__
static int DecodePcapFileThreadInitCuda(DecodeThreadVars *dtv)
{
if (PatternMatchDefaultMatcher() != MPM_AC_CUDA)
return 0;
MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
if (conf == NULL) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile.");
return -1;
}
dtv->mpm_is_cuda = 1;
dtv->cuda_ac_cb = CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME);
dtv->data_buffer_size_max_limit = conf->data_buffer_size_max_limit;
dtv->data_buffer_size_min_limit = conf->data_buffer_size_min_limit;
dtv->mpm_proto_tcp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 0);
dtv->mpm_proto_tcp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_tcp_packet, 1);
dtv->mpm_proto_udp_ctx_ts = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 0);
dtv->mpm_proto_udp_ctx_tc = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_udp_packet, 1);
dtv->mpm_proto_other_ctx = MpmFactoryGetMpmCtxForProfile(cuda_de_ctx, cuda_de_ctx->sgh_mpm_context_proto_other_packet, 0);
return 0;
}
#endif /* __SC_CUDA_SUPPORT__ */
TmEcode DecodePcapFileThreadInit(ThreadVars *tv, void *initdata, void **data)
{
SCEnter();
@@ -367,6 +511,11 @@ TmEcode DecodePcapFileThreadInit(ThreadVars *tv, void *initdata, void **data)
DecodeRegisterPerfCounters(dtv, tv);
#ifdef __SC_CUDA_SUPPORT__
if (DecodePcapFileThreadInitCuda(dtv) < 0)
SCReturnInt(TM_ECODE_FAILED);
#endif
*data = (void *)dtv;
SCReturnInt(TM_ECODE_OK);

@@ -26,6 +26,9 @@
void TmModuleReceivePcapFileRegister (void);
void TmModuleDecodePcapFileRegister (void);
#ifdef __SC_CUDA_SUPPORT__
void DecodePcapFileSetCudaDeCtx(DetectEngineCtx *de_ctx);
#endif
#endif /* __SOURCE_PCAP_FILE_H__ */

@@ -188,6 +188,10 @@
#include "util-memcmp.h"
#include "util-proto-name.h"
#include "util-spm-bm.h"
#ifdef __SC_CUDA_SUPPORT__
#include "util-cuda-buffer.h"
#include "util-mpm-ac.h"
#endif
/*
* we put this here, because we only use it here in main.
@@ -1302,6 +1306,7 @@ int main(int argc, char **argv)
SCCudaListCards();
exit(EXIT_SUCCESS);
}
CudaBufferInit();
#endif
if (!CheckValidDaemonModes(daemon, run_mode)) {
@@ -1313,9 +1318,6 @@ int main(int argc, char **argv)
TimeInit();
SupportFastPatternForSigMatchTypes();
-/* load the pattern matchers */
-MpmTableSetup();
if (run_mode != RUNMODE_UNITTEST &&
!list_keywords &&
!list_app_layer_protocols) {
@@ -1363,6 +1365,9 @@ int main(int argc, char **argv)
}
}
+/* load the pattern matchers */
+MpmTableSetup();
AppLayerDetectProtoThreadInit();
if (list_app_layer_protocols) {
AppLayerListSupportedProtocols();
@@ -1703,6 +1708,9 @@ int main(int argc, char **argv)
DetectProtoTests();
DetectPortTests();
SCAtomicRegisterTests();
#ifdef __SC_CUDA_SUPPORT__
CudaBufferRegisterUnittests();
#endif
if (list_unittests) {
UtListTests(regex_arg);
}
@@ -1819,6 +1827,10 @@ int main(int argc, char **argv)
"context failed.");
exit(EXIT_FAILURE);
}
#ifdef __SC_CUDA_SUPPORT__
if (PatternMatchDefaultMatcher() == MPM_AC_CUDA)
DecodePcapFileSetCudaDeCtx(de_ctx);
#endif /* __SC_CUDA_SUPPORT__ */
SCClassConfLoadClassficationConfigFile(de_ctx);
SCRConfLoadReferenceConfigFile(de_ctx);
@@ -1961,6 +1973,11 @@ int main(int argc, char **argv)
SCPerfSpawnThreads();
}
#ifdef __SC_CUDA_SUPPORT__
if (PatternMatchDefaultMatcher() == MPM_AC_CUDA)
SCACCudaStartDispatcher();
#endif
/* Check if the alloted queues have at least 1 reader and writer */
TmValidateQueueState();
@@ -2118,5 +2135,11 @@ int main(int argc, char **argv)
SC_ATOMIC_DESTROY(engine_stage);
#ifdef __SC_CUDA_SUPPORT__
if (PatternMatchDefaultMatcher() == MPM_AC_CUDA)
MpmCudaBufferDeSetup();
CudaHandlerFreeProfiles();
#endif
exit(engine_retval);
}

@@ -0,0 +1,357 @@
/* Copyright (C) 2007-2012 Open Information Security Foundation
*
* You can copy, redistribute or modify this Program under the terms of
* the GNU General Public License version 2 as published by the Free
* Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* version 2 along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
/**
* \file
*
* \author Anoop Saldanha <anoopsaldanha@gmail.com>
*/
/* compiled in only if we have a CUDA-enabled device on the machine, with the
* toolkit and the driver installed */
#ifdef __SC_CUDA_SUPPORT__
#include "suricata-common.h"
#include "util-error.h"
#include "util-debug.h"
#include "conf.h"
#include "util-cuda.h"
#include "util-cuda-handlers.h"
/* file only exists if cuda is enabled */
#include "cuda-ptxdump.h"
/************************conf file profile section**********************/
typedef struct CudaHandlerConfProfile_ {
char *name;
void *ctx;
void (*Free)(void *);
struct CudaHandlerConfProfile_ *next;
} CudaHandlerConfProfile;
static CudaHandlerConfProfile *conf_profiles = NULL;
/* protects above var */
static SCMutex mutex = PTHREAD_MUTEX_INITIALIZER;
void CudaHandlerAddCudaProfileFromConf(const char *name,
void *(*Callback)(ConfNode *node),
void (*Free)(void *))
{
/* we don't do data validation */
SCMutexLock(&mutex);
CudaHandlerConfProfile *tmp_cp = conf_profiles;
while (tmp_cp != NULL && strcasecmp(name, tmp_cp->name) != 0)
tmp_cp = tmp_cp->next;
if (tmp_cp != NULL) {
SCLogError(SC_ERR_INVALID_ARGUMENT, "We already have a cuda conf "
"profile by the name \"%s\" registered.", name);
exit(EXIT_FAILURE);
}
char tmp[200];
int r = snprintf(tmp, sizeof(tmp), "%s%s", "cuda.", name);
if (r < 0) {
SCLogError(SC_ERR_FATAL, "snprintf failure.");
exit(EXIT_FAILURE);
} else if (r >= (int)sizeof(tmp)) {
SCLogError(SC_ERR_FATAL, "buffer not big enough to write param.");
exit(EXIT_FAILURE);
}
void *ctx = Callback(ConfGetNode(tmp));
if (ctx == NULL) {
SCMutexUnlock(&mutex);
return;
}
CudaHandlerConfProfile *new_cp = SCMalloc(sizeof(CudaHandlerConfProfile));
if (new_cp == NULL)
exit(EXIT_FAILURE);
memset(new_cp, 0, sizeof(CudaHandlerConfProfile));
new_cp->name = SCStrdup(name);
if (new_cp->name == NULL)
exit(EXIT_FAILURE);
new_cp->ctx = ctx;
new_cp->Free = Free;
if (conf_profiles == NULL) {
conf_profiles = new_cp;
} else {
new_cp->next = conf_profiles;
conf_profiles = new_cp;
}
SCMutexUnlock(&mutex);
return;
}
void *CudaHandlerGetCudaProfile(const char *name)
{
SCMutexLock(&mutex);
CudaHandlerConfProfile *tmp_cp = conf_profiles;
while (tmp_cp != NULL && strcasecmp(name, tmp_cp->name) != 0)
tmp_cp = tmp_cp->next;
if (tmp_cp == NULL) {
SCMutexUnlock(&mutex);
return NULL;
}
SCMutexUnlock(&mutex);
return tmp_cp->ctx;
}
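A hedged usage sketch of the profile API above: a subsystem registers a parser for its cuda.<name> conf subtree once at startup, after which any thread can fetch the parsed profile by name. The callback names here are stand-ins, not the actual mpm code:

    /* registration at startup; MpmCudaConfParse/MpmCudaConfFree are illustrative */
    CudaHandlerAddCudaProfileFromConf("mpm", MpmCudaConfParse, MpmCudaConfFree);

    /* later, from any thread */
    MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
    if (conf == NULL) {
        /* no cuda.mpm node was present, or the parse callback returned NULL */
    }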
void CudaHandlerFreeProfiles(void)
{
SCMutexLock(&mutex);
CudaHandlerConfProfile *tmp = conf_profiles;
while (tmp != NULL) {
CudaHandlerConfProfile *curr = tmp;
tmp = tmp->next;
SCFree(curr->name);
if (curr->Free != NULL)
curr->Free(curr->ctx);
SCFree(curr);
}
SCMutexUnlock(&mutex);
return;
}
/*******************cuda context related data section*******************/
/* we use a concept where every cuda device has only 1 context. If
* a section in the engine wants to use a device and tries to open a context
* on it, we first check if a context is already created for the device and if
* so we return it. If not we create a new one and update with the entry */
static CUcontext *cuda_contexts = NULL;
static int no_of_cuda_contexts = 0;
typedef struct CudaHandlerModuleData_ {
char *name;
void *data;
struct CudaHandlerModuleData_ *next;
} CudaHandlerModuleData;
typedef struct CudaHandlerModule_ {
char *name;
/* the context used by this module */
CUcontext context;
/* the device on which the above context was created */
int device_id;
CudaHandlerModuleData *module_data;
struct CudaHandlerModule_ *next;
} CudaHandlerModule;
static CudaHandlerModule *cudahl_modules = NULL;
CUcontext CudaHandlerModuleGetContext(const char *name, int device_id)
{
SCMutexLock(&mutex);
CudaHandlerModule *module = cudahl_modules;
while (module != NULL && strcasecmp(module->name, name) != 0)
module = module->next;
if (module != NULL) {
if (module->device_id != device_id) {
SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Module already "
"registered, but the new device_id is different "
"from the already registered device_id.");
exit(EXIT_FAILURE);
}
SCMutexUnlock(&mutex);
return module->context;
}
CudaHandlerModule *new_module = SCMalloc(sizeof(CudaHandlerModule));
if (new_module == NULL)
exit(EXIT_FAILURE);
memset(new_module, 0, sizeof(CudaHandlerModule));
new_module->device_id = device_id;
new_module->name = SCStrdup(name);
if (new_module->name == NULL)
exit(EXIT_FAILURE);
if (cudahl_modules == NULL) {
cudahl_modules = new_module;
} else {
new_module->next = cudahl_modules;
cudahl_modules = new_module;
}
if (no_of_cuda_contexts <= device_id) {
cuda_contexts = SCRealloc(cuda_contexts, sizeof(CUcontext) * (device_id + 1));
if (cuda_contexts == NULL)
exit(EXIT_FAILURE);
memset(cuda_contexts + no_of_cuda_contexts, 0,
sizeof(CUcontext) * ((device_id + 1) - no_of_cuda_contexts));
no_of_cuda_contexts = device_id + 1;
}
if (cuda_contexts[device_id] == 0) {
SCCudaDevices *devices = SCCudaGetDeviceList();
if (SCCudaCtxCreate(&cuda_contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
devices->devices[device_id]->device) == -1) {
SCLogDebug("ctxcreate failure.");
exit(EXIT_FAILURE);
}
}
new_module->context = cuda_contexts[device_id];
SCMutexUnlock(&mutex);
return cuda_contexts[device_id];
}
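The invariant CudaHandlerModuleGetContext maintains: one CUcontext per physical device, shared by every module that asks for that device, so the engine never holds duplicate contexts. For example (the second module name is hypothetical):

    CUcontext c1 = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, 0);
    CUcontext c2 = CudaHandlerModuleGetContext("another_module", 0);
    /* c1 == c2 -- only the first call actually created a context on device 0 */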
void CudaHandlerModuleStoreData(const char *module_name,
const char *data_name, void *data_ptr)
{
SCMutexLock(&mutex);
CudaHandlerModule *module = cudahl_modules;
while (module != NULL && strcasecmp(module->name, module_name) != 0)
module = module->next;
if (module == NULL) {
SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Trying to retrieve data "
"\"%s\" from module \"%s\" that hasn't been registered "
"yet.", module_name, data_name);
exit(EXIT_FAILURE);
}
CudaHandlerModuleData *data = module->module_data;
while (data != NULL && (strcasecmp(data_name, data->name) != 0)) {
data = data->next;
}
if (data != NULL) {
SCLogWarning(SC_ERR_CUDA_HANDLER_ERROR, "Data \"%s\" already "
"registered for this module \"%s\".", data_name,
module_name);
SCMutexUnlock(&mutex);
goto end;
}
CudaHandlerModuleData *new_data = SCMalloc(sizeof(CudaHandlerModuleData));
if (new_data == NULL)
exit(EXIT_FAILURE);
memset(new_data, 0, sizeof(CudaHandlerModuleData));
new_data->name = SCStrdup(data_name);
if (new_data->name == NULL)
exit(EXIT_FAILURE);
new_data->data = data_ptr;
if (module->module_data == NULL) {
module->module_data = new_data;
} else {
new_data->next = module->module_data;
module->module_data = new_data;
}
SCMutexUnlock(&mutex);
end:
return;
}
void *CudaHandlerModuleGetData(const char *module_name, const char *data_name)
{
SCMutexLock(&mutex);
CudaHandlerModule *module = cudahl_modules;
while (module != NULL && strcasecmp(module->name, module_name) != 0)
module = module->next;
if (module == NULL) {
SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Trying to retrieve data "
"\"%s\" from module \"%s\" that hasn't been registered "
"yet.", module_name, data_name);
SCMutexUnlock(&mutex);
return NULL;
}
CudaHandlerModuleData *data = module->module_data;
while (data != NULL && (strcasecmp(data_name, data->name) != 0)) {
data = data->next;
}
if (data == NULL) {
SCLogInfo("Data \"%s\" already registered for this module \"%s\". "
"Returning it.", data_name, module_name);
SCMutexUnlock(&mutex);
return NULL;
}
SCMutexUnlock(&mutex);
return data->data;
}
int CudaHandlerGetCudaModule(CUmodule *p_module, const char *ptx_image)
{
#define CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE 15
int i = 0;
/* select the ptx image based on the compute capability supported by all
* devices (i.e. the lowest) */
char *image = SCMalloc(strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE);
if (unlikely(image == NULL)) {
exit(EXIT_FAILURE);
}
memset(image, 0x00, strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE);
int major = INT_MAX;
int minor = INT_MAX;
SCCudaDevices *devices = SCCudaGetDeviceList();
for (i = 0; i < devices->count; i++){
if (devices->devices[i]->major_rev < major){
major = devices->devices[i]->major_rev;
minor = devices->devices[i]->minor_rev;
}
if (devices->devices[i]->major_rev == major &&
devices->devices[i]->minor_rev < minor){
minor = devices->devices[i]->minor_rev;
}
}
snprintf(image,
strlen(ptx_image) + CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE,
"%s_sm_%u%u",
ptx_image, major, minor);
/* we don't have a cuda module associated with this module. Create a
* cuda module, update the module with this cuda module reference and
* then return the module reference back to the calling function using
* the argument */
SCLogDebug("Loading kernel module: %s\n", image);
if (SCCudaModuleLoadData(p_module, (void *)SCCudaPtxDumpGetModule(image)) == -1)
goto error;
SCFree(image);
return 0;
error:
SCFree(image);
return -1;
#undef CUDA_HANDLER_GET_CUDA_MODULE_BUFFER_EXTRA_SPACE
}
#endif /* __SC_CUDA_SUPPORT__ */

@@ -0,0 +1,50 @@
/* Copyright (C) 2007-2012 Open Information Security Foundation
*
* You can copy, redistribute or modify this Program under the terms of
* the GNU General Public License version 2 as published by the Free
* Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* version 2 along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
/**
* \file
*
* \author Anoop Saldanha <anoopsaldanha@gmail.com>
*/
#ifndef __UTIL_CUDA_HANDLERS__H__
#define __UTIL_CUDA_HANDLERS__H__
#include "conf.h"
#include "util-cuda.h"
/************************conf file profile section**********************/
void CudaHandlerAddCudaProfileFromConf(const char *name,
void *(*Callback)(ConfNode *node),
void (*Free)(void *));
void *CudaHandlerGetCudaProfile(const char *name);
void CudaHandlerFreeProfiles(void);
/*******************cuda context related data section*******************/
#define CUDA_HANDLER_MODULE_DATA_TYPE_MEMORY_HOST 0
#define CUDA_HANDLER_MODULE_DATA_TYPE_MEMORY_DEVICE 1
#define CUDA_HANDLER_MODULE_DATA_TYPE_CUDA_BUFFER 2
CUcontext CudaHandlerModuleGetContext(const char *module_name, int device_id);
void CudaHandlerModuleStoreData(const char *module_name,
const char *data_name, void *data_ptr);
void *CudaHandlerModuleGetData(const char *module_name, const char *data_name);
int CudaHandlerGetCudaModule(CUmodule *p_module, const char *ptx_image);
#endif /* __UTIL_CUDA_HANDLERS__H__ */

@@ -2346,7 +2346,7 @@ int SCCudaMemAllocHost(void **pp, size_t byte_size)
}
result = cuMemAllocHost(pp, byte_size);
-if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC) == -1)
+if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC_HOST) == -1)
goto error;
return 0;

@@ -161,7 +161,7 @@ const char * SCErrorToString(SCError err)
CASE_CODE (SC_ERR_CUDA_HANDLER_ERROR);
CASE_CODE (SC_ERR_TM_THREADS_ERROR);
CASE_CODE (SC_ERR_TM_MODULES_ERROR);
-CASE_CODE (SC_ERR_B2G_CUDA_ERROR);
+CASE_CODE (SC_ERR_AC_CUDA_ERROR);
CASE_CODE (SC_ERR_INVALID_YAML_CONF_ENTRY);
CASE_CODE (SC_ERR_TMQ_ALREADY_REGISTERED);
CASE_CODE (SC_ERR_CONFLICTING_RULE_KEYWORDS);

@@ -165,7 +165,7 @@ typedef enum {
SC_ERR_CUDA_HANDLER_ERROR,
SC_ERR_TM_THREADS_ERROR,
SC_ERR_TM_MODULES_ERROR,
-SC_ERR_B2G_CUDA_ERROR,
+SC_ERR_AC_CUDA_ERROR,
SC_ERR_INVALID_YAML_CONF_ENTRY,
SC_ERR_TMQ_ALREADY_REGISTERED,
SC_ERR_CONFLICTING_RULE_KEYWORDS,

@@ -0,0 +1,96 @@
/* Copyright (C) 2007-2012 Open Information Security Foundation
*
* You can copy, redistribute or modify this Program under the terms of
* the GNU General Public License version 2 as published by the Free
* Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* version 2 along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
/**
* \file
*
* \author Anoop Saldanha <anoopsaldanha@gmail.com>
*
* The Cuda kernel for MPM AC.
*
* \todo - This is a basic version of the kernel.
* - Support 16 bit state tables.
* - Texture memory.
* - Multiple threads per block. Make use of
* shared memory/texture memory.
*/
extern "C"
__global__ void SCACCudaSearch64(unsigned char *d_buffer,
unsigned int d_buffer_start_offset,
unsigned int *o_buffer,
unsigned int *results_buffer,
unsigned int nop,
unsigned char *tolower)
{
unsigned int u = 0;
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= nop)
return;
unsigned int buflen = *((unsigned long *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset)));
unsigned int (*state_table_u32)[256] =
(unsigned int (*)[256])*((unsigned long *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 8));
unsigned char *buf = (d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 16);
unsigned int state = 0;
unsigned int matches = 0;
unsigned int *results = (results_buffer + ((o_buffer[tid] - d_buffer_start_offset) * 2) + 1);
for (u = 0; u < buflen; u++) {
state = state_table_u32[state & 0x00FFFFFF][tolower[buf[u]]];
if (state & 0xFF000000) {
results[matches++] = u;
results[matches++] = state & 0x00FFFFFF;
}
}
*(results - 1) = matches;
return;
}
extern "C"
__global__ void SCACCudaSearch32(unsigned char *d_buffer,
unsigned int d_buffer_start_offset,
unsigned int *o_buffer,
unsigned int *results_buffer,
unsigned int nop,
unsigned char *tolower)
{
unsigned int u = 0;
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= nop)
return;
unsigned int buflen = *((unsigned int *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset)));
unsigned int (*state_table_u32)[256] =
(unsigned int (*)[256])*((unsigned int *)(d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 4));
unsigned char *buf = (d_buffer + (o_buffer[tid] - d_buffer_start_offset) + 8);
unsigned int state = 0;
unsigned int matches = 0;
unsigned int *results = (results_buffer + ((o_buffer[tid] - d_buffer_start_offset) * 2) + 1);
for (u = 0; u < buflen; u++) {
state = state_table_u32[state & 0x00FFFFFF][tolower[buf[u]]];
if (state & 0xFF000000) {
results[matches++] = u;
results[matches++] = state & 0x00FFFFFF;
}
}
*(results - 1) = matches;
return;
}
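Both kernels rely on the state-table encoding prepared on the host: each 32-bit entry keeps the next-state index in its low 24 bits, and a non-zero top byte flags "a pattern ends in this state" (the 16-bit tables use bit 15 the same way; see SCACClubOutputStatePresenceWithDeltaTable below). One transition, spelled out; tolower_tbl stands for the 256-entry lowercase table the dispatcher uploads so the GPU never calls into libc:

    /* one AC step, exactly as the kernels perform it */
    state = state_table_u32[state & 0x00FFFFFF][tolower_tbl[buf[u]]];
    if (state & 0xFF000000) {                      /* flag byte set: output state */
        results[matches++] = u;                    /* payload offset of the match */
        results[matches++] = state & 0x00FFFFFF;   /* the state, flag stripped    */
    }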

@@ -49,12 +49,22 @@
#include "suricata.h"
#include "detect.h"
#include "util-mpm-ac.h"
#include "conf.h"
#include "util-debug.h"
#include "util-unittest.h"
#include "util-memcmp.h"
#include "util-mpm-ac.h"
#ifdef __SC_CUDA_SUPPORT__
#include "util-mpm.h"
#include "tm-threads.h"
#include "util-mpm.h"
#include "detect-engine-mpm.h"
#include "util-cuda.h"
#include "util-cuda-handlers.h"
#endif /* __SC_CUDA_SUPPORT__ */
void SCACInitCtx(MpmCtx *, int);
void SCACInitThreadCtx(MpmCtx *, MpmThreadCtx *, uint32_t);
@@ -78,6 +88,8 @@ void SCACRegisterTests(void);
#define STATE_QUEUE_CONTAINER_SIZE 65536
static int construct_both_16_and_32_state_tables = 0;
/**
* \brief Helper structure used by AC during state table creation
*/
@@ -87,31 +99,6 @@ typedef struct StateQueue_ {
int bot;
} StateQueue;
-/**
- * \brief Register the aho-corasick mpm.
- */
-void MpmACRegister(void)
-{
-mpm_table[MPM_AC].name = "ac";
-/* don't need this. isn't that awesome? no more chopping and blah blah */
-mpm_table[MPM_AC].max_pattern_length = 0;
-mpm_table[MPM_AC].InitCtx = SCACInitCtx;
-mpm_table[MPM_AC].InitThreadCtx = SCACInitThreadCtx;
-mpm_table[MPM_AC].DestroyCtx = SCACDestroyCtx;
-mpm_table[MPM_AC].DestroyThreadCtx = SCACDestroyThreadCtx;
-mpm_table[MPM_AC].AddPattern = SCACAddPatternCS;
-mpm_table[MPM_AC].AddPatternNocase = SCACAddPatternCI;
-mpm_table[MPM_AC].Prepare = SCACPreparePatterns;
-mpm_table[MPM_AC].Search = SCACSearch;
-mpm_table[MPM_AC].Cleanup = NULL;
-mpm_table[MPM_AC].PrintCtx = SCACPrintInfo;
-mpm_table[MPM_AC].PrintThreadCtx = SCACPrintSearchStats;
-mpm_table[MPM_AC].RegisterUnittests = SCACRegisterTests;
-return;
-}
/**
* \internal
* \brief Initialize the AC context with user specified conf parameters. We
@@ -759,7 +746,7 @@ static inline void SCACCreateDeltaTable(MpmCtx *mpm_ctx)
int ascii_code = 0;
int32_t r_state = 0;
-if (ctx->state_count < 32767) {
+if ((ctx->state_count < 32767) || construct_both_16_and_32_state_tables) {
ctx->state_table_u16 = SCMalloc(ctx->state_count *
sizeof(SC_AC_STATE_TYPE_U16) * 256);
if (ctx->state_table_u16 == NULL) {
@@ -797,7 +784,9 @@ static inline void SCACCreateDeltaTable(MpmCtx *mpm_ctx)
}
}
}
-} else {
+}
+if (!(ctx->state_count < 32767) || construct_both_16_and_32_state_tables) {
/* create space for the state table. We could have used the existing goto
* table, but since we have it set to hold 32 bit state values, we will create
* a new state table here of type SC_AC_STATE_TYPE(current set to uint16_t) */
@@ -850,7 +839,7 @@ static inline void SCACClubOutputStatePresenceWithDeltaTable(MpmCtx *mpm_ctx)
uint32_t state = 0;
uint32_t temp_state = 0;
-if (ctx->state_count < 32767) {
+if ((ctx->state_count < 32767) || construct_both_16_and_32_state_tables) {
for (state = 0; state < ctx->state_count; state++) {
for (ascii_code = 0; ascii_code < 256; ascii_code++) {
temp_state = ctx->state_table_u16[state & 0x7FFF][ascii_code];
@@ -858,7 +847,9 @@ static inline void SCACClubOutputStatePresenceWithDeltaTable(MpmCtx *mpm_ctx)
ctx->state_table_u16[state & 0x7FFF][ascii_code] |= (1 << 15);
}
}
-} else {
+}
+if (!(ctx->state_count < 32767) || construct_both_16_and_32_state_tables) {
for (state = 0; state < ctx->state_count; state++) {
for (ascii_code = 0; ascii_code < 256; ascii_code++) {
temp_state = ctx->state_table_u32[state & 0x00FFFFFF][ascii_code];
@@ -1032,6 +1023,25 @@ int SCACPreparePatterns(MpmCtx *mpm_ctx)
/* prepare the state table required by AC */
SCACPrepareStateTable(mpm_ctx);
#ifdef __SC_CUDA_SUPPORT__
if (mpm_ctx->mpm_type == MPM_AC_CUDA) {
int r = SCCudaMemAlloc(&ctx->state_table_u32_cuda,
ctx->state_count * sizeof(unsigned int) * 256);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
exit(EXIT_FAILURE);
}
r = SCCudaMemcpyHtoD(ctx->state_table_u32_cuda,
ctx->state_table_u32,
ctx->state_count * sizeof(unsigned int) * 256);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure.");
exit(EXIT_FAILURE);
}
}
#endif
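/* for scale: the device table just copied is state_count * 256 *
 * sizeof(unsigned int) bytes, so e.g. a 10,000-state table costs
 * 10,000 * 256 * 4 = 10,240,000 bytes (~10 MB) of GPU memory --
 * and that is per MPM_AC_CUDA context */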
/* free all the stored patterns. Should save us a good 100-200 mbs */
for (i = 0; i < mpm_ctx->pattern_cnt; i++) {
if (ctx->parray[i] != NULL) {
@@ -1166,7 +1176,8 @@ void SCACDestroyCtx(MpmCtx *mpm_ctx)
mpm_ctx->memory_cnt++;
mpm_ctx->memory_size -= (ctx->state_count *
sizeof(SC_AC_STATE_TYPE_U16) * 256);
-} else if (ctx->state_table_u32 != NULL) {
+}
+if (ctx->state_table_u32 != NULL) {
SCFree(ctx->state_table_u32);
ctx->state_table_u32 = NULL;
@@ -1394,6 +1405,533 @@ void SCACPrintInfo(MpmCtx *mpm_ctx)
return;
}
/****************************Cuda side of things****************************/
#ifdef __SC_CUDA_SUPPORT__
/* \todos
* - Use texture memory - Can we fit all the arrays into a 3d texture.
* Texture memory definitely offers slightly better performance even
* on gpus that offer cache for global memory.
* - Packetpool - modify to support > 65k max pending packets. We are
* hitting packetpool limit currently even with 65k packets.
* - Use streams. We have tried overlapping parsing results from the
* previous call with invoking the next call.
* - Offer higher priority to decode threads.
* - Modify pcap file mode to support reading from multiple pcap files
* and hence we will have multiple receive threads.
* - Split state table into many small pieces and have multiple threads
* run each small state table on the same payload.
* - Used a config preference of l1 over shared memory with no noticeable
* perf increase. Explore it in detail over cards/architectures.
* - Constant memory performance sucked. Explore it in detail.
* - Currently all our state tables are small. Implement 16 bit state
* tables on priority.
* - Introduce profiling.
* - Retrieve sgh before buffer packet.
* - Buffer smsgs too.
*/
void SCACConstructBoth16and32StateTables(void)
{
construct_both_16_and_32_state_tables = 1;
return;
}
/* \todo Reduce offset buffer size. Probably 100,000 entries would be sufficient. */
static void *SCACCudaDispatcher(void *arg)
{
#define BLOCK_SIZE 32
int r = 0;
ThreadVars *tv = (ThreadVars *)arg;
MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
uint32_t sleep_interval_ms = conf->batching_timeout;
SCLogInfo("AC Cuda Mpm Dispatcher using a timeout of "
"\"%"PRIu32"\" micro-seconds", sleep_interval_ms);
CudaBufferData *cb_data =
CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME,
MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME);
CUcontext cuda_context =
CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id);
if (cuda_context == 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "context is NULL.");
exit(EXIT_FAILURE);
}
r = SCCudaCtxPushCurrent(cuda_context);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "context push failed.");
exit(EXIT_FAILURE);
}
CUmodule cuda_module = 0;
if (CudaHandlerGetCudaModule(&cuda_module, "util-mpm-ac-cuda-kernel") < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda module.");
exit(EXIT_FAILURE);
}
CUfunction kernel = 0;
#if __WORDSIZE==64
if (SCCudaModuleGetFunction(&kernel, cuda_module, "SCACCudaSearch64") == -1) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving kernel");
exit(EXIT_FAILURE);
}
#else
if (SCCudaModuleGetFunction(&kernel, cuda_module, "SCACCudaSearch32") == -1) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving kernel");
exit(EXIT_FAILURE);
}
#endif
uint8_t g_u8_lowercasetable[256];
for (int c = 0; c < 256; c++)
g_u8_lowercasetable[c] = tolower(c);
CUdeviceptr cuda_g_u8_lowercasetable_d = 0;
CUdeviceptr cuda_packets_buffer_d = 0;
CUdeviceptr cuda_offset_buffer_d = 0;
CUdeviceptr cuda_results_buffer_d = 0;
uint32_t *cuda_results_buffer_h = NULL;
r = SCCudaMemAlloc(&cuda_g_u8_lowercasetable_d, sizeof(g_u8_lowercasetable));
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
exit(EXIT_FAILURE);
}
r = SCCudaMemcpyHtoD(cuda_g_u8_lowercasetable_d, g_u8_lowercasetable, sizeof(g_u8_lowercasetable));
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure.");
exit(EXIT_FAILURE);
}
r = SCCudaMemAlloc(&cuda_packets_buffer_d, conf->gpu_transfer_size);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
exit(EXIT_FAILURE);
}
r = SCCudaMemAlloc(&cuda_offset_buffer_d, conf->gpu_transfer_size * 4);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
exit(EXIT_FAILURE);
}
r = SCCudaMemAlloc(&cuda_results_buffer_d, conf->gpu_transfer_size * 8);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
exit(EXIT_FAILURE);
}
r = SCCudaMemAllocHost((void **)&cuda_results_buffer_h, conf->gpu_transfer_size * 8);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemAlloc failure.");
exit(EXIT_FAILURE);
}
CudaBufferCulledInfo cb_culled_info;
memset(&cb_culled_info, 0, sizeof(cb_culled_info));
TmThreadsSetFlag(tv, THV_INIT_DONE);
while (1) {
if (TmThreadsCheckFlag(tv, THV_KILL))
break;
usleep(sleep_interval_ms);
/**************** 1 SEND ****************/
CudaBufferCullCompletedSlices(cb_data, &cb_culled_info);
if (cb_culled_info.no_of_items == 0)
continue;
#if 0
SCLogInfo("1 - cb_culled_info.no_of_items-%"PRIu32" "
"cb_culled_info.buffer_len - %"PRIu32" "
"cb_culled_info.average size - %f "
"cb_culled_info.d_buffer_start_offset - %"PRIu32" "
"cb_culled_info.op_buffer_start_offset - %"PRIu32" "
"cb_data.no_of_items - %"PRIu32" "
"cb_data.d_buffer_read - %"PRIu32" "
"cb_data.d_buffer_write - %"PRIu32" "
"cb_data.op_buffer_read - %"PRIu32" "
"cb_data.op_buffer_write - %"PRIu32"\n",
cb_culled_info.no_of_items,
cb_culled_info.d_buffer_len,
cb_culled_info.d_buffer_len / (float)cb_culled_info.no_of_items,
cb_culled_info.d_buffer_start_offset,
cb_culled_info.op_buffer_start_offset,
cb_data->no_of_items,
cb_data->d_buffer_read,
cb_data->d_buffer_write,
cb_data->op_buffer_read,
cb_data->op_buffer_write);
#endif
r = SCCudaMemcpyHtoD(cuda_packets_buffer_d, (cb_data->d_buffer + cb_culled_info.d_buffer_start_offset), cb_culled_info.d_buffer_len);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure.");
exit(EXIT_FAILURE);
}
r = SCCudaMemcpyHtoD(cuda_offset_buffer_d, (cb_data->o_buffer + cb_culled_info.op_buffer_start_offset), sizeof(uint32_t) * cb_culled_info.no_of_items);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyHtoD failure.");
exit(EXIT_FAILURE);
}
void *args[] = { &cuda_packets_buffer_d,
&cb_culled_info.d_buffer_start_offset,
&cuda_offset_buffer_d,
&cuda_results_buffer_d,
&cb_culled_info.no_of_items,
&cuda_g_u8_lowercasetable_d };
r = SCCudaLaunchKernel(kernel,
(cb_culled_info.no_of_items / BLOCK_SIZE) + 1, 1, 1,
BLOCK_SIZE, 1, 1,
0, 0,
args, NULL);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaLaunchKernel failure.");
exit(EXIT_FAILURE);
}
r = SCCudaMemcpyDtoH(cuda_results_buffer_h, cuda_results_buffer_d, sizeof(uint32_t) * (cb_culled_info.d_buffer_len * 2));
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaMemcpyDtoH failure.");
exit(EXIT_FAILURE);
}
/**************** 1 SYNCHRO ****************/
r = SCCudaCtxSynchronize();
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "SCCudaCtxSynchronize failure.");
exit(EXIT_FAILURE);
}
/************* 1 Parse Results ************/
uint32_t i_op_start_offset = cb_culled_info.op_buffer_start_offset;
uint32_t no_of_items = cb_culled_info.no_of_items;
uint32_t *o_buffer = cb_data->o_buffer;
uint32_t d_buffer_start_offset = cb_culled_info.d_buffer_start_offset;
for (uint32_t i = 0; i < no_of_items; i++, i_op_start_offset++) {
Packet *p = (Packet *)cb_data->p_buffer[i_op_start_offset];
p->cuda_gpu_matches =
cuda_results_buffer_h[((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2)];
if (p->cuda_gpu_matches != 0) {
memcpy(p->cuda_results,
cuda_results_buffer_h +
((o_buffer[i_op_start_offset] - d_buffer_start_offset) * 2),
(cuda_results_buffer_h[((o_buffer[i_op_start_offset] -
d_buffer_start_offset) * 2)] * sizeof(uint32_t)) + 4);
}
SCMutexLock(&p->cuda_mutex);
p->cuda_done = 1;
SCMutexUnlock(&p->cuda_mutex);
SCCondSignal(&p->cuda_cond);
}
if (no_of_items != 0)
CudaBufferReportCulledConsumption(cb_data, &cb_culled_info);
} /* while (1) */
r = SCCudaModuleUnload(cuda_module);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error unloading cuda module.");
exit(EXIT_FAILURE);
}
r = SCCudaMemFree(cuda_packets_buffer_d);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory.");
exit(EXIT_FAILURE);
}
r = SCCudaMemFree(cuda_offset_buffer_d);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory.");
exit(EXIT_FAILURE);
}
r = SCCudaMemFree(cuda_results_buffer_d);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda device memory.");
exit(EXIT_FAILURE);
}
r = SCCudaMemFreeHost(cuda_results_buffer_h);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory.");
exit(EXIT_FAILURE);
}
TmThreadsSetFlag(tv, THV_RUNNING_DONE);
TmThreadWaitForFlag(tv, THV_DEINIT);
TmThreadsSetFlag(tv, THV_CLOSED);
return NULL;
#undef BLOCK_SIZE
}
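Two numbers in the dispatcher above deserve a gloss. The launch is one GPU thread per culled packet in blocks of BLOCK_SIZE (32): a batch of, say, 1000 packets becomes (1000 / 32) + 1 = 32 blocks of 32 threads, with the kernel's tid >= nop guard idling the surplus threads of the last block. And the batching window is simply the usleep of conf->batching_timeout microseconds between culls, trading a little per-packet latency for larger, better-amortized host-to-device transfers.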
uint32_t SCACCudaPacketResultsProcessing(Packet *p, MpmCtx *mpm_ctx,
PatternMatcherQueue *pmq)
{
uint32_t u = 0;
while (!p->cuda_done) {
SCMutexLock(&p->cuda_mutex);
if (p->cuda_done) {
SCMutexUnlock(&p->cuda_mutex);
break;
} else {
SCCondWait(&p->cuda_cond, &p->cuda_mutex);
SCMutexUnlock(&p->cuda_mutex);
}
} /* while */
p->cuda_done = 0;
p->cuda_mpm_enabled = 0;
uint32_t cuda_matches = p->cuda_gpu_matches;
if (cuda_matches == 0)
return 0;
uint32_t matches = 0;
uint32_t *results = p->cuda_results + 1;
uint8_t *buf = p->payload;
SCACCtx *ctx = mpm_ctx->ctx;
SCACOutputTable *output_table = ctx->output_table;
SCACPatternList *pid_pat_list = ctx->pid_pat_list;
for (u = 0; u < cuda_matches; u += 2) {
uint32_t offset = results[u];
uint32_t state = results[u + 1];
/* we should technically be doing state & 0x00FFFFFF, but we don't
* since the cuda kernel does that for us */
uint32_t no_of_entries = output_table[state].no_of_entries;
/* we should technically be doing state & 0x00FFFFFF, but we don't
* since the cuda kernel does that for us */
uint32_t *pids = output_table[state].pids;
uint32_t k;
/* note that this is not a verbatim copy from SCACSearch(). We
* don't copy the pattern id into the pattern_id_array. That's
* the only change */
for (k = 0; k < no_of_entries; k++) {
if (pids[k] & 0xFFFF0000) {
if (SCMemcmp(pid_pat_list[pids[k] & 0x0000FFFF].cs,
buf + offset - pid_pat_list[pids[k] & 0x0000FFFF].patlen + 1,
pid_pat_list[pids[k] & 0x0000FFFF].patlen) != 0) {
/* inside loop */
if (pid_pat_list[pids[k] & 0x0000FFFF].case_state != 3) {
continue;
}
}
if (pmq->pattern_id_bitarray[(pids[k] & 0x0000FFFF) / 8] & (1 << ((pids[k] & 0x0000FFFF) % 8))) {
;
} else {
pmq->pattern_id_bitarray[(pids[k] & 0x0000FFFF) / 8] |= (1 << ((pids[k] & 0x0000FFFF) % 8));
}
matches++;
} else {
if (pmq->pattern_id_bitarray[pids[k] / 8] & (1 << (pids[k] % 8))) {
;
} else {
pmq->pattern_id_bitarray[pids[k] / 8] |= (1 << (pids[k] % 8));
}
matches++;
}
}
}
return matches;
}
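The handoff between the dispatcher and this function (and DecodePcapFileBufferPacket in source-pcap-file.c) is a standard condition-variable protocol: the waiter re-checks cuda_done under the mutex before sleeping, and the dispatcher sets the flag under that same mutex before signalling, so a wakeup cannot slip between the check and the wait. In miniature:

    /* waiter (detect thread)                  dispatcher (cuda thread)
     *
     *   lock(cuda_mutex)                        lock(cuda_mutex)
     *   while (!cuda_done)                      cuda_done = 1
     *       cond_wait(cuda_cond, cuda_mutex)    unlock(cuda_mutex)
     *   unlock(cuda_mutex)                      cond_signal(cuda_cond)
     */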
void SCACCudaStartDispatcher(void)
{
/* create the threads */
ThreadVars *tv = TmThreadCreate("Cuda_Mpm_AC_Dispatcher",
NULL, NULL,
NULL, NULL,
"custom", SCACCudaDispatcher, 0);
if (tv == NULL) {
SCLogError(SC_ERR_THREAD_CREATE, "Error creating a thread for "
"ac cuda dispatcher. Killing engine.");
exit(EXIT_FAILURE);
}
if (TmThreadSpawn(tv) != 0) {
SCLogError(SC_ERR_THREAD_SPAWN, "Failed to spawn thread for "
"ac cuda dispatcher. Killing engine.");
exit(EXIT_FAILURE);
}
return;
}
int MpmCudaBufferSetup(void)
{
int r = 0;
MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
if (conf == NULL) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile.");
return -1;
}
CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id);
if (cuda_context == 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda context.");
return -1;
}
r = SCCudaCtxPushCurrent(cuda_context);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error pushing cuda context.");
return -1;
}
uint8_t *d_buffer = NULL;
uint32_t *o_buffer = NULL;
void **p_buffer = NULL;
r = SCCudaMemAllocHost((void *)&d_buffer, conf->cb_buffer_size);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failure.");
return -1;
}
SCLogInfo("Allocated a cuda d_buffer - %"PRIu32" bytes", conf->cb_buffer_size);
r = SCCudaMemAllocHost((void *)&o_buffer, sizeof(uint32_t) * UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failue.");
return -1;
}
r = SCCudaMemAllocHost((void *)&p_buffer, sizeof(void *) * UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Cuda alloc host failure.");
return -1;
}
r = SCCudaCtxPopCurrent(NULL);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "cuda context pop failure.");
return -1;
}
CudaBufferData *cb = CudaBufferRegisterNew(d_buffer, conf->cb_buffer_size, o_buffer, p_buffer, UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT);
if (cb == NULL) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error registering new cb instance.");
return -1;
}
CudaHandlerModuleStoreData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME, cb);
return 0;
}
int MpmCudaBufferDeSetup(void)
{
int r = 0;
MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
if (conf == NULL) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile.");
return -1;
}
CudaBufferData *cb_data = CudaHandlerModuleGetData(MPM_AC_CUDA_MODULE_NAME, MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME);
BUG_ON(cb_data == NULL);
CUcontext cuda_context = CudaHandlerModuleGetContext(MPM_AC_CUDA_MODULE_NAME, conf->device_id);
if (cuda_context == 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error retrieving cuda context.");
return -1;
}
r = SCCudaCtxPushCurrent(cuda_context);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error pushing cuda context.");
return -1;
}
r = SCCudaMemFreeHost(cb_data->d_buffer);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory.");
return -1;
}
r = SCCudaMemFreeHost(cb_data->o_buffer);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory.");
return -1;
}
r = SCCudaMemFreeHost(cb_data->p_buffer);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error freeing cuda host memory.");
return -1;
}
r = SCCudaCtxPopCurrent(NULL);
if (r < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "cuda context pop failure.");
return -1;
}
CudaBufferDeRegister(cb_data);
return 0;
}
#endif /* __SC_CUDA_SUPPORT__ */
/************************** Mpm Registration ***************************/
/**
* \brief Register the aho-corasick mpm.
*/
void MpmACRegister(void)
{
mpm_table[MPM_AC].name = "ac";
/* a max pattern length of 0 means no limit - no pattern chopping needed */
mpm_table[MPM_AC].max_pattern_length = 0;
mpm_table[MPM_AC].InitCtx = SCACInitCtx;
mpm_table[MPM_AC].InitThreadCtx = SCACInitThreadCtx;
mpm_table[MPM_AC].DestroyCtx = SCACDestroyCtx;
mpm_table[MPM_AC].DestroyThreadCtx = SCACDestroyThreadCtx;
mpm_table[MPM_AC].AddPattern = SCACAddPatternCS;
mpm_table[MPM_AC].AddPatternNocase = SCACAddPatternCI;
mpm_table[MPM_AC].Prepare = SCACPreparePatterns;
mpm_table[MPM_AC].Search = SCACSearch;
mpm_table[MPM_AC].Cleanup = NULL;
mpm_table[MPM_AC].PrintCtx = SCACPrintInfo;
mpm_table[MPM_AC].PrintThreadCtx = SCACPrintSearchStats;
mpm_table[MPM_AC].RegisterUnittests = SCACRegisterTests;
return;
}
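/* Illustrative only: once registered, the engine invokes the matcher
 * through the table rather than calling SCACSearch() directly, e.g.
 * (ctx/pmq setup elided):
 *
 *   uint32_t cnt = mpm_table[MPM_AC].Search(mpm_ctx, mpm_thread_ctx,
 *                                           pmq, buf, buflen);
 */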
#ifdef __SC_CUDA_SUPPORT__
/**
* \brief Register the aho-corasick cuda mpm.
*/
void MpmACCudaRegister(void)
{
mpm_table[MPM_AC_CUDA].name = "ac-cuda";
/* a max pattern length of 0 means no limit - no pattern chopping needed */
mpm_table[MPM_AC_CUDA].max_pattern_length = 0;
mpm_table[MPM_AC_CUDA].InitCtx = SCACInitCtx;
mpm_table[MPM_AC_CUDA].InitThreadCtx = SCACInitThreadCtx;
mpm_table[MPM_AC_CUDA].DestroyCtx = SCACDestroyCtx;
mpm_table[MPM_AC_CUDA].DestroyThreadCtx = SCACDestroyThreadCtx;
mpm_table[MPM_AC_CUDA].AddPattern = SCACAddPatternCS;
mpm_table[MPM_AC_CUDA].AddPatternNocase = SCACAddPatternCI;
mpm_table[MPM_AC_CUDA].Prepare = SCACPreparePatterns;
mpm_table[MPM_AC_CUDA].Search = SCACSearch;
mpm_table[MPM_AC_CUDA].Cleanup = NULL;
mpm_table[MPM_AC_CUDA].PrintCtx = SCACPrintInfo;
mpm_table[MPM_AC_CUDA].PrintThreadCtx = SCACPrintSearchStats;
mpm_table[MPM_AC_CUDA].RegisterUnittests = SCACRegisterTests;
if (PatternMatchDefaultMatcher() == MPM_AC_CUDA) {
MpmCudaConf *conf = CudaHandlerGetCudaProfile("mpm");
if (conf == NULL) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error obtaining cuda mpm profile.");
exit(EXIT_FAILURE);
}
if (MpmCudaBufferSetup() < 0) {
SCLogError(SC_ERR_AC_CUDA_ERROR, "Error setting up env for ac cuda");
exit(EXIT_FAILURE);
}
}
return;
}
#endif /* __SC_CUDA_SUPPORT__ */
/*************************************Unittests********************************/
#ifdef UNITTESTS

@ -22,9 +22,16 @@
*
*/
#ifndef __UTIL_MPM_AC__H__
#define __UTIL_MPM_AC__H__
#define SC_AC_STATE_TYPE_U16 uint16_t
#define SC_AC_STATE_TYPE_U32 uint32_t
#ifdef __SC_CUDA_SUPPORT__
#include "util-cuda.h"
#endif /* __SC_CUDA_SUPPORT__ */
typedef struct SCACPattern_ {
/* length of the pattern */
uint16_t len;
@ -79,6 +86,11 @@ typedef struct SCACCtx_ {
/* the size of each state */
uint16_t single_state_size;
uint16_t max_pat_id;
#ifdef __SC_CUDA_SUPPORT__
CUdeviceptr state_table_u16_cuda;
CUdeviceptr state_table_u32_cuda;
#endif /* __SC_CUDA_SUPPORT__ */
} SCACCtx;
typedef struct SCACThreadCtx_ {
@ -89,3 +101,24 @@ typedef struct SCACThreadCtx_ {
} SCACThreadCtx;
void MpmACRegister(void);
#ifdef __SC_CUDA_SUPPORT__
#define MPM_AC_CUDA_MODULE_NAME "ac_cuda"
#define MPM_AC_CUDA_MODULE_CUDA_BUFFER_NAME "ac_cuda_cb"
void MpmACCudaRegister(void);
void SCACConstructBoth16and32StateTables(void);
int MpmCudaBufferSetup(void);
int MpmCudaBufferDeSetup(void);
void SCACCudaStartDispatcher(void);
void SCACCudaKillDispatcher(void);
uint32_t SCACCudaPacketResultsProcessing(Packet *p, MpmCtx *mpm_ctx,
PatternMatcherQueue *pmq);
#endif /* __SC_CUDA_SUPPORT__ */
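/* A sketch of the intended call order for the cuda path, inferred from the
 * declarations above (not normative):
 *
 *   MpmACCudaRegister();        // at table setup; also runs
 *                               // MpmCudaBufferSetup() when ac-cuda is
 *                               // the default matcher
 *   SCACCudaStartDispatcher();  // spawn the gpu dispatcher thread
 *   ...
 *   SCACCudaPacketResultsProcessing(p, mpm_ctx, pmq);  // per packet
 *   ...
 *   SCACCudaKillDispatcher();   // at shutdown
 *   MpmCudaBufferDeSetup();     // free the page-locked buffers
 */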
#endif /* __UTIL_MPM_AC__H__ */

@ -1,112 +0,0 @@
/* Copyright (C) 2007-2010 Open Information Security Foundation
*
* You can copy, redistribute or modify this Program under the terms of
* the GNU General Public License version 2 as published by the Free
* Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* version 2 along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
/**
* \file
*
* \author Anoop Saldanha <anoopsaldanha@gmail.com>
*
* The Cuda kernel for MPM B2G.
*
* \todo This is a basic version of the kernel. Modify it to support multiple
* blocks of threads. Make use of shared memory/texture memory.
*/
#define B2G_CUDA_Q 2
#define CUDA_THREADS 4000
#define B2G_CUDA_HASHSHIFT 4
#define B2G_CUDA_TYPE unsigned int
#define B2G_CUDA_HASH16(a, b) (((a) << B2G_CUDA_HASHSHIFT) | (b))
#define u8_tolower(c) g_u8_lowercasetable[(c)]
typedef struct SCCudaPBPacketDataForGPU_ {
/* holds the value B2gCtx->m */
unsigned int m;
/* holds B2gCtx->B2g */
unsigned int table;
/* holds the length of the payload */
unsigned int payload_len;
/* holds the payload */
unsigned char payload;
} SCCudaPBPacketDataForGPU;
extern "C"
__global__ void B2gCudaSearchBNDMq(unsigned short *results_buffer,
unsigned char *packets_buffer,
unsigned int *packets_offset_buffer,
unsigned int *packets_payload_offset_buffer,
unsigned int nop,
unsigned char *g_u8_lowercasetable)
{
unsigned int tid = blockIdx.x * 32 + threadIdx.x;
/* if the thread id is greater than the no of packets sent in the packets
* buffer, terminate the thread */
//if (tid <= nop)
if (tid >= nop)
return;
SCCudaPBPacketDataForGPU *packet = (SCCudaPBPacketDataForGPU *)(packets_buffer + packets_offset_buffer[tid]);
unsigned int m = packet->m;
unsigned char *buf = &packet->payload;
unsigned int buflen = packet->payload_len;
unsigned int *B2G = (unsigned int *)packet->table;
unsigned int pos = m - B2G_CUDA_Q + 1;
B2G_CUDA_TYPE d;
unsigned short h;
unsigned int first;
unsigned int j = 0;
unsigned short *matches_count = results_buffer + packets_payload_offset_buffer[tid] + tid;
//unsigned short *matches_count = results_buffer + packets_payload_offset_buffer[1] + 1;
//unsigned short *offsets = results_buffer + packets_payload_offset_buffer[1] + 1 + 1;
unsigned short *offsets = matches_count + 1;
// temporarily hold the results here, before we shift it to matches_count
// before returning
unsigned short matches = 0;
while (pos <= (buflen - B2G_CUDA_Q + 1)) {
h = B2G_CUDA_HASH16(u8_tolower(buf[pos - 1]), u8_tolower(buf[pos]));
d = B2G[h];
if (d != 0) {
j = pos;
first = pos - (m - B2G_CUDA_Q + 1);
do {
j = j - 1;
if (d >= (1 << (m - 1))) {
if (j > first) {
pos = j;
} else {
offsets[matches++] = j;
}
}
if (j == 0)
break;
h = B2G_CUDA_HASH16(u8_tolower(buf[j - 1]), u8_tolower(buf[j]));
d = (d << 1) & B2G[h];
} while (d != 0);
}
pos = pos + m - B2G_CUDA_Q + 1;
}
matches_count[0] = matches;
return;
}

@ -45,6 +45,9 @@
#include "conf-yaml-loader.h"
#include "queue.h"
#include "util-unittest.h"
#ifdef __SC_CUDA_SUPPORT__
#include "util-cuda-handlers.h"
#endif
/**
* \brief Register a new Mpm Context.
@ -265,6 +268,118 @@ void MpmFactoryDeRegisterAllMpmCtxProfiles(DetectEngineCtx *de_ctx)
return;
}
#ifdef __SC_CUDA_SUPPORT__
static void MpmCudaConfFree(void *conf)
{
SCFree(conf);
return;
}
static void *MpmCudaConfParse(ConfNode *node)
{
const char *value;
MpmCudaConf *conf = SCMalloc(sizeof(MpmCudaConf));
if (conf == NULL)
exit(EXIT_FAILURE);
memset(conf, 0, sizeof(*conf));
if (node != NULL)
value = ConfNodeLookupChildValue(node, "data-buffer-size-min-limit");
else
value = NULL;
if (value == NULL) {
/* default */
conf->data_buffer_size_min_limit = UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MIN_LIMIT_DEFAULT;
} else if (ParseSizeStringU16(value, &conf->data_buffer_size_min_limit) < 0) {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
"data-buffer-size-min-limit - \"%s\"", node->name, value);
exit(EXIT_FAILURE);
}
if (node != NULL)
value = ConfNodeLookupChildValue(node, "data-buffer-size-max-limit");
else
value = NULL;
if (value == NULL) {
/* default */
conf->data_buffer_size_max_limit = UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT;
} else if (ParseSizeStringU16(value, &conf->data_buffer_size_max_limit) < 0) {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
"data-buffer-size-max-limit - \"%s\"", node->name, value);
exit(EXIT_FAILURE);
}
if (node != NULL)
value = ConfNodeLookupChildValue(node, "cudabuffer-buffer-size");
else
value = NULL;
if (value == NULL) {
/* default */
conf->cb_buffer_size = UTIL_MPM_CUDA_CUDA_BUFFER_DBUFFER_SIZE_DEFAULT;
} else if (ParseSizeStringU32(value, &conf->cb_buffer_size) < 0) {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
"cb-buffer-size - \"%s\"", node->name, value);
exit(EXIT_FAILURE);
}
if (node != NULL)
value = ConfNodeLookupChildValue(node, "gpu-transfer-size");
else
value = NULL;
if (value == NULL) {
/* default */
conf->gpu_transfer_size = UTIL_MPM_CUDA_GPU_TRANSFER_SIZE;
} else if (ParseSizeStringU32(value, &conf->gpu_transfer_size) < 0) {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
"gpu-transfer-size - \"%s\"", node->name, value);
exit(EXIT_FAILURE);
}
if (node != NULL)
value = ConfNodeLookupChildValue(node, "batching-timeout");
else
value = NULL;
if (value == NULL) {
/* default */
conf->batching_timeout = UTIL_MPM_CUDA_BATCHING_TIMEOUT_DEFAULT;
} else if ((conf->batching_timeout = atoi(value)) < 0) {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
"batching-timeout - \"%s\"", node->name, value);
exit(EXIT_FAILURE);
}
if (node != NULL)
value = ConfNodeLookupChildValue(node, "device-id");
else
value = NULL;
if (value == NULL) {
/* default */
conf->device_id = UTIL_MPM_CUDA_DEVICE_ID_DEFAULT;
} else if ((conf->device_id = atoi(value)) < 0) {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
"device-id - \"%s\"", node->name, value);
exit(EXIT_FAILURE);
}
if (node != NULL)
value = ConfNodeLookupChildValue(node, "cuda-streams");
else
value = NULL;
if (value == NULL) {
/* default */
conf->cuda_streams = UTIL_MPM_CUDA_CUDA_STREAMS_DEFAULT;
} else if ((conf->cuda_streams = atoi(value)) < 0) {
SCLogError(SC_ERR_INVALID_YAML_CONF_ENTRY, "Invalid entry for %s."
"cuda-streams - \"%s\"", node->name, value);
exit(EXIT_FAILURE);
}
return conf;
}
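/* These two callbacks are registered with the cuda handler in
 * MpmTableSetup() below:
 *
 *   CudaHandlerAddCudaProfileFromConf("mpm", MpmCudaConfParse,
 *                                     MpmCudaConfFree);
 *
 * after which any module can fetch the parsed profile via
 * CudaHandlerGetCudaProfile("mpm"), as the ac-cuda setup code does. */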
#endif
/**
* \brief Setup a pmq
*
@ -447,6 +562,10 @@ void MpmTableSetup(void) {
MpmB2gcRegister();
MpmB2gmRegister();
MpmACRegister();
#ifdef __SC_CUDA_SUPPORT__
CudaHandlerAddCudaProfileFromConf("mpm", MpmCudaConfParse, MpmCudaConfFree);
MpmACCudaRegister();
#endif /* __SC_CUDA_SUPPORT__ */
MpmACBSRegister();
MpmACGfbsRegister();
}

@ -65,6 +65,9 @@ enum {
/* aho-corasick */
MPM_AC,
#ifdef __SC_CUDA_SUPPORT__
MPM_AC_CUDA,
#endif
/* aho-corasick-goto-failure state based */
MPM_AC_GFBS,
MPM_AC_BS,
@ -81,6 +84,7 @@ typedef struct MpmThreadCtx_ {
uint32_t memory_cnt;
uint32_t memory_size;
} MpmThreadCtx;
/** \brief helper structure for the pattern matcher engine. The Pattern Matcher
@ -176,6 +180,40 @@ typedef struct MpmTableElmt_ {
MpmTableElmt mpm_table[MPM_TABLE_SIZE];
/* this macro decides whether cuda is enabled for the platform */
#ifdef __SC_CUDA_SUPPORT__
/* the min size limit of a payload (or any other data) to be buffered */
#define UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MIN_LIMIT_DEFAULT 0
/* the max size limit of a payload (or any other data) to be buffered */
#define UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT 1500
/* default size of the data buffer used by the cuda mpm engine for
 * CudaBuffer registration */
#define UTIL_MPM_CUDA_CUDA_BUFFER_DBUFFER_SIZE_DEFAULT (500 * 1024 * 1024)
/* default max size of a data chunk sent to the gpu in one transfer */
#define UTIL_MPM_CUDA_GPU_TRANSFER_SIZE (50 * 1024 * 1024)
/* Default value for offset/pointer buffer to be used by cuda mpm
* engine for CudaBuffer reg */
#define UTIL_MPM_CUDA_CUDA_BUFFER_OPBUFFER_ITEMS_DEFAULT 500000
#define UTIL_MPM_CUDA_BATCHING_TIMEOUT_DEFAULT 2000
#define UTIL_MPM_CUDA_CUDA_STREAMS_DEFAULT 2
#define UTIL_MPM_CUDA_DEVICE_ID_DEFAULT 0
/**
* \brief Cuda configuration for "mpm" profile. We can further extend this
* to have a conf for specific mpms. For now it's common to all mpms.
*/
typedef struct MpmCudaConf_ {
uint16_t data_buffer_size_min_limit;
uint16_t data_buffer_size_max_limit;
uint32_t cb_buffer_size;
uint32_t gpu_transfer_size;
int batching_timeout;
int device_id;
int cuda_streams;
} MpmCudaConf;
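/* Illustrative only: a profile populated with the defaults above, i.e.
 * what MpmCudaConfParse() produces when the yaml carries no "mpm" node. */
static const MpmCudaConf mpm_cuda_conf_defaults = {
    .data_buffer_size_min_limit = UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MIN_LIMIT_DEFAULT,
    .data_buffer_size_max_limit = UTIL_MPM_CUDA_DATA_BUFFER_SIZE_MAX_LIMIT_DEFAULT,
    .cb_buffer_size = UTIL_MPM_CUDA_CUDA_BUFFER_DBUFFER_SIZE_DEFAULT,
    .gpu_transfer_size = UTIL_MPM_CUDA_GPU_TRANSFER_SIZE,
    .batching_timeout = UTIL_MPM_CUDA_BATCHING_TIMEOUT_DEFAULT,
    .device_id = UTIL_MPM_CUDA_DEVICE_ID_DEFAULT,
    .cuda_streams = UTIL_MPM_CUDA_CUDA_STREAMS_DEFAULT,
};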
#endif /* __SC_CUDA_SUPPORT__ */
struct DetectEngineCtx_;
int32_t MpmFactoryRegisterMpmCtxProfile(struct DetectEngineCtx_ *, const char *, uint8_t);

@ -404,34 +404,33 @@ threading:
# Cuda configuration.
cuda:
# The "mpm" profile. On not specifying any of these parameters, the engine's
# internal default values are used, which are same as the ones specified here.
- mpm:
# Threshold limit for no of packets buffered to the GPU. Once we hit this
# limit, we pass the buffer to the gpu.
packet-buffer-limit: 2400
# The maximum length for a packet that we would buffer to the gpu.
# Anything over this is MPM'ed on the CPU. All entries > 0 are valid.
# Can be specified in kb, mb, gb. Just a number indicates it's in bytes.
packet-size-limit: 1500
# No of packet buffers we initialize. All entries > 0 are valid.
packet-buffers: 10
# The timeout limit for batching of packets in secs. If we don't fill the
# buffer within this timeout limit, we pass the currently filled buffer to the gpu.
# All entries > 0 are valid.
batching-timeout: 1
# Specifies whether to use page-locked memory whereever possible. Accepted values
# are "enabled" and "disabled".
page-locked: enabled
# The device to use for the mpm. Currently we don't support load balancing
# on multiple gpus. In case you have multiple devices on your system, you
# can specify the device to use, using this conf. By default we hold 0, to
# specify the first device cuda sees. To find out device-id associated with
# the card(s) on the system run "suricata --list-cuda-cards".
device-id: 0
# No of Cuda streams used for asynchronous processing. All values > 0 are valid.
# For this option you need a device with Compute Capability > 1.0 and
# page-locked enabled to have any effect.
cuda-streams: 2
# internal default values are used, which are the same as the ones specified
# in the default conf file.
mpm:
# The minimum length required to buffer data to the gpu.
# Anything below this is MPM'ed on the CPU.
# Can be specified in kb, mb, gb. Just a number indicates it's in bytes.
# A value of 0 indicates there's no limit.
data-buffer-size-min-limit: 0
# The maximum length for data that we would buffer to the gpu.
# Anything over this is MPM'ed on the CPU.
# Can be specified in kb, mb, gb. Just a number indicates it's in bytes.
data-buffer-size-max-limit: 1500
# The ring buffer size used by the CudaBuffer API to buffer data.
cudabuffer-buffer-size: 500mb
# The max chunk size that can be sent to the gpu in a single go.
gpu-transfer-size: 50mb
# The timeout limit for batching of packets in microseconds.
batching-timeout: 2000
# The device to use for the mpm. Currently we don't support load balancing
# across multiple gpus. If you have multiple devices on your system, use
# this option to select one. The default is 0, the first device cuda sees.
# To find the device-id associated with the card(s) on the system, run
# "suricata --list-cuda-cards".
device-id: 0
# No of Cuda streams used for asynchronous processing. All values > 0 are valid.
# For this option you need a device with Compute Capability > 1.0.
cuda-streams: 2
# Select the multi pattern algorithm you want to run for scan/search in
# the engine. The supported algorithms are b2g, b2gc, b2gm, b3g, wumanber,
