Batching of packets support for the cuda b2g mpm. Supported on both 32 and 64 bit platforms

remotes/origin/master-1.0.x
Anoop Saldanha 16 years ago committed by Victor Julien
parent b3c22cd512
commit 33f4beb0bc

@ -8,6 +8,7 @@ bin_PROGRAMS = suricata
suricata_SOURCES = suricata.c suricata.h \
runmodes.c runmodes.h \
packet-queue.c packet-queue.h \
data-queue.c data-queue.h \
threads.c threads.h \
source-nfq.c source-nfq.h \
source-pcap.c source-pcap.h \
@ -211,7 +212,8 @@ win32-misc.c win32-misc.h \
win32-service.c win32-service.h \
util-action.c util-action.h \
win32-syslog.h \
util-profiling.c util-profiling.h
util-profiling.c util-profiling.h \
cuda-packet-batcher.c cuda-packet-batcher.h
# set the include path found by configure
INCLUDES= $(all_includes)

File diff suppressed because it is too large.

@ -0,0 +1,139 @@
/**
* Copyright (c) 2010 Open Information Security Foundation.
*
* \author Anoop Saldanha <poonaatsoc@gmail.com>
*/
#ifndef __CUDA_PACKET_BATCHER_H__
#define __CUDA_PACKET_BATCHER_H__
#include "suricata-common.h"
/* compile in, only if we have CUDA enabled on this machine */
#ifdef __SC_CUDA_SUPPORT__
#include "util-cuda.h"
/* The min no of packets that we allot the buffer for. We will make
* this user configurable (yaml) based on the traffic they expect. Either way,
* for a low/medium traffic network with occasional sgh matches, we shouldn't
* be enabling cuda, since we will only end up screwing up performance */
#define SC_CUDA_PB_MIN_NO_OF_PACKETS 4000
/**
* \brief Implement the template SCDQGenericQData to transfer the cuda
* packet buffer from the cuda batcher thread to the dispatcher
* thread using the queue SCDQDataQueue.
*/
typedef struct SCCudaPBPacketsBuffer_ {
/* these members from the template SCDQGenericQData have to be
* compulsorily implemented */
struct SCDQGenericQData_ *next;
struct SCDQGenericQData_ *prev;
/* if we want to consider this pointer as the head of a list, this var
* holds the no of elements in the list */
//uint16_t len;
/* in case this data instance is the head of a list, we can refer to the
* bottommost instance directly using this var */
//struct SCDQGenericaQData *bot;
/* our own members from here on*/
/* current count of packets held in packets_buffer. nop = no of packets */
uint32_t nop_in_buffer;
/* the packets buffer. We will allot the buffer for SC_CUDA_PB_MIN_NO_OF_PACKETS
* packets. Basically the size of this buffer would be
* SC_CUDA_PB_MIN_NO_OF_PACKETS * sizeof(SCCudaPBPacketDataForGPU), so that
* we can hold a minimum of SC_CUDA_PB_MIN_NO_OF_PACKETS packets */
uint8_t *packets_buffer;
/* length of data buffered so far in packets_buffer, which would be sent
* to the GPU. We will need this to copy the buffered data from the
* packets_buffer here on the host, to the buffer on the GPU */
uint32_t packets_buffer_len;
/* packet offset within the packets_buffer. Each packet would be stored in
* the packets buffer at a particular offset. This buffer holds the
* offset of each packet inside the packets buffer. We will allot space to hold
* offsets for SC_CUDA_PB_MIN_NO_OF_PACKETS packets
* \todo change it to hold offsets for more than SC_CUDA_PB_MIN_NO_OF_PACKETS
* when we use the buffer to hold packets based on the remaining size in the
* buffer rather than on a fixed limit like SC_CUDA_PB_MIN_NO_OF_PACKETS */
uint32_t *packets_offset_buffer;
/* the total packet payload lengths buffered so far. We will need this to
* transfer the total length of the results buffer that has to be transferred
* back from the gpu */
uint32_t packets_total_payload_len;
/* the payload offsets for the different payload lengths buffered in. For
* example if we buffer 4 packets of lengths 3, 4, 5, 6, we will store the
* offsets {0, 3, 7, 12, 18} in the buffer (the last entry being the total
* payload length) */
/* packet addresses for all the packets buffered in the packets_buffer. We
* will allot space to hold packet addresses for SC_CUDA_PB_MIN_NO_OF_PACKETS.
* We will need this, so that the cuda mpm b2g dispatcher thread can store
* the b2g cuda mpm results back into the right packet */
Packet **packets_address_buffer;
} SCCudaPBPacketsBuffer;
/**
* \brief Structure for each packet that is being batched to the GPU.
*/
typedef struct SCCudaPBPacketDataForGPU_ {
/* holds B2gCudaCtx->m */
unsigned int m;
/* holds B2gCudaCtx->cuda_B2g */
CUdeviceptr table;
/* holds the length of the payload */
unsigned int payload_len;
/* holds the payload. The struct declares 1480 bytes for it, but we may
* not end up using all of them if the payload is smaller */
uint8_t payload[1480];
} SCCudaPBPacketDataForGPU;
/**
* \brief Same as struct SCCudaPBPacketDataForGPU_ except for the payload part.
* We will need this for calculating the size of the non-payload part
* of the packet data to be buffered.
*/
typedef struct SCCudaPBPacketDataForGPUNonPayload_ {
/* holds B2gCudaCtx->m */
unsigned int m;
/* holds B2gCudaCtx->cuda_B2g */
CUdeviceptr table;
/* holds the length of the payload */
unsigned int payload_len;
} SCCudaPBPacketDataForGPUNonPayload;
/**
* \brief The cuda packet batcher threading context.
*/
typedef struct SCCudaPBThreadCtx_ {
/* we need the detection engine context to retrieve the sgh while we start
* receiving and batching the packets */
DetectEngineCtx *de_ctx;
/* packets buffer currently in use inside the cuda batcher thread */
SCCudaPBPacketsBuffer *curr_pb;
} SCCudaPBThreadCtx;
SCCudaPBPacketsBuffer *SCCudaPBAllocSCCudaPBPacketsBuffer(void);
void SCCudaPBDeAllocSCCudaPBPacketsBuffer(SCCudaPBPacketsBuffer *);
void SCCudaPBSetBufferPacketThreshhold(uint32_t);
void SCCudaPBCleanUpQueuesAndBuffers(void);
void SCCudaPBSetUpQueuesAndBuffers(void);
void SCCudaPBKillBatchingPackets(void);
TmEcode SCCudaPBBatchPackets(ThreadVars *, Packet *, void *, PacketQueue *);
TmEcode SCCudaPBThreadInit(ThreadVars *, void *, void **);
TmEcode SCCudaPBThreadDeInit(ThreadVars *, void *);
void SCCudaPBThreadExitStats(ThreadVars *, void *);
void SCCudaPBRegisterTests(void);
void TmModuleCudaPacketBatcherRegister(void);
void *SCCudaPBTmThreadsSlot1(void *);
#endif /* __SC_CUDA_SUPPORT__ */
#endif /* __CUDA_PACKET_BATCHER_H__ */
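
Editor's note: the comments in the header above describe a variable-length record layout inside packets_buffer, with per-packet offsets and cumulative payload offsets tracked in the side buffers. The following is a minimal sketch of that bookkeeping only; AppendPacketSketch is a hypothetical name, alignment of the records is ignored, and this is not the batcher's actual append routine.

/* Editor's sketch: illustrates the buffer bookkeeping described above.
 * AppendPacketSketch is a hypothetical helper, not part of the batcher API. */
static int AppendPacketSketch(SCCudaPBPacketsBuffer *pb, Packet *p,
                              unsigned int m, CUdeviceptr table)
{
    if (pb->nop_in_buffer >= SC_CUDA_PB_MIN_NO_OF_PACKETS)
        return -1;

    /* the record starts at the current end of the packets buffer */
    SCCudaPBPacketDataForGPU *rec =
        (SCCudaPBPacketDataForGPU *)(pb->packets_buffer + pb->packets_buffer_len);
    rec->m = m;
    rec->table = table;
    rec->payload_len = p->payload_len;
    memcpy(rec->payload, p->payload, p->payload_len);

    /* remember where this packet's record and payload start */
    pb->packets_offset_buffer[pb->nop_in_buffer] = pb->packets_buffer_len;
    pb->packets_payload_offset_buffer[pb->nop_in_buffer] = pb->packets_total_payload_len;
    pb->packets_address_buffer[pb->nop_in_buffer] = p;

    /* only the non-payload header plus the actual payload is consumed */
    pb->packets_buffer_len += sizeof(SCCudaPBPacketDataForGPUNonPayload) + p->payload_len;
    pb->packets_total_payload_len += p->payload_len;
    pb->nop_in_buffer++;

    return 0;
}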

@ -0,0 +1,93 @@
/**
* Copyright (c) 2009, 2010 Open Information Security Foundation.
*
* \author Anoop Saldanha <poonaatsoc@gmail.com>
*/
#include "suricata-common.h"
#include "data-queue.h"
#include "threads.h"
/**
* \brief Enqueues data on the queue.
*
* \param q Pointer to the data queue.
* \param data Pointer to the data to be queued. It should be a pointer to a
* structure instance that implements the template structure
* struct SCDQGenericQData_ defined in data-queue.h.
*/
void SCDQDataEnqueue(SCDQDataQueue *q, SCDQGenericQData *data)
{
/* we already have some data in queue */
if (q->top != NULL) {
data->next = q->top;
q->top->prev = data;
q->top = data;
/* the queue is empty */
} else {
q->top = data;
q->bot = data;
}
q->len++;
#ifdef DBG_PERF
if (q->len > q->dbg_maxlen)
q->dbg_maxlen = q->len;
#endif /* DBG_PERF */
return;
}
/**
* \brief Dequeues and returns an entry from the queue.
*
* \param q Pointer to the data queue.
* \retval Pointer to the data that has been dequeued. The instance
* returned is/should be a pointer to a structure instance that
* implements the template structure struct SCDQGenericQData_
* defined in data-queue.h.
*/
SCDQGenericQData *SCDQDataDequeue(SCDQDataQueue *q)
{
SCDQGenericQData *data = NULL;
/* if the queue is empty there is no data left and we return NULL */
if (q->len == 0) {
return NULL;
}
/* if we are about to take the last entry, decrement len before doing
* anything else (so that threads waiting in SCCondWait can move on as
* soon as possible) */
q->len--;
/* pull the bottom data instance from the queue */
data = q->bot;
#ifdef OS_DARWIN
/* Weird issue on OS_DARWIN:
* sometimes it looks like two threads arrive here at the same time,
* so the bot ptr is NULL */
if (data == NULL) {
printf("No data to dequeue!\n");
return NULL;
}
#endif /* OS_DARWIN */
/* more data in queue */
if (q->bot->prev != NULL) {
q->bot = q->bot->prev;
q->bot->next = NULL;
/* just the one we remove, so now empty */
} else {
q->top = NULL;
q->bot = NULL;
}
data->next = NULL;
data->prev = NULL;
return data;
}

@ -0,0 +1,64 @@
/**
* Copyright (c) 2009, 2010 Open Information Security Foundation.
*
* \author Anoop Saldanha <poonaatsoc@gmail.com>
*
* \file Generic queues. Any instance that wants to get itself onto the generic
* queue has to implement the template struct SCDQGenericQData_
* defined below.
*/
#ifndef __DATA_QUEUE_H__
#define __DATA_QUEUE_H__
#include "threads.h"
/**
* \brief Generic template for any data structure that wants to be on the
* queue. Such a structure needs to embed this template and define
* its own members from <your_own_structure_members_from_here_on>
* onwards.
*/
typedef struct SCDQGenericQData_ {
/* this is needed when we want to supply a list of data items */
struct SCDQGenericQData_ *next;
struct SCDQGenericQData_ *prev;
/* if we want to consider this pointer as the head of a list, this var
* holds the no of elements in the list. Else it holds a <need_to_think>. */
//uint16_t len;
/* in case this data instance is the head of a list, we can refer to the
* bottommost instance directly using this var */
//struct SCDQGenericaQData *bot;
/* any other data structure that wants to be on the queue can implement
* its own members from here on, in its structure definition. Just note
* that the first 2 members should always be next and prev, in that
* order */
// <your_own_structure_members_from_here_on>
} SCDQGenericQData;
/**
* \brief The data queue to hold instances that implement the template
* SCDQGenericQData.
*/
typedef struct SCDQDataQueue_ {
/* holds the item at the top of the queue */
SCDQGenericQData *top;
/* holds the item at the bottom of the queue */
SCDQGenericQData *bot;
/* no of items currently in the queue */
uint16_t len;
SCMutex mutex_q;
SCCondT cond_q;
#ifdef DBG_PERF
uint16_t dbg_maxlen;
#endif /* DBG_PERF */
} SCDQDataQueue;
void SCDQDataEnqueue(SCDQDataQueue *, SCDQGenericQData *);
SCDQGenericQData *SCDQDataDequeue(SCDQDataQueue *);
#endif /* __DATA_QUEUE_H__ */
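
Editor's note: to show how the template above is meant to be used, here is a minimal sketch of a structure implementing SCDQGenericQData_ and a push helper; MyWorkItem and MyWorkItemPush are hypothetical names. SCDQDataEnqueue()/SCDQDataDequeue() do not lock by themselves, so the caller takes mutex_q and signals cond_q, as the generic tmqh-simple handlers later in this commit do.

/* Editor's sketch: a hypothetical user of the SCDQGenericQData template. */
typedef struct MyWorkItem_ {
    /* the first 2 members must be next and prev, in that order */
    struct SCDQGenericQData_ *next;
    struct SCDQGenericQData_ *prev;
    /* own members from here on */
    uint32_t job_id;
} MyWorkItem;

static void MyWorkItemPush(SCDQDataQueue *q, MyWorkItem *wi)
{
    SCMutexLock(&q->mutex_q);
    SCDQDataEnqueue(q, (SCDQGenericQData *)wi);
    /* wake up a consumer waiting on the queue */
    SCCondSignal(&q->cond_q);
    SCMutexUnlock(&q->mutex_q);
}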

@ -247,6 +247,10 @@ struct PacketQueue_;
*/
typedef struct Packet_
{
/* double linked list ptrs */
struct Packet_ *next;
struct Packet_ *prev;
/* Addresses, Ports and protocol
* these are on top so we can use
* the Packet as a hash key */
@ -342,10 +346,6 @@ typedef struct Packet_
/** packet number in the pcap file, matches wireshark */
uint64_t pcap_cnt;
/* double linked list ptrs */
struct Packet_ *next;
struct Packet_ *prev;
/* ready to set verdict counter, only set in root */
uint8_t rtv_cnt;
/* tunnel packet ref count */
@ -369,20 +369,22 @@ typedef struct Packet_
/* required for cuda support */
#ifdef __SC_CUDA_SUPPORT__
PatternMatcherQueue *cuda_pmq;
MpmCtx *cuda_mpm_ctx;
MpmThreadCtx *cuda_mtc;
/* used to hold the match results. We could use a void *result
* here instead. That way we can make it hold any result. *todo* */
uint16_t cuda_matches;
/* indicates if the dispatcher should call the search or the scan phase
* of the pattern matcher. We could use a void *cuda_data here instead.
* That way we can send any data across to the dispatcher */
uint8_t cuda_search;
/* the dispatcher thread would pump the packet into this queue once it has
* processed the packet */
struct PacketQueue_ *cuda_outq;
/* indicates if the cuda mpm or a normal cpu mpm would be conducted on
* this packet. If it is set to 0, the cpu mpm; else the cuda mpm */
uint8_t cuda_mpm_enabled;
/* indicates if the cuda mpm has finished running and processed the
* results for this packet, assuming cuda_mpm_enabled has been set for this
* packet */
uint16_t cuda_done;
/* used by the detect thread and the cuda mpm dispatcher thread. The detect
* thread would wait on this cond var if the cuda mpm dispatcher thread
* still hasn't processed the packet. The dispatcher would use this cond
* to inform the detect thread (in case it is waiting on this packet), once
* the dispatcher is done processing the packet results */
SCMutex cuda_mutex;
SCCondT cuda_cond;
/* the extra 1 in the 1481 is to hold the no_of_matches from the mpm run */
uint16_t mpm_offsets[1481];
#endif
} Packet;
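
Editor's note: the cuda_mutex/cuda_cond pair above implements a per-packet handshake between the detect thread and the cuda mpm dispatcher. The following is a minimal sketch of that pattern, assuming hypothetical helper names and that cuda_done is the flag being waited on; it is illustrative only, not code this commit adds elsewhere.

/* Editor's sketch of the per-packet handshake; helper names are hypothetical. */

/* detect thread side: block until the dispatcher has published the results */
static void CudaWaitForPacketSketch(Packet *p)
{
    SCMutexLock(&p->cuda_mutex);
    while (p->cuda_done == 0)
        SCCondWait(&p->cuda_cond, &p->cuda_mutex);
    SCMutexUnlock(&p->cuda_mutex);
}

/* dispatcher side: mark the packet done and wake a waiting detect thread */
static void CudaMarkPacketDoneSketch(Packet *p)
{
    SCMutexLock(&p->cuda_mutex);
    p->cuda_done = 1;
    SCCondSignal(&p->cuda_cond);
    SCMutexUnlock(&p->cuda_mutex);
}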
@ -460,22 +462,28 @@ typedef struct DecodeThreadVars_
/**
* \brief Initialize a packet structure for use.
*/
#define PACKET_INITIALIZE(p) do { \
#ifndef __SC_CUDA_SUPPORT__
#define PACKET_INITIALIZE(p) { \
memset((p), 0x00, sizeof(Packet)); \
SCMutexInit(&(p)->mutex_rtv_cnt, NULL); \
PACKET_RESET_CHECKSUMS((p)); \
} while (0)
}
#else
#define PACKET_INITIALIZE(p) { \
memset((p), 0x00, sizeof(Packet)); \
SCMutexInit(&(p)->mutex_rtv_cnt, NULL); \
PACKET_RESET_CHECKSUMS((p)); \
SCMutexInit(&(p)->cuda_mutex, NULL); \
SCCondInit(&(p)->cuda_cond, NULL); \
}
#endif
/**
* \brief Recycle a packet structure for reuse.
* \todo the mutex destroy & init is necessary because of the memset, reconsider
*/
#define PACKET_RECYCLE(p) do { \
CLEAR_ADDR(&(p)->src); \
CLEAR_ADDR(&(p)->dst); \
(p)->sp = 0; \
(p)->dp = 0; \
(p)->proto = 0; \
#define PACKET_DO_RECYCLE(p) do { \
(p)->recursion_level = 0; \
(p)->flags = 0; \
(p)->flowflags = 0; \
@ -530,15 +538,39 @@ typedef struct DecodeThreadVars_
PACKET_RESET_CHECKSUMS((p)); \
} while (0)
#ifndef __SC_CUDA_SUPPORT__
#define PACKET_RECYCLE(p) PACKET_DO_RECYCLE((p))
#else
#define PACKET_RECYCLE(p) do { \
PACKET_DO_RECYCLE((p)); \
SCMutexDestroy(&(p)->cuda_mutex); \
SCCondDestroy(&(p)->cuda_cond); \
SCMutexInit(&(p)->cuda_mutex, NULL); \
SCCondInit(&(p)->cuda_cond, NULL); \
PACKET_RESET_CHECKSUMS((p)); \
} while(0)
#endif
/**
* \brief Cleanup a packet so that we can free it. No memset needed.
*/
#ifndef __SC_CUDA_SUPPORT__
#define PACKET_CLEANUP(p) do { \
if ((p)->pktvar != NULL) { \
PktVarFree((p)->pktvar); \
} \
SCMutexDestroy(&(p)->mutex_rtv_cnt); \
} while (0)
#else
#define PACKET_CLEANUP(p) do { \
if ((p)->pktvar != NULL) { \
PktVarFree((p)->pktvar); \
} \
SCMutexDestroy(&(p)->mutex_rtv_cnt); \
SCMutexDestroy(&(p)->cuda_mutex); \
SCCondDestroy(&(p)->cuda_cond); \
} while(0)
#endif
/* macro's for setting the action

@ -111,6 +111,7 @@ uint32_t PacketPatternSearch(ThreadVars *tv, DetectEngineThreadCtx *det_ctx,
SCEnter();
uint32_t ret;
#ifndef __SC_CUDA_SUPPORT__
ret = mpm_table[det_ctx->sgh->mpm_ctx->mpm_type].Search(det_ctx->sgh->mpm_ctx,
&det_ctx->mtc,
@ -130,7 +131,17 @@ uint32_t PacketPatternSearch(ThreadVars *tv, DetectEngineThreadCtx *det_ctx,
SCReturnInt(ret);
}
SCCudaHlProcessPacketWithDispatcher(p, det_ctx, &ret);
if (p->cuda_mpm_enabled) {
ret = B2gCudaResultsPostProcessing(p, det_ctx->sgh->mpm_ctx,
&det_ctx->mtc, &det_ctx->pmq);
} else {
ret = mpm_table[det_ctx->sgh->mpm_ctx->mpm_type].Search(det_ctx->sgh->mpm_ctx,
&det_ctx->mtc,
&det_ctx->pmq,
p->payload,
p->payload_len);
}
#endif
SCReturnInt(ret);
@ -154,23 +165,8 @@ uint32_t UriPatternSearch(DetectEngineThreadCtx *det_ctx,
//PrintRawDataFp(stdout, uri, uri_len);
uint32_t ret;
#ifndef __SC_CUDA_SUPPORT__
ret = mpm_table[det_ctx->sgh->mpm_uri_ctx->mpm_type].Search
(det_ctx->sgh->mpm_uri_ctx, &det_ctx->mtcu, &det_ctx->pmq,
uri, uri_len);
#else
/* if the user has enabled cuda support, but is not using the cuda mpm
* algo, then we shouldn't take the path of the dispatcher. Call the mpm
* directly */
if (det_ctx->sgh->mpm_uri_ctx->mpm_type != MPM_B2G_CUDA) {
ret = mpm_table[det_ctx->sgh->mpm_uri_ctx->mpm_type].Search
(det_ctx->sgh->mpm_uri_ctx, &det_ctx->mtcu, &det_ctx->pmq,
uri, uri_len);
SCReturnUInt(ret);
}
SCCudaHlProcessUriWithDispatcher(uri, uri_len, det_ctx, &ret);
#endif
ret = mpm_table[det_ctx->sgh->mpm_uri_ctx->mpm_type].Search(det_ctx->sgh->mpm_uri_ctx,
&det_ctx->mtcu, &det_ctx->pmq, uri, uri_len);
SCReturnUInt(ret);
}

@ -378,47 +378,6 @@ TmEcode DetectEngineThreadCtxInit(ThreadVars *tv, void *initdata, void **data) {
*data = (void *)det_ctx;
#ifdef __SC_CUDA_SUPPORT__
if (PatternMatchDefaultMatcher() != MPM_B2G_CUDA)
return TM_ECODE_OK;
Tmq *tmq;
/* we append this name to the tv name, to obtain the final unique
* detection thread queue name */
char *cuda_outq_name = "cuda_mpm_rc_disp_outq";
uint8_t disp_outq_name_len = (strlen(tv->name) + strlen(cuda_outq_name) + 1);
char *disp_outq_name = SCMalloc(disp_outq_name_len * sizeof(char));
if (disp_outq_name == NULL)
goto error;
strcpy(disp_outq_name, tv->name);
strcpy(disp_outq_name + strlen(tv->name), cuda_outq_name);
disp_outq_name[disp_outq_name_len] = '\0';
tmq = TmqGetQueueByName(disp_outq_name);
if (tmq != NULL) {
SCLogError(SC_ERR_TMQ_ALREADY_REGISTERED, "A queue by the name \"%s\" "
"is already registered, which shouldn't be the case. Queue "
"name is duplicated. Please check if multiple instances of "
"detection module are given different names ",
disp_outq_name);
goto error;
}
tmq = TmqCreateQueue(disp_outq_name);
if (tmq == NULL)
goto error;
/* hold the queue instance we create under this detection thread instance */
det_ctx->cuda_mpm_rc_disp_outq = tmq;
det_ctx->cuda_mpm_rc_disp_outq->reader_cnt++;
det_ctx->cuda_mpm_rc_disp_outq->writer_cnt++;
return TM_ECODE_OK;
error:
return TM_ECODE_FAILED;
#endif
return TM_ECODE_OK;
}

@ -3140,6 +3140,7 @@ int SigGroupBuild (DetectEngineCtx *de_ctx) {
SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the "
"module SC_RULES_CONTENT_B2G_CUDA");
}
SCCudaCtxPushCurrent(dummy_context);
if (SCCudaMemGetInfo(&cuda_free_before_alloc, &cuda_total) == 0) {
SCLogInfo("Total Memory available in the CUDA context used for mpm "
"with b2g: %.2f MB", cuda_total/(1024.0 * 1024.0));

@ -24,6 +24,7 @@
#include "suricata-common.h"
#include "detect-engine.h"
#include "detect-engine-mpm.h"
#include "tm-threads.h"
#include "util-debug.h"
#include "util-time.h"
@ -43,6 +44,8 @@
#include "output.h"
#include "cuda-packet-batcher.h"
/**
* A list of output modules that will be active for the run mode.
*/
@ -2305,6 +2308,115 @@ int RunModeFilePcapAuto(DetectEngineCtx *de_ctx, char *file) {
printf("ERROR: TmThreadSpawn failed\n");
exit(EXIT_FAILURE);
}
#if defined(__SC_CUDA_SUPPORT__)
if (PatternMatchDefaultMatcher() == MPM_B2G_CUDA) {
ThreadVars *tv_decode1 = TmThreadCreatePacketHandler("Decode",
"pickup-queue", "simple",
"decode-queue1", "simple",
"1slot");
if (tv_decode1 == NULL) {
printf("ERROR: TmThreadsCreate failed for Decode1\n");
exit(EXIT_FAILURE);
}
tm_module = TmModuleGetByName("DecodePcapFile");
if (tm_module == NULL) {
printf("ERROR: TmModuleGetByName DecodePcap failed\n");
exit(EXIT_FAILURE);
}
Tm1SlotSetFunc(tv_decode1, tm_module, NULL);
TmThreadSetCPUAffinity(tv_decode1, 0);
if (ncpus > 1)
TmThreadSetThreadPriority(tv_decode1, PRIO_MEDIUM);
if (TmThreadSpawn(tv_decode1) != TM_ECODE_OK) {
printf("ERROR: TmThreadSpawn failed\n");
exit(EXIT_FAILURE);
}
ThreadVars *tv_cuda_PB = TmThreadCreate("CUDA_PB",
"decode-queue1", "simple",
"cuda-pb-queue1", "simple",
"custom", SCCudaPBTmThreadsSlot1, 0);
if (tv_cuda_PB == NULL) {
printf("ERROR: TmThreadsCreate failed for CUDA_PB\n");
exit(EXIT_FAILURE);
}
tv_cuda_PB->type = TVT_PPT;
tm_module = TmModuleGetByName("CudaPacketBatcher");
if (tm_module == NULL) {
printf("ERROR: TmModuleGetByName CudaPacketBatcher failed\n");
exit(EXIT_FAILURE);
}
Tm1SlotSetFunc(tv_cuda_PB, tm_module, (void *)de_ctx);
TmThreadSetCPUAffinity(tv_cuda_PB, 0);
if (ncpus > 1)
TmThreadSetThreadPriority(tv_cuda_PB, PRIO_MEDIUM);
if (TmThreadSpawn(tv_cuda_PB) != TM_ECODE_OK) {
printf("ERROR: TmThreadSpawn failed\n");
exit(EXIT_FAILURE);
}
ThreadVars *tv_stream1 = TmThreadCreatePacketHandler("Stream1",
"cuda-pb-queue1", "simple",
"stream-queue1", "simple",
"1slot");
if (tv_stream1 == NULL) {
printf("ERROR: TmThreadsCreate failed for Stream1\n");
exit(EXIT_FAILURE);
}
tm_module = TmModuleGetByName("StreamTcp");
if (tm_module == NULL) {
printf("ERROR: TmModuleGetByName StreamTcp failed\n");
exit(EXIT_FAILURE);
}
Tm1SlotSetFunc(tv_stream1,tm_module,NULL);
TmThreadSetCPUAffinity(tv_stream1, 0);
if (ncpus > 1)
TmThreadSetThreadPriority(tv_stream1, PRIO_MEDIUM);
if (TmThreadSpawn(tv_stream1) != TM_ECODE_OK) {
printf("ERROR: TmThreadSpawn failed\n");
exit(EXIT_FAILURE);
}
} else {
ThreadVars *tv_decode1 = TmThreadCreatePacketHandler("Decode & Stream",
"pickup-queue", "simple",
"stream-queue1", "simple",
"varslot");
if (tv_decode1 == NULL) {
printf("ERROR: TmThreadsCreate failed for Decode1\n");
exit(EXIT_FAILURE);
}
tm_module = TmModuleGetByName("DecodePcapFile");
if (tm_module == NULL) {
printf("ERROR: TmModuleGetByName DecodePcap failed\n");
exit(EXIT_FAILURE);
}
TmVarSlotSetFuncAppend(tv_decode1,tm_module,NULL);
tm_module = TmModuleGetByName("StreamTcp");
if (tm_module == NULL) {
printf("ERROR: TmModuleGetByName StreamTcp failed\n");
exit(EXIT_FAILURE);
}
TmVarSlotSetFuncAppend(tv_decode1,tm_module,NULL);
TmThreadSetCPUAffinity(tv_decode1, 0);
if (ncpus > 1)
TmThreadSetThreadPriority(tv_decode1, PRIO_MEDIUM);
if (TmThreadSpawn(tv_decode1) != TM_ECODE_OK) {
printf("ERROR: TmThreadSpawn failed\n");
exit(EXIT_FAILURE);
}
}
#else
//#if 0
//ThreadVars *tv_decode1 = TmThreadCreatePacketHandler("Decode & Stream","pickup-queue","simple","packetpool","packetpool","varslot");
ThreadVars *tv_decode1 = TmThreadCreatePacketHandler("Decode & Stream","pickup-queue","simple","stream-queue1","simple","varslot");
@ -2336,7 +2448,7 @@ int RunModeFilePcapAuto(DetectEngineCtx *de_ctx, char *file) {
printf("ERROR: TmThreadSpawn failed\n");
exit(EXIT_FAILURE);
}
#endif
//#if 0
/* start with cpu 1 so that if we're creating an odd number of detect
* threads we're not creating the most on CPU0. */
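
Editor's note: for orientation, the cuda branch of RunModeFilePcapAuto() above chains the threads and queues roughly as sketched below; the receive thread that feeds pickup-queue is set up earlier in the function and is not visible in this hunk.

/*
 * Editor's sketch of the pipeline set up above when MPM_B2G_CUDA is in use:
 *
 *   (receive)  --pickup-queue-->   Decode   (DecodePcapFile)
 *   Decode     --decode-queue1-->  CUDA_PB  (CudaPacketBatcher, custom slot)
 *   CUDA_PB    --cuda-pb-queue1--> Stream1  (StreamTcp)
 *   Stream1    --stream-queue1-->  detect threads ...
 *
 * Without the cuda mpm, a single "Decode & Stream" varslot thread runs
 * DecodePcapFile and StreamTcp back to back between pickup-queue and
 * stream-queue1.
 */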

@ -133,6 +133,7 @@
/* holds the cuda b2g module */
#include "util-mpm-b2g-cuda.h"
#include "util-cuda-handlers.h"
#include "cuda-packet-batcher.h"
#include "output.h"
#include "util-privs.h"
@ -218,6 +219,7 @@ SignalHandlerSetup(int sig, void (*handler)())
void GlobalInits()
{
memset(trans_q, 0, sizeof(trans_q));
memset(data_queues, 0, sizeof(data_queues));
/* Initialize the trans_q mutex */
int blah;
@ -225,6 +227,9 @@ void GlobalInits()
for(blah=0;blah<256;blah++) {
r |= SCMutexInit(&trans_q[blah].mutex_q, NULL);
r |= SCCondInit(&trans_q[blah].cond_q, NULL);
r |= SCMutexInit(&data_queues[blah].mutex_q, NULL);
r |= SCCondInit(&data_queues[blah].cond_q, NULL);
}
if (r != 0) {
@ -793,6 +798,7 @@ int main(int argc, char **argv)
TmModuleLogHttpLogIPv6Register();
#ifdef __SC_CUDA_SUPPORT__
TmModuleCudaMpmB2gRegister();
TmModuleCudaPacketBatcherRegister();
#endif
TmModuleReceiveErfFileRegister();
TmModuleDecodeErfFileRegister();
@ -996,10 +1002,15 @@ int main(int argc, char **argv)
exit(EXIT_FAILURE);
}
#ifdef PROFILING
SCProfilingInitRuleCounters(de_ctx);
#endif /* PROFILING */
#ifdef __SC_CUDA_SUPPORT__
SCCudaPBSetUpQueuesAndBuffers();
#endif /* __SC_CUDA_SUPPORT__ */
AppLayerHtpRegisterExtraCallbacks();
SCThresholdConfInitContext(de_ctx,NULL);
@ -1126,6 +1137,10 @@ int main(int argc, char **argv)
SCLogInfo("time elapsed %" PRIuMAX "s", (uintmax_t)(end_time.tv_sec - start_time.tv_sec));
#ifdef __SC_CUDA_SUPPORT__
SCCudaPBKillBatchingPackets();
#endif
TmThreadKillThreads();
SCPerfReleaseResources();
break;

@ -26,6 +26,7 @@
#include "suricata-common.h"
#include "packet-queue.h"
#include "data-queue.h"
/* the name of our binary */
#define PROG_NAME "Suricata"
@ -54,6 +55,8 @@ enum {
* XXX move to the TmQueue structure later
*/
PacketQueue trans_q[256];
SCDQDataQueue data_queues[256];
/* memset to zeros, and mutex init! */
void GlobalInits();

@ -78,6 +78,7 @@ enum {
TMM_RECEIVEIPFW,
#ifdef __SC_CUDA_SUPPORT__
TMM_CUDA_MPM_B2G,
TMM_CUDA_PACKET_BATCHER,
#endif
TMM_RECEIVEERFFILE,
TMM_DECODEERFFILE,

@ -52,6 +52,8 @@ Tmq* TmqCreateQueue(char *name) {
Tmq *q = &tmqs[tmq_id];
q->name = name;
q->id = tmq_id++;
/* for cuda purposes */
q->q_type = 0;
SCLogDebug("created queue \'%s\', %p", name, q);
return q;

@ -29,6 +29,8 @@ typedef struct Tmq_ {
uint16_t id;
uint16_t reader_cnt;
uint16_t writer_cnt;
/* 0 for packet-queue and 1 for data-queue */
uint8_t q_type;
} Tmq;
Tmq* TmqCreateQueue(char *name);

@ -68,45 +68,6 @@ SCMutex tv_root_lock = PTHREAD_MUTEX_INITIALIZER;
thread encounters a failure. Defaults to restart the failed thread */
uint8_t tv_aof = THV_RESTART_THREAD;
typedef struct TmSlot_ {
/* function pointers */
TmEcode (*SlotFunc)(ThreadVars *, Packet *, void *, PacketQueue *, PacketQueue *);
TmEcode (*SlotThreadInit)(ThreadVars *, void *, void **);
void (*SlotThreadExitPrintStats)(ThreadVars *, void *);
TmEcode (*SlotThreadDeinit)(ThreadVars *, void *);
/* data storage */
void *slot_initdata;
void *slot_data;
/**< queue filled by the SlotFunc with packets that will
* be processed further _before_ the current packet.
* The locks in the queue are NOT used */
PacketQueue slot_pre_pq;
/**< queue filled by the SlotFunc with packets that will
* be processed further _after_ the current packet. The
* locks in the queue are NOT used */
PacketQueue slot_post_pq;
/* linked list, only used by TmVarSlot */
struct TmSlot_ *slot_next;
int id; /**< slot id, only used by TmVarSlot to know what the first
* slot is. */
} TmSlot;
/* 1 function slot */
typedef struct Tm1Slot_ {
TmSlot s;
} Tm1Slot;
/* Variable number of function slots */
typedef struct TmVarSlot_ {
TmSlot *s;
} TmVarSlot;
/**
* \brief Check if a thread flag is set
*
@ -649,11 +610,12 @@ TmEcode TmThreadSetSlots(ThreadVars *tv, char *name, void *(*fn_p)(void *)) {
size = sizeof(TmVarSlot);
tv->tm_func = TmThreadsSlotVar;
} else if (strcmp(name, "custom") == 0) {
/* \todo this needs to be changed to support slots of any size */
size = sizeof(Tm1Slot);
if (fn_p == NULL)
goto error;
tv->tm_func = fn_p;
return TM_ECODE_OK;
} else {
printf("Error: Slot \"%s\" not supported\n", name);
goto error;
@ -1134,8 +1096,12 @@ void TmThreadKillThreads(void) {
if (tv->InShutdownHandler != NULL) {
tv->InShutdownHandler(tv);
}
for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++)
for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++) {
if (tv->inq->q_type == 0)
SCCondSignal(&trans_q[tv->inq->id].cond_q);
else
SCCondSignal(&data_queues[tv->inq->id].cond_q);
}
/* to be sure, signal more */
int cnt = 0;
@ -1151,9 +1117,12 @@ void TmThreadKillThreads(void) {
tv->InShutdownHandler(tv);
}
for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++)
for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++) {
if (tv->inq->q_type == 0)
SCCondSignal(&trans_q[tv->inq->id].cond_q);
else
SCCondSignal(&data_queues[tv->inq->id].cond_q);
}
usleep(100);
}

@ -34,6 +34,34 @@ enum {
TVT_MAX,
};
typedef struct TmSlot_ {
/* function pointers */
TmEcode (*SlotFunc)(ThreadVars *, Packet *, void *, PacketQueue *);
TmEcode (*SlotThreadInit)(ThreadVars *, void *, void **);
void (*SlotThreadExitPrintStats)(ThreadVars *, void *);
TmEcode (*SlotThreadDeinit)(ThreadVars *, void *);
/* data storage */
void *slot_initdata;
void *slot_data;
PacketQueue slot_pq;
/* linked list, only used by TmVarSlot */
struct TmSlot_ *slot_next;
} TmSlot;
/* 1 function slot */
typedef struct Tm1Slot_ {
TmSlot s;
} Tm1Slot;
/* Variable number of function slots */
typedef struct TmVarSlot_ {
TmSlot *s;
} TmVarSlot;
extern ThreadVars *tv_root[TVT_MAX];
extern SCMutex tv_root_lock;

@ -90,19 +90,26 @@ void TmqhOutputSimple(ThreadVars *t, Packet *p)
SCMutexUnlock(&q->mutex_q);
}
/*******************************Generic-Q-Handlers*****************************/
/**
* \brief Public version of TmqhInputSimple from the tmqh-simple queue
* handler, except that it is a generic version that is directly
* tied to a PacketQueue instance.
* tied to a "SCDQDataQueue" instance(sent as an arg).
*
* Retrieves a data_instance from the queue. If the queue is empty, it
* waits on the queue, till a data_instance is enqueued into the queue
* by some other module.
*
* Retrieves a packet from the queue. If the queue is empty, it waits
* on the queue, till a packet is enqueued into the queue.
* All references to "data_instance" means a reference to a data structure
* instance that implements the template "struct SCDQGenericQData_".
*
* \param q The PacketQueue instance to wait on.
* \param q The SCDQDataQueue instance to wait on.
*
* \retval p The returned packet from the queue.
* \retval data The returned data_instance from the queue.
*/
Packet *TmqhInputSimpleOnQ(PacketQueue *q)
SCDQGenericQData *TmqhInputSimpleOnQ(SCDQDataQueue *q)
{
SCMutexLock(&q->mutex_q);
if (q->len == 0) {
@ -111,11 +118,12 @@ Packet *TmqhInputSimpleOnQ(PacketQueue *q)
}
if (q->len > 0) {
Packet *p = PacketDequeue(q);
SCDQGenericQData *data = SCDQDataDequeue(q);
SCMutexUnlock(&q->mutex_q);
return p;
return data;
} else {
/* return NULL if we have no pkt. Should only happen on signals. */
/* return NULL if we have no data in the queue. Should only happen
* on signals. */
SCMutexUnlock(&q->mutex_q);
return NULL;
}
@ -124,17 +132,23 @@ Packet *TmqhInputSimpleOnQ(PacketQueue *q)
/**
* \brief Public version of TmqhOutputSimple from the tmqh-simple queue
* handler, except that it is a generic version that is directly
* tied to a PacketQueue instance.
* tied to a SCDQDataQueue instance (sent as an arg).
*
* Enqueues a packet into the packet queue.
* Enqueues a data_instance into the queue and signals any thread
* waiting on the queue for data.
*
* \param q The PacketQueue instance to enqueue the packet into.
* \param p The packet to be enqueued into the above queue.
* All references to "data_instance" means a reference to a data structure
* instance that implements the template "struct SCDQGenericQData_".
*
* \param q The SCDQDataQueue instance to pump the data into.
* \param data The data instance to be enqueued.
*/
void TmqhOutputSimpleOnQ(PacketQueue *q, Packet *p)
void TmqhOutputSimpleOnQ(SCDQDataQueue *q, SCDQGenericQData *data)
{
SCMutexLock(&q->mutex_q);
PacketEnqueue(q, p);
SCDQDataEnqueue(q, data);
SCCondSignal(&q->cond_q);
SCMutexUnlock(&q->mutex_q);
return;
}
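
Editor's note: a consumer built on the generic handlers above simply loops on TmqhInputSimpleOnQ(), which blocks on cond_q until a producer calls TmqhOutputSimpleOnQ(). The sketch below is illustrative; the cast to SCCudaPBPacketsBuffer reflects how the cuda batcher and dispatcher are expected to exchange buffers over data_queues, but the loop and its name are hypothetical.

/* Editor's sketch of a consumer loop over a SCDQDataQueue. */
static void DrainDataQueueSketch(SCDQDataQueue *q)
{
    while (1) {
        /* blocks until a producer enqueues and signals; may return NULL
         * on signals, in which case we simply retry */
        SCDQGenericQData *data = TmqhInputSimpleOnQ(q);
        if (data == NULL)
            continue;

        SCCudaPBPacketsBuffer *pb = (SCCudaPBPacketsBuffer *)data;
        /* ... hand pb->packets_buffer over to the GPU here ... */
        (void)pb;
    }
}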

@ -24,8 +24,11 @@
#ifndef __TMQH_SIMPLE_H__
#define __TMQH_SIMPLE_H__
Packet *TmqhInputSimpleOnQ(PacketQueue *);
void TmqhOutputSimpleOnQ(PacketQueue *, Packet *);
#include "data-queue.h"
SCDQGenericQData *TmqhInputSimpleOnQ(SCDQDataQueue *);
void TmqhOutputSimpleOnQ(SCDQDataQueue *, SCDQGenericQData *);
void TmqhSimpleRegister (void);
#endif /* __TMQH_SIMPLE_H__ */

@ -100,6 +100,72 @@ SCCudaHlModuleData *SCCudaHlGetModuleData(uint8_t handle)
return data;
}
/**
* \brief Returns a cuda_module against the handle in the argument.
*
* If a cuda_module is not present for a handle, it is created
* and associated with this handle and the cuda_module is returned
* in the argument. If a cuda_module is already present for
* a handle, it is returned.
*
* \param p_module Pointer to a CUmodule instance that should be updated
* with the loaded cuda module.
* \param filename Name of the file from which the cuda module is loaded.
* \param handle A unique handle which identifies a module. Obtained from
* a call to SCCudaHlGetUniqueHandle().
*
* \retval 0 On success.
* \retval -1 On failure.
*/
int SCCudaHlGetCudaModuleFromFile(CUmodule *p_module, const char *filename, int handle)
{
SCCudaHlModuleData *data = NULL;
if (p_module == NULL) {
SCLogError(SC_ERR_INVALID_ARGUMENTS, "Error invalid arguments"
"p_module NULL");
return -1;
}
/* check if the particular module that wants a CUDA module is already
* registered or not. If it is registered, check if a context has
* been associated with the module. If yes, then we can go ahead and
* create a cuda module or return the reference to the cuda module if
* we already have a cuda module associated with the module. If no,
* log a warning and get out of here */
if ( ((data = SCCudaHlGetModuleData(handle)) == NULL) ||
(data->cuda_context == 0)) {
SCLogDebug("Module not registered or no cuda context associated with "
"this module. You can't create a CUDA module without"
"associatin a context with a module first. To use this "
"registration facility, first register a module using "
"context using SCCudaHlRegisterModule(), and then register "
"a cuda context with that module using "
"SCCudaHlGetCudaContext(), after which you can call this "
"function ");
return -1;
}
/* we already have a cuda module associated with this module. Return the
* cuda module */
if (data->cuda_module != 0) {
p_module[0] = data->cuda_module;
return 0;
}
/* we don't have a cuda module associated with this module. Create a
* cuda module, update the module with this cuda module reference and
* then return the module reference back to the calling function using
* the argument */
if (SCCudaModuleLoad(p_module, filename) == -1)
goto error;
data->cuda_module = p_module[0];
return 0;
error:
return -1;
}
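
Editor's note: the log message above spells out the expected call order for the cuda handlers API: register the module, associate a cuda context with it, then ask for the cuda module. A minimal sketch of that sequence follows, assuming SCCudaHlRegisterModule() returns the module handle (as the message implies); the module name and .cubin path are placeholders.

/* Editor's sketch of the registration order; names and paths are placeholders. */
static int LoadMyCudaModuleSketch(void)
{
    CUcontext context = 0;
    CUmodule module = 0;

    /* 1. register the module with the cuda handlers and get a handle */
    int handle = SCCudaHlRegisterModule("my_module");

    /* 2. associate a cuda context with that handle */
    if (SCCudaHlGetCudaContext(&context, handle) == -1)
        return -1;

    /* 3. only now can a cuda module be loaded for the handle */
    if (SCCudaHlGetCudaModuleFromFile(&module, "my_kernel.cubin", handle) == -1)
        return -1;

    return 0;
}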
/**
* \internal
* \brief Get a unique handle for a new module registration. This new handle
@ -347,10 +413,12 @@ int SCCudaHlGetCudaDevicePtr(CUdeviceptr *device_ptr, const char *name,
}
}
/* send the newly assigned device pointer back to the caller */
device_ptr[0] = new_module_device_ptr->d_ptr;
/* insert it into the device_ptr list for the module instance */
if (data->device_ptrs == NULL) {
data->device_ptrs = new_module_device_ptr;
device_ptr[0] = new_module_device_ptr->d_ptr;
return 0;
}
@ -367,6 +435,76 @@ int SCCudaHlGetCudaDevicePtr(CUdeviceptr *device_ptr, const char *name,
return -1;
}
/**
* \brief Frees a Cuda Device Pointer.
*
* If a device pointer by the name \"name\" is registered for this
* handle, it is freed.
*
* \param name Name of the device pointer to search the module for.
* \param handle A unique handle which identifies a module. Obtained from
* a call to SCCudaHlGetUniqueHandle().
*
* \retval 0 On success.
* \retval -1 On failure.
*/
int SCCudaHlFreeCudaDevicePtr(const char *name, int handle)
{
SCCudaHlModuleData *data = NULL;
SCCudaHlModuleDevicePointer *module_device_ptr = NULL;
SCCudaHlModuleDevicePointer *temp_module_device_ptr = NULL;
if (name == NULL) {
SCLogError(SC_ERR_INVALID_ARGUMENTS, "Error invalid arguments. "
"name is NULL");
goto error;
}
/* check if the particular module that wants to free device memory is
* already registered or not. If it is registered, check if a context has
* been associated with the module. If yes, then we can go ahead and
* free the device memory.
*/
if ( ((data = SCCudaHlGetModuleData(handle)) == NULL) ||
(data->cuda_context == 0)) {
SCLogDebug("Module not registered or no cuda context associated with "
"this module. You can't create a CUDA module without"
"associatin a context with a module first. To use this "
"registration facility, first register a module using "
"context using SCCudaHlRegisterModule(), and then register "
"a cuda context with that module using "
"SCCudaHlGetCudaContext(), after which you can call this "
"function ");
goto error;
}
/* get the device pointer registered by this name. If there isn't one,
* error out */
if ( (module_device_ptr = SCCudaHlCudaDevicePtrAvailable(data, name)) == NULL) {
goto error;
}
SCCudaMemFree(module_device_ptr->d_ptr);
module_device_ptr->d_ptr = 0;
if (module_device_ptr == data->device_ptrs) {
data->device_ptrs = data->device_ptrs->next;
} else {
temp_module_device_ptr = data->device_ptrs;
while (strcmp(temp_module_device_ptr->next->name, name) != 0) {
temp_module_device_ptr = temp_module_device_ptr->next;
}
temp_module_device_ptr->next = temp_module_device_ptr->next->next;
}
free(module_device_ptr->name);
free(module_device_ptr);
return 0;
error:
return -1;
}
/**
* \brief Registers a Dispatcher function against this handle.
*
@ -441,8 +579,11 @@ int SCCudaHlGetModuleHandle(const char *name)
data = data->next;
}
if (data == NULL)
if (data == NULL) {
SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "A cuda module by the name \"%s\" "
"hasn't been registered", name);
return -1;
}
return data->handle;
}
@ -675,72 +816,4 @@ int SCCudaHlTestEnvCudaContextDeInit(void)
return 1;
}
void SCCudaHlProcessPacketWithDispatcher(Packet *p, DetectEngineThreadCtx *det_ctx,
void *result)
{
Packet *out_p = NULL;
p->cuda_mpm_ctx = det_ctx->sgh->mpm_ctx;
p->cuda_mtc = &det_ctx->mtc;
p->cuda_pmq = &det_ctx->pmq;
/* this outq is unique to this detection thread instance. The dispatcher thread
* would use this queue to pump the packets back to this detection thread once
* it has processed the packet */
p->cuda_outq = &trans_q[det_ctx->cuda_mpm_rc_disp_outq->id];
/* for now it is hardcoded. \todo Make the access to the right queue or the
* ThreadVars generic */
/* Push the packet into the dispatcher's input queue */
B2gCudaPushPacketTo_tv_CMB2_RC(p);
/* wait for the dispatcher to process and return the packet we pushed */
out_p = TmqhInputSimpleOnQ(&trans_q[det_ctx->cuda_mpm_rc_disp_outq->id]);
/* todo make this generic, so that if we have more than 2 modules using the
* cuda interface, we can call update function for the module that has
* queued the packet and retrieve the results */
*((uint32_t *)result) = p->cuda_matches;
return;
}
void SCCudaHlProcessUriWithDispatcher(uint8_t *uri, uint16_t uri_len,
DetectEngineThreadCtx *det_ctx,
void *result)
{
Packet *out_p = NULL;
Packet *p = SCMalloc(sizeof(Packet));
if (p == NULL) {
SCLogError(SC_ERR_FATAL, "Fatal error encountered in SCCudaHlProcessUriWithDispatcher. Exiting...");
exit(EXIT_FAILURE);
}
memset(p, 0, sizeof(Packet));
p->cuda_mpm_ctx = det_ctx->sgh->mpm_uri_ctx;
p->cuda_mtc = &det_ctx->mtcu;
p->cuda_pmq = &det_ctx->pmq;
p->payload = uri;
p->payload_len = uri_len;
/* this outq is unique to this detection thread instance. The dispatcher thread
* would use this queue to pump the packets back to this detection thread once
* it has processed the packet */
p->cuda_outq = &trans_q[det_ctx->cuda_mpm_rc_disp_outq->id];
/* Push the packet into the dispatcher's input queue */
B2gCudaPushPacketTo_tv_CMB2_RC(p);
/* wait for the dispatcher to process and return the packet we pushed */
out_p = TmqhInputSimpleOnQ(&trans_q[det_ctx->cuda_mpm_rc_disp_outq->id]);
/* todo make this generic, so that if we have more than 2 modules using the
* cuda interface, we can call update function for the module that has
* queued the packet and retrieve the results */
*((uint32_t *)result) = p->cuda_matches;
SCFree(p);
return;
}
#endif /* __SC_CUDA_SUPPORT */

@ -63,7 +63,9 @@ typedef struct SCCudaHlModuleData_ {
int SCCudaHlGetCudaContext(CUcontext *, int);
int SCCudaHlGetCudaModule(CUmodule *, const char *, int);
int SCCudaHlGetCudaModuleFromFile(CUmodule *, const char *, int);
int SCCudaHlGetCudaDevicePtr(CUdeviceptr *, const char *, size_t, void *, int);
int SCCudaHlFreeCudaDevicePtr(const char *, int);
int SCCudaHlRegisterDispatcherFunc(void *(*SCCudaHlDispFunc)(void *), int);
SCCudaHlModuleData *SCCudaHlGetModuleData(uint8_t);

@ -27,50 +27,59 @@
*/
#define B2G_CUDA_Q 2
#define CUDA_THREADS 16
#define CUDA_THREADS 4000
#define B2G_CUDA_HASHSHIFT 4
#define B2G_CUDA_TYPE unsigned int
#define B2G_CUDA_HASH16(a, b) (((a) << B2G_CUDA_HASHSHIFT) | (b))
#define u8_tolower(c) g_u8_lowercasetable[(c)]
typedef struct SCCudaPBPacketDataForGPU_ {
/* holds the value B2gCtx->m */
unsigned int m;
/* holds B2gCtx->B2g */
unsigned int table;
/* holds the length of the payload */
unsigned int payload_len;
/* holds the payload */
unsigned char payload;
} SCCudaPBPacketDataForGPU;
extern "C"
__global__ void B2gCudaSearchBNDMq(unsigned int *offsets,
unsigned int *B2G,
unsigned char *g_u8_lowercasetable,
unsigned char *buf,
unsigned short arg_buflen,
unsigned int m)
{
__global__ void B2gCudaSearchBNDMq(unsigned short *results_buffer,
unsigned char *packets_buffer,
unsigned int *packets_offset_buffer,
unsigned int *packets_payload_offset_buffer,
unsigned int nop,
unsigned char *g_u8_lowercasetable)
{
unsigned int tid = blockIdx.x * 32 + threadIdx.x;
/* if the thread id is greater than the no of packets sent in the packets
* buffer, terminate the thread */
//if (tid <= nop)
if (tid >= nop)
return;
SCCudaPBPacketDataForGPU *packet = (SCCudaPBPacketDataForGPU *)(packets_buffer + packets_offset_buffer[tid]);
unsigned int m = packet->m;
unsigned char *buf = &packet->payload;
unsigned int buflen = packet->payload_len;
unsigned int *B2G = (unsigned int *)packet->table;
unsigned int pos = m - B2G_CUDA_Q + 1;
B2G_CUDA_TYPE d;
unsigned short h;
unsigned int j;
unsigned int first;
unsigned int tid = threadIdx.x;
unsigned short tid_chunk = arg_buflen / CUDA_THREADS;
unsigned short jump;
unsigned short buflen;
if (tid_chunk < m)
tid_chunk = m;
jump = tid_chunk * tid;
if ((jump + tid_chunk) > arg_buflen)
return;
unsigned int j = 0;
buflen = tid_chunk * 2 - 1;
if ((tid == CUDA_THREADS - 1) || ((jump + buflen) > arg_buflen)) {
buflen = arg_buflen - jump;
}
j = 0;
while (j < buflen) {
offsets[jump + j] = 0;
j++;
}
unsigned short *matches_count = results_buffer + packets_payload_offset_buffer[tid] + tid;
//unsigned short *matches_count = results_buffer + packets_payload_offset_buffer[1] + 1;
//unsigned short *offsets = results_buffer + packets_payload_offset_buffer[1] + 1 + 1;
unsigned short *offsets = matches_count + 1;
// temporarily hold the results here, before we shift it to matches_count
// before returning
unsigned short matches = 0;
while (pos <= (buflen - B2G_CUDA_Q + 1)) {
h = B2G_CUDA_HASH16(u8_tolower(buf[jump + pos - 1]), u8_tolower(buf[jump + pos]));
h = B2G_CUDA_HASH16(u8_tolower(buf[pos - 1]), u8_tolower(buf[pos]));
d = B2G[h];
if (d != 0) {
@ -83,19 +92,21 @@ __global__ void B2gCudaSearchBNDMq(unsigned int *offsets,
if (j > first) {
pos = j;
} else {
offsets[j + jump] = 1;
offsets[matches++] = j;
}
}
if (j == 0)
break;
h = B2G_CUDA_HASH16(u8_tolower(buf[jump + j - 1]), u8_tolower(buf[jump + j]));
h = B2G_CUDA_HASH16(u8_tolower(buf[j - 1]), u8_tolower(buf[j]));
d = (d << 1) & B2G[h];
} while (d != 0);
}
pos = pos + m - B2G_CUDA_Q + 1;
}
matches_count[0] = matches;
return;
}
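
Editor's note: given the indexing above, each thread writes its packet's match count at results_buffer[packets_payload_offset_buffer[tid] + tid], followed by that many match offsets; the extra "+ tid" accounts for one count slot per preceding packet, the same convention as the "extra 1 in the 1481" comment on Packet::mpm_offsets earlier in this commit. A hedged host-side sketch of reading those results back, with hypothetical names, could look like this.

/* Editor's sketch: host-side walk over the results written by the kernel. */
static void ReadResultsSketch(unsigned short *results_buffer,
                              uint32_t *packets_payload_offset_buffer,
                              uint32_t nop)
{
    uint32_t i;
    for (i = 0; i < nop; i++) {
        /* each packet's slice starts with its match count... */
        unsigned short *slice =
            results_buffer + packets_payload_offset_buffer[i] + i;
        unsigned short matches = slice[0];

        /* ...followed by 'matches' offsets into that packet's payload */
        unsigned short j;
        for (j = 0; j < matches; j++) {
            unsigned short offset = slice[1 + j];
            (void)offset; /* e.g. feed into B2gCudaResultsPostProcessing() */
        }
    }
}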

File diff suppressed because it is too large.

@ -51,7 +51,7 @@ typedef struct B2gCudaPattern_ {
/* case INsensitive */
uint8_t *ci;
struct B2gCudaPattern_ *next;
MpmEndMatch *em;
uint32_t id;
} B2gCudaPattern;
typedef struct B2gCudaHashItem_ {
@ -65,23 +65,6 @@ typedef struct B2gCudaCtx_ {
* in the engine that is holding this B2g_Cuda_Ctx */
int module_handle;
CUcontext cuda_context;
CUmodule cuda_module;
/* the search kernel */
CUfunction cuda_search_kernel;
/* the cuda_search_kernel argument offsets */
uint8_t cuda_search_kernel_arg0_offset;
uint8_t cuda_search_kernel_arg1_offset;
uint8_t cuda_search_kernel_arg2_offset;
uint8_t cuda_search_kernel_arg3_offset;
uint8_t cuda_search_kernel_arg4_offset;
uint8_t cuda_search_kernel_arg5_offset;
uint8_t cuda_search_kernel_arg_total;
/* cuda device pointer to the lower case table g_u8_lowercasetable */
CUdeviceptr cuda_g_u8_lowercasetable;
/* cuda device pointer to B2gCudaCtx->B2G */
CUdeviceptr cuda_B2G;
@ -141,17 +124,18 @@ typedef struct B2gCudaThreadCtx_ {
} B2gCudaThreadCtx;
void MpmB2gCudaRegister(void);
void TmModuleCudaMpmB2gRegister(void);
int B2gCudaStartDispatcherThreadRC(const char *);
int B2gCudaStartDispatcherThreadAPC(const char *);
void B2gCudaKillDispatcherThreadRC(void);
void B2gCudaKillDispatcherThreadAPC(void);
void B2gCudaPushPacketTo_tv_CMB2_RC(Packet *);
void B2gCudaPushPacketTo_tv_CMB2_APC(Packet *);
int B2gCudaResultsPostProcessing(Packet *, MpmCtx *, MpmThreadCtx *,
PatternMatcherQueue *);
uint32_t B2gCudaSearch1(MpmCtx *, MpmThreadCtx *, PatternMatcherQueue *,
uint8_t *, uint16_t);
#ifdef B2G_CUDA_SEARCH2
uint32_t B2gCudaSearch2(MpmCtx *, MpmThreadCtx *, PatternMatcherQueue *,
uint8_t *, uint16_t);
#endif
#endif /* __SC_CUDA_SUPPORT__ */
