From 33f4beb0bcd838aa11ce8cbbda484e2008d69612 Mon Sep 17 00:00:00 2001 From: Anoop Saldanha Date: Fri, 28 May 2010 16:22:07 +0530 Subject: [PATCH] batching of packets support for cuda b2g mpm. Supported for both 32 and 64 bit platforms --- src/Makefile.am | 4 +- src/cuda-packet-batcher.c | 1235 ++++++++++++ src/cuda-packet-batcher.h | 139 ++ src/data-queue.c | 93 + src/data-queue.h | 64 + src/decode.h | 92 +- src/detect-engine-mpm.c | 50 +- src/detect-engine.c | 41 - src/detect.c | 1 + src/runmodes.c | 114 +- src/suricata.c | 15 + src/suricata.h | 3 + src/tm-modules.h | 1 + src/tm-queues.c | 2 + src/tm-queues.h | 2 + src/tm-threads.c | 59 +- src/tm-threads.h | 28 + src/tmqh-simple.c | 42 +- src/tmqh-simple.h | 7 +- src/util-cuda-handlers.c | 213 +- src/util-cuda-handlers.h | 2 + src/util-mpm-b2g-cuda-kernel.cu | 77 +- src/util-mpm-b2g-cuda.c | 3355 ++++++++++++++----------------- src/util-mpm-b2g-cuda.h | 34 +- 24 files changed, 3529 insertions(+), 2144 deletions(-) create mode 100644 src/cuda-packet-batcher.c create mode 100644 src/cuda-packet-batcher.h create mode 100644 src/data-queue.c create mode 100644 src/data-queue.h diff --git a/src/Makefile.am b/src/Makefile.am index 9dece0c953..c40cbacb4f 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -8,6 +8,7 @@ bin_PROGRAMS = suricata suricata_SOURCES = suricata.c suricata.h \ runmodes.c runmodes.h \ packet-queue.c packet-queue.h \ +data-queue.c data-queue.h \ threads.c threads.h \ source-nfq.c source-nfq.h \ source-pcap.c source-pcap.h \ @@ -211,7 +212,8 @@ win32-misc.c win32-misc.h \ win32-service.c win32-service.h \ util-action.c util-action.h \ win32-syslog.h \ -util-profiling.c util-profiling.h +util-profiling.c util-profiling.h \ +cuda-packet-batcher.c cuda-packet-batcher.h # set the include path found by configure INCLUDES= $(all_includes) diff --git a/src/cuda-packet-batcher.c b/src/cuda-packet-batcher.c new file mode 100644 index 0000000000..0e8c7fa8b4 --- /dev/null +++ b/src/cuda-packet-batcher.c @@ -0,0 +1,1235 @@ +/** + * Copyright (c) 2010 Open Information Security Foundation. + * + * \author Anoop Saldanha + * + * \todo Some work yet to be done in this file. Firstly change the way we get + * the sgh. Once we implement the retrieval of sghs from the flow, as + * suggested by victor, we can get rid of the sgh retrieval here. + * Make the various parameters user configurable. Terribly hard-coded now. + */ + +/* compile in, only if we have a CUDA enabled on this machine */ +#ifdef __SC_CUDA_SUPPORT__ + +#include "suricata-common.h" +#include "suricata.h" + +#include "detect.h" +#include "decode.h" +#include "flow.h" +#include "data-queue.h" + +#include "threads.h" +#include "threadvars.h" +#include "tm-queuehandlers.h" +#include "tm-modules.h" + +#include "cuda-packet-batcher.h" +#include "conf.h" + +#include "util-error.h" +#include "util-debug.h" +#include "util-unittest.h" + +#include "util-mpm-b2g-cuda.h" +#include "detect-engine-address.h" +#include "detect-engine-port.h" +#include "detect-engine.h" +#include "detect-parse.h" +#include "tm-threads.h" +#include "tmqh-packetpool.h" + +/* \todo Make this user configurable through our yaml file. Also provide options + * where this can be dynamically updated based on the traffic */ +#define SC_CUDA_PB_BATCHER_ALARM_TIME 1 + +/* holds the inq and outq between the cuda-packet-batcher TM and the cuda b2g mpm + * dispatcher thread */ +static Tmq *tmq_inq = NULL; +static Tmq *tmq_outq = NULL; + +/* holds the packet inq between the batcher TM and, the TM feeding it packets + * in the runmode sequence. We will need this to implement the alarm. We will + * have a SIG_ALRM delivered every SC_CUDA_PB_BATCHER_ALARM_TIME seconds, after + * which we willf set a flag informing the batcher TM to queue the buffer to the + * GPU and wake the batcher thread, in case it is waiting on a conditional for a + * packet from the previous TM in the runmode */ +static Tmq *tmq_batcher_inq = NULL; + +/* used to indicate if we want to stop buffering the packets anymore. We + * we will need this while we want to shut the engine down + * \todo give a better description */ +static int run_batcher = 1; + +/* indicates the maximum no of packets we are ready to buffer. Theoretically the + * maximum value held by this var can't exceed the value held by + * "max_pending_packets". Either ways we should make this user configurable like + * SC_CUDA_PB_BATCHER_ALARM_TIME. Also allow dynamic updates to this value based + * on the traffic + * \todo make this user configurable, as well allow dynamic update of this + * variable based on the traffic seen */ +static uint32_t buffer_packet_threshhold = 1280; + +/* flag used by the SIG_ALRM handler to indicate that the batcher TM should queue + * the buffer to be processed by the Cuda Mpm B2g Batcher Thread for further + * processing on the GPU */ +static int queue_buffer = 0; + +/** + * \internal + * \brief The SIG_ALRM handler. We will set the "queue_buffer" flag thus + * informing the batcher TM that it needs to queue the buffer. We + * also signal the cond var for the batcher TM inq(the one it + * receives packets from), incase it is waiting on the conditional + * for a new packet from the previous TM in the runmodes list. + * + * \param signum The signal number that this function just woke up to. In + * our case it is SIG_ALRM. + */ +static void SCCudaPBSetQueueBufferFlag(int signum) +{ + SCLogDebug("Cuda Packet Batche alarm generated after %d seconds. Set the" + "queue_buffer flag and signal the cuda TM inq.", + SC_CUDA_PB_BATCHER_ALARM_TIME); + //queue_buffer = 1; + //SCCondSignal(&((&trans_q[tmq_batcher_inq->id])->cond_q)); + + return; +} + +/** + * \internal. + * \brief Set the SIG_ALRM handler + */ +static void SCCudaPBSetBatcherAlarmTimeHandler() +{ + struct sigaction action; + + SCLogDebug("Setting the SIGALRM handler for the Cuda Batcher TM"); + action.sa_handler = SCCudaPBSetQueueBufferFlag; + sigemptyset(&(action.sa_mask)); + sigaddset(&(action.sa_mask), SIGALRM); + action.sa_flags = 0; + sigaction(SIGALRM, &action, 0); + + return; +} + +/** + * \internal + * \brief Used to retrieve the Signature Group Head for a packet. + * + * \param de_ctx Pointer the detection engine context to search for the + * sgh for an incoming packet. + * \param p Pointer to the incoming packet for which we will have to + * search for a sgh. + * + * \retval sgh Pointer to the relevant matching sgh for the Packet. + */ +static SigGroupHead *SCCudaPBGetSgh(DetectEngineCtx *de_ctx, Packet *p) +{ + int ds, f; + SigGroupHead *sgh = NULL; + + /* select the dsize_gh */ + if (p->payload_len <= 100) + ds = 0; + else + ds = 1; + + /* select the flow_gh */ + if (p->flowflags & FLOW_PKT_TOCLIENT) + f = 0; + else + f = 1; + + /* find the right mpm instance */ + DetectAddress *ag = DetectAddressLookupInHead(de_ctx->dsize_gh[ds].flow_gh[f].src_gh[p->proto], &p->src); + if (ag != NULL) { + /* source group found, lets try a dst group */ + ag = DetectAddressLookupInHead(ag->dst_gh,&p->dst); + if (ag != NULL) { + if (ag->port == NULL) { + SCLogDebug("we don't have ports"); + sgh = ag->sh; + } else { + SCLogDebug("we have ports"); + + DetectPort *sport = DetectPortLookupGroup(ag->port,p->sp); + if (sport != NULL) { + DetectPort *dport = DetectPortLookupGroup(sport->dst_ph, p->dp); + if (dport != NULL) { + sgh = dport->sh; + } else { + SCLogDebug("no dst port group found for the packet with dp %"PRIu16, p->dp); + } + } else { + SCLogDebug("no src port group found for the packet with sp %"PRIu16, p->sp); + } + } + } else { + SCLogDebug("no dst address group found for the packet"); + } + } else { + SCLogDebug("no src address group found for the packet"); + } + + return sgh; +} + +/** + * \internal + * \brief Handles the queuing of the buffer from this batcher TM to the cuda + * mpm b2g dispatcher TM. + * + * \tctx The batcher thread context that holds the current operational buffer + * which has to be buffered by this function. + */ +static void SCCudaPBQueueBuffer(SCCudaPBThreadCtx *tctx) +{ + SCCudaPBPacketsBuffer *pb = (SCCudaPBPacketsBuffer *)tctx->curr_pb; + uint32_t nop_in_buffer = pb->nop_in_buffer; + uint32_t *packets_offset_buffer = pb->packets_offset_buffer; + uint32_t offset = *(packets_offset_buffer + nop_in_buffer - 1); + SCCudaPBPacketDataForGPU *last_packet = (SCCudaPBPacketDataForGPU *)(pb->packets_buffer + + offset); + + /* if we have no packets buffered in so far, get out */ + if (pb->nop_in_buffer == 0) { + SCLogDebug("No packets buffered in so far in the cuda buffer. Returning"); + return; + } + + /* calculate the total length of all the packets buffered in */ + pb->packets_buffer_len = pb->packets_offset_buffer[pb->nop_in_buffer - 1] + + sizeof(SCCudaPBPacketDataForGPUNonPayload) + + last_packet->payload_len; + + pb->packets_total_payload_len = pb->packets_payload_offset_buffer[pb->nop_in_buffer - 1] + + last_packet->payload_len; + + /* enqueue the buffer in the outq to be consumed by the dispatcher TM */ + SCDQDataQueue *dq_outq = &data_queues[tmq_outq->id]; + SCMutexLock(&dq_outq->mutex_q); + SCDQDataEnqueue(dq_outq, (SCDQGenericQData *)tctx->curr_pb); + SCCondSignal(&dq_outq->cond_q); + SCMutexUnlock(&dq_outq->mutex_q); + + while (run_batcher) { + /* dequeue a new buffer */ + SCDQDataQueue *dq_inq = &data_queues[tmq_inq->id]; + SCMutexLock(&dq_inq->mutex_q); + if (dq_inq->len == 0) { + /* if we have no data in queue, wait... */ + SCondWait(&dq_inq->cond_q, &dq_inq->mutex_q); + } + + if (run_batcher == 0) { + break; + } + + if (dq_inq->len > 0) { + tctx->curr_pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(dq_inq); + tctx->curr_pb->nop_in_buffer = 0; + tctx->curr_pb->packets_buffer_len = 0; + tctx->curr_pb->packets_total_payload_len = 0; + SCMutexUnlock(&dq_inq->mutex_q); + SCLogDebug("Dequeued a new packet buffer for the cuda batcher TM"); + break; + } else { + /* Should only happen on signals. */ + SCMutexUnlock(&dq_inq->mutex_q); + SCLogDebug("Unable to Relooping in the quest to dequeue new buffer\n"); + } + } /* while (run_batcher) */ + + return; +} + +/** + * \brief Custom slot function used by the Batcher TM. + * + * \param td Pointer to the ThreadVars instance. In this case the batcher TM's + * ThreadVars instance. + */ +void *SCCudaPBTmThreadsSlot1(void *td) +{ + ThreadVars *tv = (ThreadVars *)td; + Tm1Slot *s = (Tm1Slot *)tv->tm_slots; + Packet *p = NULL; + char run = 1; + TmEcode r = TM_ECODE_OK; + + /* Set the thread name */ + SCSetThreadName(tv->name); + + if (tv->thread_setup_flags != 0) { + TmThreadSetupOptions(tv); + } + + SCLogDebug("%s starting", tv->name); + + if (s->s.SlotThreadInit != NULL) { + r = s->s.SlotThreadInit(tv, s->s.slot_initdata, &s->s.slot_data); + if (r != TM_ECODE_OK) { + EngineKill(); + + TmThreadsSetFlag(tv, THV_CLOSED); + pthread_exit((void *) -1); + } + } + memset(&s->s.slot_pq, 0, sizeof(PacketQueue)); + + TmThreadsSetFlag(tv, THV_INIT_DONE); + while(run) { + TmThreadTestThreadUnPaused(tv); + + /* input a packet */ + p = tv->tmqh_in(tv); + + if (p == NULL) { + printf("packet is NULL for TM: %s\n", tv->name); + /* the only different between the actual Slot1 function in + * tm-threads.c and this custom Slot1 function is this call + * here. We need to make the call here, even if we don't + * receive a packet from the previous stage in the runmodes. + * This is needed in cases where we the SIG_ALRM handler + * wants us to queue the buffer to the GPU and ends up waking + * the Batcher TM(which is waiting on a cond from the previous + * feeder TM). Please handler the NULL packet case in the + * function that you now call */ + r = s->s.SlotFunc(tv, p, s->s.slot_data, &s->s.slot_pq); + } else { + r = s->s.SlotFunc(tv, p, s->s.slot_data, &s->s.slot_pq); + /* handle error */ + if (r == TM_ECODE_FAILED) { + TmqhReleasePacketsToPacketPool(&s->s.slot_pq); + TmqhOutputPacketpool(tv, p); + TmThreadsSetFlag(tv, THV_FAILED); + break; + } + + /* output the packet */ + tv->tmqh_out(tv, p); + } + + if (TmThreadsCheckFlag(tv, THV_KILL)) { + SCPerfUpdateCounterArray(tv->sc_perf_pca, &tv->sc_perf_pctx, 0); + run = 0; + } + } + + if (s->s.SlotThreadExitPrintStats != NULL) { + s->s.SlotThreadExitPrintStats(tv, s->s.slot_data); + } + + if (s->s.SlotThreadDeinit != NULL) { + r = s->s.SlotThreadDeinit(tv, s->s.slot_data); + if (r != TM_ECODE_OK) { + TmThreadsSetFlag(tv, THV_CLOSED); + pthread_exit((void *) -1); + } + } + + SCLogDebug("%s ending", tv->name); + TmThreadsSetFlag(tv, THV_CLOSED); + pthread_exit((void *) 0); +} + +/** + * \brief Used to de-allocate an instance of SCCudaPBPacketsBuffer. + * + * \param pb Pointer to the SCCudaPacketsBuffer instance to be de-alloced. + */ +void SCCudaPBDeAllocSCCudaPBPacketsBuffer(SCCudaPBPacketsBuffer *pb) +{ + if (pb == NULL) + return; + + if (pb->packets_buffer != NULL) + free(pb->packets_buffer); + if (pb->packets_offset_buffer != NULL) + free(pb->packets_offset_buffer); + if (pb->packets_payload_offset_buffer != NULL) + free(pb->packets_payload_offset_buffer); + if (pb->packets_address_buffer != NULL) + free(pb->packets_address_buffer); + + free(pb); + + return; +} + +/** + * \brief Allocates a new instance of SCCudaPBPacketsBuffer. + * + * \param pb The newly created instance of SCCudaPBPacketsBuffer. + */ +SCCudaPBPacketsBuffer *SCCudaPBAllocSCCudaPBPacketsBuffer(void) +{ + SCCudaPBPacketsBuffer *pb = malloc(sizeof(SCCudaPBPacketsBuffer)); + if (pb == NULL) { + SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory"); + exit(EXIT_FAILURE); + } + memset(pb, 0, sizeof(SCCudaPBPacketsBuffer)); + + /* the buffer for the packets to be sent over to the gpu. We allot space for + * a minimum of SC_CUDA_PB_MIN_NO_OF_PACKETS, i.e. if each packet buffered + * is full to the brim */ + pb->packets_buffer = malloc(sizeof(SCCudaPBPacketDataForGPU) * + SC_CUDA_PB_MIN_NO_OF_PACKETS); + if (pb->packets_buffer == NULL) { + SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory"); + exit(EXIT_FAILURE); + } + memset(pb->packets_buffer, 0, sizeof(SCCudaPBPacketDataForGPU) * + SC_CUDA_PB_MIN_NO_OF_PACKETS); + + /* used to hold the offsets of the buffered packets in the packets_buffer */ + pb->packets_offset_buffer = malloc(sizeof(uint32_t) * + SC_CUDA_PB_MIN_NO_OF_PACKETS); + if (pb->packets_offset_buffer == NULL) { + SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory"); + exit(EXIT_FAILURE); + } + memset(pb->packets_offset_buffer, 0, sizeof(uint32_t) * + SC_CUDA_PB_MIN_NO_OF_PACKETS); + + /* used to hold the offsets of the packets payload */ + pb->packets_payload_offset_buffer = malloc(sizeof(uint32_t) * + SC_CUDA_PB_MIN_NO_OF_PACKETS); + if (pb->packets_payload_offset_buffer == NULL) { + SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory"); + exit(EXIT_FAILURE); + } + memset(pb->packets_payload_offset_buffer, 0, sizeof(uint32_t) * + SC_CUDA_PB_MIN_NO_OF_PACKETS); + + /* used to hold the packet addresses for all the packets buffered inside + * packets_buffer */ + pb->packets_address_buffer = malloc(sizeof(Packet *) * + SC_CUDA_PB_MIN_NO_OF_PACKETS); + if (pb->packets_address_buffer == NULL) { + SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory"); + exit(EXIT_FAILURE); + } + memset(pb->packets_address_buffer, 0, sizeof(Packet *) * + SC_CUDA_PB_MIN_NO_OF_PACKETS); + + return pb; +} + +/** + * \brief Registration function for the Cuda Packet Batcher TM. + */ +void TmModuleCudaPacketBatcherRegister(void) +{ + tmm_modules[TMM_CUDA_PACKET_BATCHER].name = "CudaPacketBatcher"; + tmm_modules[TMM_CUDA_PACKET_BATCHER].ThreadInit = SCCudaPBThreadInit; + tmm_modules[TMM_CUDA_PACKET_BATCHER].Func = SCCudaPBBatchPackets; + tmm_modules[TMM_CUDA_PACKET_BATCHER].ThreadExitPrintStats = SCCudaPBThreadExitStats; + tmm_modules[TMM_CUDA_PACKET_BATCHER].ThreadDeinit = SCCudaPBThreadDeInit; + tmm_modules[TMM_CUDA_PACKET_BATCHER].RegisterTests = SCCudaPBRegisterTests; + + return; +} + +/** + * \brief The cuda batcher TM init function. + * + * \param tv The cuda packet batcher TM ThreadVars instance. + * \param initdata The initialization data needed by this cuda batcher TM. + * \param data Pointer to a ponter memory location that would be updated + * with the newly created thread ctx instance. + * + * \retval TM_ECODE_OK On success. + * \retval TM_ECODE_FAILED On failure. + */ +TmEcode SCCudaPBThreadInit(ThreadVars *tv, void *initdata, void **data) +{ + SCCudaPBThreadCtx *tctx = NULL; + + if (initdata == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument. initdata NULL " + "for the cuda batcher TM init thread function"); + return TM_ECODE_FAILED; + } + + tctx = malloc(sizeof(SCCudaPBThreadCtx)); + if (tctx == NULL) { + SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory"); + exit(EXIT_FAILURE); + } + memset(tctx, 0, sizeof(SCCudaPBThreadCtx)); + + /* the detection engine context. We will need it to retrieve the sgh, + * when we start receiving and batching the packets */ + tctx->de_ctx = initdata; + + /* the first packet buffer from the queue */ + tctx->curr_pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(&data_queues[tmq_inq->id]); + + *data = tctx; + + /* we will need the cuda packet batcher TM's inq for further use later. Read + * the comments associated with this var definition, for its use */ + tmq_batcher_inq = tv->inq; + + /* set the SIG_ALRM handler */ + SCCudaPBSetBatcherAlarmTimeHandler(); + + /* Set the alarm time limit during which the batcher thread would buffer packets */ + alarm(SC_CUDA_PB_BATCHER_ALARM_TIME); + + return TM_ECODE_OK; +} + +/** + * \brief Batches packets into the packets buffer. + * + * \param tv Pointer to the ThreadVars instance, in this case the cuda packet + * batcher TM's TV instance. + * \param p Pointer the the packet to be buffered. + * \param data Pointer the the batcher TM thread ctx. + * \param pq Pointer to the packetqueue. We don't need this. + * + * \retval TM_ECODE_OK On success. + * \retval TM_ECODE_FAILED On failure. + */ +TmEcode SCCudaPBBatchPackets(ThreadVars *tv, Packet *p, void *data, PacketQueue *pq) +{ +#define ALIGN_UP(offset, alignment) \ + (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1) + + /* ah. we have been signalled that we crossed the time limit within which we + * need to buffer packets. Let us queue the buffer to the GPU */ + if (queue_buffer) { + SCLogDebug("Cuda packet buffer TIME limit exceeded. Buffering packet " + "buffer and reseting the alarm"); + queue_buffer = 0; + SCCudaPBQueueBuffer(data); + alarm(SC_CUDA_PB_BATCHER_ALARM_TIME); + } + + /* this is possible, since we are using a custom slot function that calls this + * function, even if it receives no packet from the packet queue */ + if (p == NULL) { + SCLogDebug("packet NULL inside Cuda batcher TM"); + return TM_ECODE_OK; + } + + /* we set it for every incoming packet. We will set this depending on whether + * we end up buffering the packet or not */ + p->cuda_mpm_enabled = 0; + + SCCudaPBThreadCtx *tctx = data; + /* the packets buffer */ + SCCudaPBPacketsBuffer *pb = (SCCudaPBPacketsBuffer *)tctx->curr_pb; + /* the previous packet which has been buffered into the packets_buffer */ + SCCudaPBPacketDataForGPU *prev_buff_packet = NULL; + /* holds the position in the packets_buffer where the curr packet would + * be buffered in */ + SCCudaPBPacketDataForGPU *curr_packet = NULL; + /* the sgh to which the incoming packet belongs */ + SigGroupHead *sgh = NULL; + + /* get the signature group head to which this packet belongs. If it belongs + * to no sgh, we don't need to buffer this packet. + * \todo Get rid of this, once we get the sgh from the flow */ + sgh = SCCudaPBGetSgh(tctx->de_ctx, p); + if (sgh == NULL) { + SCLogDebug("No SigGroupHead match for this packet"); + return TM_ECODE_OK; + } + + /* if the payload is less than the maximum content length in this sgh we + * don't need to run the PM on this packet. Chuck the packet out */ + if (sgh->mpm_content_maxlen > p->payload_len) { + SCLogDebug("not mpm-inspecting as pkt payload is smaller than " + "the largest content length we need to match"); + return TM_ECODE_OK; + } + + /* if one of these conditions fail we don't have to run the mpm on this + * packet. Firstly if the payload_len is == 0, we don't have a payload + * to match against. Next if we don't have a mpm_context against this + * sgh, indicating we don't have any patterns in this sgh, again we don't + * have anything to run the PM against. Finally if the flow doesn't want + * to analyze packets for this flow, we can chuck this packet out as well */ + if ( !(p->payload_len > 0 && sgh->mpm_ctx != NULL && + !(p->flags & PKT_NOPAYLOAD_INSPECTION)) ) { + SCLogDebug("Either p->payload_len <= 0 or mpm_ctx for the packet is NULL " + "or PKT_NOPAYLOAD_INSPECTION set for this packet"); + return TM_ECODE_OK; + } + + /* the cuda b2g context */ + B2gCudaCtx *ctx = sgh->mpm_ctx->ctx; + + /* if we have a 1 byte search kernel set we don't buffer this packet for + * cuda matching and instead run this non-cuda mpm function to be run on + * the packet */ + if (ctx->Search == B2gCudaSearch1) { + SCLogDebug("The packet has a one byte patterns. run mpm " + "separately"); + return TM_ECODE_OK; + } + +#ifdef B2G_CUDA_SEARCH2 + /* if we have a 2 byte search kernel set we don't buffer this packet for + * cuda matching and instead run this non-cuda mpm function to be run on the + * packet */ + if (ctx->Search == B2gCudaSearch2) { + SCLogDebug("The packet has two byte patterns. run mpm " + "separately"); + return TM_ECODE_OK; + } +#endif + + /* we have passed all the criterions for buffering the packet. Set the + * flag indicating that the packet goes through cuda mpm */ + p->cuda_mpm_enabled = 1; + + /* first packet to be buffered in */ + if (pb->nop_in_buffer == 0) { + curr_packet = (SCCudaPBPacketDataForGPU *)pb->packets_buffer; + + /* buffer is not empty */ + } else { + prev_buff_packet = (SCCudaPBPacketDataForGPU *)(pb->packets_buffer + + pb->packets_offset_buffer[pb->nop_in_buffer - 1]); + curr_packet = (SCCudaPBPacketDataForGPU *)((uint8_t *)prev_buff_packet + + sizeof(SCCudaPBPacketDataForGPUNonPayload) + + prev_buff_packet->payload_len) ; + int diff = (int)((uint8_t *)curr_packet - pb->packets_buffer); + /* \todo Feel it is the wrong option taken by nvidia by setting CUdeviceptr + * to unsigned int. Keep this option for now. We will get back to this + * once nvidia responds to the filed bug */ + ALIGN_UP(diff, sizeof(CUdeviceptr)); + curr_packet = (SCCudaPBPacketDataForGPU *)(pb->packets_buffer + diff); + } + + /* store the data in the packets_buffer for this packet, which would be passed + * over to the GPU for processing */ + curr_packet->m = ((B2gCudaCtx *)(sgh->mpm_ctx->ctx))->m; + curr_packet->table = ((B2gCudaCtx *)(sgh->mpm_ctx->ctx))->cuda_B2G; + curr_packet->payload_len = p->payload_len; + memcpy(curr_packet->payload, p->payload, p->payload_len); + + /* store the address of the packet just buffered at the same index. The + * dispatcher thread will need this address to communicate the results back + * to the packet */ + pb->packets_address_buffer[pb->nop_in_buffer] = p; + + /* if it is the first packet to be buffered, the offset is 0. If it is not, + * then take the offset for the buffer from curr_packet */ + if (pb->nop_in_buffer == 0) { + pb->packets_offset_buffer[pb->nop_in_buffer] = 0; + pb->packets_payload_offset_buffer[pb->nop_in_buffer] = 0; + } else { + pb->packets_offset_buffer[pb->nop_in_buffer] = (uint8_t *)curr_packet - pb->packets_buffer; + pb->packets_payload_offset_buffer[pb->nop_in_buffer] = + pb->packets_payload_offset_buffer[pb->nop_in_buffer - 1] + + prev_buff_packet->payload_len; + } + + /* indicates the no of packets added so far into the buffer */ + pb->nop_in_buffer++; + + /* we have hit the threshhold for the total no of packets held in the buffer. + * We will change this in the future, instead relying on the remaining space + * left in the buffer or we have been informed that we have hit the time limit + * to queue the buffer */ + if ( (pb->nop_in_buffer == buffer_packet_threshhold) || queue_buffer) { + queue_buffer = 0; + SCLogDebug("Either we have hit the threshold limit for packets(i.e.) we " + "have %d packets limit) OR we have exceeded the buffering " + "time limit. Buffering the packet buffer and reseting the " + "alarm.", buffer_packet_threshhold); + SCCudaPBQueueBuffer(tctx); + alarm(SC_CUDA_PB_BATCHER_ALARM_TIME); + } + + return TM_ECODE_OK; +} + +void SCCudaPBThreadExitStats(ThreadVars *tv, void *data) +{ + return; +} + +/** + * \brief The thread de-init function for the cuda packet batcher TM. + * + * \param tv Pointer to the cuda packet batcher TM ThreadVars instance. + * \param data Pointer the the Thread ctx for the cuda packet batcher TM. + * + * \retval TM_ECODE_OK On success. + * \retval TM_ECODE_FAILED On failure. Although we won't be returning this here. + */ +TmEcode SCCudaPBThreadDeInit(ThreadVars *tv, void *data) +{ + SCCudaPBThreadCtx *tctx = data; + + if (tctx != NULL) { + if (tctx->curr_pb != NULL) { + SCCudaPBDeAllocSCCudaPBPacketsBuffer(tctx->curr_pb); + tctx->curr_pb = NULL; + } + free(tctx); + } + + return TM_ECODE_OK; +} + +/** + * \brief Sets up the queues and buffers needed by the cuda batcher TM function. + */ +void SCCudaPBSetUpQueuesAndBuffers(void) +{ + /* the b2g dispatcher thread would have to use the reverse for incoming + * and outgoing queues */ + char *inq_name = "cuda_batcher_mpm_inqueue"; + char *outq_name = "cuda_batcher_mpm_outqueue"; + int i = 0; + + /* set the incoming queue for the cuda_packet_batcher TM and the cuda B2g + * dispatcher */ + tmq_inq = TmqGetQueueByName(inq_name); + if (tmq_inq == NULL) { + tmq_inq = TmqCreateQueue(inq_name); + if (tmq_inq == NULL) { + return; + } + } + tmq_inq->reader_cnt++; + tmq_inq->writer_cnt++; + + /* set the outgoing queue from the cuda_packet_batcher TM and the cuda B2g + * dispatcher */ + tmq_outq = TmqGetQueueByName(outq_name); + if (tmq_outq == NULL) { + tmq_outq = TmqCreateQueue(outq_name); + if (tmq_outq == NULL) { + return; + } + } + tmq_outq->reader_cnt++; + tmq_outq->writer_cnt++; + + /* allocate the packet buffer */ + /* \todo need to work out the right no of packet buffers that we need to + * queue. I doubt we will need more than 4(as long as we don't run it on + * low traffic line). We don't want to get into the business of creating + * new ones, when we run out of buffers, since malloc for a huge chunk + * like this will take time. We need to figure out a value based on + * various other parameters like alarm time and buffer threshold value */ + for (i = 0; i < 10; i++) { + SCCudaPBPacketsBuffer *pb = SCCudaPBAllocSCCudaPBPacketsBuffer(); + /* dump the buffer into the inqueue for this batcher TM. the batcher + * thread would be the first consumer for these buffers */ + SCDQDataEnqueue(&data_queues[tmq_inq->id], (SCDQGenericQData *)pb); + } + + /* \todo This needs to be changed ASAP. This can't exceed max_pending_packets. + * Also we need to make this user configurable and allow dynamic updaes + * based on live traffic */ + buffer_packet_threshhold = 1280; + + return; +} + +/** + * \brief Clean up all the buffers queued in. Need to write more on this. + */ +void SCCudaPBCleanUpQueuesAndBuffers(void) +{ + SCCudaPBPacketsBuffer *pb = NULL; + SCDQDataQueue *dq = NULL; + + if (tmq_inq == NULL || tmq_outq == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid arguments. tmq_inq or " + "tmq_outq NULL"); + return; + } + + /* clean all the buffers present in the inq */ + dq = &data_queues[tmq_inq->id]; + SCMutexLock(&dq->mutex_q); + while ( (pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(dq)) != NULL) { + if (pb->packets_buffer != NULL) + free(pb->packets_buffer); + if (pb->packets_offset_buffer != NULL) + free(pb->packets_offset_buffer); + if (pb->packets_payload_offset_buffer != NULL) + free(pb->packets_payload_offset_buffer); + + free(pb); + } + SCMutexUnlock(&dq->mutex_q); + SCCondSignal(&dq->cond_q); + + /* clean all the buffers present in the outq */ + dq = &data_queues[tmq_outq->id]; + SCMutexLock(&dq->mutex_q); + while ( (pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(dq)) != NULL) { + if (pb->packets_buffer != NULL) + free(pb->packets_buffer); + if (pb->packets_offset_buffer != NULL) + free(pb->packets_offset_buffer); + if (pb->packets_payload_offset_buffer != NULL) + free(pb->packets_payload_offset_buffer); + + free(pb); + } + SCMutexUnlock(&dq->mutex_q); + SCCondSignal(&dq->cond_q); + + return; +} + +/** + * \brief Function used to set the packet threshhold limit in the packets buffer. + * + * \param threshhold_override The threshhold limit for the packets_buffer. + */ +void SCCudaPBSetBufferPacketThreshhold(uint32_t threshhold_override) +{ + buffer_packet_threshhold = threshhold_override; + + return; +} + +/** + * \brief Used to inform the cuda packet batcher that packet batching shouldn't + * be done anymore and set the flag to indicate this. We also need to + * signal the cuda batcher data inq, in case it is waiting on the inq + * for a new free packet buffer. + */ +void SCCudaPBKillBatchingPackets(void) +{ + run_batcher = 0; + SCDQDataQueue *dq = &data_queues[tmq_inq->id]; + SCCondSignal(&dq->cond_q); + + return; +} + +/***********************************Unittests**********************************/ + +#ifdef UNITTESTS + +int SCCudaPBTest01(void) +{ +#define ALIGN_UP(offset, alignment) \ + (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1) + + uint8_t raw_eth[] = { + 0x00, 0x25, 0x00, 0x9e, 0xfa, 0xfe, 0x00, 0x02, + 0xcf, 0x74, 0xfe, 0xe1, 0x08, 0x00, 0x45, 0x00, + 0x01, 0xcc, 0xcb, 0x91, 0x00, 0x00, 0x34, 0x06, + 0xdf, 0xa8, 0xd1, 0x55, 0xe3, 0x67, 0xc0, 0xa8, + 0x64, 0x8c, 0x00, 0x50, 0xc0, 0xb7, 0xd1, 0x11, + 0xed, 0x63, 0x81, 0xa9, 0x9a, 0x05, 0x80, 0x18, + 0x00, 0x75, 0x0a, 0xdd, 0x00, 0x00, 0x01, 0x01, + 0x08, 0x0a, 0x09, 0x8a, 0x06, 0xd0, 0x12, 0x21, + 0x2a, 0x3b, 0x48, 0x54, 0x54, 0x50, 0x2f, 0x31, + 0x2e, 0x31, 0x20, 0x33, 0x30, 0x32, 0x20, 0x46, + 0x6f, 0x75, 0x6e, 0x64, 0x0d, 0x0a, 0x4c, 0x6f, + 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, + 0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77, + 0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, + 0x65, 0x2e, 0x65, 0x73, 0x2f, 0x0d, 0x0a, 0x43, + 0x61, 0x63, 0x68, 0x65, 0x2d, 0x43, 0x6f, 0x6e, + 0x74, 0x72, 0x6f, 0x6c, 0x3a, 0x20, 0x70, 0x72, + 0x69, 0x76, 0x61, 0x74, 0x65, 0x0d, 0x0a, 0x43, + 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54, + 0x79, 0x70, 0x65, 0x3a, 0x20, 0x74, 0x65, 0x78, + 0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x20, + 0x63, 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, + 0x55, 0x54, 0x46, 0x2d, 0x38, 0x0d, 0x0a, 0x44, + 0x61, 0x74, 0x65, 0x3a, 0x20, 0x4d, 0x6f, 0x6e, + 0x2c, 0x20, 0x31, 0x34, 0x20, 0x53, 0x65, 0x70, + 0x20, 0x32, 0x30, 0x30, 0x39, 0x20, 0x30, 0x38, + 0x3a, 0x34, 0x38, 0x3a, 0x33, 0x31, 0x20, 0x47, + 0x4d, 0x54, 0x0d, 0x0a, 0x53, 0x65, 0x72, 0x76, + 0x65, 0x72, 0x3a, 0x20, 0x67, 0x77, 0x73, 0x0d, + 0x0a, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, + 0x2d, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3a, + 0x20, 0x32, 0x31, 0x38, 0x0d, 0x0a, 0x0d, 0x0a, + 0x3c, 0x48, 0x54, 0x4d, 0x4c, 0x3e, 0x3c, 0x48, + 0x45, 0x41, 0x44, 0x3e, 0x3c, 0x6d, 0x65, 0x74, + 0x61, 0x20, 0x68, 0x74, 0x74, 0x70, 0x2d, 0x65, + 0x71, 0x75, 0x69, 0x76, 0x3d, 0x22, 0x63, 0x6f, + 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x74, 0x79, + 0x70, 0x65, 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74, + 0x65, 0x6e, 0x74, 0x3d, 0x22, 0x74, 0x65, 0x78, + 0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x63, + 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, 0x75, + 0x74, 0x66, 0x2d, 0x38, 0x22, 0x3e, 0x0a, 0x3c, + 0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x33, 0x30, + 0x32, 0x20, 0x4d, 0x6f, 0x76, 0x65, 0x64, 0x3c, + 0x2f, 0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x3c, + 0x2f, 0x48, 0x45, 0x41, 0x44, 0x3e, 0x3c, 0x42, + 0x4f, 0x44, 0x59, 0x3e, 0x0a, 0x3c, 0x48, 0x31, + 0x3e, 0x33, 0x30, 0x32, 0x20, 0x4d, 0x6f, 0x76, + 0x65, 0x64, 0x3c, 0x2f, 0x48, 0x31, 0x3e, 0x0a, + 0x54, 0x68, 0x65, 0x20, 0x64, 0x6f, 0x63, 0x75, + 0x6d, 0x65, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x73, + 0x20, 0x6d, 0x6f, 0x76, 0x65, 0x64, 0x0a, 0x3c, + 0x41, 0x20, 0x48, 0x52, 0x45, 0x46, 0x3d, 0x22, + 0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77, + 0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, + 0x65, 0x2e, 0x65, 0x73, 0x2f, 0x22, 0x3e, 0x68, + 0x65, 0x72, 0x65, 0x3c, 0x2f, 0x41, 0x3e, 0x2e, + 0x0d, 0x0a, 0x3c, 0x2f, 0x42, 0x4f, 0x44, 0x59, + 0x3e, 0x3c, 0x2f, 0x48, 0x54, 0x4d, 0x4c, 0x3e, + 0x0d, 0x0a }; + + int result = 0; + SCCudaPBThreadCtx *tctx = NULL; + + Packet p; + DecodeThreadVars dtv; + ThreadVars tv; + ThreadVars tv_cuda_PB; + DetectEngineCtx *de_ctx = NULL; + + SCCudaPBPacketsBuffer *pb = NULL; + SCCudaPBPacketDataForGPU *buff_packet = NULL; + SCDQDataQueue *dq = NULL; + + uint32_t i = 0; + + char *strings[] = {"test_one", + "test_two", + "test_three", + "test_four", + "test_five", + "test_six", + "test_seven", + "test_eight", + "test_nine", + "test_ten"}; + + uint32_t packets_payload_offset_buffer[sizeof(strings)/sizeof(char *)]; + memset(packets_payload_offset_buffer, 0, sizeof(packets_payload_offset_buffer)); + uint32_t packets_offset_buffer[sizeof(strings)/sizeof(char *)]; + memset(packets_offset_buffer, 0, sizeof(packets_offset_buffer)); + + uint32_t packets_total_payload_len = 0; + uint32_t packets_buffer_len = 0; + + for (i = 0; i < sizeof(strings)/sizeof(char *); i++) { + packets_total_payload_len += strlen(strings[i]); + } + + for (i = 1; i < sizeof(strings)/sizeof(char *); i++) { + packets_payload_offset_buffer[i] = packets_payload_offset_buffer[i - 1] + strlen(strings[i - 1]); + packets_offset_buffer[i] = packets_offset_buffer[i - 1] + + sizeof(SCCudaPBPacketDataForGPUNonPayload) + strlen(strings[i - 1]); + ALIGN_UP(packets_offset_buffer[i], sizeof(CUdeviceptr)); + } + packets_buffer_len += packets_offset_buffer[(sizeof(strings)/sizeof(char *)) - 1] + + sizeof(SCCudaPBPacketDataForGPUNonPayload) + strlen(strings[(sizeof(strings)/sizeof(char *)) - 1]); + + memset(&p, 0, sizeof(Packet)); + memset(&dtv, 0, sizeof(DecodeThreadVars)); + memset(&tv, 0, sizeof(ThreadVars)); + memset(&tv_cuda_PB, 0, sizeof(ThreadVars)); + + FlowInitConfig(FLOW_QUIET); + DecodeEthernet(&tv, &dtv, &p, raw_eth, sizeof(raw_eth), NULL); + + de_ctx = DetectEngineCtxInit(); + if (de_ctx == NULL) { + goto end; + } + + de_ctx->mpm_matcher = MPM_B2G_CUDA; + de_ctx->flags |= DE_QUIET; + + de_ctx->sig_list = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; " + "content:test; sid:1;)"); + if (de_ctx->sig_list == NULL) { + printf("signature parsing failed\n"); + goto end; + } + SigGroupBuild(de_ctx); + + result = 1; + + SCCudaPBSetUpQueuesAndBuffers(); + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 0); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 10); + SCCudaPBThreadInit(&tv_cuda_PB, de_ctx, (void *)&tctx); + SCCudaPBSetBufferPacketThreshhold(sizeof(strings)/sizeof(char *)); + + p.payload = (uint8_t *)strings[0]; + p.payload_len = strlen(strings[0]); + SCCudaPBBatchPackets(NULL, &p, tctx, NULL); + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 0); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 9); + + p.payload = (uint8_t *)strings[1]; + p.payload_len = strlen(strings[1]); + SCCudaPBBatchPackets(NULL, &p, tctx, NULL); + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 0); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 9); + + p.payload = (uint8_t *)strings[2]; + p.payload_len = strlen(strings[2]); + SCCudaPBBatchPackets(NULL, &p, tctx, NULL); + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 0); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 9); + + p.payload = (uint8_t *)strings[3]; + p.payload_len = strlen(strings[3]); + SCCudaPBBatchPackets(NULL, &p, tctx, NULL); + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 0); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 9); + + p.payload = (uint8_t *)strings[4]; + p.payload_len = strlen(strings[4]); + SCCudaPBBatchPackets(NULL, &p, tctx, NULL); + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 0); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 9); + + p.payload = (uint8_t *)strings[5]; + p.payload_len = strlen(strings[5]); + SCCudaPBBatchPackets(NULL, &p, tctx, NULL); + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 0); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 9); + + p.payload = (uint8_t *)strings[6]; + p.payload_len = strlen(strings[6]); + SCCudaPBBatchPackets(NULL, &p, tctx, NULL); + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 0); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 9); + + p.payload = (uint8_t *)strings[7]; + p.payload_len = strlen(strings[7]); + SCCudaPBBatchPackets(NULL, &p, tctx, NULL); + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 0); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 9); + + p.payload = (uint8_t *)strings[8]; + p.payload_len = strlen(strings[8]); + SCCudaPBBatchPackets(NULL, &p, tctx, NULL); + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 0); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 9); + + p.payload = (uint8_t *)strings[9]; + p.payload_len = strlen(strings[9]); + SCCudaPBBatchPackets(NULL, &p, tctx, NULL); + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 1); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 8); + + dq = &data_queues[tmq_outq->id]; + pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(dq); + if (pb == NULL) { + result = 0; + goto end; + } + result &= (dq->len == 0); + result &= (pb->nop_in_buffer == 10); + if (result == 0) + goto end; + + for (i = 0; i < pb->nop_in_buffer; i++) { + buff_packet = (SCCudaPBPacketDataForGPU *)(pb->packets_buffer + pb->packets_offset_buffer[i]); + result &= (strlen(strings[i]) == buff_packet->payload_len); + result &= (memcmp(strings[i], buff_packet->payload, buff_packet->payload_len) == 0); + if (result == 0) + goto end; + result &= (packets_payload_offset_buffer[i] == pb->packets_payload_offset_buffer[i]); + result &= (packets_offset_buffer[i] == pb->packets_offset_buffer[i]); + } + result &= (packets_total_payload_len == pb->packets_total_payload_len); + result &= (packets_buffer_len == pb->packets_buffer_len); + + end: + SCCudaPBCleanUpQueuesAndBuffers(); + if (de_ctx) { + SigGroupCleanup(de_ctx); + SigCleanSignatures(de_ctx); + DetectEngineCtxFree(de_ctx); + } + + SCCudaPBThreadDeInit(NULL, tctx); + return result; +} + +int SCCudaPBTest02(void) +{ + uint8_t raw_eth[] = { + 0x00, 0x25, 0x00, 0x9e, 0xfa, 0xfe, 0x00, 0x02, + 0xcf, 0x74, 0xfe, 0xe1, 0x08, 0x00, 0x45, 0x00, + 0x01, 0xcc, 0xcb, 0x91, 0x00, 0x00, 0x34, 0x06, + 0xdf, 0xa8, 0xd1, 0x55, 0xe3, 0x67, 0xc0, 0xa8, + 0x64, 0x8c, 0x00, 0x50, 0xc0, 0xb7, 0xd1, 0x11, + 0xed, 0x63, 0x81, 0xa9, 0x9a, 0x05, 0x80, 0x18, + 0x00, 0x75, 0x0a, 0xdd, 0x00, 0x00, 0x01, 0x01, + 0x08, 0x0a, 0x09, 0x8a, 0x06, 0xd0, 0x12, 0x21, + 0x2a, 0x3b, 0x48, 0x54, 0x54, 0x50, 0x2f, 0x31, + 0x2e, 0x31, 0x20, 0x33, 0x30, 0x32, 0x20, 0x46, + 0x6f, 0x75, 0x6e, 0x64, 0x0d, 0x0a, 0x4c, 0x6f, + 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, + 0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77, + 0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, + 0x65, 0x2e, 0x65, 0x73, 0x2f, 0x0d, 0x0a, 0x43, + 0x61, 0x63, 0x68, 0x65, 0x2d, 0x43, 0x6f, 0x6e, + 0x74, 0x72, 0x6f, 0x6c, 0x3a, 0x20, 0x70, 0x72, + 0x69, 0x76, 0x61, 0x74, 0x65, 0x0d, 0x0a, 0x43, + 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54, + 0x79, 0x70, 0x65, 0x3a, 0x20, 0x74, 0x65, 0x78, + 0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x20, + 0x63, 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, + 0x55, 0x54, 0x46, 0x2d, 0x38, 0x0d, 0x0a, 0x44, + 0x61, 0x74, 0x65, 0x3a, 0x20, 0x4d, 0x6f, 0x6e, + 0x2c, 0x20, 0x31, 0x34, 0x20, 0x53, 0x65, 0x70, + 0x20, 0x32, 0x30, 0x30, 0x39, 0x20, 0x30, 0x38, + 0x3a, 0x34, 0x38, 0x3a, 0x33, 0x31, 0x20, 0x47, + 0x4d, 0x54, 0x0d, 0x0a, 0x53, 0x65, 0x72, 0x76, + 0x65, 0x72, 0x3a, 0x20, 0x67, 0x77, 0x73, 0x0d, + 0x0a, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, + 0x2d, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3a, + 0x20, 0x32, 0x31, 0x38, 0x0d, 0x0a, 0x0d, 0x0a, + 0x3c, 0x48, 0x54, 0x4d, 0x4c, 0x3e, 0x3c, 0x48, + 0x45, 0x41, 0x44, 0x3e, 0x3c, 0x6d, 0x65, 0x74, + 0x61, 0x20, 0x68, 0x74, 0x74, 0x70, 0x2d, 0x65, + 0x71, 0x75, 0x69, 0x76, 0x3d, 0x22, 0x63, 0x6f, + 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x74, 0x79, + 0x70, 0x65, 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74, + 0x65, 0x6e, 0x74, 0x3d, 0x22, 0x74, 0x65, 0x78, + 0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x63, + 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, 0x75, + 0x74, 0x66, 0x2d, 0x38, 0x22, 0x3e, 0x0a, 0x3c, + 0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x33, 0x30, + 0x32, 0x20, 0x4d, 0x6f, 0x76, 0x65, 0x64, 0x3c, + 0x2f, 0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x3c, + 0x2f, 0x48, 0x45, 0x41, 0x44, 0x3e, 0x3c, 0x42, + 0x4f, 0x44, 0x59, 0x3e, 0x0a, 0x3c, 0x48, 0x31, + 0x3e, 0x33, 0x30, 0x32, 0x20, 0x4d, 0x6f, 0x76, + 0x65, 0x64, 0x3c, 0x2f, 0x48, 0x31, 0x3e, 0x0a, + 0x54, 0x68, 0x65, 0x20, 0x64, 0x6f, 0x63, 0x75, + 0x6d, 0x65, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x73, + 0x20, 0x6d, 0x6f, 0x76, 0x65, 0x64, 0x0a, 0x3c, + 0x41, 0x20, 0x48, 0x52, 0x45, 0x46, 0x3d, 0x22, + 0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77, + 0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, + 0x65, 0x2e, 0x65, 0x73, 0x2f, 0x22, 0x3e, 0x68, + 0x65, 0x72, 0x65, 0x3c, 0x2f, 0x41, 0x3e, 0x2e, + 0x0d, 0x0a, 0x3c, 0x2f, 0x42, 0x4f, 0x44, 0x59, + 0x3e, 0x3c, 0x2f, 0x48, 0x54, 0x4d, 0x4c, 0x3e, + 0x0d, 0x0a }; + + int result = 0; + const char *string = NULL; + SCCudaPBThreadCtx *tctx = NULL; + + Packet p; + DecodeThreadVars dtv; + ThreadVars tv; + ThreadVars tv_cuda_PB; + DetectEngineCtx *de_ctx = NULL; + + SCCudaPBPacketsBuffer *pb = NULL; + SCDQDataQueue *dq = NULL; + + + memset(&p, 0, sizeof(Packet)); + memset(&dtv, 0, sizeof(DecodeThreadVars)); + memset(&tv, 0, sizeof(ThreadVars)); + memset(&tv_cuda_PB, 0, sizeof(ThreadVars)); + + FlowInitConfig(FLOW_QUIET); + DecodeEthernet(&tv, &dtv, &p, raw_eth, sizeof(raw_eth), NULL); + + de_ctx = DetectEngineCtxInit(); + if (de_ctx == NULL) { + goto end; + } + + de_ctx->mpm_matcher = MPM_B2G_CUDA; + de_ctx->flags |= DE_QUIET; + + de_ctx->sig_list = SigInit(de_ctx, "alert tcp any 5555 -> any any (msg:\"Bamboo\"; " + "content:test; sid:1;)"); + if (de_ctx->sig_list == NULL) { + printf("signature parsing failed\n"); + goto end; + } + SigGroupBuild(de_ctx); + + SCCudaPBSetUpQueuesAndBuffers(); + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 0); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 10); + SCCudaPBThreadInit(&tv_cuda_PB, de_ctx, (void *)&tctx); + + result = 1; + + string = "test_one"; + p.payload = (uint8_t *)string; + p.payload_len = strlen(string); + SCCudaPBBatchPackets(NULL, &p, tctx, NULL); + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 0); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 9); + + pb = tctx->curr_pb; + result &= (pb->nop_in_buffer == 0); + + end: + SCCudaPBCleanUpQueuesAndBuffers(); + if (de_ctx) { + SigGroupCleanup(de_ctx); + SigCleanSignatures(de_ctx); + DetectEngineCtxFree(de_ctx); + } + + SCCudaPBThreadDeInit(NULL, tctx); + return result; +} + +#endif /* UNITTESTS */ + +void SCCudaPBRegisterTests(void) +{ + +#ifdef UNITTESTS + UtRegisterTest("SCCudaPBTest01", SCCudaPBTest01, 1); + UtRegisterTest("SCCudaPBTest02", SCCudaPBTest02, 1); +#endif + + return; +} + +#endif /* __SC_CUDA_SUPPORT__ */ diff --git a/src/cuda-packet-batcher.h b/src/cuda-packet-batcher.h new file mode 100644 index 0000000000..60586fb6a3 --- /dev/null +++ b/src/cuda-packet-batcher.h @@ -0,0 +1,139 @@ +/** + * Copyright (c) 2010 Open Information Security Foundation. + * + * \author Anoop Saldanha + */ + +#ifndef __CUDA_PACKET_BATCHER_H__ +#define __CUDA_PACKET_BATCHER_H__ + +#include "suricata-common.h" + +/* compile in, only if we have a CUDA enabled on this machine */ +#ifdef __SC_CUDA_SUPPORT__ + +#include "util-cuda.h" + +/* The min no of packets that we allot the buffer for. We will make + * this user configurable(yaml) based on the traffic they expect. Either ways + * for a low/medium traffic network with occasional sgh matches, we shouldn't + * be enabling cuda. We will only end up screwing performance */ +#define SC_CUDA_PB_MIN_NO_OF_PACKETS 4000 + +/** + * \brief Implement the template SCDQGenericQData to transfer the cuda + * packet buffer from the cuda batcher thread to the dispatcher + * thread using the queue SCDQDataQueue. + */ +typedef struct SCCudaPBPacketsBuffer_ { + /* these members from the template SCDQGenericQData that have to be + * compulsarily implemented */ + struct SCDQGenericQData_ *next; + struct SCDQGenericQData_ *prev; + /* if we want to consider this pointer as the head of a list, this var + * holds the no of elements in the list */ + //uint16_t len; + /* in case this data instance is the head of a list, we can refer the + * bottomost instance directly using this var */ + //struct SCDQGenericaQData *bot; + + /* our own members from here on*/ + + /* current count of packets held in packets_buffer. nop = no of packets */ + uint32_t nop_in_buffer; + /* the packets buffer. We will assign buffer for SC_CUDA_PB_MIN_NO_OF_PACKETS + * packets. Basically the size of this buffer would be + * SC_CUDA_PB_MIN_NO_OF_PACKETS * sizeof(SCCudaPBPacketDataForGPU), so that + * we can hold mininum SC_CUDA_PB_MIN_NO_OF_PACKETS */ + uint8_t *packets_buffer; + /* length of data buffered so far in packets_buffer, which would be sent + * to the GPU. We will need this to copy the buffered data from the + * packets_buffer here on the host, to the buffer on the GPU */ + uint32_t packets_buffer_len; + /* packet offset within the packets_buffer. Each packet would be stored in + * packets buffer at a particular offset. This buffer would indicate the + * offset of a packet inside the packet buffer. We will allot space to hold + * offsets for SC_CUDA_PB_MIN_NO_OF_PACKETS packets + * \todo change it to holds offsets for more than SC_CUDA_PB_MIN_NO_OF_PACKETS + * when we use the buffer to hold packets based on the remaining size in the + * buffer rather than on a fixed limit like SC_CUDA_PB_MIN_NO_OF_PACKETS */ + uint32_t *packets_offset_buffer; + + /* the total packet payload lengths buffered so far. We will need this to + * transfer the total length of the results buffer that has to be transferred + * back from the gpu */ + uint32_t packets_total_payload_len; + /* the payload offsets for the different payload lengths buffered in. For + * example if we buffer 4 packets of lengths 3, 4, 5, 6, we will store four + * offsets in the buffer {0, 3, 7, 12, 18} */ + uint32_t *packets_payload_offset_buffer; + + /* packet addresses for all the packets buffered in the packets_buffer. We + * will allot space to hold packet addresses for SC_CUDA_PB_MIN_NO_OF_PACKETS. + * We will need this, so that the cuda mpm b2g dispatcher thread can inform + * and store the b2g cuda mpm results for the packet*/ + Packet **packets_address_buffer; +} SCCudaPBPacketsBuffer; + +/** + * \brief Structure for each packet that is being batched to the GPU. + */ +typedef struct SCCudaPBPacketDataForGPU_ { + /* holds B2gCudaCtx->m */ + unsigned int m; + /* holds B2gCudaCtx->cuda_B2g */ + CUdeviceptr table; + /* holds the length of the payload */ + unsigned int payload_len; + /* holds the payload. While we actually store the payload in the buffer, + * we may not end up using the entire 1480 bytes if the payload is smaller */ + uint8_t payload[1480]; +} SCCudaPBPacketDataForGPU; + +/** + * \brief Same as struct SCCudaPBPacketDataForGPU_ except for the payload part. + * We will need this for calculating the size of the non-payload part + * of the packet data to be buffered. + */ +typedef struct SCCudaPBPacketDataForGPUNonPayload_ { + /* holds B2gCudaCtx->m */ + unsigned int m; + /* holds B2gCudaCtx->cuda_B2g */ + CUdeviceptr table; + /* holds the length of the payload */ + unsigned int payload_len; +} SCCudaPBPacketDataForGPUNonPayload; + +/** + * \brief The cuda packet batcher threading context. + */ +typedef struct SCCudaPBThreadCtx_ { + /* we need the detection engine context to retrieve the sgh while we start + * receiving and batching the packets */ + DetectEngineCtx *de_ctx; + + /* packets buffer currently in use inside the cuda batcher thread */ + SCCudaPBPacketsBuffer *curr_pb; +} SCCudaPBThreadCtx; + +SCCudaPBPacketsBuffer *SCCudaPBAllocSCCudaPBPacketsBuffer(void); +void SCCudaPBDeAllocSCCudaPBPacketsBuffer(SCCudaPBPacketsBuffer *); + +void SCCudaPBSetBufferPacketThreshhold(uint32_t); +void SCCudaPBCleanUpQueuesAndBuffers(void); +void SCCudaPBSetUpQueuesAndBuffers(void); +void SCCudaPBKillBatchingPackets(void); + +TmEcode SCCudaPBBatchPackets(ThreadVars *, Packet *, void *, PacketQueue *); +TmEcode SCCudaPBThreadInit(ThreadVars *, void *, void **); +TmEcode SCCudaPBThreadDeInit(ThreadVars *, void *); +void SCCudaPBThreadExitStats(ThreadVars *, void *); +void SCCudaPBRegisterTests(void); + +void TmModuleCudaPacketBatcherRegister(void); + +void *SCCudaPBTmThreadsSlot1(void *); + +#endif /* __SC_CUDA_SUPPORT__ */ + +#endif /* __CUDA_PACKET_BATCHER_H__ */ diff --git a/src/data-queue.c b/src/data-queue.c new file mode 100644 index 0000000000..297139b9b0 --- /dev/null +++ b/src/data-queue.c @@ -0,0 +1,93 @@ +/** + * Copyright (c) 2009, 2010 Open Information Security Foundation. + * + * \author Anoop Saldanha + */ + +#include "suricata-common.h" +#include "data-queue.h" +#include "threads.h" + +/** + * \brief Enqueues data on the queue. + * + * \param q Pointer to the data queue. + * \param data Pointer to the data to be queued. It should be a pointer to a + * structure instance that implements the template structure + * struct SCDQGenericQData_ defined in data-queue.h. + */ +void SCDQDataEnqueue(SCDQDataQueue *q, SCDQGenericQData *data) +{ + /* we already have some data in queue */ + if (q->top != NULL) { + data->next = q->top; + q->top->prev = data; + q->top = data; + + /* the queue is empty */ + } else { + q->top = data; + q->bot = data; + } + + q->len++; + +#ifdef DBG_PERF + if (q->len > q->dbg_maxlen) + q->dbg_maxlen = q->len; +#endif /* DBG_PERF */ + + return; +} + +/** + * \brief Dequeues and returns an entry from the queue. + * + * \param q Pointer to the data queue. + * \param retval Pointer to the data that has been enqueued. The instance + * returned is/should be a pointer to a structure instance that + * implements the template structure struct SCDQGenericQData_ + * defined in data-queue.h. + */ +SCDQGenericQData *SCDQDataDequeue(SCDQDataQueue *q) +{ + SCDQGenericQData *data = NULL; + + /* if the queue is empty there are is no data left and we return NULL */ + if (q->len == 0) { + return NULL; + } + + /* If we are going to get the last packet, set len to 0 + * before doing anything else (to make the threads to follow + * the SCondWait as soon as possible) */ + q->len--; + + /* pull the bottom packet from the queue */ + data = q->bot; + +#ifdef OS_DARWIN + /* Weird issue in OS_DARWIN + * Sometimes it looks that two thread arrive here at the same time + * so the bot ptr is NULL */ + if (data == NULL) { + printf("No data to dequeue!\n"); + return NULL; + } +#endif /* OS_DARWIN */ + + /* more data in queue */ + if (q->bot->prev != NULL) { + q->bot = q->bot->prev; + q->bot->next = NULL; + /* just the one we remove, so now empty */ + } else { + q->top = NULL; + q->bot = NULL; + } + + data->next = NULL; + data->prev = NULL; + + return data; +} diff --git a/src/data-queue.h b/src/data-queue.h new file mode 100644 index 0000000000..f3c11fcf74 --- /dev/null +++ b/src/data-queue.h @@ -0,0 +1,64 @@ +/** + * Copyright (c) 2009, 2010 Open Information Security Foundation. + * + * \author Anoop Saldanha + * + * \file Generic queues. Any instance that wants to get itself on the generic + * queue, would have to implement the template struct SCDQGenericQData_ + * defined below. + */ + +#ifndef __DATA_QUEUE_H__ +#define __DATA_QUEUE_H__ + +#include "threads.h" + +/** + * \brief Generic template for any data structure that wants to be on the + * queue. Any other data structure that wants to be on the queue + * needs to use this template and define its own members from + * onwards. + */ +typedef struct SCDQGenericQData_ { + /* this is needed when we want to supply a list of data items */ + struct SCDQGenericQData_ *next; + struct SCDQGenericQData_ *prev; + /* if we want to consider this pointer as the head of a list, this var + * holds the no of elements in the list. Else it holds a . */ + //uint16_t len; + /* in case this data instance is the head of a list, we can refer the + * bottomost instance directly using this var */ + //struct SCDQGenericaQData *bot; + + + /* any other data structure that wants to be on the queue can implement + * its own memebers from here on, in its structure definition. Just note + * that the first 2 members should always be next and prev in the same + * order */ + // +} SCDQGenericQData; + +/** + * \brief The data queue to hold instances that implement the template + * SCDQGenericQData. + */ +typedef struct SCDQDataQueue_ { + /* holds the item at the top of the queue */ + SCDQGenericQData *top; + /* holds the item at the bottom of the queue */ + SCDQGenericQData *bot; + /* no of items currently in the queue */ + uint16_t len; + + SCMutex mutex_q; + SCCondT cond_q; + +#ifdef DBG_PERF + uint16_t dbg_maxlen; +#endif /* DBG_PERF */ +} SCDQDataQueue; + +void SCDQDataEnqueue(SCDQDataQueue *, SCDQGenericQData *); +SCDQGenericQData *SCDQDataDequeue(SCDQDataQueue *); + +#endif /* __DATA_QUEUE_H__ */ diff --git a/src/decode.h b/src/decode.h index aa31d49275..40cd8b3005 100644 --- a/src/decode.h +++ b/src/decode.h @@ -247,6 +247,10 @@ struct PacketQueue_; */ typedef struct Packet_ { + /* double linked list ptrs */ + struct Packet_ *next; + struct Packet_ *prev; + /* Addresses, Ports and protocol * these are on top so we can use * the Packet as a hash key */ @@ -342,10 +346,6 @@ typedef struct Packet_ /** packet number in the pcap file, matches wireshark */ uint64_t pcap_cnt; - /* double linked list ptrs */ - struct Packet_ *next; - struct Packet_ *prev; - /* ready to set verdict counter, only set in root */ uint8_t rtv_cnt; /* tunnel packet ref count */ @@ -369,20 +369,22 @@ typedef struct Packet_ /* required for cuda support */ #ifdef __SC_CUDA_SUPPORT__ - PatternMatcherQueue *cuda_pmq; - MpmCtx *cuda_mpm_ctx; - MpmThreadCtx *cuda_mtc; - - /* used to hold the match results. We can instead use a void *result - * instead here. That way we can make them hold any result. *todo* */ - uint16_t cuda_matches; - /* indicates if the dispatcher should call the search or the scan phase - * of the pattern matcher. We can instead use a void *cuda_data instead. - * This way we can send any data across to the dispatcher */ - uint8_t cuda_search; - /* the dispatcher thread would pump the packet into this queue once it has - * processed the packet */ - struct PacketQueue_ *cuda_outq; + /* indicates if the cuda mpm would be conducted or a normal cpu mpm would + * be conduced on this packet. If it is set to 0, the cpu mpm; else cuda mpm */ + uint8_t cuda_mpm_enabled; + /* indicates if the cuda mpm has finished running the mpm and processed the + * results for this packet, assuming if cuda_mpm_enabled has been set for this + * packet */ + uint16_t cuda_done; + /* used by the detect thread and the cuda mpm dispatcher thread. The detect + * thread would wait on this cond var, if the cuda mpm dispatcher thread + * still hasn't processed the packet. The dispatcher would use this cond + * to inform the detect thread(in case it is waiting on this packet), once + * the dispatcher is done processing the packet results */ + SCMutex cuda_mutex; + SCCondT cuda_cond; + /* the extra 1 in the 1481, is to hold the no_of_matches from the mpm run */ + uint16_t mpm_offsets[1481]; #endif } Packet; @@ -460,23 +462,29 @@ typedef struct DecodeThreadVars_ /** * \brief Initialize a packet structure for use. */ -#define PACKET_INITIALIZE(p) do { \ - memset((p), 0x00, sizeof(Packet)); \ - SCMutexInit(&(p)->mutex_rtv_cnt, NULL); \ - PACKET_RESET_CHECKSUMS((p)); \ - } while (0) +#ifndef __SC_CUDA_SUPPORT__ +#define PACKET_INITIALIZE(p) { \ + memset((p), 0x00, sizeof(Packet)); \ + SCMutexInit(&(p)->mutex_rtv_cnt, NULL); \ + PACKET_RESET_CHECKSUMS((p)); \ +} +#else +#define PACKET_INITIALIZE(p) { \ + memset((p), 0x00, sizeof(Packet)); \ + SCMutexInit(&(p)->mutex_rtv_cnt, NULL); \ + PACKET_RESET_CHECKSUMS((p)); \ + SCMutexInit(&(p)->cuda_mutex, NULL); \ + SCCondInit(&(p)->cuda_cond, NULL); \ +} +#endif + /** * \brief Recycle a packet structure for reuse. * \todo the mutex destroy & init is necessary because of the memset, reconsider */ -#define PACKET_RECYCLE(p) do { \ - CLEAR_ADDR(&(p)->src); \ - CLEAR_ADDR(&(p)->dst); \ - (p)->sp = 0; \ - (p)->dp = 0; \ - (p)->proto = 0; \ - (p)->recursion_level = 0; \ +#define PACKET_DO_RECYCLE(p) do { \ + (p)->recursion_level = 0; \ (p)->flags = 0; \ (p)->flowflags = 0; \ (p)->flow = NULL; \ @@ -530,15 +538,39 @@ typedef struct DecodeThreadVars_ PACKET_RESET_CHECKSUMS((p)); \ } while (0) +#ifndef __SC_CUDA_SUPPORT__ +#define PACKET_RECYCLE(p) PACKET_DO_RECYCLE((p)) +#else +#define PACKET_RECYCLE(p) do { \ + PACKET_DO_RECYCLE((p)); \ + SCMutexDestroy(&(p)->cuda_mutex); \ + SCCondDestroy(&(p)->cuda_cond); \ + SCMutexInit(&(p)->cuda_mutex, NULL); \ + SCCondInit(&(p)->cuda_cond, NULL); \ + PACKET_RESET_CHECKSUMS((p)); \ +} while(0) +#endif + /** * \brief Cleanup a packet so that we can free it. No memset needed.. */ +#ifndef __SC_CUDA_SUPPORT__ #define PACKET_CLEANUP(p) do { \ if ((p)->pktvar != NULL) { \ PktVarFree((p)->pktvar); \ } \ SCMutexDestroy(&(p)->mutex_rtv_cnt); \ } while (0) +#else +#define PACKET_CLEANUP(p) do { \ + if ((p)->pktvar != NULL) { \ + PktVarFree((p)->pktvar); \ + } \ + SCMutexDestroy(&(p)->mutex_rtv_cnt); \ + SCMutexDestroy(&(p)->cuda_mutex); \ + SCCondDestroy(&(p)->cuda_cond); \ +} while(0) +#endif /* macro's for setting the action diff --git a/src/detect-engine-mpm.c b/src/detect-engine-mpm.c index 66dad79c92..fb123037ae 100644 --- a/src/detect-engine-mpm.c +++ b/src/detect-engine-mpm.c @@ -106,31 +106,42 @@ uint16_t PatternMatchDefaultMatcher(void) { * \retval ret number of matches */ uint32_t PacketPatternSearch(ThreadVars *tv, DetectEngineThreadCtx *det_ctx, - Packet *p) + Packet *p) { SCEnter(); uint32_t ret; + #ifndef __SC_CUDA_SUPPORT__ ret = mpm_table[det_ctx->sgh->mpm_ctx->mpm_type].Search(det_ctx->sgh->mpm_ctx, - &det_ctx->mtc, - &det_ctx->pmq, - p->payload, - p->payload_len); + &det_ctx->mtc, + &det_ctx->pmq, + p->payload, + p->payload_len); #else /* if the user has enabled cuda support, but is not using the cuda mpm * algo, then we shouldn't take the path of the dispatcher. Call the mpm * directly */ if (det_ctx->sgh->mpm_ctx->mpm_type != MPM_B2G_CUDA) { ret = mpm_table[det_ctx->sgh->mpm_ctx->mpm_type].Search(det_ctx->sgh->mpm_ctx, - &det_ctx->mtc, - &det_ctx->pmq, - p->payload, - p->payload_len); + &det_ctx->mtc, + &det_ctx->pmq, + p->payload, + p->payload_len); SCReturnInt(ret); } - SCCudaHlProcessPacketWithDispatcher(p, det_ctx, &ret); + if (p->cuda_mpm_enabled) { + ret = B2gCudaResultsPostProcessing(p, det_ctx->sgh->mpm_ctx, + &det_ctx->mtc, &det_ctx->pmq); + } else { + ret = mpm_table[det_ctx->sgh->mpm_ctx->mpm_type].Search(det_ctx->sgh->mpm_ctx, + &det_ctx->mtc, + &det_ctx->pmq, + p->payload, + p->payload_len); + } + #endif SCReturnInt(ret); @@ -154,23 +165,8 @@ uint32_t UriPatternSearch(DetectEngineThreadCtx *det_ctx, //PrintRawDataFp(stdout, uri, uri_len); uint32_t ret; -#ifndef __SC_CUDA_SUPPORT__ - ret = mpm_table[det_ctx->sgh->mpm_uri_ctx->mpm_type].Search - (det_ctx->sgh->mpm_uri_ctx, &det_ctx->mtcu, &det_ctx->pmq, - uri, uri_len); -#else - /* if the user has enabled cuda support, but is not using the cuda mpm - * algo, then we shouldn't take the path of the dispatcher. Call the mpm - * directly */ - if (det_ctx->sgh->mpm_uri_ctx->mpm_type != MPM_B2G_CUDA) { - ret = mpm_table[det_ctx->sgh->mpm_uri_ctx->mpm_type].Search - (det_ctx->sgh->mpm_uri_ctx, &det_ctx->mtcu, &det_ctx->pmq, - uri, uri_len); - SCReturnUInt(ret); - } - - SCCudaHlProcessUriWithDispatcher(uri, uri_len, det_ctx, &ret); -#endif + ret = mpm_table[det_ctx->sgh->mpm_uri_ctx->mpm_type].Search(det_ctx->sgh->mpm_uri_ctx, + &det_ctx->mtcu, &det_ctx->pmq, uri, uri_len); SCReturnUInt(ret); } diff --git a/src/detect-engine.c b/src/detect-engine.c index 6eb9ca6eae..5d0ce14e99 100644 --- a/src/detect-engine.c +++ b/src/detect-engine.c @@ -378,47 +378,6 @@ TmEcode DetectEngineThreadCtxInit(ThreadVars *tv, void *initdata, void **data) { *data = (void *)det_ctx; -#ifdef __SC_CUDA_SUPPORT__ - if (PatternMatchDefaultMatcher() != MPM_B2G_CUDA) - return TM_ECODE_OK; - - Tmq *tmq; - /* we would prepend this name to the the tv name, to obtain the final unique - * detection thread queue name */ - char *cuda_outq_name = "cuda_mpm_rc_disp_outq"; - uint8_t disp_outq_name_len = (strlen(tv->name) + strlen(cuda_outq_name) + 1); - - char *disp_outq_name = SCMalloc(disp_outq_name_len * sizeof(char)); - if (disp_outq_name == NULL) - goto error; - strcpy(disp_outq_name, tv->name); - strcpy(disp_outq_name + strlen(tv->name), cuda_outq_name); - disp_outq_name[disp_outq_name_len] = '\0'; - - tmq = TmqGetQueueByName(disp_outq_name); - if (tmq != NULL) { - SCLogError(SC_ERR_TMQ_ALREADY_REGISTERED, "A queue by the name \"%s\" " - "is already registered, which shouldn't be the case. Queue " - "name is duplicated. Please check if multiple instances of " - "detection module are given different names ", - disp_outq_name); - goto error; - } - tmq = TmqCreateQueue(disp_outq_name); - if (tmq == NULL) - goto error; - - /* hold the queue instane we create under this detection thread instance */ - det_ctx->cuda_mpm_rc_disp_outq = tmq; - det_ctx->cuda_mpm_rc_disp_outq->reader_cnt++; - det_ctx->cuda_mpm_rc_disp_outq->writer_cnt++; - - return TM_ECODE_OK; - - error: - return TM_ECODE_FAILED; -#endif - return TM_ECODE_OK; } diff --git a/src/detect.c b/src/detect.c index fdd862cf4d..8c5d0db7e0 100644 --- a/src/detect.c +++ b/src/detect.c @@ -3140,6 +3140,7 @@ int SigGroupBuild (DetectEngineCtx *de_ctx) { SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the " "module SC_RULES_CONTENT_B2G_CUDA"); } + SCCudaCtxPushCurrent(dummy_context); if (SCCudaMemGetInfo(&cuda_free_before_alloc, &cuda_total) == 0) { SCLogInfo("Total Memory available in the CUDA context used for mpm " "with b2g: %.2f MB", cuda_total/(1024.0 * 1024.0)); diff --git a/src/runmodes.c b/src/runmodes.c index ed25c943d7..4effce00db 100644 --- a/src/runmodes.c +++ b/src/runmodes.c @@ -24,6 +24,7 @@ #include "suricata-common.h" #include "detect-engine.h" +#include "detect-engine-mpm.h" #include "tm-threads.h" #include "util-debug.h" #include "util-time.h" @@ -43,6 +44,8 @@ #include "output.h" +#include "cuda-packet-batcher.h" + /** * A list of output modules that will be active for the run mode. */ @@ -2305,6 +2308,115 @@ int RunModeFilePcapAuto(DetectEngineCtx *de_ctx, char *file) { printf("ERROR: TmThreadSpawn failed\n"); exit(EXIT_FAILURE); } + +#if defined(__SC_CUDA_SUPPORT__) + if (PatternMatchDefaultMatcher() == MPM_B2G_CUDA) { + ThreadVars *tv_decode1 = TmThreadCreatePacketHandler("Decode", + "pickup-queue", "simple", + "decode-queue1", "simple", + "1slot"); + if (tv_decode1 == NULL) { + printf("ERROR: TmThreadsCreate failed for Decode1\n"); + exit(EXIT_FAILURE); + } + tm_module = TmModuleGetByName("DecodePcapFile"); + if (tm_module == NULL) { + printf("ERROR: TmModuleGetByName DecodePcap failed\n"); + exit(EXIT_FAILURE); + } + Tm1SlotSetFunc(tv_decode1, tm_module, NULL); + + TmThreadSetCPUAffinity(tv_decode1, 0); + if (ncpus > 1) + TmThreadSetThreadPriority(tv_decode1, PRIO_MEDIUM); + + if (TmThreadSpawn(tv_decode1) != TM_ECODE_OK) { + printf("ERROR: TmThreadSpawn failed\n"); + exit(EXIT_FAILURE); + } + + ThreadVars *tv_cuda_PB = TmThreadCreate("CUDA_PB", + "decode-queue1", "simple", + "cuda-pb-queue1", "simple", + "custom", SCCudaPBTmThreadsSlot1, 0); + if (tv_cuda_PB == NULL) { + printf("ERROR: TmThreadsCreate failed for CUDA_PB\n"); + exit(EXIT_FAILURE); + } + tv_cuda_PB->type = TVT_PPT; + + tm_module = TmModuleGetByName("CudaPacketBatcher"); + if (tm_module == NULL) { + printf("ERROR: TmModuleGetByName CudaPacketBatcher failed\n"); + exit(EXIT_FAILURE); + } + Tm1SlotSetFunc(tv_cuda_PB, tm_module, (void *)de_ctx); + + TmThreadSetCPUAffinity(tv_cuda_PB, 0); + if (ncpus > 1) + TmThreadSetThreadPriority(tv_cuda_PB, PRIO_MEDIUM); + + if (TmThreadSpawn(tv_cuda_PB) != TM_ECODE_OK) { + printf("ERROR: TmThreadSpawn failed\n"); + exit(EXIT_FAILURE); + } + + ThreadVars *tv_stream1 = TmThreadCreatePacketHandler("Stream1", + "cuda-pb-queue1", "simple", + "stream-queue1", "simple", + "1slot"); + if (tv_stream1 == NULL) { + printf("ERROR: TmThreadsCreate failed for Stream1\n"); + exit(EXIT_FAILURE); + } + tm_module = TmModuleGetByName("StreamTcp"); + if (tm_module == NULL) { + printf("ERROR: TmModuleGetByName StreamTcp failed\n"); + exit(EXIT_FAILURE); + } + Tm1SlotSetFunc(tv_stream1,tm_module,NULL); + + TmThreadSetCPUAffinity(tv_stream1, 0); + if (ncpus > 1) + TmThreadSetThreadPriority(tv_stream1, PRIO_MEDIUM); + + if (TmThreadSpawn(tv_stream1) != TM_ECODE_OK) { + printf("ERROR: TmThreadSpawn failed\n"); + exit(EXIT_FAILURE); + } + } else { + ThreadVars *tv_decode1 = TmThreadCreatePacketHandler("Decode & Stream", + "pickup-queue", "simple", + "stream-queue1", "simple", + "varslot"); + if (tv_decode1 == NULL) { + printf("ERROR: TmThreadsCreate failed for Decode1\n"); + exit(EXIT_FAILURE); + } + tm_module = TmModuleGetByName("DecodePcapFile"); + if (tm_module == NULL) { + printf("ERROR: TmModuleGetByName DecodePcap failed\n"); + exit(EXIT_FAILURE); + } + TmVarSlotSetFuncAppend(tv_decode1,tm_module,NULL); + + tm_module = TmModuleGetByName("StreamTcp"); + if (tm_module == NULL) { + printf("ERROR: TmModuleGetByName StreamTcp failed\n"); + exit(EXIT_FAILURE); + } + TmVarSlotSetFuncAppend(tv_decode1,tm_module,NULL); + + TmThreadSetCPUAffinity(tv_decode1, 0); + if (ncpus > 1) + TmThreadSetThreadPriority(tv_decode1, PRIO_MEDIUM); + + if (TmThreadSpawn(tv_decode1) != TM_ECODE_OK) { + printf("ERROR: TmThreadSpawn failed\n"); + exit(EXIT_FAILURE); + } + } +#else //#if 0 //ThreadVars *tv_decode1 = TmThreadCreatePacketHandler("Decode & Stream","pickup-queue","simple","packetpool","packetpool","varslot"); ThreadVars *tv_decode1 = TmThreadCreatePacketHandler("Decode & Stream","pickup-queue","simple","stream-queue1","simple","varslot"); @@ -2336,7 +2448,7 @@ int RunModeFilePcapAuto(DetectEngineCtx *de_ctx, char *file) { printf("ERROR: TmThreadSpawn failed\n"); exit(EXIT_FAILURE); } - +#endif //#if 0 /* start with cpu 1 so that if we're creating an odd number of detect * threads we're not creating the most on CPU0. */ diff --git a/src/suricata.c b/src/suricata.c index 1e1f7c1919..2b8030c4e8 100644 --- a/src/suricata.c +++ b/src/suricata.c @@ -133,6 +133,7 @@ /* holds the cuda b2g module */ #include "util-mpm-b2g-cuda.h" #include "util-cuda-handlers.h" +#include "cuda-packet-batcher.h" #include "output.h" #include "util-privs.h" @@ -218,6 +219,7 @@ SignalHandlerSetup(int sig, void (*handler)()) void GlobalInits() { memset(trans_q, 0, sizeof(trans_q)); + memset(data_queues, 0, sizeof(data_queues)); /* Initialize the trans_q mutex */ int blah; @@ -225,6 +227,9 @@ void GlobalInits() for(blah=0;blah<256;blah++) { r |= SCMutexInit(&trans_q[blah].mutex_q, NULL); r |= SCCondInit(&trans_q[blah].cond_q, NULL); + + r |= SCMutexInit(&data_queues[blah].mutex_q, NULL); + r |= SCCondInit(&data_queues[blah].cond_q, NULL); } if (r != 0) { @@ -793,6 +798,7 @@ int main(int argc, char **argv) TmModuleLogHttpLogIPv6Register(); #ifdef __SC_CUDA_SUPPORT__ TmModuleCudaMpmB2gRegister(); + TmModuleCudaPacketBatcherRegister(); #endif TmModuleReceiveErfFileRegister(); TmModuleDecodeErfFileRegister(); @@ -996,10 +1002,15 @@ int main(int argc, char **argv) exit(EXIT_FAILURE); } + #ifdef PROFILING SCProfilingInitRuleCounters(de_ctx); #endif /* PROFILING */ +#ifdef __SC_CUDA_SUPPORT__ + SCCudaPBSetUpQueuesAndBuffers(); +#endif /* __SC_CUDA_SUPPORT__ */ + AppLayerHtpRegisterExtraCallbacks(); SCThresholdConfInitContext(de_ctx,NULL); @@ -1126,6 +1137,10 @@ int main(int argc, char **argv) SCLogInfo("time elapsed %" PRIuMAX "s", (uintmax_t)(end_time.tv_sec - start_time.tv_sec)); +#ifdef __SC_CUDA_SUPPORT__ + SCCudaPBKillBatchingPackets(); +#endif + TmThreadKillThreads(); SCPerfReleaseResources(); break; diff --git a/src/suricata.h b/src/suricata.h index a4ba160bd9..f4f32fbf22 100644 --- a/src/suricata.h +++ b/src/suricata.h @@ -26,6 +26,7 @@ #include "suricata-common.h" #include "packet-queue.h" +#include "data-queue.h" /* the name of our binary */ #define PROG_NAME "Suricata" @@ -54,6 +55,8 @@ enum { * XXX move to the TmQueue structure later */ PacketQueue trans_q[256]; + +SCDQDataQueue data_queues[256]; /* memset to zeros, and mutex init! */ void GlobalInits(); diff --git a/src/tm-modules.h b/src/tm-modules.h index cdb394369a..817497a871 100644 --- a/src/tm-modules.h +++ b/src/tm-modules.h @@ -78,6 +78,7 @@ enum { TMM_RECEIVEIPFW, #ifdef __SC_CUDA_SUPPORT__ TMM_CUDA_MPM_B2G, + TMM_CUDA_PACKET_BATCHER, #endif TMM_RECEIVEERFFILE, TMM_DECODEERFFILE, diff --git a/src/tm-queues.c b/src/tm-queues.c index 7ca0fc6974..80c392447c 100644 --- a/src/tm-queues.c +++ b/src/tm-queues.c @@ -52,6 +52,8 @@ Tmq* TmqCreateQueue(char *name) { Tmq *q = &tmqs[tmq_id]; q->name = name; q->id = tmq_id++; + /* for cuda purposes */ + q->q_type = 0; SCLogDebug("created queue \'%s\', %p", name, q); return q; diff --git a/src/tm-queues.h b/src/tm-queues.h index 022f221e59..01dbb6e704 100644 --- a/src/tm-queues.h +++ b/src/tm-queues.h @@ -29,6 +29,8 @@ typedef struct Tmq_ { uint16_t id; uint16_t reader_cnt; uint16_t writer_cnt; + /* 0 for packet-queue and 1 for data-queue */ + uint8_t q_type; } Tmq; Tmq* TmqCreateQueue(char *name); diff --git a/src/tm-threads.c b/src/tm-threads.c index c55adc0748..5f091863b0 100644 --- a/src/tm-threads.c +++ b/src/tm-threads.c @@ -68,45 +68,6 @@ SCMutex tv_root_lock = PTHREAD_MUTEX_INITIALIZER; thread encounters a failure. Defaults to restart the failed thread */ uint8_t tv_aof = THV_RESTART_THREAD; -typedef struct TmSlot_ { - /* function pointers */ - TmEcode (*SlotFunc)(ThreadVars *, Packet *, void *, PacketQueue *, PacketQueue *); - - TmEcode (*SlotThreadInit)(ThreadVars *, void *, void **); - void (*SlotThreadExitPrintStats)(ThreadVars *, void *); - TmEcode (*SlotThreadDeinit)(ThreadVars *, void *); - - /* data storage */ - void *slot_initdata; - void *slot_data; - - /**< queue filled by the SlotFunc with packets that will - * be processed futher _before_ the current packet. - * The locks in the queue are NOT used */ - PacketQueue slot_pre_pq; - - /**< queue filled by the SlotFunc with packets that will - * be processed futher _after_ the current packet. The - * locks in the queue are NOT used */ - PacketQueue slot_post_pq; - - /* linked list, only used by TmVarSlot */ - struct TmSlot_ *slot_next; - - int id; /**< slot id, only used my TmVarSlot to know what the first - * slot is. */ -} TmSlot; - -/* 1 function slot */ -typedef struct Tm1Slot_ { - TmSlot s; -} Tm1Slot; - -/* Variable number of function slots */ -typedef struct TmVarSlot_ { - TmSlot *s; -} TmVarSlot; - /** * \brief Check if a thread flag is set * @@ -649,11 +610,12 @@ TmEcode TmThreadSetSlots(ThreadVars *tv, char *name, void *(*fn_p)(void *)) { size = sizeof(TmVarSlot); tv->tm_func = TmThreadsSlotVar; } else if (strcmp(name, "custom") == 0) { + /* \todo this needs to be changed to support slots of any size */ + size = sizeof(Tm1Slot); if (fn_p == NULL) goto error; tv->tm_func = fn_p; - return TM_ECODE_OK; } else { printf("Error: Slot \"%s\" not supported\n", name); goto error; @@ -1134,8 +1096,12 @@ void TmThreadKillThreads(void) { if (tv->InShutdownHandler != NULL) { tv->InShutdownHandler(tv); } - for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++) - SCCondSignal(&trans_q[tv->inq->id].cond_q); + for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++) { + if (tv->inq->q_type == 0) + SCCondSignal(&trans_q[tv->inq->id].cond_q); + else + SCCondSignal(&data_queues[tv->inq->id].cond_q); + } /* to be sure, signal more */ int cnt = 0; @@ -1151,9 +1117,12 @@ void TmThreadKillThreads(void) { tv->InShutdownHandler(tv); } - for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++) - SCCondSignal(&trans_q[tv->inq->id].cond_q); - + for (i = 0; i < (tv->inq->reader_cnt + tv->inq->writer_cnt); i++) { + if (tv->inq->q_type == 0) + SCCondSignal(&trans_q[tv->inq->id].cond_q); + else + SCCondSignal(&data_queues[tv->inq->id].cond_q); + } usleep(100); } diff --git a/src/tm-threads.h b/src/tm-threads.h index 0c30c2b771..20f0b5b879 100644 --- a/src/tm-threads.h +++ b/src/tm-threads.h @@ -34,6 +34,34 @@ enum { TVT_MAX, }; + +typedef struct TmSlot_ { + /* function pointers */ + TmEcode (*SlotFunc)(ThreadVars *, Packet *, void *, PacketQueue *); + + TmEcode (*SlotThreadInit)(ThreadVars *, void *, void **); + void (*SlotThreadExitPrintStats)(ThreadVars *, void *); + TmEcode (*SlotThreadDeinit)(ThreadVars *, void *); + + /* data storage */ + void *slot_initdata; + void *slot_data; + PacketQueue slot_pq; + + /* linked list, only used by TmVarSlot */ + struct TmSlot_ *slot_next; +} TmSlot; + +/* 1 function slot */ +typedef struct Tm1Slot_ { + TmSlot s; +} Tm1Slot; + +/* Variable number of function slots */ +typedef struct TmVarSlot_ { + TmSlot *s; +} TmVarSlot; + extern ThreadVars *tv_root[TVT_MAX]; extern SCMutex tv_root_lock; diff --git a/src/tmqh-simple.c b/src/tmqh-simple.c index 461928149e..fee8069daf 100644 --- a/src/tmqh-simple.c +++ b/src/tmqh-simple.c @@ -90,19 +90,26 @@ void TmqhOutputSimple(ThreadVars *t, Packet *p) SCMutexUnlock(&q->mutex_q); } +/*******************************Generic-Q-Handlers*****************************/ + /** * \brief Public version of TmqhInputSimple from the tmqh-simple queue * handler, except that it is a generic version that is directly - * tied to a PacketQueue instance. + * tied to a "SCDQDataQueue" instance(sent as an arg). + * + * Retrieves a data_instance from the queue. If the queue is empty, it + * waits on the queue, till a data_instance is enqueued into the queue + * by some other module. * - * Retrieves a packet from the queue. If the queue is empty, it waits - * on the queue, till a packet is enqueued into the queue. + * All references to "data_instance" means a reference to a data structure + * instance that implements the template "struct SCDQGenericQData_". * - * \param q The PacketQueue instance to wait on. + * \param q The SCDQDataQueue instance to wait on. * * \retval p The returned packet from the queue. + * \retval data The returned data_instance from the queue. */ -Packet *TmqhInputSimpleOnQ(PacketQueue *q) +SCDQGenericQData *TmqhInputSimpleOnQ(SCDQDataQueue *q) { SCMutexLock(&q->mutex_q); if (q->len == 0) { @@ -111,11 +118,12 @@ Packet *TmqhInputSimpleOnQ(PacketQueue *q) } if (q->len > 0) { - Packet *p = PacketDequeue(q); + SCDQGenericQData *data = SCDQDataDequeue(q); SCMutexUnlock(&q->mutex_q); - return p; + return data; } else { - /* return NULL if we have no pkt. Should only happen on signals. */ + /* return NULL if we have no data in the queue. Should only happen + * on signals. */ SCMutexUnlock(&q->mutex_q); return NULL; } @@ -124,17 +132,23 @@ Packet *TmqhInputSimpleOnQ(PacketQueue *q) /** * \brief Public version of TmqhOutputSimple from the tmqh-simple queue * handler, except that it is a generic version that is directly - * tied to a PacketQueue instance. + * tied to a SCDQDataQueue instance(sent as an arg). + * + * Pumps out a data_instance into the queue. If the queue is empty, it + * waits on the queue, till a data_instance is enqueued into the queue. * - * Enqueues a packet into the packet queue. + * All references to "data_instance" means a reference to a data structure + * instance that implements the template "struct SCDQGenericQData_". * - * \param q The PacketQueue instance to enqueue the packet into. - * \param p The packet to be enqueued into the above queue. + * \param q The SCDQDataQueue instance to pump the data into. + * \param data The data instance to be enqueued. */ -void TmqhOutputSimpleOnQ(PacketQueue *q, Packet *p) +void TmqhOutputSimpleOnQ(SCDQDataQueue *q, SCDQGenericQData *data) { SCMutexLock(&q->mutex_q); - PacketEnqueue(q, p); + SCDQDataEnqueue(q, data); SCCondSignal(&q->cond_q); SCMutexUnlock(&q->mutex_q); + + return; } diff --git a/src/tmqh-simple.h b/src/tmqh-simple.h index 54f69e69c2..1d4417b4e8 100644 --- a/src/tmqh-simple.h +++ b/src/tmqh-simple.h @@ -24,8 +24,11 @@ #ifndef __TMQH_SIMPLE_H__ #define __TMQH_SIMPLE_H__ -Packet *TmqhInputSimpleOnQ(PacketQueue *); -void TmqhOutputSimpleOnQ(PacketQueue *, Packet *); +#include "data-queue.h" + +SCDQGenericQData *TmqhInputSimpleOnQ(SCDQDataQueue *); +void TmqhOutputSimpleOnQ(SCDQDataQueue *, SCDQGenericQData *); + void TmqhSimpleRegister (void); #endif /* __TMQH_SIMPLE_H__ */ diff --git a/src/util-cuda-handlers.c b/src/util-cuda-handlers.c index 0789e37d10..e6b51eba87 100644 --- a/src/util-cuda-handlers.c +++ b/src/util-cuda-handlers.c @@ -100,6 +100,72 @@ SCCudaHlModuleData *SCCudaHlGetModuleData(uint8_t handle) return data; } +/** + * \brief Returns a cuda_module against the handle in the argument. + * + * If a cuda_module is not present for a handle, it is created + * and associated with this handle and the cuda_module is returned + * in the argument. If a cuda_module is already present for + * a handle, it is returned. + * + * \param p_context Pointer to a cuda context instance that should be updated + * with a cuda context. + * \param handle A unique handle which identifies a module. Obtained from + * a call to SCCudaHlGetUniqueHandle(). + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaHlGetCudaModuleFromFile(CUmodule *p_module, const char *filename, int handle) +{ + SCCudaHlModuleData *data = NULL; + + if (p_module == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Error invalid arguments" + "p_module NULL"); + return -1; + } + + /* check if the particular module that wants a CUDA module is already + * registered or not. If it is registered, check if a context has + * been associated with the module. If yes, then we can go ahead and + * create a cuda module or return the reference to the cuda module if + * we already have a cuda module associated with the module. If no, " + * log warning and get out of here */ + if ( ((data = SCCudaHlGetModuleData(handle)) == NULL) || + (data->cuda_context == 0)) { + SCLogDebug("Module not registered or no cuda context associated with " + "this module. You can't create a CUDA module without" + "associatin a context with a module first. To use this " + "registration facility, first register a module using " + "context using SCCudaHlRegisterModule(), and then register " + "a cuda context with that module using " + "SCCudaHlGetCudaContext(), after which you can call this " + "function "); + return -1; + } + + /* we already have a cuda module associated with this module. Return the + * cuda module */ + if (data->cuda_module != 0) { + p_module[0] = data->cuda_module; + return 0; + } + + /* we don't have a cuda module associated with this module. Create a + * cuda module, update the module with this cuda module reference and + * then return the module refernce back to the calling function using + * the argument */ + if (SCCudaModuleLoad(p_module, filename) == -1) + goto error; + data->cuda_module = p_module[0]; + + return 0; + + error: + return -1; +} + /** * \internal * \brief Get a unique handle for a new module registration. This new handle @@ -347,10 +413,12 @@ int SCCudaHlGetCudaDevicePtr(CUdeviceptr *device_ptr, const char *name, } } + /* send the newly assigned device pointer back to the caller */ + device_ptr[0] = new_module_device_ptr->d_ptr; + /* insert it into the device_ptr list for the module instance */ if (data->device_ptrs == NULL) { data->device_ptrs = new_module_device_ptr; - device_ptr[0] = new_module_device_ptr->d_ptr; return 0; } @@ -367,6 +435,76 @@ int SCCudaHlGetCudaDevicePtr(CUdeviceptr *device_ptr, const char *name, return -1; } +/** + * \brief Frees a Cuda Device Pointer. + * + * If a device pointer by the name \"name\" is registered for this + * handle, it is freed. + * + * \param name Name of the device pointer by which we have to search + * module for its existance. + * \param handle A unique handle which identifies a module. Obtained from + * a call to SCCudaHlGetUniqueHandle(). + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaHlFreeCudaDevicePtr(const char *name, int handle) +{ + SCCudaHlModuleData *data = NULL; + SCCudaHlModuleDevicePointer *module_device_ptr = NULL; + SCCudaHlModuleDevicePointer *temp_module_device_ptr = NULL; + + if (name == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Error invalid arguments" + "device_ptr is NULL or name is NULL"); + goto error; + } + + /* check if the particular module that wants to free device memory is + * already registered or not. If it is registered, check if a context has + * been associated with the module. If yes, then we can go ahead and + * free the device memory. + */ + if ( ((data = SCCudaHlGetModuleData(handle)) == NULL) || + (data->cuda_context == 0)) { + SCLogDebug("Module not registered or no cuda context associated with " + "this module. You can't create a CUDA module without" + "associatin a context with a module first. To use this " + "registration facility, first register a module using " + "context using SCCudaHlRegisterModule(), and then register " + "a cuda context with that module using " + "SCCudaHlGetCudaContext(), after which you can call this " + "function "); + goto error; + } + + /* if we already have a device pointer registered by this name return the + * cuda device pointer instance */ + if ( (module_device_ptr = SCCudaHlCudaDevicePtrAvailable(data, name)) == NULL) { + goto error; + } + + SCCudaMemFree(module_device_ptr->d_ptr); + module_device_ptr->d_ptr = 0; + if (module_device_ptr == data->device_ptrs) { + data->device_ptrs = data->device_ptrs->next; + } else { + temp_module_device_ptr = data->device_ptrs; + while (strcmp(temp_module_device_ptr->next->name, name) != 0) { + temp_module_device_ptr = temp_module_device_ptr->next; + } + temp_module_device_ptr->next = temp_module_device_ptr->next->next; + } + free(module_device_ptr->name); + free(module_device_ptr); + + return 0; + + error: + return -1; +} + /** * \brief Registers a Dispatcher function against this handle. * @@ -441,8 +579,11 @@ int SCCudaHlGetModuleHandle(const char *name) data = data->next; } - if (data == NULL) + if (data == NULL) { + SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "A cuda module by the name \"%s\" " + "hasn't been registered", name); return -1; + } return data->handle; } @@ -675,72 +816,4 @@ int SCCudaHlTestEnvCudaContextDeInit(void) return 1; } -void SCCudaHlProcessPacketWithDispatcher(Packet *p, DetectEngineThreadCtx *det_ctx, - void *result) -{ - Packet *out_p = NULL; - - p->cuda_mpm_ctx = det_ctx->sgh->mpm_ctx; - p->cuda_mtc = &det_ctx->mtc; - p->cuda_pmq = &det_ctx->pmq; - /* this outq is unique to this detection thread instance. The dispatcher thread - * would use this queue to pump the packets back to this detection thread once - * it has processed the packet */ - p->cuda_outq = &trans_q[det_ctx->cuda_mpm_rc_disp_outq->id]; - /* for now it is hardcoded. \todo Make the access to the right queue or the - * ThreadVars generic */ - - /* Push the packet into the dispatcher's input queue */ - B2gCudaPushPacketTo_tv_CMB2_RC(p); - - /* wait for the dispatcher to process and return the packet we pushed */ - out_p = TmqhInputSimpleOnQ(&trans_q[det_ctx->cuda_mpm_rc_disp_outq->id]); - - /* todo make this generic, so that if we have more than 2 modules using the - * cuda interface, we can call update function for the module that has - * queued the packet and retrieve the results */ - *((uint32_t *)result) = p->cuda_matches; - - return; -} - -void SCCudaHlProcessUriWithDispatcher(uint8_t *uri, uint16_t uri_len, - DetectEngineThreadCtx *det_ctx, - void *result) -{ - Packet *out_p = NULL; - - Packet *p = SCMalloc(sizeof(Packet)); - if (p == NULL) { - SCLogError(SC_ERR_FATAL, "Fatal error encountered in SCCudaHlProcessUriWithDispatcher. Exiting..."); - exit(EXIT_FAILURE); - } - memset(p, 0, sizeof(Packet)); - - p->cuda_mpm_ctx = det_ctx->sgh->mpm_uri_ctx; - p->cuda_mtc = &det_ctx->mtcu; - p->cuda_pmq = &det_ctx->pmq; - p->payload = uri; - p->payload_len = uri_len; - /* this outq is unique to this detection thread instance. The dispatcher thread - * would use this queue to pump the packets back to this detection thread once - * it has processed the packet */ - p->cuda_outq = &trans_q[det_ctx->cuda_mpm_rc_disp_outq->id]; - - /* Push the packet into the dispatcher's input queue */ - B2gCudaPushPacketTo_tv_CMB2_RC(p); - - /* wait for the dispatcher to process and return the packet we pushed */ - out_p = TmqhInputSimpleOnQ(&trans_q[det_ctx->cuda_mpm_rc_disp_outq->id]); - - /* todo make this generic, so that if we have more than 2 modules using the - * cuda interface, we can call update function for the module that has - * queued the packet and retrieve the results */ - *((uint32_t *)result) = p->cuda_matches; - - SCFree(p); - - return; -} - #endif /* __SC_CUDA_SUPPORT */ diff --git a/src/util-cuda-handlers.h b/src/util-cuda-handlers.h index f2faebe4af..cae7d595dd 100644 --- a/src/util-cuda-handlers.h +++ b/src/util-cuda-handlers.h @@ -63,7 +63,9 @@ typedef struct SCCudaHlModuleData_ { int SCCudaHlGetCudaContext(CUcontext *, int); int SCCudaHlGetCudaModule(CUmodule *, const char *, int); +int SCCudaHlGetCudaModuleFromFile(CUmodule *, const char *, int); int SCCudaHlGetCudaDevicePtr(CUdeviceptr *, const char *, size_t, void *, int); +int SCCudaHlFreeCudaDevicePtr(const char *, int); int SCCudaHlRegisterDispatcherFunc(void *(*SCCudaHlDispFunc)(void *), int); SCCudaHlModuleData *SCCudaHlGetModuleData(uint8_t); diff --git a/src/util-mpm-b2g-cuda-kernel.cu b/src/util-mpm-b2g-cuda-kernel.cu index f86e9998ef..d94cda7d00 100644 --- a/src/util-mpm-b2g-cuda-kernel.cu +++ b/src/util-mpm-b2g-cuda-kernel.cu @@ -27,50 +27,59 @@ */ #define B2G_CUDA_Q 2 -#define CUDA_THREADS 16 +#define CUDA_THREADS 4000 #define B2G_CUDA_HASHSHIFT 4 #define B2G_CUDA_TYPE unsigned int #define B2G_CUDA_HASH16(a, b) (((a) << B2G_CUDA_HASHSHIFT) | (b)) #define u8_tolower(c) g_u8_lowercasetable[(c)] +typedef struct SCCudaPBPacketDataForGPU_ { + /* holds the value B2gCtx->m */ + unsigned int m; + /* holds B2gCtx->B2g */ + unsigned int table; + /* holds the length of the payload */ + unsigned int payload_len; + /* holds the payload */ + unsigned char payload; +} SCCudaPBPacketDataForGPU; + extern "C" -__global__ void B2gCudaSearchBNDMq(unsigned int *offsets, - unsigned int *B2G, - unsigned char *g_u8_lowercasetable, - unsigned char *buf, - unsigned short arg_buflen, - unsigned int m) -{ +__global__ void B2gCudaSearchBNDMq(unsigned short *results_buffer, + unsigned char *packets_buffer, + unsigned int *packets_offset_buffer, + unsigned int *packets_payload_offset_buffer, + unsigned int nop, + unsigned char *g_u8_lowercasetable) + { + unsigned int tid = blockIdx.x * 32 + threadIdx.x; + /* if the thread id is greater than the no of packets sent in the packets + * buffer, terminate the thread */ + //if (tid <= nop) + if (tid >= nop) + return; + + SCCudaPBPacketDataForGPU *packet = (SCCudaPBPacketDataForGPU *)(packets_buffer + packets_offset_buffer[tid]); + unsigned int m = packet->m; + unsigned char *buf = &packet->payload; + unsigned int buflen = packet->payload_len; + unsigned int *B2G = (unsigned int *)packet->table; unsigned int pos = m - B2G_CUDA_Q + 1; B2G_CUDA_TYPE d; unsigned short h; - unsigned int j; unsigned int first; - unsigned int tid = threadIdx.x; - unsigned short tid_chunk = arg_buflen / CUDA_THREADS; - unsigned short jump; - unsigned short buflen; - - if (tid_chunk < m) - tid_chunk = m; - - jump = tid_chunk * tid; - if ((jump + tid_chunk) > arg_buflen) - return; + unsigned int j = 0; - buflen = tid_chunk * 2 - 1; - if ((tid == CUDA_THREADS - 1) || ((jump + buflen) > arg_buflen)) { - buflen = arg_buflen - jump; - } - - j = 0; - while (j < buflen) { - offsets[jump + j] = 0; - j++; - } + unsigned short *matches_count = results_buffer + packets_payload_offset_buffer[tid] + tid; + //unsigned short *matches_count = results_buffer + packets_payload_offset_buffer[1] + 1; + //unsigned short *offsets = results_buffer + packets_payload_offset_buffer[1] + 1 + 1; + unsigned short *offsets = matches_count + 1; + // temporarily hold the results here, before we shift it to matches_count + // before returning + unsigned short matches = 0; while (pos <= (buflen - B2G_CUDA_Q + 1)) { - h = B2G_CUDA_HASH16(u8_tolower(buf[jump + pos - 1]), u8_tolower(buf[jump + pos])); + h = B2G_CUDA_HASH16(u8_tolower(buf[pos - 1]), u8_tolower(buf[pos])); d = B2G[h]; if (d != 0) { @@ -83,19 +92,21 @@ __global__ void B2gCudaSearchBNDMq(unsigned int *offsets, if (j > first) { pos = j; } else { - offsets[j + jump] = 1; + offsets[matches++] = j; } } if (j == 0) break; - h = B2G_CUDA_HASH16(u8_tolower(buf[jump + j - 1]), u8_tolower(buf[jump + j])); + h = B2G_CUDA_HASH16(u8_tolower(buf[j - 1]), u8_tolower(buf[j])); d = (d << 1) & B2G[h]; } while (d != 0); } pos = pos + m - B2G_CUDA_Q + 1; } + matches_count[0] = matches; + return; } diff --git a/src/util-mpm-b2g-cuda.c b/src/util-mpm-b2g-cuda.c index 52dcc53a17..24b36651c1 100644 --- a/src/util-mpm-b2g-cuda.c +++ b/src/util-mpm-b2g-cuda.c @@ -45,6 +45,13 @@ #include "threads.h" #include "tmqh-simple.h" +#include "detect-engine-address.h" +#include "detect-engine-port.h" +#include "detect-engine.h" +#include "detect-parse.h" + +#include "cuda-packet-batcher.h" + /* macros decides if cuda is enabled for the platform or not */ #ifdef __SC_CUDA_SUPPORT__ @@ -63,17 +70,6 @@ static void *b2g_func; /* threadvars Cuda(C) Mpm(M) B2G(B) Rules(R) Content(C) */ ThreadVars *tv_CMB2_RC = NULL; -/** - * \todo Would break on x86_64 I believe. We will fix this in a later version. - */ -#define B2G_CUDA_KERNEL_ARG0_OFFSET 0 -#define B2G_CUDA_KERNEL_ARG1_OFFSET 4 -#define B2G_CUDA_KERNEL_ARG2_OFFSET 8 -#define B2G_CUDA_KERNEL_ARG3_OFFSET 12 -#define B2G_CUDA_KERNEL_ARG4_OFFSET 16 -#define B2G_CUDA_KERNEL_ARG5_OFFSET 20 -#define B2G_CUDA_KERNEL_TOTAL_ARG_SIZE 24 - void B2gCudaInitCtx(MpmCtx *, int); void B2gCudaThreadInitCtx(MpmCtx *, MpmThreadCtx *, uint32_t); void B2gCudaDestroyCtx(MpmCtx *); @@ -100,201 +96,171 @@ void B2gCudaPrintInfo(MpmCtx *); void B2gCudaPrintSearchStats(MpmThreadCtx *); void B2gCudaRegisterTests(void); -/* for debugging purposes. keep it for now */ -int arg0 = 0; -int arg1 = 0; -int arg2 = 0; -int arg3 = 0; -int arg4 = 0; -int arg5 = 0; -int arg_total = 0; - #if defined(__x86_64__) || defined(__ia64__) const char *b2g_cuda_ptx_image_64_bit = - " .version 1.4\n" - " .target sm_10, map_f64_to_f32\n" - " .entry B2gCudaSearchBNDMq (\n" - " .param .u64 __cudaparm_B2gCudaSearchBNDMq_offsets,\n" - " .param .u64 __cudaparm_B2gCudaSearchBNDMq_B2G,\n" - " .param .u64 __cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable,\n" - " .param .u64 __cudaparm_B2gCudaSearchBNDMq_buf,\n" - " .param .u16 __cudaparm_B2gCudaSearchBNDMq_arg_buflen,\n" - " .param .u32 __cudaparm_B2gCudaSearchBNDMq_m)\n" - " {\n" - " .reg .u16 %rh<6>;\n" - " .reg .u32 %r<58>;\n" - " .reg .u64 %rd<31>;\n" - " .reg .pred %p<14>;\n" - " .loc 15 25 0\n" + " .version 1.4\n" + " .target sm_10, map_f64_to_f32\n" + " .entry B2gCudaSearchBNDMq (\n" + " .param .u64 __cudaparm_B2gCudaSearchBNDMq_results_buffer,\n" + " .param .u64 __cudaparm_B2gCudaSearchBNDMq_packets_buffer,\n" + " .param .u64 __cudaparm_B2gCudaSearchBNDMq_packets_offset_buffer,\n" + " .param .u64 __cudaparm_B2gCudaSearchBNDMq_packets_payload_offset_buffer,\n" + " .param .u32 __cudaparm_B2gCudaSearchBNDMq_nop,\n" + " .param .u64 __cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable)\n" + " {\n" + " .reg .u16 %rh<7>;\n" + " .reg .u32 %r<38>;\n" + " .reg .u64 %rd<41>;\n" + " .reg .pred %p<10>;\n" + " .loc 3 36 0\n" "$LBB1_B2gCudaSearchBNDMq:\n" - " .loc 15 27 0\n" - " ld.param.u32 %r1, [__cudaparm_B2gCudaSearchBNDMq_m];\n" - " sub.u32 %r2, %r1, 1;\n" - " mov.s32 %r3, %r2;\n" - " .loc 15 33 0\n" - " ld.param.u16 %r4, [__cudaparm_B2gCudaSearchBNDMq_arg_buflen];\n" - " shr.u32 %r5, %r4, 4;\n" - " cvt.u16.u32 %r6, %r5;\n" - " mov.s32 %r7, %r6;\n" - " setp.ge.u32 %p1, %r6, %r1;\n" - " @%p1 bra $Lt_0_8450;\n" - " .loc 15 38 0\n" - " cvt.u16.u32 %r7, %r1;\n" + " mov.u16 %rh1, %ctaid.x;\n" + " mul.wide.u16 %r1, %rh1, 32;\n" + " cvt.u32.u16 %r2, %tid.x;\n" + " add.u32 %r3, %r2, %r1;\n" + " ld.param.u32 %r4, [__cudaparm_B2gCudaSearchBNDMq_nop];\n" + " setp.gt.u32 %p1, %r4, %r3;\n" + " @%p1 bra $Lt_0_5634;\n" + " bra.uni $LBB17_B2gCudaSearchBNDMq;\n" + "$Lt_0_5634:\n" + " .loc 3 45 0\n" + " cvt.u64.u32 %rd1, %r3;\n" + " mul.lo.u64 %rd2, %rd1, 4;\n" + " ld.param.u64 %rd3, [__cudaparm_B2gCudaSearchBNDMq_packets_offset_buffer];\n" + " add.u64 %rd4, %rd3, %rd2;\n" + " ld.global.u32 %r5, [%rd4+0];\n" + " cvt.u64.u32 %rd5, %r5;\n" + " ld.param.u64 %rd6, [__cudaparm_B2gCudaSearchBNDMq_packets_buffer];\n" + " add.u64 %rd7, %rd5, %rd6;\n" + " .loc 3 46 0\n" + " ld.global.u32 %r6, [%rd7+0];\n" + " .loc 3 48 0\n" + " ld.global.u32 %r7, [%rd7+8];\n" + " .loc 3 49 0\n" + " ld.global.u32 %r8, [%rd7+4];\n" + " cvt.u64.u32 %rd8, %r8;\n" + " .loc 3 50 0\n" + " sub.u32 %r9, %r6, 1;\n" + " mov.s32 %r10, %r9;\n" + " .loc 3 56 0\n" + " ld.param.u64 %rd9, [__cudaparm_B2gCudaSearchBNDMq_results_buffer];\n" + " ld.param.u64 %rd10, [__cudaparm_B2gCudaSearchBNDMq_packets_payload_offset_buffer];\n" + " add.u64 %rd11, %rd10, %rd2;\n" + " ld.global.u32 %r11, [%rd11+0];\n" + " cvt.u64.u32 %rd12, %r11;\n" + " add.u64 %rd13, %rd12, %rd1;\n" + " mul.lo.u64 %rd14, %rd13, 2;\n" + " add.u64 %rd15, %rd9, %rd14;\n" + " sub.u32 %r12, %r7, 1;\n" + " setp.gt.u32 %p2, %r9, %r12;\n" + " mov.u32 %r13, 0;\n" + " @%p2 bra $Lt_0_9474;\n" + " add.u64 %rd16, %rd7, 12;\n" + " add.u64 %rd17, %rd15, 2;\n" + " ld.param.u64 %rd18, [__cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable];\n" + "$Lt_0_6658:\n" + " // Loop body line 66\n" + " .loc 3 66 0\n" + " cvt.u64.u32 %rd19, %r10;\n" + " add.u64 %rd20, %rd19, %rd7;\n" + " ld.global.u8 %rh2, [%rd20+12];\n" + " cvt.u64.u8 %rd21, %rh2;\n" + " add.u64 %rd22, %rd21, %rd18;\n" + " ld.global.u8 %r14, [%rd22+0];\n" + " ld.global.u8 %rh3, [%rd20+11];\n" + " cvt.u64.u8 %rd23, %rh3;\n" + " add.u64 %rd24, %rd23, %rd18;\n" + " ld.global.u8 %r15, [%rd24+0];\n" + " shl.b32 %r16, %r15, 4;\n" + " or.b32 %r17, %r14, %r16;\n" + " cvt.u64.u32 %rd25, %r17;\n" + " mul.lo.u64 %rd26, %rd25, 4;\n" + " add.u64 %rd27, %rd8, %rd26;\n" + " ld.global.u32 %r18, [%rd27+0];\n" + " mov.u32 %r19, 0;\n" + " setp.eq.u32 %p3, %r18, %r19;\n" + " @%p3 bra $Lt_0_258;\n" + " // Part of loop body line 66, head labeled $Lt_0_6658\n" + " .loc 3 69 0\n" + " mov.s32 %r20, %r10;\n" + " .loc 3 70 0\n" + " sub.u32 %r21, %r10, %r6;\n" + " add.u32 %r22, %r21, 1;\n" + " sub.s32 %r23, %r6, 1;\n" + "$Lt_0_7682:\n" + " // Loop body line 73\n" + " .loc 3 73 0\n" + " sub.u32 %r20, %r20, 1;\n" + " shr.u32 %r24, %r18, %r23;\n" + " mov.u32 %r25, 0;\n" + " setp.eq.u32 %p4, %r24, %r25;\n" + " @%p4 bra $Lt_0_8450;\n" + " // Part of loop body line 73, head labeled $Lt_0_7682\n" + " setp.le.u32 %p5, %r20, %r22;\n" + " @%p5 bra $Lt_0_8706;\n" + " // Part of loop body line 73, head labeled $Lt_0_7682\n" + " .loc 3 76 0\n" + " mov.s32 %r10, %r20;\n" + " bra.uni $Lt_0_8450;\n" + "$Lt_0_8706:\n" + " // Part of loop body line 73, head labeled $Lt_0_7682\n" + " .loc 3 78 0\n" + " mov.s32 %r26, %r13;\n" + " add.u32 %r27, %r26, 1;\n" + " cvt.u16.u32 %r13, %r27;\n" + " cvt.u64.u32 %rd28, %r26;\n" + " mul.lo.u64 %rd29, %rd28, 2;\n" + " add.u64 %rd30, %rd15, %rd29;\n" + " st.global.u16 [%rd30+2], %r20;\n" "$Lt_0_8450:\n" - " cvt.u32.u16 %r8, %tid.x;\n" - " mul.lo.u32 %r9, %r7, %r8;\n" - " cvt.u16.u32 %r10, %r9;\n" - " add.s32 %r11, %r7, %r10;\n" - " setp.ge.s32 %p2, %r4, %r11;\n" - " @%p2 bra $Lt_0_8962;\n" - " bra.uni $LBB23_B2gCudaSearchBNDMq;\n" - "$Lt_0_8962:\n" - " .loc 15 44 0\n" - " mul24.lo.s32 %r12, %r7, 2;\n" - " sub.s32 %r13, %r12, 1;\n" - " mov.s32 %r14, %r13;\n" - " cvt.u16.u32 %r15, %r14;\n" - " mov.s32 %r16, %r15;\n" - " add.s32 %r17, %r10, %r15;\n" - " set.lt.u32.s32 %r18, %r4, %r17;\n" - " neg.s32 %r19, %r18;\n" - " mov.u32 %r20, 15;\n" - " set.eq.u32.u32 %r21, %r8, %r20;\n" - " neg.s32 %r22, %r21;\n" - " or.b32 %r23, %r19, %r22;\n" - " mov.u32 %r24, 0;\n" - " setp.eq.s32 %p3, %r23, %r24;\n" - " @%p3 bra $Lt_0_9474;\n" - " .loc 15 46 0\n" - " sub.u32 %r25, %r4, %r9;\n" - " cvt.u16.u32 %r16, %r25;\n" - "$Lt_0_9474:\n" - " mov.u32 %r26, 0;\n" - " setp.eq.u32 %p4, %r16, %r26;\n" - " @%p4 bra $Lt_0_9986;\n" - " mov.s32 %r27, %r16;\n" - " ld.param.u64 %rd1, [__cudaparm_B2gCudaSearchBNDMq_offsets];\n" - " mov.u32 %r28, 0;\n" - " mov.s32 %r29, %r27;\n" - "$Lt_0_10498:\n" - " // Loop body line 46, nesting depth: 1, estimated iterations: unknown\n" - " .loc 15 51 0\n" - " mov.u32 %r30, 0;\n" - " add.u32 %r31, %r10, %r28;\n" - " cvt.u64.u32 %rd2, %r31;\n" - " mul.lo.u64 %rd3, %rd2, 4;\n" - " add.u64 %rd4, %rd1, %rd3;\n" - " st.global.u32 [%rd4+0], %r30;\n" - " add.u32 %r28, %r28, 1;\n" - " setp.ne.u32 %p5, %r16, %r28;\n" - " @%p5 bra $Lt_0_10498;\n" - "$Lt_0_9986:\n" - " sub.u32 %r32, %r16, 1;\n" - " setp.gt.u32 %p6, %r2, %r32;\n" - " @%p6 bra $LBB23_B2gCudaSearchBNDMq;\n" - " ld.param.u64 %rd5, [__cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable];\n" - " ld.param.u64 %rd6, [__cudaparm_B2gCudaSearchBNDMq_B2G];\n" - " ld.param.u64 %rd7, [__cudaparm_B2gCudaSearchBNDMq_buf];\n" - "$Lt_0_11522:\n" - " // Loop body line 57\n" - " .loc 15 57 0\n" - " add.u32 %r33, %r10, %r3;\n" - " cvt.u64.u32 %rd8, %r33;\n" - " add.u64 %rd9, %rd8, %rd7;\n" - " ld.global.u8 %rh1, [%rd9+0];\n" - " cvt.u64.u8 %rd10, %rh1;\n" - " add.u64 %rd11, %rd10, %rd5;\n" - " ld.global.u8 %r34, [%rd11+0];\n" - " ld.global.u8 %rh2, [%rd9+-1];\n" - " cvt.u64.u8 %rd12, %rh2;\n" - " add.u64 %rd13, %rd12, %rd5;\n" - " ld.global.u8 %r35, [%rd13+0];\n" - " shl.b32 %r36, %r35, 4;\n" - " or.b32 %r37, %r34, %r36;\n" - " cvt.u64.u32 %rd14, %r37;\n" - " mul.lo.u64 %rd15, %rd14, 4;\n" - " add.u64 %rd16, %rd6, %rd15;\n" - " ld.global.u32 %r38, [%rd16+0];\n" - " mov.u32 %r39, 0;\n" - " setp.eq.u32 %p7, %r38, %r39;\n" - " @%p7 bra $Lt_0_258;\n" - " // Part of loop body line 57, head labeled $Lt_0_11522\n" - " .loc 15 60 0\n" - " mov.s32 %r28, %r3;\n" - " .loc 15 61 0\n" - " sub.u32 %r40, %r3, %r1;\n" - " add.u32 %r41, %r40, 1;\n" - " sub.s32 %r42, %r1, 1;\n" - "$Lt_0_12546:\n" - " // Loop body line 64\n" - " .loc 15 64 0\n" - " sub.u32 %r28, %r28, 1;\n" - " shr.u32 %r43, %r38, %r42;\n" - " mov.u32 %r44, 0;\n" - " setp.eq.u32 %p8, %r43, %r44;\n" - " @%p8 bra $Lt_0_13314;\n" - " // Part of loop body line 64, head labeled $Lt_0_12546\n" - " setp.ge.u32 %p9, %r41, %r28;\n" - " @%p9 bra $Lt_0_13570;\n" - " // Part of loop body line 64, head labeled $Lt_0_12546\n" - " .loc 15 67 0\n" - " mov.s32 %r3, %r28;\n" - " bra.uni $Lt_0_13314;\n" - "$Lt_0_13570:\n" - " // Part of loop body line 64, head labeled $Lt_0_12546\n" - " .loc 15 69 0\n" - " mov.u32 %r45, 1;\n" - " ld.param.u64 %rd17, [__cudaparm_B2gCudaSearchBNDMq_offsets];\n" - " add.u32 %r46, %r10, %r28;\n" - " cvt.u64.u32 %rd18, %r46;\n" - " mul.lo.u64 %rd19, %rd18, 4;\n" - " add.u64 %rd20, %rd17, %rd19;\n" - " st.global.u32 [%rd20+0], %r45;\n" - "$Lt_0_13314:\n" - "$Lt_0_12802:\n" - " // Part of loop body line 64, head labeled $Lt_0_12546\n" - " .loc 15 74 0\n" - " mov.u32 %r47, 0;\n" - " setp.eq.u32 %p10, %r28, %r47;\n" - " @%p10 bra $Lt_0_258;\n" - "// Part of loop body line 64, head labeled $Lt_0_12546\n" - " .loc 15 77 0\n" - " add.u32 %r48, %r10, %r28;\n" - " cvt.u64.u32 %rd21, %r48;\n" - " add.u64 %rd22, %rd21, %rd7;\n" - " ld.global.u8 %rh3, [%rd22+0];\n" - " cvt.u64.u8 %rd23, %rh3;\n" - " add.u64 %rd24, %rd23, %rd5;\n" - " ld.global.u8 %r49, [%rd24+0];\n" - " ld.global.u8 %rh4, [%rd22+-1];\n" - " cvt.u64.u8 %rd25, %rh4;\n" - " add.u64 %rd26, %rd25, %rd5;\n" - " ld.global.u8 %r50, [%rd26+0];\n" - " shl.b32 %r51, %r50, 4;\n" - " or.b32 %r52, %r49, %r51;\n" - " cvt.u64.u32 %rd27, %r52;\n" - " mul.lo.u64 %rd28, %rd27, 4;\n" - " add.u64 %rd29, %rd6, %rd28;\n" - " ld.global.u32 %r53, [%rd29+0];\n" - " shl.b32 %r54, %r38, 1;\n" - " and.b32 %r38, %r53, %r54;\n" - " mov.u32 %r55, 0;\n" - " setp.ne.u32 %p11, %r38, %r55;\n" - " @%p11 bra $Lt_0_12546;\n" + "$Lt_0_7938:\n" + " // Part of loop body line 73, head labeled $Lt_0_7682\n" + " .loc 3 83 0\n" + " mov.u32 %r28, 0;\n" + " setp.eq.u32 %p6, %r20, %r28;\n" + " @%p6 bra $Lt_0_258;\n" + " // Part of loop body line 73, head labeled $Lt_0_7682\n" + " .loc 3 86 0\n" + " cvt.u64.u32 %rd31, %r20;\n" + " add.u64 %rd32, %rd31, %rd7;\n" + " ld.global.u8 %rh4, [%rd32+12];\n" + " cvt.u64.u8 %rd33, %rh4;\n" + " add.u64 %rd34, %rd33, %rd18;\n" + " ld.global.u8 %r29, [%rd34+0];\n" + " ld.global.u8 %rh5, [%rd32+11];\n" + " cvt.u64.u8 %rd35, %rh5;\n" + " add.u64 %rd36, %rd35, %rd18;\n" + " ld.global.u8 %r30, [%rd36+0];\n" + " shl.b32 %r31, %r30, 4;\n" + " or.b32 %r32, %r29, %r31;\n" + " cvt.u64.u32 %rd37, %r32;\n" + " mul.lo.u64 %rd38, %rd37, 4;\n" + " add.u64 %rd39, %rd8, %rd38;\n" + " ld.global.u32 %r33, [%rd39+0];\n" + " shl.b32 %r34, %r18, 1;\n" + " and.b32 %r18, %r33, %r34;\n" + " mov.u32 %r35, 0;\n" + " setp.ne.u32 %p7, %r18, %r35;\n" + " @%p7 bra $Lt_0_7682;\n" "$Lt_0_258:\n" - "$Lt_0_11778:\n" - " // Part of loop body line 57, head labeled $Lt_0_11522\n" - " .loc 15 80 0\n" - " add.u32 %r56, %r3, %r1;\n" - " sub.u32 %r3, %r56, 1;\n" - " setp.ge.u32 %p12, %r32, %r3;\n" - " @%p12 bra $Lt_0_11522;\n" - "$LBB23_B2gCudaSearchBNDMq:\n" - " .loc 15 83 0\n" - " exit;\n" + "$Lt_0_6914:\n" + " // Part of loop body line 66, head labeled $Lt_0_6658\n" + " .loc 3 89 0\n" + " add.u32 %r36, %r6, %r10;\n" + " sub.u32 %r10, %r36, 1;\n" + " setp.ge.u32 %p8, %r12, %r10;\n" + " @%p8 bra $Lt_0_6658;\n" + " bra.uni $Lt_0_6146;\n" + "$Lt_0_9474:\n" + "$Lt_0_6146:\n" + " .loc 3 92 0\n" + " st.global.u16 [%rd15+0], %r13;\n" + "$LBB17_B2gCudaSearchBNDMq:\n" + " .loc 3 94 0\n" + " exit;\n" "$LDWend_B2gCudaSearchBNDMq:\n" - " } // B2gCudaSearchBNDMq\n" - "\n"; + " } // B2gCudaSearchBNDMq\n" + ""; #else /** * \todo Optimize the kernel. Also explore the options for compiling the @@ -304,175 +270,152 @@ const char *b2g_cuda_ptx_image_32_bit = " .version 1.4\n" " .target sm_10, map_f64_to_f32\n" " .entry B2gCudaSearchBNDMq (\n" - " .param .u32 __cudaparm_B2gCudaSearchBNDMq_offsets,\n" - " .param .u32 __cudaparm_B2gCudaSearchBNDMq_B2G,\n" - " .param .u32 __cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable,\n" - " .param .u32 __cudaparm_B2gCudaSearchBNDMq_buf,\n" - " .param .u16 __cudaparm_B2gCudaSearchBNDMq_arg_buflen,\n" - " .param .u32 __cudaparm_B2gCudaSearchBNDMq_m)\n" + " .param .u32 __cudaparm_B2gCudaSearchBNDMq_results_buffer,\n" + " .param .u32 __cudaparm_B2gCudaSearchBNDMq_packets_buffer,\n" + " .param .u32 __cudaparm_B2gCudaSearchBNDMq_packets_offset_buffer,\n" + " .param .u32 __cudaparm_B2gCudaSearchBNDMq_packets_payload_offset_buffer,\n" + " .param .u32 __cudaparm_B2gCudaSearchBNDMq_nop,\n" + " .param .u32 __cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable)\n" " {\n" - " .reg .u32 %r<81>;\n" - " .reg .pred %p<14>;\n" - " .loc 15 14 0\n" + " .reg .u16 %rh<6>;\n" + " .reg .u32 %r<65>;\n" + " .reg .pred %p<10>;\n" + " .loc 3 36 0\n" "$LBB1_B2gCudaSearchBNDMq:\n" - " .loc 15 16 0\n" - " ld.param.u32 %r1, [__cudaparm_B2gCudaSearchBNDMq_m];\n" - " sub.u32 %r2, %r1, 1;\n" - " mov.s32 %r3, %r2;\n" - " .loc 15 22 0\n" - " ld.param.u16 %r4, [__cudaparm_B2gCudaSearchBNDMq_arg_buflen];\n" - " shr.u32 %r5, %r4, 4;\n" - " cvt.u16.u32 %r6, %r5;\n" - " mov.s32 %r7, %r6;\n" - " setp.ge.u32 %p1, %r6, %r1;\n" - " @%p1 bra $Lt_0_8450;\n" - " .loc 15 27 0\n" - " cvt.u16.u32 %r7, %r1;\n" + " mov.u16 %rh1, %ctaid.x;\n" + " mul.wide.u16 %r1, %rh1, 32;\n" + " cvt.u32.u16 %r2, %tid.x;\n" + " add.u32 %r3, %r2, %r1;\n" + " ld.param.u32 %r4, [__cudaparm_B2gCudaSearchBNDMq_nop];\n" + " setp.gt.u32 %p1, %r4, %r3;\n" + " @%p1 bra $Lt_0_5634;\n" + " bra.uni $LBB17_B2gCudaSearchBNDMq;\n" + "$Lt_0_5634:\n" + " .loc 3 45 0\n" + " mul.lo.u32 %r5, %r3, 4;\n" + " ld.param.u32 %r6, [__cudaparm_B2gCudaSearchBNDMq_packets_offset_buffer];\n" + " add.u32 %r7, %r6, %r5;\n" + " ld.global.u32 %r8, [%r7+0];\n" + " ld.param.u32 %r9, [__cudaparm_B2gCudaSearchBNDMq_packets_buffer];\n" + " add.u32 %r10, %r8, %r9;\n" + " .loc 3 46 0\n" + " ld.global.u32 %r11, [%r10+0];\n" + " .loc 3 48 0\n" + " ld.global.u32 %r12, [%r10+8];\n" + " .loc 3 49 0\n" + " ld.global.u32 %r13, [%r10+4];\n" + " .loc 3 50 0\n" + " sub.u32 %r14, %r11, 1;\n" + " mov.s32 %r15, %r14;\n" + " .loc 3 56 0\n" + " ld.param.u32 %r16, [__cudaparm_B2gCudaSearchBNDMq_results_buffer];\n" + " ld.param.u32 %r17, [__cudaparm_B2gCudaSearchBNDMq_packets_payload_offset_buffer];\n" + " add.u32 %r18, %r17, %r5;\n" + " ld.global.u32 %r19, [%r18+0];\n" + " add.u32 %r20, %r19, %r3;\n" + " mul.lo.u32 %r21, %r20, 2;\n" + " add.u32 %r22, %r16, %r21;\n" + " sub.u32 %r23, %r12, 1;\n" + " setp.gt.u32 %p2, %r14, %r23;\n" + " mov.u16 %rh2, 0;\n" + " @%p2 bra $Lt_0_9474;\n" + " add.u32 %r24, %r10, 12;\n" + " add.u32 %r25, %r22, 2;\n" + " ld.param.u32 %r26, [__cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable];\n" + "$Lt_0_6658:\n" + " // Loop body line 66\n" + " .loc 3 66 0\n" + " add.u32 %r27, %r10, %r15;\n" + " ld.global.u8 %r28, [%r27+12];\n" + " add.u32 %r29, %r28, %r26;\n" + " ld.global.u8 %r30, [%r29+0];\n" + " ld.global.u8 %r31, [%r27+11];\n" + " add.u32 %r32, %r31, %r26;\n" + " ld.global.u8 %r33, [%r32+0];\n" + " shl.b32 %r34, %r33, 4;\n" + " or.b32 %r35, %r30, %r34;\n" + " mul.lo.u32 %r36, %r35, 4;\n" + " add.u32 %r37, %r13, %r36;\n" + " ld.global.u32 %r38, [%r37+0];\n" + " mov.u32 %r39, 0;\n" + " setp.eq.u32 %p3, %r38, %r39;\n" + " @%p3 bra $Lt_0_258;\n" + " // Part of loop body line 66, head labeled $Lt_0_6658\n" + " .loc 3 69 0\n" + " mov.s32 %r40, %r15;\n" + " .loc 3 70 0\n" + " sub.u32 %r41, %r15, %r11;\n" + " add.u32 %r42, %r41, 1;\n" + " sub.s32 %r43, %r11, 1;\n" + "$Lt_0_7682:\n" + " // Loop body line 73\n" + " .loc 3 73 0\n" + " sub.u32 %r40, %r40, 1;\n" + " shr.u32 %r44, %r38, %r43;\n" + " mov.u32 %r45, 0;\n" + " setp.eq.u32 %p4, %r44, %r45;\n" + " @%p4 bra $Lt_0_8450;\n" + " // Part of loop body line 73, head labeled $Lt_0_7682\n" + " setp.le.u32 %p5, %r40, %r42;\n" + " @%p5 bra $Lt_0_8706;\n" + " // Part of loop body line 73, head labeled $Lt_0_7682\n" + " .loc 3 76 0\n" + " mov.s32 %r15, %r40;\n" + " bra.uni $Lt_0_8450;\n" + "$Lt_0_8706:\n" + " // Part of loop body line 73, head labeled $Lt_0_7682\n" + " .loc 3 78 0\n" + " mov.s16 %rh3, %rh2;\n" + " add.u16 %rh4, %rh3, 1;\n" + " mov.u16 %rh2, %rh4;\n" + " mul.wide.u16 %r46, %rh3, 2;\n" + " add.u32 %r47, %r22, %r46;\n" + " st.global.u16 [%r47+2], %r40;\n" "$Lt_0_8450:\n" - " cvt.u32.u16 %r8, %tid.x;\n" - " mul.lo.u32 %r9, %r7, %r8;\n" - " cvt.u16.u32 %r10, %r9;\n" - " add.s32 %r11, %r7, %r10;\n" - " setp.ge.s32 %p2, %r4, %r11;\n" - " @%p2 bra $Lt_0_8962;\n" - " bra.uni $LBB23_B2gCudaSearchBNDMq;\n" - "$Lt_0_8962:\n" - " .loc 15 33 0\n" - " mul24.lo.s32 %r12, %r7, 2;\n" - " sub.s32 %r13, %r12, 1;\n" - " mov.s32 %r14, %r13;\n" - " cvt.u16.u32 %r15, %r14;\n" - " mov.s32 %r16, %r15;\n" - " add.s32 %r17, %r10, %r15;\n" - " set.lt.u32.s32 %r18, %r4, %r17;\n" - " neg.s32 %r19, %r18;\n" - " mov.u32 %r20, 15;\n" - " set.eq.u32.u32 %r21, %r8, %r20;\n" - " neg.s32 %r22, %r21;\n" - " or.b32 %r23, %r19, %r22;\n" - " mov.u32 %r24, 0;\n" - " setp.eq.s32 %p3, %r23, %r24;\n" - " @%p3 bra $Lt_0_9474;\n" - " .loc 15 35 0\n" - " sub.u32 %r25, %r4, %r9;\n" - " cvt.u16.u32 %r16, %r25;\n" - "$Lt_0_9474:\n" - " mov.u32 %r26, 0;\n" - " setp.eq.u32 %p4, %r16, %r26;\n" - " @%p4 bra $Lt_0_9986;\n" - " mov.s32 %r27, %r16;\n" - " ld.param.u32 %r28, [__cudaparm_B2gCudaSearchBNDMq_offsets];\n" - " mov.u32 %r29, 0;\n" - " mov.s32 %r30, %r27;\n" - "$Lt_0_10498:\n" - " // Loop body line 35, nesting depth: 1, estimated iterations: unknown\n" - " .loc 15 40 0\n" - " mov.u32 %r31, 0;\n" - " add.u32 %r32, %r10, %r29;\n" - " mul.lo.u32 %r33, %r32, 4;\n" - " add.u32 %r34, %r28, %r33;\n" - " st.global.u32 [%r34+0], %r31;\n" - " add.u32 %r29, %r29, 1;\n" - " setp.ne.u32 %p5, %r16, %r29;\n" - " @%p5 bra $Lt_0_10498;\n" - "$Lt_0_9986:\n" - " sub.u32 %r35, %r16, 1;\n" - " setp.gt.u32 %p6, %r2, %r35;\n" - " @%p6 bra $LBB23_B2gCudaSearchBNDMq;\n" - " ld.param.u32 %r36, [__cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable];\n" - " ld.param.u32 %r37, [__cudaparm_B2gCudaSearchBNDMq_B2G];\n" - " ld.param.u32 %r38, [__cudaparm_B2gCudaSearchBNDMq_buf];\n" - "$Lt_0_11522:\n" - " // Loop body line 46\n" - " .loc 15 46 0\n" - " add.u32 %r39, %r10, %r3;\n" - " add.u32 %r40, %r39, %r38;\n" - " ld.global.u8 %r41, [%r40+0];\n" - " add.u32 %r42, %r41, %r36;\n" - " ld.global.u8 %r43, [%r42+0];\n" - " ld.global.u8 %r44, [%r40+-1];\n" - " add.u32 %r45, %r44, %r36;\n" - " ld.global.u8 %r46, [%r45+0];\n" - " shl.b32 %r47, %r46, 4;\n" - " or.b32 %r48, %r43, %r47;\n" - " mul.lo.u32 %r49, %r48, 4;\n" - " add.u32 %r50, %r37, %r49;\n" - " ld.global.u32 %r51, [%r50+0];\n" - " mov.u32 %r52, 0;\n" - " setp.eq.u32 %p7, %r51, %r52;\n" - " @%p7 bra $Lt_0_258;\n" - " // Part of loop body line 46, head labeled $Lt_0_11522\n" - " .loc 15 49 0\n" - " mov.s32 %r29, %r3;\n" - " .loc 15 50 0\n" - " sub.u32 %r53, %r3, %r1;\n" - " add.u32 %r54, %r53, 1;\n" - " sub.s32 %r55, %r1, 1;\n" - "$Lt_0_12546:\n" - " // Loop body line 53\n" - " .loc 15 53 0\n" - " sub.u32 %r29, %r29, 1;\n" - " shr.u32 %r56, %r51, %r55;\n" - " mov.u32 %r57, 0;\n" - " setp.eq.u32 %p8, %r56, %r57;\n" - " @%p8 bra $Lt_0_13314;\n" - " // Part of loop body line 53, head labeled $Lt_0_12546\n" - " setp.ge.u32 %p9, %r54, %r29;\n" - " @%p9 bra $Lt_0_13570;\n" - " // Part of loop body line 53, head labeled $Lt_0_12546\n" - " .loc 15 56 0\n" - " mov.s32 %r3, %r29;\n" - " bra.uni $Lt_0_13314;\n" - "$Lt_0_13570:\n" - " // Part of loop body line 53, head labeled $Lt_0_12546\n" - " .loc 15 58 0\n" - " mov.u32 %r58, 1;\n" - " ld.param.u32 %r59, [__cudaparm_B2gCudaSearchBNDMq_offsets];\n" - " add.u32 %r60, %r10, %r29;\n" - " mul.lo.u32 %r61, %r60, 4;\n" - " add.u32 %r62, %r59, %r61;\n" - " st.global.u32 [%r62+0], %r58;\n" - "$Lt_0_13314:\n" - "$Lt_0_12802:\n" - " // Part of loop body line 53, head labeled $Lt_0_12546\n" - " .loc 15 63 0\n" - " mov.u32 %r63, 0;\n" - " setp.eq.u32 %p10, %r29, %r63;\n" - " @%p10 bra $Lt_0_258;\n" - " // Part of loop body line 53, head labeled $Lt_0_12546\n" - " .loc 15 66 0\n" - " add.u32 %r64, %r10, %r29;\n" - " add.u32 %r65, %r64, %r38;\n" - " ld.global.u8 %r66, [%r65+0];\n" - " add.u32 %r67, %r66, %r36;\n" - " ld.global.u8 %r68, [%r67+0];\n" - " ld.global.u8 %r69, [%r65+-1];\n" - " add.u32 %r70, %r69, %r36;\n" - " ld.global.u8 %r71, [%r70+0];\n" - " shl.b32 %r72, %r71, 4;\n" - " or.b32 %r73, %r68, %r72;\n" - " mul.lo.u32 %r74, %r73, 4;\n" - " add.u32 %r75, %r37, %r74;\n" - " ld.global.u32 %r76, [%r75+0];\n" - " shl.b32 %r77, %r51, 1;\n" - " and.b32 %r51, %r76, %r77;\n" - " mov.u32 %r78, 0;\n" - " setp.ne.u32 %p11, %r51, %r78;\n" - " @%p11 bra $Lt_0_12546;\n" + "$Lt_0_7938:\n" + " // Part of loop body line 73, head labeled $Lt_0_7682\n" + " .loc 3 83 0\n" + " mov.u32 %r48, 0;\n" + " setp.eq.u32 %p6, %r40, %r48;\n" + " @%p6 bra $Lt_0_258;\n" + " // Part of loop body line 73, head labeled $Lt_0_7682\n" + " .loc 3 86 0\n" + " add.u32 %r49, %r10, %r40;\n" + " ld.global.u8 %r50, [%r49+12];\n" + " add.u32 %r51, %r50, %r26;\n" + " ld.global.u8 %r52, [%r51+0];\n" + " ld.global.u8 %r53, [%r49+11];\n" + " add.u32 %r54, %r53, %r26;\n" + " ld.global.u8 %r55, [%r54+0];\n" + " shl.b32 %r56, %r55, 4;\n" + " or.b32 %r57, %r52, %r56;\n" + " mul.lo.u32 %r58, %r57, 4;\n" + " add.u32 %r59, %r13, %r58;\n" + " ld.global.u32 %r60, [%r59+0];\n" + " shl.b32 %r61, %r38, 1;\n" + " and.b32 %r38, %r60, %r61;\n" + " mov.u32 %r62, 0;\n" + " setp.ne.u32 %p7, %r38, %r62;\n" + " @%p7 bra $Lt_0_7682;\n" "$Lt_0_258:\n" - "$Lt_0_11778:\n" - " // Part of loop body line 46, head labeled $Lt_0_11522\n" - " .loc 15 69 0\n" - " add.u32 %r79, %r3, %r1;\n" - " sub.u32 %r3, %r79, 1;\n" - " setp.ge.u32 %p12, %r35, %r3;\n" - " @%p12 bra $Lt_0_11522;\n" - "$LBB23_B2gCudaSearchBNDMq:\n" - " .loc 15 72 0\n" + "$Lt_0_6914:\n" + " // Part of loop body line 66, head labeled $Lt_0_6658\n" + " .loc 3 89 0\n" + " add.u32 %r63, %r11, %r15;\n" + " sub.u32 %r15, %r63, 1;\n" + " setp.ge.u32 %p8, %r23, %r15;\n" + " @%p8 bra $Lt_0_6658;\n" + " bra.uni $Lt_0_6146;\n" + "$Lt_0_9474:\n" + "$Lt_0_6146:\n" + " .loc 3 92 0\n" + " st.global.u16 [%r22+0], %rh2;\n" + "$LBB17_B2gCudaSearchBNDMq:\n" + " .loc 3 94 0\n" " exit;\n" "$LDWend_B2gCudaSearchBNDMq:\n" " } // B2gCudaSearchBNDMq\n" - "\n"; + ""; #endif /** @@ -482,7 +425,6 @@ void MpmB2gCudaRegister(void) { mpm_table[MPM_B2G_CUDA].name = "b2g_cuda"; mpm_table[MPM_B2G_CUDA].max_pattern_length = B2G_CUDA_WORD_SIZE; - mpm_table[MPM_B2G_CUDA].InitCtx = B2gCudaInitCtx; mpm_table[MPM_B2G_CUDA].InitThreadCtx = B2gCudaThreadInitCtx; mpm_table[MPM_B2G_CUDA].DestroyCtx = B2gCudaDestroyCtx; @@ -497,37 +439,6 @@ void MpmB2gCudaRegister(void) mpm_table[MPM_B2G_CUDA].RegisterUnittests = B2gCudaRegisterTests; } -static inline void B2gCudaEndMatchAppend(MpmCtx *mpm_ctx, B2gCudaPattern *p, - uint16_t offset, uint16_t depth, - uint32_t pid, uint32_t sid) -{ - MpmEndMatch *em = MpmAllocEndMatch(mpm_ctx); - if (em == NULL) { - SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory"); - return; - } - - SCLogDebug("em alloced at %p", em); - - em->id = pid; - em->sig_id = sid; - em->depth = depth; - em->offset = offset; - - if (p->em == NULL) { - p->em = em; - return; - } - - MpmEndMatch *m = p->em; - while (m->next != NULL) { - m = m->next; - } - m->next = em; - - return; -} - void B2gCudaPrintInfo(MpmCtx *mpm_ctx) { B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx->ctx; @@ -674,9 +585,6 @@ static inline B2gCudaPattern *B2gCudaInitHashLookup(B2gCudaCtx *ctx, uint8_t *pa void B2gCudaFreePattern(MpmCtx *mpm_ctx, B2gCudaPattern *p) { - if (p != NULL && p->em != NULL) - MpmEndMatchFreeAll(mpm_ctx, p->em); - if (p != NULL && p->cs != NULL && p->cs != p->ci) { SCFree(p->cs); mpm_ctx->memory_cnt--; @@ -705,7 +613,7 @@ static inline int B2gCudaAddPattern(MpmCtx *mpm_ctx, uint8_t *pat, { B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx->ctx; - SCLogDebug("ctx %p len %" PRIu16 " pid %" PRIu32, ctx, patlen, pid); + SCLogDebug("ctx %p len %"PRIu16" pid %" PRIu32, ctx, patlen, pid); if (patlen == 0) return 0; @@ -721,11 +629,13 @@ static inline int B2gCudaAddPattern(MpmCtx *mpm_ctx, uint8_t *pat, p->len = patlen; p->flags = flags; + p->id = pid; /* setup the case insensitive part of the pattern */ p->ci = SCMalloc(patlen); if (p->ci == NULL) goto error; + mpm_ctx->memory_cnt++; mpm_ctx->memory_size += patlen; memcpy_tolower(p->ci, pat, patlen); @@ -735,7 +645,7 @@ static inline int B2gCudaAddPattern(MpmCtx *mpm_ctx, uint8_t *pat, /* nocase means no difference between cs and ci */ p->cs = p->ci; } else { - if (memcmp(p->ci, pat, p->len) == 0) { + if (memcmp(p->ci,pat,p->len) == 0) { /* no diff between cs and ci: pat is lowercase */ p->cs = p->ci; } else { @@ -745,34 +655,29 @@ static inline int B2gCudaAddPattern(MpmCtx *mpm_ctx, uint8_t *pat, mpm_ctx->memory_cnt++; mpm_ctx->memory_size += patlen; - memcpy(p->cs, pat, patlen); } } + //printf("B2gAddPattern: ci \""); prt(p->ci,p->len); + //printf("\" cs \""); prt(p->cs,p->len); + //printf("\"\n"); + /* put in the pattern hash */ B2gCudaInitHashAdd(ctx, p); if (mpm_ctx->pattern_cnt == 65535) { printf("Max search words reached\n"); - exit(EXIT_FAILURE); + exit(1); } mpm_ctx->pattern_cnt++; - if (mpm_ctx->maxlen < patlen) - mpm_ctx->maxlen = patlen; - - if (mpm_ctx->minlen == 0) - mpm_ctx->minlen = patlen; - else if (mpm_ctx->minlen > patlen) - mpm_ctx->minlen = patlen; + if (mpm_ctx->maxlen < patlen) mpm_ctx->maxlen = patlen; + if (mpm_ctx->minlen == 0) mpm_ctx->minlen = patlen; + else if (mpm_ctx->minlen > patlen) mpm_ctx->minlen = patlen; } - /* we need a match */ - B2gCudaEndMatchAppend(mpm_ctx, p, offset, depth, pid, sid); - mpm_ctx->total_pattern_cnt++; - return 0; error: @@ -1011,12 +916,6 @@ int B2gCudaSetDeviceBuffers(MpmCtx *mpm_ctx) { B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx->ctx; - if (SCCudaHlGetCudaDevicePtr(&ctx->cuda_g_u8_lowercasetable, - "G_U8_LOWERCASETABLE", 256 * sizeof(char), - g_u8_lowercasetable, ctx->module_handle) == -1) { - goto error; - } - /* search kernel */ if (SCCudaMemAlloc(&ctx->cuda_B2G, sizeof(B2G_CUDA_TYPE) * ctx->hash_size) == -1) { @@ -1035,19 +934,7 @@ int B2gCudaSetDeviceBuffers(MpmCtx *mpm_ctx) int B2gCudaSetKernelArgs(MpmCtx *mpm_ctx) { - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx->ctx; - - /* search kernel */ - if (SCCudaParamSetv(ctx->cuda_search_kernel, ctx->cuda_search_kernel_arg2_offset, - (void *)&ctx->cuda_g_u8_lowercasetable, - sizeof(void *)) == -1) { - goto error; - } - return 0; - - error: - return -1; } int B2gCudaPreparePatterns(MpmCtx *mpm_ctx) @@ -1244,6 +1131,9 @@ void B2gCudaInitCtx(MpmCtx *mpm_ctx, int module_handle) /* initialize the hash we use to speed up pattern insertions */ B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx->ctx; + + /* hold the cuda module handle against which we are registered. This is our + * only reference to know our place of birth */ ctx->module_handle = module_handle; ctx->init_hash = SCMalloc(sizeof(B2gCudaPattern *) * INIT_HASH_SIZE); @@ -1260,67 +1150,6 @@ void B2gCudaInitCtx(MpmCtx *mpm_ctx, int module_handle) /* init defaults search functions */ ctx->Search = b2g_func; - if (SCCudaHlGetCudaContext(&ctx->cuda_context, module_handle) == -1) { - SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context"); - } - -#if defined(__x86_64__) || defined(__ia64__) - if (SCCudaHlGetCudaModule(&ctx->cuda_module, b2g_cuda_ptx_image_64_bit, - module_handle) == -1) { - SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda module"); - } -#else - if (SCCudaHlGetCudaModule(&ctx->cuda_module, b2g_cuda_ptx_image_32_bit, - module_handle) == -1) { - SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda module"); - } -#endif - - - if (SCCudaModuleGetFunction(&ctx->cuda_search_kernel, ctx->cuda_module, - B2G_CUDA_SEARCHFUNC_NAME) == -1) { - SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda function"); - } - -#define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1) - - int offset = 0; - - ALIGN_UP(offset, __alignof(void *)); - ctx->cuda_search_kernel_arg0_offset = offset; - offset += sizeof(void *); - - ALIGN_UP(offset, __alignof(void *)); - ctx->cuda_search_kernel_arg1_offset = offset; - offset += sizeof(void *); - - ALIGN_UP(offset, __alignof(void *)); - ctx->cuda_search_kernel_arg2_offset = offset; - offset += sizeof(void *); - - ALIGN_UP(offset, __alignof(void *)); - ctx->cuda_search_kernel_arg3_offset = offset; - offset += sizeof(void *); - - ALIGN_UP(offset, __alignof(unsigned short)); - ctx->cuda_search_kernel_arg4_offset = offset; - offset += sizeof(unsigned short); - - ALIGN_UP(offset, __alignof(unsigned int)); - ctx->cuda_search_kernel_arg5_offset = offset; - offset += sizeof(unsigned int); - - ctx->cuda_search_kernel_arg_total = offset; - - //printf("arg0: %d\n", arg0); - //printf("arg1: %d\n", arg1); - //printf("arg2: %d\n", arg2); - //printf("arg3: %d\n", arg3); - //printf("arg4: %d\n", arg4); - //printf("arg5: %d\n", arg5); - - //printf("arg_total: %d\n", arg_total); - return; } @@ -1395,16 +1224,35 @@ void B2gCudaDestroyCtx(MpmCtx *mpm_ctx) mpm_ctx->memory_size -= (sizeof(uint8_t) * ctx->hash_size); } + CUcontext dummy_context; + SCCudaHlModuleData *module_data = SCCudaHlGetModuleData(ctx->module_handle); + if (module_data == NULL) { + SCLogError(SC_ERR_B2G_CUDA_ERROR, "How did we even fail to get a " + "module_data if we are having a module_handle"); + goto error; + } + if (SCCudaHlGetCudaContext(&dummy_context, ctx->module_handle) == -1) { + SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the " + "module %s", module_data->name); + goto error; + } + SCCudaCtxPushCurrent(dummy_context); + if (ctx->cuda_B2G != 0) { - if (SCCudaMemFree(ctx->cuda_B2G) == -1) - SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error freeing ctx->cuda_search_B2G "); + if (SCCudaMemFree(ctx->cuda_B2G) == -1) { + SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error freeing ctx->cuda_B2G "); + goto error; + } ctx->cuda_B2G = 0; } + SCCudaCtxPopCurrent(&dummy_context); SCFree(mpm_ctx->ctx); mpm_ctx->memory_cnt--; mpm_ctx->memory_size -= sizeof(B2gCudaCtx); + + error: return; } @@ -1456,143 +1304,122 @@ uint32_t B2gCudaSearchBNDMq(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, PatternMatcherQueue *pmq, uint8_t *buf, uint16_t buflen) { -#define CUDA_THREADS 16 - CUdeviceptr cuda_buf = 0; - CUdeviceptr cuda_offsets = 0; - uint32_t matches = 0; - B2gCudaCtx *ctx = mpm_ctx->ctx; - uint16_t h = 0; - int i = 0; - int host_offsets[UINT16_MAX]; - - if (buflen < ctx->m) - return 0; - - if (SCCudaMemAlloc(&cuda_buf, buflen * sizeof(char)) == -1) { - goto error; - } - if (SCCudaMemcpyHtoD(cuda_buf, buf, - buflen * sizeof(char)) == -1) { - goto error; - } - - if (SCCudaMemAlloc(&cuda_offsets, buflen * sizeof(int)) == -1) { - goto error; - } - - if (SCCudaParamSetv(ctx->cuda_search_kernel, ctx->cuda_search_kernel_arg0_offset, - (void *)&cuda_offsets, sizeof(void *)) == -1) { - goto error; - } - - if (SCCudaParamSetv(ctx->cuda_search_kernel, ctx->cuda_search_kernel_arg1_offset, - (void *)&ctx->cuda_B2G, sizeof(void *)) == -1) { - goto error; - } - - if (SCCudaParamSetv(ctx->cuda_search_kernel, ctx->cuda_search_kernel_arg3_offset, - (void *)&cuda_buf, sizeof(void *)) == -1) { - goto error; - } - - if (SCCudaParamSeti(ctx->cuda_search_kernel, ctx->cuda_search_kernel_arg4_offset, - buflen) == -1) { - goto error; - } - - if (SCCudaParamSeti(ctx->cuda_search_kernel, ctx->cuda_search_kernel_arg5_offset, - ctx->m) == -1) { - goto error; - } - - if (SCCudaParamSetSize(ctx->cuda_search_kernel, ctx->cuda_search_kernel_arg_total) == -1) - goto error; - - if (SCCudaFuncSetBlockShape(ctx->cuda_search_kernel, CUDA_THREADS, 1, 1) == -1) - goto error; - - if (SCCudaLaunchGrid(ctx->cuda_search_kernel, 1, 1) == -1) - goto error; - - if (SCCudaMemcpyDtoH(host_offsets, cuda_offsets, buflen * sizeof(int)) == -1) - goto error; + B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx->ctx; +#ifdef B2G_COUNTERS + B2gCudaThreadCtx *tctx = (B2gCudaThreadCtx *)mpm_thread_ctx->ctx; +#endif + uint32_t pos = ctx->m - B2G_CUDA_Q + 1, matches = 0; + B2G_CUDA_TYPE d; - //printf("Raw matches: "); - //for (i = 0; i < buflen; i++) { - // printf("%d",offsets_buffer[i]); - //} //printf("\n"); + //PrintRawDataFp(stdout, buf, buflen); - //printf("Matches: "); - for (i = 0; i < buflen; i++) { - if (host_offsets[i] == 0) - continue; - - - /* get our patterns from the hash */ - h = B2G_CUDA_HASH16(u8_tolower(buf[i + ctx->m - 2]), - u8_tolower(buf[i + ctx->m - 1])); - - if (ctx->bloom[h] != NULL) { - COUNT(tctx->stat_pminlen_calls++); - COUNT(tctx->stat_pminlen_total+=ctx->pminlen[h]); - - if ((buflen - i) < ctx->pminlen[h]) { - continue; - } else { - COUNT(tctx->stat_bloom_calls++); + SCLogDebug("buflen %"PRIu16", ctx->m %"PRIu32", pos %"PRIu32"", buflen, + ctx->m, pos); - if (BloomFilterTest(ctx->bloom[h], buf + i, ctx->pminlen[h]) == 0) { - COUNT(tctx->stat_bloom_hits++); + COUNT(tctx->stat_calls++); + COUNT(tctx->stat_m_total+=ctx->m); - continue; - } - } - } + if (buflen < ctx->m) + return 0; - B2gCudaHashItem *hi = ctx->hash[h], *thi; - for (thi = hi; thi != NULL; thi = thi->nxt) { - COUNT(tctx->stat_d0_hashloop++); - B2gCudaPattern *p = ctx->parray[thi->idx]; + while (pos <= (uint32_t)(buflen - B2G_CUDA_Q + 1)) { + uint16_t h = B2G_CUDA_HASH16(u8_tolower(buf[pos - 1]),u8_tolower(buf[pos])); + d = ctx->B2G[h]; - if (p->flags & MPM_PATTERN_FLAG_NOCASE) { - if ((buflen - i) < p->len) { - continue; + if (d != 0) { + COUNT(tctx->stat_d0++); + uint32_t j = pos; + uint32_t first = pos - (ctx->m - B2G_CUDA_Q + 1); + + do { + j = j - 1; + + if (d >= (uint32_t)(1 << (ctx->m - 1))) { + if (j > first) pos = j; + else { + /* get our patterns from the hash */ + h = B2G_CUDA_HASH16(u8_tolower(buf[j + ctx->m - 2]),u8_tolower(buf[j + ctx->m - 1])); + + if (ctx->bloom[h] != NULL) { + COUNT(tctx->stat_pminlen_calls++); + COUNT(tctx->stat_pminlen_total+=ctx->pminlen[h]); + + if ((buflen - j) < ctx->pminlen[h]) { + goto skip_loop; + } else { + COUNT(tctx->stat_bloom_calls++); + + if (BloomFilterTest(ctx->bloom[h], buf+j, ctx->pminlen[h]) == 0) { + COUNT(tctx->stat_bloom_hits++); + + SCLogDebug("Bloom: %p, buflen %" PRIu32 ", pos %" PRIu32 ", p_min_len %" PRIu32 "", + ctx->bloom[h], buflen, pos, ctx->pminlen[h]); + goto skip_loop; + } + } + } + + B2gCudaHashItem *hi = ctx->hash[h], *thi; + for (thi = hi; thi != NULL; thi = thi->nxt) { + COUNT(tctx->stat_d0_hashloop++); + B2gCudaPattern *p = ctx->parray[thi->idx]; + + if (p->flags & MPM_PATTERN_FLAG_NOCASE) { + if ((buflen - j) < p->len) { + continue; + } + + if (memcmp_lowercase(p->ci, buf+j, p->len) == 0) { +#ifdef PRINTMATCH + printf("CI Exact match: "); prt(p->ci, p->len); printf("\n"); +#endif + COUNT(tctx->stat_loop_match++); + + matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id); + } else { + COUNT(tctx->stat_loop_no_match++); + } + } else { + if (buflen - j < p->len) + continue; + + if (memcmp(p->cs, buf+j, p->len) == 0) { +#ifdef PRINTMATCH + printf("CS Exact match: "); prt(p->cs, p->len); printf("\n"); +#endif + COUNT(tctx->stat_loop_match++); + + matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id); + } else { + COUNT(tctx->stat_loop_no_match++); + } + } + } +skip_loop: + SCLogDebug("skipped"); + //SCLogDebug("output at pos %" PRIu32 ": ", j); prt(buf + (j), ctx->m); printf("\n"); + ; + } } - if (memcmp_lowercase(p->ci, buf + i, p->len) == 0) { - COUNT(tctx->stat_loop_match++); - - matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->em, i, p->len); - } else { - COUNT(tctx->stat_loop_no_match++); + if (j == 0) { + break; } - } else { - if (buflen - i < p->len) - continue; - - if (memcmp(p->cs, buf + i, p->len) == 0) { - COUNT(tctx->stat_loop_match++); - matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->em, i, p->len); - } else { - COUNT(tctx->stat_loop_no_match++); - } - } + h = B2G_CUDA_HASH16(u8_tolower(buf[j - 1]),u8_tolower(buf[j])); + d = (d << 1) & ctx->B2G[h]; + } while (d != 0); } - } /* for(i = 0; i < buflen; i++) */ + COUNT(tctx->stat_num_shift++); + COUNT(tctx->stat_total_shift += (ctx->m - B2G_Q + 1)); + pos = pos + ctx->m - B2G_CUDA_Q + 1; - SCCudaMemFree(cuda_buf); - SCCudaMemFree(cuda_offsets); + SCLogDebug("pos %"PRIu32"", pos); + } + SCLogDebug("matches %"PRIu32"", matches); return matches; - - error: - if (cuda_buf != 0) - SCCudaMemFree(cuda_buf); - if (cuda_offsets != 0) - SCCudaMemFree(cuda_offsets); - return 0; } uint32_t B2gCudaSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, @@ -1604,7 +1431,7 @@ uint32_t B2gCudaSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, #endif uint32_t pos = 0, matches = 0; B2G_CUDA_TYPE d; - uint32_t j = 0; + uint32_t j; COUNT(tctx->stat_calls++); COUNT(tctx->stat_m_total+=ctx->m); @@ -1617,8 +1444,7 @@ uint32_t B2gCudaSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, d = ~0; do { - uint16_t h = B2G_CUDA_HASH16(u8_tolower(buf[pos + j - 1]), - u8_tolower(buf[pos + j])); + uint16_t h = B2G_CUDA_HASH16(u8_tolower(buf[pos + j - 1]),u8_tolower(buf[pos + j])); d = ((d << 1) & ctx->B2G[h]); j = j - 1; } while (d != 0 && j != 0); @@ -1626,10 +1452,10 @@ uint32_t B2gCudaSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, /* (partial) match, move on to verification */ if (d != 0) { COUNT(tctx->stat_d0++); + //printf("output at pos %" PRIu32 ": ", pos); prt(buf + pos, ctx->m); printf("\n"); /* get our patterns from the hash */ - uint16_t h = B2G_CUDA_HASH16(u8_tolower(buf[pos + ctx->m - 2]), - u8_tolower(buf[pos + ctx->m - 1])); + uint16_t h = B2G_CUDA_HASH16(u8_tolower(buf[pos + ctx->m - 2]),u8_tolower(buf[pos + ctx->m - 1])); if (ctx->bloom[h] != NULL) { COUNT(tctx->stat_pminlen_calls++); @@ -1643,13 +1469,13 @@ uint32_t B2gCudaSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, if (BloomFilterTest(ctx->bloom[h], buf+pos, ctx->pminlen[h]) == 0) { COUNT(tctx->stat_bloom_hits++); + //printf("Bloom: %p, buflen %" PRIu32 ", pos %" PRIu32 ", p_min_len %" PRIu32 "\n", ctx->bloom[h], buflen, pos, ctx->pminlen[h]); goto skip_loop; } } } - B2gCudaHashItem *hi = ctx->hash[h]; - B2gCudaHashItem *thi = NULL; + B2gCudaHashItem *hi = ctx->hash[h], *thi; for (thi = hi; thi != NULL; thi = thi->nxt) { COUNT(tctx->stat_d0_hashloop++); B2gCudaPattern *p = ctx->parray[thi->idx]; @@ -1661,7 +1487,7 @@ uint32_t B2gCudaSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, if (memcmp_lowercase(p->ci, buf+pos, p->len) == 0) { COUNT(tctx->stat_loop_match++); - matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->em, pos, p->len); + matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id); } else { COUNT(tctx->stat_loop_no_match++); } @@ -1672,13 +1498,14 @@ uint32_t B2gCudaSearch(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, if (memcmp(p->cs, buf+pos, p->len) == 0) { COUNT(tctx->stat_loop_match++); - matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->em, pos, p->len); + matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id); } else { COUNT(tctx->stat_loop_no_match++); } } } skip_loop: + //pos = pos + ctx->s0; pos = pos + 1; } else { COUNT(tctx->stat_num_shift++); @@ -1688,6 +1515,7 @@ skip_loop: } } + //printf("Total matches %" PRIu32 "\n", matches); return matches; } @@ -1700,12 +1528,13 @@ uint32_t B2gCudaSearch2(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, uint8_t *bufend = buf + buflen - 1; uint32_t cnt = 0; B2gCudaPattern *p; - MpmEndMatch *em; B2gCudaHashItem *thi, *hi; if (buflen < 2) return 0; + //printf("BUF "); prt(buf,buflen); printf("\n"); + while (buf <= bufend) { uint8_t h8 = u8_tolower(*buf); hi = &ctx->hash1[h8]; @@ -1716,11 +1545,11 @@ uint32_t B2gCudaSearch2(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, if (p->flags & MPM_PATTERN_FLAG_NOCASE) { if (h8 == p->ci[0]) { - cnt += MpmVerifyMatch(mpm_thread_ctx, pmq, p->em, (buf+1 - bufmin), p->len); + cnt += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id); } } else { if (*buf == p->cs[0]) { - cnt += MpmVerifyMatch(mpm_thread_ctx, pmq, p->em, (buf+1 - bufmin), p->len); + cnt += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id); } } } @@ -1735,29 +1564,32 @@ uint32_t B2gCudaSearch2(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, if (p->flags & MPM_PATTERN_FLAG_NOCASE) { if (h8 == p->ci[0] && u8_tolower(*(buf+1)) == p->ci[1]) { - for (em = p->em; em; em = em->next) { - if (MpmVerifyMatch(mpm_thread_ctx, pmq, em, (buf+1 - bufmin), p->len)) + //printf("CI Exact match: "); prt(p->ci, p->len); printf(" in buf "); prt(buf, p->len);printf(" (B2gSearch1)\n"); +// for (em = p->em; em; em = em->next) { + if (MpmVerifyMatch(mpm_thread_ctx, pmq, p->id)) cnt++; - } +// } } } else { if (*buf == p->cs[0] && *(buf+1) == p->cs[1]) { - for (em = p->em; em; em = em->next) { - if (MpmVerifyMatch(mpm_thread_ctx, pmq, em, (buf+1 - bufmin), p->len)) + //printf("CS Exact match: "); prt(p->cs, p->len); printf(" in buf "); prt(buf, p->len);printf(" (B2gSearch1)\n"); +// for (em = p->em; em; em = em->next) { + if (MpmVerifyMatch(mpm_thread_ctx, pmq, p->id)) cnt++; - } +// } } } } buf += 1; } + //printf("B2gSearch2: after 2byte cnt %" PRIu32 "\n", cnt); if (ctx->pat_x_cnt > 0) { /* Pass bufmin on because buf no longer points to the * start of the buffer. */ cnt += ctx->MBSearch(mpm_ctx, mpm_thread_ctx, pmq, bufmin, buflen); + //printf("B2gSearch1: after 2+byte cnt %" PRIu32 "\n", cnt); } - return cnt; } #endif @@ -1777,6 +1609,8 @@ uint32_t B2gCudaSearch1(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, if (buflen == 0) SCReturnUInt(0); + //printf("BUF "); prt(buf,buflen); printf("\n"); + while (buf <= bufend) { uint8_t h = u8_tolower(*buf); hi = &ctx->hash1[h]; @@ -1790,11 +1624,11 @@ uint32_t B2gCudaSearch1(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, if (p->flags & MPM_PATTERN_FLAG_NOCASE) { if (u8_tolower(*buf) == p->ci[0]) { - cnt += MpmVerifyMatch(mpm_thread_ctx, pmq, p->em, (buf+1 - bufmin), p->len); + cnt += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id); } } else { if (*buf == p->cs[0]) { - cnt += MpmVerifyMatch(mpm_thread_ctx, pmq, p->em, (buf+1 - bufmin), p->len); + cnt += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id); } } } @@ -1802,22 +1636,57 @@ uint32_t B2gCudaSearch1(MpmCtx *mpm_ctx, MpmThreadCtx *mpm_thread_ctx, buf += 1; } + //printf("B2gSearch1: after 1byte cnt %" PRIu32 "\n", cnt); #ifdef B2G_CUDA_SEARCH2 if (ctx->pat_2_cnt) { /* Pass bufmin on because buf no longer points to the * start of the buffer. */ cnt += ctx->MBSearch2(mpm_ctx, mpm_thread_ctx, pmq, bufmin, buflen); + //printf("B2gSearch1: after 2+byte cnt %" PRIu32 "\n", cnt); } else #endif if (ctx->pat_x_cnt) { cnt += ctx->MBSearch(mpm_ctx, mpm_thread_ctx, pmq, bufmin, buflen); } - SCReturnUInt(cnt); } /*********************Cuda_Specific_Mgmt_Code_Starts_Here**********************/ +typedef struct B2gCudaMpmThreadCtxData_ { + int b2g_cuda_module_handle; + + CUcontext b2g_cuda_context; + CUmodule b2g_cuda_module; + + /* the search kernel */ + CUfunction b2g_cuda_search_kernel; + + /* the cuda_search_kernel argument offsets */ + uint8_t b2g_cuda_search_kernel_arg0_offset; + uint8_t b2g_cuda_search_kernel_arg1_offset; + uint8_t b2g_cuda_search_kernel_arg2_offset; + uint8_t b2g_cuda_search_kernel_arg3_offset; + uint8_t b2g_cuda_search_kernel_arg4_offset; + uint8_t b2g_cuda_search_kernel_arg5_offset; + uint8_t b2g_cuda_search_kernel_arg_total; + + /* the results buffer to hold the match offsets for the packets */ + uint16_t *results_buffer; + /* gpu buffer corresponding to the above buffer */ + CUdeviceptr cuda_results_buffer; + + /* gpu buffer corresponding to SCCudaPBPacketsBuffer->packets_buffer */ + CUdeviceptr cuda_packets_buffer; + /* gpu buffer corresponding to SCCudaPBPacketsBuffer->packets_offset_buffer */ + CUdeviceptr cuda_packets_offset_buffer; + /* gpu buffer corresponding to SCCudaPBPacketsBuffer->packets_payload_offset_buffer */ + CUdeviceptr cuda_packets_payload_offset_buffer; + /* gpu buffer corresponding to the global symbol g_u8_lowercasetable + * XXX Remove this. Store it as a constant buffer inside the kernel*/ + CUdeviceptr cuda_g_u8_lowercasetable; +} B2gCudaMpmThreadCtxData; + /** * \brief The Cuda MPM B2G module's thread init function. * @@ -1839,63 +1708,415 @@ TmEcode B2gCudaMpmDispThreadInit(ThreadVars *tv, void *initdata, void **data) SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error pushing cuda context"); } - return TM_ECODE_OK; -} + B2gCudaMpmThreadCtxData *tctx = malloc(sizeof(B2gCudaMpmThreadCtxData)); + if (tctx == NULL) { + SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory"); + exit(EXIT_FAILURE); + } + memset(tctx, 0, sizeof(B2gCudaMpmThreadCtxData)); -/** - * \brief The Cuda MPM B2G module's thread de-init function. - * - * \param tv Pointer to the ThreadVars which has invoked this function. - * \param data Pointer to the slot data if anything had been attached in - * the thread init function. - * - * \retval TM_ECODE_OK Always. - */ -TmEcode B2gCudaMpmDispThreadDeInit(ThreadVars *tv, void *data) -{ - if (PatternMatchDefaultMatcher() != MPM_B2G_CUDA) - return TM_ECODE_OK; + tctx->b2g_cuda_module_handle = module_data->handle; - if (SCCudaCtxPopCurrent(NULL) == -1) { - SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error popping cuda context"); + if (SCCudaHlGetCudaContext(&tctx->b2g_cuda_context, module_data->handle) == -1) { + SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context"); + goto error; } - return TM_ECODE_OK; -} +#if defined(__x86_64__) || defined(__ia64__) + if (SCCudaHlGetCudaModule(&tctx->b2g_cuda_module, b2g_cuda_ptx_image_64_bit, + module_data->handle) == -1) { + SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda module"); + } +#else + if (SCCudaHlGetCudaModule(&tctx->b2g_cuda_module, b2g_cuda_ptx_image_32_bit, + module_data->handle) == -1) { + SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda module"); + } +#endif -/** - * \brief The dispatcher function for the cuda mpm. Takes a packet, feeds - * it to the gpu and informs the calling client when it has the - * results ready. - * - * \param tv We don't need this. - * \param p Pointer to the Packet which contains all the relevant data, - * like the bufffer, buflen, the contexts. - * \param data Pointer to the slot data if anything had been attached in - * the thread init function. - * \param pq We don't need this. - * - * \retval TM_ECODE_OK Always. - */ -TmEcode B2gCudaMpmDispatcher(ThreadVars *tv, Packet *p, void *data, - PacketQueue *pq) -{ - if (p == NULL) - return TM_ECODE_OK; + if (SCCudaModuleGetFunction(&tctx->b2g_cuda_search_kernel, + tctx->b2g_cuda_module, + B2G_CUDA_SEARCHFUNC_NAME) == -1) { + SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda function"); + goto error; + } - p->cuda_matches = mpm_table[p->cuda_mpm_ctx->mpm_type].Search(p->cuda_mpm_ctx, - p->cuda_mtc, - p->cuda_pmq, - p->payload, - p->payload_len); - TmqhOutputSimpleOnQ(p->cuda_outq, p); + if (SCCudaFuncSetBlockShape(tctx->b2g_cuda_search_kernel, 32, 1, 1) == -1) { + SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error setting function block shape"); + goto error; + } - return TM_ECODE_OK; -} +#define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1) -/** - * \brief Registers the Cuda B2G MPM Module. - */ + int offset = 0; + + ALIGN_UP(offset, __alignof(void *)); + tctx->b2g_cuda_search_kernel_arg0_offset = offset; + offset += sizeof(void *); + + ALIGN_UP(offset, __alignof(void *)); + tctx->b2g_cuda_search_kernel_arg1_offset = offset; + offset += sizeof(void *); + + ALIGN_UP(offset, __alignof(void *)); + tctx->b2g_cuda_search_kernel_arg2_offset = offset; + offset += sizeof(void *); + + ALIGN_UP(offset, __alignof(void *)); + tctx->b2g_cuda_search_kernel_arg3_offset = offset; + offset += sizeof(void *); + + ALIGN_UP(offset, __alignof(uint16_t)); + tctx->b2g_cuda_search_kernel_arg4_offset = offset; + offset += sizeof(void *); + + ALIGN_UP(offset, __alignof(void *)); + tctx->b2g_cuda_search_kernel_arg5_offset = offset; + offset += sizeof(void *); + + tctx->b2g_cuda_search_kernel_arg_total = offset; + + /* buffer to hold the b2g cuda mpm match results for 4000 packets. The + * extra 2 bytes(the 1 in 1481 instead of 1480) is to hold the no of + * matches for the payload. The remaining 1480 positions in the buffer + * is to hold the match offsets */ + tctx->results_buffer = malloc(sizeof(uint16_t) * 1481 * SC_CUDA_PB_MIN_NO_OF_PACKETS); + if (tctx->results_buffer == NULL) { + SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory"); + exit(EXIT_FAILURE); + } + + if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_results_buffer, + "MPM_B2G_RESULTS", + sizeof(uint16_t) * 1481 * SC_CUDA_PB_MIN_NO_OF_PACKETS, + NULL, module_data->handle) == -1) { + goto error; + } + + if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_g_u8_lowercasetable, + "G_U8_LOWERCASETABLE", 256 * sizeof(char), + g_u8_lowercasetable, module_data->handle) == -1) { + goto error; + } + + if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_packets_buffer, + "MPM_B2G_PACKETS_BUFFER", + (sizeof(SCCudaPBPacketDataForGPU) * + SC_CUDA_PB_MIN_NO_OF_PACKETS), + NULL, module_data->handle) == -1) { + goto error; + } + + if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_packets_offset_buffer, + "MPM_B2G_PACKETS_BUFFER_OFFSETS", + sizeof(uint32_t) * SC_CUDA_PB_MIN_NO_OF_PACKETS, + NULL, module_data->handle) == -1) { + goto error; + } + + if (SCCudaHlGetCudaDevicePtr(&tctx->cuda_packets_payload_offset_buffer, + "MPM_B2G_PACKETS_PAYLOAD_BUFFER_OFFSETS", + sizeof(uint32_t) * SC_CUDA_PB_MIN_NO_OF_PACKETS, + NULL, module_data->handle) == -1) { + goto error; + } + + if (SCCudaParamSetv(tctx->b2g_cuda_search_kernel, + tctx->b2g_cuda_search_kernel_arg0_offset, + (void *)&tctx->cuda_results_buffer, + sizeof(void *)) == -1) { + goto error; + } + + if (SCCudaParamSetv(tctx->b2g_cuda_search_kernel, + tctx->b2g_cuda_search_kernel_arg1_offset, + (void *)&tctx->cuda_packets_buffer, + sizeof(void *)) == -1) { + goto error; + } + + if (SCCudaParamSetv(tctx->b2g_cuda_search_kernel, + tctx->b2g_cuda_search_kernel_arg2_offset, + (void *)&tctx->cuda_packets_offset_buffer, + sizeof(void *)) == -1) { + goto error; + } + + if (SCCudaParamSetv(tctx->b2g_cuda_search_kernel, + tctx->b2g_cuda_search_kernel_arg3_offset, + (void *)&tctx->cuda_packets_payload_offset_buffer, + sizeof(void *)) == -1) { + goto error; + } + + if (SCCudaParamSetv(tctx->b2g_cuda_search_kernel, + tctx->b2g_cuda_search_kernel_arg5_offset, + (void *)&tctx->cuda_g_u8_lowercasetable, + sizeof(void *)) == -1) { + goto error; + } + + if (SCCudaParamSetSize(tctx->b2g_cuda_search_kernel, + tctx->b2g_cuda_search_kernel_arg_total) == -1) { + goto error; + } + + *data = tctx; + + return TM_ECODE_OK; + + error: + return TM_ECODE_FAILED; +} + +/** + * \brief The Cuda MPM B2G module's thread de-init function. + * + * \param tv Pointer to the ThreadVars which has invoked this function. + * \param data Pointer to the slot data if anything had been attached in + * the thread init function. + * + * \retval TM_ECODE_OK Always. + */ +TmEcode B2gCudaMpmDispThreadDeInit(ThreadVars *tv, void *data) +{ + B2gCudaMpmThreadCtxData *tctx = data; + + if (tctx == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid arguments. data NULL\n"); + return TM_ECODE_OK; + } + + if (PatternMatchDefaultMatcher() != MPM_B2G_CUDA) + return TM_ECODE_OK; + + CUcontext dummy_context; + SCCudaHlModuleData *module_data = SCCudaHlGetModuleData(tctx->b2g_cuda_module_handle); + if (module_data == NULL) { + SCLogError(SC_ERR_B2G_CUDA_ERROR, "How did we even fail to get a " + "module_data if we are having a module_handle"); + goto error; + } + if (SCCudaHlGetCudaContext(&dummy_context, tctx->b2g_cuda_module_handle) == -1) { + SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the " + "module %s", module_data->name); + goto error; + } + SCCudaCtxPushCurrent(dummy_context); + + free(tctx->results_buffer); + SCCudaHlFreeCudaDevicePtr("MPM_B2G_RESULTS", tctx->b2g_cuda_module_handle); + SCCudaHlFreeCudaDevicePtr("MPM_B2G_PACKETS_BUFFER", tctx->b2g_cuda_module_handle); + SCCudaHlFreeCudaDevicePtr("MPM_B2G_PACKETS_BUFFER_OFFSETS", + tctx->b2g_cuda_module_handle); + SCCudaHlFreeCudaDevicePtr("MPM_B2G_PACKETS_PAYLOAD_BUFFER_OFFSETS", + tctx->b2g_cuda_module_handle); + SCCudaHlFreeCudaDevicePtr("G_U8_LOWERCASETABLE", tctx->b2g_cuda_module_handle); + + free(tctx); + + if (SCCudaCtxPopCurrent(NULL) == -1) { + SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error popping cuda context"); + } + + return TM_ECODE_OK; + + error: + return TM_ECODE_FAILED; +} + +/** + * \brief The dispatcher function for the cuda mpm. Takes a packet, feeds + * it to the gpu and informs the calling client when it has the + * results ready. + * + * \param tv We don't need this. + * \param p Pointer to the Packet which contains all the relevant data, + * like the bufffer, buflen, the contexts. + * \param data Pointer to the slot data if anything had been attached in + * the thread init function. + * \param pq We don't need this. + * + * \retval TM_ECODE_OK Always. + */ +TmEcode B2gCudaMpmDispatcher(ThreadVars *tv, Packet *incoming_buffer, + void *data, PacketQueue *pq) +{ + SCCudaPBPacketsBuffer *pb = (SCCudaPBPacketsBuffer *)incoming_buffer; + B2gCudaMpmThreadCtxData *tctx = data; + uint32_t i = 0; + + SCLogDebug("Running the B2g CUDA mpm dispatcher"); + + if (pb == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument. pb is NULL!!"); + return TM_ECODE_OK; + } + + if (SCCudaMemcpyHtoD(tctx->cuda_packets_buffer, pb->packets_buffer, + pb->packets_buffer_len) == -1) { + goto error; + } + + if (SCCudaMemcpyHtoD(tctx->cuda_packets_offset_buffer, + pb->packets_offset_buffer, + sizeof(uint32_t) * pb->nop_in_buffer) == -1) { + goto error; + } + + if (SCCudaMemcpyHtoD(tctx->cuda_packets_payload_offset_buffer, + pb->packets_payload_offset_buffer, + sizeof(uint32_t) * pb->nop_in_buffer) == -1) { + goto error; + } + + if (SCCudaParamSeti(tctx->b2g_cuda_search_kernel, tctx->b2g_cuda_search_kernel_arg4_offset, + pb->nop_in_buffer) == -1) { + goto error; + } + + /* the no of threads per block has already been set to 32 + * \todo if we are very sure we are allocating a multiple of block_size + * buffer_threshold, then we can remove this + 1 here below */ + int no_of_cuda_blocks = (pb->nop_in_buffer / 32) + 1; + if (SCCudaLaunchGrid(tctx->b2g_cuda_search_kernel, no_of_cuda_blocks, 1) == -1) { + goto error; + } + + if (SCCudaMemcpyDtoH(tctx->results_buffer, + tctx->cuda_results_buffer, + sizeof(uint16_t) * (pb->nop_in_buffer + pb->packets_total_payload_len)) == -1) { + goto error; + } + + i = 0; + for (i = 0; i < pb->nop_in_buffer; i++) { + memcpy(pb->packets_address_buffer[i]->mpm_offsets, + (tctx->results_buffer + i + + pb->packets_payload_offset_buffer[i]), + (pb->packets_address_buffer[i]->payload_len + 1) * sizeof(uint16_t)); + SCMutexLock(&pb->packets_address_buffer[i]->cuda_mutex); + pb->packets_address_buffer[i]->cuda_done = 1; + SCMutexUnlock(&pb->packets_address_buffer[i]->cuda_mutex); + SCCondSignal(&pb->packets_address_buffer[i]->cuda_cond); + } + + SCLogDebug("B2g Cuda mpm dispatcher returning"); + return TM_ECODE_OK; + + error: + for (i = 0; i < pb->nop_in_buffer; i++) { + SCMutexLock(&pb->packets_address_buffer[i]->cuda_mutex); + pb->packets_address_buffer[i]->cuda_done = 1; + SCMutexUnlock(&pb->packets_address_buffer[i]->cuda_mutex); + SCCondSignal(&pb->packets_address_buffer[i]->cuda_cond); + } + SCLogError(SC_ERR_B2G_CUDA_ERROR, "B2g Cuda mpm dispatcher returning with error"); + return TM_ECODE_OK; +} + +/** + * \brief The post processing of cuda mpm b2g results for a packet + * is done here. Will be used by the detection thread. We basically + * obtain the match offsets from the cuda mpm search and carry out + * further matches on those offsets. Also if the results are not + * read for a packet, we wait on the conditional, which will then + * be signalled by the cuda mpm dispatcher thread, once the results + * for the packet are ready. + * + * \param p Pointer to the packet whose mpm cuda results are + * to be further processed. + * \param mpm_ctx Pointer to the mpm context for this packet. + * \param mpm_thread_ctx Pointer to the mpm thread context. + * \param pmq Pointer to the patter matcher queue. + * + * \retval matches Holds the no of matches. + */ +int B2gCudaResultsPostProcessing(Packet *p, MpmCtx *mpm_ctx, + MpmThreadCtx *mpm_thread_ctx, + PatternMatcherQueue *pmq) +{ + B2gCudaCtx *ctx = mpm_ctx->ctx; + + while (p->cuda_done == 0) { + SCMutexLock(&p->cuda_mutex); + if (p->cuda_done == 1) { + SCMutexUnlock(&p->cuda_mutex); + break; + } else { + SCondWait(&p->cuda_cond, &p->cuda_mutex); + SCMutexUnlock(&p->cuda_mutex); + } + } + + /* reset this flag for the packet */ + p->cuda_done = 0; + + uint16_t *no_of_matches = p->mpm_offsets; + uint16_t *host_offsets = p->mpm_offsets + 1; + int i = 0, h = 0; + uint8_t *buf = p->payload; + uint16_t buflen = p->payload_len; + int matches = 0; + for (i = 0; i < no_of_matches[0]; i++) { + h = B2G_CUDA_HASH16(u8_tolower(buf[host_offsets[i] + ctx->m - 2]), + u8_tolower(buf[host_offsets[i] + ctx->m - 1])); + + if (ctx->bloom[h] != NULL) { + COUNT(tctx->stat_pminlen_calls++); + COUNT(tctx->stat_pminlen_total+=ctx->pminlen[h]); + + if ((buflen - host_offsets[i]) < ctx->pminlen[h]) { + continue; + } else { + COUNT(tctx->stat_bloom_calls++); + + if (BloomFilterTest(ctx->bloom[h], buf + host_offsets[i], ctx->pminlen[h]) == 0) { + COUNT(tctx->stat_bloom_hits++); + + continue; + } + } + } + + B2gCudaHashItem *hi = ctx->hash[h], *thi; + for (thi = hi; thi != NULL; thi = thi->nxt) { + COUNT(tctx->stat_d0_hashloop++); + B2gCudaPattern *p = ctx->parray[thi->idx]; + + if (p->flags & MPM_PATTERN_FLAG_NOCASE) { + if ((buflen - host_offsets[i]) < p->len) { + continue; + } + + if (memcmp_lowercase(p->ci, buf + host_offsets[i], p->len) == 0) { + COUNT(tctx->stat_loop_match++); + + matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id); + } else { + COUNT(tctx->stat_loop_no_match++); + } + } else { + if (buflen - host_offsets[i] < p->len) + continue; + + if (memcmp(p->cs, buf + host_offsets[i], p->len) == 0) { + COUNT(tctx->stat_loop_match++); + + matches += MpmVerifyMatch(mpm_thread_ctx, pmq, p->id); + } else { + COUNT(tctx->stat_loop_no_match++); + } + } + } + } + + return matches; +} + +/** + * \brief Registers the Cuda B2G MPM Module. + */ void TmModuleCudaMpmB2gRegister(void) { tmm_modules[TMM_CUDA_MPM_B2G].name = "Cuda_Mpm_B2g"; @@ -1908,6 +2129,72 @@ void TmModuleCudaMpmB2gRegister(void) /***************************Code_Specific_To_Mpm_B2g***************************/ +void *CudaMpmB2gThreadsSlot1(void *td) +{ + ThreadVars *tv = (ThreadVars *)td; + Tm1Slot *s = (Tm1Slot *)tv->tm_slots; + SCCudaPBPacketsBuffer *data = NULL; + char run = 1; + TmEcode r = TM_ECODE_OK; + + /* Set the thread name */ + SCSetThreadName(tv->name); + + if (tv->thread_setup_flags != 0) + TmThreadSetupOptions(tv); + + SCLogDebug("%s starting", tv->name); + + if (s->s.SlotThreadInit != NULL) { + r = s->s.SlotThreadInit(tv, s->s.slot_initdata, &s->s.slot_data); + if (r != TM_ECODE_OK) { + EngineKill(); + + TmThreadsSetFlag(tv, THV_CLOSED); + pthread_exit((void *) -1); + } + } + memset(&s->s.slot_pq, 0, sizeof(PacketQueue)); + + TmThreadsSetFlag(tv, THV_INIT_DONE); + while(run) { + TmThreadTestThreadUnPaused(tv); + + /* input data */ + data = (SCCudaPBPacketsBuffer *)TmqhInputSimpleOnQ(&data_queues[tv->inq->id]); + + if (data == NULL) { + //printf("%s: TmThreadsSlot1: p == NULL\n", tv->name); + } else { + r = s->s.SlotFunc(tv, (Packet *)data, s->s.slot_data, &s->s.slot_pq); + /* handle error */ + + /* output the packet */ + TmqhOutputSimpleOnQ(&data_queues[tv->outq->id], (SCDQGenericQData *)data); + } + + if (TmThreadsCheckFlag(tv, THV_KILL)) { + run = 0; + } + } + + if (s->s.SlotThreadExitPrintStats != NULL) { + s->s.SlotThreadExitPrintStats(tv, s->s.slot_data); + } + + if (s->s.SlotThreadDeinit != NULL) { + r = s->s.SlotThreadDeinit(tv, s->s.slot_data); + if (r != TM_ECODE_OK) { + TmThreadsSetFlag(tv, THV_CLOSED); + pthread_exit((void *) -1); + } + } + + SCLogDebug("%s ending", tv->name); + TmThreadsSetFlag(tv, THV_CLOSED); + pthread_exit((void *) 0); +} + int B2gCudaStartDispatcherThreadRC(const char *name) { SCCudaHlModuleData *data = NULL; @@ -1935,15 +2222,17 @@ int B2gCudaStartDispatcherThreadRC(const char *name) } /* create the threads */ - tv_CMB2_RC = TmThreadCreatePacketHandler("Cuda_Mpm_B2g_RC", - "rules_content_mpm_inqueue", "simple", - NULL, NULL, - "1slot_noout"); + tv_CMB2_RC = TmThreadCreate("Cuda_Mpm_B2g_RC", + "cuda_batcher_mpm_outqueue", "simple", + "cuda_batcher_mpm_inqueue", "simple", + "custom", CudaMpmB2gThreadsSlot1, 0); if (tv_CMB2_RC == NULL) { SCLogError(SC_ERR_TM_THREADS_ERROR, "ERROR: TmThreadsCreate failed"); exit(EXIT_FAILURE); } - tv_CMB2_RC->inq->writer_cnt++; + tv_CMB2_RC->type = TVT_PPT; + tv_CMB2_RC->inq->q_type = 1; + tv_CMB2_RC->outq->q_type = 1; tm_module = TmModuleGetByName("Cuda_Mpm_B2g"); if (tm_module == NULL) { @@ -1981,28 +2270,10 @@ void B2gCudaKillDispatcherThreadRC(void) return; } -void B2gCudaPushPacketTo_tv_CMB2_RC(Packet *p) -{ - PacketQueue *q = &trans_q[tv_CMB2_RC->inq->id]; - - SCMutexLock(&q->mutex_q); - PacketEnqueue(q, p); - SCCondSignal(&q->cond_q); - SCMutexUnlock(&q->mutex_q); - - return; -} - /*********************************Unittests************************************/ #ifdef UNITTESTS -static int B2gCudaTestInitTestEnv(void) -{ - SCCudaHlRegisterModule("B2G_CUDA_TEST"); - - return 1; -} static int B2gCudaTest01(void) { @@ -2010,1207 +2281,621 @@ static int B2gCudaTest01(void) MpmThreadCtx mpm_thread_ctx; B2gCudaCtx *ctx = NULL; int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); + int module_handle = SCCudaHlRegisterModule("B2G_CUDA_TEST"); + SCCudaHlModuleData *module_data = SCCudaHlGetModuleData(module_handle); + SCCudaPBPacketsBuffer *pb = NULL; + + /* get the cuda context and push it */ + CUcontext dummy_context; + if (SCCudaHlGetCudaContext(&dummy_context, module_handle) == -1) { + SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context for the " + "module SC_RULES_CONTENT_B2G_CUDA"); + } + SCCudaCtxPushCurrent(dummy_context); memset(&mpm_ctx, 0, sizeof(MpmCtx)); B2gCudaInitCtx(&mpm_ctx, module_handle); + /* pop the context before we make further calls to the mpm cuda dispatcher */ + SCCudaCtxPopCurrent(NULL); + + B2gCudaMpmThreadCtxData *tctx = NULL; + B2gCudaMpmDispThreadInit(NULL, module_data, (void *)&tctx); ctx = mpm_ctx.ctx; - if (ctx->cuda_context == 0) + if (tctx->b2g_cuda_context == 0) goto end; - if (ctx->cuda_module == 0) + if (tctx->b2g_cuda_module == 0) goto end; - if (ctx->cuda_search_kernel == 0) + if (tctx->b2g_cuda_search_kernel == 0) goto end; if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"one", 3, 0, 0, 1, 1, 0) == -1) goto end; - if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"two", 3, 0, 0, 2, 1, 0) == -1) - goto end; - if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"three", 5, 0, 0, 3, 1, 0) == -1) - goto end; - if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"four", 4, 0, 0, 4, 1, 0) == -1) - goto end; if (B2gCudaPreparePatterns(&mpm_ctx) == -1) goto end; - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 4 /* 4 patterns */); + B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); - char *string = "onetwothreeaaaaoneaatwobbbthrbsonwehowvonwoonsldffoursadnothreewtowoneowtwo"; - result = (B2gCudaSearchBNDMq(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)string, strlen(string)) == 9); + result = 1; - end: - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} + pb = SCCudaPBAllocSCCudaPBPacketsBuffer(); + SCCudaPBPacketDataForGPU *curr_packet = (SCCudaPBPacketDataForGPU *)pb->packets_buffer; -static int B2gCudaTest02(void) -{ - MpmCtx mpm_ctx; - MpmThreadCtx mpm_thread_ctx; - B2gCudaCtx *ctx = NULL; - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - - memset(&mpm_ctx, 0, sizeof(MpmCtx)); - B2gCudaInitCtx(&mpm_ctx, module_handle); - - ctx = mpm_ctx.ctx; + char *string = "tone_one_one_one"; + curr_packet->m = ctx->m; + curr_packet->table = ctx->cuda_B2G; + curr_packet->payload_len = strlen(string); + memcpy(curr_packet->payload, string, strlen(string)); - if (ctx->cuda_context == 0) - goto end; - if (ctx->cuda_module == 0) - goto end; - if (ctx->cuda_search_kernel == 0) - goto end; + pb->nop_in_buffer = 1; + pb->packets_buffer_len = sizeof(SCCudaPBPacketDataForGPUNonPayload) + strlen(string); + pb->packets_total_payload_len = strlen(string); + pb->packets_offset_buffer[0] = 0; + pb->packets_payload_offset_buffer[0] = 0; - if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"one", 3, 0, 0, 1, 1, 0) == -1) - goto end; - if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"two", 3, 0, 0, 2, 1, 0) == -1) - goto end; - if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"three", 5, 0, 0, 3, 1, 0) == -1) - goto end; - if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"four", 4, 0, 0, 4, 1, 0) == -1) - goto end; + Packet p; + memset(&p, 0, sizeof(Packet)); + pb->packets_address_buffer[0] = &p; + p.payload_len = strlen(string); - if (B2gCudaPreparePatterns(&mpm_ctx) == -1) - goto end; - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 4 /* 4 patterns */); + B2gCudaMpmDispatcher(NULL, (Packet *)pb, tctx, NULL); - char *string = "onetwothreeaaaaoneaatwobbbthrbsonwehowvonwoonsldffoursadnothreewtowoneowtwo"; - result = (B2gCudaSearchBNDMq(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)string, strlen(string)) == 9); + result &= (p.mpm_offsets[0] == 4); + result &= (p.mpm_offsets[1] == 1); + result &= (p.mpm_offsets[2] == 5); + result &= (p.mpm_offsets[3] == 9); + result &= (p.mpm_offsets[4] == 13); end: - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); + SCCudaPBDeAllocSCCudaPBPacketsBuffer(pb); + B2gCudaMpmDispThreadDeInit(NULL, (void *)tctx); B2gCudaDestroyCtx(&mpm_ctx); + B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); return result; } -/** - * \test Test that the *AddPattern* functions work as expected. - */ -static int B2gCudaTest03(void) +static int B2gCudaTest02(void) { - MpmCtx mpm_ctx; - B2gCudaCtx *ctx = NULL; - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); + uint8_t raw_eth[] = { + 0x00, 0x25, 0x00, 0x9e, 0xfa, 0xfe, 0x00, 0x02, + 0xcf, 0x74, 0xfe, 0xe1, 0x08, 0x00, 0x45, 0x00, + 0x01, 0xcc, 0xcb, 0x91, 0x00, 0x00, 0x34, 0x06, + 0xdf, 0xa8, 0xd1, 0x55, 0xe3, 0x67, 0xc0, 0xa8, + 0x64, 0x8c, 0x00, 0x50, 0xc0, 0xb7, 0xd1, 0x11, + 0xed, 0x63, 0x81, 0xa9, 0x9a, 0x05, 0x80, 0x18, + 0x00, 0x75, 0x0a, 0xdd, 0x00, 0x00, 0x01, 0x01, + 0x08, 0x0a, 0x09, 0x8a, 0x06, 0xd0, 0x12, 0x21, + 0x2a, 0x3b, 0x48, 0x54, 0x54, 0x50, 0x2f, 0x31, + 0x2e, 0x31, 0x20, 0x33, 0x30, 0x32, 0x20, 0x46, + 0x6f, 0x75, 0x6e, 0x64, 0x0d, 0x0a, 0x4c, 0x6f, + 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, + 0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77, + 0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, + 0x65, 0x2e, 0x65, 0x73, 0x2f, 0x0d, 0x0a, 0x43, + 0x61, 0x63, 0x68, 0x65, 0x2d, 0x43, 0x6f, 0x6e, + 0x74, 0x72, 0x6f, 0x6c, 0x3a, 0x20, 0x70, 0x72, + 0x69, 0x76, 0x61, 0x74, 0x65, 0x0d, 0x0a, 0x43, + 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54, + 0x79, 0x70, 0x65, 0x3a, 0x20, 0x74, 0x65, 0x78, + 0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x20, + 0x63, 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, + 0x55, 0x54, 0x46, 0x2d, 0x38, 0x0d, 0x0a, 0x44, + 0x61, 0x74, 0x65, 0x3a, 0x20, 0x4d, 0x6f, 0x6e, + 0x2c, 0x20, 0x31, 0x34, 0x20, 0x53, 0x65, 0x70, + 0x20, 0x32, 0x30, 0x30, 0x39, 0x20, 0x30, 0x38, + 0x3a, 0x34, 0x38, 0x3a, 0x33, 0x31, 0x20, 0x47, + 0x4d, 0x54, 0x0d, 0x0a, 0x53, 0x65, 0x72, 0x76, + 0x65, 0x72, 0x3a, 0x20, 0x67, 0x77, 0x73, 0x0d, + 0x0a, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, + 0x2d, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3a, + 0x20, 0x32, 0x31, 0x38, 0x0d, 0x0a, 0x0d, 0x0a, + 0x3c, 0x48, 0x54, 0x4d, 0x4c, 0x3e, 0x3c, 0x48, + 0x45, 0x41, 0x44, 0x3e, 0x3c, 0x6d, 0x65, 0x74, + 0x61, 0x20, 0x68, 0x74, 0x74, 0x70, 0x2d, 0x65, + 0x71, 0x75, 0x69, 0x76, 0x3d, 0x22, 0x63, 0x6f, + 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x74, 0x79, + 0x70, 0x65, 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74, + 0x65, 0x6e, 0x74, 0x3d, 0x22, 0x74, 0x65, 0x78, + 0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x63, + 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, 0x75, + 0x74, 0x66, 0x2d, 0x38, 0x22, 0x3e, 0x0a, 0x3c, + 0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x33, 0x30, + 0x32, 0x20, 0x4d, 0x6f, 0x76, 0x65, 0x64, 0x3c, + 0x2f, 0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x3c, + 0x2f, 0x48, 0x45, 0x41, 0x44, 0x3e, 0x3c, 0x42, + 0x4f, 0x44, 0x59, 0x3e, 0x0a, 0x3c, 0x48, 0x31, + 0x3e, 0x33, 0x30, 0x32, 0x20, 0x4d, 0x6f, 0x76, + 0x65, 0x64, 0x3c, 0x2f, 0x48, 0x31, 0x3e, 0x0a, + 0x54, 0x68, 0x65, 0x20, 0x64, 0x6f, 0x63, 0x75, + 0x6d, 0x65, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x73, + 0x20, 0x6d, 0x6f, 0x76, 0x65, 0x64, 0x0a, 0x3c, + 0x41, 0x20, 0x48, 0x52, 0x45, 0x46, 0x3d, 0x22, + 0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77, + 0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, + 0x65, 0x2e, 0x65, 0x73, 0x2f, 0x22, 0x3e, 0x68, + 0x65, 0x72, 0x65, 0x3c, 0x2f, 0x41, 0x3e, 0x2e, + 0x0d, 0x0a, 0x3c, 0x2f, 0x42, 0x4f, 0x44, 0x59, + 0x3e, 0x3c, 0x2f, 0x48, 0x54, 0x4d, 0x4c, 0x3e, + 0x0d, 0x0a }; - memset(&mpm_ctx, 0, sizeof(MpmCtx)); - B2gCudaInitCtx(&mpm_ctx, module_handle); + int result = 0; + const char *strings[10] = { + "test_test_one", + "test_two_test", + "test_three_test", + "test_four_test", + "test_five_test", + "test_six_test", + "test_seven_test", + "test_eight_test", + "test_nine_test", + "test_ten_test"}; + /* don't shoot me for hardcoding the results. We will change this in + * sometime, by running a separate mpm on the cpu, and then hold + * the results in this temp buffer */ + int results[10][2] = { {0, 5}, + {0, 9}, + {0, 11}, + {0, 10}, + {0, 10}, + {0, 9}, + {0, 11}, + {0, 11}, + {0, 10}, + {0, 9} }; + Packet *p[10]; + SCCudaPBThreadCtx *pb_tctx = NULL; + + DecodeThreadVars dtv; + ThreadVars tv; + DetectEngineCtx *de_ctx = NULL; + + SCCudaPBPacketsBuffer *pb = NULL; + SCDQDataQueue *dq = NULL; + + char *inq_name = "cuda_batcher_mpm_inqueue"; + char *outq_name = "cuda_batcher_mpm_outqueue"; + + Tmq *tmq_outq = NULL; + Tmq *tmq_inq = NULL; + + uint32_t i = 0, j = 0; + + uint8_t no_of_pkts = 10; + + memset(&dtv, 0, sizeof(DecodeThreadVars)); + memset(&tv, 0, sizeof(ThreadVars)); + + FlowInitConfig(FLOW_QUIET); + + memset(p, 0, sizeof(p)); + for (i = 0; i < no_of_pkts; i++) { + p[i] = malloc(sizeof(Packet)); + if (p[i] == NULL) { + printf("error allocating memory\n"); + exit(EXIT_FAILURE); + } + memset(p[i], 0, sizeof(Packet)); + DecodeEthernet(&tv, &dtv, p[i], raw_eth, sizeof(raw_eth), NULL); + } - ctx = mpm_ctx.ctx; - if (ctx->cuda_context == 0) - goto end; - if (ctx->cuda_module == 0) - goto end; - if (ctx->cuda_search_kernel == 0) + de_ctx = DetectEngineCtxInit(); + if (de_ctx == NULL) { goto end; + } - if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"onee", 4, 0, 0, 1, 1, 0) == -1) - goto end; - if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"twoo", 4, 0, 0, 2, 1, 0) == -1) - goto end; - if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"three", 5, 0, 0, 3, 1, 0) == -1) - goto end; - if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"four", 4, 0, 0, 4, 1, 0) == -1) - goto end; - if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"onee", 4, 0, 0, 1, 2, 0) == -1) - goto end; + de_ctx->mpm_matcher = MPM_B2G_CUDA; + de_ctx->flags |= DE_QUIET; - if (B2gCudaPreparePatterns(&mpm_ctx) == -1) + de_ctx->sig_list = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; " + "content:test; sid:1;)"); + if (de_ctx->sig_list == NULL) { + printf("signature parsing failed\n"); goto end; + } + SigGroupBuild(de_ctx); - char *string = "one"; - result = (B2gCudaSearchBNDMq(&mpm_ctx, NULL, NULL, (uint8_t *)string, strlen(string)) == 0); - - end: - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -/** - * \test Test that the *AddPattern* functions work as expected. - */ -static int B2gCudaTest04(void) -{ - MpmCtx mpm_ctx; - MpmThreadCtx mpm_thread_ctx; - B2gCudaCtx *ctx = NULL; - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - - memset(&mpm_ctx, 0, sizeof(MpmCtx)); - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - - ctx = mpm_ctx.ctx; - if (ctx->cuda_context == 0) - goto end; - if (ctx->cuda_module == 0) - goto end; - if (ctx->cuda_search_kernel == 0) - goto end; + SCCudaPBSetUpQueuesAndBuffers(); - if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"one", 3, 0, 0, 1, 1, 0) == -1) + /* get the queues used by the batcher thread */ + tmq_inq = TmqGetQueueByName(inq_name); + if (tmq_inq == NULL) { + printf("tmq_inq NULL\n"); goto end; - if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"two", 3, 0, 0, 2, 1, 0) == -1) - goto end; - if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"three", 5, 0, 0, 3, 1, 0) == -1) - goto end; - if (B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"four", 4, 0, 0, 4, 1, 0) == -1) - goto end; - - if (B2gCudaPreparePatterns(&mpm_ctx) == -1) + } + tmq_outq = TmqGetQueueByName(outq_name); + if (tmq_outq == NULL) { + printf("tmq_outq NULL\n"); goto end; - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 4 /* 4 patterns */); - - char *string = "onetwothreeaaaaoneaatwobbbthrbsonwehowvonwfouoonsldffoursadnothreewtowoneowtwo"; - result = (B2gCudaSearchBNDMq(&mpm_ctx, &mpm_thread_ctx, - NULL, (uint8_t *)string, strlen(string)) == 9); + } result = 1; - end: - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch01(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - char *buf = "abcdefghjiklmnopqrstuvwxyz"; - - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf, strlen(buf)); - - if (cnt == 1) - result = 1; - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - - return result; -} - -static int B2gCudaTestSearch02(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - char *buf = "abcdefghjiklmnopqrstuvwxyz"; - - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abce", 4, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf, strlen(buf)); - - if (cnt == 0) - result = 1; - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - - return result; -} - -static int B2gCudaTestSearch03(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - char *buf = "abcdefghjiklmnopqrstuvwxyz"; - - /* a match each for these strings */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0); - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"bcde", 4, 0, 0, 1, 0, 0); - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"fghj", 4, 0, 0, 2, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 3 /* 3 patterns */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf, strlen(buf)); - - if (cnt == 3) - result = 1; - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - - return result; -} + /* queue state before calling the thread init function */ + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 0); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 10); -/** - * \test Test patterns longer than 'm'. M is 4 here. - */ -static int B2gCudaTestSearch04(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - char *buf = "abcdefghjiklmnopqrstuvwxyz"; + /* init the TM thread */ + SCCudaPBThreadInit(&tv, de_ctx, (void *)&pb_tctx); + SCCudaPBSetBufferPacketThreshhold(no_of_pkts); - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0); - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"bcdegh", 6, 0, 0, 1, 0, 0); - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"fghjxyz", 7, 0, 0, 2, 0, 0); + /* queue state after calling the thread init function */ + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 0); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 9); - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 3 /* 3 patterns */); + pb = pb_tctx->curr_pb; - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf, strlen(buf)); + for (i = 0; i < no_of_pkts; i++) { + p[i]->payload = (uint8_t *)strings[i]; + p[i]->payload_len = strlen(strings[i]); + SCCudaPBBatchPackets(NULL, p[i], pb_tctx, NULL); + } - if (cnt == 1) - result = 1; + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 1); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 8); - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} + result &= (pb->nop_in_buffer == no_of_pkts); -/** - * \test Case insensitive test patterns longer than 'm'. M is 4 here. - */ -static int B2gCudaTestSearch05(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - char *buf = "abcdefghjiklmnopqrstuvwxyz"; + int module_handle = SCCudaHlRegisterModule("SC_RULES_CONTENT_B2G_CUDA"); + SCCudaHlModuleData *module_data = SCCudaHlGetModuleData(module_handle); - B2gCudaAddPatternCI(&mpm_ctx, (uint8_t *)"ABCD", 4, 0, 0, 0, 0, 0); - B2gCudaAddPatternCI(&mpm_ctx, (uint8_t *)"bCdEfG", 6, 0, 0, 1, 0, 0); - B2gCudaAddPatternCI(&mpm_ctx, (uint8_t *)"fghJikl", 7, 0, 0, 2, 0, 0); + B2gCudaMpmThreadCtxData *b2g_tctx = NULL; + B2gCudaMpmDispThreadInit(NULL, module_data, (void *)&b2g_tctx); - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 3 /* 3 patterns */); + if (b2g_tctx->b2g_cuda_context == 0) + goto end; + if (b2g_tctx->b2g_cuda_module == 0) + goto end; + if (b2g_tctx->b2g_cuda_search_kernel == 0) + goto end; - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf, strlen(buf)); + B2gCudaMpmDispatcher(NULL, (Packet *)pb, b2g_tctx, NULL); - if (cnt == 3) - result = 1; + for (i = 0; i < no_of_pkts; i++) { + for (j = 0; j < p[i]->mpm_offsets[0]; j++) + result &= (results[i][j] == p[i]->mpm_offsets[j + 1]); + } - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); + end: + for (i = 0; i < no_of_pkts; i++) { + free(p[i]); + } + SCCudaPBCleanUpQueuesAndBuffers(); + if (de_ctx != NULL) { + SigGroupCleanup(de_ctx); + SigCleanSignatures(de_ctx); + DetectEngineCtxFree(de_ctx); + } + SCCudaPBThreadDeInit(NULL, (void *)pb_tctx); + B2gCudaMpmDispThreadDeInit(NULL, (void *)b2g_tctx); return result; } -static int B2gCudaTestSearch06(void) +static int B2gCudaTest03(void) { - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - char *buf = "abcd"; - - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf, strlen(buf)); - - if (cnt == 1) - result = 1; - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - - return result; -} + uint8_t raw_eth[] = { + 0x00, 0x25, 0x00, 0x9e, 0xfa, 0xfe, 0x00, 0x02, + 0xcf, 0x74, 0xfe, 0xe1, 0x08, 0x00, 0x45, 0x00, + 0x01, 0xcc, 0xcb, 0x91, 0x00, 0x00, 0x34, 0x06, + 0xdf, 0xa8, 0xd1, 0x55, 0xe3, 0x67, 0xc0, 0xa8, + 0x64, 0x8c, 0x00, 0x50, 0xc0, 0xb7, 0xd1, 0x11, + 0xed, 0x63, 0x81, 0xa9, 0x9a, 0x05, 0x80, 0x18, + 0x00, 0x75, 0x0a, 0xdd, 0x00, 0x00, 0x01, 0x01, + 0x08, 0x0a, 0x09, 0x8a, 0x06, 0xd0, 0x12, 0x21, + 0x2a, 0x3b, 0x48, 0x54, 0x54, 0x50, 0x2f, 0x31, + 0x2e, 0x31, 0x20, 0x33, 0x30, 0x32, 0x20, 0x46, + 0x6f, 0x75, 0x6e, 0x64, 0x0d, 0x0a, 0x4c, 0x6f, + 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, + 0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77, + 0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, + 0x65, 0x2e, 0x65, 0x73, 0x2f, 0x0d, 0x0a, 0x43, + 0x61, 0x63, 0x68, 0x65, 0x2d, 0x43, 0x6f, 0x6e, + 0x74, 0x72, 0x6f, 0x6c, 0x3a, 0x20, 0x70, 0x72, + 0x69, 0x76, 0x61, 0x74, 0x65, 0x0d, 0x0a, 0x43, + 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54, + 0x79, 0x70, 0x65, 0x3a, 0x20, 0x74, 0x65, 0x78, + 0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x20, + 0x63, 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, + 0x55, 0x54, 0x46, 0x2d, 0x38, 0x0d, 0x0a, 0x44, + 0x61, 0x74, 0x65, 0x3a, 0x20, 0x4d, 0x6f, 0x6e, + 0x2c, 0x20, 0x31, 0x34, 0x20, 0x53, 0x65, 0x70, + 0x20, 0x32, 0x30, 0x30, 0x39, 0x20, 0x30, 0x38, + 0x3a, 0x34, 0x38, 0x3a, 0x33, 0x31, 0x20, 0x47, + 0x4d, 0x54, 0x0d, 0x0a, 0x53, 0x65, 0x72, 0x76, + 0x65, 0x72, 0x3a, 0x20, 0x67, 0x77, 0x73, 0x0d, + 0x0a, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, + 0x2d, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3a, + 0x20, 0x32, 0x31, 0x38, 0x0d, 0x0a, 0x0d, 0x0a, + 0x3c, 0x48, 0x54, 0x4d, 0x4c, 0x3e, 0x3c, 0x48, + 0x45, 0x41, 0x44, 0x3e, 0x3c, 0x6d, 0x65, 0x74, + 0x61, 0x20, 0x68, 0x74, 0x74, 0x70, 0x2d, 0x65, + 0x71, 0x75, 0x69, 0x76, 0x3d, 0x22, 0x63, 0x6f, + 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x74, 0x79, + 0x70, 0x65, 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74, + 0x65, 0x6e, 0x74, 0x3d, 0x22, 0x74, 0x65, 0x78, + 0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x3b, 0x63, + 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, 0x75, + 0x74, 0x66, 0x2d, 0x38, 0x22, 0x3e, 0x0a, 0x3c, + 0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x33, 0x30, + 0x32, 0x20, 0x4d, 0x6f, 0x76, 0x65, 0x64, 0x3c, + 0x2f, 0x54, 0x49, 0x54, 0x4c, 0x45, 0x3e, 0x3c, + 0x2f, 0x48, 0x45, 0x41, 0x44, 0x3e, 0x3c, 0x42, + 0x4f, 0x44, 0x59, 0x3e, 0x0a, 0x3c, 0x48, 0x31, + 0x3e, 0x33, 0x30, 0x32, 0x20, 0x4d, 0x6f, 0x76, + 0x65, 0x64, 0x3c, 0x2f, 0x48, 0x31, 0x3e, 0x0a, + 0x54, 0x68, 0x65, 0x20, 0x64, 0x6f, 0x63, 0x75, + 0x6d, 0x65, 0x6e, 0x74, 0x20, 0x68, 0x61, 0x73, + 0x20, 0x6d, 0x6f, 0x76, 0x65, 0x64, 0x0a, 0x3c, + 0x41, 0x20, 0x48, 0x52, 0x45, 0x46, 0x3d, 0x22, + 0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77, + 0x77, 0x77, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, + 0x65, 0x2e, 0x65, 0x73, 0x2f, 0x22, 0x3e, 0x68, + 0x65, 0x72, 0x65, 0x3c, 0x2f, 0x41, 0x3e, 0x2e, + 0x0d, 0x0a, 0x3c, 0x2f, 0x42, 0x4f, 0x44, 0x59, + 0x3e, 0x3c, 0x2f, 0x48, 0x54, 0x4d, 0x4c, 0x3e, + 0x0d, 0x0a }; -static int B2gCudaTestSearch07(void) -{ int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - char *buf = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + const char *strings[10] = { + "test_test_one", + "test_two_test", + "test_three_test", + "test_four_test", + "test_five_test", + "test_six_test", + "test_seven_test", + "test_eight_test", + "test_nine_test", + "test_ten_test"}; + /* don't shoot me for hardcoding the results. We will change this in + * sometime, by having run a separate mpm on the cpu and then hold + * the results in a temp buffer */ + Packet *p[10]; + SCCudaPBThreadCtx *pb_tctx = NULL; + + DecodeThreadVars dtv; + ThreadVars tv; + DetectEngineCtx *de_ctx = NULL; + DetectEngineThreadCtx *det_ctx; + ThreadVars de_tv; + + SCCudaPBPacketsBuffer *pb = NULL; + SCDQDataQueue *dq = NULL; + + char *inq_name = "cuda_batcher_mpm_inqueue"; + char *outq_name = "cuda_batcher_mpm_outqueue"; + + Tmq *tmq_outq = NULL; + Tmq *tmq_inq = NULL; + + uint32_t i = 0, j = 0; + + uint8_t no_of_pkts = 10; + + Signature *sig = NULL; + + memset(&dtv, 0, sizeof(DecodeThreadVars)); + memset(&tv, 0, sizeof(ThreadVars)); + memset(&de_tv, 0, sizeof(ThreadVars)); + + FlowInitConfig(FLOW_QUIET); + for (i = 0; i < no_of_pkts; i++) { + p[i] = malloc(sizeof(Packet)); + if (p[i] == NULL) { + printf("error allocating memory\n"); + exit(EXIT_FAILURE); + } + memset(p[i], 0, sizeof(Packet)); + DecodeEthernet(&tv, &dtv, p[i], raw_eth, sizeof(raw_eth), NULL); + } - /* total matches: 135 */ - /* should match 30 times */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"A", 1, 0, 0, 0, 0, 0); - /* should match 29 times */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AA", 2, 0, 0, 1, 0, 0); - /* should match 28 times */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAA", 3, 0, 0, 2, 0, 0); - /* should match 26 times */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAAAA", 5, 0, 0, 3, 0, 0); - /* should match 21 times */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAAAAAAAAA", 10, 0, 0, 4, 0, 0); - /* should match 1 time */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 30, 0, 0, 5, 0, 0); + de_ctx = DetectEngineCtxInit(); + if (de_ctx == NULL) { + goto end; + } + de_ctx->mpm_matcher = MPM_B2G_CUDA; + de_ctx->flags |= DE_QUIET; - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 6 /* 6 patterns */); + de_ctx->sig_list = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; " + "content:test; sid:0;)"); + if (de_ctx->sig_list == NULL) { + printf("signature parsing failed\n"); + goto end; + } + sig = de_ctx->sig_list; - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf, strlen(buf)); + sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; " + "content:one; sid:1;)"); + if (sig->next == NULL) { + printf("signature parsing failed\n"); + goto end; + } + sig = sig->next; + sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; " + "content:two; sid:2;)"); + if (sig->next == NULL) { + printf("signature parsing failed\n"); + goto end; + } + sig = sig->next; - if (cnt == 135) - result = 1; + sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; " + "content:three; sid:3;)"); + if (sig->next == NULL) { + printf("signature parsing failed\n"); + goto end; + } + sig = sig->next; - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); + sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; " + "content:four; sid:4;)"); + if (sig->next == NULL) { + printf("signature parsing failed\n"); + goto end; + } + sig = sig->next; - return result; -} + sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; " + "content:five; sid:5;)"); + if (sig->next == NULL) { + printf("signature parsing failed\n"); + goto end; + } + sig = sig->next; -static int B2gCudaTestSearch08(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; + sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; " + "content:six; sid:6;)"); + if (sig->next == NULL) { + printf("signature parsing failed\n"); + goto end; + } + sig = sig->next; - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0); /* 1 match */ + sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; " + "content:seven; sid:7;)"); + if (sig->next == NULL) { + printf("signature parsing failed\n"); + goto end; + } + sig = sig->next; - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); + sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; " + "content:eight; sid:8;)"); + if (sig->next == NULL) { + printf("signature parsing failed\n"); + goto end; + } + sig = sig->next; - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)"a", 1); + sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; " + "content:nine; sid:9;)"); + if (sig->next == NULL) { + printf("signature parsing failed\n"); + goto end; + } + sig = sig->next; - if (cnt == 0) - result = 1; - else - printf("0 != %" PRIu32 " ",cnt); + sig->next = SigInit(de_ctx, "alert tcp any any -> any any (msg:\"Bamboo\"; " + "content:ten; sid:10;)"); + if (sig->next == NULL) { + printf("signature parsing failed\n"); + goto end; + } - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} + /* build the signatures */ + SigGroupBuild(de_ctx); + DetectEngineThreadCtxInit(&de_tv, (void *)de_ctx, (void *)&det_ctx); -/* we segfault with this test */ -static int B2gCudaTestSearch09(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; + SCCudaPBSetUpQueuesAndBuffers(); - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"ab", 2, 0, 0, 0, 0, 0); + /* get the queues used by the batcher thread */ + tmq_inq = TmqGetQueueByName(inq_name); + if (tmq_inq == NULL) { + printf("tmq_inq NULL\n"); + goto end; + } + tmq_outq = TmqGetQueueByName(outq_name); + if (tmq_outq == NULL) { + printf("tmq_outq NULL\n"); + goto end; + } - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); + result = 1; - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)"ab", 2); + /* queue state before calling the thread init function */ + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 0); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 10); - if (cnt == 1) - result = 1; - else - printf("1 != %" PRIu32 " ",cnt); + /* init the TM thread */ + SCCudaPBThreadInit(&tv, de_ctx, (void *)&pb_tctx); + SCCudaPBSetBufferPacketThreshhold(no_of_pkts); - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} + /* queue state after calling the thread init function */ + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 0); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 9); -static int B2gCudaTestSearch10(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; + pb = pb_tctx->curr_pb; - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcdefgh", 8, 0, 0, 0, 0, 0); /* 1 match */ + for (i = 0; i < no_of_pkts; i++) { + p[i]->payload = (uint8_t *)strings[i]; + p[i]->payload_len = strlen(strings[i]); + SCCudaPBBatchPackets(NULL, p[i], pb_tctx, NULL); + } - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); + dq = &data_queues[tmq_outq->id]; + result &= (dq->len == 1); + dq = &data_queues[tmq_inq->id]; + result &= (dq->len == 8); - char *buf = "01234567890123456789012345678901234567890123456789" - "01234567890123456789012345678901234567890123456789" - "abcdefgh" - "01234567890123456789012345678901234567890123456789" - "01234567890123456789012345678901234567890123456789"; - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf, strlen(buf)); + result &= (pb->nop_in_buffer == no_of_pkts); - if (cnt == 1) - result = 1; - else - printf("1 != %" PRIu32 " ",cnt); + int module_handle = SCCudaHlRegisterModule("SC_RULES_CONTENT_B2G_CUDA"); + SCCudaHlModuleData *module_data = SCCudaHlGetModuleData(module_handle); - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} + B2gCudaMpmThreadCtxData *b2g_tctx = NULL; + B2gCudaMpmDispThreadInit(NULL, module_data, (void *)&b2g_tctx); -static int B2gCudaTestSearch11(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; + if (b2g_tctx->b2g_cuda_context == 0) + goto end; + if (b2g_tctx->b2g_cuda_module == 0) + goto end; + if (b2g_tctx->b2g_cuda_search_kernel == 0) + goto end; - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0); - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcde", 5, 0, 0, 0, 0, 0); + B2gCudaMpmDispatcher(NULL, (Packet *)pb, b2g_tctx, NULL); - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 2 /* 2 patterns */); + for (i = 0; i < 10; i++) + SigMatchSignatures(&de_tv, de_ctx, det_ctx, p[i]); - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"abcdefghijklmnopqrstuvwxyz", 26); + for (i = 0; i < 10; i++) { + if (!PacketAlertCheck(p[i], 0)) { + result = 0; + goto end; + } + for (j = 1; j <= 10; j++) { + if (j == i + 1) { + if (!PacketAlertCheck(p[i], j)) { + result = 0; + goto end; + } + } else { + if (PacketAlertCheck(p[i], j)) { + result = 0; + goto end; + } + } + } + } - if (cnt == 2) - result = 1; - else - printf("2 != %" PRIu32 " ",cnt); + end: + for (i = 0; i < no_of_pkts; i++) { + free(p[i]); + } + SCCudaPBCleanUpQueuesAndBuffers(); + if (de_ctx) { + SigGroupCleanup(de_ctx); + SigCleanSignatures(de_ctx); + DetectEngineCtxFree(de_ctx); + } + SCCudaPBThreadDeInit(NULL, (void *)pb_tctx); + B2gCudaMpmDispThreadDeInit(NULL, (void *)b2g_tctx); - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); return result; } -static int B2gCudaTestSearch12(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"wxyz", 4, 0, 0, 0, 0, 0); /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"vwxyz", 5, 0, 0, 0, 0, 0); /* 1 match */ - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 2 /* 2 patterns */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"abcdefghijklmnopqrstuvwxyz", 26); - - if (cnt == 2) - result = 1; - else - printf("2 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch13(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcdefghijklmnopqrstuvwxyzABCD", - 30, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"abcdefghijklmnopqrstuvwxyzABCD", 30); - - if (cnt == 1) - result = 1; - else - printf("1 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch14(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcdefghijklmnopqrstuvwxyzABCDE", - 31, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"abcdefghijklmnopqrstuvwxyzABCDE", 31); - - if (cnt == 1) - result = 1; - else - printf("1 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch15(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcdefghijklmnopqrstuvwxyzABCDEF", - 32, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"abcdefghijklmnopqrstuvwxyzABCDEF", 32); - - if (cnt == 1) - result = 1; - else - printf("1 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch16(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcdefghijklmnopqrstuvwxyzABC", - 29, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"abcdefghijklmnopqrstuvwxyzABC", 29); - - if (cnt == 1) - result = 1; - else - printf("1 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch17(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcdefghijklmnopqrstuvwxyzAB", - 28, 0, 0, 0, 0, 0); /* 1 match */ - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"abcdefghijklmnopqrstuvwxyzAB", 28); - - if (cnt == 1) - result = 1; - else - printf("1 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch18(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, - (uint8_t *)"abcde""fghij""klmno""pqrst""uvwxy""z", - 26, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"abcde""fghij""klmno""pqrst""uvwxy""z", - 26); - - if (cnt == 1) - result = 1; - else - printf("1 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch19(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", - 30, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 patterns */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 30); - - if (cnt == 1) - result = 1; - else - printf("1 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch20(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 */ - B2gCudaAddPatternCS(&mpm_ctx, - (uint8_t *)"AAAAA""AAAAA""AAAAA""AAAAA""AAAAA""AAAAA""AA", - 32, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 patterns */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"AAAAA""AAAAA""AAAAA""AAAAA""AAAAA""AAAAA""AA", - 32); - - if (cnt == 1) - result = 1; - else - printf("1 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch21(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AA", 2, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 patterns */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)"AA", 2); - - if (cnt == 1) - result = 1; - else - printf("1 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - - - -static int B2gCudaTestSearch22(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"abcdefghjiklmnopqrstuvwxyz", 26); - - if (cnt == 1) - result = 1; - else - printf("1 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch23(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abce", 4, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"abcdefghjiklmnopqrstuvwxyz", 26); - - if (cnt == 0) - result = 1; - else - printf("0 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch24(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0); - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"bcde", 4, 0, 0, 1, 0, 0); - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"fghj", 4, 0, 0, 2, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 3 /* 3 patterns */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"abcdefghjiklmnopqrstuvwxyz", 26); - - if (cnt == 3) - result = 1; - else - printf("3 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -/** - * \test test patterns longer than 'm'. M is 4 here. - */ -static int B2gCudaTestSearch25(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0); - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"bcdegh", 6, 0, 0, 1, 0, 0); - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"fghjxyz", 7, 0, 0, 2, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 3 /* 3 patterns */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"abcdefghjiklmnopqrstuvwxyz", 26); - - if (cnt == 1) - result = 1; - else - printf("1 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -/** - * \test case insensitive test patterns longer than 'm'. M is 4 here. - */ -static int B2gCudaTestSearch26(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 match */ - B2gCudaAddPatternCI(&mpm_ctx, (uint8_t *)"ABCD", 4, 0, 0, 0, 0, 0); - /* 1 match */ - B2gCudaAddPatternCI(&mpm_ctx, (uint8_t *)"bCdEfG", 6, 0, 0, 1, 0, 0); - /* 1 match */ - B2gCudaAddPatternCI(&mpm_ctx, (uint8_t *)"fghJikl", 7, 0, 0, 2, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 3 /* 3 patterns */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"abcdefghjiklmnopqrstuvwxyz", 26); - - if (cnt == 3) - result = 1; - else - printf("3 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch27(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)"abcd", 4); - - if (cnt == 1) - result = 1; - else - printf("1 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch28(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* should match 30 times */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"A", 1, 0, 0, 0, 0, 0); - /* should match 29 times */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AA", 2, 0, 0, 1, 0, 0); - /* should match 28 times */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAA", 3, 0, 0, 2, 0, 0); - /* 26 */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAAAA", 5, 0, 0, 3, 0, 0); - /* 21 */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAAAAAAAAA", 10, 0, 0, 4, 0, 0); - /* 1 */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", - 30, 0, 0, 5, 0, 0); - /* total matches: 135 */ - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 6 /* 6 patterns */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 30); - - if (cnt == 135) - result = 1; - else - printf("135 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch29(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)"a", 1); - - if (cnt == 0) - result = 1; - else - printf("0 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch30(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"ab", 2, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)"ab", 2); - - if (cnt == 1) - result = 1; - else - printf("1 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -/* 1 match */ -static int B2gCudaTestSearch31(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcdefgh", 8, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 1 /* 1 pattern */); - - char *buf = "01234567890123456789012345678901234567890123456789" - "01234567890123456789012345678901234567890123456789" - "abcdefgh" - "01234567890123456789012345678901234567890123456789" - "01234567890123456789012345678901234567890123456789"; - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, (uint8_t *)buf, - strlen(buf)); - - if (cnt == 1) - result = 1; - else - printf("1 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch32(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcd", 4, 0, 0, 0, 0, 0); - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"abcde", 5, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 2 /* 2 patterns */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"abcdefghjiklmnopqrstuvwxyz", 26); - - if (cnt == 2) - result = 1; - else - printf("2 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestSearch33(void) -{ - int result = 0; - int module_handle = SCCudaHlGetModuleHandle("B2G_CUDA_TEST"); - MpmCtx mpm_ctx; - memset(&mpm_ctx, 0x00, sizeof(MpmCtx)); - MpmThreadCtx mpm_thread_ctx; - MpmInitCtx(&mpm_ctx, MPM_B2G_CUDA, module_handle); - B2gCudaCtx *ctx = (B2gCudaCtx *)mpm_ctx.ctx; - - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"wxyz", 4, 0, 0, 0, 0, 0); - /* 1 match */ - B2gCudaAddPatternCS(&mpm_ctx, (uint8_t *)"vwxyz", 5, 0, 0, 0, 0, 0); - - B2gCudaPreparePatterns(&mpm_ctx); - B2gCudaThreadInitCtx(&mpm_ctx, &mpm_thread_ctx, 2 /* 2 patterns */); - - uint32_t cnt = ctx->Search(&mpm_ctx, &mpm_thread_ctx, NULL, - (uint8_t *)"abcdefghjiklmnopqrstuvwxyz", 26); - - if (cnt == 2) - result = 1; - else - printf("2 != %" PRIu32 " ",cnt); - - B2gCudaThreadDestroyCtx(&mpm_ctx, &mpm_thread_ctx); - B2gCudaDestroyCtx(&mpm_ctx); - return result; -} - -static int B2gCudaTestDeInitTestEnv(void) -{ - CUcontext context; - if (SCCudaCtxPopCurrent(&context) == -1) - exit(EXIT_FAILURE); - SCCudaHlDeRegisterModule("B2G_CUDA_TEST"); - - return 1; -} - #endif /* UNITTESTS */ /*********************************Unittests************************************/ @@ -3218,49 +2903,9 @@ static int B2gCudaTestDeInitTestEnv(void) void B2gCudaRegisterTests(void) { #ifdef UNITTESTS - UtRegisterTest("B2gCudaTestInitTestEnv", B2gCudaTestInitTestEnv, 1); UtRegisterTest("B2gCudaTest01", B2gCudaTest01, 1); UtRegisterTest("B2gCudaTest02", B2gCudaTest02, 1); UtRegisterTest("B2gCudaTest03", B2gCudaTest03, 1); - UtRegisterTest("B2gCudaTest04", B2gCudaTest04, 1); - UtRegisterTest("B2gCudaTestSearch01", B2gCudaTestSearch01, 1); - UtRegisterTest("B2gCudaTestSearch02", B2gCudaTestSearch02, 1); - UtRegisterTest("B2gCudaTestSearch03", B2gCudaTestSearch03, 1); - UtRegisterTest("B2gCudaTestSearch04", B2gCudaTestSearch04, 1); - UtRegisterTest("B2gCudaTestSearch05", B2gCudaTestSearch05, 1); - UtRegisterTest("B2gCudaTestSearch06", B2gCudaTestSearch06, 1); - UtRegisterTest("B2gCudaTestSearch07", B2gCudaTestSearch07, 1); - UtRegisterTest("B2gCudaTestSearch08", B2gCudaTestSearch08, 1); - UtRegisterTest("B2gCudaTestSearch09", B2gCudaTestSearch09, 1); - UtRegisterTest("B2gCudaTestSearch10", B2gCudaTestSearch10, 1); - UtRegisterTest("B2gCudaTestSearch11", B2gCudaTestSearch11, 1); - UtRegisterTest("B2gCudaTestSearch12", B2gCudaTestSearch12, 1); - UtRegisterTest("B2gCudaTestSearch13", B2gCudaTestSearch13, 1); - - UtRegisterTest("B2gCudaTestSearch14", B2gCudaTestSearch14, 1); - UtRegisterTest("B2gCudaTestSearch15", B2gCudaTestSearch15, 1); - UtRegisterTest("B2gCudaTestSearch16", B2gCudaTestSearch16, 1); - UtRegisterTest("B2gCudaTestSearch17", B2gCudaTestSearch17, 1); - UtRegisterTest("B2gCudaTestSearch18", B2gCudaTestSearch18, 1); - UtRegisterTest("B2gCudaTestSearch19", B2gCudaTestSearch19, 1); - UtRegisterTest("B2gCudaTestSearch20", B2gCudaTestSearch20, 1); - UtRegisterTest("B2gCudaTestSearch21", B2gCudaTestSearch21, 1); - - UtRegisterTest("B2gCudaTestSearch22", B2gCudaTestSearch22, 1); - UtRegisterTest("B2gCudaTestSearch23", B2gCudaTestSearch23, 1); - UtRegisterTest("B2gCudaTestSearch24", B2gCudaTestSearch24, 1); - UtRegisterTest("B2gCudaTestSearch25", B2gCudaTestSearch25, 1); - UtRegisterTest("B2gCudaTestSearch26", B2gCudaTestSearch26, 1); - UtRegisterTest("B2gCudaTestSearch27", B2gCudaTestSearch27, 1); - UtRegisterTest("B2gCudaTestSearch28", B2gCudaTestSearch28, 1); - UtRegisterTest("B2gCudaTestSearch29", B2gCudaTestSearch29, 1); - UtRegisterTest("B2gCudaTestSearch30", B2gCudaTestSearch30, 1); - UtRegisterTest("B2gCudaTestSearch31", B2gCudaTestSearch31, 1); - UtRegisterTest("B2gCudaTestSearch32", B2gCudaTestSearch32, 1); - UtRegisterTest("B2gCudaTestSearch33", B2gCudaTestSearch33, 1); - /* we actually need to call this. right now we don't need this. we will - * change this in the next patch for cuda batching */ - UtRegisterTest("B2gCudaTestDeInitTestEnv", B2gCudaTestDeInitTestEnv, 1); #endif /* UNITTESTS */ } diff --git a/src/util-mpm-b2g-cuda.h b/src/util-mpm-b2g-cuda.h index 751e9d5798..a30e155ea3 100644 --- a/src/util-mpm-b2g-cuda.h +++ b/src/util-mpm-b2g-cuda.h @@ -51,7 +51,7 @@ typedef struct B2gCudaPattern_ { /* case INsensitive */ uint8_t *ci; struct B2gCudaPattern_ *next; - MpmEndMatch *em; + uint32_t id; } B2gCudaPattern; typedef struct B2gCudaHashItem_ { @@ -65,23 +65,6 @@ typedef struct B2gCudaCtx_ { * in the engine that is holding this B2g_Cuda_Ctx */ int module_handle; - CUcontext cuda_context; - CUmodule cuda_module; - - /* the search kernel */ - CUfunction cuda_search_kernel; - - /* the cuda_search_kernel argument offsets */ - uint8_t cuda_search_kernel_arg0_offset; - uint8_t cuda_search_kernel_arg1_offset; - uint8_t cuda_search_kernel_arg2_offset; - uint8_t cuda_search_kernel_arg3_offset; - uint8_t cuda_search_kernel_arg4_offset; - uint8_t cuda_search_kernel_arg5_offset; - uint8_t cuda_search_kernel_arg_total; - - /* cuda device pointer to thelower case table g_u8_lowercasetable */ - CUdeviceptr cuda_g_u8_lowercasetable; /* cuda device pointer to B2gCudaCtx->B2G */ CUdeviceptr cuda_B2G; @@ -141,17 +124,18 @@ typedef struct B2gCudaThreadCtx_ { } B2gCudaThreadCtx; void MpmB2gCudaRegister(void); - void TmModuleCudaMpmB2gRegister(void); int B2gCudaStartDispatcherThreadRC(const char *); -int B2gCudaStartDispatcherThreadAPC(const char *); - void B2gCudaKillDispatcherThreadRC(void); -void B2gCudaKillDispatcherThreadAPC(void); - -void B2gCudaPushPacketTo_tv_CMB2_RC(Packet *); -void B2gCudaPushPacketTo_tv_CMB2_APC(Packet *); +int B2gCudaResultsPostProcessing(Packet *, MpmCtx *, MpmThreadCtx *, + PatternMatcherQueue *); +uint32_t B2gCudaSearch1(MpmCtx *, MpmThreadCtx *, PatternMatcherQueue *, + uint8_t *, uint16_t); +#ifdef B2G_CUDA_SEARCH2 +uint32_t B2gCudaSearch2(MpmCtx *, MpmThreadCtx *, PatternMatcherQueue *, + uint8_t *, uint16_t); +#endif #endif /* __SC_CUDA_SUPPORT__ */