From 784843b14636f6873596b39a433f079cc13078a1 Mon Sep 17 00:00:00 2001 From: Ken Steele Date: Wed, 31 Jul 2013 23:15:46 -0400 Subject: [PATCH] Use Tilera SIMD for Signature matching ala SSE3 Makes use of 8-wide byte compare instructions in signature matching. For allocating aligned memory, _mm_malloc() is SSE only, so added check for __tile__ to use memalign() instead. Shows a 13% speed up. --- src/detect-engine-siggroup.c | 10 ++-- src/detect.c | 92 ++++++++++++++++++++++++++++-------- src/detect.h | 4 +- src/util-mem.h | 10 +++- 4 files changed, 89 insertions(+), 27 deletions(-) diff --git a/src/detect-engine-siggroup.c b/src/detect-engine-siggroup.c index ee5ee8cb65..2c21d4d00b 100644 --- a/src/detect-engine-siggroup.c +++ b/src/detect-engine-siggroup.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2010 Open Information Security Foundation +/* Copyright (C) 2007-2013 Open Information Security Foundation * * You can copy, redistribute or modify this Program under the terms of * the GNU General Public License version 2 as published by the Free @@ -175,7 +175,7 @@ void SigGroupHeadFree(SigGroupHead *sgh) PatternMatchDestroyGroup(sgh); -#if defined(__SSE3__) +#if defined(__SSE3__) || defined(__tile__) if (sgh->mask_array != NULL) { /* mask is aligned */ SCFreeAligned(sgh->mask_array); @@ -1690,7 +1690,7 @@ int SigGroupHeadBuildHeadArray(DetectEngineCtx *de_ctx, SigGroupHead *sgh) return 0; BUG_ON(sgh->head_array != NULL); -#if defined(__SSE3__) +#if defined(__SSE3__) || defined(__tile__) BUG_ON(sgh->mask_array != NULL); /* mask array is 16 byte aligned for SIMD checking, also we always @@ -1706,7 +1706,7 @@ int SigGroupHeadBuildHeadArray(DetectEngineCtx *de_ctx, SigGroupHead *sgh) } #endif /* __WORDSIZE */ - sgh->mask_array = SCMallocAligned((cnt * sizeof(SignatureMask)), 16); + sgh->mask_array = (SignatureMask *)SCMallocAligned((cnt * sizeof(SignatureMask)), 16); if (sgh->mask_array == NULL) return -1; @@ -1732,7 +1732,7 @@ int SigGroupHeadBuildHeadArray(DetectEngineCtx 
*de_ctx, SigGroupHead *sgh) sgh->head_array[idx].hdr_copy3 = s->hdr_copy3; sgh->head_array[idx].full_sig = s; -#if defined(__SSE3__) +#if defined(__SSE3__) || defined(__tile__) sgh->mask_array[idx] = s->mask; #endif idx++; diff --git a/src/detect.c b/src/detect.c index a6ecc794d3..b22d4b49e0 100644 --- a/src/detect.c +++ b/src/detect.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2011 Open Information Security Foundation +/* Copyright (C) 2007-2013 Open Information Security Foundation * * You can copy, redistribute or modify this Program under the terms of * the GNU General Public License version 2 as published by the Free @@ -576,7 +576,7 @@ static inline int SigMatchSignaturesBuildMatchArrayAddSignature(DetectEngineThre * On 64 bit systems we inspect in 64 sig batches, creating a u64 with flags. * The size of a register is leading here. */ -static inline void SigMatchSignaturesBuildMatchArraySIMD(DetectEngineThreadCtx *det_ctx, +static inline void SigMatchSignaturesBuildMatchArray(DetectEngineThreadCtx *det_ctx, Packet *p, SignatureMask mask, uint16_t alproto) { uint32_t u; @@ -712,28 +712,70 @@ static inline void SigMatchSignaturesBuildMatchArraySIMD(DetectEngineThreadCtx * #error Wordsize (__WORDSIZE) neither 32 or 64. #endif } -#endif /* defined(__SSE3__) */ + /* end defined(__SSE3__) */ +#elif defined(__tile__) -static inline void SigMatchSignaturesBuildMatchArrayNoSIMD(DetectEngineThreadCtx *det_ctx, +/** + * \brief SIMD implementation of mask prefiltering for TILE-Gx + * + * Mass mask matching is done creating a bitmap of signatures that need + * further inspection. + */ +static inline void SigMatchSignaturesBuildMatchArray(DetectEngineThreadCtx *det_ctx, Packet *p, SignatureMask mask, uint16_t alproto) { uint32_t u; + register uint64_t bm; /* bit mask, 64 bits used */ - /* reset previous run */ - det_ctx->match_array_cnt = 0; + /* Keep local copies of variables that don't change during this function. 
*/ + uint64_t *mask_vector = (uint64_t*)det_ctx->sgh->mask_array; + uint32_t sig_cnt = det_ctx->sgh->sig_cnt; + SignatureHeader *head_array = det_ctx->sgh->head_array; + + Signature **match_array = det_ctx->match_array; + uint32_t match_count = 0; + + /* Replicate the packet mask into each byte of the vector. */ + uint64_t pm = __insn_shufflebytes(mask, 0, 0); + + /* u is the signature index. */ + for (u = 0; u < sig_cnt; u += 8) { + /* Load 8 masks */ + uint64_t sm = *mask_vector++; + /* Binary AND 8 masks with the packet's mask */ + uint64_t r1 = pm & sm; + /* Compare the result with the original mask + * Result if equal puts a 1 in LSB of bytes that match. + */ + bm = __insn_v1cmpeq(sm, r1); + + /* Check the LSB bit of each byte in the bit map. Little endian is assumed, + * so the LSB byte is index 0. Uses count trailing zeros to find least + * significant bit that is set. */ + while (bm) { + /* Find first bit set starting from LSB. */ + unsigned int first_bit = __insn_ctz(bm); + unsigned int first_byte = first_bit >> 3; + unsigned int x = u + first_byte; + if (x >= sig_cnt) + break; + SignatureHeader *s = &head_array[x]; + + /* Clear the first bit set, so it is not found again. 
*/ + bm -= (1UL << first_bit); - for (u = 0; u < det_ctx->sgh->sig_cnt; u++) { - SignatureHeader *s = &det_ctx->sgh->head_array[u]; - if ((mask & s->mask) == s->mask) { if (SigMatchSignaturesBuildMatchArrayAddSignature(det_ctx, p, s, alproto) == 1) { /* okay, store it */ - det_ctx->match_array[det_ctx->match_array_cnt] = s->full_sig; - det_ctx->match_array_cnt++; + *match_array++ = s->full_sig; + match_count++; } } } + det_ctx->match_array_cnt = match_count; } - +/* end defined(__tile__) */ +#else +/* No SIMD implementation */ /** * \brief build an array of signatures that will be inspected * @@ -745,15 +787,27 @@ static inline void SigMatchSignaturesBuildMatchArrayNoSIMD(DetectEngineThreadCtx * \param mask Packets mask * \param alproto application layer protocol */ -static void SigMatchSignaturesBuildMatchArray(DetectEngineThreadCtx *det_ctx, - Packet *p, SignatureMask mask, uint16_t alproto) +static inline void SigMatchSignaturesBuildMatchArray(DetectEngineThreadCtx *det_ctx, + Packet *p, SignatureMask mask, + uint16_t alproto) { -#if defined(__SSE3__) - SigMatchSignaturesBuildMatchArraySIMD(det_ctx, p, mask, alproto); -#else - SigMatchSignaturesBuildMatchArrayNoSIMD(det_ctx, p, mask, alproto); -#endif + uint32_t u; + + /* reset previous run */ + det_ctx->match_array_cnt = 0; + + for (u = 0; u < det_ctx->sgh->sig_cnt; u++) { + SignatureHeader *s = &det_ctx->sgh->head_array[u]; + if ((mask & s->mask) == s->mask) { + if (SigMatchSignaturesBuildMatchArrayAddSignature(det_ctx, p, s, alproto) == 1) { + /* okay, store it */ + det_ctx->match_array[det_ctx->match_array_cnt] = s->full_sig; + det_ctx->match_array_cnt++; + } + } + } } +#endif /* No SIMD implementation */ int SigMatchSignaturesRunPostMatch(ThreadVars *tv, DetectEngineCtx *de_ctx, DetectEngineThreadCtx *det_ctx, Packet *p, diff --git a/src/detect.h b/src/detect.h index f4afcecc6d..559363b23c 100644 --- a/src/detect.h +++ b/src/detect.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2011 Open Information Security 
Foundation +/* Copyright (C) 2007-2013 Open Information Security Foundation * * You can copy, redistribute or modify this Program under the terms of * the GNU General Public License version 2 as published by the Free @@ -949,7 +949,7 @@ typedef struct SigGroupHead_ { /** array of masks, used to check multiple masks against * a packet using SIMD. */ -#if defined(__SSE3__) +#if defined(__SSE3__) || defined(__tile__) SignatureMask *mask_array; #endif /** chunk of memory containing the "header" part of each diff --git a/src/util-mem.h b/src/util-mem.h index 61c0328fab..1379b83178 100644 --- a/src/util-mem.h +++ b/src/util-mem.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2007-2010 Open Information Security Foundation +/* Copyright (C) 2007-2013 Open Information Security Foundation * * You can copy, redistribute or modify this Program under the terms of * the GNU General Public License version 2 as published by the Free @@ -36,6 +36,14 @@ #include "mm_malloc.h" #endif +#if defined(__tile__) +/* Need to define _mm_ function alternatives, since these are SSE only. + */ +#include <malloc.h> +#define _mm_malloc(a,b) memalign((b),(a)) +#define _mm_free(a) free((a)) +#endif /* defined(__tile__) */ + SC_ATOMIC_EXTERN(unsigned int, engine_stage); /* Use this only if you want to debug memory allocation and free()