mirror of https://github.com/OISF/suricata
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
478 lines
14 KiB
C
478 lines
14 KiB
C
/* Copyright (C) 2007-2013 Open Information Security Foundation
|
|
*
|
|
* You can copy, redistribute or modify this Program under the terms of
|
|
* the GNU General Public License version 2 as published by the Free
|
|
* Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* version 2 along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
|
* 02110-1301, USA.
|
|
*/
|
|
|
|
/**
|
|
* \file
|
|
*
|
|
* \author Victor Julien <victor@inliniac.net>
|
|
*
|
|
* Memcmp implementations for SSE3, SSE4.1, SSE4.2 and TILE-Gx SIMD.
|
|
*
|
|
* Both SCMemcmp and SCMemcmpLowercase return 0 on a exact match,
|
|
* 1 on a failed match.
|
|
*/
|
|
|
|
#ifndef __UTIL_MEMCMP_H__
|
|
#define __UTIL_MEMCMP_H__
|
|
|
|
#include "util-optimize.h"
|
|
|
|
/** \brief compare two patterns, converting the 2nd to lowercase
|
|
* \warning *ONLY* the 2nd pattern is converted to lowercase
|
|
*/
|
|
static inline int SCMemcmpLowercase(void *, void *, size_t);
|
|
|
|
void MemcmpRegisterTests(void);
|
|
|
|
static inline int
|
|
MemcmpLowercase(void *s1, void *s2, size_t n) {
|
|
size_t i;
|
|
|
|
/* check backwards because we already tested the first
|
|
* 2 to 4 chars. This way we are more likely to detect
|
|
* a miss and thus speed up a little... */
|
|
for (i = n - 1; i; i--) {
|
|
if (((uint8_t *)s1)[i] != u8_tolower(*(((uint8_t *)s2)+i)))
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
#if defined(__SSE4_2__)
|
|
|
|
#include <nmmintrin.h>
|
|
|
|
static inline int SCMemcmp(void *s1, void *s2, size_t n)
|
|
{
|
|
__m128i b1, b2;
|
|
|
|
int r;
|
|
/* counter for how far we already matched in the buffer */
|
|
size_t m = 0;
|
|
|
|
do {
|
|
/* load the buffers into the 128bit vars */
|
|
b1 = _mm_loadu_si128((const __m128i *) s1);
|
|
b2 = _mm_loadu_si128((const __m128i *) s2);
|
|
|
|
/* do the actual compare */
|
|
m += (r = _mm_cmpestri(b1, n - m, b2, 16,
|
|
_SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY));
|
|
|
|
s1 += 16;
|
|
s2 += 16;
|
|
} while (r == 16);
|
|
|
|
return ((m == n) ? 0 : 1);
|
|
}
|
|
|
|
/* Range of values of uppercase characters */
|
|
static char scmemcmp_uppercase[2] __attribute__((aligned(16))) = {
|
|
'A', 'Z' };
|
|
|
|
/** \brief compare two buffers in a case insensitive way
|
|
* \param s1 buffer already in lowercase
|
|
* \param s2 buffer with mixed upper and lowercase
|
|
*/
|
|
static inline int SCMemcmpLowercase(void *s1, void *s2, size_t n)
|
|
{
|
|
__m128i b1, b2, mask;
|
|
|
|
int r;
|
|
/* counter for how far we already matched in the buffer */
|
|
size_t m = 0;
|
|
|
|
__m128i ucase = _mm_load_si128((const __m128i *) scmemcmp_uppercase);
|
|
__m128i nulls = _mm_setzero_si128();
|
|
__m128i uplow = _mm_set1_epi8(0x20);
|
|
|
|
do {
|
|
b1 = _mm_loadu_si128((const __m128i *) s1);
|
|
b2 = _mm_loadu_si128((const __m128i *) s2);
|
|
size_t len = n - m;
|
|
|
|
/* The first step is creating a mask that is FF for all uppercase
|
|
* characters, 00 for all others */
|
|
mask = _mm_cmpestrm(ucase, 2, b2, len, _SIDD_CMP_RANGES | _SIDD_UNIT_MASK);
|
|
/* Next we use that mask to create a new: this one has 0x20 for
|
|
* the uppercase chars, 00 for all other. */
|
|
mask = _mm_blendv_epi8(nulls, uplow, mask);
|
|
/* finally, merge the mask and the buffer converting the
|
|
* uppercase to lowercase */
|
|
b2 = _mm_add_epi8(b2, mask);
|
|
|
|
/* search using our converted buffer */
|
|
m += (r = _mm_cmpestri(b1, len, b2, 16,
|
|
_SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY));
|
|
|
|
s1 += 16;
|
|
s2 += 16;
|
|
} while (r == 16);
|
|
|
|
return ((m == n) ? 0 : 1);
|
|
}
|
|
|
|
#elif defined(__SSE4_1__)
|
|
|
|
#include <smmintrin.h>
|
|
|
|
#define SCMEMCMP_BYTES 16
|
|
|
|
static inline int SCMemcmp(void *s1, void *s2, size_t len) {
|
|
size_t offset = 0;
|
|
__m128i b1, b2, c;
|
|
|
|
do {
|
|
/* apparently we can't just read 16 bytes even though
|
|
* it almost always works fine :) */
|
|
if (likely(len - offset < 16)) {
|
|
return memcmp(s1, s2, len - offset) ? 1 : 0;
|
|
}
|
|
|
|
/* do unaligned loads using _mm_loadu_si128. On my Core2 E6600 using
|
|
* _mm_lddqu_si128 was about 2% slower even though it's supposed to
|
|
* be faster. */
|
|
b1 = _mm_loadu_si128((const __m128i *) s1);
|
|
b2 = _mm_loadu_si128((const __m128i *) s2);
|
|
c = _mm_cmpeq_epi8(b1, b2);
|
|
|
|
int diff = len - offset;
|
|
if (diff < 16) {
|
|
int rmask = ~(0xFFFFFFFF << diff);
|
|
|
|
if ((_mm_movemask_epi8(c) & rmask) != rmask) {
|
|
return 1;
|
|
}
|
|
} else {
|
|
if (_mm_movemask_epi8(c) != 0x0000FFFF) {
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
offset += SCMEMCMP_BYTES;
|
|
s1 += SCMEMCMP_BYTES;
|
|
s2 += SCMEMCMP_BYTES;
|
|
} while (len > offset);
|
|
|
|
return 0;
|
|
}
|
|
|
|
#define UPPER_LOW 0x40 /* "A" - 1 */
|
|
#define UPPER_HIGH 0x5B /* "Z" + 1 */
|
|
|
|
static inline int SCMemcmpLowercase(void *s1, void *s2, size_t len) {
|
|
size_t offset = 0;
|
|
__m128i b1, b2, mask1, mask2, upper1, upper2, nulls, uplow;
|
|
|
|
/* setup registers for upper to lower conversion */
|
|
upper1 = _mm_set1_epi8(UPPER_LOW);
|
|
upper2 = _mm_set1_epi8(UPPER_HIGH);
|
|
nulls = _mm_setzero_si128();
|
|
uplow = _mm_set1_epi8(0x20);
|
|
|
|
do {
|
|
/* apparently we can't just read 16 bytes even though
|
|
* it almost always works fine :) */
|
|
if (likely(len - offset < 16)) {
|
|
return MemcmpLowercase(s1, s2, len - offset);
|
|
}
|
|
|
|
/* unaligned loading of the bytes to compare */
|
|
b1 = _mm_loadu_si128((const __m128i *) s1);
|
|
b2 = _mm_loadu_si128((const __m128i *) s2);
|
|
|
|
/* mark all chars bigger than upper1 */
|
|
mask1 = _mm_cmpgt_epi8(b2, upper1);
|
|
/* mark all chars lower than upper2 */
|
|
mask2 = _mm_cmplt_epi8(b2, upper2);
|
|
/* merge the two, leaving only those that are true in both */
|
|
mask1 = _mm_cmpeq_epi8(mask1, mask2);
|
|
/* Next we use that mask to create a new: this one has 0x20 for
|
|
* the uppercase chars, 00 for all other. */
|
|
mask1 = _mm_blendv_epi8(nulls, uplow, mask1);
|
|
|
|
/* add to b2, converting uppercase to lowercase */
|
|
b2 = _mm_add_epi8(b2, mask1);
|
|
|
|
/* now all is lowercase, let's do the actual compare (reuse mask1 reg) */
|
|
mask1 = _mm_cmpeq_epi8(b1, b2);
|
|
|
|
int diff = len - offset;
|
|
if (diff < 16) {
|
|
int rmask = ~(0xFFFFFFFF << diff);
|
|
|
|
if ((_mm_movemask_epi8(mask1) & rmask) != rmask) {
|
|
return 1;
|
|
}
|
|
} else {
|
|
if (_mm_movemask_epi8(mask1) != 0x0000FFFF) {
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
offset += SCMEMCMP_BYTES;
|
|
s1 += SCMEMCMP_BYTES;
|
|
s2 += SCMEMCMP_BYTES;
|
|
} while (len > offset);
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
|
|
#elif defined(__SSE3__)
|
|
|
|
#include <pmmintrin.h> /* for SSE3 */
|
|
|
|
#define SCMEMCMP_BYTES 16
|
|
|
|
static inline int SCMemcmp(void *s1, void *s2, size_t len) {
|
|
size_t offset = 0;
|
|
__m128i b1, b2, c;
|
|
|
|
do {
|
|
/* apparently we can't just read 16 bytes even though
|
|
* it almost always works fine :) */
|
|
if (likely(len - offset < 16)) {
|
|
return memcmp(s1, s2, len - offset) ? 1 : 0;
|
|
}
|
|
|
|
/* do unaligned loads using _mm_loadu_si128. On my Core2 E6600 using
|
|
* _mm_lddqu_si128 was about 2% slower even though it's supposed to
|
|
* be faster. */
|
|
b1 = _mm_loadu_si128((const __m128i *) s1);
|
|
b2 = _mm_loadu_si128((const __m128i *) s2);
|
|
c = _mm_cmpeq_epi8(b1, b2);
|
|
|
|
int diff = len - offset;
|
|
if (diff < 16) {
|
|
int rmask = ~(0xFFFFFFFF << diff);
|
|
|
|
if ((_mm_movemask_epi8(c) & rmask) != rmask) {
|
|
return 1;
|
|
}
|
|
} else {
|
|
if (_mm_movemask_epi8(c) != 0x0000FFFF) {
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
offset += SCMEMCMP_BYTES;
|
|
s1 += SCMEMCMP_BYTES;
|
|
s2 += SCMEMCMP_BYTES;
|
|
} while (len > offset);
|
|
|
|
return 0;
|
|
}
|
|
|
|
#define UPPER_LOW 0x40 /* "A" - 1 */
|
|
#define UPPER_HIGH 0x5B /* "Z" + 1 */
|
|
#define UPPER_DELTA 0xDF /* 0xFF - 0x20 */
|
|
|
|
static inline int SCMemcmpLowercase(void *s1, void *s2, size_t len) {
|
|
size_t offset = 0;
|
|
__m128i b1, b2, mask1, mask2, upper1, upper2, delta;
|
|
|
|
/* setup registers for upper to lower conversion */
|
|
upper1 = _mm_set1_epi8(UPPER_LOW);
|
|
upper2 = _mm_set1_epi8(UPPER_HIGH);
|
|
delta = _mm_set1_epi8(UPPER_DELTA);
|
|
|
|
do {
|
|
/* apparently we can't just read 16 bytes even though
|
|
* it almost always works fine :) */
|
|
if (likely(len - offset < 16)) {
|
|
return MemcmpLowercase(s1, s2, len - offset);
|
|
}
|
|
|
|
/* unaligned loading of the bytes to compare */
|
|
b1 = _mm_loadu_si128((const __m128i *) s1);
|
|
b2 = _mm_loadu_si128((const __m128i *) s2);
|
|
|
|
/* mark all chars bigger than upper1 */
|
|
mask1 = _mm_cmpgt_epi8(b2, upper1);
|
|
/* mark all chars lower than upper2 */
|
|
mask2 = _mm_cmplt_epi8(b2, upper2);
|
|
/* merge the two, leaving only those that are true in both */
|
|
mask1 = _mm_cmpeq_epi8(mask1, mask2);
|
|
|
|
/* sub delta leaves 0x20 only for uppercase positions, the
|
|
rest is 0x00 due to the saturation (reuse mask1 reg)*/
|
|
mask1 = _mm_subs_epu8(mask1, delta);
|
|
|
|
/* add to b2, converting uppercase to lowercase */
|
|
b2 = _mm_add_epi8(b2, mask1);
|
|
|
|
/* now all is lowercase, let's do the actual compare (reuse mask1 reg) */
|
|
mask1 = _mm_cmpeq_epi8(b1, b2);
|
|
|
|
int diff = len - offset;
|
|
if (diff < 16) {
|
|
int rmask = ~(0xFFFFFFFF << diff);
|
|
|
|
if ((_mm_movemask_epi8(mask1) & rmask) != rmask) {
|
|
return 1;
|
|
}
|
|
} else {
|
|
if (_mm_movemask_epi8(mask1) != 0x0000FFFF) {
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
offset += SCMEMCMP_BYTES;
|
|
s1 += SCMEMCMP_BYTES;
|
|
s2 += SCMEMCMP_BYTES;
|
|
} while (len > offset);
|
|
|
|
return 0;
|
|
}
|
|
|
|
#elif defined(__tile__)
|
|
|
|
#include <ctype.h>
|
|
|
|
static inline int SCMemcmp(void *s1, void *s2, size_t len)
|
|
{
|
|
uint64_t b1, w1, aligned1;
|
|
uint64_t b2, w2, aligned2;
|
|
|
|
if (len == 0)
|
|
return 0;
|
|
|
|
/* Load aligned words containing the beginning of each string.
|
|
* These loads don't trigger unaligned events.
|
|
*/
|
|
w1 = __insn_ldna(s1);
|
|
w2 = __insn_ldna(s2);
|
|
/* Can't just read next 8 bytes because it might go past the end
|
|
* of a page. */
|
|
while (len > 8) {
|
|
/* Here, the buffer extends into the next word by at least one
|
|
* byte, so it is safe to read the next word. Do an aligned
|
|
* loads on the next word. Then use the two words to create
|
|
* an aligned word from each string. */
|
|
b1 = __insn_ldna(s1 + 8);
|
|
b2 = __insn_ldna(s2 + 8);
|
|
aligned1 = __insn_dblalign(w1, b1, s1);
|
|
aligned2 = __insn_dblalign(w2, b2, s2);
|
|
if (aligned1 != aligned2)
|
|
return 1;
|
|
|
|
/* Move forward one word (8 bytes) */
|
|
w1 = b1;
|
|
w2 = b2;
|
|
len -= 8;
|
|
s1 += 8;
|
|
s2 += 8;
|
|
}
|
|
/* Process the last up-to 8 bytes. */
|
|
do {
|
|
if (*(char*)s1 != *(char*)s2)
|
|
return 1;
|
|
s1++;
|
|
s2++;
|
|
len--;
|
|
} while (len);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/** \brief Convert 8 characters to lower case using SIMD.
|
|
* \param Word containing the 8 bytes.
|
|
* \return Word containing 8-bytes each converted to lowercase.
|
|
*/
|
|
static inline uint64_t
|
|
vec_tolower(uint64_t cc)
|
|
{
|
|
/* For Uppercases letters, add 32 to convert to lower case. */
|
|
uint64_t less_than_eq_Z = __insn_v1cmpltui (cc, 'Z' + 1);
|
|
uint64_t less_than_A = __insn_v1cmpltui (cc, 'A');
|
|
uint64_t is_upper = __insn_v1cmpne (less_than_eq_Z, less_than_A);
|
|
return __insn_v1add (cc,__insn_v1shli (is_upper, 5));
|
|
}
|
|
|
|
/** \brief compare two buffers in a case insensitive way
|
|
* \param s1 buffer already in lowercase
|
|
* \param s2 buffer with mixed upper and lowercase
|
|
*/
|
|
static inline int SCMemcmpLowercase(void *s1, void *s2, size_t len)
|
|
{
|
|
uint64_t b1, w1, aligned1;
|
|
uint64_t b2, w2, aligned2;
|
|
|
|
if (len == 0)
|
|
return 0;
|
|
|
|
/* TODO Check for already aligned cases. To optimize. */
|
|
|
|
/* Load word containing the beginning of each string.
|
|
* These loads don't trigger unaligned events.
|
|
*/
|
|
w1 = __insn_ldna(s1);
|
|
w2 = __insn_ldna(s2);
|
|
/* Can't just read next 8 bytes because it might go past the end
|
|
* of a page. */
|
|
while (len > 8) {
|
|
/* Here, the buffer extends into the next word by at least one
|
|
* byte, so it is safe to read the next word. Do aligned
|
|
* loads on next word. Then use the two words to create an
|
|
* aligned word from each string. */
|
|
b1 = __insn_ldna(s1 + 8);
|
|
b2 = __insn_ldna(s2 + 8);
|
|
aligned1 = __insn_dblalign(w1, b1, s1);
|
|
aligned2 = vec_tolower(__insn_dblalign(w2, b2, s2));
|
|
if (aligned1 != aligned2)
|
|
return 1;
|
|
|
|
/* Move forward one word (8 bytes) */
|
|
w1 = b1;
|
|
w2 = b2;
|
|
len -= 8;
|
|
s1 += 8;
|
|
s2 += 8;
|
|
}
|
|
|
|
do {
|
|
if (*(char*)s1 != tolower(*(char*)s2))
|
|
return 1;
|
|
s1++;
|
|
s2++;
|
|
len--;
|
|
} while (len);
|
|
|
|
return 0;
|
|
}
|
|
|
|
#else
|
|
|
|
/* No SIMD support, fall back to plain memcmp and a home grown lowercase one */
|
|
|
|
/* wrapper around memcmp to match the retvals of the SIMD implementations */
|
|
#define SCMemcmp(a,b,c) ({ \
|
|
memcmp((a), (b), (c)) ? 1 : 0; \
|
|
})
|
|
|
|
static inline int SCMemcmpLowercase(void *s1, void *s2, size_t len) {
|
|
return MemcmpLowercase(s1, s2, len);
|
|
}
|
|
|
|
#endif /* SIMD */
|
|
|
|
#endif /* __UTIL_MEMCMP_H__ */
|
|
|