diff --git a/configure.in b/configure.in
index 0376eb2b66..5412fd2b9f 100644
--- a/configure.in
+++ b/configure.in
@@ -786,6 +786,9 @@ AC_CHECK_HEADER(pcap.h,,[AC_ERROR(pcap.h not found ...)])
         AC_ARG_WITH(cuda_libraries,
                 [ --with-cuda-libraries=DIR cuda library directory],
                 [with_cuda_libraries="$withval"],[with_cuda_libraries="no"])
+        AC_ARG_WITH(cuda_nvcc,
+                [ --with-cuda-nvcc=DIR cuda nvcc compiler directory],
+                [with_cuda_nvcc="$withval"],[with_cuda_nvcc=no])
 
         CFLAGS="${CFLAGS} -D__SC_CUDA_SUPPORT__"
 
@@ -799,6 +802,12 @@ AC_CHECK_HEADER(pcap.h,,[AC_ERROR(pcap.h not found ...)])
             LDFLAGS="${LDFLAGS} -L${with_cuda_libraries}"
         fi
 
+        if test "$with_cuda_nvcc" != "no"; then
+            NVCC_DIR="${with_cuda_nvcc}"
+        else
+            NVCC_DIR="/usr/local/cuda/bin"
+        fi
+
         AC_CHECK_HEADER(cuda.h,,[AC_ERROR(cuda.h not found ...)])
 
         LIBCUDA=""
@@ -809,7 +818,24 @@ AC_CHECK_HEADER(pcap.h,,[AC_ERROR(pcap.h not found ...)])
             echo
             exit 1
         fi
+
+        AC_PATH_PROG([NVCC], [nvcc], no, [$PATH:$NVCC_DIR])
+        if test "$NVCC" = "no"; then
+            echo
+            echo " ERROR! CUDA nvcc compiler not found: use --with-cuda-nvcc=DIR"
+            echo
+            exit 1
+        fi
+
+        AC_PATH_PROG([PYTHON], [python], no)
+        if test "$PYTHON" = "no"; then
+            echo
+            echo " ERROR! Compiling CUDA source requires python interpreter"
+            echo
+            exit 1
+        fi
     ])
+    AM_CONDITIONAL([BUILD_CUDA], [test "${NVCC}" != ""])
 
 # Check for libcap-ng
 
diff --git a/src/Makefile.am b/src/Makefile.am
index 6d22a2a6ff..47061041e1 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -268,6 +268,40 @@ suricata_LDADD = $(top_builddir)/libhtp/htp/libhtp.la
 INCLUDES += -I$(top_srcdir)/libhtp
 endif
 
+# Rules to build CUDA ptx modules
+if BUILD_CUDA
+BUILT_SOURCES = cuda-ptxdump.h
+suricata_SOURCES += cuda-ptxdump.h
+
+suricata_CUDA_KERNELS = \
+util-mpm-b2g-cuda-kernel.cu
+
+SMVERSIONS = 10 11 12 13 20
+NVCCFLAGS=-O2
+
+# FIXME
+PTXS =
+PTXS += $(suricata_CUDA_KERNELS:.cu=_sm_10.ptx)
+PTXS += $(suricata_CUDA_KERNELS:.cu=_sm_11.ptx)
+PTXS += $(suricata_CUDA_KERNELS:.cu=_sm_12.ptx)
+PTXS += $(suricata_CUDA_KERNELS:.cu=_sm_13.ptx)
+PTXS += $(suricata_CUDA_KERNELS:.cu=_sm_20.ptx)
+
+# template to build for different compute capabilities
+define BUILDTEMPLATE
+# PTXS += $(patsubst %.cu, %_sm_$(1).ptx, $(suricata_CUDA_KERNELS))
+%_sm_$(1).ptx: %.cu
+	$(NVCC) $(NVCCFLAGS) -o $$@ -arch=sm_$(1) -ptx $$<
+endef
+$(foreach SMVER,$(SMVERSIONS),$(eval $(call BUILDTEMPLATE,$(SMVER))))
+
+cuda-ptxdump.h: $(PTXS)
+	$(PYTHON) $(srcdir)/ptxdump.py cuda-ptxdump $(PTXS)
+
+CLEANFILES = $(PTXS) cuda-ptxdump.h
+endif
+
+
 #suricata_CFLAGS = -Wall -fno-strict-aliasing
 
 if BUILD_UNITTESTS
diff --git a/src/ptxdump.py b/src/ptxdump.py
new file mode 100644
index 0000000000..1b31a35769
--- /dev/null
+++ b/src/ptxdump.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+from string import *
+import os, commands, getopt, sys, platform
+
+header = '''/* Auto-generated by ptxdump.py DO NOT EDIT
+*
+* This file contains the ptx code of the Cuda kernels.
+* A kernel is identified by its name and the compute capability (e.g. _sm_10).
+*/
+'''
+
+def FormatCharHex(d):
+    s = hex(ord(d))
+    if len(s) == 3:
+        s = "0x0" + s[2]
+    return s
+
+def CleanFileName(f):
+    v = replace(f,"-","_")
+    v = replace(v,".ptx","")
+    return v
+
+if not(len(sys.argv[1:]) >= 2):
+    print "Usage: ptxdump.py <output> <in.ptx> [<in.ptx> ...]"
+    print "Description: creates a header file containing the ptx files as character array" + os.linesep
+    sys.exit(1)
+
+out_h = sys.argv[1] + ".h"
+out = open(out_h, 'w')
+
+out.writelines(header)
+out.writelines("#ifdef __SC_CUDA_SUPPORT__ \n")
+out.writelines("#ifndef __ptxdump_h__ \n")
+out.writelines("#define __ptxdump_h__ \n\n")
+
+# write char arrays
+for file in sys.argv[2:]:
+    in_ptx = open(file, 'r')
+    source = in_ptx.read()
+    in_ptx.close()
+    source_len = len(source)
+
+    varname = CleanFileName(file)
+
+    out.writelines("const unsigned char " + varname + "[" + str(source_len+1) + "] = {\n")
+    newlinecnt = 0
+    for i in range(0, source_len):
+        out.write(FormatCharHex(source[i]) + ", ")
+        newlinecnt += 1
+        if newlinecnt == 16:
+            newlinecnt = 0
+            out.write("\n")
+    out.write("0x00\n};\n")
+
+    print(sys.argv[0] + ": CUmodule " + varname + " packed successfully")
+
+# write retrieval function
+out.writelines("const unsigned char* SCCudaPtxDumpGetModule(const char* module){\n");
+for file in sys.argv[2:]:
+    out.writelines('\tif (!strcmp(module, "'+replace(file,".ptx","")+'"))\n')
+    out.writelines("\t\treturn "+CleanFileName(file)+";\n")
+out.writelines('\tSCLogError(SC_ERR_FATAL, "Error in SCCudaPtxDumpGetModule, module %s not found. Exiting...",module);\n')
+out.writelines("\texit(EXIT_FAILURE);\n")
+out.writelines("}\n")
+
+out.writelines("#endif // __ptxdump_h__ \n")
+out.writelines("#endif // __SC_CUDA_SUPPORT__\n")
+
+print(sys.argv[0] + ": " + out_h + " written successfully")
+
+out.close()
diff --git a/src/util-cuda-handlers.c b/src/util-cuda-handlers.c
index 022ee21ed7..2204a3aacb 100644
--- a/src/util-cuda-handlers.c
+++ b/src/util-cuda-handlers.c
@@ -74,6 +74,9 @@
 /* macros decides if cuda is enabled for the platform or not */
 #ifdef __SC_CUDA_SUPPORT__
 
+/* file only exists if cuda is enabled */
+#include "cuda-ptxdump.h"
+
 static SCCudaHlModuleData *module_data = NULL;
 static uint8_t module_handle = 1;
 
@@ -417,8 +420,8 @@ int SCCudaHlGetCudaContext(CUcontext *p_context, char *cuda_profile, int handle)
  * and associated with this handle and the cuda_module is returned
  * in the argument.
  *
- * \param p_module Pointer to a cuda module instance that should be updated
- * with a cuda module.
+ * \param p_module The loaded CUmodule that is returned.
+ * \param ptx_image Name of the module source file, w/o the .cu extension
  * \param handle A unique handle which identifies a module. Obtained from
  * a call to SCCudaHlGetUniqueHandle().
  *
@@ -464,11 +467,43 @@ int SCCudaHlGetCudaModule(CUmodule *p_module, const char *ptx_image, int handle)
     }
     memset(new_module_cumodule, 0, sizeof(SCCudaHlModuleCUmodule));
 
-    /* Create a cuda module, update the module with this cuda module reference
-     * and then return the module reference back to the calling function using
+    /* select the ptx image based on the compute capability supported by all
+     * devices (i.e. the lowest) */
+    /* room for "_sm_", two ints of up to 11 characters each and the NUL */
+    size_t image_size = strlen(ptx_image) + 4 + 2 * 11 + 1;
+    char *image = malloc(image_size);
+    if (image == NULL) {
+        SCLogError(SC_ERR_FATAL, "Error allocating memory for the ptx image name");
+        goto error;
+    }
+    memset(image, 0x0, image_size);
+
+    int major = INT_MAX;
+    int minor = INT_MAX;
+    SCCudaDevices *devices = SCCudaGetDeviceList();
+    int i=0;
+    for (; i < devices->count; i++){
+        if (devices->devices[i]->major_rev < major){
+            major = devices->devices[i]->major_rev;
+            minor = devices->devices[i]->minor_rev;
+        }
+        if (devices->devices[i]->major_rev == major &&
+            devices->devices[i]->minor_rev < minor){
+            minor = devices->devices[i]->minor_rev;
+        }
+    }
+    snprintf(image, image_size, "%s_sm_%d%d", ptx_image, major, minor);
+
+    /* we don't have a cuda module associated with this module. Create a
+     * cuda module, update the module with this cuda module reference and
+     * then return the module reference back to the calling function using
      * the argument */
-    if (SCCudaModuleLoadData(p_module, (void *)ptx_image) == -1)
+    SCLogDebug("Loading kernel module: %s\n",image);
+    if (SCCudaModuleLoadData(p_module, (void *)SCCudaPtxDumpGetModule(image)) == -1) {
+        free(image);
         goto error;
+    }
+    free(image);
 
     new_module_cumodule->cuda_module = p_module[0];
     new_module_cumodule->cuda_module_handle = SCCudaHlGetUniqueHandle();
diff --git a/src/util-mpm-b2g-cuda.c b/src/util-mpm-b2g-cuda.c
index f8831f29a0..7e0c72e4d9 100644
--- a/src/util-mpm-b2g-cuda.c
+++ b/src/util-mpm-b2g-cuda.c
@@ -98,328 +98,6 @@ void B2gCudaPrintInfo(MpmCtx *);
 void B2gCudaPrintSearchStats(MpmThreadCtx *);
 void B2gCudaRegisterTests(void);
 
-#if defined(__x86_64__) || defined(__ia64__)
-const char *b2g_cuda_ptx_image_64_bit =
-    " .version 1.4\n"
-    " .target sm_10, map_f64_to_f32\n"
-    " .entry B2gCudaSearchBNDMq (\n"
-    " .param .u64 __cudaparm_B2gCudaSearchBNDMq_results_buffer,\n"
-    " .param .u64 __cudaparm_B2gCudaSearchBNDMq_packets_buffer,\n"
-    " .param .u64 __cudaparm_B2gCudaSearchBNDMq_packets_offset_buffer,\n"
-    " 
.param .u64 __cudaparm_B2gCudaSearchBNDMq_packets_payload_offset_buffer,\n" - " .param .u32 __cudaparm_B2gCudaSearchBNDMq_nop,\n" - " .param .u64 __cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable)\n" - " {\n" - " .reg .u16 %rh<7>;\n" - " .reg .u32 %r<38>;\n" - " .reg .u64 %rd<41>;\n" - " .reg .pred %p<10>;\n" - " .loc 3 36 0\n" - "$LBB1_B2gCudaSearchBNDMq:\n" - " mov.u16 %rh1, %ctaid.x;\n" - " mul.wide.u16 %r1, %rh1, 32;\n" - " cvt.u32.u16 %r2, %tid.x;\n" - " add.u32 %r3, %r2, %r1;\n" - " ld.param.u32 %r4, [__cudaparm_B2gCudaSearchBNDMq_nop];\n" - " setp.gt.u32 %p1, %r4, %r3;\n" - " @%p1 bra $Lt_0_5634;\n" - " bra.uni $LBB17_B2gCudaSearchBNDMq;\n" - "$Lt_0_5634:\n" - " .loc 3 45 0\n" - " cvt.u64.u32 %rd1, %r3;\n" - " mul.lo.u64 %rd2, %rd1, 4;\n" - " ld.param.u64 %rd3, [__cudaparm_B2gCudaSearchBNDMq_packets_offset_buffer];\n" - " add.u64 %rd4, %rd3, %rd2;\n" - " ld.global.u32 %r5, [%rd4+0];\n" - " cvt.u64.u32 %rd5, %r5;\n" - " ld.param.u64 %rd6, [__cudaparm_B2gCudaSearchBNDMq_packets_buffer];\n" - " add.u64 %rd7, %rd5, %rd6;\n" - " .loc 3 46 0\n" - " ld.global.u32 %r6, [%rd7+0];\n" - " .loc 3 48 0\n" - " ld.global.u32 %r7, [%rd7+8];\n" - " .loc 3 49 0\n" - " ld.global.u32 %r8, [%rd7+4];\n" - " cvt.u64.u32 %rd8, %r8;\n" - " .loc 3 50 0\n" - " sub.u32 %r9, %r6, 1;\n" - " mov.s32 %r10, %r9;\n" - " .loc 3 56 0\n" - " ld.param.u64 %rd9, [__cudaparm_B2gCudaSearchBNDMq_results_buffer];\n" - " ld.param.u64 %rd10, [__cudaparm_B2gCudaSearchBNDMq_packets_payload_offset_buffer];\n" - " add.u64 %rd11, %rd10, %rd2;\n" - " ld.global.u32 %r11, [%rd11+0];\n" - " cvt.u64.u32 %rd12, %r11;\n" - " add.u64 %rd13, %rd12, %rd1;\n" - " mul.lo.u64 %rd14, %rd13, 2;\n" - " add.u64 %rd15, %rd9, %rd14;\n" - " sub.u32 %r12, %r7, 1;\n" - " setp.gt.u32 %p2, %r9, %r12;\n" - " mov.u32 %r13, 0;\n" - " @%p2 bra $Lt_0_9474;\n" - " add.u64 %rd16, %rd7, 12;\n" - " add.u64 %rd17, %rd15, 2;\n" - " ld.param.u64 %rd18, [__cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable];\n" - "$Lt_0_6658:\n" - " // Loop body 
line 66\n" - " .loc 3 66 0\n" - " cvt.u64.u32 %rd19, %r10;\n" - " add.u64 %rd20, %rd19, %rd7;\n" - " ld.global.u8 %rh2, [%rd20+12];\n" - " cvt.u64.u8 %rd21, %rh2;\n" - " add.u64 %rd22, %rd21, %rd18;\n" - " ld.global.u8 %r14, [%rd22+0];\n" - " ld.global.u8 %rh3, [%rd20+11];\n" - " cvt.u64.u8 %rd23, %rh3;\n" - " add.u64 %rd24, %rd23, %rd18;\n" - " ld.global.u8 %r15, [%rd24+0];\n" - " shl.b32 %r16, %r15, 4;\n" - " or.b32 %r17, %r14, %r16;\n" - " cvt.u64.u32 %rd25, %r17;\n" - " mul.lo.u64 %rd26, %rd25, 4;\n" - " add.u64 %rd27, %rd8, %rd26;\n" - " ld.global.u32 %r18, [%rd27+0];\n" - " mov.u32 %r19, 0;\n" - " setp.eq.u32 %p3, %r18, %r19;\n" - " @%p3 bra $Lt_0_258;\n" - " // Part of loop body line 66, head labeled $Lt_0_6658\n" - " .loc 3 69 0\n" - " mov.s32 %r20, %r10;\n" - " .loc 3 70 0\n" - " sub.u32 %r21, %r10, %r6;\n" - " add.u32 %r22, %r21, 1;\n" - " sub.s32 %r23, %r6, 1;\n" - "$Lt_0_7682:\n" - " // Loop body line 73\n" - " .loc 3 73 0\n" - " sub.u32 %r20, %r20, 1;\n" - " shr.u32 %r24, %r18, %r23;\n" - " mov.u32 %r25, 0;\n" - " setp.eq.u32 %p4, %r24, %r25;\n" - " @%p4 bra $Lt_0_8450;\n" - " // Part of loop body line 73, head labeled $Lt_0_7682\n" - " setp.le.u32 %p5, %r20, %r22;\n" - " @%p5 bra $Lt_0_8706;\n" - " // Part of loop body line 73, head labeled $Lt_0_7682\n" - " .loc 3 76 0\n" - " mov.s32 %r10, %r20;\n" - " bra.uni $Lt_0_8450;\n" - "$Lt_0_8706:\n" - " // Part of loop body line 73, head labeled $Lt_0_7682\n" - " .loc 3 78 0\n" - " mov.s32 %r26, %r13;\n" - " add.u32 %r27, %r26, 1;\n" - " cvt.u16.u32 %r13, %r27;\n" - " cvt.u64.u32 %rd28, %r26;\n" - " mul.lo.u64 %rd29, %rd28, 2;\n" - " add.u64 %rd30, %rd15, %rd29;\n" - " st.global.u16 [%rd30+2], %r20;\n" - "$Lt_0_8450:\n" - "$Lt_0_7938:\n" - " // Part of loop body line 73, head labeled $Lt_0_7682\n" - " .loc 3 83 0\n" - " mov.u32 %r28, 0;\n" - " setp.eq.u32 %p6, %r20, %r28;\n" - " @%p6 bra $Lt_0_258;\n" - " // Part of loop body line 73, head labeled $Lt_0_7682\n" - " .loc 3 86 0\n" - " cvt.u64.u32 %rd31, 
%r20;\n" - " add.u64 %rd32, %rd31, %rd7;\n" - " ld.global.u8 %rh4, [%rd32+12];\n" - " cvt.u64.u8 %rd33, %rh4;\n" - " add.u64 %rd34, %rd33, %rd18;\n" - " ld.global.u8 %r29, [%rd34+0];\n" - " ld.global.u8 %rh5, [%rd32+11];\n" - " cvt.u64.u8 %rd35, %rh5;\n" - " add.u64 %rd36, %rd35, %rd18;\n" - " ld.global.u8 %r30, [%rd36+0];\n" - " shl.b32 %r31, %r30, 4;\n" - " or.b32 %r32, %r29, %r31;\n" - " cvt.u64.u32 %rd37, %r32;\n" - " mul.lo.u64 %rd38, %rd37, 4;\n" - " add.u64 %rd39, %rd8, %rd38;\n" - " ld.global.u32 %r33, [%rd39+0];\n" - " shl.b32 %r34, %r18, 1;\n" - " and.b32 %r18, %r33, %r34;\n" - " mov.u32 %r35, 0;\n" - " setp.ne.u32 %p7, %r18, %r35;\n" - " @%p7 bra $Lt_0_7682;\n" - "$Lt_0_258:\n" - "$Lt_0_6914:\n" - " // Part of loop body line 66, head labeled $Lt_0_6658\n" - " .loc 3 89 0\n" - " add.u32 %r36, %r6, %r10;\n" - " sub.u32 %r10, %r36, 1;\n" - " setp.ge.u32 %p8, %r12, %r10;\n" - " @%p8 bra $Lt_0_6658;\n" - " bra.uni $Lt_0_6146;\n" - "$Lt_0_9474:\n" - "$Lt_0_6146:\n" - " .loc 3 92 0\n" - " st.global.u16 [%rd15+0], %r13;\n" - "$LBB17_B2gCudaSearchBNDMq:\n" - " .loc 3 94 0\n" - " exit;\n" - "$LDWend_B2gCudaSearchBNDMq:\n" - " } // B2gCudaSearchBNDMq\n" - ""; -#else -/** - * \todo Optimize the kernel. Also explore the options for compiling the - * *.cu file at compile/runtime. 
- */ -const char *b2g_cuda_ptx_image_32_bit = - " .version 1.4\n" - " .target sm_10, map_f64_to_f32\n" - " .entry B2gCudaSearchBNDMq (\n" - " .param .u32 __cudaparm_B2gCudaSearchBNDMq_results_buffer,\n" - " .param .u32 __cudaparm_B2gCudaSearchBNDMq_packets_buffer,\n" - " .param .u32 __cudaparm_B2gCudaSearchBNDMq_packets_offset_buffer,\n" - " .param .u32 __cudaparm_B2gCudaSearchBNDMq_packets_payload_offset_buffer,\n" - " .param .u32 __cudaparm_B2gCudaSearchBNDMq_nop,\n" - " .param .u32 __cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable)\n" - " {\n" - " .reg .u16 %rh<6>;\n" - " .reg .u32 %r<65>;\n" - " .reg .pred %p<10>;\n" - " .loc 3 36 0\n" - "$LBB1_B2gCudaSearchBNDMq:\n" - " mov.u16 %rh1, %ctaid.x;\n" - " mul.wide.u16 %r1, %rh1, 32;\n" - " cvt.u32.u16 %r2, %tid.x;\n" - " add.u32 %r3, %r2, %r1;\n" - " ld.param.u32 %r4, [__cudaparm_B2gCudaSearchBNDMq_nop];\n" - " setp.gt.u32 %p1, %r4, %r3;\n" - " @%p1 bra $Lt_0_5634;\n" - " bra.uni $LBB17_B2gCudaSearchBNDMq;\n" - "$Lt_0_5634:\n" - " .loc 3 45 0\n" - " mul.lo.u32 %r5, %r3, 4;\n" - " ld.param.u32 %r6, [__cudaparm_B2gCudaSearchBNDMq_packets_offset_buffer];\n" - " add.u32 %r7, %r6, %r5;\n" - " ld.global.u32 %r8, [%r7+0];\n" - " ld.param.u32 %r9, [__cudaparm_B2gCudaSearchBNDMq_packets_buffer];\n" - " add.u32 %r10, %r8, %r9;\n" - " .loc 3 46 0\n" - " ld.global.u32 %r11, [%r10+0];\n" - " .loc 3 48 0\n" - " ld.global.u32 %r12, [%r10+8];\n" - " .loc 3 49 0\n" - " ld.global.u32 %r13, [%r10+4];\n" - " .loc 3 50 0\n" - " sub.u32 %r14, %r11, 1;\n" - " mov.s32 %r15, %r14;\n" - " .loc 3 56 0\n" - " ld.param.u32 %r16, [__cudaparm_B2gCudaSearchBNDMq_results_buffer];\n" - " ld.param.u32 %r17, [__cudaparm_B2gCudaSearchBNDMq_packets_payload_offset_buffer];\n" - " add.u32 %r18, %r17, %r5;\n" - " ld.global.u32 %r19, [%r18+0];\n" - " add.u32 %r20, %r19, %r3;\n" - " mul.lo.u32 %r21, %r20, 2;\n" - " add.u32 %r22, %r16, %r21;\n" - " sub.u32 %r23, %r12, 1;\n" - " setp.gt.u32 %p2, %r14, %r23;\n" - " mov.u16 %rh2, 0;\n" - " @%p2 bra 
$Lt_0_9474;\n" - " add.u32 %r24, %r10, 12;\n" - " add.u32 %r25, %r22, 2;\n" - " ld.param.u32 %r26, [__cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable];\n" - "$Lt_0_6658:\n" - " // Loop body line 66\n" - " .loc 3 66 0\n" - " add.u32 %r27, %r10, %r15;\n" - " ld.global.u8 %r28, [%r27+12];\n" - " add.u32 %r29, %r28, %r26;\n" - " ld.global.u8 %r30, [%r29+0];\n" - " ld.global.u8 %r31, [%r27+11];\n" - " add.u32 %r32, %r31, %r26;\n" - " ld.global.u8 %r33, [%r32+0];\n" - " shl.b32 %r34, %r33, 4;\n" - " or.b32 %r35, %r30, %r34;\n" - " mul.lo.u32 %r36, %r35, 4;\n" - " add.u32 %r37, %r13, %r36;\n" - " ld.global.u32 %r38, [%r37+0];\n" - " mov.u32 %r39, 0;\n" - " setp.eq.u32 %p3, %r38, %r39;\n" - " @%p3 bra $Lt_0_258;\n" - " // Part of loop body line 66, head labeled $Lt_0_6658\n" - " .loc 3 69 0\n" - " mov.s32 %r40, %r15;\n" - " .loc 3 70 0\n" - " sub.u32 %r41, %r15, %r11;\n" - " add.u32 %r42, %r41, 1;\n" - " sub.s32 %r43, %r11, 1;\n" - "$Lt_0_7682:\n" - " // Loop body line 73\n" - " .loc 3 73 0\n" - " sub.u32 %r40, %r40, 1;\n" - " shr.u32 %r44, %r38, %r43;\n" - " mov.u32 %r45, 0;\n" - " setp.eq.u32 %p4, %r44, %r45;\n" - " @%p4 bra $Lt_0_8450;\n" - " // Part of loop body line 73, head labeled $Lt_0_7682\n" - " setp.le.u32 %p5, %r40, %r42;\n" - " @%p5 bra $Lt_0_8706;\n" - " // Part of loop body line 73, head labeled $Lt_0_7682\n" - " .loc 3 76 0\n" - " mov.s32 %r15, %r40;\n" - " bra.uni $Lt_0_8450;\n" - "$Lt_0_8706:\n" - " // Part of loop body line 73, head labeled $Lt_0_7682\n" - " .loc 3 78 0\n" - " mov.s16 %rh3, %rh2;\n" - " add.u16 %rh4, %rh3, 1;\n" - " mov.u16 %rh2, %rh4;\n" - " mul.wide.u16 %r46, %rh3, 2;\n" - " add.u32 %r47, %r22, %r46;\n" - " st.global.u16 [%r47+2], %r40;\n" - "$Lt_0_8450:\n" - "$Lt_0_7938:\n" - " // Part of loop body line 73, head labeled $Lt_0_7682\n" - " .loc 3 83 0\n" - " mov.u32 %r48, 0;\n" - " setp.eq.u32 %p6, %r40, %r48;\n" - " @%p6 bra $Lt_0_258;\n" - " // Part of loop body line 73, head labeled $Lt_0_7682\n" - " .loc 3 86 0\n" - " add.u32 %r49, 
%r10, %r40;\n" - " ld.global.u8 %r50, [%r49+12];\n" - " add.u32 %r51, %r50, %r26;\n" - " ld.global.u8 %r52, [%r51+0];\n" - " ld.global.u8 %r53, [%r49+11];\n" - " add.u32 %r54, %r53, %r26;\n" - " ld.global.u8 %r55, [%r54+0];\n" - " shl.b32 %r56, %r55, 4;\n" - " or.b32 %r57, %r52, %r56;\n" - " mul.lo.u32 %r58, %r57, 4;\n" - " add.u32 %r59, %r13, %r58;\n" - " ld.global.u32 %r60, [%r59+0];\n" - " shl.b32 %r61, %r38, 1;\n" - " and.b32 %r38, %r60, %r61;\n" - " mov.u32 %r62, 0;\n" - " setp.ne.u32 %p7, %r38, %r62;\n" - " @%p7 bra $Lt_0_7682;\n" - "$Lt_0_258:\n" - "$Lt_0_6914:\n" - " // Part of loop body line 66, head labeled $Lt_0_6658\n" - " .loc 3 89 0\n" - " add.u32 %r63, %r11, %r15;\n" - " sub.u32 %r15, %r63, 1;\n" - " setp.ge.u32 %p8, %r23, %r15;\n" - " @%p8 bra $Lt_0_6658;\n" - " bra.uni $Lt_0_6146;\n" - "$Lt_0_9474:\n" - "$Lt_0_6146:\n" - " .loc 3 92 0\n" - " st.global.u16 [%r22+0], %rh2;\n" - "$LBB17_B2gCudaSearchBNDMq:\n" - " .loc 3 94 0\n" - " exit;\n" - "$LDWend_B2gCudaSearchBNDMq:\n" - " } // B2gCudaSearchBNDMq\n" - ""; -#endif - /** * \brief Register the CUDA B2g Mpm. */ @@ -1752,15 +1430,9 @@ static int B2gCudaMpmStreamDataInit(B2gCudaMpmThreadCtxData *tctx, MpmCudaConf * } /* Load the CUmodule */ -#if defined(__x86_64__) || defined(__ia64__) sd->b2g_cuda_cumodule_handle = SCCudaHlGetCudaModule(&sd->b2g_cuda_cumodule, - b2g_cuda_ptx_image_64_bit, + "util-mpm-b2g-cuda-kernel", module_data->handle); -#else - sd->b2g_cuda_cumodule_handle = SCCudaHlGetCudaModule(&sd->b2g_cuda_cumodule, - b2g_cuda_ptx_image_32_bit, - module_data->handle); -#endif if (sd->b2g_cuda_cumodule_handle == -1) { SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda module"); goto error;