build cuda modules with make

remotes/origin/master-1.1.x
Martin Beyer 15 years ago committed by Victor Julien
parent f7f037c1d1
commit 49d66430bc

@ -786,6 +786,9 @@ AC_CHECK_HEADER(pcap.h,,[AC_ERROR(pcap.h not found ...)])
AC_ARG_WITH(cuda_libraries,
[ --with-cuda-libraries=DIR cuda library directory],
[with_cuda_libraries="$withval"],[with_cuda_libraries="no"])
AC_ARG_WITH(cuda_nvcc,
[ --with-cuda-nvcc=DIR cuda nvcc compiler directory],
[with_cuda_nvcc="$withval"],[with_cuda_nvcc=no])
CFLAGS="${CFLAGS} -D__SC_CUDA_SUPPORT__"
@ -799,6 +802,12 @@ AC_CHECK_HEADER(pcap.h,,[AC_ERROR(pcap.h not found ...)])
LDFLAGS="${LDFLAGS} -L${with_cuda_libraries}"
fi
if test "$with_cuda_nvcc" != "no"; then
NVCC_DIR="${with_cuda_nvcc}"
else
NVCC_DIR="/usr/local/cuda/bin"
fi
AC_CHECK_HEADER(cuda.h,,[AC_ERROR(cuda.h not found ...)])
LIBCUDA=""
@ -809,7 +818,24 @@ AC_CHECK_HEADER(pcap.h,,[AC_ERROR(pcap.h not found ...)])
echo
exit 1
fi
# Locate the CUDA compiler. The search covers PATH first and then the
# directory chosen with --with-cuda-nvcc or its default.
# The value-if-not-found argument must be "no": when it is left empty NVCC
# stays empty on failure and the test below can never trigger.
AC_PATH_PROG([NVCC], [nvcc], [no], [$PATH:$NVCC_DIR])
if test "$NVCC" = "no"; then
echo
echo " ERROR! CUDA nvcc compiler not found: use --with-cuda-nvcc=DIR"
echo
exit 1
fi
# The CUDA build runs ptxdump.py with python to generate cuda-ptxdump.h
# so a missing interpreter is a fatal configure error.
AC_PATH_PROG([PYTHON], [python], no)
if test "$PYTHON" = "no"; then
echo
echo " ERROR! Compiling CUDA source requires python interpreter"
echo
exit 1
fi
])
AM_CONDITIONAL([BUILD_CUDA], [test "${NVCC}" != ""])
# Check for libcap-ng

@ -268,6 +268,40 @@ suricata_LDADD = $(top_builddir)/libhtp/htp/libhtp.la
INCLUDES += -I$(top_srcdir)/libhtp
endif
# Rules to build CUDA ptx modules.
#
# Every kernel in suricata_CUDA_KERNELS is compiled by nvcc into one .ptx
# file per compute capability listed in SMVERSIONS. ptxdump.py then packs
# all .ptx files into cuda-ptxdump.h as C character arrays.
if BUILD_CUDA

# The generated header must exist before any object that includes it is built.
BUILT_SOURCES = cuda-ptxdump.h

suricata_SOURCES += cuda-ptxdump.h

suricata_CUDA_KERNELS = \
	util-mpm-b2g-cuda-kernel.cu

# Compute capabilities to generate ptx code for.
SMVERSIONS = 10 11 12 13 20

NVCCFLAGS=-O2

# FIXME: this list is expanded by hand and must be kept in sync with
# SMVERSIONS above.
PTXS =
PTXS += $(suricata_CUDA_KERNELS:.cu=_sm_10.ptx)
PTXS += $(suricata_CUDA_KERNELS:.cu=_sm_11.ptx)
PTXS += $(suricata_CUDA_KERNELS:.cu=_sm_12.ptx)
PTXS += $(suricata_CUDA_KERNELS:.cu=_sm_13.ptx)
PTXS += $(suricata_CUDA_KERNELS:.cu=_sm_20.ptx)

# template to build for different compute capabilities
define BUILDTEMPLATE
# PTXS += $(patsubst %.cu, %_sm_$(1).ptx, $(suricata_CUDA_KERNELS))
%_sm_$(1).ptx: %.cu
	$(NVCC) $(NVCCFLAGS) -o $$@ -arch=sm_$(1) -ptx $$<
endef

# Instantiate one pattern rule per compute capability.
$(foreach SMVER,$(SMVERSIONS),$(eval $(call BUILDTEMPLATE,$(SMVER))))

# Use the interpreter found by configure and take the script from the
# source directory so that out-of-tree builds work as well.
cuda-ptxdump.h: $(PTXS)
	$(PYTHON) $(srcdir)/ptxdump.py cuda-ptxdump $(PTXS)

CLEANFILES = $(PTXS) cuda-ptxdump.h

endif
#suricata_CFLAGS = -Wall -fno-strict-aliasing
if BUILD_UNITTESTS

@ -0,0 +1,71 @@
#!/usr/bin/env python
from string import *
import os, commands, getopt, sys, platform
# C comment banner written at the very top of the generated header file.
header = '''/* Auto-generated by ptxdump.py DO NOT EDIT
*
* This file contains the ptx code of the Cuda kernels.
* A kernel is identified by its name and the compute capability (e.g. _sm_10).
*/
'''
def FormatCharHex(d):
    """Format one byte as a C hex literal with at least two hex digits.

    d may be a one-character string (as before) or an integer byte value,
    which is what iterating over a bytes object yields on Python 3.

    Returns a string such as "0x0a" or "0x61".
    """
    value = d if isinstance(d, int) else ord(d)
    # %02x zero-pads single digit values, e.g. 0xa becomes 0x0a.
    return "0x%02x" % value
def CleanFileName(f):
    """Derive the C array name for a ptx file name.

    Every "-" is replaced by "_" and every occurrence of ".ptx" is removed,
    e.g. "util-mpm-b2g-cuda-kernel_sm_10.ptx" becomes
    "util_mpm_b2g_cuda_kernel_sm_10".
    """
    # Use the str methods: the module level string.replace() function that
    # "from string import *" used to provide no longer exists in Python 3.
    return f.replace("-", "_").replace(".ptx", "")
# Command line: ptxdump.py <output> <in.ptx ...>
#
# Writes <output>.h containing one NUL-terminated unsigned char array per
# ptx file plus the lookup function SCCudaPtxDumpGetModule(), which maps a
# module name (the ptx file name without ".ptx") to its array.
if len(sys.argv) < 3:
    print("Usage: ptxdump.py <output> <in.ptx ..> ")
    print("Description: creates a header file containing the ptx files as character array" + os.linesep)
    # A missing argument is an error; report it to the calling make.
    sys.exit(1)

out_h = sys.argv[1] + ".h"
ptx_files = sys.argv[2:]

with open(out_h, 'w') as out:
    out.write(header)
    out.write("#ifdef __SC_CUDA_SUPPORT__ \n")
    out.write("#ifndef __ptxdump_h__ \n")
    out.write("#define __ptxdump_h__ \n\n")

    # write char arrays
    for ptx_file in ptx_files:
        # Close every input right after reading it instead of leaking all
        # handles but the last one.
        with open(ptx_file, 'r') as in_ptx:
            source = in_ptx.read()

        varname = CleanFileName(ptx_file)
        # One extra element for the terminating 0x00 appended below.
        out.write("const unsigned char " + varname + "[" + str(len(source) + 1) + "] = {\n")
        for count, char in enumerate(source, 1):
            out.write(FormatCharHex(char) + ", ")
            # Break the initializer after every 16 values.
            if count % 16 == 0:
                out.write("\n")
        out.write("0x00\n};\n")
        print(sys.argv[0] + ": CUmodule " + varname + " packed successfully")

    # write retrieval function
    out.write("const unsigned char* SCCudaPtxDumpGetModule(const char* module){\n")
    for ptx_file in ptx_files:
        out.write('\tif (!strcmp(module, "' + ptx_file.replace(".ptx", "") + '"))\n')
        out.write("\t\treturn " + CleanFileName(ptx_file) + ";\n")
    out.write('\tSCLogError(SC_ERR_FATAL, "Error in SCCudaPtxDumpGetModule, module %s not found. Exiting...",module);\n')
    out.write("\texit(EXIT_FAILURE);\n")
    # No ';' after the closing brace: a stray semicolon at file scope is
    # not valid ISO C.
    out.write("}\n")

    out.write("#endif // __ptxdump_h__ \n")
    out.write("#endif // __SC_CUDA_SUPPORT__\n")

print(sys.argv[0] + ": " + out_h + " written successfully")

@ -74,6 +74,9 @@
/* macros decides if cuda is enabled for the platform or not */
#ifdef __SC_CUDA_SUPPORT__
/* file only exists if cuda is enabled */
#include "cuda-ptxdump.h"
static SCCudaHlModuleData *module_data = NULL;
static uint8_t module_handle = 1;
@ -417,8 +420,8 @@ int SCCudaHlGetCudaContext(CUcontext *p_context, char *cuda_profile, int handle)
* and associated with this handle and the cuda_module is returned
* in the argument.
*
* \param p_module Pointer to a cuda module instance that should be updated
* with a cuda module.
* \param p_module The loaded CUmodule that is returned.
* \param ptx_image Name of the module source file, w/o the .cu extension
* \param handle A unique handle which identifies a module. Obtained from
* a call to SCCudaHlGetUniqueHandle().
*
@ -464,11 +467,35 @@ int SCCudaHlGetCudaModule(CUmodule *p_module, const char *ptx_image, int handle)
}
memset(new_module_cumodule, 0, sizeof(SCCudaHlModuleCUmodule));
/* Create a cuda module, update the module with this cuda module reference
* and then return the module reference back to the calling function using
/* select the ptx image based on the compute capability supported by all
* devices (i.e. the lowest) */
char* image = malloc(strlen(ptx_image)+15);
memset(image, 0x0, sizeof(image));
int major = INT_MAX;
int minor = INT_MAX;
SCCudaDevices *devices = SCCudaGetDeviceList();
int i=0;
for (; i<devices->count; i++){
if (devices->devices[i]->major_rev < major){
major = devices->devices[i]->major_rev;
minor = devices->devices[i]->minor_rev;
}
if (devices->devices[i]->major_rev == major &&
devices->devices[i]->minor_rev < minor){
minor = devices->devices[i]->minor_rev;
}
}
sprintf(image, "%s_sm_%u%u", ptx_image, major, minor);
/* we don't have a cuda module associated with this module. Create a
* cuda module, update the module with this cuda module reference and
* then return the module reference back to the calling function using
* the argument */
if (SCCudaModuleLoadData(p_module, (void *)ptx_image) == -1)
SCLogDebug("Loading kernel module: %s\n",image);
if (SCCudaModuleLoadData(p_module, (void *)SCCudaPtxDumpGetModule(image)) == -1)
goto error;
free(image);
new_module_cumodule->cuda_module = p_module[0];
new_module_cumodule->cuda_module_handle = SCCudaHlGetUniqueHandle();
@ -487,6 +514,7 @@ int SCCudaHlGetCudaModule(CUmodule *p_module, const char *ptx_image, int handle)
return new_module_cumodule->cuda_module_handle;
error:
free(image);
return -1;
}

@ -98,328 +98,6 @@ void B2gCudaPrintInfo(MpmCtx *);
void B2gCudaPrintSearchStats(MpmThreadCtx *);
void B2gCudaRegisterTests(void);
#if defined(__x86_64__) || defined(__ia64__)
const char *b2g_cuda_ptx_image_64_bit =
" .version 1.4\n"
" .target sm_10, map_f64_to_f32\n"
" .entry B2gCudaSearchBNDMq (\n"
" .param .u64 __cudaparm_B2gCudaSearchBNDMq_results_buffer,\n"
" .param .u64 __cudaparm_B2gCudaSearchBNDMq_packets_buffer,\n"
" .param .u64 __cudaparm_B2gCudaSearchBNDMq_packets_offset_buffer,\n"
" .param .u64 __cudaparm_B2gCudaSearchBNDMq_packets_payload_offset_buffer,\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_nop,\n"
" .param .u64 __cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable)\n"
" {\n"
" .reg .u16 %rh<7>;\n"
" .reg .u32 %r<38>;\n"
" .reg .u64 %rd<41>;\n"
" .reg .pred %p<10>;\n"
" .loc 3 36 0\n"
"$LBB1_B2gCudaSearchBNDMq:\n"
" mov.u16 %rh1, %ctaid.x;\n"
" mul.wide.u16 %r1, %rh1, 32;\n"
" cvt.u32.u16 %r2, %tid.x;\n"
" add.u32 %r3, %r2, %r1;\n"
" ld.param.u32 %r4, [__cudaparm_B2gCudaSearchBNDMq_nop];\n"
" setp.gt.u32 %p1, %r4, %r3;\n"
" @%p1 bra $Lt_0_5634;\n"
" bra.uni $LBB17_B2gCudaSearchBNDMq;\n"
"$Lt_0_5634:\n"
" .loc 3 45 0\n"
" cvt.u64.u32 %rd1, %r3;\n"
" mul.lo.u64 %rd2, %rd1, 4;\n"
" ld.param.u64 %rd3, [__cudaparm_B2gCudaSearchBNDMq_packets_offset_buffer];\n"
" add.u64 %rd4, %rd3, %rd2;\n"
" ld.global.u32 %r5, [%rd4+0];\n"
" cvt.u64.u32 %rd5, %r5;\n"
" ld.param.u64 %rd6, [__cudaparm_B2gCudaSearchBNDMq_packets_buffer];\n"
" add.u64 %rd7, %rd5, %rd6;\n"
" .loc 3 46 0\n"
" ld.global.u32 %r6, [%rd7+0];\n"
" .loc 3 48 0\n"
" ld.global.u32 %r7, [%rd7+8];\n"
" .loc 3 49 0\n"
" ld.global.u32 %r8, [%rd7+4];\n"
" cvt.u64.u32 %rd8, %r8;\n"
" .loc 3 50 0\n"
" sub.u32 %r9, %r6, 1;\n"
" mov.s32 %r10, %r9;\n"
" .loc 3 56 0\n"
" ld.param.u64 %rd9, [__cudaparm_B2gCudaSearchBNDMq_results_buffer];\n"
" ld.param.u64 %rd10, [__cudaparm_B2gCudaSearchBNDMq_packets_payload_offset_buffer];\n"
" add.u64 %rd11, %rd10, %rd2;\n"
" ld.global.u32 %r11, [%rd11+0];\n"
" cvt.u64.u32 %rd12, %r11;\n"
" add.u64 %rd13, %rd12, %rd1;\n"
" mul.lo.u64 %rd14, %rd13, 2;\n"
" add.u64 %rd15, %rd9, %rd14;\n"
" sub.u32 %r12, %r7, 1;\n"
" setp.gt.u32 %p2, %r9, %r12;\n"
" mov.u32 %r13, 0;\n"
" @%p2 bra $Lt_0_9474;\n"
" add.u64 %rd16, %rd7, 12;\n"
" add.u64 %rd17, %rd15, 2;\n"
" ld.param.u64 %rd18, [__cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable];\n"
"$Lt_0_6658:\n"
" //<loop> Loop body line 66\n"
" .loc 3 66 0\n"
" cvt.u64.u32 %rd19, %r10;\n"
" add.u64 %rd20, %rd19, %rd7;\n"
" ld.global.u8 %rh2, [%rd20+12];\n"
" cvt.u64.u8 %rd21, %rh2;\n"
" add.u64 %rd22, %rd21, %rd18;\n"
" ld.global.u8 %r14, [%rd22+0];\n"
" ld.global.u8 %rh3, [%rd20+11];\n"
" cvt.u64.u8 %rd23, %rh3;\n"
" add.u64 %rd24, %rd23, %rd18;\n"
" ld.global.u8 %r15, [%rd24+0];\n"
" shl.b32 %r16, %r15, 4;\n"
" or.b32 %r17, %r14, %r16;\n"
" cvt.u64.u32 %rd25, %r17;\n"
" mul.lo.u64 %rd26, %rd25, 4;\n"
" add.u64 %rd27, %rd8, %rd26;\n"
" ld.global.u32 %r18, [%rd27+0];\n"
" mov.u32 %r19, 0;\n"
" setp.eq.u32 %p3, %r18, %r19;\n"
" @%p3 bra $Lt_0_258;\n"
" //<loop> Part of loop body line 66, head labeled $Lt_0_6658\n"
" .loc 3 69 0\n"
" mov.s32 %r20, %r10;\n"
" .loc 3 70 0\n"
" sub.u32 %r21, %r10, %r6;\n"
" add.u32 %r22, %r21, 1;\n"
" sub.s32 %r23, %r6, 1;\n"
"$Lt_0_7682:\n"
" //<loop> Loop body line 73\n"
" .loc 3 73 0\n"
" sub.u32 %r20, %r20, 1;\n"
" shr.u32 %r24, %r18, %r23;\n"
" mov.u32 %r25, 0;\n"
" setp.eq.u32 %p4, %r24, %r25;\n"
" @%p4 bra $Lt_0_8450;\n"
" //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
" setp.le.u32 %p5, %r20, %r22;\n"
" @%p5 bra $Lt_0_8706;\n"
" //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
" .loc 3 76 0\n"
" mov.s32 %r10, %r20;\n"
" bra.uni $Lt_0_8450;\n"
"$Lt_0_8706:\n"
" //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
" .loc 3 78 0\n"
" mov.s32 %r26, %r13;\n"
" add.u32 %r27, %r26, 1;\n"
" cvt.u16.u32 %r13, %r27;\n"
" cvt.u64.u32 %rd28, %r26;\n"
" mul.lo.u64 %rd29, %rd28, 2;\n"
" add.u64 %rd30, %rd15, %rd29;\n"
" st.global.u16 [%rd30+2], %r20;\n"
"$Lt_0_8450:\n"
"$Lt_0_7938:\n"
" //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
" .loc 3 83 0\n"
" mov.u32 %r28, 0;\n"
" setp.eq.u32 %p6, %r20, %r28;\n"
" @%p6 bra $Lt_0_258;\n"
" //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
" .loc 3 86 0\n"
" cvt.u64.u32 %rd31, %r20;\n"
" add.u64 %rd32, %rd31, %rd7;\n"
" ld.global.u8 %rh4, [%rd32+12];\n"
" cvt.u64.u8 %rd33, %rh4;\n"
" add.u64 %rd34, %rd33, %rd18;\n"
" ld.global.u8 %r29, [%rd34+0];\n"
" ld.global.u8 %rh5, [%rd32+11];\n"
" cvt.u64.u8 %rd35, %rh5;\n"
" add.u64 %rd36, %rd35, %rd18;\n"
" ld.global.u8 %r30, [%rd36+0];\n"
" shl.b32 %r31, %r30, 4;\n"
" or.b32 %r32, %r29, %r31;\n"
" cvt.u64.u32 %rd37, %r32;\n"
" mul.lo.u64 %rd38, %rd37, 4;\n"
" add.u64 %rd39, %rd8, %rd38;\n"
" ld.global.u32 %r33, [%rd39+0];\n"
" shl.b32 %r34, %r18, 1;\n"
" and.b32 %r18, %r33, %r34;\n"
" mov.u32 %r35, 0;\n"
" setp.ne.u32 %p7, %r18, %r35;\n"
" @%p7 bra $Lt_0_7682;\n"
"$Lt_0_258:\n"
"$Lt_0_6914:\n"
" //<loop> Part of loop body line 66, head labeled $Lt_0_6658\n"
" .loc 3 89 0\n"
" add.u32 %r36, %r6, %r10;\n"
" sub.u32 %r10, %r36, 1;\n"
" setp.ge.u32 %p8, %r12, %r10;\n"
" @%p8 bra $Lt_0_6658;\n"
" bra.uni $Lt_0_6146;\n"
"$Lt_0_9474:\n"
"$Lt_0_6146:\n"
" .loc 3 92 0\n"
" st.global.u16 [%rd15+0], %r13;\n"
"$LBB17_B2gCudaSearchBNDMq:\n"
" .loc 3 94 0\n"
" exit;\n"
"$LDWend_B2gCudaSearchBNDMq:\n"
" } // B2gCudaSearchBNDMq\n"
"";
#else
/**
* \todo Optimize the kernel. Also explore the options for compiling the
* *.cu file at compile/runtime.
*/
const char *b2g_cuda_ptx_image_32_bit =
" .version 1.4\n"
" .target sm_10, map_f64_to_f32\n"
" .entry B2gCudaSearchBNDMq (\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_results_buffer,\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_packets_buffer,\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_packets_offset_buffer,\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_packets_payload_offset_buffer,\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_nop,\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable)\n"
" {\n"
" .reg .u16 %rh<6>;\n"
" .reg .u32 %r<65>;\n"
" .reg .pred %p<10>;\n"
" .loc 3 36 0\n"
"$LBB1_B2gCudaSearchBNDMq:\n"
" mov.u16 %rh1, %ctaid.x;\n"
" mul.wide.u16 %r1, %rh1, 32;\n"
" cvt.u32.u16 %r2, %tid.x;\n"
" add.u32 %r3, %r2, %r1;\n"
" ld.param.u32 %r4, [__cudaparm_B2gCudaSearchBNDMq_nop];\n"
" setp.gt.u32 %p1, %r4, %r3;\n"
" @%p1 bra $Lt_0_5634;\n"
" bra.uni $LBB17_B2gCudaSearchBNDMq;\n"
"$Lt_0_5634:\n"
" .loc 3 45 0\n"
" mul.lo.u32 %r5, %r3, 4;\n"
" ld.param.u32 %r6, [__cudaparm_B2gCudaSearchBNDMq_packets_offset_buffer];\n"
" add.u32 %r7, %r6, %r5;\n"
" ld.global.u32 %r8, [%r7+0];\n"
" ld.param.u32 %r9, [__cudaparm_B2gCudaSearchBNDMq_packets_buffer];\n"
" add.u32 %r10, %r8, %r9;\n"
" .loc 3 46 0\n"
" ld.global.u32 %r11, [%r10+0];\n"
" .loc 3 48 0\n"
" ld.global.u32 %r12, [%r10+8];\n"
" .loc 3 49 0\n"
" ld.global.u32 %r13, [%r10+4];\n"
" .loc 3 50 0\n"
" sub.u32 %r14, %r11, 1;\n"
" mov.s32 %r15, %r14;\n"
" .loc 3 56 0\n"
" ld.param.u32 %r16, [__cudaparm_B2gCudaSearchBNDMq_results_buffer];\n"
" ld.param.u32 %r17, [__cudaparm_B2gCudaSearchBNDMq_packets_payload_offset_buffer];\n"
" add.u32 %r18, %r17, %r5;\n"
" ld.global.u32 %r19, [%r18+0];\n"
" add.u32 %r20, %r19, %r3;\n"
" mul.lo.u32 %r21, %r20, 2;\n"
" add.u32 %r22, %r16, %r21;\n"
" sub.u32 %r23, %r12, 1;\n"
" setp.gt.u32 %p2, %r14, %r23;\n"
" mov.u16 %rh2, 0;\n"
" @%p2 bra $Lt_0_9474;\n"
" add.u32 %r24, %r10, 12;\n"
" add.u32 %r25, %r22, 2;\n"
" ld.param.u32 %r26, [__cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable];\n"
"$Lt_0_6658:\n"
" //<loop> Loop body line 66\n"
" .loc 3 66 0\n"
" add.u32 %r27, %r10, %r15;\n"
" ld.global.u8 %r28, [%r27+12];\n"
" add.u32 %r29, %r28, %r26;\n"
" ld.global.u8 %r30, [%r29+0];\n"
" ld.global.u8 %r31, [%r27+11];\n"
" add.u32 %r32, %r31, %r26;\n"
" ld.global.u8 %r33, [%r32+0];\n"
" shl.b32 %r34, %r33, 4;\n"
" or.b32 %r35, %r30, %r34;\n"
" mul.lo.u32 %r36, %r35, 4;\n"
" add.u32 %r37, %r13, %r36;\n"
" ld.global.u32 %r38, [%r37+0];\n"
" mov.u32 %r39, 0;\n"
" setp.eq.u32 %p3, %r38, %r39;\n"
" @%p3 bra $Lt_0_258;\n"
" //<loop> Part of loop body line 66, head labeled $Lt_0_6658\n"
" .loc 3 69 0\n"
" mov.s32 %r40, %r15;\n"
" .loc 3 70 0\n"
" sub.u32 %r41, %r15, %r11;\n"
" add.u32 %r42, %r41, 1;\n"
" sub.s32 %r43, %r11, 1;\n"
"$Lt_0_7682:\n"
" //<loop> Loop body line 73\n"
" .loc 3 73 0\n"
" sub.u32 %r40, %r40, 1;\n"
" shr.u32 %r44, %r38, %r43;\n"
" mov.u32 %r45, 0;\n"
" setp.eq.u32 %p4, %r44, %r45;\n"
" @%p4 bra $Lt_0_8450;\n"
" //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
" setp.le.u32 %p5, %r40, %r42;\n"
" @%p5 bra $Lt_0_8706;\n"
" //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
" .loc 3 76 0\n"
" mov.s32 %r15, %r40;\n"
" bra.uni $Lt_0_8450;\n"
"$Lt_0_8706:\n"
" //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
" .loc 3 78 0\n"
" mov.s16 %rh3, %rh2;\n"
" add.u16 %rh4, %rh3, 1;\n"
" mov.u16 %rh2, %rh4;\n"
" mul.wide.u16 %r46, %rh3, 2;\n"
" add.u32 %r47, %r22, %r46;\n"
" st.global.u16 [%r47+2], %r40;\n"
"$Lt_0_8450:\n"
"$Lt_0_7938:\n"
" //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
" .loc 3 83 0\n"
" mov.u32 %r48, 0;\n"
" setp.eq.u32 %p6, %r40, %r48;\n"
" @%p6 bra $Lt_0_258;\n"
" //<loop> Part of loop body line 73, head labeled $Lt_0_7682\n"
" .loc 3 86 0\n"
" add.u32 %r49, %r10, %r40;\n"
" ld.global.u8 %r50, [%r49+12];\n"
" add.u32 %r51, %r50, %r26;\n"
" ld.global.u8 %r52, [%r51+0];\n"
" ld.global.u8 %r53, [%r49+11];\n"
" add.u32 %r54, %r53, %r26;\n"
" ld.global.u8 %r55, [%r54+0];\n"
" shl.b32 %r56, %r55, 4;\n"
" or.b32 %r57, %r52, %r56;\n"
" mul.lo.u32 %r58, %r57, 4;\n"
" add.u32 %r59, %r13, %r58;\n"
" ld.global.u32 %r60, [%r59+0];\n"
" shl.b32 %r61, %r38, 1;\n"
" and.b32 %r38, %r60, %r61;\n"
" mov.u32 %r62, 0;\n"
" setp.ne.u32 %p7, %r38, %r62;\n"
" @%p7 bra $Lt_0_7682;\n"
"$Lt_0_258:\n"
"$Lt_0_6914:\n"
" //<loop> Part of loop body line 66, head labeled $Lt_0_6658\n"
" .loc 3 89 0\n"
" add.u32 %r63, %r11, %r15;\n"
" sub.u32 %r15, %r63, 1;\n"
" setp.ge.u32 %p8, %r23, %r15;\n"
" @%p8 bra $Lt_0_6658;\n"
" bra.uni $Lt_0_6146;\n"
"$Lt_0_9474:\n"
"$Lt_0_6146:\n"
" .loc 3 92 0\n"
" st.global.u16 [%r22+0], %rh2;\n"
"$LBB17_B2gCudaSearchBNDMq:\n"
" .loc 3 94 0\n"
" exit;\n"
"$LDWend_B2gCudaSearchBNDMq:\n"
" } // B2gCudaSearchBNDMq\n"
"";
#endif
/**
* \brief Register the CUDA B2g Mpm.
*/
@ -1752,15 +1430,9 @@ static int B2gCudaMpmStreamDataInit(B2gCudaMpmThreadCtxData *tctx, MpmCudaConf *
}
/* Load the CUmodule */
#if defined(__x86_64__) || defined(__ia64__)
sd->b2g_cuda_cumodule_handle = SCCudaHlGetCudaModule(&sd->b2g_cuda_cumodule,
b2g_cuda_ptx_image_64_bit,
"util-mpm-b2g-cuda-kernel",
module_data->handle);
#else
sd->b2g_cuda_cumodule_handle = SCCudaHlGetCudaModule(&sd->b2g_cuda_cumodule,
b2g_cuda_ptx_image_32_bit,
module_data->handle);
#endif
if (sd->b2g_cuda_cumodule_handle == -1) {
SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda module");
goto error;

Loading…
Cancel
Save