compiled and added a 64 bit version of the cuda b2g kernel

remotes/origin/master-1.0.x
Anoop Saldanha 16 years ago committed by Victor Julien
parent 606516911b
commit 35bd0c6b39

@ -4068,7 +4068,48 @@ int SCCudaTest01(void)
return (devices->count != 0);
}
static const char *sc_cuda_test_kernel =
#if defined(__x86_64__) || defined(__ia64__)
/**
* extern "C" __global__ void SCCudaSuricataTest(int *input, int *output)
* {
* output[threadIdx.x] = input[threadIdx.x] * 2;
* }
*/
static const char *sc_cuda_test_kernel_64_bit =
" .version 1.4\n"
" .target sm_10, map_f64_to_f32\n"
" .entry SCCudaSuricataTest (\n"
" .param .u64 __cudaparm_SCCudaSuricataTest_input,\n"
" .param .u64 __cudaparm_SCCudaSuricataTest_output)\n"
"{\n"
" .reg .u32 %r<5>;\n"
" .reg .u64 %rd<8>;\n"
" .loc 15 1 0\n"
" $LBB1_SCCudaSuricataTest:\n"
" .loc 15 3 0\n"
" cvt.u32.u16 %r1, %tid.x;\n"
" cvt.u64.u32 %rd1, %r1;\n"
" mul.lo.u64 %rd2, %rd1, 4;\n"
" ld.param.u64 %rd3, [__cudaparm_SCCudaSuricataTest_input];\n"
" add.u64 %rd4, %rd3, %rd2;\n"
" ld.global.s32 %r2, [%rd4+0];\n"
" mul.lo.s32 %r3, %r2, 2;\n"
" ld.param.u64 %rd5, [__cudaparm_SCCudaSuricataTest_output];\n"
" add.u64 %rd6, %rd5, %rd2;\n"
" st.global.s32 [%rd6+0], %r3;\n"
" .loc 15 4 0\n"
" exit;\n"
" $LDWend_SCCudaSuricataTest:\n"
"} // SCCudaSuricataTest\n"
"\n";
#else
/**
* extern "C" __global__ void SCCudaSuricataTest(int *input, int *output)
* {
* output[threadIdx.x] = input[threadIdx.x] * 2;
* }
*/
static const char *sc_cuda_test_kernel_32_bit =
" .version 1.4\n"
" .target sm_10, map_f64_to_f32\n"
" .entry SCCudaSuricataTest (\n"
@ -4094,6 +4135,7 @@ static const char *sc_cuda_test_kernel =
"$LDWend_SCCudaSuricataTest:\n"
" } // SCCudaSuricataTest\n"
"";
#endif
int SCCudaTest02(void)
{
@ -4120,8 +4162,13 @@ int SCCudaTest02(void)
if (SCCudaCtxCreate(&context, 0, devices->devices[0]->device) == -1)
goto end;
if (SCCudaModuleLoadData(&module, (void *)sc_cuda_test_kernel) == -1)
#if defined(__x86_64__) || defined(__ia64__)
if (SCCudaModuleLoadData(&module, (void *)sc_cuda_test_kernel_64_bit) == -1)
goto end;
#else
if (SCCudaModuleLoadData(&module, (void *)sc_cuda_test_kernel_32_bit) == -1)
goto end;
#endif
if (SCCudaModuleGetFunction(&kernel, module, "SCCudaSuricataTest") == -1)
goto end;

@ -91,183 +91,371 @@ int arg4 = 0;
int arg5 = 0;
int arg_total = 0;
#if defined(__x86_64__) || defined(__ia64__)
const char *b2g_cuda_ptx_image_64_bit =
" .version 1.4\n"
" .target sm_10, map_f64_to_f32\n"
" .entry B2gCudaSearchBNDMq (\n"
" .param .u64 __cudaparm_B2gCudaSearchBNDMq_offsets,\n"
" .param .u64 __cudaparm_B2gCudaSearchBNDMq_B2G,\n"
" .param .u64 __cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable,\n"
" .param .u64 __cudaparm_B2gCudaSearchBNDMq_buf,\n"
" .param .u16 __cudaparm_B2gCudaSearchBNDMq_arg_buflen,\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_m)\n"
" {\n"
" .reg .u16 %rh<6>;\n"
" .reg .u32 %r<58>;\n"
" .reg .u64 %rd<31>;\n"
" .reg .pred %p<14>;\n"
" .loc 15 25 0\n"
"$LBB1_B2gCudaSearchBNDMq:\n"
" .loc 15 27 0\n"
" ld.param.u32 %r1, [__cudaparm_B2gCudaSearchBNDMq_m];\n"
" sub.u32 %r2, %r1, 1;\n"
" mov.s32 %r3, %r2;\n"
" .loc 15 33 0\n"
" ld.param.u16 %r4, [__cudaparm_B2gCudaSearchBNDMq_arg_buflen];\n"
" shr.u32 %r5, %r4, 4;\n"
" cvt.u16.u32 %r6, %r5;\n"
" mov.s32 %r7, %r6;\n"
" setp.ge.u32 %p1, %r6, %r1;\n"
" @%p1 bra $Lt_0_8450;\n"
" .loc 15 38 0\n"
" cvt.u16.u32 %r7, %r1;\n"
"$Lt_0_8450:\n"
" cvt.u32.u16 %r8, %tid.x;\n"
" mul.lo.u32 %r9, %r7, %r8;\n"
" cvt.u16.u32 %r10, %r9;\n"
" add.s32 %r11, %r7, %r10;\n"
" setp.ge.s32 %p2, %r4, %r11;\n"
" @%p2 bra $Lt_0_8962;\n"
" bra.uni $LBB23_B2gCudaSearchBNDMq;\n"
"$Lt_0_8962:\n"
" .loc 15 44 0\n"
" mul24.lo.s32 %r12, %r7, 2;\n"
" sub.s32 %r13, %r12, 1;\n"
" mov.s32 %r14, %r13;\n"
" cvt.u16.u32 %r15, %r14;\n"
" mov.s32 %r16, %r15;\n"
" add.s32 %r17, %r10, %r15;\n"
" set.lt.u32.s32 %r18, %r4, %r17;\n"
" neg.s32 %r19, %r18;\n"
" mov.u32 %r20, 15;\n"
" set.eq.u32.u32 %r21, %r8, %r20;\n"
" neg.s32 %r22, %r21;\n"
" or.b32 %r23, %r19, %r22;\n"
" mov.u32 %r24, 0;\n"
" setp.eq.s32 %p3, %r23, %r24;\n"
" @%p3 bra $Lt_0_9474;\n"
" .loc 15 46 0\n"
" sub.u32 %r25, %r4, %r9;\n"
" cvt.u16.u32 %r16, %r25;\n"
"$Lt_0_9474:\n"
" mov.u32 %r26, 0;\n"
" setp.eq.u32 %p4, %r16, %r26;\n"
" @%p4 bra $Lt_0_9986;\n"
" mov.s32 %r27, %r16;\n"
" ld.param.u64 %rd1, [__cudaparm_B2gCudaSearchBNDMq_offsets];\n"
" mov.u32 %r28, 0;\n"
" mov.s32 %r29, %r27;\n"
"$Lt_0_10498:\n"
" //<loop> Loop body line 46, nesting depth: 1, estimated iterations: unknown\n"
" .loc 15 51 0\n"
" mov.u32 %r30, 0;\n"
" add.u32 %r31, %r10, %r28;\n"
" cvt.u64.u32 %rd2, %r31;\n"
" mul.lo.u64 %rd3, %rd2, 4;\n"
" add.u64 %rd4, %rd1, %rd3;\n"
" st.global.u32 [%rd4+0], %r30;\n"
" add.u32 %r28, %r28, 1;\n"
" setp.ne.u32 %p5, %r16, %r28;\n"
" @%p5 bra $Lt_0_10498;\n"
"$Lt_0_9986:\n"
" sub.u32 %r32, %r16, 1;\n"
" setp.gt.u32 %p6, %r2, %r32;\n"
" @%p6 bra $LBB23_B2gCudaSearchBNDMq;\n"
" ld.param.u64 %rd5, [__cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable];\n"
" ld.param.u64 %rd6, [__cudaparm_B2gCudaSearchBNDMq_B2G];\n"
" ld.param.u64 %rd7, [__cudaparm_B2gCudaSearchBNDMq_buf];\n"
"$Lt_0_11522:\n"
" //<loop> Loop body line 57\n"
" .loc 15 57 0\n"
" add.u32 %r33, %r10, %r3;\n"
" cvt.u64.u32 %rd8, %r33;\n"
" add.u64 %rd9, %rd8, %rd7;\n"
" ld.global.u8 %rh1, [%rd9+0];\n"
" cvt.u64.u8 %rd10, %rh1;\n"
" add.u64 %rd11, %rd10, %rd5;\n"
" ld.global.u8 %r34, [%rd11+0];\n"
" ld.global.u8 %rh2, [%rd9+-1];\n"
" cvt.u64.u8 %rd12, %rh2;\n"
" add.u64 %rd13, %rd12, %rd5;\n"
" ld.global.u8 %r35, [%rd13+0];\n"
" shl.b32 %r36, %r35, 4;\n"
" or.b32 %r37, %r34, %r36;\n"
" cvt.u64.u32 %rd14, %r37;\n"
" mul.lo.u64 %rd15, %rd14, 4;\n"
" add.u64 %rd16, %rd6, %rd15;\n"
" ld.global.u32 %r38, [%rd16+0];\n"
" mov.u32 %r39, 0;\n"
" setp.eq.u32 %p7, %r38, %r39;\n"
" @%p7 bra $Lt_0_258;\n"
" //<loop> Part of loop body line 57, head labeled $Lt_0_11522\n"
" .loc 15 60 0\n"
" mov.s32 %r28, %r3;\n"
" .loc 15 61 0\n"
" sub.u32 %r40, %r3, %r1;\n"
" add.u32 %r41, %r40, 1;\n"
" sub.s32 %r42, %r1, 1;\n"
"$Lt_0_12546:\n"
" //<loop> Loop body line 64\n"
" .loc 15 64 0\n"
" sub.u32 %r28, %r28, 1;\n"
" shr.u32 %r43, %r38, %r42;\n"
" mov.u32 %r44, 0;\n"
" setp.eq.u32 %p8, %r43, %r44;\n"
" @%p8 bra $Lt_0_13314;\n"
" //<loop> Part of loop body line 64, head labeled $Lt_0_12546\n"
" setp.ge.u32 %p9, %r41, %r28;\n"
" @%p9 bra $Lt_0_13570;\n"
" //<loop> Part of loop body line 64, head labeled $Lt_0_12546\n"
" .loc 15 67 0\n"
" mov.s32 %r3, %r28;\n"
" bra.uni $Lt_0_13314;\n"
"$Lt_0_13570:\n"
" //<loop> Part of loop body line 64, head labeled $Lt_0_12546\n"
" .loc 15 69 0\n"
" mov.u32 %r45, 1;\n"
" ld.param.u64 %rd17, [__cudaparm_B2gCudaSearchBNDMq_offsets];\n"
" add.u32 %r46, %r10, %r28;\n"
" cvt.u64.u32 %rd18, %r46;\n"
" mul.lo.u64 %rd19, %rd18, 4;\n"
" add.u64 %rd20, %rd17, %rd19;\n"
" st.global.u32 [%rd20+0], %r45;\n"
"$Lt_0_13314:\n"
"$Lt_0_12802:\n"
" //<loop> Part of loop body line 64, head labeled $Lt_0_12546\n"
" .loc 15 74 0\n"
" mov.u32 %r47, 0;\n"
" setp.eq.u32 %p10, %r28, %r47;\n"
" @%p10 bra $Lt_0_258;\n"
"//<loop> Part of loop body line 64, head labeled $Lt_0_12546\n"
" .loc 15 77 0\n"
" add.u32 %r48, %r10, %r28;\n"
" cvt.u64.u32 %rd21, %r48;\n"
" add.u64 %rd22, %rd21, %rd7;\n"
" ld.global.u8 %rh3, [%rd22+0];\n"
" cvt.u64.u8 %rd23, %rh3;\n"
" add.u64 %rd24, %rd23, %rd5;\n"
" ld.global.u8 %r49, [%rd24+0];\n"
" ld.global.u8 %rh4, [%rd22+-1];\n"
" cvt.u64.u8 %rd25, %rh4;\n"
" add.u64 %rd26, %rd25, %rd5;\n"
" ld.global.u8 %r50, [%rd26+0];\n"
" shl.b32 %r51, %r50, 4;\n"
" or.b32 %r52, %r49, %r51;\n"
" cvt.u64.u32 %rd27, %r52;\n"
" mul.lo.u64 %rd28, %rd27, 4;\n"
" add.u64 %rd29, %rd6, %rd28;\n"
" ld.global.u32 %r53, [%rd29+0];\n"
" shl.b32 %r54, %r38, 1;\n"
" and.b32 %r38, %r53, %r54;\n"
" mov.u32 %r55, 0;\n"
" setp.ne.u32 %p11, %r38, %r55;\n"
" @%p11 bra $Lt_0_12546;\n"
"$Lt_0_258:\n"
"$Lt_0_11778:\n"
" //<loop> Part of loop body line 57, head labeled $Lt_0_11522\n"
" .loc 15 80 0\n"
" add.u32 %r56, %r3, %r1;\n"
" sub.u32 %r3, %r56, 1;\n"
" setp.ge.u32 %p12, %r32, %r3;\n"
" @%p12 bra $Lt_0_11522;\n"
"$LBB23_B2gCudaSearchBNDMq:\n"
" .loc 15 83 0\n"
" exit;\n"
"$LDWend_B2gCudaSearchBNDMq:\n"
" } // B2gCudaSearchBNDMq\n"
"\n";
#else
/**
* \todo Optimize the kernel. Also explore the options for compiling the
* *.cu file at compile/runtime.
*/
const char *b2g_cuda_ptx_image =
" .version 1.4\n"
" .target sm_10, map_f64_to_f32\n"
" .entry B2gCudaSearchBNDMq (\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_offsets,\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_B2G,\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable,\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_buf,\n"
" .param .u16 __cudaparm_B2gCudaSearchBNDMq_arg_buflen,\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_m)\n"
" {\n"
" .reg .u32 %r<81>;\n"
" .reg .pred %p<14>;\n"
" .loc 15 14 0\n"
"$LBB1_B2gCudaSearchBNDMq:\n"
" .loc 15 16 0\n"
" ld.param.u32 %r1, [__cudaparm_B2gCudaSearchBNDMq_m];\n"
" sub.u32 %r2, %r1, 1;\n"
" mov.s32 %r3, %r2;\n"
" .loc 15 22 0\n"
" ld.param.u16 %r4, [__cudaparm_B2gCudaSearchBNDMq_arg_buflen];\n"
" shr.u32 %r5, %r4, 4;\n"
" cvt.u16.u32 %r6, %r5;\n"
" mov.s32 %r7, %r6;\n"
" setp.ge.u32 %p1, %r6, %r1;\n"
" @%p1 bra $Lt_0_8450;\n"
" .loc 15 27 0\n"
" cvt.u16.u32 %r7, %r1;\n"
"$Lt_0_8450:\n"
" cvt.u32.u16 %r8, %tid.x;\n"
" mul.lo.u32 %r9, %r7, %r8;\n"
" cvt.u16.u32 %r10, %r9;\n"
" add.s32 %r11, %r7, %r10;\n"
" setp.ge.s32 %p2, %r4, %r11;\n"
" @%p2 bra $Lt_0_8962;\n"
" bra.uni $LBB23_B2gCudaSearchBNDMq;\n"
"$Lt_0_8962:\n"
" .loc 15 33 0\n"
" mul24.lo.s32 %r12, %r7, 2;\n"
" sub.s32 %r13, %r12, 1;\n"
" mov.s32 %r14, %r13;\n"
" cvt.u16.u32 %r15, %r14;\n"
" mov.s32 %r16, %r15;\n"
" add.s32 %r17, %r10, %r15;\n"
" set.lt.u32.s32 %r18, %r4, %r17;\n"
" neg.s32 %r19, %r18;\n"
" mov.u32 %r20, 15;\n"
" set.eq.u32.u32 %r21, %r8, %r20;\n"
" neg.s32 %r22, %r21;\n"
" or.b32 %r23, %r19, %r22;\n"
" mov.u32 %r24, 0;\n"
" setp.eq.s32 %p3, %r23, %r24;\n"
" @%p3 bra $Lt_0_9474;\n"
" .loc 15 35 0\n"
" sub.u32 %r25, %r4, %r9;\n"
" cvt.u16.u32 %r16, %r25;\n"
"$Lt_0_9474:\n"
" mov.u32 %r26, 0;\n"
" setp.eq.u32 %p4, %r16, %r26;\n"
" @%p4 bra $Lt_0_9986;\n"
" mov.s32 %r27, %r16;\n"
" ld.param.u32 %r28, [__cudaparm_B2gCudaSearchBNDMq_offsets];\n"
" mov.u32 %r29, 0;\n"
" mov.s32 %r30, %r27;\n"
"$Lt_0_10498:\n"
" //<loop> Loop body line 35, nesting depth: 1, estimated iterations: unknown\n"
" .loc 15 40 0\n"
" mov.u32 %r31, 0;\n"
" add.u32 %r32, %r10, %r29;\n"
" mul.lo.u32 %r33, %r32, 4;\n"
" add.u32 %r34, %r28, %r33;\n"
" st.global.u32 [%r34+0], %r31;\n"
" add.u32 %r29, %r29, 1;\n"
" setp.ne.u32 %p5, %r16, %r29;\n"
" @%p5 bra $Lt_0_10498;\n"
"$Lt_0_9986:\n"
" sub.u32 %r35, %r16, 1;\n"
" setp.gt.u32 %p6, %r2, %r35;\n"
" @%p6 bra $LBB23_B2gCudaSearchBNDMq;\n"
" ld.param.u32 %r36, [__cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable];\n"
" ld.param.u32 %r37, [__cudaparm_B2gCudaSearchBNDMq_B2G];\n"
" ld.param.u32 %r38, [__cudaparm_B2gCudaSearchBNDMq_buf];\n"
"$Lt_0_11522:\n"
" //<loop> Loop body line 46\n"
" .loc 15 46 0\n"
" add.u32 %r39, %r10, %r3;\n"
" add.u32 %r40, %r39, %r38;\n"
" ld.global.u8 %r41, [%r40+0];\n"
" add.u32 %r42, %r41, %r36;\n"
" ld.global.u8 %r43, [%r42+0];\n"
" ld.global.u8 %r44, [%r40+-1];\n"
" add.u32 %r45, %r44, %r36;\n"
" ld.global.u8 %r46, [%r45+0];\n"
" shl.b32 %r47, %r46, 4;\n"
" or.b32 %r48, %r43, %r47;\n"
" mul.lo.u32 %r49, %r48, 4;\n"
" add.u32 %r50, %r37, %r49;\n"
" ld.global.u32 %r51, [%r50+0];\n"
" mov.u32 %r52, 0;\n"
" setp.eq.u32 %p7, %r51, %r52;\n"
" @%p7 bra $Lt_0_258;\n"
" //<loop> Part of loop body line 46, head labeled $Lt_0_11522\n"
" .loc 15 49 0\n"
" mov.s32 %r29, %r3;\n"
" .loc 15 50 0\n"
" sub.u32 %r53, %r3, %r1;\n"
" add.u32 %r54, %r53, 1;\n"
" sub.s32 %r55, %r1, 1;\n"
"$Lt_0_12546:\n"
" //<loop> Loop body line 53\n"
" .loc 15 53 0\n"
" sub.u32 %r29, %r29, 1;\n"
" shr.u32 %r56, %r51, %r55;\n"
" mov.u32 %r57, 0;\n"
" setp.eq.u32 %p8, %r56, %r57;\n"
" @%p8 bra $Lt_0_13314;\n"
" //<loop> Part of loop body line 53, head labeled $Lt_0_12546\n"
" setp.ge.u32 %p9, %r54, %r29;\n"
" @%p9 bra $Lt_0_13570;\n"
" //<loop> Part of loop body line 53, head labeled $Lt_0_12546\n"
" .loc 15 56 0\n"
" mov.s32 %r3, %r29;\n"
" bra.uni $Lt_0_13314;\n"
"$Lt_0_13570:\n"
" //<loop> Part of loop body line 53, head labeled $Lt_0_12546\n"
" .loc 15 58 0\n"
" mov.u32 %r58, 1;\n"
" ld.param.u32 %r59, [__cudaparm_B2gCudaSearchBNDMq_offsets];\n"
" add.u32 %r60, %r10, %r29;\n"
" mul.lo.u32 %r61, %r60, 4;\n"
" add.u32 %r62, %r59, %r61;\n"
" st.global.u32 [%r62+0], %r58;\n"
"$Lt_0_13314:\n"
"$Lt_0_12802:\n"
" //<loop> Part of loop body line 53, head labeled $Lt_0_12546\n"
" .loc 15 63 0\n"
" mov.u32 %r63, 0;\n"
" setp.eq.u32 %p10, %r29, %r63;\n"
" @%p10 bra $Lt_0_258;\n"
" //<loop> Part of loop body line 53, head labeled $Lt_0_12546\n"
" .loc 15 66 0\n"
" add.u32 %r64, %r10, %r29;\n"
" add.u32 %r65, %r64, %r38;\n"
" ld.global.u8 %r66, [%r65+0];\n"
" add.u32 %r67, %r66, %r36;\n"
" ld.global.u8 %r68, [%r67+0];\n"
" ld.global.u8 %r69, [%r65+-1];\n"
" add.u32 %r70, %r69, %r36;\n"
" ld.global.u8 %r71, [%r70+0];\n"
" shl.b32 %r72, %r71, 4;\n"
" or.b32 %r73, %r68, %r72;\n"
" mul.lo.u32 %r74, %r73, 4;\n"
" add.u32 %r75, %r37, %r74;\n"
" ld.global.u32 %r76, [%r75+0];\n"
" shl.b32 %r77, %r51, 1;\n"
" and.b32 %r51, %r76, %r77;\n"
" mov.u32 %r78, 0;\n"
" setp.ne.u32 %p11, %r51, %r78;\n"
" @%p11 bra $Lt_0_12546;\n"
"$Lt_0_258:\n"
"$Lt_0_11778:\n"
" //<loop> Part of loop body line 46, head labeled $Lt_0_11522\n"
" .loc 15 69 0\n"
" add.u32 %r79, %r3, %r1;\n"
" sub.u32 %r3, %r79, 1;\n"
" setp.ge.u32 %p12, %r35, %r3;\n"
" @%p12 bra $Lt_0_11522;\n"
"$LBB23_B2gCudaSearchBNDMq:\n"
" .loc 15 72 0\n"
" exit;\n"
"$LDWend_B2gCudaSearchBNDMq:\n"
" } // B2gCudaSearchBNDMq\n"
"\n";
const char *b2g_cuda_ptx_image_32_bit =
" .version 1.4\n"
" .target sm_10, map_f64_to_f32\n"
" .entry B2gCudaSearchBNDMq (\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_offsets,\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_B2G,\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable,\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_buf,\n"
" .param .u16 __cudaparm_B2gCudaSearchBNDMq_arg_buflen,\n"
" .param .u32 __cudaparm_B2gCudaSearchBNDMq_m)\n"
" {\n"
" .reg .u32 %r<81>;\n"
" .reg .pred %p<14>;\n"
" .loc 15 14 0\n"
"$LBB1_B2gCudaSearchBNDMq:\n"
" .loc 15 16 0\n"
" ld.param.u32 %r1, [__cudaparm_B2gCudaSearchBNDMq_m];\n"
" sub.u32 %r2, %r1, 1;\n"
" mov.s32 %r3, %r2;\n"
" .loc 15 22 0\n"
" ld.param.u16 %r4, [__cudaparm_B2gCudaSearchBNDMq_arg_buflen];\n"
" shr.u32 %r5, %r4, 4;\n"
" cvt.u16.u32 %r6, %r5;\n"
" mov.s32 %r7, %r6;\n"
" setp.ge.u32 %p1, %r6, %r1;\n"
" @%p1 bra $Lt_0_8450;\n"
" .loc 15 27 0\n"
" cvt.u16.u32 %r7, %r1;\n"
"$Lt_0_8450:\n"
" cvt.u32.u16 %r8, %tid.x;\n"
" mul.lo.u32 %r9, %r7, %r8;\n"
" cvt.u16.u32 %r10, %r9;\n"
" add.s32 %r11, %r7, %r10;\n"
" setp.ge.s32 %p2, %r4, %r11;\n"
" @%p2 bra $Lt_0_8962;\n"
" bra.uni $LBB23_B2gCudaSearchBNDMq;\n"
"$Lt_0_8962:\n"
" .loc 15 33 0\n"
" mul24.lo.s32 %r12, %r7, 2;\n"
" sub.s32 %r13, %r12, 1;\n"
" mov.s32 %r14, %r13;\n"
" cvt.u16.u32 %r15, %r14;\n"
" mov.s32 %r16, %r15;\n"
" add.s32 %r17, %r10, %r15;\n"
" set.lt.u32.s32 %r18, %r4, %r17;\n"
" neg.s32 %r19, %r18;\n"
" mov.u32 %r20, 15;\n"
" set.eq.u32.u32 %r21, %r8, %r20;\n"
" neg.s32 %r22, %r21;\n"
" or.b32 %r23, %r19, %r22;\n"
" mov.u32 %r24, 0;\n"
" setp.eq.s32 %p3, %r23, %r24;\n"
" @%p3 bra $Lt_0_9474;\n"
" .loc 15 35 0\n"
" sub.u32 %r25, %r4, %r9;\n"
" cvt.u16.u32 %r16, %r25;\n"
"$Lt_0_9474:\n"
" mov.u32 %r26, 0;\n"
" setp.eq.u32 %p4, %r16, %r26;\n"
" @%p4 bra $Lt_0_9986;\n"
" mov.s32 %r27, %r16;\n"
" ld.param.u32 %r28, [__cudaparm_B2gCudaSearchBNDMq_offsets];\n"
" mov.u32 %r29, 0;\n"
" mov.s32 %r30, %r27;\n"
"$Lt_0_10498:\n"
" //<loop> Loop body line 35, nesting depth: 1, estimated iterations: unknown\n"
" .loc 15 40 0\n"
" mov.u32 %r31, 0;\n"
" add.u32 %r32, %r10, %r29;\n"
" mul.lo.u32 %r33, %r32, 4;\n"
" add.u32 %r34, %r28, %r33;\n"
" st.global.u32 [%r34+0], %r31;\n"
" add.u32 %r29, %r29, 1;\n"
" setp.ne.u32 %p5, %r16, %r29;\n"
" @%p5 bra $Lt_0_10498;\n"
"$Lt_0_9986:\n"
" sub.u32 %r35, %r16, 1;\n"
" setp.gt.u32 %p6, %r2, %r35;\n"
" @%p6 bra $LBB23_B2gCudaSearchBNDMq;\n"
" ld.param.u32 %r36, [__cudaparm_B2gCudaSearchBNDMq_g_u8_lowercasetable];\n"
" ld.param.u32 %r37, [__cudaparm_B2gCudaSearchBNDMq_B2G];\n"
" ld.param.u32 %r38, [__cudaparm_B2gCudaSearchBNDMq_buf];\n"
"$Lt_0_11522:\n"
" //<loop> Loop body line 46\n"
" .loc 15 46 0\n"
" add.u32 %r39, %r10, %r3;\n"
" add.u32 %r40, %r39, %r38;\n"
" ld.global.u8 %r41, [%r40+0];\n"
" add.u32 %r42, %r41, %r36;\n"
" ld.global.u8 %r43, [%r42+0];\n"
" ld.global.u8 %r44, [%r40+-1];\n"
" add.u32 %r45, %r44, %r36;\n"
" ld.global.u8 %r46, [%r45+0];\n"
" shl.b32 %r47, %r46, 4;\n"
" or.b32 %r48, %r43, %r47;\n"
" mul.lo.u32 %r49, %r48, 4;\n"
" add.u32 %r50, %r37, %r49;\n"
" ld.global.u32 %r51, [%r50+0];\n"
" mov.u32 %r52, 0;\n"
" setp.eq.u32 %p7, %r51, %r52;\n"
" @%p7 bra $Lt_0_258;\n"
" //<loop> Part of loop body line 46, head labeled $Lt_0_11522\n"
" .loc 15 49 0\n"
" mov.s32 %r29, %r3;\n"
" .loc 15 50 0\n"
" sub.u32 %r53, %r3, %r1;\n"
" add.u32 %r54, %r53, 1;\n"
" sub.s32 %r55, %r1, 1;\n"
"$Lt_0_12546:\n"
" //<loop> Loop body line 53\n"
" .loc 15 53 0\n"
" sub.u32 %r29, %r29, 1;\n"
" shr.u32 %r56, %r51, %r55;\n"
" mov.u32 %r57, 0;\n"
" setp.eq.u32 %p8, %r56, %r57;\n"
" @%p8 bra $Lt_0_13314;\n"
" //<loop> Part of loop body line 53, head labeled $Lt_0_12546\n"
" setp.ge.u32 %p9, %r54, %r29;\n"
" @%p9 bra $Lt_0_13570;\n"
" //<loop> Part of loop body line 53, head labeled $Lt_0_12546\n"
" .loc 15 56 0\n"
" mov.s32 %r3, %r29;\n"
" bra.uni $Lt_0_13314;\n"
"$Lt_0_13570:\n"
" //<loop> Part of loop body line 53, head labeled $Lt_0_12546\n"
" .loc 15 58 0\n"
" mov.u32 %r58, 1;\n"
" ld.param.u32 %r59, [__cudaparm_B2gCudaSearchBNDMq_offsets];\n"
" add.u32 %r60, %r10, %r29;\n"
" mul.lo.u32 %r61, %r60, 4;\n"
" add.u32 %r62, %r59, %r61;\n"
" st.global.u32 [%r62+0], %r58;\n"
"$Lt_0_13314:\n"
"$Lt_0_12802:\n"
" //<loop> Part of loop body line 53, head labeled $Lt_0_12546\n"
" .loc 15 63 0\n"
" mov.u32 %r63, 0;\n"
" setp.eq.u32 %p10, %r29, %r63;\n"
" @%p10 bra $Lt_0_258;\n"
" //<loop> Part of loop body line 53, head labeled $Lt_0_12546\n"
" .loc 15 66 0\n"
" add.u32 %r64, %r10, %r29;\n"
" add.u32 %r65, %r64, %r38;\n"
" ld.global.u8 %r66, [%r65+0];\n"
" add.u32 %r67, %r66, %r36;\n"
" ld.global.u8 %r68, [%r67+0];\n"
" ld.global.u8 %r69, [%r65+-1];\n"
" add.u32 %r70, %r69, %r36;\n"
" ld.global.u8 %r71, [%r70+0];\n"
" shl.b32 %r72, %r71, 4;\n"
" or.b32 %r73, %r68, %r72;\n"
" mul.lo.u32 %r74, %r73, 4;\n"
" add.u32 %r75, %r37, %r74;\n"
" ld.global.u32 %r76, [%r75+0];\n"
" shl.b32 %r77, %r51, 1;\n"
" and.b32 %r51, %r76, %r77;\n"
" mov.u32 %r78, 0;\n"
" setp.ne.u32 %p11, %r51, %r78;\n"
" @%p11 bra $Lt_0_12546;\n"
"$Lt_0_258:\n"
"$Lt_0_11778:\n"
" //<loop> Part of loop body line 46, head labeled $Lt_0_11522\n"
" .loc 15 69 0\n"
" add.u32 %r79, %r3, %r1;\n"
" sub.u32 %r3, %r79, 1;\n"
" setp.ge.u32 %p12, %r35, %r3;\n"
" @%p12 bra $Lt_0_11522;\n"
"$LBB23_B2gCudaSearchBNDMq:\n"
" .loc 15 72 0\n"
" exit;\n"
"$LDWend_B2gCudaSearchBNDMq:\n"
" } // B2gCudaSearchBNDMq\n"
"\n";
#endif
/**
* \brief Register the CUDA B2g Mpm.
@ -1052,10 +1240,18 @@ void B2gCudaInitCtx(MpmCtx *mpm_ctx, int module_handle)
SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda context");
}
if (SCCudaHlGetCudaModule(&ctx->cuda_module, b2g_cuda_ptx_image,
#if defined(__x86_64__) || defined(__ia64__)
if (SCCudaHlGetCudaModule(&ctx->cuda_module, b2g_cuda_ptx_image_64_bit,
module_handle) == -1) {
SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda module");
}
#else
if (SCCudaHlGetCudaModule(&ctx->cuda_module, b2g_cuda_ptx_image_32_bit,
module_handle) == -1) {
SCLogError(SC_ERR_B2G_CUDA_ERROR, "Error getting a cuda module");
}
#endif
if (SCCudaModuleGetFunction(&ctx->cuda_search_kernel, ctx->cuda_module,
B2G_CUDA_SEARCHFUNC_NAME) == -1) {
@ -2983,6 +3179,9 @@ static int B2gCudaTestSearch33(void)
static int B2gCudaTestDeInitTestEnv(void)
{
CUcontext context;
if (SCCudaCtxPopCurrent(&context) == -1)
exit(EXIT_FAILURE);
SCCudaHlDeRegisterModule("B2G_CUDA_TEST");
return 1;

Loading…
Cancel
Save