use of pagelocked memory for CUDA

15 years ago · 8adff3c63c
parent 0e8e8e3728
commit 8adff3c63c
2 changed files with 105 additions and 28 deletions
--- a/src/cuda-packet-batcher.c
+++ b/src/cuda-packet-batcher.c
@ -43,6 +43,7 @@
 #include "util-unittest.h"

 #include "util-mpm-b2g-cuda.h"
+#include "util-cuda-handlers.h"
 #include "detect-engine-address.h"
 #include "detect-engine-port.h"
 #include "detect-engine.h"
@ -353,12 +354,21 @@ void SCCudaPBDeAllocSCCudaPBPacketsBuffer(SCCudaPBPacketsBuffer *pb)
    if (pb == NULL)
        return;

-    if (pb->packets_buffer != NULL)
-        free(pb->packets_buffer);
-    if (pb->packets_offset_buffer != NULL)
-        free(pb->packets_offset_buffer);
-    if (pb->packets_payload_offset_buffer != NULL)
-        free(pb->packets_payload_offset_buffer);
+    if (pb->packets_buffer != NULL){
+        if (SCCudaMemFreeHost(pb->packets_buffer) == -1)
+            SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
+                       "packets_buffer\n");
+    }
+    if (pb->packets_offset_buffer != NULL){
+        if (SCCudaMemFreeHost(pb->packets_offset_buffer) == -1)
+            SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
+                       "packets_offset_buffer\n");
+    }
+    if (pb->packets_payload_offset_buffer != NULL){
+        if (SCCudaMemFreeHost(pb->packets_payload_offset_buffer) == -1)
+            SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory "
+                       "packets_payload_offset_buffer\n");
+    }
    if (pb->packets_address_buffer != NULL)
        free(pb->packets_address_buffer);

@ -381,38 +391,65 @@ SCCudaPBPacketsBuffer *SCCudaPBAllocSCCudaPBPacketsBuffer(void)
    }
    memset(pb, 0, sizeof(SCCudaPBPacketsBuffer));

+    /* Register new module, needed for some unit tests */
+    if (SCCudaHlGetModuleHandle("SC_CUDA_PACKET_BATCHER") == -1) {
+        SCCudaHlRegisterModule("SC_CUDA_PACKET_BATCHER");
+    }
+
    /* the buffer for the packets to be sent over to the gpu.  We allot space
     * for at least SC_CUDA_PB_MIN_NO_OF_PACKETS packets, i.e. enough even if
     * every buffered packet is full to the brim */
-    pb->packets_buffer = malloc(sizeof(SCCudaPBPacketDataForGPU) *
-                                SC_CUDA_PB_MIN_NO_OF_PACKETS);
-    if (pb->packets_buffer == NULL) {
-        SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
+    SCCudaHlModuleData *data = NULL;
+    data = SCCudaHlGetModuleData(SCCudaHlGetModuleHandle("SC_CUDA_PACKET_BATCHER"));
+    if (data == NULL) {
+        SCLogDebug("Module not registered.  To avail the benefits of this "
+                   "registration facility, first register a module using "
+                   "context using SCCudaHlRegisterModule(), after which you "
+                   "can call this function");
+        return NULL;
+    }
+
+    if (SCCudaHlGetCudaContext(&data->cuda_context, data->handle) == -1){
+        SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Error getting cuda context\n");
+        return NULL;
+    }
+
+    if (SCCudaCtxPushCurrent(data->cuda_context) == -1){
+        SCLogError(SC_ERR_CUDA_HANDLER_ERROR,
+                   "Error pushing cuda context to allocate memory\n");
+    }
+
+    if (SCCudaMemHostAlloc((void**)&pb->packets_buffer, sizeof(SCCudaPBPacketDataForGPU) *
+                SC_CUDA_PB_MIN_NO_OF_PACKETS, CU_MEMHOSTALLOC_PORTABLE) == -1){
+        SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory\n");
        exit(EXIT_FAILURE);
    }
    memset(pb->packets_buffer, 0, sizeof(SCCudaPBPacketDataForGPU) *
           SC_CUDA_PB_MIN_NO_OF_PACKETS);

    /* used to hold the offsets of the buffered packets in the packets_buffer */
-    pb->packets_offset_buffer = malloc(sizeof(uint32_t) *
-                                       SC_CUDA_PB_MIN_NO_OF_PACKETS);
-    if (pb->packets_offset_buffer == NULL) {
-        SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
+    if (SCCudaMemHostAlloc((void**)&pb->packets_offset_buffer, sizeof(uint32_t) *
+                SC_CUDA_PB_MIN_NO_OF_PACKETS, CU_MEMHOSTALLOC_PORTABLE) == -1){
+        SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory\n");
        exit(EXIT_FAILURE);
    }
    memset(pb->packets_offset_buffer, 0, sizeof(uint32_t) *
           SC_CUDA_PB_MIN_NO_OF_PACKETS);

    /* used to hold the offsets of the packets payload */
-    pb->packets_payload_offset_buffer = malloc(sizeof(uint32_t) *
-                                               SC_CUDA_PB_MIN_NO_OF_PACKETS);
-    if (pb->packets_payload_offset_buffer == NULL) {
-        SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
+    if (SCCudaMemHostAlloc((void**)&pb->packets_payload_offset_buffer, sizeof(uint32_t) *
+                SC_CUDA_PB_MIN_NO_OF_PACKETS, CU_MEMHOSTALLOC_PORTABLE) == -1){
+        SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory\n");
        exit(EXIT_FAILURE);
    }
    memset(pb->packets_payload_offset_buffer, 0, sizeof(uint32_t) *
           SC_CUDA_PB_MIN_NO_OF_PACKETS);

+    SCLogDebug("Allocated pagelocked CUDA memory\n");
+    if (SCCudaCtxPopCurrent(NULL) == -1){
+        SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Could not pop cuda context\n");
+    }
+
    /* used to hold the packet addresses for all the packets buffered inside
     * packets_buffer */
    pb->packets_address_buffer = malloc(sizeof(Packet *) *
@ -477,6 +514,9 @@ TmEcode SCCudaPBThreadInit(ThreadVars *tv, void *initdata, void **data)
    /* the first packet buffer from the queue */
    tctx->curr_pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(&data_queues[tmq_inq->id]);

+    /* register new module */
+    SCCudaHlRegisterModule("SC_CUDA_PACKET_BATCHER");
+
    *data = tctx;

    /* we will need the cuda packet batcher TM's inq for further use later.  Read
@ -715,8 +755,21 @@ TmEcode SCCudaPBThreadDeInit(ThreadVars *tv, void *data)

    if (tctx != NULL) {
        if (tctx->curr_pb != NULL) {
+            if (SCCudaHlPushCudaContextFromModule("SC_CUDA_PACKET_BATCHER") == -1){
+                SCLogError(SC_ERR_CUDA_HANDLER_ERROR,
+                           "Failed to push cuda context from module\n");
+            }
+
            SCCudaPBDeAllocSCCudaPBPacketsBuffer(tctx->curr_pb);
            tctx->curr_pb = NULL;
+
+            if (SCCudaCtxPopCurrent(NULL) == -1){
+                SCLogError(SC_ERR_CUDA_ERROR, "Failed to pop cuda context\n");
+            }
+
+            if (SCCudaHlDeRegisterModule("SC_CUDA_PACKET_BATCHER") == -1){
+                SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Failed to deregister module\n");
+            }
        }
        free(tctx);
    }
@ -759,6 +812,10 @@ void SCCudaPBSetUpQueuesAndBuffers(void)
    tmq_outq->reader_cnt++;
    tmq_outq->writer_cnt++;

+    /* Register a new module to be used by the packet batcher to allocate
+     * page-locked memory */
+    SCCudaHlRegisterModule("SC_CUDA_PACKET_BATCHER");
+
    /* allocate the packet buffer */
    /* \todo need to work out the right no of packet buffers that we need to
     * queue.  I doubt we will need more than 4(as long as we don't run it on
@ -794,17 +851,26 @@ void SCCudaPBCleanUpQueuesAndBuffers(void)
                   "tmq_outq NULL");
        return;
    }
+    if (SCCudaHlPushCudaContextFromModule("SC_CUDA_PACKET_BATCHER") == -1){
+        SCLogError(SC_ERR_CUDA_HANDLER_ERROR, "Could not push cuda context from module\n");
+    }

    /* clean all the buffers present in the inq */
    dq = &data_queues[tmq_inq->id];
    SCMutexLock(&dq->mutex_q);
    while ( (pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(dq)) != NULL) {
        if (pb->packets_buffer != NULL)
-            free(pb->packets_buffer);
+            if (SCCudaMemFreeHost(pb->packets_buffer) == -1)
+                SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
+                           "packets_buffer\n");
        if (pb->packets_offset_buffer != NULL)
-            free(pb->packets_offset_buffer);
+            if (SCCudaMemFreeHost(pb->packets_offset_buffer) == -1)
+                SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
+                           "packets_offset_buffer\n");
        if (pb->packets_payload_offset_buffer != NULL)
-            free(pb->packets_payload_offset_buffer);
+            if (SCCudaMemFreeHost(pb->packets_payload_offset_buffer) == -1)
+                SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
+                           "packets_payload_offset_buffer\n");

        free(pb);
    }
@ -816,14 +882,23 @@ void SCCudaPBCleanUpQueuesAndBuffers(void)
    SCMutexLock(&dq->mutex_q);
    while ( (pb = (SCCudaPBPacketsBuffer *)SCDQDataDequeue(dq)) != NULL) {
        if (pb->packets_buffer != NULL)
-            free(pb->packets_buffer);
+            if (SCCudaMemFreeHost(pb->packets_buffer) == -1)
+                SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
+                           "packets_buffer\n");
        if (pb->packets_offset_buffer != NULL)
-            free(pb->packets_offset_buffer);
+            if (SCCudaMemFreeHost(pb->packets_offset_buffer) == -1)
+                SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
+                           "packets_offset_buffer\n");
        if (pb->packets_payload_offset_buffer != NULL)
-            free(pb->packets_payload_offset_buffer);
+            if (SCCudaMemFreeHost(pb->packets_payload_offset_buffer) == -1)
+                SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
+                           "packets_payload_offset_buffer\n");

        free(pb);
    }
+    if (SCCudaCtxPopCurrent(NULL) == -1){
+        SCLogError(SC_ERR_CUDA_ERROR, "Could not pop cuda context\n");
+    }
    SCMutexUnlock(&dq->mutex_q);
    SCCondSignal(&dq->cond_q);

--- a/src/util-mpm-b2g-cuda.c
+++ b/src/util-mpm-b2g-cuda.c
@ -1781,9 +1781,9 @@ TmEcode B2gCudaMpmDispThreadInit(ThreadVars *tv, void *initdata, void **data)
     * extra 2 bytes (the 1 in 1481 instead of 1480) hold the number of
     * matches for the payload.  The remaining 1480 positions in the buffer
     * hold the match offsets */
-    tctx->results_buffer = malloc(sizeof(uint16_t) * 1481 * SC_CUDA_PB_MIN_NO_OF_PACKETS);
-    if (tctx->results_buffer == NULL) {
-        SCLogError(SC_ERR_MEM_ALLOC, "Error allocating memory");
+    if (SCCudaMemHostAlloc((void**)&tctx->results_buffer, sizeof(uint16_t) * 1481 *
+                SC_CUDA_PB_MIN_NO_OF_PACKETS, CU_MEMHOSTALLOC_PORTABLE) == -1){
+        SCLogError(SC_ERR_CUDA_ERROR, "Error allocating page-locked memory\n");
        exit(EXIT_FAILURE);
    }

@ -1905,7 +1905,9 @@ TmEcode B2gCudaMpmDispThreadDeInit(ThreadVars *tv, void *data)
    }
    SCCudaCtxPushCurrent(dummy_context);

-    free(tctx->results_buffer);
+    if (SCCudaMemFreeHost(tctx->results_buffer) == -1)
+        SCLogError(SC_ERR_CUDA_ERROR, "Error deallocating pagelocked memory: "
+                   "results_buffer\n");
    SCCudaHlFreeCudaDevicePtr("MPM_B2G_RESULTS", tctx->b2g_cuda_module_handle);
    SCCudaHlFreeCudaDevicePtr("MPM_B2G_PACKETS_BUFFER", tctx->b2g_cuda_module_handle);
    SCCudaHlFreeCudaDevicePtr("MPM_B2G_PACKETS_BUFFER_OFFSETS",