dpdk: add interrupt (power-saving) mode

When the packet load is low, Suricata can run in interrupt
mode. This more closely resembles the classic approach to packet
processing - CPU cores stay mostly idle and only fetch packets
on interrupt.

Ticket: #5839
pull/10204/head
Lukas Sismis 1 year ago committed by Victor Julien
parent ca6f7c2d00
commit 2a2898053c

@ -146,3 +146,41 @@ management and worker CPU set.
- worker-cpu-set: - worker-cpu-set:
cpu: [ 2,4,6,8 ] cpu: [ 2,4,6,8 ]
... ...
Interrupt (power-saving) mode
-----------------------------
DPDK is traditionally recognized for its polling mode operation.
In this mode, CPU cores are continuously querying for packets from
the Network Interface Card (NIC). While this approach offers benefits like
reduced latency and improved performance, it might not be the most efficient
in scenarios with sporadic or low traffic.
The constant polling can lead to unnecessary CPU consumption.
To address this, DPDK offers an `interrupt` mode.
The obvious advantage that interrupt mode brings is power efficiency.
So far in our tests, we haven't observed a decrease in performance. Suricata's
performance has actually seen a slight improvement.
Users of the IPS runmode should be aware that interrupts can
introduce non-deterministic latency. However, the latency should never be
higher than in other (e.g. AF_PACKET/AF_XDP/...) capture methods.
Interrupt mode in DPDK can be configured on a per-interface basis.
This allows for a hybrid setup where some workers operate in polling mode,
while others utilize the interrupt mode.
The configuration for the interrupt mode can be found and modified in the
DPDK section of the suricata.yaml file.
Below is a sample configuration that demonstrates how to enable the interrupt mode for a specific interface:
::
...
dpdk:
eal-params:
proc-type: primary
interfaces:
- interface: 0000:3b:00.0
interrupt-mode: true
threads: 4

@ -111,6 +111,7 @@ static void *ParseDpdkConfigAndConfigureDevice(const char *iface);
static void DPDKDerefConfig(void *conf); static void DPDKDerefConfig(void *conf);
#define DPDK_CONFIG_DEFAULT_THREADS "auto" #define DPDK_CONFIG_DEFAULT_THREADS "auto"
#define DPDK_CONFIG_DEFAULT_INTERRUPT_MODE false
#define DPDK_CONFIG_DEFAULT_MEMPOOL_SIZE 65535 #define DPDK_CONFIG_DEFAULT_MEMPOOL_SIZE 65535
#define DPDK_CONFIG_DEFAULT_MEMPOOL_CACHE_SIZE "auto" #define DPDK_CONFIG_DEFAULT_MEMPOOL_CACHE_SIZE "auto"
#define DPDK_CONFIG_DEFAULT_RX_DESCRIPTORS 1024 #define DPDK_CONFIG_DEFAULT_RX_DESCRIPTORS 1024
@ -126,6 +127,7 @@ static void DPDKDerefConfig(void *conf);
DPDKIfaceConfigAttributes dpdk_yaml = { DPDKIfaceConfigAttributes dpdk_yaml = {
.threads = "threads", .threads = "threads",
.irq_mode = "interrupt-mode",
.promisc = "promisc", .promisc = "promisc",
.multicast = "multicast", .multicast = "multicast",
.checksum_checks = "checksum-checks", .checksum_checks = "checksum-checks",
@ -434,6 +436,15 @@ static int ConfigSetThreads(DPDKIfaceConfig *iconf, const char *entry_str)
SCReturnInt(0); SCReturnInt(0);
} }
/**
 * \brief Store the interrupt-mode choice in the interface configuration.
 *
 * \param iconf interface configuration whose flags are updated
 * \param enable true to set DPDK_IRQ_MODE in iconf->flags; false leaves the flags untouched
 * \return always true
 */
static bool ConfigSetInterruptMode(DPDKIfaceConfig *iconf, bool enable)
{
    SCEnter();
    // OR-ing in zero is a no-op, so the flag is only ever added, never cleared
    iconf->flags |= enable ? DPDK_IRQ_MODE : 0;
    SCReturnBool(true);
}
static int ConfigSetRxQueues(DPDKIfaceConfig *iconf, uint16_t nb_queues) static int ConfigSetRxQueues(DPDKIfaceConfig *iconf, uint16_t nb_queues)
{ {
SCEnter(); SCEnter();
@ -695,6 +706,17 @@ static int ConfigLoad(DPDKIfaceConfig *iconf, const char *iface)
if (retval < 0) if (retval < 0)
SCReturnInt(retval); SCReturnInt(retval);
bool irq_enable;
retval = ConfGetChildValueBoolWithDefault(if_root, if_default, dpdk_yaml.irq_mode, &entry_bool);
if (retval != 1) {
irq_enable = DPDK_CONFIG_DEFAULT_INTERRUPT_MODE;
} else {
irq_enable = entry_bool ? true : false;
}
retval = ConfigSetInterruptMode(iconf, irq_enable);
if (retval != true)
SCReturnInt(-EINVAL);
// currently only mapping "1 thread == 1 RX (and 1 TX queue in IPS mode)" is supported // currently only mapping "1 thread == 1 RX (and 1 TX queue in IPS mode)" is supported
retval = ConfigSetRxQueues(iconf, (uint16_t)iconf->threads); retval = ConfigSetRxQueues(iconf, (uint16_t)iconf->threads);
if (retval < 0) if (retval < 0)
@ -1106,6 +1128,11 @@ static void DeviceInitPortConf(const DPDKIfaceConfig *iconf,
}, },
}; };
SCLogConfig("%s: interrupt mode is %s", iconf->iface,
iconf->flags & DPDK_IRQ_MODE ? "enabled" : "disabled");
if (iconf->flags & DPDK_IRQ_MODE)
port_conf->intr_conf.rxq = 1;
// configure RX offloads // configure RX offloads
if (dev_info->rx_offload_capa & RTE_ETH_RX_OFFLOAD_RSS_HASH) { if (dev_info->rx_offload_capa & RTE_ETH_RX_OFFLOAD_RSS_HASH) {
if (iconf->nb_rx_queues > 1) { if (iconf->nb_rx_queues > 1) {

@ -25,6 +25,7 @@
typedef struct DPDKIfaceConfigAttributes_ { typedef struct DPDKIfaceConfigAttributes_ {
const char *threads; const char *threads;
const char *irq_mode;
const char *promisc; const char *promisc;
const char *multicast; const char *multicast;
const char *checksum_checks; const char *checksum_checks;

@ -93,6 +93,13 @@ TmEcode NoDPDKSupportExit(ThreadVars *tv, const void *initdata, void **data)
#define BURST_SIZE 32 #define BURST_SIZE 32
static struct timeval machine_start_time = { 0, 0 }; static struct timeval machine_start_time = { 0, 0 };
// interrupt mode constants
// the receive loop keeps polling without idling until its empty-poll counter exceeds this value
#define MIN_ZERO_POLL_COUNT 10U
// InterruptsSleepHeuristic() returns MINIMUM_SLEEP_TIME_US below this empty-poll count
// and STANDARD_SLEEP_TIME_US from this count upwards
#define MIN_ZERO_POLL_COUNT_TO_SLEEP 10U
// short idle hint; the receive loop passes it to rte_delay_us()
#define MINIMUM_SLEEP_TIME_US 1U
// long idle hint; the receive loop compares the hint against this value and, when reached,
// turns the RX interrupt on and waits in rte_epoll_wait() instead of delaying
#define STANDARD_SLEEP_TIME_US 100U
// timeout handed to rte_epoll_wait() while waiting for an RX interrupt
#define MAX_EPOLL_TIMEOUT_MS 500U
// one spinlock per port id, held around the RX interrupt enable/disable calls;
// initialized in ReceiveDPDKThreadInit() when interrupt mode is enabled
static rte_spinlock_t intr_lock[RTE_MAX_ETHPORTS];
/** /**
* \brief Structure to hold thread specific variables. * \brief Structure to hold thread specific variables.
@ -104,6 +111,7 @@ typedef struct DPDKThreadVars_ {
TmSlot *slot; TmSlot *slot;
LiveDevice *livedev; LiveDevice *livedev;
ChecksumValidationMode checksum_mode; ChecksumValidationMode checksum_mode;
bool intr_enabled;
/* references to packet and drop counters */ /* references to packet and drop counters */
uint16_t capture_dpdk_packets; uint16_t capture_dpdk_packets;
uint16_t capture_dpdk_rx_errs; uint16_t capture_dpdk_rx_errs;
@ -142,6 +150,40 @@ static uint64_t CyclesToSeconds(uint64_t cycles);
static void DPDKFreeMbufArray(struct rte_mbuf **mbuf_array, uint16_t mbuf_cnt, uint16_t offset); static void DPDKFreeMbufArray(struct rte_mbuf **mbuf_array, uint16_t mbuf_cnt, uint16_t offset);
static uint64_t DPDKGetSeconds(void); static uint64_t DPDKGetSeconds(void);
/**
 * \brief Register the RX interrupt of a port queue for epoll-based waiting.
 *
 * The port and queue ids are packed into one 32-bit value (port id in the upper
 * 16 bits, queue id in the lower 16 bits) that is passed to DPDK as the event's
 * user data. The registration uses RTE_EPOLL_PER_THREAD as the epoll descriptor.
 *
 * \param port_id DPDK port identifier
 * \param queue_id RX queue identifier on that port
 * \return true on success; false if DPDK rejected the request (the error is logged)
 */
static bool InterruptsRXEnable(uint16_t port_id, uint16_t queue_id)
{
    // Widen before shifting: a uint16_t operand is promoted to signed int, so
    // shifting a port id >= 0x8000 left by 16 bits would overflow it.
    uint32_t event_data = ((uint32_t)port_id << UINT16_WIDTH) | (uint32_t)queue_id;
    int32_t ret = rte_eth_dev_rx_intr_ctl_q(port_id, queue_id, RTE_EPOLL_PER_THREAD,
            RTE_INTR_EVENT_ADD, (void *)((uintptr_t)event_data));

    if (ret != 0) {
        // DPDK reports failures as negative errno values, hence the negation
        SCLogError("%s-Q%d: failed to enable interrupt mode: %s", DPDKGetPortNameByPortID(port_id),
                queue_id, rte_strerror(-ret));
        return false;
    }
    return true;
}
/**
 * \brief Pick an idle hint from the number of polls that returned no packets.
 *
 * \param no_pkt_polls_count current count of polls without packets
 * \return MINIMUM_SLEEP_TIME_US while the count is below MIN_ZERO_POLL_COUNT_TO_SLEEP,
 *         STANDARD_SLEEP_TIME_US otherwise
 */
static inline uint32_t InterruptsSleepHeuristic(uint32_t no_pkt_polls_count)
{
    const bool idle_long_enough = no_pkt_polls_count >= MIN_ZERO_POLL_COUNT_TO_SLEEP;

    return idle_long_enough ? STANDARD_SLEEP_TIME_US : MINIMUM_SLEEP_TIME_US;
}
static inline void InterruptsTurnOnOff(uint16_t port_id, uint16_t queue_id, bool on)
{
rte_spinlock_lock(&(intr_lock[port_id]));
if (on)
rte_eth_dev_rx_intr_enable(port_id, queue_id);
else
rte_eth_dev_rx_intr_disable(port_id, queue_id);
rte_spinlock_unlock(&(intr_lock[port_id]));
}
static void DPDKFreeMbufArray(struct rte_mbuf **mbuf_array, uint16_t mbuf_cnt, uint16_t offset) static void DPDKFreeMbufArray(struct rte_mbuf **mbuf_array, uint16_t mbuf_cnt, uint16_t offset)
{ {
for (int i = offset; i < mbuf_cnt; i++) { for (int i = offset; i < mbuf_cnt; i++) {
@ -377,6 +419,11 @@ static TmEcode ReceiveDPDKLoop(ThreadVars *tv, void *data, void *slot)
rte_eth_stats_reset(ptv->port_id); rte_eth_stats_reset(ptv->port_id);
rte_eth_xstats_reset(ptv->port_id); rte_eth_xstats_reset(ptv->port_id);
uint32_t pwd_zero_rx_packet_polls_count = 0;
if (ptv->intr_enabled && !InterruptsRXEnable(ptv->port_id, ptv->queue_id))
SCReturnInt(TM_ECODE_FAILED);
while (1) { while (1) {
if (unlikely(suricata_ctl_flags != 0)) { if (unlikely(suricata_ctl_flags != 0)) {
SCLogDebug("Stopping Suricata!"); SCLogDebug("Stopping Suricata!");
@ -398,7 +445,27 @@ static TmEcode ReceiveDPDKLoop(ThreadVars *tv, void *data, void *slot)
TmThreadsCaptureHandleTimeout(tv, NULL); TmThreadsCaptureHandleTimeout(tv, NULL);
last_timeout_msec = msecs; last_timeout_msec = msecs;
} }
continue;
if (!ptv->intr_enabled)
continue;
pwd_zero_rx_packet_polls_count++;
if (pwd_zero_rx_packet_polls_count <= MIN_ZERO_POLL_COUNT)
continue;
uint32_t pwd_idle_hint = InterruptsSleepHeuristic(pwd_zero_rx_packet_polls_count);
if (pwd_idle_hint < STANDARD_SLEEP_TIME_US) {
rte_delay_us(pwd_idle_hint);
} else {
InterruptsTurnOnOff(ptv->port_id, ptv->queue_id, true);
struct rte_epoll_event event;
rte_epoll_wait(RTE_EPOLL_PER_THREAD, &event, 1, MAX_EPOLL_TIMEOUT_MS);
InterruptsTurnOnOff(ptv->port_id, ptv->queue_id, false);
continue;
}
} else if (ptv->intr_enabled && pwd_zero_rx_packet_polls_count) {
pwd_zero_rx_packet_polls_count = 0;
} }
ptv->pkts += (uint64_t)nb_rx; ptv->pkts += (uint64_t)nb_rx;
@ -522,6 +589,7 @@ static TmEcode ReceiveDPDKThreadInit(ThreadVars *tv, const void *initdata, void
ptv->checksum_mode = dpdk_config->checksum_mode; ptv->checksum_mode = dpdk_config->checksum_mode;
ptv->threads = dpdk_config->threads; ptv->threads = dpdk_config->threads;
ptv->intr_enabled = (dpdk_config->flags & DPDK_IRQ_MODE) ? true : false;
ptv->port_id = dpdk_config->port_id; ptv->port_id = dpdk_config->port_id;
ptv->out_port_id = dpdk_config->out_port_id; ptv->out_port_id = dpdk_config->out_port_id;
ptv->port_socket_id = dpdk_config->socket_id; ptv->port_socket_id = dpdk_config->socket_id;
@ -569,6 +637,9 @@ static TmEcode ReceiveDPDKThreadInit(ThreadVars *tv, const void *initdata, void
"%s: unable to determine NIC's NUMA node, degraded performance can be expected", "%s: unable to determine NIC's NUMA node, degraded performance can be expected",
dpdk_config->iface); dpdk_config->iface);
} }
if (ptv->intr_enabled) {
rte_spinlock_init(&intr_lock[ptv->port_id]);
}
} }
*data = (void *)ptv; *data = (void *)ptv;

@ -38,6 +38,7 @@ typedef enum { DPDK_COPY_MODE_NONE, DPDK_COPY_MODE_TAP, DPDK_COPY_MODE_IPS } Dpd
// General flags // General flags
#define DPDK_PROMISC (1 << 0) /**< Promiscuous mode */ #define DPDK_PROMISC (1 << 0) /**< Promiscuous mode */
#define DPDK_MULTICAST (1 << 1) /**< Enable multicast packets */ #define DPDK_MULTICAST (1 << 1) /**< Enable multicast packets */
#define DPDK_IRQ_MODE (1 << 2) /**< Interrupt mode */
// Offloads // Offloads
#define DPDK_RX_CHECKSUM_OFFLOAD (1 << 4) /**< Enable chsum offload */ #define DPDK_RX_CHECKSUM_OFFLOAD (1 << 4) /**< Enable chsum offload */

@ -747,6 +747,7 @@ dpdk:
# - auto takes all cores # - auto takes all cores
# in IPS mode it is required to specify the number of cores and the numbers on both interfaces must match # in IPS mode it is required to specify the number of cores and the numbers on both interfaces must match
threads: auto threads: auto
# interrupt-mode: false # true to switch to interrupt mode
promisc: true # promiscuous mode - capture all packets promisc: true # promiscuous mode - capture all packets
multicast: true # enables also detection on multicast packets multicast: true # enables also detection on multicast packets
checksum-checks: true # if Suricata should validate checksums checksum-checks: true # if Suricata should validate checksums

Loading…
Cancel
Save