Merge branch 'eth-bnxt-handle-invalid-tx-completions-more-gracefully'
Jakub Kicinski says: ==================== eth: bnxt: handle invalid Tx completions more gracefully bnxt trusts the events generated by the device which may lead to kernel crashes. These are extremely rare but they do happen. For a while I thought crashing may be intentional, because device reporting invalid completions should never happen, and having a core dump could be useful if it does. But in practice I haven't found any clues in the core dumps, and panic_on_warn exists. Series was tested by forcing the recovery path manually. Because of how rare the real crashes are I can't confirm it works for the actual device errors until it's been widely deployed. v1: https://lore.kernel.org/all/20230710205611.1198878-1-kuba@kernel.org/ ==================== Link: https://lore.kernel.org/r/20230720010440.1967136-1-kuba@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
@@ -293,6 +293,60 @@ static void bnxt_db_cq(struct bnxt *bp, struct bnxt_db_info *db, u32 idx)
|
||||
BNXT_DB_CQ(db, idx);
|
||||
}
|
||||
|
||||
static void bnxt_queue_fw_reset_work(struct bnxt *bp, unsigned long delay)
|
||||
{
|
||||
if (!(test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)))
|
||||
return;
|
||||
|
||||
if (BNXT_PF(bp))
|
||||
queue_delayed_work(bnxt_pf_wq, &bp->fw_reset_task, delay);
|
||||
else
|
||||
schedule_delayed_work(&bp->fw_reset_task, delay);
|
||||
}
|
||||
|
||||
static void __bnxt_queue_sp_work(struct bnxt *bp)
|
||||
{
|
||||
if (BNXT_PF(bp))
|
||||
queue_work(bnxt_pf_wq, &bp->sp_task);
|
||||
else
|
||||
schedule_work(&bp->sp_task);
|
||||
}
|
||||
|
||||
static void bnxt_queue_sp_work(struct bnxt *bp, unsigned int event)
|
||||
{
|
||||
set_bit(event, &bp->sp_event);
|
||||
__bnxt_queue_sp_work(bp);
|
||||
}
|
||||
|
||||
static void bnxt_sched_reset_rxr(struct bnxt *bp, struct bnxt_rx_ring_info *rxr)
|
||||
{
|
||||
if (!rxr->bnapi->in_reset) {
|
||||
rxr->bnapi->in_reset = true;
|
||||
if (bp->flags & BNXT_FLAG_CHIP_P5)
|
||||
set_bit(BNXT_RESET_TASK_SP_EVENT, &bp->sp_event);
|
||||
else
|
||||
set_bit(BNXT_RST_RING_SP_EVENT, &bp->sp_event);
|
||||
__bnxt_queue_sp_work(bp);
|
||||
}
|
||||
rxr->rx_next_cons = 0xffff;
|
||||
}
|
||||
|
||||
void bnxt_sched_reset_txr(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
|
||||
int idx)
|
||||
{
|
||||
struct bnxt_napi *bnapi = txr->bnapi;
|
||||
|
||||
if (bnapi->tx_fault)
|
||||
return;
|
||||
|
||||
netdev_err(bp->dev, "Invalid Tx completion (ring:%d tx_pkts:%d cons:%u prod:%u i:%d)",
|
||||
txr->txq_index, bnapi->tx_pkts,
|
||||
txr->tx_cons, txr->tx_prod, idx);
|
||||
WARN_ON_ONCE(1);
|
||||
bnapi->tx_fault = 1;
|
||||
bnxt_queue_sp_work(bp, BNXT_RESET_TASK_SP_EVENT);
|
||||
}
|
||||
|
||||
const u16 bnxt_lhint_arr[] = {
|
||||
TX_BD_FLAGS_LHINT_512_AND_SMALLER,
|
||||
TX_BD_FLAGS_LHINT_512_TO_1023,
|
||||
@@ -652,6 +706,11 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts)
|
||||
skb = tx_buf->skb;
|
||||
tx_buf->skb = NULL;
|
||||
|
||||
if (unlikely(!skb)) {
|
||||
bnxt_sched_reset_txr(bp, txr, i);
|
||||
return;
|
||||
}
|
||||
|
||||
tx_bytes += skb->len;
|
||||
|
||||
if (tx_buf->is_push) {
|
||||
@@ -1234,38 +1293,6 @@ static int bnxt_discard_rx(struct bnxt *bp, struct bnxt_cp_ring_info *cpr,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void bnxt_queue_fw_reset_work(struct bnxt *bp, unsigned long delay)
|
||||
{
|
||||
if (!(test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)))
|
||||
return;
|
||||
|
||||
if (BNXT_PF(bp))
|
||||
queue_delayed_work(bnxt_pf_wq, &bp->fw_reset_task, delay);
|
||||
else
|
||||
schedule_delayed_work(&bp->fw_reset_task, delay);
|
||||
}
|
||||
|
||||
static void bnxt_queue_sp_work(struct bnxt *bp)
|
||||
{
|
||||
if (BNXT_PF(bp))
|
||||
queue_work(bnxt_pf_wq, &bp->sp_task);
|
||||
else
|
||||
schedule_work(&bp->sp_task);
|
||||
}
|
||||
|
||||
static void bnxt_sched_reset(struct bnxt *bp, struct bnxt_rx_ring_info *rxr)
|
||||
{
|
||||
if (!rxr->bnapi->in_reset) {
|
||||
rxr->bnapi->in_reset = true;
|
||||
if (bp->flags & BNXT_FLAG_CHIP_P5)
|
||||
set_bit(BNXT_RESET_TASK_SP_EVENT, &bp->sp_event);
|
||||
else
|
||||
set_bit(BNXT_RST_RING_SP_EVENT, &bp->sp_event);
|
||||
bnxt_queue_sp_work(bp);
|
||||
}
|
||||
rxr->rx_next_cons = 0xffff;
|
||||
}
|
||||
|
||||
static u16 bnxt_alloc_agg_idx(struct bnxt_rx_ring_info *rxr, u16 agg_id)
|
||||
{
|
||||
struct bnxt_tpa_idx_map *map = rxr->rx_tpa_idx_map;
|
||||
@@ -1320,7 +1347,7 @@ static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr,
|
||||
netdev_warn(bp->dev, "TPA cons %x, expected cons %x, error code %x\n",
|
||||
cons, rxr->rx_next_cons,
|
||||
TPA_START_ERROR_CODE(tpa_start1));
|
||||
bnxt_sched_reset(bp, rxr);
|
||||
bnxt_sched_reset_rxr(bp, rxr);
|
||||
return;
|
||||
}
|
||||
/* Store cfa_code in tpa_info to use in tpa_end
|
||||
@@ -1844,7 +1871,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr,
|
||||
if (rxr->rx_next_cons != 0xffff)
|
||||
netdev_warn(bp->dev, "RX cons %x != expected cons %x\n",
|
||||
cons, rxr->rx_next_cons);
|
||||
bnxt_sched_reset(bp, rxr);
|
||||
bnxt_sched_reset_rxr(bp, rxr);
|
||||
if (rc1)
|
||||
return rc1;
|
||||
goto next_rx_no_prod_no_len;
|
||||
@@ -1882,7 +1909,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr,
|
||||
!(bp->fw_cap & BNXT_FW_CAP_RING_MONITOR)) {
|
||||
netdev_warn_once(bp->dev, "RX buffer error %x\n",
|
||||
rx_err);
|
||||
bnxt_sched_reset(bp, rxr);
|
||||
bnxt_sched_reset_rxr(bp, rxr);
|
||||
}
|
||||
}
|
||||
goto next_rx_no_len;
|
||||
@@ -2329,7 +2356,7 @@ static int bnxt_async_event_process(struct bnxt *bp,
|
||||
goto async_event_process_exit;
|
||||
}
|
||||
rxr = bp->bnapi[grp_idx]->rx_ring;
|
||||
bnxt_sched_reset(bp, rxr);
|
||||
bnxt_sched_reset_rxr(bp, rxr);
|
||||
goto async_event_process_exit;
|
||||
}
|
||||
case ASYNC_EVENT_CMPL_EVENT_ID_ECHO_REQUEST: {
|
||||
@@ -2384,7 +2411,7 @@ static int bnxt_async_event_process(struct bnxt *bp,
|
||||
default:
|
||||
goto async_event_process_exit;
|
||||
}
|
||||
bnxt_queue_sp_work(bp);
|
||||
__bnxt_queue_sp_work(bp);
|
||||
async_event_process_exit:
|
||||
return 0;
|
||||
}
|
||||
@@ -2413,8 +2440,7 @@ static int bnxt_hwrm_handler(struct bnxt *bp, struct tx_cmp *txcmp)
|
||||
}
|
||||
|
||||
set_bit(vf_id - bp->pf.first_vf_id, bp->pf.vf_event_bmap);
|
||||
set_bit(BNXT_HWRM_EXEC_FWD_REQ_SP_EVENT, &bp->sp_event);
|
||||
bnxt_queue_sp_work(bp);
|
||||
bnxt_queue_sp_work(bp, BNXT_HWRM_EXEC_FWD_REQ_SP_EVENT);
|
||||
break;
|
||||
|
||||
case CMPL_BASE_TYPE_HWRM_ASYNC_EVENT:
|
||||
@@ -2571,7 +2597,7 @@ static int __bnxt_poll_work(struct bnxt *bp, struct bnxt_cp_ring_info *cpr,
|
||||
|
||||
static void __bnxt_poll_work_done(struct bnxt *bp, struct bnxt_napi *bnapi)
|
||||
{
|
||||
if (bnapi->tx_pkts) {
|
||||
if (bnapi->tx_pkts && !bnapi->tx_fault) {
|
||||
bnapi->tx_int(bp, bnapi, bnapi->tx_pkts);
|
||||
bnapi->tx_pkts = 0;
|
||||
}
|
||||
@@ -9424,6 +9450,8 @@ static void bnxt_enable_napi(struct bnxt *bp)
|
||||
struct bnxt_napi *bnapi = bp->bnapi[i];
|
||||
struct bnxt_cp_ring_info *cpr;
|
||||
|
||||
bnapi->tx_fault = 0;
|
||||
|
||||
cpr = &bnapi->cp_ring;
|
||||
if (bnapi->in_reset)
|
||||
cpr->sw_stats.rx.rx_resets++;
|
||||
@@ -11031,8 +11059,7 @@ static void bnxt_set_rx_mode(struct net_device *dev)
|
||||
if (mask != vnic->rx_mask || uc_update || mc_update) {
|
||||
vnic->rx_mask = mask;
|
||||
|
||||
set_bit(BNXT_RX_MASK_SP_EVENT, &bp->sp_event);
|
||||
bnxt_queue_sp_work(bp);
|
||||
bnxt_queue_sp_work(bp, BNXT_RX_MASK_SP_EVENT);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11597,8 +11624,7 @@ static void bnxt_tx_timeout(struct net_device *dev, unsigned int txqueue)
|
||||
struct bnxt *bp = netdev_priv(dev);
|
||||
|
||||
netdev_err(bp->dev, "TX timeout detected, starting reset task!\n");
|
||||
set_bit(BNXT_RESET_TASK_SP_EVENT, &bp->sp_event);
|
||||
bnxt_queue_sp_work(bp);
|
||||
bnxt_queue_sp_work(bp, BNXT_RESET_TASK_SP_EVENT);
|
||||
}
|
||||
|
||||
static void bnxt_fw_health_check(struct bnxt *bp)
|
||||
@@ -11635,8 +11661,7 @@ static void bnxt_fw_health_check(struct bnxt *bp)
|
||||
return;
|
||||
|
||||
fw_reset:
|
||||
set_bit(BNXT_FW_EXCEPTION_SP_EVENT, &bp->sp_event);
|
||||
bnxt_queue_sp_work(bp);
|
||||
bnxt_queue_sp_work(bp, BNXT_FW_EXCEPTION_SP_EVENT);
|
||||
}
|
||||
|
||||
static void bnxt_timer(struct timer_list *t)
|
||||
@@ -11653,21 +11678,15 @@ static void bnxt_timer(struct timer_list *t)
|
||||
if (bp->fw_cap & BNXT_FW_CAP_ERROR_RECOVERY)
|
||||
bnxt_fw_health_check(bp);
|
||||
|
||||
if (BNXT_LINK_IS_UP(bp) && bp->stats_coal_ticks) {
|
||||
set_bit(BNXT_PERIODIC_STATS_SP_EVENT, &bp->sp_event);
|
||||
bnxt_queue_sp_work(bp);
|
||||
}
|
||||
if (BNXT_LINK_IS_UP(bp) && bp->stats_coal_ticks)
|
||||
bnxt_queue_sp_work(bp, BNXT_PERIODIC_STATS_SP_EVENT);
|
||||
|
||||
if (bnxt_tc_flower_enabled(bp)) {
|
||||
set_bit(BNXT_FLOW_STATS_SP_EVENT, &bp->sp_event);
|
||||
bnxt_queue_sp_work(bp);
|
||||
}
|
||||
if (bnxt_tc_flower_enabled(bp))
|
||||
bnxt_queue_sp_work(bp, BNXT_FLOW_STATS_SP_EVENT);
|
||||
|
||||
#ifdef CONFIG_RFS_ACCEL
|
||||
if ((bp->flags & BNXT_FLAG_RFS) && bp->ntp_fltr_count) {
|
||||
set_bit(BNXT_RX_NTP_FLTR_SP_EVENT, &bp->sp_event);
|
||||
bnxt_queue_sp_work(bp);
|
||||
}
|
||||
if ((bp->flags & BNXT_FLAG_RFS) && bp->ntp_fltr_count)
|
||||
bnxt_queue_sp_work(bp, BNXT_RX_NTP_FLTR_SP_EVENT);
|
||||
#endif /*CONFIG_RFS_ACCEL*/
|
||||
|
||||
if (bp->link_info.phy_retry) {
|
||||
@@ -11675,21 +11694,17 @@ static void bnxt_timer(struct timer_list *t)
|
||||
bp->link_info.phy_retry = false;
|
||||
netdev_warn(bp->dev, "failed to update phy settings after maximum retries.\n");
|
||||
} else {
|
||||
set_bit(BNXT_UPDATE_PHY_SP_EVENT, &bp->sp_event);
|
||||
bnxt_queue_sp_work(bp);
|
||||
bnxt_queue_sp_work(bp, BNXT_UPDATE_PHY_SP_EVENT);
|
||||
}
|
||||
}
|
||||
|
||||
if (test_bit(BNXT_STATE_L2_FILTER_RETRY, &bp->state)) {
|
||||
set_bit(BNXT_RX_MASK_SP_EVENT, &bp->sp_event);
|
||||
bnxt_queue_sp_work(bp);
|
||||
}
|
||||
if (test_bit(BNXT_STATE_L2_FILTER_RETRY, &bp->state))
|
||||
bnxt_queue_sp_work(bp, BNXT_RX_MASK_SP_EVENT);
|
||||
|
||||
if ((bp->flags & BNXT_FLAG_CHIP_P5) && !bp->chip_rev &&
|
||||
netif_carrier_ok(dev)) {
|
||||
set_bit(BNXT_RING_COAL_NOW_SP_EVENT, &bp->sp_event);
|
||||
bnxt_queue_sp_work(bp);
|
||||
}
|
||||
netif_carrier_ok(dev))
|
||||
bnxt_queue_sp_work(bp, BNXT_RING_COAL_NOW_SP_EVENT);
|
||||
|
||||
bnxt_restart_timer:
|
||||
mod_timer(&bp->timer, jiffies + bp->current_interval);
|
||||
}
|
||||
@@ -12968,8 +12983,7 @@ static int bnxt_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
|
||||
bp->ntp_fltr_count++;
|
||||
spin_unlock_bh(&bp->ntp_fltr_lock);
|
||||
|
||||
set_bit(BNXT_RX_NTP_FLTR_SP_EVENT, &bp->sp_event);
|
||||
bnxt_queue_sp_work(bp);
|
||||
bnxt_queue_sp_work(bp, BNXT_RX_NTP_FLTR_SP_EVENT);
|
||||
|
||||
return new_fltr->sw_id;
|
||||
|
||||
|
||||
@@ -1008,6 +1008,7 @@ struct bnxt_napi {
|
||||
int);
|
||||
int tx_pkts;
|
||||
u8 events;
|
||||
u8 tx_fault:1;
|
||||
|
||||
u32 flags;
|
||||
#define BNXT_NAPI_FLAG_XDP 0x1
|
||||
@@ -2329,6 +2330,8 @@ int bnxt_get_avail_msix(struct bnxt *bp, int num);
|
||||
int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init);
|
||||
void bnxt_tx_disable(struct bnxt *bp);
|
||||
void bnxt_tx_enable(struct bnxt *bp);
|
||||
void bnxt_sched_reset_txr(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
|
||||
int idx);
|
||||
void bnxt_report_link(struct bnxt *bp);
|
||||
int bnxt_update_link(struct bnxt *bp, bool chng_link_state);
|
||||
int bnxt_hwrm_set_pause(struct bnxt *);
|
||||
|
||||
@@ -149,6 +149,7 @@ void bnxt_tx_int_xdp(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts)
|
||||
tx_buf->action = 0;
|
||||
tx_buf->xdpf = NULL;
|
||||
} else if (tx_buf->action == XDP_TX) {
|
||||
tx_buf->action = 0;
|
||||
rx_doorbell_needed = true;
|
||||
last_tx_cons = tx_cons;
|
||||
|
||||
@@ -158,6 +159,9 @@ void bnxt_tx_int_xdp(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts)
|
||||
tx_buf = &txr->tx_buf_ring[tx_cons];
|
||||
page_pool_recycle_direct(rxr->page_pool, tx_buf->page);
|
||||
}
|
||||
} else {
|
||||
bnxt_sched_reset_txr(bp, txr, i);
|
||||
return;
|
||||
}
|
||||
tx_cons = NEXT_TX(tx_cons);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user