diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 47b49cbf67ab..cbb7c29966ff 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -797,10 +797,11 @@ out:
 static void cs_timedout(struct work_struct *work)
 {
 	struct hl_device *hdev;
+	u64 event_mask;
 	int rc;
 	struct hl_cs *cs = container_of(work, struct hl_cs,
 						 work_tdr.work);
-	bool skip_reset_on_timeout = cs->skip_reset_on_timeout;
+	bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false;
 
 	rc = cs_get_unless_zero(cs);
 	if (!rc)
@@ -811,9 +812,16 @@ static void cs_timedout(struct work_struct *work)
 		return;
 	}
 
-	/* Mark the CS is timed out so we won't try to cancel its TDR */
-	if (likely(!skip_reset_on_timeout))
+	/* Resolve hdev first: the reset-policy check below dereferences it */
+	hdev = cs->ctx->hdev;
+
+	if (likely(!skip_reset_on_timeout)) {
+		if (hdev->reset_on_lockup)
+			device_reset = true;
+		else
+			hdev->reset_info.needs_reset = true;
+
+		/* Mark the CS is timed out so we won't try to cancel its TDR */
 		cs->timedout = true;
-
-	hdev = cs->ctx->hdev;
+	}
 
@@ -822,6 +830,11 @@ static void cs_timedout(struct work_struct *work)
 	if (rc) {
 		hdev->last_error.cs_timeout.timestamp = ktime_get();
 		hdev->last_error.cs_timeout.seq = cs->sequence;
+
+		event_mask = device_reset ? (HL_NOTIFIER_EVENT_CS_TIMEOUT |
+				HL_NOTIFIER_EVENT_DEVICE_RESET) : HL_NOTIFIER_EVENT_CS_TIMEOUT;
+
+		hl_notifier_event_send_all(hdev, event_mask);
 	}
 
 	switch (cs->type) {
@@ -856,12 +869,8 @@ static void cs_timedout(struct work_struct *work)
 
 	cs_put(cs);
 
-	if (likely(!skip_reset_on_timeout)) {
-		if (hdev->reset_on_lockup)
-			hl_device_reset(hdev, HL_DRV_RESET_TDR);
-		else
-			hdev->reset_info.needs_reset = true;
-	}
+	if (device_reset)
+		hl_device_reset(hdev, HL_DRV_RESET_TDR);
 }
 
 static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 5f9a6097f5f3..18f86d259421 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -1435,10 +1435,12 @@ struct hl_debug_args {
  * HL_NOTIFIER_EVENT_TPC_ASSERT		- Indicates TPC assert event
  * HL_NOTIFIER_EVENT_UNDEFINED_OPCODE	- Indicates undefined operation code
  * HL_NOTIFIER_EVENT_DEVICE_RESET	- Indicates device requires a reset
+ * HL_NOTIFIER_EVENT_CS_TIMEOUT		- Indicates CS timeout error
  */
 #define HL_NOTIFIER_EVENT_TPC_ASSERT		(1ULL << 0)
 #define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE	(1ULL << 1)
 #define HL_NOTIFIER_EVENT_DEVICE_RESET		(1ULL << 2)
+#define HL_NOTIFIER_EVENT_CS_TIMEOUT		(1ULL << 3)
 
 /*
  * Various information operations such as: