Merge tag 'misc-habanalabs-next-2022-11-23' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into char-misc-next

Oded writes:

This tag contains habanalabs driver changes for v6.2:

- New feature of graceful hard-reset. Instead of immediately killing the
  user-process when a command submission times out, we wait a bit and give
  the user-process notification and let it try to close things gracefully,
  with the ability to retrieve debug information.

- Enhance the EventFD mechanism. Add new events such as access to illegal
  address (RAZWI), page fault, device unavailable. In addition, change the
  event workqueue to be handled in a single-threaded workqueue.

- Allow the control device to work during reset of the ASIC, to enable
  monitoring applications to continue getting the data.

- Add handling for Gaudi2 with PCI revision 2.

- Reduce severity of prints due to power/thermal events.

- Change how we use the h/w to perform memory scrubbing in Gaudi2.

- Multiple bug fixes, refactors and renames.

* tag 'misc-habanalabs-next-2022-11-23' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux: (63 commits)
  habanalabs: fix VA range calculation
  habanalabs: fail driver load if EEPROM errors detected
  habanalabs: make print of engines idle mask more readable
  habanalabs: clear non-released encapsulated signals
  habanalabs: don't put context in hl_encaps_handle_do_release_sob()
  habanalabs: print context refcount value if hard reset fails
  habanalabs: add RMWREG32_SHIFTED to set a val within a mask
  habanalabs: fix rc when new CPUCP opcodes are not supported
  habanalabs/gaudi2: added memset for the cq_size register
  habanalabs: added return value check for hl_fw_dynamic_send_clear_cmd()
  habanalabs: increase the size of busy engines mask
  habanalabs/gaudi2: change memory scrub mechanism
  habanalabs: extend process wait timeout in device fine
  habanalabs: check schedule_hard_reset correctly
  habanalabs: reset device if still in use when released
  habanalabs/gaudi2: return to reset upon SM SEI BRESP error
  habanalabs/gaudi2: don't enable entries in the MSIX_GW table
  habanalabs/gaudi2: remove redundant firmware version check
  habanalabs/gaudi: fix print for firmware-alive event
  habanalabs: fix print for out-of-sync and pkt-failure events
  ...
This commit is contained in:
Greg Kroah-Hartman
2022-11-29 13:19:29 +01:00
21 changed files with 1271 additions and 533 deletions
+76 -17
View File
@@ -597,6 +597,10 @@ enum gaudi2_engine_id {
GAUDI2_ENGINE_ID_NIC10_1,
GAUDI2_ENGINE_ID_NIC11_0,
GAUDI2_ENGINE_ID_NIC11_1,
GAUDI2_ENGINE_ID_PCIE,
GAUDI2_ENGINE_ID_PSOC,
GAUDI2_ENGINE_ID_ARC_FARM,
GAUDI2_ENGINE_ID_KDMA,
GAUDI2_ENGINE_ID_SIZE
};
@@ -717,6 +721,8 @@ enum hl_server_type {
* HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE - Indicates device is unavailable
* HL_NOTIFIER_EVENT_USER_ENGINE_ERR - Indicates device engine in error state
* HL_NOTIFIER_EVENT_GENERAL_HW_ERR - Indicates device HW error
* HL_NOTIFIER_EVENT_RAZWI - Indicates razwi happened
* HL_NOTIFIER_EVENT_PAGE_FAULT - Indicates page fault happened
*/
#define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0)
#define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1)
@@ -725,6 +731,8 @@ enum hl_server_type {
#define HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE (1ULL << 4)
#define HL_NOTIFIER_EVENT_USER_ENGINE_ERR (1ULL << 5)
#define HL_NOTIFIER_EVENT_GENERAL_HW_ERR (1ULL << 6)
#define HL_NOTIFIER_EVENT_RAZWI (1ULL << 7)
#define HL_NOTIFIER_EVENT_PAGE_FAULT (1ULL << 8)
/* Opcode for management ioctl
*
@@ -778,6 +786,9 @@ enum hl_server_type {
* HL_INFO_UNREGISTER_EVENTFD - Unregister eventfd
* HL_INFO_GET_EVENTS - Retrieve the last occurred events
* HL_INFO_UNDEFINED_OPCODE_EVENT - Retrieve last undefined opcode error information.
* HL_INFO_ENGINE_STATUS - Retrieve the status of all the h/w engines in the asic.
* HL_INFO_PAGE_FAULT_EVENT - Retrieve parameters of captured page fault.
* HL_INFO_USER_MAPPINGS - Retrieve user mappings, captured after page fault event.
*/
#define HL_INFO_HW_IP_INFO 0
#define HL_INFO_HW_EVENTS 1
@@ -809,6 +820,8 @@ enum hl_server_type {
#define HL_INFO_GET_EVENTS 30
#define HL_INFO_UNDEFINED_OPCODE_EVENT 31
#define HL_INFO_ENGINE_STATUS 32
#define HL_INFO_PAGE_FAULT_EVENT 33
#define HL_INFO_USER_MAPPINGS 34
#define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16
@@ -859,6 +872,7 @@ enum hl_server_type {
* @number_of_user_interrupts: The number of interrupts that are available to the userspace
* application to use. Relevant for Gaudi2 and later.
* @device_mem_alloc_default_page_size: default page size used in device memory allocation.
* @revision_id: PCI revision ID of the ASIC.
*/
struct hl_info_hw_ip_info {
__u64 sram_base_address;
@@ -889,6 +903,12 @@ struct hl_info_hw_ip_info {
__u16 pad2;
__u64 reserved4;
__u64 device_mem_alloc_default_page_size;
__u64 reserved5;
__u64 reserved6;
__u32 reserved7;
__u8 reserved8;
__u8 revision_id;
__u8 pad[2];
};
struct hl_info_dram_usage {
@@ -896,7 +916,7 @@ struct hl_info_dram_usage {
__u64 ctx_dram_mem;
};
#define HL_BUSY_ENGINES_MASK_EXT_SIZE 2
#define HL_BUSY_ENGINES_MASK_EXT_SIZE 4
struct hl_info_hw_idle {
__u32 is_idle;
@@ -1071,31 +1091,44 @@ struct hl_info_cs_timeout_event {
__u64 seq;
};
#define HL_RAZWI_PAGE_FAULT 0
#define HL_RAZWI_MMU_ACCESS_ERROR 1
#define HL_RAZWI_NA_ENG_ID U16_MAX
#define HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR 128
#define HL_RAZWI_READ BIT(0)
#define HL_RAZWI_WRITE BIT(1)
#define HL_RAZWI_LBW BIT(2)
#define HL_RAZWI_HBW BIT(3)
#define HL_RAZWI_RR BIT(4)
#define HL_RAZWI_ADDR_DEC BIT(5)
/**
* struct hl_info_razwi_event - razwi information.
* @timestamp: timestamp of razwi.
* @addr: address which accessing it caused razwi.
* @engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does not
* have engine id it will be set to U16_MAX.
* @engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible
* engines which one them caused the razwi. In that case, it will contain the
* second possible engine id, otherwise it will be set to U16_MAX.
* @no_engine_id: if razwi initiator does not have engine id, this field will be set to 1,
* otherwise 0.
* @error_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX.
* @pad: padding to 64 bit.
* @engine_id: engine id of the razwi initiator, if it was initiated by engine that does not
* have engine id it will be set to HL_RAZWI_NA_ENG_ID. If there are several possible
* engines which caused the razwi, it will hold all of them.
* @num_of_possible_engines: contains number of possible engine ids. In some asics, razwi indication
* might be common for several engines and there is no way to get the
* exact engine. In this way, engine_id array will be filled with all
* possible engines caused this razwi. Also, there might be possibility
* in gaudi, where we don't indication on specific engine, in that case
* the value of this parameter will be zero.
* @flags: bitmask for additional data: HL_RAZWI_READ - razwi caused by read operation
* HL_RAZWI_WRITE - razwi caused by write operation
* HL_RAZWI_LBW - razwi caused by lbw fabric transaction
* HL_RAZWI_HBW - razwi caused by hbw fabric transaction
* HL_RAZWI_RR - razwi caused by range register
* HL_RAZWI_ADDR_DEC - razwi caused by address decode error
* Note: this data is not supported by all asics, in that case the relevant bits will not
* be set.
*/
struct hl_info_razwi_event {
__s64 timestamp;
__u64 addr;
__u16 engine_id_1;
__u16 engine_id_2;
__u8 no_engine_id;
__u8 error_type;
__u8 pad[2];
__u16 engine_id[HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR];
__u16 num_of_possible_engines;
__u8 flags;
__u8 pad[5];
};
#define MAX_QMAN_STREAMS_INFO 4
@@ -1174,6 +1207,29 @@ struct hl_info_sec_attest {
__u8 pad0[2];
};
/**
* struct hl_page_fault_info - page fault information.
* @timestamp: timestamp of page fault.
* @addr: address which accessing it caused page fault.
* @engine_id: engine id which caused the page fault, supported only in gaudi3.
*/
struct hl_page_fault_info {
__s64 timestamp;
__u64 addr;
__u16 engine_id;
__u8 pad[6];
};
/**
* struct hl_user_mapping - user mapping information.
* @dev_va: device virtual address.
* @size: virtual address mapping size.
*/
struct hl_user_mapping {
__u64 dev_va;
__u64 size;
};
enum gaudi_dcores {
HL_GAUDI_WS_DCORE,
HL_GAUDI_WN_DCORE,
@@ -1200,6 +1256,8 @@ enum gaudi_dcores {
* needed, hence updating this variable so user will know the exact amount
* of bytes copied by the kernel to the buffer.
* @sec_attest_nonce: Nonce number used for attestation report.
* @array_size: Number of array members copied to user buffer.
* Relevant for HL_INFO_USER_MAPPINGS info ioctl.
* @pad: Padding to 64 bit.
*/
struct hl_info_args {
@@ -1215,6 +1273,7 @@ struct hl_info_args {
__u32 eventfd;
__u32 user_buffer_actual_size;
__u32 sec_attest_nonce;
__u32 array_size;
};
__u32 pad;