Merge tag 'misc-habanalabs-next-2022-11-23' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into char-misc-next
Oded writes:

This tag contains habanalabs driver changes for v6.2:

- New feature of graceful hard-reset. Instead of immediately killing the user-process when a command submission times out, we wait a bit and give the user-process notification and let it try to close things gracefully, with the ability to retrieve debug information.
- Enhance the EventFD mechanism. Add new events such as access to illegal address (RAZWI), page fault, device unavailable. In addition, change the event workqueue to be handled in a single-threaded workqueue.
- Allow the control device to work during reset of the ASIC, to enable monitoring applications to continue getting the data.
- Add handling for Gaudi2 with PCI revision 2.
- Reduce severity of prints due to power/thermal events.
- Change how we use the h/w to perform memory scrubbing in Gaudi2.
- Multiple bug fixes, refactors and renames.

* tag 'misc-habanalabs-next-2022-11-23' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux: (63 commits)
  habanalabs: fix VA range calculation
  habanalabs: fail driver load if EEPROM errors detected
  habanalabs: make print of engines idle mask more readable
  habanalabs: clear non-released encapsulated signals
  habanalabs: don't put context in hl_encaps_handle_do_release_sob()
  habanalabs: print context refcount value if hard reset fails
  habanalabs: add RMWREG32_SHIFTED to set a val within a mask
  habanalabs: fix rc when new CPUCP opcodes are not supported
  habanalabs/gaudi2: added memset for the cq_size register
  habanalabs: added return value check for hl_fw_dynamic_send_clear_cmd()
  habanalabs: increase the size of busy engines mask
  habanalabs/gaudi2: change memory scrub mechanism
  habanalabs: extend process wait timeout in device fine
  habanalabs: check schedule_hard_reset correctly
  habanalabs: reset device if still in use when released
  habanalabs/gaudi2: return to reset upon SM SEI BRESP error
  habanalabs/gaudi2: don't enable entries in the MSIX_GW table
  habanalabs/gaudi2: remove redundant firmware version check
  habanalabs/gaudi: fix print for firmware-alive event
  habanalabs: fix print for out-of-sync and pkt-failure events
  ...
This commit is contained in:
@@ -597,6 +597,10 @@ enum gaudi2_engine_id {
|
||||
GAUDI2_ENGINE_ID_NIC10_1,
|
||||
GAUDI2_ENGINE_ID_NIC11_0,
|
||||
GAUDI2_ENGINE_ID_NIC11_1,
|
||||
GAUDI2_ENGINE_ID_PCIE,
|
||||
GAUDI2_ENGINE_ID_PSOC,
|
||||
GAUDI2_ENGINE_ID_ARC_FARM,
|
||||
GAUDI2_ENGINE_ID_KDMA,
|
||||
GAUDI2_ENGINE_ID_SIZE
|
||||
};
|
||||
|
||||
@@ -717,6 +721,8 @@ enum hl_server_type {
|
||||
* HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE - Indicates device is unavailable
|
||||
* HL_NOTIFIER_EVENT_USER_ENGINE_ERR - Indicates device engine in error state
|
||||
* HL_NOTIFIER_EVENT_GENERAL_HW_ERR - Indicates device HW error
|
||||
* HL_NOTIFIER_EVENT_RAZWI - Indicates razwi happened
|
||||
* HL_NOTIFIER_EVENT_PAGE_FAULT - Indicates page fault happened
|
||||
*/
|
||||
#define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0)
|
||||
#define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1)
|
||||
@@ -725,6 +731,8 @@ enum hl_server_type {
|
||||
#define HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE (1ULL << 4)
|
||||
#define HL_NOTIFIER_EVENT_USER_ENGINE_ERR (1ULL << 5)
|
||||
#define HL_NOTIFIER_EVENT_GENERAL_HW_ERR (1ULL << 6)
|
||||
#define HL_NOTIFIER_EVENT_RAZWI (1ULL << 7)
|
||||
#define HL_NOTIFIER_EVENT_PAGE_FAULT (1ULL << 8)
|
||||
|
||||
/* Opcode for management ioctl
|
||||
*
|
||||
@@ -778,6 +786,9 @@ enum hl_server_type {
|
||||
* HL_INFO_UNREGISTER_EVENTFD - Unregister eventfd
|
||||
* HL_INFO_GET_EVENTS - Retrieve the last occurred events
|
||||
* HL_INFO_UNDEFINED_OPCODE_EVENT - Retrieve last undefined opcode error information.
|
||||
* HL_INFO_ENGINE_STATUS - Retrieve the status of all the h/w engines in the asic.
|
||||
* HL_INFO_PAGE_FAULT_EVENT - Retrieve parameters of captured page fault.
|
||||
* HL_INFO_USER_MAPPINGS - Retrieve user mappings, captured after page fault event.
|
||||
*/
|
||||
#define HL_INFO_HW_IP_INFO 0
|
||||
#define HL_INFO_HW_EVENTS 1
|
||||
@@ -809,6 +820,8 @@ enum hl_server_type {
|
||||
#define HL_INFO_GET_EVENTS 30
|
||||
#define HL_INFO_UNDEFINED_OPCODE_EVENT 31
|
||||
#define HL_INFO_ENGINE_STATUS 32
|
||||
#define HL_INFO_PAGE_FAULT_EVENT 33
|
||||
#define HL_INFO_USER_MAPPINGS 34
|
||||
|
||||
#define HL_INFO_VERSION_MAX_LEN 128
|
||||
#define HL_INFO_CARD_NAME_MAX_LEN 16
|
||||
@@ -859,6 +872,7 @@ enum hl_server_type {
|
||||
* @number_of_user_interrupts: The number of interrupts that are available to the userspace
|
||||
* application to use. Relevant for Gaudi2 and later.
|
||||
* @device_mem_alloc_default_page_size: default page size used in device memory allocation.
|
||||
* @revision_id: PCI revision ID of the ASIC.
|
||||
*/
|
||||
struct hl_info_hw_ip_info {
|
||||
__u64 sram_base_address;
|
||||
@@ -889,6 +903,12 @@ struct hl_info_hw_ip_info {
|
||||
__u16 pad2;
|
||||
__u64 reserved4;
|
||||
__u64 device_mem_alloc_default_page_size;
|
||||
__u64 reserved5;
|
||||
__u64 reserved6;
|
||||
__u32 reserved7;
|
||||
__u8 reserved8;
|
||||
__u8 revision_id;
|
||||
__u8 pad[2];
|
||||
};
|
||||
|
||||
struct hl_info_dram_usage {
|
||||
@@ -896,7 +916,7 @@ struct hl_info_dram_usage {
|
||||
__u64 ctx_dram_mem;
|
||||
};
|
||||
|
||||
#define HL_BUSY_ENGINES_MASK_EXT_SIZE 2
|
||||
#define HL_BUSY_ENGINES_MASK_EXT_SIZE 4
|
||||
|
||||
struct hl_info_hw_idle {
|
||||
__u32 is_idle;
|
||||
@@ -1071,31 +1091,44 @@ struct hl_info_cs_timeout_event {
|
||||
__u64 seq;
|
||||
};
|
||||
|
||||
#define HL_RAZWI_PAGE_FAULT 0
|
||||
#define HL_RAZWI_MMU_ACCESS_ERROR 1
|
||||
#define HL_RAZWI_NA_ENG_ID U16_MAX
|
||||
#define HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR 128
|
||||
#define HL_RAZWI_READ BIT(0)
|
||||
#define HL_RAZWI_WRITE BIT(1)
|
||||
#define HL_RAZWI_LBW BIT(2)
|
||||
#define HL_RAZWI_HBW BIT(3)
|
||||
#define HL_RAZWI_RR BIT(4)
|
||||
#define HL_RAZWI_ADDR_DEC BIT(5)
|
||||
|
||||
/**
|
||||
* struct hl_info_razwi_event - razwi information.
|
||||
* @timestamp: timestamp of razwi.
|
||||
* @addr: address whose access caused the razwi.
|
||||
* @engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does not
|
||||
* have engine id it will be set to U16_MAX.
|
||||
* @engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible
|
||||
* engines which one them caused the razwi. In that case, it will contain the
|
||||
* second possible engine id, otherwise it will be set to U16_MAX.
|
||||
* @no_engine_id: if razwi initiator does not have engine id, this field will be set to 1,
|
||||
* otherwise 0.
|
||||
* @error_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX.
|
||||
* @pad: padding to 64 bit.
|
||||
* @engine_id: engine id of the razwi initiator, if it was initiated by engine that does not
|
||||
* have engine id it will be set to HL_RAZWI_NA_ENG_ID. If there are several possible
|
||||
* engines which caused the razwi, it will hold all of them.
|
||||
* @num_of_possible_engines: contains number of possible engine ids. In some asics, razwi indication
|
||||
* might be common for several engines and there is no way to get the
|
||||
* exact engine. In this way, engine_id array will be filled with all
|
||||
* possible engines that may have caused this razwi. Also, there might be a possibility
|
||||
* in gaudi, where we don't have an indication of a specific engine; in that case
|
||||
* the value of this parameter will be zero.
|
||||
* @flags: bitmask for additional data: HL_RAZWI_READ - razwi caused by read operation
|
||||
* HL_RAZWI_WRITE - razwi caused by write operation
|
||||
* HL_RAZWI_LBW - razwi caused by lbw fabric transaction
|
||||
* HL_RAZWI_HBW - razwi caused by hbw fabric transaction
|
||||
* HL_RAZWI_RR - razwi caused by range register
|
||||
* HL_RAZWI_ADDR_DEC - razwi caused by address decode error
|
||||
* Note: this data is not supported by all asics, in that case the relevant bits will not
|
||||
* be set.
|
||||
*/
|
||||
struct hl_info_razwi_event {
|
||||
__s64 timestamp;
|
||||
__u64 addr;
|
||||
__u16 engine_id_1;
|
||||
__u16 engine_id_2;
|
||||
__u8 no_engine_id;
|
||||
__u8 error_type;
|
||||
__u8 pad[2];
|
||||
__u16 engine_id[HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR];
|
||||
__u16 num_of_possible_engines;
|
||||
__u8 flags;
|
||||
__u8 pad[5];
|
||||
};
|
||||
|
||||
#define MAX_QMAN_STREAMS_INFO 4
|
||||
@@ -1174,6 +1207,29 @@ struct hl_info_sec_attest {
|
||||
__u8 pad0[2];
|
||||
};
|
||||
|
||||
/**
|
||||
* struct hl_page_fault_info - page fault information.
|
||||
* @timestamp: timestamp of page fault.
|
||||
* @addr: address whose access caused the page fault.
|
||||
* @engine_id: engine id which caused the page fault, supported only in gaudi3.
|
||||
*/
|
||||
struct hl_page_fault_info {
|
||||
__s64 timestamp;
|
||||
__u64 addr;
|
||||
__u16 engine_id;
|
||||
__u8 pad[6];
|
||||
};
|
||||
|
||||
/**
|
||||
* struct hl_user_mapping - user mapping information.
|
||||
* @dev_va: device virtual address.
|
||||
* @size: virtual address mapping size.
|
||||
*/
|
||||
struct hl_user_mapping {
|
||||
__u64 dev_va;
|
||||
__u64 size;
|
||||
};
|
||||
|
||||
enum gaudi_dcores {
|
||||
HL_GAUDI_WS_DCORE,
|
||||
HL_GAUDI_WN_DCORE,
|
||||
@@ -1200,6 +1256,8 @@ enum gaudi_dcores {
|
||||
* needed, hence updating this variable so user will know the exact amount
|
||||
* of bytes copied by the kernel to the buffer.
|
||||
* @sec_attest_nonce: Nonce number used for attestation report.
|
||||
* @array_size: Number of array members copied to user buffer.
|
||||
* Relevant for HL_INFO_USER_MAPPINGS info ioctl.
|
||||
* @pad: Padding to 64 bit.
|
||||
*/
|
||||
struct hl_info_args {
|
||||
@@ -1215,6 +1273,7 @@ struct hl_info_args {
|
||||
__u32 eventfd;
|
||||
__u32 user_buffer_actual_size;
|
||||
__u32 sec_attest_nonce;
|
||||
__u32 array_size;
|
||||
};
|
||||
|
||||
__u32 pad;
|
||||
|
||||
Reference in New Issue
Block a user