/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "si_pipe.h"
#include "si_build_pm4.h"
#include "si_compute.h"
#include "ac_rgp.h"
#include "ac_sqtt.h"
#include "util/u_memory.h"
#include "tgsi/tgsi_from_mesa.h"

static void
si_emit_spi_config_cntl(struct si_context* sctx,
                        struct radeon_cmdbuf *cs, bool enable);

static bool
si_thread_trace_init_bo(struct si_context *sctx)
{
   unsigned max_se = sctx->screen->info.max_se;
   struct radeon_winsys *ws = sctx->ws;
   uint64_t size;

   /* The buffer size and address need to be aligned in HW regs. Align the
    * size as early as possible so that we do all the allocation & addressing
    * correctly. */
   sctx->thread_trace->buffer_size = align64(sctx->thread_trace->buffer_size,
                                             1u << SQTT_BUFFER_ALIGN_SHIFT);

   /* Compute total size of the thread trace BO for all SEs. */
   size = align64(sizeof(struct ac_thread_trace_info) * max_se,
                  1 << SQTT_BUFFER_ALIGN_SHIFT);
   size += sctx->thread_trace->buffer_size * (uint64_t)max_se;

   sctx->thread_trace->bo =
      ws->buffer_create(ws, size, 4096,
                        RADEON_DOMAIN_VRAM,
                        RADEON_FLAG_NO_INTERPROCESS_SHARING |
                        RADEON_FLAG_GTT_WC |
                        RADEON_FLAG_NO_SUBALLOC);
   if (!sctx->thread_trace->bo)
      return false;

   return true;
}

static bool
si_se_is_disabled(struct si_context* sctx, unsigned se)
{
   /* No active CU on the SE means it is disabled. */
   return sctx->screen->info.cu_mask[se][0] == 0;
}

static void
si_emit_thread_trace_start(struct si_context* sctx,
                           struct radeon_cmdbuf *cs,
                           uint32_t queue_family_index)
{
   struct si_screen *sscreen = sctx->screen;
   uint32_t shifted_size = sctx->thread_trace->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
   unsigned max_se = sscreen->info.max_se;

   radeon_begin(cs);

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);
      uint64_t data_va = ac_thread_trace_get_data_va(&sctx->screen->info, sctx->thread_trace, va, se);
      uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;

      if (si_se_is_disabled(sctx, se))
         continue;

      /* Target SEx and SH0. */
      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_INDEX(se) |
                             S_030800_SH_INDEX(0) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));

      /* Select the first active CUs */
      int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);

      if (sctx->chip_class >= GFX10) {
         /* Order seems important for the following 2 registers. */
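         /* The SQ_THREAD_TRACE_* registers are privileged on GFX10+, hence
          * the privileged-config writes below. Base address and size are
          * programmed in units of 1 << SQTT_BUFFER_ALIGN_SHIFT, which is why
          * the pre-shifted values are used. */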
         radeon_set_privileged_config_reg(R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
                                          S_008D04_SIZE(shifted_size) |
                                          S_008D04_BASE_HI(shifted_va >> 32));

         radeon_set_privileged_config_reg(R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         int wgp = first_active_cu / 2;
         radeon_set_privileged_config_reg(R_008D14_SQ_THREAD_TRACE_MASK,
                                          S_008D14_WTYPE_INCLUDE(0x7f) | /* all shader stages */
                                          S_008D14_SA_SEL(0) |
                                          S_008D14_WGP_SEL(wgp) |
                                          S_008D14_SIMD_SEL(0));

         radeon_set_privileged_config_reg(R_008D18_SQ_THREAD_TRACE_TOKEN_MASK,
                                          S_008D18_REG_INCLUDE(V_008D18_REG_INCLUDE_SQDEC |
                                                               V_008D18_REG_INCLUDE_SHDEC |
                                                               V_008D18_REG_INCLUDE_GFXUDEC |
                                                               V_008D18_REG_INCLUDE_CONTEXT |
                                                               V_008D18_REG_INCLUDE_COMP |
                                                               V_008D18_REG_INCLUDE_CONFIG) |
                                          S_008D18_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF));

         /* Should be emitted last (it enables thread traces). */
         radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          S_008D1C_MODE(1) |
                                          S_008D1C_HIWATER(5) |
                                          S_008D1C_UTIL_TIMER(1) |
                                          S_008D1C_RT_FREQ(2) | /* 4096 clk */
                                          S_008D1C_DRAW_EVENT_EN(1) |
                                          S_008D1C_REG_STALL_EN(1) |
                                          S_008D1C_SPI_STALL_EN(1) |
                                          S_008D1C_SQ_STALL_EN(1) |
                                          S_008D1C_REG_DROP_ON_STALL(0) |
                                          S_008D1C_LOWATER_OFFSET(
                                             sctx->chip_class >= GFX10_3 ? 4 : 0));
      } else {
         /* Order seems important for the following 4 registers. */
         radeon_set_uconfig_reg(R_030CDC_SQ_THREAD_TRACE_BASE2,
                                S_030CDC_ADDR_HI(shifted_va >> 32));

         radeon_set_uconfig_reg(R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);

         radeon_set_uconfig_reg(R_030CC4_SQ_THREAD_TRACE_SIZE,
                                S_030CC4_SIZE(shifted_size));

         radeon_set_uconfig_reg(R_030CD4_SQ_THREAD_TRACE_CTRL,
                                S_030CD4_RESET_BUFFER(1));

         uint32_t thread_trace_mask = S_030CC8_CU_SEL(first_active_cu) |
                                      S_030CC8_SH_SEL(0) |
                                      S_030CC8_SIMD_EN(0xf) |
                                      S_030CC8_VM_ID_MASK(0) |
                                      S_030CC8_REG_STALL_EN(1) |
                                      S_030CC8_SPI_STALL_EN(1) |
                                      S_030CC8_SQ_STALL_EN(1);

         radeon_set_uconfig_reg(R_030CC8_SQ_THREAD_TRACE_MASK,
                                thread_trace_mask);

         /* Trace all tokens and registers. */
         radeon_set_uconfig_reg(R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
                                S_030CCC_TOKEN_MASK(0xbfff) |
                                S_030CCC_REG_MASK(0xff) |
                                S_030CCC_REG_DROP_ON_STALL(0));

         /* Enable SQTT perf counters for all CUs. */
         radeon_set_uconfig_reg(R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
                                S_030CD0_SH0_MASK(0xffff) |
                                S_030CD0_SH1_MASK(0xffff));

         radeon_set_uconfig_reg(R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);

         radeon_set_uconfig_reg(R_030CEC_SQ_THREAD_TRACE_HIWATER,
                                S_030CEC_HIWATER(4));

         if (sctx->chip_class == GFX9) {
            /* Reset thread trace status errors. */
            radeon_set_uconfig_reg(R_030CE8_SQ_THREAD_TRACE_STATUS,
                                   S_030CE8_UTC_ERROR(0));
         }

         /* Enable the thread trace mode. */
         uint32_t thread_trace_mode =
            S_030CD8_MASK_PS(1) |
            S_030CD8_MASK_VS(1) |
            S_030CD8_MASK_GS(1) |
            S_030CD8_MASK_ES(1) |
            S_030CD8_MASK_HS(1) |
            S_030CD8_MASK_LS(1) |
            S_030CD8_MASK_CS(1) |
            S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
            S_030CD8_MODE(1);

         if (sctx->chip_class == GFX9) {
            /* Count SQTT traffic in TCC perf counters. */
            thread_trace_mode |= S_030CD8_TC_PERF_EN(1);
         }

         radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE,
                                thread_trace_mode);
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) |
                          S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));

   /* Start the thread trace with a different event based on the queue. */
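   /* The gfx queue uses the THREAD_TRACE_START event, while compute queues
    * toggle a SH register instead (presumably because the start/stop events
    * only apply to the gfx ring). */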
   if (queue_family_index == RING_COMPUTE) {
      radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
                        S_00B878_THREAD_TRACE_ENABLE(1));
   } else {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
   }
   radeon_end();
}

static const uint32_t gfx9_thread_trace_info_regs[] =
{
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_030CF0_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx10_thread_trace_info_regs[] =
{
   R_008D10_SQ_THREAD_TRACE_WPTR,
   R_008D20_SQ_THREAD_TRACE_STATUS,
   R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
};

static void
si_copy_thread_trace_info_regs(struct si_context* sctx,
                               struct radeon_cmdbuf *cs,
                               unsigned se_index)
{
   const uint32_t *thread_trace_info_regs = NULL;

   switch (sctx->chip_class) {
   case GFX10_3:
   case GFX10:
      thread_trace_info_regs = gfx10_thread_trace_info_regs;
      break;
   case GFX9:
      thread_trace_info_regs = gfx9_thread_trace_info_regs;
      break;
   default:
      unreachable("Unsupported chip_class");
   }

   /* Get the VA where the info struct is stored for this SE. */
   uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);
   uint64_t info_va = ac_thread_trace_get_info_va(va, se_index);

   radeon_begin(cs);

   /* Copy back the info struct one DWORD at a time. */
   for (unsigned i = 0; i < 3; i++) {
      radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
                  COPY_DATA_DST_SEL(COPY_DATA_TC_L2) |
                  COPY_DATA_WR_CONFIRM);
      radeon_emit(thread_trace_info_regs[i] >> 2);
      radeon_emit(0); /* unused */
      radeon_emit((info_va + i * 4));
      radeon_emit((info_va + i * 4) >> 32);
   }
   radeon_end();
}

static void
si_emit_thread_trace_stop(struct si_context *sctx,
                          struct radeon_cmdbuf *cs,
                          uint32_t queue_family_index)
{
   unsigned max_se = sctx->screen->info.max_se;

   radeon_begin(cs);

   /* Stop the thread trace with a different event based on the queue. */
   if (queue_family_index == RING_COMPUTE) {
      radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
                        S_00B878_THREAD_TRACE_ENABLE(0));
   } else {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
   }

   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));
   radeon_end();

   for (unsigned se = 0; se < max_se; se++) {
      if (si_se_is_disabled(sctx, se))
         continue;

      radeon_begin(cs);

      /* Target SEi and SH0. */
      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_INDEX(se) |
                             S_030800_SH_INDEX(0) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (sctx->chip_class >= GFX10) {
         /* Make sure to wait for the trace buffer. */
         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(WAIT_REG_MEM_NOT_EQUAL); /* wait until the register differs from the reference value */
         radeon_emit(R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(0);
         radeon_emit(0); /* reference value */
         radeon_emit(S_008D20_FINISH_DONE(1)); /* mask */
         radeon_emit(4); /* poll interval */

         /* Disable the thread trace mode. */
         radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          S_008D1C_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         radeon_emit(R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(0);
         radeon_emit(0); /* reference value */
         radeon_emit(S_008D20_BUSY(1)); /* mask */
         radeon_emit(4); /* poll interval */
      } else {
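         /* Pre-GFX10 stop sequence: unlike the GFX10 path above, there is
          * no FINISH_DONE poll here; the trace mode is disabled first and
          * only the BUSY bit is polled afterwards. */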
         /* Disable the thread trace mode. */
         radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE,
                                S_030CD8_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         radeon_emit(R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(0);
         radeon_emit(0); /* reference value */
         radeon_emit(S_030CE8_BUSY(1)); /* mask */
         radeon_emit(4); /* poll interval */
      }
      radeon_end();

      si_copy_thread_trace_info_regs(sctx, cs, se);
   }

   /* Restore global broadcasting. */
   radeon_begin_again(cs);
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) |
                          S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));
   radeon_end();
}

static void
si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;

   radeon_begin(cs);

   switch (family) {
   case RING_GFX:
      radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RING_COMPUTE:
      radeon_emit(PKT3(PKT3_NOP, 0, 0));
      radeon_emit(0);
      break;
   }
   radeon_end();

   ws->cs_add_buffer(cs,
                     sctx->thread_trace->bo,
                     RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM,
                     0);

   si_cp_dma_wait_for_idle(sctx, cs);

   /* Make sure to wait-for-idle before starting SQTT. */
   sctx->flags |=
      SI_CONTEXT_PS_PARTIAL_FLUSH |
      SI_CONTEXT_CS_PARTIAL_FLUSH |
      SI_CONTEXT_INV_ICACHE |
      SI_CONTEXT_INV_SCACHE |
      SI_CONTEXT_INV_VCACHE |
      SI_CONTEXT_INV_L2 |
      SI_CONTEXT_PFP_SYNC_ME;
   sctx->emit_cache_flush(sctx, cs);

   si_inhibit_clockgating(sctx, cs, true);

   /* Enable SQG events that collect thread trace data. */
   si_emit_spi_config_cntl(sctx, cs, true);

   si_emit_thread_trace_start(sctx, cs, family);
}

static void
si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;

   radeon_begin(cs);

   switch (family) {
   case RING_GFX:
      radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RING_COMPUTE:
      radeon_emit(PKT3(PKT3_NOP, 0, 0));
      radeon_emit(0);
      break;
   }
   radeon_end();

   ws->cs_add_buffer(cs,
                     sctx->thread_trace->bo,
                     RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM,
                     0);

   si_cp_dma_wait_for_idle(sctx, cs);

   /* Make sure to wait-for-idle before stopping SQTT. */
   sctx->flags |=
      SI_CONTEXT_PS_PARTIAL_FLUSH |
      SI_CONTEXT_CS_PARTIAL_FLUSH |
      SI_CONTEXT_INV_ICACHE |
      SI_CONTEXT_INV_SCACHE |
      SI_CONTEXT_INV_VCACHE |
      SI_CONTEXT_INV_L2 |
      SI_CONTEXT_PFP_SYNC_ME;
   sctx->emit_cache_flush(sctx, cs);

   si_emit_thread_trace_stop(sctx, cs, family);

   /* Restore previous state by disabling SQG events. */
   si_emit_spi_config_cntl(sctx, cs, false);

   si_inhibit_clockgating(sctx, cs, false);
}

static void
si_thread_trace_init_cs(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;

   /* Thread trace start CS (only handles RING_GFX). */
   sctx->thread_trace->start_cs[RING_GFX] = CALLOC_STRUCT(radeon_cmdbuf);
   if (!ws->cs_create(sctx->thread_trace->start_cs[RING_GFX],
                      sctx->ctx, RING_GFX, NULL, NULL, 0)) {
      free(sctx->thread_trace->start_cs[RING_GFX]);
      sctx->thread_trace->start_cs[RING_GFX] = NULL;
      return;
   }

   si_thread_trace_start(sctx, RING_GFX, sctx->thread_trace->start_cs[RING_GFX]);
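   /* Note: the start/stop command streams are recorded once here and then
    * simply flushed by si_begin_thread_trace/si_end_thread_trace whenever a
    * capture is triggered, so all SQTT register programming happens up
    * front. */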
   /* Thread trace stop CS. */
   sctx->thread_trace->stop_cs[RING_GFX] = CALLOC_STRUCT(radeon_cmdbuf);
   if (!ws->cs_create(sctx->thread_trace->stop_cs[RING_GFX],
                      sctx->ctx, RING_GFX, NULL, NULL, 0)) {
      free(sctx->thread_trace->start_cs[RING_GFX]);
      sctx->thread_trace->start_cs[RING_GFX] = NULL;
      free(sctx->thread_trace->stop_cs[RING_GFX]);
      sctx->thread_trace->stop_cs[RING_GFX] = NULL;
      return;
   }

   si_thread_trace_stop(sctx, RING_GFX, sctx->thread_trace->stop_cs[RING_GFX]);
}

static void
si_begin_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->thread_trace->start_cs[RING_GFX];
   sctx->ws->cs_flush(cs, 0, NULL);
}

static void
si_end_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->thread_trace->stop_cs[RING_GFX];
   sctx->ws->cs_flush(cs, 0, &sctx->last_sqtt_fence);
}

static bool
si_get_thread_trace(struct si_context *sctx,
                    struct ac_thread_trace *thread_trace)
{
   unsigned max_se = sctx->screen->info.max_se;

   memset(thread_trace, 0, sizeof(*thread_trace));
   thread_trace->num_traces = max_se;

   sctx->thread_trace->ptr = sctx->ws->buffer_map(sctx->ws, sctx->thread_trace->bo,
                                                  NULL,
                                                  PIPE_MAP_READ);

   if (!sctx->thread_trace->ptr)
      return false;

   void *thread_trace_ptr = sctx->thread_trace->ptr;

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t info_offset = ac_thread_trace_get_info_offset(se);
      uint64_t data_offset = ac_thread_trace_get_data_offset(&sctx->screen->info, sctx->thread_trace, se);
      void *info_ptr = thread_trace_ptr + info_offset;
      void *data_ptr = thread_trace_ptr + data_offset;
      struct ac_thread_trace_info *info =
         (struct ac_thread_trace_info *)info_ptr;

      struct ac_thread_trace_se thread_trace_se = {0};

      if (!ac_is_thread_trace_complete(&sctx->screen->info, sctx->thread_trace, info)) {
         uint32_t expected_size =
            ac_get_expected_buffer_size(&sctx->screen->info, info);
         uint32_t available_size = (info->cur_offset * 32) / 1024;

         fprintf(stderr, "Failed to get the thread trace "
                 "because the buffer is too small. The "
                 "hardware needs %d KB but the "
                 "buffer size is %d KB.\n",
                 expected_size, available_size);
         fprintf(stderr, "Please update the buffer size with "
                 "AMD_THREAD_TRACE_BUFFER_SIZE=<size_in_kbytes>\n");
         return false;
      }

      thread_trace_se.data_ptr = data_ptr;
      thread_trace_se.info = *info;
      thread_trace_se.shader_engine = se;

      int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);

      /* For GFX10+ compute_unit really means WGP */
      thread_trace_se.compute_unit =
         sctx->screen->info.chip_class >= GFX10 ? (first_active_cu / 2) : first_active_cu;

      thread_trace->traces[se] = thread_trace_se;
   }

   thread_trace->data = sctx->thread_trace;
   return true;
}

bool
si_init_thread_trace(struct si_context *sctx)
{
   static bool warn_once = true;
   if (warn_once) {
      fprintf(stderr, "*************************************************\n");
      fprintf(stderr, "* WARNING: Thread trace support is experimental *\n");
      fprintf(stderr, "*************************************************\n");
      warn_once = false;
   }

   sctx->thread_trace = CALLOC_STRUCT(ac_thread_trace_data);

   if (sctx->chip_class < GFX8) {
      fprintf(stderr, "GPU hardware not supported: refer to "
              "the RGP documentation for the list of "
              "supported GPUs!\n");
      return false;
   }

   if (sctx->chip_class > GFX10_3) {
      fprintf(stderr, "radeonsi: Thread trace is not supported "
              "for that GPU!\n");
      return false;
   }
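   /* Capture controls read below; the values shown here are illustrative
    * examples, not driver defaults:
    *
    *   AMD_THREAD_TRACE_BUFFER_SIZE=32768   # per-SE buffer size, in KB
    *   AMD_THREAD_TRACE_TRIGGER=100         # start capturing at frame 100
    *   AMD_THREAD_TRACE_TRIGGER=/tmp/sqtt   # or capture when this file exists
    */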
   /* Default buffer size set to 1MB per SE. */
   sctx->thread_trace->buffer_size = debug_get_num_option("AMD_THREAD_TRACE_BUFFER_SIZE", 1024) * 1024;
   sctx->thread_trace->start_frame = 10;

   const char *trigger = getenv("AMD_THREAD_TRACE_TRIGGER");
   if (trigger) {
      sctx->thread_trace->start_frame = atoi(trigger);
      if (sctx->thread_trace->start_frame <= 0) {
         /* This isn't a frame number, must be a file */
         sctx->thread_trace->trigger_file = strdup(trigger);
         sctx->thread_trace->start_frame = -1;
      }
   }

   if (!si_thread_trace_init_bo(sctx))
      return false;

   list_inithead(&sctx->thread_trace->rgp_pso_correlation.record);
   simple_mtx_init(&sctx->thread_trace->rgp_pso_correlation.lock, mtx_plain);

   list_inithead(&sctx->thread_trace->rgp_loader_events.record);
   simple_mtx_init(&sctx->thread_trace->rgp_loader_events.lock, mtx_plain);

   list_inithead(&sctx->thread_trace->rgp_code_object.record);
   simple_mtx_init(&sctx->thread_trace->rgp_code_object.lock, mtx_plain);

   si_thread_trace_init_cs(sctx);

   sctx->sqtt_next_event = EventInvalid;

   return true;
}

void
si_destroy_thread_trace(struct si_context *sctx)
{
   struct si_screen *sscreen = sctx->screen;
   struct pb_buffer *bo = sctx->thread_trace->bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   if (sctx->thread_trace->trigger_file)
      free(sctx->thread_trace->trigger_file);

   sscreen->ws->cs_destroy(sctx->thread_trace->start_cs[RING_GFX]);
   sscreen->ws->cs_destroy(sctx->thread_trace->stop_cs[RING_GFX]);

   struct rgp_pso_correlation *pso_correlation = &sctx->thread_trace->rgp_pso_correlation;
   struct rgp_loader_events *loader_events = &sctx->thread_trace->rgp_loader_events;
   struct rgp_code_object *code_object = &sctx->thread_trace->rgp_code_object;

   list_for_each_entry_safe(struct rgp_pso_correlation_record, record,
                            &pso_correlation->record, list) {
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_pso_correlation.lock);

   list_for_each_entry_safe(struct rgp_loader_events_record, record,
                            &loader_events->record, list) {
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_loader_events.lock);

   list_for_each_entry_safe(struct rgp_code_object_record, record,
                            &code_object->record, list) {
      uint32_t mask = record->shader_stages_mask;
      int i;

      /* Free the disassembly. */
      while (mask) {
         i = u_bit_scan(&mask);
         free(record->shader_data[i].code);
      }
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_code_object.lock);

   free(sctx->thread_trace);
   sctx->thread_trace = NULL;
}

static uint64_t num_frames = 0;

void
si_handle_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   /* Should we enable SQTT yet? */
   if (!sctx->thread_trace_enabled) {
      bool frame_trigger = num_frames == sctx->thread_trace->start_frame;
      bool file_trigger = false;

      if (sctx->thread_trace->trigger_file &&
          access(sctx->thread_trace->trigger_file, W_OK) == 0) {
         if (unlink(sctx->thread_trace->trigger_file) == 0) {
            file_trigger = true;
         } else {
            /* Do not enable tracing if we cannot remove the file,
             * because by then we'll trace every frame.
             */
            fprintf(stderr, "radeonsi: could not remove thread trace trigger file, ignoring\n");
         }
      }

      if (frame_trigger || file_trigger) {
         /* Wait for last submission */
         sctx->ws->fence_wait(sctx->ws, sctx->last_gfx_fence, PIPE_TIMEOUT_INFINITE);

         /* Start SQTT */
         si_begin_thread_trace(sctx, rcs);

         sctx->thread_trace_enabled = true;
         sctx->thread_trace->start_frame = -1;
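         /* From this point until the next si_handle_thread_trace call, one
          * frame's worth of commands is recorded with SQTT enabled. */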
         /* Force shader update to make sure si_sqtt_describe_pipeline_bind is called
          * for the current "pipeline".
          */
         sctx->do_update_shaders = true;
      }
   } else {
      struct ac_thread_trace thread_trace = {0};

      /* Stop SQTT */
      si_end_thread_trace(sctx, rcs);
      sctx->thread_trace_enabled = false;
      sctx->thread_trace->start_frame = -1;
      assert (sctx->last_sqtt_fence);

      /* Wait for SQTT to finish and read back the bo */
      if (sctx->ws->fence_wait(sctx->ws, sctx->last_sqtt_fence, PIPE_TIMEOUT_INFINITE) &&
          si_get_thread_trace(sctx, &thread_trace)) {
         ac_dump_rgp_capture(&sctx->screen->info, &thread_trace);
      } else {
         fprintf(stderr, "Failed to read the trace\n");
      }
   }

   num_frames++;
}

static void
si_emit_thread_trace_userdata(struct si_context* sctx,
                              struct radeon_cmdbuf *cs,
                              const void *data, uint32_t num_dwords)
{
   const uint32_t *dwords = (uint32_t *)data;

   radeon_begin(cs);

   while (num_dwords > 0) {
      uint32_t count = MIN2(num_dwords, 2);

      /* Without the perfctr bit the CP might not always pass the
       * write on correctly. */
      radeon_set_uconfig_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count, sctx->chip_class >= GFX10);

      radeon_emit_array(dwords, count);

      dwords += count;
      num_dwords -= count;
   }
   radeon_end();
}

static void
si_emit_spi_config_cntl(struct si_context* sctx,
                        struct radeon_cmdbuf *cs, bool enable)
{
   radeon_begin(cs);

   if (sctx->chip_class >= GFX9) {
      uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) |
                                 S_031100_EXP_PRIORITY_ORDER(3) |
                                 S_031100_ENABLE_SQG_TOP_EVENTS(enable) |
                                 S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (sctx->chip_class >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      radeon_set_uconfig_reg(R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      radeon_set_privileged_config_reg(R_009100_SPI_CONFIG_CNTL,
                                       S_009100_ENABLE_SQG_TOP_EVENTS(enable) |
                                       S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
   radeon_end();
}

static uint32_t num_events = 0;
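/* The marker helpers below wrap RGP marker structs in SQ_THREAD_TRACE_USERDATA
 * register writes (see si_emit_thread_trace_userdata), which end up embedded
 * in the trace stream so that RGP can correlate API-level events, barriers
 * and user strings with the hardware trace. Markers are emitted as
 * sizeof(marker) / 4 dwords, so the structs must stay a multiple of 4 bytes. */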
void
si_sqtt_write_event_marker(struct si_context* sctx, struct radeon_cmdbuf *rcs,
                           enum rgp_sqtt_marker_event_type api_type,
                           uint32_t vertex_offset_user_data,
                           uint32_t instance_offset_user_data,
                           uint32_t draw_index_user_data)
{
   struct rgp_sqtt_marker_event marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.api_type = api_type == EventInvalid ? EventCmdDraw : api_type;
   marker.cmd_id = num_events++;
   marker.cb_id = 0;

   if (vertex_offset_user_data == UINT_MAX ||
       instance_offset_user_data == UINT_MAX) {
      vertex_offset_user_data = 0;
      instance_offset_user_data = 0;
   }

   if (draw_index_user_data == UINT_MAX)
      draw_index_user_data = vertex_offset_user_data;

   marker.vertex_offset_reg_idx = vertex_offset_user_data;
   marker.instance_offset_reg_idx = instance_offset_user_data;
   marker.draw_index_reg_idx = draw_index_user_data;

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);

   sctx->sqtt_next_event = EventInvalid;
}

void
si_write_event_with_dims_marker(struct si_context* sctx, struct radeon_cmdbuf *rcs,
                                enum rgp_sqtt_marker_event_type api_type,
                                uint32_t x, uint32_t y, uint32_t z)
{
   struct rgp_sqtt_marker_event_with_dims marker = {0};

   marker.event.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.event.api_type = api_type;
   marker.event.cmd_id = num_events++;
   marker.event.cb_id = 0;
   marker.event.has_thread_dims = 1;

   marker.thread_x = x;
   marker.thread_y = y;
   marker.thread_z = z;

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   sctx->sqtt_next_event = EventInvalid;
}

void
si_sqtt_describe_barrier_start(struct si_context* sctx, struct radeon_cmdbuf *rcs)
{
   struct rgp_sqtt_marker_barrier_start marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
   marker.cb_id = 0;
   marker.dword02 = 0xC0000000 + 10; /* RGP_BARRIER_INTERNAL_BASE */

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

void
si_sqtt_describe_barrier_end(struct si_context* sctx, struct radeon_cmdbuf *rcs,
                             unsigned flags)
{
   struct rgp_sqtt_marker_barrier_end marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END;
   marker.cb_id = 0;

   if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH)
      marker.vs_partial_flush = true;
   if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH)
      marker.ps_partial_flush = true;
   if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH)
      marker.cs_partial_flush = true;

   if (flags & SI_CONTEXT_PFP_SYNC_ME)
      marker.pfp_sync_me = true;

   if (flags & SI_CONTEXT_INV_VCACHE)
      marker.inval_tcp = true;
   if (flags & SI_CONTEXT_INV_ICACHE)
      marker.inval_sqI = true;
   if (flags & SI_CONTEXT_INV_SCACHE)
      marker.inval_sqK = true;
   if (flags & SI_CONTEXT_INV_L2)
      marker.inval_tcc = true;

   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
      marker.inval_cb = true;
      marker.flush_cb = true;
   }
   if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
      marker.inval_db = true;
      marker.flush_db = true;
   }

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

void
si_write_user_event(struct si_context* sctx, struct radeon_cmdbuf *rcs,
                    enum rgp_sqtt_marker_user_event_type type,
                    const char *str, int len)
{
   if (type == UserEventPop) {
      assert (str == NULL);
      struct rgp_sqtt_marker_user_event marker = { 0 };
      marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.data_type = type;

      si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   } else {
      assert (str != NULL);
      struct rgp_sqtt_marker_user_event_with_length marker = { 0 };
      marker.user_event.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.user_event.data_type = type;
      len = MIN2(1024, len);
      marker.length = align(len, 4);

      uint8_t *buffer = alloca(sizeof(marker) + marker.length);
      memcpy(buffer, &marker, sizeof(marker));
      memcpy(buffer + sizeof(marker), str, len);
      buffer[sizeof(marker) + len - 1] = '\0';

      si_emit_thread_trace_userdata(sctx, rcs, buffer,
                                    sizeof(marker) / 4 + marker.length / 4);
   }
}
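/* Pipeline registration: si_sqtt_register_pipeline (below) describes each API
 * pipeline to RGP through three records: a PSO correlation entry, a code
 * object loader event, and a code object record holding copies of the
 * uploaded shader binaries, built by si_sqtt_add_code_object. */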
bool
si_sqtt_pipeline_is_registered(struct ac_thread_trace_data *thread_trace_data,
                               uint64_t pipeline_hash)
{
   simple_mtx_lock(&thread_trace_data->rgp_pso_correlation.lock);
   list_for_each_entry_safe(struct rgp_pso_correlation_record, record,
                            &thread_trace_data->rgp_pso_correlation.record, list) {
      if (record->pipeline_hash[0] == pipeline_hash) {
         simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock);
         return true;
      }
   }
   simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock);

   return false;
}

static enum rgp_hardware_stages
si_sqtt_pipe_to_rgp_shader_stage(struct si_shader_key* key, enum pipe_shader_type stage)
{
   switch (stage) {
   case PIPE_SHADER_VERTEX:
      if (key->as_ls)
         return RGP_HW_STAGE_LS;
      else if (key->as_es)
         return RGP_HW_STAGE_ES;
      else if (key->as_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case PIPE_SHADER_TESS_CTRL:
      return RGP_HW_STAGE_HS;
   case PIPE_SHADER_TESS_EVAL:
      if (key->as_es)
         return RGP_HW_STAGE_ES;
      else if (key->as_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case PIPE_SHADER_GEOMETRY:
      return RGP_HW_STAGE_GS;
   case PIPE_SHADER_FRAGMENT:
      return RGP_HW_STAGE_PS;
   case PIPE_SHADER_COMPUTE:
      return RGP_HW_STAGE_CS;
   default:
      unreachable("invalid mesa shader stage");
   }
}

static bool
si_sqtt_add_code_object(struct si_context* sctx,
                        uint64_t pipeline_hash,
                        bool is_compute)
{
   struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
   struct rgp_code_object *code_object = &thread_trace_data->rgp_code_object;
   struct rgp_code_object_record *record;

   record = malloc(sizeof(struct rgp_code_object_record));
   if (!record)
      return false;

   record->shader_stages_mask = 0;
   record->num_shaders_combined = 0;
   record->pipeline_hash[0] = pipeline_hash;
   record->pipeline_hash[1] = pipeline_hash;

   for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) {
      struct si_shader *shader;
      enum rgp_hardware_stages hw_stage;

      if (is_compute) {
         if (i != PIPE_SHADER_COMPUTE)
            continue;
         shader = &sctx->cs_shader_state.program->shader;
         hw_stage = RGP_HW_STAGE_CS;
      } else if (i != PIPE_SHADER_COMPUTE) {
         if (!sctx->shaders[i].cso || !sctx->shaders[i].current)
            continue;
         shader = sctx->shaders[i].current;
         hw_stage = si_sqtt_pipe_to_rgp_shader_stage(&shader->key, i);
      } else {
         continue;
      }

      uint8_t *code = malloc(shader->binary.uploaded_code_size);
      if (!code) {
         free(record);
         return false;
      }
      memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);

      uint64_t va = shader->bo->gpu_address;
      unsigned gl_shader_stage = tgsi_processor_to_shader_stage(i);
      record->shader_data[gl_shader_stage].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size);
      record->shader_data[gl_shader_stage].hash[1] = record->shader_data[gl_shader_stage].hash[0];
      record->shader_data[gl_shader_stage].code_size = shader->binary.uploaded_code_size;
      record->shader_data[gl_shader_stage].code = code;
      record->shader_data[gl_shader_stage].vgpr_count = shader->config.num_vgprs;
      record->shader_data[gl_shader_stage].sgpr_count = shader->config.num_sgprs;
      record->shader_data[gl_shader_stage].base_address = va & 0xffffffffffff;
      record->shader_data[gl_shader_stage].elf_symbol_offset = 0;
      record->shader_data[gl_shader_stage].hw_stage = hw_stage;
      record->shader_data[gl_shader_stage].is_combined = false;
      record->shader_data[gl_shader_stage].scratch_memory_size = shader->config.scratch_bytes_per_wave;
      record->shader_data[gl_shader_stage].wavefront_size = si_get_shader_wave_size(shader);

      record->shader_stages_mask |= 1 << gl_shader_stage;
      record->num_shaders_combined++;
   }

   simple_mtx_lock(&code_object->lock);
   list_addtail(&record->list, &code_object->record);
   code_object->record_count++;
   simple_mtx_unlock(&code_object->lock);

   return true;
}

bool
si_sqtt_register_pipeline(struct si_context* sctx, uint64_t pipeline_hash, uint64_t base_address, bool is_compute)
{
   struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;

   assert (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_hash));

   bool result = ac_sqtt_add_pso_correlation(thread_trace_data, pipeline_hash);
   if (!result)
      return false;

   result = ac_sqtt_add_code_object_loader_event(thread_trace_data, pipeline_hash, base_address);
   if (!result)
      return false;

   return si_sqtt_add_code_object(sctx, pipeline_hash, is_compute);
}

void
si_sqtt_describe_pipeline_bind(struct si_context* sctx,
                               uint64_t pipeline_hash,
                               int bind_point)
{
   struct rgp_sqtt_marker_pipeline_bind marker = {0};
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   if (likely(!sctx->thread_trace_enabled)) {
      return;
   }

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE;
   marker.cb_id = 0;
   marker.bind_point = bind_point;
   marker.api_pso_hash[0] = pipeline_hash;
   marker.api_pso_hash[1] = pipeline_hash >> 32;

   si_emit_thread_trace_userdata(sctx, cs, &marker, sizeof(marker) / 4);
}