/*
 * Copyright (c) 2017 Lima Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#include <limits.h>

#include "gpir.h"

/*
 * GP schedule algorithm (by Connor Abbott)
 *
 * Pre schedule phase:
 * 1. order all nodes in a sequence
 * 2. convert real reg read/write to GP load/store nodes, so that all
 *    variables are SSA
 * 3. do reg alloc for all SSA values with 11 regs (value regs) and spill
 *    with load/store to real regs if needed
 * 4. add fake dependencies like this:
 *    after step 3, the node sequence is
 *      01: r1=r2+r3
 *      02: r4=r1+r2
 *      03: r1=r5+r6
 *    we should add a fake dependency from node 03 to node 02, like a
 *    write-after-read dep. But it is not a real write-after-read dep,
 *    because there is no real r1: it is a value register. We need this
 *    fake dep in the schedule phase to make sure that, at any schedule
 *    point, no more than 11 inputs are needed by the already scheduled
 *    nodes.
 * 5. build the DAG according to all the real and fake deps
 *
 * Schedule phase:
 * 1. Compute the nodes ready to schedule; if there are none, exit
 * 2. Create a new GP instruction and call it the current instr
 * 3. For any node with a use 2 cycles ago whose definition is ready to
 *    schedule, schedule that definition immediately if possible, or else
 *    schedule a move.
 * 4. For any node with a use 2 cycles ago whose definition is neither
 *    scheduled nor ready to schedule, schedule a move immediately to
 *    prevent the value from falling off the queue.
 * 5. Calculate the number of remaining nodes with a use 1 cycle ago but
 *    the definition not yet scheduled, and if there are more than 5,
 *    schedule moves or definitions for the rest now.
 * 6. Schedule the rest of the available nodes into the current instr
 *    using your favorite heuristic.
 * 7. Go to step 1.
 *
 * Step 5 for the current instruction guarantees that steps 3 and 4 for
 * the next instruction will always succeed, so it's only step 5 that can
 * possibly fail. Now, note that the nodes whose definition has not yet
 * been scheduled but which have one or more scheduled uses are exactly
 * the nodes that are live at this point in the final schedule. Therefore
 * there will never be more than 11 of them (guaranteed by the 11 value
 * reg alloc and the fake deps added before scheduling). The worst case
 * for step 5 is that all of these nodes had a use 1 cycle ago, which
 * means that none of them hit case 3 or 4 already, so all 6 slots are
 * still available and step 5 will always succeed. In general, even if
 * there are exactly 11 values live, if n are scheduled in steps 3 and 4,
 * there are 11-n left in step 5, so at most 11-n-5 = 6-n are scheduled
 * in step 5 and therefore at most 6 are scheduled in total, within the
 * limit. So the algorithm will always succeed.
 */
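
/* A worked example of the counting argument above (the numbers are
 * illustrative): suppose exactly 11 values are live and n=3 of them had
 * a use 2 cycles ago. Steps 3 and 4 schedule those 3 definitions or
 * moves. Of the 8 values left with a use 1 cycle ago, step 5 only has
 * to handle 8-5=3 of them now, so 3+3=6 nodes are scheduled in total,
 * exactly the 6 slots one instruction can offer.
 */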

static int gpir_min_dist_alu(gpir_dep *dep)
{
   switch (dep->pred->op) {
   case gpir_op_load_uniform:
   case gpir_op_load_temp:
   case gpir_op_load_reg:
   case gpir_op_load_attribute:
      return 0;

   case gpir_op_complex1:
      return 2;

   default:
      return 1;
   }
}

static int gpir_get_min_dist(gpir_dep *dep)
{
   switch (dep->type) {
   case GPIR_DEP_INPUT:
      switch (dep->succ->op) {
      case gpir_op_store_temp:
      case gpir_op_store_reg:
      case gpir_op_store_varying:
         /* store must use alu node as input */
         if (dep->pred->type == gpir_node_type_load)
            return INT_MAX >> 2;
         else
            return 0;

      default:
         return gpir_min_dist_alu(dep);
      }

   case GPIR_DEP_OFFSET:
      assert(dep->succ->op == gpir_op_store_temp);
      return gpir_min_dist_alu(dep);

   case GPIR_DEP_READ_AFTER_WRITE:
      switch (dep->succ->op) {
      case gpir_op_load_temp:
         assert(dep->pred->op == gpir_op_store_temp);
         return 4;
      case gpir_op_load_reg:
         assert(dep->pred->op == gpir_op_store_reg);
         return 3;
      case gpir_op_load_uniform:
         assert(dep->pred->op == gpir_op_store_temp_load_off0 ||
                dep->pred->op == gpir_op_store_temp_load_off1 ||
                dep->pred->op == gpir_op_store_temp_load_off2);
         return 4;
      default:
         assert(0);
      }

   case GPIR_DEP_WRITE_AFTER_READ:
      switch (dep->pred->op) {
      case gpir_op_load_temp:
         assert(dep->succ->op == gpir_op_store_temp);
         return -3;
      case gpir_op_load_reg:
         assert(dep->succ->op == gpir_op_store_reg);
         return -2;
      case gpir_op_load_uniform:
         assert(dep->succ->op == gpir_op_store_temp_load_off0 ||
                dep->succ->op == gpir_op_store_temp_load_off1 ||
                dep->succ->op == gpir_op_store_temp_load_off2);
         return -3;
      default:
         assert(0);
      }

   case GPIR_DEP_VREG_WRITE_AFTER_READ:
      return 0;

   case GPIR_DEP_VREG_READ_AFTER_WRITE:
      assert(0); /* not possible, this is GPIR_DEP_INPUT */
   }

   return 0;
}

static int gpir_max_dist_alu(gpir_dep *dep)
{
   switch (dep->pred->op) {
   case gpir_op_load_uniform:
   case gpir_op_load_temp:
      return 0;
   case gpir_op_load_attribute:
      return 1;
   case gpir_op_load_reg:
      if (dep->pred->sched.pos < GPIR_INSTR_SLOT_REG0_LOAD0 ||
          dep->pred->sched.pos > GPIR_INSTR_SLOT_REG0_LOAD3)
         return 0;
      else
         return 1;
   case gpir_op_exp2_impl:
   case gpir_op_log2_impl:
   case gpir_op_rcp_impl:
   case gpir_op_rsqrt_impl:
   case gpir_op_store_temp_load_off0:
   case gpir_op_store_temp_load_off1:
   case gpir_op_store_temp_load_off2:
      return 1;
   case gpir_op_mov:
      if (dep->pred->sched.pos == GPIR_INSTR_SLOT_COMPLEX)
         return 1;
      else
         return 2;
   default:
      return 2;
   }
}

static int gpir_get_max_dist(gpir_dep *dep)
{
   switch (dep->type) {
   case GPIR_DEP_INPUT:
      switch (dep->succ->op) {
      case gpir_op_store_temp:
      case gpir_op_store_reg:
      case gpir_op_store_varying:
         return 0;

      default:
         return gpir_max_dist_alu(dep);
      }

   case GPIR_DEP_OFFSET:
      assert(dep->succ->op == gpir_op_store_temp);
      return gpir_max_dist_alu(dep);

   default:
      return INT_MAX >> 2; /* Don't want to overflow... */
   }
}
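
/* The two distance functions above bound where a predecessor may be
 * placed relative to an already scheduled successor. Roughly, for each
 * dependency the scheduler requires
 *
 *    succ->sched.instr + gpir_get_min_dist(dep) <= pred->sched.instr
 *    pred->sched.instr <= succ->sched.instr + gpir_get_max_dist(dep)
 *
 * (instr indices grow from the end of the program toward its start,
 * since nodes are scheduled bottom-up, from uses to definitions).
 * gpir_get_max_start() and gpir_get_min_end() below intersect these
 * windows over all scheduled successors.
 */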

static void schedule_update_distance(gpir_node *node)
{
   if (gpir_node_is_leaf(node)) {
      node->sched.dist = 0;
      return;
   }

   gpir_node_foreach_pred(node, dep) {
      gpir_node *pred = dep->pred;
      if (pred->sched.dist < 0)
         schedule_update_distance(pred);

      int dist = pred->sched.dist + 1;
      if (node->sched.dist < dist)
         node->sched.dist = dist;
   }
}

static void schedule_insert_ready_list(struct list_head *ready_list,
                                       gpir_node *insert_node)
{
   /* A node is either fully ready or partially ready:
    *   fully ready: all successors have been scheduled
    *   partially ready: some of the input successors have been scheduled
    *
    * Both kinds of node need to be inserted into the ready list, but we
    * only schedule a move node for a partially ready node.
    */
   bool ready = true, insert = false;
   gpir_node_foreach_succ(insert_node, dep) {
      gpir_node *succ = dep->succ;
      if (succ->sched.instr >= 0) {
         if (dep->type == GPIR_DEP_INPUT)
            insert = true;
      }
      else
         ready = false;
   }

   insert_node->sched.ready = ready;
   /* for root node */
   insert |= ready;

   if (!insert || insert_node->sched.inserted)
      return;

   /* keep the ready list sorted by descending critical-path distance */
   struct list_head *insert_pos = ready_list;
   list_for_each_entry(gpir_node, node, ready_list, list) {
      if (insert_node->sched.dist > node->sched.dist) {
         insert_pos = &node->list;
         break;
      }
   }

   list_addtail(&insert_node->list, insert_pos);
   insert_node->sched.inserted = true;
}

static int gpir_get_max_start(gpir_node *node)
{
   int max_start = 0;

   /* find the max start instr constrained by all successors */
   gpir_node_foreach_succ(node, dep) {
      gpir_node *succ = dep->succ;
      if (succ->sched.instr < 0)
         continue;

      int start = succ->sched.instr + gpir_get_min_dist(dep);
      if (start > max_start)
         max_start = start;
   }

   return max_start;
}

static int gpir_get_min_end(gpir_node *node)
{
   int min_end = INT_MAX;

   /* find the min end instr constrained by all successors */
   gpir_node_foreach_succ(node, dep) {
      gpir_node *succ = dep->succ;
      if (succ->sched.instr < 0)
         continue;

      int end = succ->sched.instr + gpir_get_max_dist(dep);
      if (end < min_end)
         min_end = end;
   }

   return min_end;
}

static gpir_node *gpir_sched_instr_has_load(gpir_instr *instr, gpir_node *node)
{
   gpir_load_node *load = gpir_node_to_load(node);

   for (int i = GPIR_INSTR_SLOT_REG0_LOAD0; i <= GPIR_INSTR_SLOT_MEM_LOAD3; i++) {
      if (!instr->slots[i])
         continue;

      gpir_load_node *iload = gpir_node_to_load(instr->slots[i]);
      if (load->node.op == iload->node.op &&
          load->index == iload->index &&
          load->component == iload->component)
         return &iload->node;
   }
   return NULL;
}

static bool schedule_try_place_node(gpir_instr *instr, gpir_node *node)
{
   if (node->type == gpir_node_type_load) {
      gpir_node *load = gpir_sched_instr_has_load(instr, node);
      if (load) {
         gpir_debug("same load %d in instr %d for node %d\n",
                    load->index, instr->index, node->index);

         /* don't really merge the two nodes, just fake being scheduled
          * at the same place */
         node->sched.instr = load->sched.instr;
         node->sched.pos = load->sched.pos;
         return true;
      }
   }

   node->sched.instr = instr->index;

   int *slots = gpir_op_infos[node->op].slots;
   for (int i = 0; slots[i] != GPIR_INSTR_SLOT_END; i++) {
      node->sched.pos = slots[i];
      if (node->sched.instr >= gpir_get_max_start(node) &&
          node->sched.instr <= gpir_get_min_end(node) &&
          gpir_instr_try_insert_node(instr, node))
         return true;
   }

   node->sched.instr = -1;
   node->sched.pos = -1;
   return false;
}

static gpir_node *schedule_create_move_node(gpir_node *node)
{
   gpir_alu_node *move = gpir_node_create(node->block, gpir_op_mov);
   if (unlikely(!move))
      return NULL;

   move->children[0] = node;
   move->num_child = 1;

   move->node.sched.instr = -1;
   move->node.sched.pos = -1;
   move->node.sched.dist = node->sched.dist;

   gpir_debug("create move %d for %d\n", move->node.index, node->index);
   return &move->node;
}
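
/* A minimal description of the helper below, based on its code: if the
 * node is already a move, any successor that is still unscheduled or
 * that the move can no longer reach is redirected to the move's child,
 * and the move itself is placed in this instr. For any other node, a
 * new move is created that takes over all of the node's successors, and
 * the original node is taken off the ready list until the move has been
 * scheduled.
 */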
static gpir_node *gpir_sched_node(gpir_instr *instr, gpir_node *node)
{
   if (node->op == gpir_op_mov) {
      gpir_node *child = gpir_node_to_alu(node)->children[0];
      gpir_node_foreach_succ_safe(node, dep) {
         gpir_node *succ = dep->succ;
         if (succ->sched.instr < 0 ||
             instr->index < succ->sched.instr + gpir_get_min_dist(dep)) {
            gpir_node_replace_pred(dep, child);
            if (dep->type == GPIR_DEP_INPUT)
               gpir_node_replace_child(succ, node, child);
         }
      }
      MAYBE_UNUSED bool result = schedule_try_place_node(instr, node);
      assert(result);
      return node;
   }
   else {
      gpir_node *move = schedule_create_move_node(node);
      list_del(&node->list);
      node->sched.ready = false;
      node->sched.inserted = false;
      gpir_node_replace_succ(move, node);
      gpir_node_add_dep(move, node, GPIR_DEP_INPUT);
      return move;
   }
}

static bool gpir_is_input_node(gpir_node *node)
{
   gpir_node_foreach_succ(node, dep) {
      if (dep->type == GPIR_DEP_INPUT)
         return true;
   }
   return false;
}

static int gpir_get_min_scheduled_succ(gpir_node *node)
{
   int min = INT_MAX;
   gpir_node_foreach_succ(node, dep) {
      gpir_node *succ = dep->succ;
      if (succ->sched.instr >= 0 && dep->type == GPIR_DEP_INPUT) {
         if (min > succ->sched.instr)
            min = succ->sched.instr;
      }
   }
   return min;
}
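
/* One scheduling pass over the ready list for the current instr. The
 * passes below roughly follow steps 3-6 of the schedule phase comment
 * at the top of this file: first fully and partially ready nodes that
 * have reached their max dist to a scheduled successor (and so must be
 * placed or moved now), then the count > 5 case of step 5, and finally
 * any remaining fully ready nodes.
 */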
static gpir_node *gpir_sched_instr_pass(gpir_instr *instr,
                                        struct list_head *ready_list)
{
   /* fully ready nodes that have reached their max dist to some successor */
   list_for_each_entry_safe(gpir_node, node, ready_list, list) {
      if (node->sched.ready) {
         int end = gpir_get_min_end(node);
         assert(end >= instr->index);
         if (instr->index < end)
            continue;

         gpir_debug("fully ready max node %d\n", node->index);

         if (schedule_try_place_node(instr, node))
            return node;

         return gpir_sched_node(instr, node);
      }
   }

   /* partially ready nodes that have reached their max dist to some successor */
   list_for_each_entry_safe(gpir_node, node, ready_list, list) {
      if (!node->sched.ready) {
         int end = gpir_get_min_end(node);
         assert(end >= instr->index);
         if (instr->index < end)
            continue;

         gpir_debug("partially ready max node %d\n", node->index);

         return gpir_sched_node(instr, node);
      }
   }

   /* schedule nodes used by the previous instr when count > 5 */
   int count = 0;
   gpir_node *two_slot_node = NULL;
   list_for_each_entry(gpir_node, node, ready_list, list) {
      if (gpir_is_input_node(node)) {
         int min = gpir_get_min_scheduled_succ(node);
         assert(min >= instr->index - 1);
         if (min == instr->index - 1) {
            if (gpir_op_infos[node->op].may_consume_two_slots) {
               two_slot_node = node;
               count += 2;
            }
            else
               count++;
         }
      }
   }

   if (count > 5) {
      /* When no slot is available, we must schedule a move for the two
       * slot node to reduce the count. This results from the dummy_m/f
       * method.
       */
      if (gpir_instr_alu_slot_is_full(instr)) {
         assert(two_slot_node);
         gpir_debug("instr is full, schedule move node for two slot node %d\n",
                    two_slot_node->index);
         return gpir_sched_node(instr, two_slot_node);
      }

      /* schedule fully ready nodes first */
      list_for_each_entry(gpir_node, node, ready_list, list) {
         if (gpir_is_input_node(node)) {
            int min = gpir_get_min_scheduled_succ(node);
            if (min == instr->index - 1 && node->sched.ready) {
               gpir_debug(">5 ready node %d\n", node->index);

               if (schedule_try_place_node(instr, node))
                  return node;
            }
         }
      }

      /* no fully ready node could be scheduled, schedule a partially
       * ready node */
      list_for_each_entry_safe(gpir_node, node, ready_list, list) {
         if (gpir_is_input_node(node)) {
            int min = gpir_get_min_scheduled_succ(node);
            if (min == instr->index - 1 && !node->sched.ready) {
               gpir_debug(">5 partially ready node %d\n", node->index);

               return gpir_sched_node(instr, node);
            }
         }
      }

      /* finally schedule a move for a fully ready node */
      list_for_each_entry_safe(gpir_node, node, ready_list, list) {
         if (gpir_is_input_node(node)) {
            int min = gpir_get_min_scheduled_succ(node);
            if (min == instr->index - 1 && node->sched.ready) {
               gpir_debug(">5 fully ready move node %d\n", node->index);

               return gpir_sched_node(instr, node);
            }
         }
      }
   }

   /* schedule the remaining fully ready nodes */
   list_for_each_entry(gpir_node, node, ready_list, list) {
      if (node->sched.ready) {
         gpir_debug("remain fully ready node %d\n", node->index);

         if (schedule_try_place_node(instr, node))
            return node;
      }
   }

   return NULL;
}

static void schedule_print_pre_one_instr(gpir_instr *instr,
                                         struct list_head *ready_list)
{
   if (!(lima_debug & LIMA_DEBUG_GP))
      return;

   printf("instr %d for ready list:", instr->index);
   list_for_each_entry(gpir_node, node, ready_list, list) {
      printf(" %d/%c", node->index, node->sched.ready ? 'r' : 'p');
   }
   printf("\n");
}
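
/* Sample LIMA_DEBUG_GP output of the helper above (the node indices are
 * illustrative): "instr 10 for ready list: 24/r 21/r 17/p", where 'r'
 * marks a fully ready node and 'p' a partially ready one.
 */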
static void schedule_print_post_one_instr(gpir_instr *instr)
{
   if (!(lima_debug & LIMA_DEBUG_GP))
      return;

   printf("post schedule instr");
   for (int i = 0; i < GPIR_INSTR_SLOT_NUM; i++) {
      if (instr->slots[i])
         printf(" %d/%d", i, instr->slots[i]->index);
   }
   printf("\n");
}

static bool schedule_one_instr(gpir_block *block, struct list_head *ready_list)
{
   gpir_instr *instr = gpir_instr_create(block);
   if (unlikely(!instr))
      return false;

   schedule_print_pre_one_instr(instr, ready_list);

   while (true) {
      gpir_node *node = gpir_sched_instr_pass(instr, ready_list);
      if (!node)
         break;

      if (node->sched.instr < 0)
         schedule_insert_ready_list(ready_list, node);
      else {
         list_del(&node->list);
         list_add(&node->list, &block->node_list);

         gpir_node_foreach_pred(node, dep) {
            gpir_node *pred = dep->pred;
            schedule_insert_ready_list(ready_list, pred);
         }
      }
   }

   schedule_print_post_one_instr(instr);
   return true;
}

static bool schedule_block(gpir_block *block)
{
   /* calculate distance */
   list_for_each_entry(gpir_node, node, &block->node_list, list) {
      if (gpir_node_is_root(node))
         schedule_update_distance(node);
   }

   struct list_head ready_list;
   list_inithead(&ready_list);

   /* construct the ready list from root nodes */
   list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
      if (gpir_node_is_root(node))
         schedule_insert_ready_list(&ready_list, node);
   }

   list_inithead(&block->node_list);

   while (!list_empty(&ready_list)) {
      if (!schedule_one_instr(block, &ready_list))
         return false;
   }

   return true;
}

static void schedule_build_vreg_dependency(gpir_block *block)
{
   gpir_node *regs[GPIR_VALUE_REG_NUM] = {0};
   list_for_each_entry(gpir_node, node, &block->node_list, list) {
      /* store node has no value reg assigned */
      if (node->value_reg < 0)
         continue;

      gpir_node *reg = regs[node->value_reg];
      if (reg) {
         gpir_node_foreach_succ(reg, dep) {
            /* write-after-read dep should only apply to a real 'read' */
            if (dep->type != GPIR_DEP_INPUT)
               continue;

            gpir_node *succ = dep->succ;
            gpir_node_add_dep(node, succ, GPIR_DEP_VREG_WRITE_AFTER_READ);
         }
      }

      regs[node->value_reg] = node;
   }

   /* merge dummy_f/m into the nodes they were created from */
   list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
      if (node->op == gpir_op_dummy_m) {
         gpir_alu_node *alu = gpir_node_to_alu(node);
         gpir_node *origin = alu->children[0];
         gpir_node *dummy_f = alu->children[1];

         gpir_node_foreach_succ(node, dep) {
            gpir_node *succ = dep->succ;
            /* origin and node may have the same succ (by VREG/INPUT or
             * VREG/VREG dep), so use gpir_node_add_dep() instead of
             * gpir_node_replace_pred() */
            gpir_node_add_dep(succ, origin, dep->type);
            gpir_node_replace_child(succ, node, origin);
         }
         gpir_node_delete(dummy_f);
         gpir_node_delete(node);
      }
   }
}

static void schedule_build_preg_dependency(gpir_compiler *comp)
{
   /* merge regs with the same index */
   gpir_reg *regs[GPIR_VALUE_REG_NUM] = {0};
   list_for_each_entry(gpir_reg, reg, &comp->reg_list, list) {
      if (!regs[reg->index])
         regs[reg->index] = reg;
      else {
         list_splicetail(&reg->defs_list, &regs[reg->index]->defs_list);
         list_splicetail(&reg->uses_list, &regs[reg->index]->uses_list);
      }
   }

   /* calculate physical reg read/write dependencies for load/store nodes */
   for (int i = 0; i < GPIR_VALUE_REG_NUM; i++) {
      gpir_reg *reg = regs[i];
      if (!reg)
         continue;

      /* sort reg writes */
      struct list_head tmp_list;
      list_replace(&reg->defs_list, &tmp_list);
      list_inithead(&reg->defs_list);
      list_for_each_entry_safe(gpir_store_node, store, &tmp_list, reg_link) {
         struct list_head *insert_pos = &reg->defs_list;
         list_for_each_entry(gpir_store_node, st, &reg->defs_list, reg_link) {
            if (st->node.sched.index > store->node.sched.index) {
               insert_pos = &st->reg_link;
               break;
            }
         }
         list_del(&store->reg_link);
         list_addtail(&store->reg_link, insert_pos);
      }

      /* sort reg reads */
      list_replace(&reg->uses_list, &tmp_list);
      list_inithead(&reg->uses_list);
      list_for_each_entry_safe(gpir_load_node, load, &tmp_list, reg_link) {
         struct list_head *insert_pos = &reg->uses_list;
         list_for_each_entry(gpir_load_node, ld, &reg->uses_list, reg_link) {
            if (ld->node.sched.index > load->node.sched.index) {
               insert_pos = &ld->reg_link;
               break;
            }
         }
         list_del(&load->reg_link);
         list_addtail(&load->reg_link, insert_pos);
      }

      /* insert dependencies */
      gpir_store_node *store =
         list_first_entry(&reg->defs_list, gpir_store_node, reg_link);
      gpir_store_node *next = store->reg_link.next != &reg->defs_list ?
         list_first_entry(&store->reg_link, gpir_store_node, reg_link) : NULL;

      list_for_each_entry(gpir_load_node, load, &reg->uses_list, reg_link) {
         /* advance until load is between store and next */
         while (next && next->node.sched.index < load->node.sched.index) {
            store = next;
            next = store->reg_link.next != &reg->defs_list ?
               list_first_entry(&store->reg_link, gpir_store_node, reg_link) : NULL;
         }

         gpir_node_add_dep(&load->node, &store->node, GPIR_DEP_READ_AFTER_WRITE);
         if (next)
            gpir_node_add_dep(&next->node, &load->node, GPIR_DEP_WRITE_AFTER_READ);
      }
   }
}
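
/* Example of the dependency insertion above (the indices are
 * illustrative): with stores of a reg at sched.index 2 and 7 and a load
 * at index 5, the load gets a read-after-write dep on the store at 2
 * and the store at 7 gets a write-after-read dep on the load, so the
 * scheduler keeps the load between the two stores.
 */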

static void print_statistic(gpir_compiler *comp, int save_index)
{
   int num_nodes[gpir_op_num] = {0};
   int num_created_nodes[gpir_op_num] = {0};

   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      list_for_each_entry(gpir_node, node, &block->node_list, list) {
         num_nodes[node->op]++;
         if (node->index >= save_index)
            num_created_nodes[node->op]++;
      }
   }

   printf("====== gpir scheduler statistic ======\n");
   printf("---- how many nodes are scheduled ----\n");
   int n = 0, l = 0;
   for (int i = 0; i < gpir_op_num; i++) {
      if (num_nodes[i]) {
         printf("%10s:%-6d", gpir_op_infos[i].name, num_nodes[i]);
         n += num_nodes[i];
         if (!(++l % 4))
            printf("\n");
      }
   }
   if (l % 4)
      printf("\n");
   printf("\ntotal: %d\n", n);

   printf("---- how many nodes are created ----\n");
   n = l = 0;
   for (int i = 0; i < gpir_op_num; i++) {
      if (num_created_nodes[i]) {
         printf("%10s:%-6d", gpir_op_infos[i].name, num_created_nodes[i]);
         n += num_created_nodes[i];
         if (!(++l % 4))
            printf("\n");
      }
   }
   if (l % 4)
      printf("\n");
   printf("\ntotal: %d\n", n);

   printf("------------------------------------\n");
}

bool gpir_schedule_prog(gpir_compiler *comp)
{
   int save_index = comp->cur_index;

   /* init schedule info */
   int index = 0;
   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      block->sched.instr_index = 0;
      list_for_each_entry(gpir_node, node, &block->node_list, list) {
         node->sched.instr = -1;
         node->sched.pos = -1;
         node->sched.index = index++;
         node->sched.dist = -1;
         node->sched.ready = false;
         node->sched.inserted = false;
      }
   }

   /* build fake/virtual dependencies */
   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      schedule_build_vreg_dependency(block);
   }
   schedule_build_preg_dependency(comp);

   //gpir_debug("after scheduler build reg dependency\n");
   //gpir_node_print_prog_dep(comp);

   list_for_each_entry(gpir_block, block, &comp->block_list, list) {
      if (!schedule_block(block)) {
         gpir_error("fail to schedule block\n");
         return false;
      }
   }

   if (lima_debug & LIMA_DEBUG_GP) {
      print_statistic(comp, save_index);
      gpir_instr_print_prog(comp);
   }

   return true;
}