filename | src/sh4/sh4x86.in |
changeset | 1112:4cac5e474d4c |
prev | 1092:7c4ffe27e7b5 |
next | 1120:7c40a0f687b3 |
author | nkeynes |
date | Tue Jul 13 18:23:16 2010 +1000 (13 years ago) |
permissions | -rw-r--r-- |
last change | Rearrange the main translation loop to allow translated blocks to jump directly to their successors without needing to return to the main loop in between. Shaves about 6% off the core runtime. |
file | annotate | diff | log | raw |
1.1 --- a/src/sh4/sh4x86.in Sun Dec 20 21:01:03 2009 +10001.2 +++ b/src/sh4/sh4x86.in Tue Jul 13 18:23:16 2010 +10001.3 @@ -71,6 +71,8 @@1.4 #define DELAY_PC 11.5 #define DELAY_PC_PR 21.7 +#define SH4_MODE_UNKNOWN -11.8 +1.9 struct backpatch_record {1.10 uint32_t fixup_offset;1.11 uint32_t fixup_icount;1.12 @@ -84,6 +86,7 @@1.13 */1.14 struct sh4_x86_state {1.15 int in_delay_slot;1.16 + uint8_t *code;1.17 gboolean fpuen_checked; /* true if we've already checked fpu enabled. */1.18 gboolean branch_taken; /* true if we branched unconditionally */1.19 gboolean double_prec; /* true if FPU is in double-precision mode */1.20 @@ -91,6 +94,7 @@1.21 gboolean sse3_enabled; /* true if host supports SSE3 instructions */1.22 uint32_t block_start_pc;1.23 uint32_t stack_posn; /* Trace stack height for alignment purposes */1.24 + uint32_t sh4_mode; /* Mirror of sh4r.xlat_sh4_mode */1.25 int tstate;1.27 /* mode flags */1.28 @@ -171,7 +175,7 @@1.29 fprintf( out, "%c%016lx: %-30s %-40s", (target_pc == (uintptr_t)native_pc ? '*' : ' '),1.30 target_pc, op, buf );1.31 #else1.32 - fprintf( out, "%c%08x: %-30s %-40s", (target_pc == (uintptr_t)native_pc ? '*' : ' '),1.33 + fprintf( out, "%c%08lx: %-30s %-40s", (target_pc == (uintptr_t)native_pc ? '*' : ' '),1.34 target_pc, op, buf );1.35 #endif1.36 if( source_recov_table < source_recov_end &&1.37 @@ -249,6 +253,7 @@1.38 #define SETNE_t() SETCCB_cc_rbpdisp(X86_COND_NE,R_T)1.39 #define SETC_r8(r1) SETCCB_cc_r8(X86_COND_C, r1)1.40 #define JAE_label(label) JCC_cc_rel8(X86_COND_AE,-1); MARK_JMP8(label)1.41 +#define JBE_label(label) JCC_cc_rel8(X86_COND_BE,-1); MARK_JMP8(label)1.42 #define JE_label(label) JCC_cc_rel8(X86_COND_E,-1); MARK_JMP8(label)1.43 #define JGE_label(label) JCC_cc_rel8(X86_COND_GE,-1); MARK_JMP8(label)1.44 #define JNA_label(label) JCC_cc_rel8(X86_COND_NA,-1); MARK_JMP8(label)1.45 @@ -317,7 +322,7 @@1.46 /* Exception checks - Note that all exception checks will clobber EAX */1.48 #define check_priv( ) \1.49 - if( (sh4r.xlat_sh4_mode & SR_MD) == 0 ) { \1.50 + if( (sh4_x86.sh4_mode & SR_MD) == 0 ) { \1.51 if( sh4_x86.in_delay_slot ) { \1.52 exit_block_exc(EXC_SLOT_ILLEGAL, (pc-2) ); \1.53 } else { \1.54 @@ -365,7 +370,7 @@1.55 TESTL_imms_r32( 0x00000007, x86reg ); \1.56 JNE_exc(EXC_DATA_ADDR_WRITE);1.58 -#define address_space() ((sh4r.xlat_sh4_mode&SR_MD) ? (uintptr_t)sh4_address_space : (uintptr_t)sh4_user_address_space)1.59 +#define address_space() ((sh4_x86.sh4_mode&SR_MD) ? (uintptr_t)sh4_address_space : (uintptr_t)sh4_user_address_space)1.61 #define UNDEF(ir)1.62 /* Note: For SR.MD == 1 && MMUCR.AT == 0, there are no memory exceptions, so1.63 @@ -375,7 +380,7 @@1.64 static void call_read_func(int addr_reg, int value_reg, int offset, int pc)1.65 {1.66 decode_address(address_space(), addr_reg);1.67 - if( !sh4_x86.tlb_on && (sh4r.xlat_sh4_mode & SR_MD) ) {1.68 + if( !sh4_x86.tlb_on && (sh4_x86.sh4_mode & SR_MD) ) {1.69 CALL1_r32disp_r32(REG_ECX, offset, addr_reg);1.70 } else {1.71 if( addr_reg != REG_ARG1 ) {1.72 @@ -393,7 +398,7 @@1.73 static void call_write_func(int addr_reg, int value_reg, int offset, int pc)1.74 {1.75 decode_address(address_space(), addr_reg);1.76 - if( !sh4_x86.tlb_on && (sh4r.xlat_sh4_mode & SR_MD) ) {1.77 + if( !sh4_x86.tlb_on && (sh4_x86.sh4_mode & SR_MD) ) {1.78 CALL2_r32disp_r32_r32(REG_ECX, offset, addr_reg, value_reg);1.79 } else {1.80 if( value_reg != REG_ARG2 ) {1.81 @@ -444,8 +449,7 @@1.83 void sh4_translate_begin_block( sh4addr_t pc )1.84 {1.85 - enter_block();1.86 - MOVP_immptr_rptr( ((uint8_t *)&sh4r) + 128, REG_EBP );1.87 + sh4_x86.code = xlat_output;1.88 sh4_x86.in_delay_slot = FALSE;1.89 sh4_x86.fpuen_checked = FALSE;1.90 sh4_x86.branch_taken = FALSE;1.91 @@ -455,6 +459,8 @@1.92 sh4_x86.tstate = TSTATE_NONE;1.93 sh4_x86.double_prec = sh4r.fpscr & FPSCR_PR;1.94 sh4_x86.double_size = sh4r.fpscr & FPSCR_SZ;1.95 + sh4_x86.sh4_mode = sh4r.xlat_sh4_mode;1.96 + enter_block();1.97 }1.100 @@ -481,19 +487,49 @@1.102 #define UNTRANSLATABLE(pc) !IS_IN_ICACHE(pc)1.104 +/** Offset of xlat_sh4_mode field relative to the code pointer */1.105 +#define XLAT_SH4_MODE_CODE_OFFSET (uint32_t)(offsetof(struct xlat_cache_block, xlat_sh4_mode) - offsetof(struct xlat_cache_block,code) )1.106 +1.107 +/**1.108 + * Test if the loaded target code pointer in %eax is valid, and if so jump1.109 + * directly into it, bypassing the normal exit.1.110 + */1.111 +static void jump_next_block()1.112 +{1.113 + TESTP_rptr_rptr(REG_EAX, REG_EAX);1.114 + JE_label(nocode);1.115 + if( sh4_x86.sh4_mode == SH4_MODE_UNKNOWN ) {1.116 + /* sr/fpscr was changed, possibly updated xlat_sh4_mode, so reload it */1.117 + MOVL_rbpdisp_r32( REG_OFFSET(xlat_sh4_mode), REG_ECX );1.118 + CMPL_r32_r32disp( REG_ECX, REG_EAX, XLAT_SH4_MODE_CODE_OFFSET );1.119 + } else {1.120 + CMPL_imms_r32disp( sh4_x86.sh4_mode, REG_EAX, XLAT_SH4_MODE_CODE_OFFSET );1.121 + }1.122 + JNE_label(wrongmode);1.123 + LEAP_rptrdisp_rptr(REG_EAX, PROLOGUE_SIZE,REG_EAX);1.124 + JMP_rptr(REG_EAX);1.125 + JMP_TARGET(nocode); JMP_TARGET(wrongmode);1.126 +}1.127 +1.128 /**1.129 * Exit the block with sh4r.pc already written1.130 */1.131 void exit_block_pcset( sh4addr_t pc )1.132 {1.133 MOVL_imm32_r32( ((pc - sh4_x86.block_start_pc)>>1)*sh4_cpu_period, REG_ECX );1.134 - ADDL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );1.135 + ADDL_rbpdisp_r32( REG_OFFSET(slice_cycle), REG_ECX );1.136 + MOVL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );1.137 + CMPL_r32_rbpdisp( REG_ECX, REG_OFFSET(event_pending) );1.138 + JBE_label(exitloop);1.139 MOVL_rbpdisp_r32( R_PC, REG_ARG1 );1.140 if( sh4_x86.tlb_on ) {1.141 CALL1_ptr_r32(xlat_get_code_by_vma,REG_ARG1);1.142 } else {1.143 CALL1_ptr_r32(xlat_get_code,REG_ARG1);1.144 }1.145 +1.146 + jump_next_block();1.147 + JMP_TARGET(exitloop);1.148 exit_block();1.149 }1.151 @@ -503,14 +539,20 @@1.152 void exit_block_newpcset( sh4addr_t pc )1.153 {1.154 MOVL_imm32_r32( ((pc - sh4_x86.block_start_pc)>>1)*sh4_cpu_period, REG_ECX );1.155 - ADDL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );1.156 + ADDL_rbpdisp_r32( REG_OFFSET(slice_cycle), REG_ECX );1.157 + MOVL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );1.158 MOVL_rbpdisp_r32( R_NEW_PC, REG_ARG1 );1.159 MOVL_r32_rbpdisp( REG_ARG1, R_PC );1.160 + CMPL_r32_rbpdisp( REG_ECX, REG_OFFSET(event_pending) );1.161 + JBE_label(exitloop);1.162 if( sh4_x86.tlb_on ) {1.163 CALL1_ptr_r32(xlat_get_code_by_vma,REG_ARG1);1.164 } else {1.165 CALL1_ptr_r32(xlat_get_code,REG_ARG1);1.166 }1.167 +1.168 + jump_next_block();1.169 + JMP_TARGET(exitloop);1.170 exit_block();1.171 }1.173 @@ -520,18 +562,25 @@1.174 */1.175 void exit_block_abs( sh4addr_t pc, sh4addr_t endpc )1.176 {1.177 - MOVL_imm32_r32( pc, REG_ECX );1.178 - MOVL_r32_rbpdisp( REG_ECX, R_PC );1.179 + MOVL_imm32_r32( ((endpc - sh4_x86.block_start_pc)>>1)*sh4_cpu_period, REG_ECX );1.180 + ADDL_rbpdisp_r32( REG_OFFSET(slice_cycle), REG_ECX );1.181 + MOVL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );1.182 +1.183 + MOVL_imm32_r32( pc, REG_ARG1 );1.184 + MOVL_r32_rbpdisp( REG_ARG1, R_PC );1.185 + CMPL_r32_rbpdisp( REG_ECX, REG_OFFSET(event_pending) );1.186 + JBE_label(exitloop);1.187 +1.188 if( IS_IN_ICACHE(pc) ) {1.189 MOVP_moffptr_rax( xlat_get_lut_entry(GET_ICACHE_PHYS(pc)) );1.190 ANDP_imms_rptr( -4, REG_EAX );1.191 } else if( sh4_x86.tlb_on ) {1.192 - CALL1_ptr_r32(xlat_get_code_by_vma, REG_ECX);1.193 + CALL1_ptr_r32(xlat_get_code_by_vma, REG_ARG1);1.194 } else {1.195 - CALL1_ptr_r32(xlat_get_code, REG_ECX);1.196 + CALL1_ptr_r32(xlat_get_code, REG_ARG1);1.197 }1.198 - MOVL_imm32_r32( ((endpc - sh4_x86.block_start_pc)>>1)*sh4_cpu_period, REG_ECX );1.199 - ADDL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );1.200 + jump_next_block();1.201 + JMP_TARGET(exitloop);1.202 exit_block();1.203 }1.205 @@ -540,19 +589,36 @@1.206 */1.207 void exit_block_rel( sh4addr_t pc, sh4addr_t endpc )1.208 {1.209 - MOVL_imm32_r32( pc - sh4_x86.block_start_pc, REG_ECX );1.210 - ADDL_rbpdisp_r32( R_PC, REG_ECX );1.211 - MOVL_r32_rbpdisp( REG_ECX, R_PC );1.212 - if( IS_IN_ICACHE(pc) ) {1.213 - MOVP_moffptr_rax( xlat_get_lut_entry(GET_ICACHE_PHYS(pc)) );1.214 - ANDP_imms_rptr( -4, REG_EAX );1.215 - } else if( sh4_x86.tlb_on ) {1.216 - CALL1_ptr_r32(xlat_get_code_by_vma, REG_ECX);1.217 - } else {1.218 - CALL1_ptr_r32(xlat_get_code, REG_ECX);1.219 + MOVL_imm32_r32( ((endpc - sh4_x86.block_start_pc)>>1)*sh4_cpu_period, REG_ECX );1.220 + ADDL_rbpdisp_r32( REG_OFFSET(slice_cycle), REG_ECX );1.221 + MOVL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );1.222 +1.223 + if( pc == sh4_x86.block_start_pc && sh4_x86.sh4_mode == sh4r.xlat_sh4_mode ) {1.224 + /* Special case for tight loops - the PC doesn't change, and1.225 + * we already know the target address. Just check events pending before1.226 + * looping.1.227 + */1.228 + CMPL_r32_rbpdisp( REG_ECX, REG_OFFSET(event_pending) );1.229 + uint32_t backdisp = ((uintptr_t)(sh4_x86.code - xlat_output)) + PROLOGUE_SIZE;1.230 + JCC_cc_prerel(X86_COND_A, backdisp);1.231 + } else {1.232 + MOVL_imm32_r32( pc - sh4_x86.block_start_pc, REG_ARG1 );1.233 + ADDL_rbpdisp_r32( R_PC, REG_ARG1 );1.234 + MOVL_r32_rbpdisp( REG_ARG1, R_PC );1.235 + CMPL_r32_rbpdisp( REG_ECX, REG_OFFSET(event_pending) );1.236 + JBE_label(exitloop2);1.237 +1.238 + if( IS_IN_ICACHE(pc) ) {1.239 + MOVP_moffptr_rax( xlat_get_lut_entry(GET_ICACHE_PHYS(pc)) );1.240 + ANDP_imms_rptr( -4, REG_EAX );1.241 + } else if( sh4_x86.tlb_on ) {1.242 + CALL1_ptr_r32(xlat_get_code_by_vma, REG_ARG1);1.243 + } else {1.244 + CALL1_ptr_r32(xlat_get_code, REG_ARG1);1.245 + }1.246 + jump_next_block();1.247 + JMP_TARGET(exitloop2);1.248 }1.249 - MOVL_imm32_r32( ((endpc - sh4_x86.block_start_pc)>>1)*sh4_cpu_period, REG_ECX );1.250 - ADDL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );1.251 exit_block();1.252 }1.254 @@ -567,13 +633,6 @@1.255 ADDL_r32_rbpdisp( REG_ECX, REG_OFFSET(slice_cycle) );1.256 MOVL_imm32_r32( code, REG_ARG1 );1.257 CALL1_ptr_r32( sh4_raise_exception, REG_ARG1 );1.258 - MOVL_rbpdisp_r32( R_PC, REG_ARG1 );1.259 - if( sh4_x86.tlb_on ) {1.260 - CALL1_ptr_r32(xlat_get_code_by_vma,REG_ARG1);1.261 - } else {1.262 - CALL1_ptr_r32(xlat_get_code,REG_ARG1);1.263 - }1.264 -1.265 exit_block();1.266 }1.268 @@ -599,13 +658,7 @@1.269 MOVL_imm32_r32( sh4_x86.in_delay_slot ? 1 : 0, REG_ECX );1.270 MOVL_r32_rbpdisp( REG_ECX, REG_OFFSET(in_delay_slot) );1.272 - CALL_ptr( sh4_execute_instruction );1.273 - MOVL_rbpdisp_r32( R_PC, REG_EAX );1.274 - if( sh4_x86.tlb_on ) {1.275 - CALL1_ptr_r32(xlat_get_code_by_vma,REG_EAX);1.276 - } else {1.277 - CALL1_ptr_r32(xlat_get_code,REG_EAX);1.278 - }1.279 + CALL_ptr( sh4_execute_instruction );1.280 exit_block();1.281 }1.283 @@ -627,12 +680,6 @@1.284 MOVL_moffptr_eax( &sh4_cpu_period );1.285 MULL_r32( REG_EDX );1.286 ADDL_r32_rbpdisp( REG_EAX, REG_OFFSET(slice_cycle) );1.287 - MOVL_rbpdisp_r32( R_PC, REG_ARG1 );1.288 - if( sh4_x86.tlb_on ) {1.289 - CALL1_ptr_r32(xlat_get_code_by_vma, REG_ARG1);1.290 - } else {1.291 - CALL1_ptr_r32(xlat_get_code, REG_ARG1);1.292 - }1.293 exit_block();1.295 for( i=0; i< sh4_x86.backpatch_posn; i++ ) {1.296 @@ -1975,6 +2022,7 @@1.297 sh4_x86.fpuen_checked = FALSE;1.298 sh4_x86.tstate = TSTATE_NONE;1.299 sh4_x86.branch_taken = TRUE;1.300 + sh4_x86.sh4_mode = SH4_MODE_UNKNOWN;1.301 if( UNTRANSLATABLE(pc+2) ) {1.302 exit_block_emu(pc+2);1.303 return 2;1.304 @@ -2539,6 +2587,7 @@1.305 XORL_imms_rbpdisp( FPSCR_SZ, REG_OFFSET(xlat_sh4_mode) );1.306 sh4_x86.tstate = TSTATE_NONE;1.307 sh4_x86.double_size = !sh4_x86.double_size;1.308 + sh4_x86.sh4_mode = sh4_x86.sh4_mode ^ FPSCR_SZ;1.309 :}1.311 /* Processor control instructions */1.312 @@ -2552,6 +2601,7 @@1.313 CALL1_ptr_r32( sh4_write_sr, REG_EAX );1.314 sh4_x86.fpuen_checked = FALSE;1.315 sh4_x86.tstate = TSTATE_NONE;1.316 + sh4_x86.sh4_mode = SH4_MODE_UNKNOWN;1.317 return 2;1.318 }1.319 :}1.320 @@ -2624,6 +2674,7 @@1.321 CALL1_ptr_r32( sh4_write_sr, REG_EAX );1.322 sh4_x86.fpuen_checked = FALSE;1.323 sh4_x86.tstate = TSTATE_NONE;1.324 + sh4_x86.sh4_mode = SH4_MODE_UNKNOWN;1.325 return 2;1.326 }1.327 :}1.328 @@ -2693,6 +2744,7 @@1.329 load_reg( REG_EAX, Rm );1.330 CALL1_ptr_r32( sh4_write_fpscr, REG_EAX );1.331 sh4_x86.tstate = TSTATE_NONE;1.332 + sh4_x86.sh4_mode = SH4_MODE_UNKNOWN;1.333 return 2;1.334 :}1.335 LDS.L @Rm+, FPSCR {:1.336 @@ -2704,6 +2756,7 @@1.337 ADDL_imms_rbpdisp( 4, REG_OFFSET(r[Rm]) );1.338 CALL1_ptr_r32( sh4_write_fpscr, REG_EAX );1.339 sh4_x86.tstate = TSTATE_NONE;1.340 + sh4_x86.sh4_mode = SH4_MODE_UNKNOWN;1.341 return 2;1.342 :}1.343 LDS Rm, FPUL {:
.