From 7ae099f2be7e8a54e50b7e34ee5f3a5ad4343ea9 Mon Sep 17 00:00:00 2001 From: Skruppy Date: Fri, 9 Jul 2021 04:55:34 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20Fix=20AVR=20DELAY=5FUS=20int=20o?= =?UTF-8?q?verflow=20(#22268)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Marlin/src/HAL/shared/Delay.h | 80 ++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 29 deletions(-) diff --git a/Marlin/src/HAL/shared/Delay.h b/Marlin/src/HAL/shared/Delay.h index f7e01ad25c..3174968c1b 100644 --- a/Marlin/src/HAL/shared/Delay.h +++ b/Marlin/src/HAL/shared/Delay.h @@ -97,43 +97,65 @@ void calibrate_delay_loop(); #define DELAY_US(x) DelayCycleFnc((x) * ((F_CPU) / 1000000UL)) #elif defined(__AVR__) - - #define nop() __asm__ __volatile__("nop;\n\t":::) - - FORCE_INLINE static void __delay_4cycles(uint8_t cy) { - __asm__ __volatile__( - L("1") - A("dec %[cnt]") - A("nop") - A("brne 1b") - : [cnt] "+r"(cy) // output: +r means input+output - : // input: - : "cc" // clobbers: - ); + FORCE_INLINE static void __delay_up_to_3c(uint8_t cycles) { + switch (cycles) { + case 3: + __asm__ __volatile__(A("RJMP .+0") A("NOP")); + break; + case 2: + __asm__ __volatile__(A("RJMP .+0")); + break; + case 1: + __asm__ __volatile__(A("NOP")); + break; + } } // Delay in cycles - FORCE_INLINE static void DELAY_CYCLES(uint16_t x) { - - if (__builtin_constant_p(x)) { - #define MAXNOPS 4 - - if (x <= (MAXNOPS)) { - switch (x) { case 4: nop(); case 3: nop(); case 2: nop(); case 1: nop(); } + FORCE_INLINE static void DELAY_CYCLES(uint16_t cycles) { + if (__builtin_constant_p(cycles)) { + if (cycles <= 3) { + __delay_up_to_3c(cycles); + } + else if (cycles == 4) { + __delay_up_to_3c(2); + __delay_up_to_3c(2); } else { - const uint32_t rem = (x) % (MAXNOPS); - switch (rem) { case 3: nop(); case 2: nop(); case 1: nop(); } - if ((x = (x) / (MAXNOPS))) - __delay_4cycles(x); // if need more then 4 nop loop is more optimal + cycles -= 1 + 4; // Compensate for the first LDI (1) and the first round (4) + __delay_up_to_3c(cycles % 4); + + cycles /= 4; + // The following code burns [1 + 4 * (rounds+1)] cycles + uint16_t dummy; + __asm__ __volatile__( + // "manually" load counter from constants, otherwise the compiler may optimize this part away + A("LDI %A[rounds], %[l]") // 1c + A("LDI %B[rounds], %[h]") // 1c (compensating the non branching BRCC) + L("1") + A("SBIW %[rounds], 1") // 2c + A("BRCC 1b") // 2c when branching, else 1c (end of loop) + : // Outputs ... + [rounds] "=w" (dummy) // Restrict to a wo (=) 16 bit register pair (w) + : // Inputs ... + [l] "M" (cycles%256), // Restrict to 0..255 constant (M) + [h] "M" (cycles/256) // Restrict to 0..255 constant (M) + :// Clobbers ... + "cc" // Indicate we are modifying flags like Carry (cc) + ); } - - #undef MAXNOPS } - else if ((x >>= 2)) - __delay_4cycles(x); + else { + __asm__ __volatile__( + L("1") + A("SBIW %[cycles], 4") // 2c + A("BRCC 1b") // 2c when branching, else 1c (end of loop) + : [cycles] "+w" (cycles) // output: Restrict to a rw (+) 16 bit register pair (w) + : // input: - + : "cc" // clobbers: We are modifying flags like Carry (cc) + ); + } } - #undef nop // Delay in microseconds #define DELAY_US(x) DELAY_CYCLES((x) * ((F_CPU) / 1000000UL))