Marlin_FB4S/Marlin/src/HAL/shared/Delay.h


								/**

								 * Marlin 3D Printer Firmware

								 * Copyright (c) 2020 MarlinFirmware [https://github.com/MarlinFirmware/Marlin]

								 *

								 * Based on Sprinter and grbl.

								 * Copyright (c) 2011 Camiel Gubbels / Erik van der Zalm

								 *

								 * This program is free software: you can redistribute it and/or modify

								 * it under the terms of the GNU General Public License as published by

								 * the Free Software Foundation, either version 3 of the License, or

								 * (at your option) any later version.

								 *

								 * This program is distributed in the hope that it will be useful,

								 * but WITHOUT ANY WARRANTY; without even the implied warranty of

								 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

								 * GNU General Public License for more details.

								 *

								 * You should have received a copy of the GNU General Public License

								 * along with this program.  If not, see <https://www.gnu.org/licenses/>.

								 *

								 */

								#pragma once


								#include "../../inc/MarlinConfigPre.h"


								/**

								 * Busy wait delay cycles routines:

								 *

								 *  DELAY_CYCLES(count): Delay execution in cycles

								 *  DELAY_NS(count): Delay execution in nanoseconds

								 *  DELAY_US(count): Delay execution in microseconds

								 */


								#include "../../core/macros.h"


								void calibrate_delay_loop();


								#if defined(__arm__) || defined(__thumb__)


								  // We want to have delay_cycle function with the lowest possible overhead, so we adjust at the function at runtime based on the current CPU best feature

								  typedef void (*DelayImpl)(uint32_t);

								  extern DelayImpl DelayCycleFnc;


								  // I've measured 36 cycles on my system to call the cycle waiting method, but it shouldn't change much to have a bit more margin, it only consume a bit more flash

								  #define TRIP_POINT_FOR_CALLING_FUNCTION   40


								  // A simple recursive template class that output exactly one 'nop' of code per recursion

								  template <int N> struct NopWriter {

								    FORCE_INLINE static void build() {

								      __asm__ __volatile__("nop");

								      NopWriter<N-1>::build();

								    }

								  };

								  // End the loop

								  template <> struct NopWriter<0> { FORCE_INLINE static void build() {} };


								  namespace Private {

								    // Split recursing template in 2 different class so we don't reach the maximum template instantiation depth limit

								    template <bool belowTP, int N> struct Helper {

								      FORCE_INLINE static void build() {

								        DelayCycleFnc(N - 2); //  Approximative cost of calling the function (might be off by one or 2 cycles)

								      }

								    };


								    template <int N> struct Helper<true, N> {

								      FORCE_INLINE static void build() {

								        NopWriter<N - 1>::build();

								      }

								    };


								    template <> struct Helper<true, 0> {

								      FORCE_INLINE static void build() {}

								    };


								  }

								  // Select a behavior based on the constexpr'ness of the parameter

								  // If called with a compile-time parameter, then write as many NOP as required to reach the asked cycle count

								  // (there is some tripping point here to start looping when it's more profitable than gruntly executing NOPs)

								  // If not called from a compile-time parameter, fallback to a runtime loop counting version instead

								  template <bool compileTime, int Cycles>

								  struct SmartDelay {

								    FORCE_INLINE SmartDelay(int) {

								      if (Cycles == 0) return;

								      Private::Helper<Cycles < TRIP_POINT_FOR_CALLING_FUNCTION, Cycles>::build();

								    }

								  };

								  // Runtime version below. There is no way this would run under less than ~TRIP_POINT_FOR_CALLING_FUNCTION cycles

								  template <int T>

								  struct SmartDelay<false, T> {

								    FORCE_INLINE SmartDelay(int v) { DelayCycleFnc(v); }

								  };


								  #define DELAY_CYCLES(X) do { SmartDelay<IS_CONSTEXPR(X), IS_CONSTEXPR(X) ? X : 0> _smrtdly_X(X); } while(0)


								  #if GCC_VERSION <= 70000

								    #define DELAY_CYCLES_VAR(X) DelayCycleFnc(X)

								  #else

								    #define DELAY_CYCLES_VAR DELAY_CYCLES

								  #endif


								  // For delay in microseconds, no smart delay selection is required, directly call the delay function

								  // Teensy compiler is too old and does not accept smart delay compile-time / run-time selection correctly

								  #define DELAY_US(x) DelayCycleFnc((x) * ((F_CPU) / 1000000UL))


								#elif defined(__AVR__)

								  FORCE_INLINE static void __delay_up_to_3c(uint8_t cycles) {

								    switch (cycles) {

								      case 3:

								        __asm__ __volatile__(A("RJMP .+0") A("NOP"));

								        break;

								      case 2:

								        __asm__ __volatile__(A("RJMP .+0"));

								        break;

								      case 1:

								        __asm__ __volatile__(A("NOP"));

								        break;

								    }

								  }


								  // Delay in cycles

								  FORCE_INLINE static void DELAY_CYCLES(uint16_t cycles) {

								    if (__builtin_constant_p(cycles)) {

								      if (cycles <= 3) {

								        __delay_up_to_3c(cycles);

								      }

								      else if (cycles == 4) {

								        __delay_up_to_3c(2);

								        __delay_up_to_3c(2);

								      }

								      else {

								        cycles -= 1 + 4; // Compensate for the first LDI (1) and the first round (4)

								        __delay_up_to_3c(cycles % 4);


								        cycles /= 4;

								        // The following code burns [1 + 4 * (rounds+1)] cycles

								        uint16_t dummy;

								        __asm__ __volatile__(

								          // "manually" load counter from constants, otherwise the compiler may optimize this part away

								          A("LDI %A[rounds], %[l]") // 1c

								          A("LDI %B[rounds], %[h]") // 1c (compensating the non branching BRCC)

								          L("1")

								          A("SBIW %[rounds], 1")    // 2c

								          A("BRCC 1b")              // 2c when branching, else 1c (end of loop)

								          : // Outputs ...

								          [rounds] "=w" (dummy) // Restrict to a wo (=) 16 bit register pair (w)

								          : // Inputs ...

								          [l] "M" (cycles%256), // Restrict to 0..255 constant (M)

								          [h] "M" (cycles/256)  // Restrict to 0..255 constant (M)

								          :// Clobbers ...

								          "cc"                  // Indicate we are modifying flags like Carry (cc)

								        );

								      }

								    }

								    else {

								      __asm__ __volatile__(

								        L("1")

								        A("SBIW %[cycles], 4")   // 2c

								        A("BRCC 1b")             // 2c when branching, else 1c (end of loop)

								        : [cycles] "+w" (cycles) // output: Restrict to a rw (+) 16 bit register pair (w)

								        :                        // input: -

								        : "cc"                   // clobbers: We are modifying flags like Carry (cc)

								      );

								    }

								  }


								  // Delay in microseconds

								  #define DELAY_US(x) DELAY_CYCLES((x) * ((F_CPU) / 1000000UL))


								#elif defined(ESP32) || defined(__PLAT_LINUX__) || defined(__PLAT_NATIVE_SIM__)


								  // DELAY_CYCLES specified inside platform


								  // Delay in microseconds

								  #define DELAY_US(x) DELAY_CYCLES((x) * ((F_CPU) / 1000000UL))

								#else


								  #error "Unsupported MCU architecture"


								#endif


								/**************************************************************

								 *  Delay in nanoseconds. Requires the F_CPU macro.

								 *  These macros follow avr-libc delay conventions.

								 *

								 * For AVR there are three possible operation modes, due to its

								 * slower clock speeds and thus coarser delay resolution. For

								 * example, when F_CPU = 16000000 the resolution is 62.5ns.

								 *

								 *  Round up (default)

								 *    Round up the delay according to the CPU clock resolution.

								 *    e.g., 100 will give a delay of 2 cycles (125ns).

								 *

								 *  Round down (DELAY_NS_ROUND_DOWN)

								 *    Round down the delay according to the CPU clock resolution.

								 *    e.g., 100 will be rounded down to 1 cycle (62.5ns).

								 *

								 *  Nearest (DELAY_NS_ROUND_CLOSEST)

								 *    Round the delay to the nearest number of clock cycles.

								 *    e.g., 165 will be rounded up to 3 cycles (187.5ns) because

								 *          it's closer to the requested delay than 2 cycle (125ns).

								 */


								#ifndef __AVR__

								  #undef DELAY_NS_ROUND_DOWN

								  #undef DELAY_NS_ROUND_CLOSEST

								#endif


								#if ENABLED(DELAY_NS_ROUND_DOWN)

								  #define _NS_TO_CYCLES(x) ( (x) * ((F_CPU) / 1000000UL)        / 1000UL) // floor

								#elif ENABLED(DELAY_NS_ROUND_CLOSEST)

								  #define _NS_TO_CYCLES(x) (((x) * ((F_CPU) / 1000000UL) + 500) / 1000UL) // round

								#else

								  #define _NS_TO_CYCLES(x) (((x) * ((F_CPU) / 1000000UL) + 999) / 1000UL) // "ceil"

								#endif


								#define DELAY_NS(x)         DELAY_CYCLES(_NS_TO_CYCLES(x))

								#define DELAY_NS_VAR(x) DELAY_CYCLES_VAR(_NS_TO_CYCLES(x))