Browse Source

Fix ARM delay function (#20901)

vanilla_fb_2.0.x
X-Ryl669 3 years ago
committed by Scott Lahteine
parent
commit
77966135e8
  1. 8
      Marlin/src/HAL/DUE/HAL_SPI.cpp
  2. 1
      Marlin/src/HAL/DUE/dogm/u8g_com_HAL_DUE_st7920_sw_spi.cpp
  3. 1
      Marlin/src/HAL/DUE/dogm/u8g_com_HAL_DUE_sw_spi_shared.cpp
  4. 11
      Marlin/src/HAL/STM32/HAL.cpp
  5. 1
      Marlin/src/HAL/STM32/HAL.h
  6. 30
      Marlin/src/HAL/STM32/HAL_SPI.cpp
  7. 176
      Marlin/src/HAL/shared/Delay.cpp
  8. 126
      Marlin/src/HAL/shared/Delay.h
  9. 3
      Marlin/src/MarlinCore.cpp
  10. 2
      Marlin/src/core/macros.h
  11. 8
      Marlin/src/gcode/gcode_d.cpp

8
Marlin/src/HAL/DUE/HAL_SPI.cpp

@ -240,7 +240,7 @@
}
// all the others
static uint32_t spiDelayCyclesX4 = (F_CPU) / 1000000; // 4µs => 125khz
static uint32_t spiDelayCyclesX4 = 4 * (F_CPU) / 1000000; // 4µs => 125khz
static uint8_t spiTransferX(uint8_t b) { // using Mode 0
int bits = 8;
@ -249,12 +249,12 @@
b <<= 1; // little setup time
WRITE(SD_SCK_PIN, HIGH);
__delay_4cycles(spiDelayCyclesX4);
DELAY_CYCLES(spiDelayCyclesX4);
b |= (READ(SD_MISO_PIN) != 0);
WRITE(SD_SCK_PIN, LOW);
__delay_4cycles(spiDelayCyclesX4);
DELAY_CYCLES(spiDelayCyclesX4);
} while (--bits);
return b;
}
@ -510,7 +510,7 @@
spiRxBlock = (pfnSpiRxBlock)spiRxBlockX;
break;
default:
spiDelayCyclesX4 = ((F_CPU) / 1000000) >> (6 - spiRate);
spiDelayCyclesX4 = ((F_CPU) / 1000000) >> (6 - spiRate) << 2; // spiRate of 2 gives the maximum error with current CPU
spiTransferTx = (pfnSpiTransfer)spiTransferX;
spiTransferRx = (pfnSpiTransfer)spiTransferX;
spiTxBlock = (pfnSpiTxBlock)spiTxBlockX;

1
Marlin/src/HAL/DUE/dogm/u8g_com_HAL_DUE_st7920_sw_spi.cpp

@ -59,6 +59,7 @@
#if ENABLED(U8GLIB_ST7920)
#include "../../../inc/MarlinConfig.h"
#include "../../shared/Delay.h"
#include <U8glib.h>

1
Marlin/src/HAL/DUE/dogm/u8g_com_HAL_DUE_sw_spi_shared.cpp

@ -59,6 +59,7 @@
#if HAS_MARLINUI_U8GLIB
#include "../../../inc/MarlinConfig.h"
#include "../../shared/Delay.h"
#include <U8glib.h>

11
Marlin/src/HAL/STM32/HAL.cpp

@ -57,17 +57,6 @@ uint16_t HAL_adc_result;
// Public functions
// ------------------------
// Needed for DELAY_NS() / DELAY_US() on CORTEX-M7
#if (defined(__arm__) || defined(__thumb__)) && __CORTEX_M == 7
// HAL pre-initialization task
// Force the preinit function to run between the premain() and main() function
// of the STM32 arduino core
__attribute__((constructor (102)))
void HAL_preinit() {
enableCycleCounter();
}
#endif
// HAL initialization task
void HAL_init() {
FastIO_init();

1
Marlin/src/HAL/STM32/HAL.h

@ -194,6 +194,7 @@ void flashFirmware(const int16_t);
typedef void (*systickCallback_t)(void);
void systick_attach_callback(systickCallback_t cb);
void HAL_SYSTICK_Callback();
extern volatile uint32_t systick_uptime_millis;
#define HAL_CAN_SET_PWM_FREQ // This HAL supports PWM Frequency adjustment

30
Marlin/src/HAL/STM32/HAL_SPI.cpp

@ -51,18 +51,28 @@ static SPISettings spiConfig;
OUT_WRITE(SD_MOSI_PIN, HIGH);
}
static uint16_t delay_STM32_soft_spi;
// Use function with compile-time value so we can actually reach the desired frequency
// Need to adjust this a little bit: on a 72MHz clock, we have 14ns/clock
// and we'll use ~3 cycles to jump to the method and going back, so it'll take ~40ns from the given clock here
#define CALLING_COST_NS (3U * 1000000000U) / (F_CPU)
void (*delaySPIFunc)();
void delaySPI_125() { DELAY_NS(125 - CALLING_COST_NS); }
void delaySPI_250() { DELAY_NS(250 - CALLING_COST_NS); }
void delaySPI_500() { DELAY_NS(500 - CALLING_COST_NS); }
void delaySPI_1000() { DELAY_NS(1000 - CALLING_COST_NS); }
void delaySPI_2000() { DELAY_NS(2000 - CALLING_COST_NS); }
void delaySPI_4000() { DELAY_NS(4000 - CALLING_COST_NS); }
void spiInit(uint8_t spiRate) {
// Use datarates Marlin uses
switch (spiRate) {
case SPI_FULL_SPEED: delay_STM32_soft_spi = 125; break; // desired: 8,000,000 actual: ~1.1M
case SPI_HALF_SPEED: delay_STM32_soft_spi = 125; break; // desired: 4,000,000 actual: ~1.1M
case SPI_QUARTER_SPEED:delay_STM32_soft_spi = 250; break; // desired: 2,000,000 actual: ~890K
case SPI_EIGHTH_SPEED: delay_STM32_soft_spi = 500; break; // desired: 1,000,000 actual: ~590K
case SPI_SPEED_5: delay_STM32_soft_spi = 1000; break; // desired: 500,000 actual: ~360K
case SPI_SPEED_6: delay_STM32_soft_spi = 2000; break; // desired: 250,000 actual: ~210K
default: delay_STM32_soft_spi = 4000; break; // desired: 125,000 actual: ~123K
case SPI_FULL_SPEED: delaySPIFunc = &delaySPI_125; break; // desired: 8,000,000 actual: ~1.1M
case SPI_HALF_SPEED: delaySPIFunc = &delaySPI_125; break; // desired: 4,000,000 actual: ~1.1M
case SPI_QUARTER_SPEED:delaySPIFunc = &delaySPI_250; break; // desired: 2,000,000 actual: ~890K
case SPI_EIGHTH_SPEED: delaySPIFunc = &delaySPI_500; break; // desired: 1,000,000 actual: ~590K
case SPI_SPEED_5: delaySPIFunc = &delaySPI_1000; break; // desired: 500,000 actual: ~360K
case SPI_SPEED_6: delaySPIFunc = &delaySPI_2000; break; // desired: 250,000 actual: ~210K
default: delaySPIFunc = &delaySPI_4000; break; // desired: 125,000 actual: ~123K
}
SPI.begin();
}
@ -75,9 +85,9 @@ static SPISettings spiConfig;
WRITE(SD_SCK_PIN, LOW);
WRITE(SD_MOSI_PIN, b & 0x80);
DELAY_NS(delay_STM32_soft_spi);
delaySPIFunc();
WRITE(SD_SCK_PIN, HIGH);
DELAY_NS(delay_STM32_soft_spi);
delaySPIFunc();
b <<= 1; // little setup time
b |= (READ(SD_MISO_PIN) != 0);

176
Marlin/src/HAL/shared/Delay.cpp

@ -0,0 +1,176 @@
/**
* Marlin 3D Printer Firmware
* Copyright (c) 2020 MarlinFirmware [https://github.com/MarlinFirmware/Marlin]
*
* Based on Sprinter and grbl.
* Copyright (c) 2011 Camiel Gubbels / Erik van der Zalm
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*/
#include "Delay.h"
#include "../../inc/MarlinConfig.h"
#if defined(__arm__) || defined(__thumb__)
static uint32_t ASM_CYCLES_PER_ITERATION = 4; // Initial bet which will be adjusted in calibrate_delay_loop
// Simple assembler loop counting down
void delay_asm(uint32_t cy) {
cy = _MAX(cy / ASM_CYCLES_PER_ITERATION, 1U); // Zero is forbidden here
__asm__ __volatile__(
A(".syntax unified") // is to prevent CM0,CM1 non-unified syntax
L("1")
A("subs %[cnt],#1")
A("bne 1b")
: [cnt]"+r"(cy) // output: +r means input+output
: // input:
: "cc" // clobbers:
);
}
// We can't use CMSIS since it's not available on all platform, so fallback to hardcoded register values
#define HW_REG(X) *(volatile uint32_t *)(X)
#define _DWT_CTRL 0xE0001000
#define _DWT_CYCCNT 0xE0001004 // CYCCNT is 32bits, takes 37s or so to wrap.
#define _DEM_CR 0xE000EDFC
#define _LAR 0xE0001FB0
// Use hardware cycle counter instead, it's much safer
void delay_dwt(uint32_t count) {
// Reuse the ASM_CYCLES_PER_ITERATION variable to avoid wasting another useless variable
register uint32_t start = HW_REG(_DWT_CYCCNT) - ASM_CYCLES_PER_ITERATION, elapsed;
do {
elapsed = HW_REG(_DWT_CYCCNT) - start;
} while (elapsed < count);
}
// Pointer to asm function, calling the functions has a ~20 cycles overhead
DelayImpl DelayCycleFnc = delay_asm;
void calibrate_delay_loop() {
// Check if we have a working DWT implementation in the CPU (see https://developer.arm.com/documentation/ddi0439/b/Data-Watchpoint-and-Trace-Unit/DWT-Programmers-Model)
if (!HW_REG(_DWT_CTRL)) {
// No DWT present, so fallback to plain old ASM nop counting
// Unfortunately, we don't exactly know how many iteration it'll take to decrement a counter in a loop
// It depends on the CPU architecture, the code current position (flash vs SRAM)
// So, instead of wild guessing and making mistake, instead
// compute it once for all
ASM_CYCLES_PER_ITERATION = 1;
// We need to fetch some reference clock before waiting
cli();
uint32_t start = micros();
delay_asm(1000); // On a typical CPU running in MHz, waiting 1000 "unknown cycles" means it'll take between 1ms to 6ms, that's perfectly acceptable
uint32_t end = micros();
sei();
uint32_t expectedCycles = (end - start) * ((F_CPU) / 1000000UL); // Convert microseconds to cycles
// Finally compute the right scale
ASM_CYCLES_PER_ITERATION = (uint32_t)(expectedCycles / 1000);
// No DWT present, likely a Cortex M0 so NOP counting is our best bet here
DelayCycleFnc = delay_asm;
}
else {
// Enable DWT counter
// From https://stackoverflow.com/a/41188674/1469714
HW_REG(_DEM_CR) = HW_REG(_DEM_CR) | 0x01000000; // Enable trace
#if __CORTEX_M == 7
HW_REG(_LAR) = 0xC5ACCE55; // Unlock access to DWT registers, see https://developer.arm.com/documentation/ihi0029/e/ section B2.3.10
#endif
HW_REG(_DWT_CYCCNT) = 0; // Clear DWT cycle counter
HW_REG(_DWT_CTRL) = HW_REG(_DWT_CTRL) | 1; // Enable DWT cycle counter
// Then calibrate the constant offset from the counter
ASM_CYCLES_PER_ITERATION = 0;
uint32_t s = HW_REG(_DWT_CYCCNT);
uint32_t e = HW_REG(_DWT_CYCCNT); // (e - s) contains the number of cycle required to read the cycle counter
delay_dwt(0);
uint32_t f = HW_REG(_DWT_CYCCNT); // (f - e) contains the delay to call the delay function + the time to read the cycle counter
ASM_CYCLES_PER_ITERATION = (f - e) - (e - s);
// Use safer DWT function
DelayCycleFnc = delay_dwt;
}
}
#if ENABLED(MARLIN_DEV_MODE)
void dump_delay_accuracy_check()
{
auto report_call_time = [](PGM_P const name, const uint32_t cycles, const uint32_t total, const bool do_flush=true) {
SERIAL_ECHOPGM("Calling ");
serialprintPGM(name);
SERIAL_ECHOLNPAIR(" for ", cycles, "cycles took: ", total, "cycles");
if (do_flush) SERIAL_FLUSH();
};
uint32_t s, e;
SERIAL_ECHOLNPAIR("Computed delay calibration value: ", ASM_CYCLES_PER_ITERATION);
SERIAL_FLUSH();
// Display the results of the calibration above
constexpr uint32_t testValues[] = { 1, 5, 10, 20, 50, 100, 150, 200, 350, 500, 750, 1000 };
for (auto i : testValues) {
s = micros(); DELAY_US(i); e = micros();
report_call_time(PSTR("delay"), i, e - s);
}
if (HW_REG(_DWT_CTRL)) {
for (auto i : testValues) {
s = HW_REG(_DWT_CYCCNT); DELAY_CYCLES(i); e = HW_REG(_DWT_CYCCNT);
report_call_time(PSTR("delay"), i, e - s);
}
// Measure the delay to call a real function compared to a function pointer
s = HW_REG(_DWT_CYCCNT); delay_dwt(1); e = HW_REG(_DWT_CYCCNT);
report_call_time(PSTR("delay_dwt"), 1, e - s);
static PGMSTR(dcd, "DELAY_CYCLES directly ");
s = HW_REG(_DWT_CYCCNT); DELAY_CYCLES( 1); e = HW_REG(_DWT_CYCCNT);
report_call_time(dcd, 1, e - s, false);
s = HW_REG(_DWT_CYCCNT); DELAY_CYCLES( 5); e = HW_REG(_DWT_CYCCNT);
report_call_time(dcd, 5, e - s, false);
s = HW_REG(_DWT_CYCCNT); DELAY_CYCLES(10); e = HW_REG(_DWT_CYCCNT);
report_call_time(dcd, 10, e - s, false);
s = HW_REG(_DWT_CYCCNT); DELAY_CYCLES(20); e = HW_REG(_DWT_CYCCNT);
report_call_time(dcd, 20, e - s, false);
s = HW_REG(_DWT_CYCCNT); DELAY_CYCLES(50); e = HW_REG(_DWT_CYCCNT);
report_call_time(dcd, 50, e - s, false);
s = HW_REG(_DWT_CYCCNT); DELAY_CYCLES(100); e = HW_REG(_DWT_CYCCNT);
report_call_time(dcd, 100, e - s, false);
s = HW_REG(_DWT_CYCCNT); DELAY_CYCLES(200); e = HW_REG(_DWT_CYCCNT);
report_call_time(dcd, 200, e - s, false);
}
}
#endif // MARLIN_DEV_MODE
#else
void calibrate_delay_loop() {}
#if ENABLED(MARLIN_DEV_MODE)
void dump_delay_accuracy_check() {
static PGMSTR(none, "N/A on this platform");
serialprintPGM(none);
}
#endif
#endif

126
Marlin/src/HAL/shared/Delay.h

@ -21,6 +21,8 @@
*/
#pragma once
#include "../../inc/MarlinConfigPre.h"
/**
* Busy wait delay cycles routines:
*
@ -31,79 +33,68 @@
#include "../../core/macros.h"
#if defined(__arm__) || defined(__thumb__)
#if __CORTEX_M == 7
void calibrate_delay_loop();
// Cortex-M3 through M7 can use the cycle counter of the DWT unit
// https://www.anthonyvh.com/2017/05/18/cortex_m-cycle_counter/
#if defined(__arm__) || defined(__thumb__)
FORCE_INLINE static void enableCycleCounter() {
CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk;
// We want to have delay_cycle function with the lowest possible overhead, so we adjust at the function at runtime based on the current CPU best feature
typedef void (*DelayImpl)(uint32_t);
extern DelayImpl DelayCycleFnc;
#if __CORTEX_M == 7
DWT->LAR = 0xC5ACCE55; // Unlock DWT on the M7
#endif
// I've measured 36 cycles on my system to call the cycle waiting method, but it shouldn't change much to have a bit more margin, it only consume a bit more flash
#define TRIP_POINT_FOR_CALLING_FUNCTION 40
DWT->CYCCNT = 0;
DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk;
// A simple recursive template class that output exactly one 'nop' of code per recursion
template <int N> struct NopWriter {
FORCE_INLINE static void build() {
__asm__ __volatile__("nop");
NopWriter<N-1>::build();
}
};
// End the loop
template <> struct NopWriter<0> { FORCE_INLINE static void build() {} };
namespace Private {
// Split recursing template in 2 different class so we don't reach the maximum template instantiation depth limit
template <bool belowTP, int N> struct Helper {
FORCE_INLINE static void build() {
DelayCycleFnc(N - 2); // Approximative cost of calling the function (might be off by one or 2 cycles)
}
};
FORCE_INLINE volatile uint32_t getCycleCount() { return DWT->CYCCNT; }
template <int N> struct Helper<true, N> {
FORCE_INLINE static void build() {
NopWriter<N - 1>::build();
}
};
FORCE_INLINE static void DELAY_CYCLES(const uint32_t x) {
const uint32_t endCycles = getCycleCount() + x;
while (PENDING(getCycleCount(), endCycles)) {}
}
template <> struct Helper<true, 0> {
FORCE_INLINE static void build() {}
};
#else
// https://blueprints.launchpad.net/gcc-arm-embedded/+spec/delay-cycles
#define nop() __asm__ __volatile__("nop;\n\t":::)
FORCE_INLINE static void __delay_4cycles(uint32_t cy) { // +1 cycle
#if ARCH_PIPELINE_RELOAD_CYCLES < 2
#define EXTRA_NOP_CYCLES A("nop")
#else
#define EXTRA_NOP_CYCLES ""
#endif
__asm__ __volatile__(
A(".syntax unified") // is to prevent CM0,CM1 non-unified syntax
L("1")
A("subs %[cnt],#1")
EXTRA_NOP_CYCLES
A("bne 1b")
: [cnt]"+r"(cy) // output: +r means input+output
: // input:
: "cc" // clobbers:
);
}
// Select a behavior based on the constexpr'ness of the parameter
// If called with a compile-time parameter, then write as many NOP as required to reach the asked cycle count
// (there is some tripping point here to start looping when it's more profitable than gruntly executing NOPs)
// If not called from a compile-time parameter, fallback to a runtime loop counting version instead
template <bool compileTime, int Cycles>
struct SmartDelay {
FORCE_INLINE SmartDelay(int) {
if (Cycles == 0) return;
Private::Helper<Cycles < TRIP_POINT_FOR_CALLING_FUNCTION, Cycles>::build();
}
};
// Runtime version below. There is no way this would run under less than ~TRIP_POINT_FOR_CALLING_FUNCTION cycles
template <int T>
struct SmartDelay<false, T> {
FORCE_INLINE SmartDelay(int v) { DelayCycleFnc(v); }
};
// Delay in cycles
FORCE_INLINE static void DELAY_CYCLES(uint32_t x) {
if (__builtin_constant_p(x)) {
#define MAXNOPS 4
if (x <= (MAXNOPS)) {
switch (x) { case 4: nop(); case 3: nop(); case 2: nop(); case 1: nop(); }
}
else { // because of +1 cycle inside delay_4cycles
const uint32_t rem = (x - 1) % (MAXNOPS);
switch (rem) { case 3: nop(); case 2: nop(); case 1: nop(); }
if ((x = (x - 1) / (MAXNOPS)))
__delay_4cycles(x); // if need more then 4 nop loop is more optimal
}
#undef MAXNOPS
}
else if ((x >>= 2))
__delay_4cycles(x);
}
#undef nop
#define DELAY_CYCLES(X) do { SmartDelay<IS_CONSTEXPR(X), IS_CONSTEXPR(X) ? X : 0> _smrtdly_X(X); } while(0)
#endif
// For delay in microseconds, no smart delay selection is required, directly call the delay function
// Teensy compiler is too old and does not accept smart delay compile-time / run-time selection correctly
#define DELAY_US(x) DelayCycleFnc((x) * ((F_CPU) / 1000000UL))
#elif defined(__AVR__)
@ -144,10 +135,15 @@
}
#undef nop
// Delay in microseconds
#define DELAY_US(x) DELAY_CYCLES((x) * ((F_CPU) / 1000000UL))
#elif defined(__PLAT_LINUX__) || defined(ESP32)
// specified inside platform
// DELAY_CYCLES specified inside platform
// Delay in microseconds
#define DELAY_US(x) DELAY_CYCLES((x) * ((F_CPU) / 1000000UL))
#else
#error "Unsupported MCU architecture"
@ -157,5 +153,5 @@
// Delay in nanoseconds
#define DELAY_NS(x) DELAY_CYCLES((x) * ((F_CPU) / 1000000UL) / 1000UL)
// Delay in microseconds
#define DELAY_US(x) DELAY_CYCLES((x) * ((F_CPU) / 1000000UL))

3
Marlin/src/MarlinCore.cpp

@ -1005,6 +1005,9 @@ void setup() {
SERIAL_ECHO_MSG("Compiled: " __DATE__);
SERIAL_ECHO_MSG(STR_FREE_MEMORY, freeMemory(), STR_PLANNER_BUFFER_BYTES, (int)sizeof(block_t) * (BLOCK_BUFFER_SIZE));
// Some HAL need precise delay adjustment
calibrate_delay_loop();
// Init buzzer pin(s)
#if USE_BEEPER
SETUP_RUN(buzzer.init());

2
Marlin/src/core/macros.h

@ -61,6 +61,8 @@
#define _O2 __attribute__((optimize("O2")))
#define _O3 __attribute__((optimize("O3")))
#define IS_CONSTEXPR(...) __builtin_constant_p(__VA_ARGS__) // Only valid solution with C++14. Should use std::is_constant_evaluated() in C++20 instead
#ifndef UNUSED
#define UNUSED(x) ((void)(x))
#endif

8
Marlin/src/gcode/gcode_d.cpp

@ -30,6 +30,8 @@
#include "../HAL/shared/eeprom_if.h"
#include "../HAL/shared/Delay.h"
extern void dump_delay_accuracy_check();
/**
* Dn: G-code for development and testing
*
@ -141,7 +143,7 @@
}
} break;
case 5: { // D4 Read / Write onboard Flash
case 5: { // D5 Read / Write onboard Flash
#define FLASH_SIZE 1024
uint8_t *pointer = parser.hex_adr_val('A');
uint16_t len = parser.ushortval('C', 1);
@ -162,6 +164,10 @@
}
} break;
case 6: // D6 Check delay loop accuracy
dump_delay_accuracy_check();
break;
case 100: { // D100 Disable heaters and attempt a hard hang (Watchdog Test)
SERIAL_ECHOLNPGM("Disabling heaters and attempting to trigger Watchdog");
SERIAL_ECHOLNPGM("(USE_WATCHDOG " TERN(USE_WATCHDOG, "ENABLED", "DISABLED") ")");

Loading…
Cancel
Save