From 1f83f472c7fc02bb77f8230c60f9fd455d004939 Mon Sep 17 00:00:00 2001
From: Roman Khimov
Date: Mon, 13 Jan 2025 19:22:06 +0300
Subject: [PATCH 1/7] core: swap transfer and MPT GC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It doesn't change anything logically, but transfer GC needs the current
block timestamp for its logic, and if we delay it with MPT GC it can
fail to obtain it more often:

2025-01-13T16:15:18.311+0300 ERROR failed to get block timestamp transfer GC {"time": "1.022µs", "index": 20000}

It's not critical, this garbage can still be collected on the next run,
but we'd better avoid it anyway.

Refs. #3783.

Signed-off-by: Roman Khimov
---
 pkg/core/blockchain.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/core/blockchain.go b/pkg/core/blockchain.go
index a223f01bd6..adfbc3ef26 100644
--- a/pkg/core/blockchain.go
+++ b/pkg/core/blockchain.go
@@ -1159,8 +1159,8 @@ func (bc *Blockchain) tryRunGC(oldHeight uint32) time.Duration {
 	oldHeight /= bc.config.Ledger.GarbageCollectionPeriod
 	newHeight /= bc.config.Ledger.GarbageCollectionPeriod
 	if tgtBlock > int64(bc.config.Ledger.GarbageCollectionPeriod) && newHeight != oldHeight {
-		dur = bc.stateRoot.GC(uint32(tgtBlock), bc.store)
-		dur += bc.removeOldTransfers(uint32(tgtBlock))
+		dur = bc.removeOldTransfers(uint32(tgtBlock))
+		dur += bc.stateRoot.GC(uint32(tgtBlock), bc.store)
 	}
 	return dur
 }

From ba0ca6a4abb2023e0ac9d3d94430d87ed0dfd230 Mon Sep 17 00:00:00 2001
From: Roman Khimov
Date: Mon, 13 Jan 2025 23:37:42 +0300
Subject: [PATCH 2/7] core: perform synchronized persist if GC took some time

The intention here is to reduce the amount of in-flight changes and
prevent OOM. It doesn't matter what we're doing, persisting or
collecting garbage; what matters is that we're behind the schedule of
the regular persist cycle.

Refs. #3783.

Signed-off-by: Roman Khimov
---
 pkg/core/blockchain.go | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pkg/core/blockchain.go b/pkg/core/blockchain.go
index adfbc3ef26..438bc30705 100644
--- a/pkg/core/blockchain.go
+++ b/pkg/core/blockchain.go
@@ -1119,7 +1119,6 @@ func (bc *Blockchain) Run() {
 			return
 		case <-persistTimer.C:
 			var oldPersisted uint32
-			var gcDur time.Duration
 			if bc.config.Ledger.RemoveUntraceableBlocks {
 				oldPersisted = atomic.LoadUint32(&bc.persistedHeight)
 			}
@@ -1129,10 +1128,10 @@
 				bc.log.Warn("failed to persist blockchain", zap.Error(err))
 			}
 			if bc.config.Ledger.RemoveUntraceableBlocks {
-				gcDur = bc.tryRunGC(oldPersisted)
+				dur += bc.tryRunGC(oldPersisted)
 			}
 			nextSync = dur > persistInterval*2
-			interval := persistInterval - dur - gcDur
+			interval := persistInterval - dur
 			interval = max(interval, time.Microsecond) // Reset doesn't work with zero or negative value.
 			persistTimer.Reset(interval)
 		}
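Editorial note: a quick illustration of how the two patches above interact. This is a sketch, not code from the patches; it only assumes the 1s persist cycle used throughout the series. Patch 2 folds GC time into the same `dur` budget as the persist itself, so a long GC both shortens the pause before the next persist and can push the loop into a synchronized persist.

```go
package main

import (
	"fmt"
	"time"
)

const persistInterval = time.Second // assumed regular persist cycle

// nextTimerInterval models the rescheduling arithmetic after patch 2:
// GC duration counts against the same budget as the persist itself,
// and the timer is never reset with a non-positive value.
func nextTimerInterval(persistDur, gcDur time.Duration) (time.Duration, bool) {
	dur := persistDur + gcDur
	nextSync := dur > persistInterval*2 // far behind schedule -> synchronized persist
	interval := persistInterval - dur
	return max(interval, time.Microsecond), nextSync
}

func main() {
	fmt.Println(nextTimerInterval(300*time.Millisecond, 0))             // plenty of slack left
	fmt.Println(nextTimerInterval(300*time.Millisecond, 5*time.Second)) // GC ate the whole budget
}
```

Note that the `nextSync` flag modelled here is removed again by patch 6 later in this series.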
From 70aaeb06add755dab1b5738c4ea8c883dec64e06 Mon Sep 17 00:00:00 2001
From: Roman Khimov
Date: Tue, 14 Jan 2025 12:40:53 +0300
Subject: [PATCH 3/7] core: fix old transfer data deletion in RUB-only configuration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RemoveUntraceableBlocks without RemoveUntraceableHeaders is still a
valid configuration, and removeOldTransfers() now only checks
gcBlockTimes for timestamps, so they should always be added. This fixes

2025-01-13T23:28:57.340+0300 ERROR failed to get block timestamp transfer GC {"time": "1.162µs", "index": 20000}

for RemoveUntraceableBlocks-only configurations.

Signed-off-by: Roman Khimov
---
 pkg/core/blockchain.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/core/blockchain.go b/pkg/core/blockchain.go
index 438bc30705..abde79626e 100644
--- a/pkg/core/blockchain.go
+++ b/pkg/core/blockchain.go
@@ -1643,7 +1643,7 @@
 			}
 			for index := start; index < stop; index++ {
 				ts, err := kvcache.DeleteBlock(bc.GetHeaderHash(index), bc.config.Ledger.RemoveUntraceableHeaders)
-				if bc.config.Ledger.RemoveUntraceableHeaders && index%bc.config.Ledger.GarbageCollectionPeriod == 0 {
+				if index%bc.config.Ledger.GarbageCollectionPeriod == 0 {
 					_ = bc.gcBlockTimes.Add(index, ts)
 				}
 				if err != nil {

From c14492e8c84c9ab29ae3559671e4107dab290426 Mon Sep 17 00:00:00 2001
From: Roman Khimov
Date: Tue, 14 Jan 2025 12:45:29 +0300
Subject: [PATCH 4/7] core: raise severity of persistence failure message

It is in fact very critical.

Signed-off-by: Roman Khimov
---
 pkg/core/blockchain.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/core/blockchain.go b/pkg/core/blockchain.go
index abde79626e..d14fa9852f 100644
--- a/pkg/core/blockchain.go
+++ b/pkg/core/blockchain.go
@@ -1125,7 +1125,7 @@
 			}
 			dur, err := bc.persist(nextSync)
 			if err != nil {
-				bc.log.Warn("failed to persist blockchain", zap.Error(err))
+				bc.log.Error("failed to persist blockchain", zap.Error(err))
 			}
 			if bc.config.Ledger.RemoveUntraceableBlocks {
 				dur += bc.tryRunGC(oldPersisted)
 			}

From c07c74df41a1cf0da25a0d0c662216c69a2ba91d Mon Sep 17 00:00:00 2001
From: Roman Khimov
Date: Tue, 14 Jan 2025 17:17:34 +0300
Subject: [PATCH 5/7] docs: add more recommendations to GarbageCollectionPeriod

We're trying to delete less than 1% of the data in the default
configuration for mainnet, so it looks like this:

2025-01-14T15:51:39.449+0300 INFO finished MPT garbage collection {"removed": 221115, "kept": 71236766, "time": "5m40.323822085s"}

Spending this much time for this low gain every 10K blocks is far from
optimal.

Signed-off-by: Roman Khimov
---
 docs/node-configuration.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/node-configuration.md b/docs/node-configuration.md
index 26565f9e7f..7aedf409d7 100644
--- a/docs/node-configuration.md
+++ b/docs/node-configuration.md
@@ -18,7 +18,7 @@ node-related settings described in the table below.
 | --- | --- | --- | --- |
 | DBConfiguration | [DB Configuration](#DB-Configuration) | | Describes configuration for database. See the [DB Configuration](#DB-Configuration) section for details. |
 | LogLevel | `string` | "info" | Minimal logged messages level (can be "debug", "info", "warn", "error", "dpanic", "panic" or "fatal"). |
-| GarbageCollectionPeriod | `uint32` | 10000 | Controls MPT garbage collection interval (in blocks) for configurations with `RemoveUntraceableBlocks` enabled and `KeepOnlyLatestState` disabled. In this mode the node stores a number of MPT trees (corresponding to `MaxTraceableBlocks` and `StateSyncInterval`), but the DB needs to be clean from old entries from time to time. Doing it too often will cause too much processing overhead, doing it too rarely will leave more useless data in the DB. |
+| GarbageCollectionPeriod | `uint32` | 10000 | Controls MPT garbage collection interval (in blocks) for configurations with `RemoveUntraceableBlocks` enabled and `KeepOnlyLatestState` disabled. In this mode the node stores a number of MPT trees (corresponding to `MaxTraceableBlocks` and `StateSyncInterval`), but the DB needs to be clean from old entries from time to time. Doing it too often will cause too much processing overhead (it requires going through the whole DB, which can take minutes), doing it too rarely will leave more useless data in the DB. Always compare this to `MaxTraceableBlocks`: values lower than 10% of it are likely too low, while values higher than 50% are likely to leave more garbage than can be collected. The default value is more aligned with NeoFS networks that have low MTB values, but for the N3 mainnet it's too low. |
 | KeepOnlyLatestState | `bool` | `false` | Specifies if MPT should only store the latest state (or a set of latest states, see `P2PStateExchangeExtensions` section in the ProtocolConfiguration for details). If true, DB size will be smaller, but older roots won't be accessible. This value should remain the same for the same database. | |
 | LogPath | `string` | "", so only console logging | File path where to store node logs. |
 | NeoFSBlockFetcher | [NeoFS BlockFetcher Configuration](#NeoFS-BlockFetcher-Configuration) | | NeoFS BlockFetcher module configuration. See the [NeoFS BlockFetcher Configuration](#NeoFS-BlockFetcher-Configuration) section for details. |
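Editorial note: to make the 10%-50% guidance from the documentation change above concrete, here is a small helper that clamps a candidate `GarbageCollectionPeriod` into the recommended band. It is not part of any patch, and the `MaxTraceableBlocks` value in the example is only an assumed mainnet-like setting.

```go
package main

import "fmt"

// clampGCPeriod keeps a candidate GarbageCollectionPeriod within the rough
// 10%-50% band of MaxTraceableBlocks recommended by the docs change above.
func clampGCPeriod(period, maxTraceableBlocks uint32) uint32 {
	lo := maxTraceableBlocks / 10 // below ~10% of MTB: frequent, expensive full-DB scans for little gain
	hi := maxTraceableBlocks / 2  // above ~50% of MTB: more garbage accumulates than can be collected
	return min(max(period, lo), hi)
}

func main() {
	// With a mainnet-like MTB the default of 10000 falls far below the band.
	fmt.Println(clampGCPeriod(10000, 2102400))
}
```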
From 9513780c450632a0703e0216e1948dc88cc9cfb3 Mon Sep 17 00:00:00 2001
From: Roman Khimov
Date: Wed, 15 Jan 2025 11:31:08 +0300
Subject: [PATCH 6/7] core: adjust in-memory processed set dynamically
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of tick-tocking with sync/async and having an unpredictable data
set, we can just try to check the real number of keys that can be
processed by the underlying DB. It can't be perfect, but it still puts a
hard limit on the amount of in-memory data. It's also adaptive: slower
machines will keep less and faster machines will keep more.

This gives almost perfect 4s cycles for mainnet BoltDB with no tail
cutting; it makes zero sense to process more blocks since we're clearly
DB-bound:

2025-01-15T11:35:00.567+0300 INFO persisted to disk {"blocks": 1469, "keys": 40579, "headerHeight": 5438141, "blockHeight": 5438140, "velocity": 9912, "took": "4.378939648s"}
2025-01-15T11:35:04.699+0300 INFO persisted to disk {"blocks": 1060, "keys": 39976, "headerHeight": 5439201, "blockHeight": 5439200, "velocity": 9888, "took": "4.131985438s"}
2025-01-15T11:35:08.752+0300 INFO persisted to disk {"blocks": 1508, "keys": 39658, "headerHeight": 5440709, "blockHeight": 5440708, "velocity": 9877, "took": "4.052347569s"}
2025-01-15T11:35:12.807+0300 INFO persisted to disk {"blocks": 1645, "keys": 39565, "headerHeight": 5442354, "blockHeight": 5442353, "velocity": 9864, "took": "4.05547743s"}
2025-01-15T11:35:17.011+0300 INFO persisted to disk {"blocks": 1472, "keys": 39519, "headerHeight": 5443826, "blockHeight": 5443825, "velocity": 9817, "took": "4.203258142s"}
2025-01-15T11:35:21.089+0300 INFO persisted to disk {"blocks": 1345, "keys": 39529, "headerHeight": 5445171, "blockHeight": 5445170, "velocity": 9804, "took": "4.078297579s"}
2025-01-15T11:35:25.090+0300 INFO persisted to disk {"blocks": 1054, "keys": 39326, "headerHeight": 5446225, "blockHeight": 5446224, "velocity": 9806, "took": "4.000524899s"}
2025-01-15T11:35:30.372+0300 INFO persisted to disk {"blocks": 1239, "keys": 39349, "headerHeight": 5447464, "blockHeight": 5447463, "velocity": 9744, "took": "4.281444939s"}

A 2× limit can be considered, but this calculation isn't perfect for a
low number of keys, so a somewhat bigger tolerance is preferable for
now. Overall it doesn't degrade performance; my mainnet/bolt run was
even 8% better with this.

Fixes #3249; we don't need any option this way.

Fixes #3783 as well, it no longer OOMs in that scenario. It can still
OOM with a big GarbageCollectionPeriod (like 400K), but that can't be
solved easily.

Signed-off-by: Roman Khimov
---
 pkg/core/blockchain.go                   | 53 +++++++++++++++++++-----
 pkg/core/blockchain_core_test.go         |  2 +-
 pkg/core/prometheus.go                   | 13 ++++++
 pkg/core/storage/memcached_store_test.go |  2 +
 pkg/core/storage/memory_store.go         |  7 ++++
 5 files changed, 66 insertions(+), 11 deletions(-)

diff --git a/pkg/core/blockchain.go b/pkg/core/blockchain.go
index d14fa9852f..85291b9ee9 100644
--- a/pkg/core/blockchain.go
+++ b/pkg/core/blockchain.go
@@ -70,6 +70,13 @@ const (
 	// either, the next cycle will still do the job (only transfers need this,
 	// MPT won't notice at all).
 	defaultBlockTimesCache = 8
+
+	// persistSamples is the number of persist velocity samples to use for
+	// storeBlock limit.
+	persistSamples = 10
+	// persistMinForSampling is the minimal number of keys to take persist
+	// time into account wrt persist velocity.
+	persistMinForSampling = 100
 )
 
 // stateChangeStage denotes the stage of state modification process.
@@ -165,6 +172,14 @@ type Blockchain struct {
 	// Current persisted block count.
 	persistedHeight uint32
 
+	// keysPerPersist is the average number of persisted keys per persist
+	// time limit.
+	keysPerPersist uint32
+
+	// persistCond is signaled each time persist cycle ends, this wakes
+	// storeBlock if needed (when it has too many queued blocks)
+	persistCond *sync.Cond
+
 	// Index->Timestamp cache for garbage collector. Headers can be gone
 	// by the time it runs, so we use a tiny little cache to sync block
 	// removal (performed in storeBlock()) with transfer/MPT GC (tryRunGC())
@@ -338,6 +353,7 @@ func NewBlockchain(s storage.Store, cfg config.Blockchain, log *zap.Logger) (*Bl
 		contracts: *native.NewContracts(cfg.ProtocolConfiguration),
 	}
 
+	bc.persistCond = sync.NewCond(&bc.lock)
 	bc.gcBlockTimes, _ = lru.New[uint32, uint64](defaultBlockTimesCache) // Never errors for positive size
 	bc.stateRoot = stateroot.NewModule(cfg, bc.VerifyWitness, bc.log, bc.dao.Store)
 	bc.contracts.Designate.StateRootService = bc.stateRoot
@@ -1102,7 +1118,7 @@ func (bc *Blockchain) Run() {
 	persistTimer := time.NewTimer(persistInterval)
 	defer func() {
 		persistTimer.Stop()
-		if _, err := bc.persist(true); err != nil {
+		if _, err := bc.persist(); err != nil {
 			bc.log.Warn("failed to persist", zap.Error(err))
 		}
 		if err := bc.dao.Store.Close(); err != nil {
@@ -1112,7 +1128,6 @@
 		close(bc.runToExitCh)
 	}()
 	go bc.notificationDispatcher()
-	var nextSync bool
 	for {
 		select {
 		case <-bc.stopCh:
@@ -1123,14 +1138,13 @@
 			if bc.config.Ledger.RemoveUntraceableBlocks {
 				oldPersisted = atomic.LoadUint32(&bc.persistedHeight)
 			}
-			dur, err := bc.persist(nextSync)
+			dur, err := bc.persist()
 			if err != nil {
 				bc.log.Error("failed to persist blockchain", zap.Error(err))
 			}
 			if bc.config.Ledger.RemoveUntraceableBlocks {
 				dur += bc.tryRunGC(oldPersisted)
 			}
-			nextSync = dur > persistInterval*2
 			interval := persistInterval - dur
 			interval = max(interval, time.Microsecond) // Reset doesn't work with zero or negative value.
 			persistTimer.Reset(interval)
 		}
@@ -1797,6 +1811,14 @@ func (bc *Blockchain) storeBlock(block *block.Block, txpool *mempool.Pool) error
 	}
 
 	bc.lock.Lock()
+	// Wait for a while if we're lagging behind the persistence routine,
+	// it's too easy to OOM otherwise. Keep in mind that this check can't
+	// be perfect, so some tolerance (accepting more) is OK to have.
+	var persistVelocity = atomic.LoadUint32(&bc.keysPerPersist)
+	for persistVelocity != 0 && uint32(bc.dao.Store.Len()) > persistVelocity*4 {
+		bc.persistCond.Wait()
+	}
+
 	_, err = aerCache.Persist()
 	if err != nil {
 		bc.lock.Unlock()
@@ -2148,7 +2170,7 @@ func (bc *Blockchain) LastBatch() *storage.MemBatch {
 }
 
 // persist flushes current in-memory Store contents to the persistent storage.
-func (bc *Blockchain) persist(isSync bool) (time.Duration, error) {
+func (bc *Blockchain) persist() (time.Duration, error) {
 	var (
 		start     = time.Now()
 		duration  time.Duration
 		persisted int
 		err       error
 	)
 
-	if isSync {
-		persisted, err = bc.dao.PersistSync()
-	} else {
-		persisted, err = bc.dao.Persist()
-	}
+	persisted, err = bc.dao.Persist()
 	if err != nil {
 		return 0, err
 	}
@@ -2177,6 +2195,20 @@
 		return 0, err
 	}
 	duration = time.Since(start)
+	// Low number of keys is not representative and duration _can_
+	// be zero in tests on strange platforms like Windows.
+	if duration > 0 && persisted > persistMinForSampling {
+		var (
+			currentVelocity = uint32(int64(persisted) * int64(persistInterval) / int64(duration))
+			persistVelocity = atomic.LoadUint32(&bc.keysPerPersist)
+		)
+		if persistVelocity != 0 {
+			currentVelocity = min(currentVelocity, 2*persistVelocity) // Normalize sudden spikes.
+			currentVelocity = (persistVelocity*(persistSamples-1) + currentVelocity) / persistSamples
+		} // Otherwise it's the first sample and we take it as is.
+		atomic.StoreUint32(&bc.keysPerPersist, currentVelocity)
+		updateEstimatedPersistVelocityMetric(currentVelocity)
+	}
 	bc.log.Info("persisted to disk",
 		zap.Uint32("blocks", diff),
 		zap.Int("keys", persisted),
@@ -2187,6 +2219,7 @@
 		// update monitoring metrics.
 		updatePersistedHeightMetric(bHeight)
 	}
+	bc.persistCond.Signal()
 
 	return duration, nil
 }

diff --git a/pkg/core/blockchain_core_test.go b/pkg/core/blockchain_core_test.go
index 7f71a57905..f99fa5ad29 100644
--- a/pkg/core/blockchain_core_test.go
+++ b/pkg/core/blockchain_core_test.go
@@ -63,7 +63,7 @@ func TestAddBlock(t *testing.T) {
 	assert.Equal(t, lastBlock.Hash(), bc.CurrentHeaderHash())
 
 	// This one tests persisting blocks, so it does need to persist()
-	_, err = bc.persist(false)
+	_, err = bc.persist()
 	require.NoError(t, err)
 
 	key := make([]byte, 1+util.Uint256Size)

diff --git a/pkg/core/prometheus.go b/pkg/core/prometheus.go
index 8e429d0446..b649cd26ae 100644
--- a/pkg/core/prometheus.go
+++ b/pkg/core/prometheus.go
@@ -22,6 +22,14 @@ var (
 			Namespace: "neogo",
 		},
 	)
+	// estimatedPersistVelocity prometheus metric.
+	estimatedPersistVelocity = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Help:      "Estimation of persist velocity per cycle (1s by default)",
+			Name:      "estimated_persist_velocity",
+			Namespace: "neogo",
+		},
+	)
 	// headerHeight prometheus metric.
 	headerHeight = prometheus.NewGauge(
 		prometheus.GaugeOpts{
@@ -44,6 +52,7 @@ func init() {
 	prometheus.MustRegister(
 		blockHeight,
 		persistedHeight,
+		estimatedPersistVelocity,
 		headerHeight,
 		mempoolUnsortedTx,
 	)
@@ -53,6 +62,10 @@ func updatePersistedHeightMetric(pHeight uint32) {
 	persistedHeight.Set(float64(pHeight))
 }
 
+func updateEstimatedPersistVelocityMetric(v uint32) {
+	estimatedPersistVelocity.Set(float64(v))
+}
+
 func updateHeaderHeightMetric(hHeight uint32) {
 	headerHeight.Set(float64(hHeight))
 }

diff --git a/pkg/core/storage/memcached_store_test.go b/pkg/core/storage/memcached_store_test.go
index 586e5e06ed..cc041b4214 100644
--- a/pkg/core/storage/memcached_store_test.go
+++ b/pkg/core/storage/memcached_store_test.go
@@ -19,12 +19,14 @@ func TestMemCachedPutGetDelete(t *testing.T) {
 
 	s.Put(key, value)
 
+	require.Equal(t, 1, s.Len())
 	result, err := s.Get(key)
 	assert.Nil(t, err)
 	require.Equal(t, value, result)
 
 	s.Delete(key)
 
+	require.Equal(t, 1, s.Len()) // deletion marker
 	_, err = s.Get(key)
 	assert.NotNil(t, err)
 	assert.Equal(t, err, ErrKeyNotFound)

diff --git a/pkg/core/storage/memory_store.go b/pkg/core/storage/memory_store.go
index e4e8c20d6e..dd962cb87d 100644
--- a/pkg/core/storage/memory_store.go
+++ b/pkg/core/storage/memory_store.go
@@ -66,6 +66,13 @@ func (s *MemoryStore) putChangeSet(puts map[string][]byte, stores map[string][]b
 	}
 }
+
+// Len returns the number of keys stored.
+func (s *MemoryStore) Len() int {
+	s.mut.RLock()
+	defer s.mut.RUnlock()
+	return len(s.mem) + len(s.stor)
+}
 
 // Seek implements the Store interface.
 func (s *MemoryStore) Seek(rng SeekRange, f func(k, v []byte) bool) {
 	s.seek(rng, f, s.mut.RLock, s.mut.RUnlock)

From c047bad446100380352a3f02193d51ca4e55c18d Mon Sep 17 00:00:00 2001
From: Roman Khimov
Date: Wed, 15 Jan 2025 21:48:03 +0300
Subject: [PATCH 7/7] docs: a bit more DB recommendations

Signed-off-by: Roman Khimov
---
 docs/node-configuration.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/docs/node-configuration.md b/docs/node-configuration.md
index 7aedf409d7..7ba47982c9 100644
--- a/docs/node-configuration.md
+++ b/docs/node-configuration.md
@@ -100,7 +100,12 @@ DBConfiguration:
 ```
 where:
 - `Type` is the database type (string value). Supported types: `leveldb`, `boltdb` and
-  `inmemory` (not recommended for production usage).
+  `inmemory` (not recommended for production usage). LevelDB is better for archive nodes
+  that store all data; it deals with writes better in general, but any tail-cutting node
+  options seriously degrade its performance. BoltDB works much better in various
+  performance tests; however, it can seriously degrade GC when the DB size is bigger
+  than the amount of available memory. BoltDB is also more memory-demanding for some
+  operations, so GC can be problematic from that angle as well.
 - `LevelDBOptions` are settings for LevelDB. Includes the DB files path and ReadOnly mode
   toggle. If ReadOnly mode is on, then an error will be returned on attempt to connect to
   unexisting or empty database. Database doesn't allow changes in this mode, a warning will be logged on DB persist attempts.
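Editorial note on patch 6: the adaptive limit boils down to two pieces, an exponential moving average of keys persisted per interval and a 4× backpressure check in the block-storing path. The following is a simplified, self-contained model of that logic using the same constants as the patch; it omits the real locking and condition-variable signalling, so treat it as a sketch rather than the actual `Blockchain` implementation.

```go
package main

import (
	"fmt"
	"time"
)

const (
	persistInterval       = time.Second
	persistSamples        = 10
	persistMinForSampling = 100
)

// velocity tracks a smoothed estimate of how many keys the DB persists per interval.
type velocity struct {
	keysPerPersist uint32
}

// sample folds one persist cycle into the estimate, skipping cycles that are
// too small (or too fast) to be representative.
func (v *velocity) sample(persisted int, took time.Duration) {
	if took <= 0 || persisted <= persistMinForSampling {
		return
	}
	cur := uint32(int64(persisted) * int64(persistInterval) / int64(took))
	if v.keysPerPersist != 0 {
		cur = min(cur, 2*v.keysPerPersist) // normalize sudden spikes
		cur = (v.keysPerPersist*(persistSamples-1) + cur) / persistSamples
	}
	v.keysPerPersist = cur
}

// shouldWait reports whether a writer holding this many in-memory keys should
// block until the next persist cycle drains some of them (the 4x tolerance).
func (v *velocity) shouldWait(inMemoryKeys int) bool {
	return v.keysPerPersist != 0 && uint32(inMemoryKeys) > v.keysPerPersist*4
}

func main() {
	var v velocity
	v.sample(40000, 4*time.Second) // ~10000 keys per 1s interval, as in the mainnet logs
	fmt.Println(v.keysPerPersist, v.shouldWait(50000))
}
```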