-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmetric.go
277 lines (259 loc) · 10.9 KB
/
metric.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
package cords
/*
BSD 3-Clause License
Copyright (c) 2020–21, Norbert Pillmayer
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Metric is a metric to calculate on a cord. Sometimes it's helpful to find
// information about a (large) text by collecting metrics from fragments and
// assembling them. Cords naturally break up texts into smaller fragments,
// letting us calculate metrics by applying them to (a subset of) fragments and
// propagate them upwards the nodes of the cord tree.
//
// An example of a (very simplistic) metric would be to count the number of
// bytes in a text. The total count is calculated by counting the bytes in every
// fragment and adding up intermediate sums while travelling upwards through the
// cord's tree.
//
// Metric is a simple interface whith a metric function that will be applied
// to portions of text. Clients will have no control over size or boundaries of
// the fragment. Applying metrics to different fragments may be done concurrently
// by the calculation driver, therefore it is illegal to hold unguarded global state
// in a metric.
//
// Combine requires the Metric to calculate a metric value “sum” (monoid) from
// two metric values. This way the metric will bubble up metric values to the root
// of the cord tree and therewith result in a single overall metric value for
// a text.
// Combine must be a monoid over cords.MetricValue, with a neutral element n
// of Apply = f("") → n, i.e. the metric value of the empty string.
//
// However, for materialized metrics it is a bit
// different from plain metrics: they resemble a free monoid. This is reflected
// by the result of materialized metrics, which is a list of spans (organized through
// a cord-tree).
// As a corollary, Combine has an additional task for materialized metrics
// than it has for plain metrics. Combine has to focus on the bytes *between* the
// already recognized spans of both the left and right sibling, and be able to
// convert them to cord leafs.
//
type Metric interface {
Apply(frag []byte) MetricValue
Combine(leftSibling, rightSibling MetricValue, metric Metric) MetricValue
}
// MaterializedMetric is a type for metrics (please refer to interface Metric)
// that build a concrete cord tree for a text-cord.
//
// A materialized metric does metric calculations exactly as a simple metric.
// However, it additionally supports building up a tree from atomic leafs
// containing metric values.
//
// There are (at least) two ways to go about building a metric tree: one to
// preserve a homomorph of the text fragments, essentially materializing a
// catamorphism, or we can re-align the leaf boundaries with the
// continuity-boundaries of the metric. We'll go with the latter and build up
// a tree which will the be somewhat decoupled from the text cord.
//
type MaterializedMetric interface {
Metric
Leafs(MetricValue, bool) []Leaf
}
// MetricValue is a type returned by applying a metric to text fragments (see
// interface Metric). It holds information about the added length of the text
// fragments which this value has been calulated for, and slices of bytes
// at either end of the accumulated fragments which have to be reprocessed.
//
// Fragments of text are presented to the metric function as slices of bytes,
// without regard to rune-, grapheme- or line-boundaries. If we want to
// calculate information about, say, the maximum line length in a text, we'd
// have to count the graphemes of fragments. Graphemes will consist, however,
// of an unknown number of bytes and code points, which may only be identified
// by reading them at the grapheme start character. If a fragment is cut in
// the middle of a grapheme, the metric at the first bytes of a fragment cannot
// reliably calculated. Therefore metrics will be calculated on substrings of
// fragments where conditions allow a metric application, and any unprocessed
// bytes at the left and right boundary will be marked for reprocessing.
//
// When propagating metric values up the tree nodes, metric value of the left
// and right child node of a cord tree node will have to be combined.
// The combination must be able to reprocess any unprocessed bytes.
//
// --- denotes unprocessed bytes
// === denotes bytes already processed by the metric
//
// (a) |-----========--| & |----==============| combine two sibling fragments
//
// (b) |-----======== ------ ==============| reprocess 6 bytes in between
//
// (c) |-----============================| combined intermediate fragment or
//
// For an approachable discussion please refer to Raph Levien's “Rope Science” series
// (https://xi-editor.io/docs/rope_science_01.html), or—less approachable—read up
// on catamorphism.
//
// Calculating metric values is a topic where implementation characteristics of cords
// get somewhat visible for clients of the API. This is unfortunate, but may be
// mitigated by helper types provided by this package. Clients usually will either create
// metrics on top of pre-defined basic metrics or they may embed the MetricValueBase
// helper type in their MetricValues.
//
type MetricValue interface {
Len() int // summed up length of text fragments
Unprocessed() ([]byte, []byte) // unprocessed bytes at either end
}
// --- Apply a metric to a cord ----------------------------------------------
// ApplyMetric applies a metric calculation on a (section of a) text.
//
// i and j are text positions with Go slice semantics.
// If [i, j) does not specify a valid slice of the text, ErrIndexOutOfBounds will be
// returned.
//
func ApplyMetric(cord Cord, i, j uint64, metric Metric) (MetricValue, error) {
if cord.IsVoid() {
return nil, nil
}
if i > cord.Len() || j > cord.Len() || j < i {
return nil, ErrIndexOutOfBounds
}
return applyMetric(&cord.root.cordNode, i, j, metric), nil
}
func applyMetric(node *cordNode, i, j uint64, metric Metric) MetricValue {
tracer().Debugf("called applyMetric([%d], %d, %d)", node.Weight(), i, j)
if node.IsLeaf() {
leaf := node.AsLeaf()
tracer().Debugf("METRIC(%s|%d, %d, %d)", leaf, leaf.Len(), i, j)
s := leaf.leaf.Substring(umax(0, i), umin(j, leaf.Len()))
v := metric.Apply(s)
tracer().Debugf("leaf metric value = %v", v)
return v
}
var v, vl, vr MetricValue
if i < node.Weight() && node.Left() != nil {
vl = applyMetric(node.Left(), i, j, metric)
tracer().Debugf("left metric value = %v", vl)
}
if node.Right() != nil && j > node.Weight() {
w := node.Weight()
vr = applyMetric(node.Right(), i-umin(w, i), j-w, metric)
tracer().Debugf("right metric value = %v", vr)
}
if !isnull(vl) && !isnull(vr) {
tracer().Debugf("COMBINE %v + %v", vl, vr)
v = metric.Combine(vl, vr, metric)
} else if !isnull(vl) {
v = vl
} else if !isnull(vr) {
v = vr
}
tracer().Debugf("combined metric value = %v", v)
tracer().Debugf("node=%v", node)
tracer().Debugf("dropping out of applyMetric([%d], %d, %d)", node.Weight(), i, j)
return v
}
// ApplyMaterializedMetric applies a materialized metric to a (section of a) text.
// Returns a metric value and a cord, which manages the spans of the metric
// on the text.
//
// i and j are text positions with Go slice semantics.
// If [i, j) does not specify a valid slice of the text, ErrIndexOutOfBounds will be
// returned.
//
func ApplyMaterializedMetric(cord Cord, i, j uint64, metric MaterializedMetric) (MetricValue, Cord, error) {
if cord.IsVoid() {
return nil, Cord{}, nil
}
if i > cord.Len() || j > cord.Len() || j < i {
return nil, Cord{}, ErrIndexOutOfBounds
}
v, c := applyMaterializedMetric(&cord.root.cordNode, i, j, metric)
spans := metric.Leafs(v, true)
cl := buildFragmentCord(spans[:1])
cr := buildFragmentCord(spans[1:])
c = Concat(cl, c, cr)
//sl, sr := v.Unprocessed() // there may be unprocessed bytes at either end
// TODO apply metric and build l.leaf and r.leaf
return v, c, nil
}
func applyMaterializedMetric(node *cordNode, i, j uint64, metric MaterializedMetric) (MetricValue, Cord) {
tracer().Debugf("called applyMaterializedMetric([%d], %d, %d)", node.Weight(), i, j)
if node.IsLeaf() {
leaf := node.AsLeaf()
tracer().Debugf("M-METRIC(%s|%d, %d, %d)", leaf, leaf.Len(), i, j)
s := leaf.leaf.Substring(umax(0, i), umin(j, leaf.Len()))
v := metric.Apply(s)
c := buildFragmentCord(metric.Leafs(v, false))
if !c.IsVoid() {
dump(&c.root.cordNode)
}
return v, c
}
var v, vl, vr MetricValue
var c, cl, cr Cord
if i < node.Weight() && node.Left() != nil {
vl, cl = applyMaterializedMetric(node.Left(), i, j, metric)
tracer().Debugf("left metric value = %v", vl)
}
if node.Right() != nil && j > node.Weight() {
w := node.Weight()
vr, cr = applyMaterializedMetric(node.Right(), i-umin(w, i), j-w, metric)
tracer().Debugf("right metric value = %v", vr)
}
if !isnull(vl) && !isnull(vr) {
tracer().Debugf("COMBINE %v + %v", vl, vr)
v = metric.Combine(vl, vr, metric)
mid := buildFragmentCord(metric.Leafs(v, false))
c = Concat(cl, mid, cr)
} else if !isnull(vl) {
v = vl
c = cl
} else if !isnull(vr) {
v = vr
c = cr
}
tracer().Debugf("combined metric value = %v", v)
tracer().Debugf("node=%v", node)
if !c.IsVoid() {
dump(&c.root.cordNode)
}
tracer().Debugf("dropping out of applyMetric([%d], %d, %d)", node.Weight(), i, j)
return v, c
}
func buildFragmentCord(leafs []Leaf) Cord {
if len(leafs) == 0 || leafs[0] == nil {
return Cord{}
}
var cord Cord
for _, leaf := range leafs {
lnode := makeLeafNode()
lnode.leaf = leaf
c := makeCord(&lnode.cordNode)
cord = cord.concat2(c)
}
if !cord.IsVoid() {
dump(&cord.root.cordNode)
}
return cord
}
func isnull(v MetricValue) bool {
return v == nil || v.Len() == 0
}