Commit

Merge branch 'main' into enhancement/pit_index_filter
javanna committed Nov 20, 2023
2 parents f25527a + 36a2f9b commit f5803a0
Showing 134 changed files with 4,381 additions and 1,604 deletions.
benchmarks/src/main/java/org/elasticsearch/benchmark/compute/operator/ValuesSourceReaderBenchmark.java

@@ -8,8 +8,11 @@
 
 package org.elasticsearch.benchmark.compute.operator;
 
+import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.StoredField;
 import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
@@ -19,6 +22,7 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.NumericUtils;
+import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.compute.data.BlockFactory;
 import org.elasticsearch.compute.data.BytesRefBlock;
 import org.elasticsearch.compute.data.BytesRefVector;
@@ -30,14 +34,16 @@
 import org.elasticsearch.compute.data.LongBlock;
 import org.elasticsearch.compute.data.LongVector;
 import org.elasticsearch.compute.data.Page;
-import org.elasticsearch.compute.lucene.BlockReaderFactories;
 import org.elasticsearch.compute.lucene.LuceneSourceOperator;
 import org.elasticsearch.compute.lucene.ValuesSourceReaderOperator;
 import org.elasticsearch.compute.operator.topn.TopNOperator;
 import org.elasticsearch.core.IOUtils;
+import org.elasticsearch.index.IndexVersion;
 import org.elasticsearch.index.mapper.BlockLoader;
 import org.elasticsearch.index.mapper.KeywordFieldMapper;
+import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.NumberFieldMapper;
+import org.elasticsearch.search.lookup.SearchLookup;
 import org.openjdk.jmh.annotations.Benchmark;
 import org.openjdk.jmh.annotations.BenchmarkMode;
 import org.openjdk.jmh.annotations.Fork;
@@ -56,7 +62,9 @@
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Map;
 import java.util.PrimitiveIterator;
+import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.IntStream;

@@ -93,18 +101,113 @@ public class ValuesSourceReaderBenchmark {
         }
     }
 
-    private static BlockLoader blockLoader(String name) {
+    private static List<ValuesSourceReaderOperator.FieldInfo> fields(String name) {
         return switch (name) {
-            case "long" -> numericBlockLoader(name, NumberFieldMapper.NumberType.LONG);
-            case "int" -> numericBlockLoader(name, NumberFieldMapper.NumberType.INTEGER);
-            case "double" -> numericBlockLoader(name, NumberFieldMapper.NumberType.DOUBLE);
-            case "keyword" -> new KeywordFieldMapper.KeywordFieldType(name).blockLoader(null);
-            default -> throw new IllegalArgumentException("can't read [" + name + "]");
+            case "3_stored_keywords" -> List.of(
+                new ValuesSourceReaderOperator.FieldInfo("keyword_1", List.of(blockLoader("stored_keyword_1"))),
+                new ValuesSourceReaderOperator.FieldInfo("keyword_2", List.of(blockLoader("stored_keyword_2"))),
+                new ValuesSourceReaderOperator.FieldInfo("keyword_3", List.of(blockLoader("stored_keyword_3")))
+            );
+            default -> List.of(new ValuesSourceReaderOperator.FieldInfo(name, List.of(blockLoader(name))));
         };
     }
 
-    private static BlockLoader numericBlockLoader(String name, NumberFieldMapper.NumberType numberType) {
-        return new NumberFieldMapper.NumberFieldType(name, numberType).blockLoader(null);
+    enum Where {
+        DOC_VALUES,
+        SOURCE,
+        STORED;
     }
+
+    private static BlockLoader blockLoader(String name) {
+        Where where = Where.DOC_VALUES;
+        if (name.startsWith("stored_")) {
+            name = name.substring("stored_".length());
+            where = Where.STORED;
+        } else if (name.startsWith("source_")) {
+            name = name.substring("source_".length());
+            where = Where.SOURCE;
+        }
+        switch (name) {
+            case "long":
+                return numericBlockLoader(name, where, NumberFieldMapper.NumberType.LONG);
+            case "int":
+                return numericBlockLoader(name, where, NumberFieldMapper.NumberType.INTEGER);
+            case "double":
+                return numericBlockLoader(name, where, NumberFieldMapper.NumberType.DOUBLE);
+            case "keyword":
+                name = "keyword_1";
+        }
+        if (name.startsWith("keyword")) {
+            boolean syntheticSource = false;
+            FieldType ft = new FieldType(KeywordFieldMapper.Defaults.FIELD_TYPE);
+            switch (where) {
+                case DOC_VALUES:
+                    break;
+                case SOURCE:
+                    ft.setDocValuesType(DocValuesType.NONE);
+                    break;
+                case STORED:
+                    ft.setStored(true);
+                    ft.setDocValuesType(DocValuesType.NONE);
+                    syntheticSource = true;
+                    break;
+            }
+            ft.freeze();
+            return new KeywordFieldMapper.KeywordFieldType(
+                name,
+                ft,
+                Lucene.KEYWORD_ANALYZER,
+                Lucene.KEYWORD_ANALYZER,
+                Lucene.KEYWORD_ANALYZER,
+                new KeywordFieldMapper.Builder(name, IndexVersion.current()).docValues(ft.docValuesType() != DocValuesType.NONE),
+                syntheticSource
+            ).blockLoader(new MappedFieldType.BlockLoaderContext() {
+                @Override
+                public String indexName() {
+                    return "benchmark";
+                }
+
+                @Override
+                public SearchLookup lookup() {
+                    throw new UnsupportedOperationException();
+                }
+
+                @Override
+                public Set<String> sourcePaths(String name) {
+                    return Set.of(name);
+                }
+            });
+        }
+        throw new IllegalArgumentException("can't read [" + name + "]");
+    }
+
+    private static BlockLoader numericBlockLoader(String name, Where where, NumberFieldMapper.NumberType numberType) {
+        boolean stored = false;
+        boolean docValues = true;
+        switch (where) {
+            case DOC_VALUES:
+                break;
+            case SOURCE:
+                stored = true;
+                docValues = false;
+                break;
+            case STORED:
+                throw new UnsupportedOperationException();
+        }
+        return new NumberFieldMapper.NumberFieldType(
+            name,
+            numberType,
+            true,
+            stored,
+            docValues,
+            true,
+            null,
+            Map.of(),
+            null,
+            false,
+            null,
+            null
+        ).blockLoader(null);
+    }
 
     /**
@@ -122,7 +225,7 @@
     @Param({ "in_order", "shuffled", "shuffled_singles" })
     public String layout;
 
-    @Param({ "long", "int", "double", "keyword" })
+    @Param({ "long", "int", "double", "keyword", "stored_keyword", "3_stored_keywords" })
     public String name;
 
     private Directory directory;
@@ -134,9 +237,9 @@
     public void benchmark() {
         ValuesSourceReaderOperator op = new ValuesSourceReaderOperator(
             BlockFactory.getNonBreakingInstance(),
-            List.of(BlockReaderFactories.loaderToFactory(reader, blockLoader(name))),
-            0,
-            name
+            fields(name),
+            List.of(reader),
+            0
         );
         long sum = 0;
         for (Page page : pages) {
@@ -160,7 +263,7 @@ public void benchmark() {
                         sum += (long) values.getDouble(p);
                     }
                 }
-                case "keyword" -> {
+                case "keyword", "stored_keyword" -> {
                     BytesRef scratch = new BytesRef();
                     BytesRefVector values = op.getOutput().<BytesRefBlock>getBlock(1).asVector();
                     for (int p = 0; p < values.getPositionCount(); p++) {
@@ -170,21 +273,59 @@ public void benchmark() {
                         sum += Integer.parseInt(r.utf8ToString());
                     }
                 }
+                case "3_stored_keywords" -> {
+                    BytesRef scratch = new BytesRef();
+                    Page out = op.getOutput();
+                    for (BytesRefVector values : new BytesRefVector[] {
+                        out.<BytesRefBlock>getBlock(1).asVector(),
+                        out.<BytesRefBlock>getBlock(2).asVector(),
+                        out.<BytesRefBlock>getBlock(3).asVector() }) {
+
+                        for (int p = 0; p < values.getPositionCount(); p++) {
+                            BytesRef r = values.getBytesRef(p, scratch);
+                            r.offset++;
+                            r.length--;
+                            sum += Integer.parseInt(r.utf8ToString());
+                        }
+                    }
+                }
             }
         }
-        long expected;
-        if (name.equals("keyword")) {
-            expected = 0;
-            for (int i = 0; i < INDEX_SIZE; i++) {
-                expected += i % 1000;
-            }
-        } else {
-            expected = INDEX_SIZE;
-            expected = expected * (expected - 1) / 2;
+        long expected = 0;
+        switch (name) {
+            case "keyword", "stored_keyword":
+                for (int i = 0; i < INDEX_SIZE; i++) {
+                    expected += i % 1000;
+                }
+                break;
+            case "3_stored_keywords":
+                for (int i = 0; i < INDEX_SIZE; i++) {
+                    expected += 3 * (i % 1000);
+                }
+                break;
+            default:
+                expected = INDEX_SIZE;
+                expected = expected * (expected - 1) / 2;
         }
         if (expected != sum) {
             throw new AssertionError("[" + layout + "][" + name + "] expected [" + expected + "] but was [" + sum + "]");
         }
+        boolean foundStoredFieldLoader = false;
+        ValuesSourceReaderOperator.Status status = (ValuesSourceReaderOperator.Status) op.status();
+        for (Map.Entry<String, Integer> e : status.readersBuilt().entrySet()) {
+            if (e.getKey().indexOf("stored_fields") >= 0) {
+                foundStoredFieldLoader = true;
+            }
+        }
+        if (name.indexOf("stored") >= 0) {
+            if (foundStoredFieldLoader == false) {
+                throw new AssertionError("expected to use a stored field loader but only had: " + status.readersBuilt());
+            }
+        } else {
+            if (foundStoredFieldLoader) {
+                throw new AssertionError("expected not to use a stored field loader but only had: " + status.readersBuilt());
+            }
+        }
     }
 
     @Setup
@@ -195,15 +336,23 @@ public void setup() throws IOException {
 
     private void setupIndex() throws IOException {
        directory = new ByteBuffersDirectory();
+        FieldType keywordFieldType = new FieldType(KeywordFieldMapper.Defaults.FIELD_TYPE);
+        keywordFieldType.setStored(true);
+        keywordFieldType.freeze();
        try (IndexWriter iw = new IndexWriter(directory, new IndexWriterConfig().setMergePolicy(NoMergePolicy.INSTANCE))) {
            for (int i = 0; i < INDEX_SIZE; i++) {
                String c = Character.toString('a' - ((i % 1000) % 26) + 26);
                iw.addDocument(
                    List.of(
                        new NumericDocValuesField("long", i),
+                        new StoredField("long", i),
                        new NumericDocValuesField("int", i),
+                        new StoredField("int", i),
                        new NumericDocValuesField("double", NumericUtils.doubleToSortableLong(i)),
-                        new KeywordFieldMapper.KeywordField("keyword", new BytesRef(c + i % 1000), KeywordFieldMapper.Defaults.FIELD_TYPE)
+                        new StoredField("double", (double) i),
+                        new KeywordFieldMapper.KeywordField("keyword_1", new BytesRef(c + i % 1000), keywordFieldType),
+                        new KeywordFieldMapper.KeywordField("keyword_2", new BytesRef(c + i % 1000), keywordFieldType),
+                        new KeywordFieldMapper.KeywordField("keyword_3", new BytesRef(c + i % 1000), keywordFieldType)
                    )
                );
                if (i % COMMIT_INTERVAL == 0) {
5 changes: 5 additions & 0 deletions docs/changelog/100408.yaml
@@ -0,0 +1,5 @@
+pr: 100408
+summary: "ESQL: Make blocks ref counted"
+area: ES|QL
+type: enhancement
+issues: []
5 changes: 5 additions & 0 deletions docs/changelog/102138.yaml
@@ -0,0 +1,5 @@
+pr: 102138
+summary: Skip shards that don't match the source query during checkpointing
+area: Transform
+type: enhancement
+issues: []
5 changes: 5 additions & 0 deletions docs/changelog/102192.yaml
@@ -0,0 +1,5 @@
+pr: 102192
+summary: "ESQL: Load more than one field at once"
+area: ES|QL
+type: enhancement
+issues: []
6 changes: 6 additions & 0 deletions docs/changelog/102282.yaml
@@ -0,0 +1,6 @@
+pr: 102282
+summary: "ES|QL: Fix drop of renamed grouping"
+area: ES|QL
+type: bug
+issues:
+ - 102121
6 changes: 6 additions & 0 deletions docs/changelog/102350.yaml
@@ -0,0 +1,6 @@
+pr: 102350
+summary: "ESQL: Fix rare bug with empty string"
+area: ES|QL
+type: bug
+issues:
+ - 101969
30 changes: 4 additions & 26 deletions docs/reference/query-dsl/text-expansion-query.asciidoc
@@ -78,29 +78,7 @@ GET my-index/_search
 ----
 // TEST[skip: TBD]
 
-[discrete]
-[[optimizing-text-expansion]]
-=== Optimizing the search performance of the text_expansion query
-
-https://www.elastic.co/blog/faster-retrieval-of-top-hits-in-elasticsearch-with-block-max-wand[Max WAND]
-is an optimization technique used by {es} to skip documents that cannot score
-competitively against the current best matching documents. However, the tokens
-generated by the ELSER model don't work well with the Max WAND optimization.
-Consequently, enabling Max WAND can actually increase query latency for
-`text_expansion`. For datasets of a significant size, disabling Max
-WAND leads to lower query latencies.
-
-Max WAND is controlled by the
-<<track-total-hits, track_total_hits>> query parameter. Setting track_total_hits
-to true forces {es} to consider all documents, resulting in lower query
-latencies for the `text_expansion` query. However, other {es} queries run slower
-when Max WAND is disabled.
-
-If you are combining the `text_expansion` query with standard text queries in a
-compound search, it is recommended to measure the query performance before
-deciding which setting to use.
-
-NOTE: The `track_total_hits` option applies to all queries in the search request
-and may be optimal for some queries but not for others. Take into account the
-characteristics of all your queries to determine the most suitable
-configuration.
+[NOTE]
+====
+Depending on your data, the text expansion query may be faster with `track_total_hits: false`.
+====
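For reference, pairing the file's earlier example request with this setting would look like the sketch below. This is illustrative only, not content added by the commit; the index name, field name, and model id mirror the examples already in this file:

[source,console]
----
GET my-index/_search
{
  "track_total_hits": false,
  "query": {
    "text_expansion": {
      "ml.tokens": {
        "model_id": ".elser_model_2",
        "model_text": "How is the weather in Jamaica?"
      }
    }
  }
}
----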
11 changes: 11 additions & 0 deletions docs/reference/search/search-your-data/search-api.asciidoc
@@ -440,6 +440,17 @@ GET my-index-000001/_search
 Finally you can force an accurate count by setting `"track_total_hits"`
 to `true` in the request.
 
+[TIP]
+=========================================
+The `track_total_hits` parameter allows you to trade hit count accuracy for performance.
+In general the lower the value of `track_total_hits` the faster the query will be,
+with `false` returning the fastest results.
+Setting `track_total_hits` to true will cause {es} to return exact hit counts, which could
+hurt query performance because it disables the
+https://www.elastic.co/blog/faster-retrieval-of-top-hits-in-elasticsearch-with-block-max-wand[Max WAND]
+optimization.
+=========================================
+
 [discrete]
 [[quickly-check-for-matching-docs]]
 === Quickly check for matching docs
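For comparison, the fastest variant of the page's earlier example disables total hit counting entirely. This sketch is illustrative and not part of the diff; the index, field, and value follow the examples already used on this page:

[source,console]
----
GET my-index-000001/_search
{
  "track_total_hits": false,
  "query": {
    "match": {
      "user.id": "elkbee"
    }
  }
}
----

With `false`, no total is computed at all, while an integer such as `100` keeps the count exact up to that threshold and reports a lower bound (`"relation": "gte"`) beyond it.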