From ccd4a801b55f52ec4e480d50f820425bdd425c6d Mon Sep 17 00:00:00 2001 From: h3xds1nz Date: Sat, 4 Jan 2025 16:43:41 +0100 Subject: [PATCH] Optimize unescaping/parsing performance of BamlResourceContent/BamlStringToken (#9508) * remove array allocation with CollectionsMarshal * use foreach to prevent multiple pointer shifts * naive span-based unescape over MatchEvaluator, increase perf by 6 times, decrease allocs by 8 times * improve func name and comments, use BamlConst.EscapeChar * add initial searchvalues for tokens to match performance for string with no escapes * Fix is-pattern in null checks, comment spaces --- .../Globalization/BamlResourceContent.cs | 153 ++++++++++++------ .../Internal/Globalization/BamlTreeUpdater.cs | 14 +- 2 files changed, 113 insertions(+), 54 deletions(-) diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationFramework/MS/Internal/Globalization/BamlResourceContent.cs b/src/Microsoft.DotNet.Wpf/src/PresentationFramework/MS/Internal/Globalization/BamlResourceContent.cs index 01205cd4248..cafe0c9e849 100644 --- a/src/Microsoft.DotNet.Wpf/src/PresentationFramework/MS/Internal/Globalization/BamlResourceContent.cs +++ b/src/Microsoft.DotNet.Wpf/src/PresentationFramework/MS/Internal/Globalization/BamlResourceContent.cs @@ -1,4 +1,4 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. @@ -8,6 +8,8 @@ using System.Text; using System.Text.RegularExpressions; +using System.Runtime.InteropServices; +using System.Buffers; namespace MS.Internal.Globalization { @@ -21,9 +23,10 @@ internal static class BamlResourceContentUtil /// internal static string EscapeString(string content) { - if (content == null) return null; + if (content is null) + return null; - StringBuilder builder = new StringBuilder(); + StringBuilder builder = new(content.Length * 2); for (int i = 0; i < content.Length; i++) { switch (content[i]) @@ -72,52 +75,105 @@ internal static string EscapeString(string content) return builder.ToString(); } + /// + /// Holds all escape tokens used for initial string-search loop to find out whether we need to unescape the string. + /// + private static readonly SearchValues s_escapeTokens = SearchValues.Create(["\\", """, "'", "&", "<", ">"], StringComparison.Ordinal); + /// /// Unescape a string. Note: /// Backslash following any character will become that character. /// Backslash by itself will be skipped. /// - internal static string UnescapeString(string content) - { - return UnescapePattern.Replace( - content, - UnescapeMatchEvaluator - ); - } - - // Regular expression - // need to use 4 backslash here because it is escaped by compiler and regular expressions - private static Regex UnescapePattern = new Regex("(\\\\.?|<|>|"|'|&)", RegexOptions.CultureInvariant | RegexOptions.Compiled); - - // delegates to escape and unesacpe a matched pattern - private static MatchEvaluator UnescapeMatchEvaluator = new MatchEvaluator(UnescapeMatch); + /// Prefer overload when possible. + internal static string UnescapeString(string content) => UnescapeString(content.AsSpan(), false) ?? content; /// - /// the delegate to Unescape the matched pattern + /// Unescape a string. Note: + /// Backslash following any character will become that character. + /// Backslash by itself will be skipped. /// - private static string UnescapeMatch(Match match) + internal static string UnescapeString(ReadOnlySpan contentSpan, bool returnNewInstance = true) { - switch (match.Value) + // Check whether there's anything to unescape + int firstEscapeToken = contentSpan.IndexOfAny(s_escapeTokens); + if (firstEscapeToken == -1) + return returnNewInstance ? new string(contentSpan) : null; + + // Allocate buffer and append the chunk without tokens (unescaped) + StringBuilder stringBuilder = new(contentSpan.Length); + stringBuilder.Append(contentSpan.Slice(0, firstEscapeToken)); + + for (int i = firstEscapeToken; i < contentSpan.Length; i++) { - case "<": return "<"; - case ">": return ">"; - case "&": return "&"; - case "'": return "'"; - case """: return "\""; - default: + if (contentSpan[i] == BamlConst.EscapeChar) // An escape token ('\') + { + if (contentSpan.Length > i + 1) // Check whether we're at the end { - // this is a '\' followed by 0 or 1 character - Debug.Assert(match.Value.Length > 0 && match.Value[0] == BamlConst.EscapeChar); - if (match.Value.Length == 2) - { - return match.Value[1].ToString(); - } - else - { - return string.Empty; - } + i++; + stringBuilder.Append(contentSpan[i]); } + else // We are, break out of the loop + break; + } + else if (contentSpan[i] == '&') // A known escape sequence shall follow + { + EvaulateEscapeSequence(stringBuilder, contentSpan, ref i); + } + else // Nothing interesting, append character + stringBuilder.Append(contentSpan[i]); } + + // Evaluates whether any of the known escape sequences follows '&' (" - ' - & - < - >) + static void EvaulateEscapeSequence(StringBuilder stringBuilder, ReadOnlySpan contentSpan, ref int i) + { + contentSpan = contentSpan.Slice(i); + + if (contentSpan.Length > 5 && contentSpan[5] == ';') + { + if (contentSpan.Slice(0, 6).SequenceEqual(""")) + { + stringBuilder.Append('"'); + i += 5; + return; + } + else if (contentSpan.Slice(0, 6).SequenceEqual("'")) + { + stringBuilder.Append('\''); + i += 5; + return; + } + } + else if (contentSpan.Length > 4 && contentSpan[4] == ';') + { + if (contentSpan.Slice(0, 5).SequenceEqual("&")) + { + stringBuilder.Append('&'); + i += 4; + return; + } + } + else if (contentSpan.Length > 3 && contentSpan[3] == ';') + { + if (contentSpan.Slice(0, 4).SequenceEqual("<")) + { + stringBuilder.Append('<'); + i += 3; + return; + } + else if (contentSpan.Slice(0, 4).SequenceEqual(">")) + { + stringBuilder.Append('>'); + i += 3; + return; + } + } + + // Default case, no escaped sequence found + stringBuilder.Append('&'); + } + + return stringBuilder.ToString(); } /// @@ -125,12 +181,15 @@ private static string UnescapeMatch(Match match) /// Element placeholders start with '#' and end with ';'. /// In case of error, a null array is returned. /// - internal static BamlStringToken[] ParseChildPlaceholder(string input) + internal static ReadOnlySpan ParseChildPlaceholder(string input) { - if (input == null) return null; + if (input is null) + return ReadOnlySpan.Empty; + + List tokens = new(8); + int tokenStart = 0; + bool inPlaceHolder = false; - List tokens = new List(8); - int tokenStart = 0; bool inPlaceHolder = false; for (int i = 0; i < input.Length; i++) { if (input[i] == BamlConst.ChildStart) @@ -140,7 +199,7 @@ internal static BamlStringToken[] ParseChildPlaceholder(string input) if (inPlaceHolder) { // All # needs to be escaped in a child place holder - return null; // error + return ReadOnlySpan.Empty; // error } inPlaceHolder = true; @@ -149,7 +208,7 @@ internal static BamlStringToken[] ParseChildPlaceholder(string input) tokens.Add( new BamlStringToken( BamlStringToken.TokenType.Text, - UnescapeString(input.Substring(tokenStart, i - tokenStart)) + UnescapeString(input.AsSpan(tokenStart, i - tokenStart)) ) ); tokenStart = i; @@ -166,7 +225,7 @@ internal static BamlStringToken[] ParseChildPlaceholder(string input) tokens.Add( new BamlStringToken( BamlStringToken.TokenType.ChildPlaceHolder, - UnescapeString(input.Substring(tokenStart + 1, i - tokenStart - 1)) + UnescapeString(input.AsSpan(tokenStart + 1, i - tokenStart - 1)) ) ); @@ -180,7 +239,7 @@ internal static BamlStringToken[] ParseChildPlaceholder(string input) if (inPlaceHolder) { // at the end of the string, all child placeholder must be closed - return null; // error + return ReadOnlySpan.Empty; // error } if (tokenStart < input.Length) @@ -188,17 +247,17 @@ internal static BamlStringToken[] ParseChildPlaceholder(string input) tokens.Add( new BamlStringToken( BamlStringToken.TokenType.Text, - UnescapeString(input.Substring(tokenStart)) + UnescapeString(input.AsSpan(tokenStart)) ) ); } - return tokens.ToArray(); + return CollectionsMarshal.AsSpan(tokens); } } - internal struct BamlStringToken + internal readonly struct BamlStringToken { internal readonly TokenType Type; internal readonly string Value; diff --git a/src/Microsoft.DotNet.Wpf/src/PresentationFramework/MS/Internal/Globalization/BamlTreeUpdater.cs b/src/Microsoft.DotNet.Wpf/src/PresentationFramework/MS/Internal/Globalization/BamlTreeUpdater.cs index 682ed386150..36c0dd2d4f3 100644 --- a/src/Microsoft.DotNet.Wpf/src/PresentationFramework/MS/Internal/Globalization/BamlTreeUpdater.cs +++ b/src/Microsoft.DotNet.Wpf/src/PresentationFramework/MS/Internal/Globalization/BamlTreeUpdater.cs @@ -613,9 +613,9 @@ private static bool GetBamlTreeNodeFromText( IList newChildrenList // list of new children ) { - BamlStringToken[] tokens = BamlResourceContentUtil.ParseChildPlaceholder(content); + ReadOnlySpan tokens = BamlResourceContentUtil.ParseChildPlaceholder(content); - if (tokens == null) + if (tokens.IsEmpty) { bamlTreeMap.Resolver.RaiseErrorNotifyEvent( new BamlLocalizerErrorNotifyEventArgs( @@ -627,19 +627,19 @@ private static bool GetBamlTreeNodeFromText( } bool succeed = true; - for (int i = 0; i < tokens.Length; i++) + foreach (BamlStringToken token in tokens) { - switch (tokens[i].Type) + switch (token.Type) { case BamlStringToken.TokenType.Text: { - BamlTreeNode node = new BamlTextNode(tokens[i].Value); + BamlTreeNode node = new BamlTextNode(token.Value); newChildrenList.Add(node); break; } case BamlStringToken.TokenType.ChildPlaceHolder: { - BamlTreeNode node = bamlTreeMap.MapUidToBamlTreeElementNode(tokens[i].Value); + BamlTreeNode node = bamlTreeMap.MapUidToBamlTreeElementNode(token.Value); // The value will be null if there is no uid-matching node in the tree. if (node != null) @@ -651,7 +651,7 @@ private static bool GetBamlTreeNodeFromText( bamlTreeMap.Resolver.RaiseErrorNotifyEvent( new BamlLocalizerErrorNotifyEventArgs( new BamlLocalizableResourceKey( - tokens[i].Value, + token.Value, string.Empty, string.Empty ),