Skip to content

Commit

Permalink
Optimize unescaping/parsing performance of BamlResourceContent/BamlStringToken (#9508)
Browse files Browse the repository at this point in the history

* remove array allocation with CollectionsMarshal

* use foreach to prevent multiple pointer shifts

* naive span-based unescape over MatchEvaluator, increase perf by 6 times, decrease allocs by 8 times

* improve func name and comments, use BamlConst.EscapeChar

* add an initial SearchValues<string> lookup for the escape tokens to match the previous performance for strings with no escapes

* Fix is-pattern in null checks, comment spaces
  • Loading branch information
h3xds1nz authored Jan 4, 2025
1 parent d911a56 commit ccd4a80
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 54 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Licensed to the .NET Foundation under one or more agreements.
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

Expand All @@ -8,6 +8,8 @@

using System.Text;
using System.Text.RegularExpressions;
using System.Runtime.InteropServices;
using System.Buffers;

namespace MS.Internal.Globalization
{
Expand All @@ -21,9 +23,10 @@ internal static class BamlResourceContentUtil
/// </summary>
internal static string EscapeString(string content)
{
if (content == null) return null;
if (content is null)
return null;

StringBuilder builder = new StringBuilder();
StringBuilder builder = new(content.Length * 2);
for (int i = 0; i < content.Length; i++)
{
switch (content[i])
Expand Down Expand Up @@ -72,65 +75,121 @@ internal static string EscapeString(string content)
return builder.ToString();
}

/// <summary>
/// Holds all escape tokens used for initial string-search loop to find out whether we need to unescape the string.
/// </summary>
private static readonly SearchValues<string> s_escapeTokens = SearchValues.Create(["\\", "&quot;", "&apos;", "&amp;", "&lt;", "&gt;"], StringComparison.Ordinal);

/// <summary>
/// Unescape a string. Note:
/// Any character following a backslash will become that character.
/// Backslash by itself (at the very end of the input) will be skipped.
/// </summary>
/// <param name="content">The escaped text.</param>
/// <returns>
/// The unescaped text; when <paramref name="content"/> contains no escape tokens,
/// <paramref name="content"/> itself is returned without allocating a new string.
/// </returns>
/// <remarks>Prefer <see cref="UnescapeString(ReadOnlySpan{char}, bool)"/> overload when possible.</remarks>
internal static string UnescapeString(string content) => UnescapeString(content.AsSpan(), false) ?? content;

/// <summary>
/// Unescape a string. Note:
/// Any character following a backslash will become that character.
/// Backslash by itself (at the very end of the input) will be skipped.
/// The sequences &amp;quot; &amp;apos; &amp;amp; &amp;lt; and &amp;gt; become the character they stand for.
/// </summary>
/// <param name="contentSpan">The escaped text.</param>
/// <param name="returnNewInstance">
/// Controls the result when <paramref name="contentSpan"/> contains no escape tokens:
/// <see langword="true"/> returns a new string with the same content, <see langword="false"/> returns <see langword="null"/>.
/// </param>
/// <returns>
/// The unescaped text, or <see langword="null"/> when nothing needed unescaping and
/// <paramref name="returnNewInstance"/> is <see langword="false"/>.
/// </returns>
internal static string UnescapeString(ReadOnlySpan<char> contentSpan, bool returnNewInstance = true)
{
    // Check whether there's anything to unescape
    int firstEscapeToken = contentSpan.IndexOfAny(s_escapeTokens);
    if (firstEscapeToken == -1)
        return returnNewInstance ? new string(contentSpan) : null;

    // Allocate buffer and append the leading chunk that contains no tokens verbatim;
    // the unescaped text is never longer than the escaped input
    StringBuilder stringBuilder = new(contentSpan.Length);
    stringBuilder.Append(contentSpan.Slice(0, firstEscapeToken));

    for (int i = firstEscapeToken; i < contentSpan.Length; i++)
    {
        char current = contentSpan[i];

        if (current == BamlConst.EscapeChar) // An escape token ('\')
        {
            // A trailing escape token has nothing to escape and is dropped
            if (i + 1 >= contentSpan.Length)
                break;

            // The escaped character is taken literally, even if it is '\' or '&'
            i++;
            stringBuilder.Append(contentSpan[i]);
        }
        else if (current == '&') // A known escape sequence may follow
        {
            EvaluateEscapeSequence(stringBuilder, contentSpan, ref i);
        }
        else // Nothing interesting, append character
        {
            stringBuilder.Append(current);
        }
    }

    return stringBuilder.ToString();

    // Evaluates whether any of the known escape sequences starts at contentSpan[i] (&quot; - &apos; - &amp; - &lt; - &gt;).
    // Appends the unescaped character and moves i onto the last character of the sequence when one is found;
    // otherwise appends the '&' itself and leaves i untouched.
    static void EvaluateEscapeSequence(StringBuilder stringBuilder, ReadOnlySpan<char> contentSpan, ref int i)
    {
        ReadOnlySpan<char> remaining = contentSpan.Slice(i);
        char unescaped;
        int sequenceLength;

        // Each sequence is matched by prefix, independently of the others. Choosing a candidate by the
        // position of ';' is not enough: in "&lt;a;" the ';' at index 5 belongs to the text after "&lt;".
        if (remaining.StartsWith("&quot;", StringComparison.Ordinal))
        {
            unescaped = '"';
            sequenceLength = 6;
        }
        else if (remaining.StartsWith("&apos;", StringComparison.Ordinal))
        {
            unescaped = '\'';
            sequenceLength = 6;
        }
        else if (remaining.StartsWith("&amp;", StringComparison.Ordinal))
        {
            unescaped = '&';
            sequenceLength = 5;
        }
        else if (remaining.StartsWith("&lt;", StringComparison.Ordinal))
        {
            unescaped = '<';
            sequenceLength = 4;
        }
        else if (remaining.StartsWith("&gt;", StringComparison.Ordinal))
        {
            unescaped = '>';
            sequenceLength = 4;
        }
        else
        {
            // Default case, no escaped sequence found; keep the ampersand as-is
            stringBuilder.Append('&');
            return;
        }

        stringBuilder.Append(unescaped);

        // The enclosing loop increments i once more, so stop on the sequence's closing ';'
        i += sequenceLength - 1;
    }
}

/// <summary>
/// Parse the input string into an array of text/child-placeholder tokens.
/// Element placeholders start with '#' and end with ';'.
/// In case of error, a null array is returned.
/// </summary>
internal static BamlStringToken[] ParseChildPlaceholder(string input)
internal static ReadOnlySpan<BamlStringToken> ParseChildPlaceholder(string input)
{
if (input == null) return null;
if (input is null)
return ReadOnlySpan<BamlStringToken>.Empty;

List<BamlStringToken> tokens = new(8);
int tokenStart = 0;
bool inPlaceHolder = false;

List<BamlStringToken> tokens = new List<BamlStringToken>(8);
int tokenStart = 0; bool inPlaceHolder = false;
for (int i = 0; i < input.Length; i++)
{
if (input[i] == BamlConst.ChildStart)
Expand All @@ -140,7 +199,7 @@ internal static BamlStringToken[] ParseChildPlaceholder(string input)
if (inPlaceHolder)
{
// All # needs to be escaped in a child place holder
return null; // error
return ReadOnlySpan<BamlStringToken>.Empty; // error
}

inPlaceHolder = true;
Expand All @@ -149,7 +208,7 @@ internal static BamlStringToken[] ParseChildPlaceholder(string input)
tokens.Add(
new BamlStringToken(
BamlStringToken.TokenType.Text,
UnescapeString(input.Substring(tokenStart, i - tokenStart))
UnescapeString(input.AsSpan(tokenStart, i - tokenStart))
)
);
tokenStart = i;
Expand All @@ -166,7 +225,7 @@ internal static BamlStringToken[] ParseChildPlaceholder(string input)
tokens.Add(
new BamlStringToken(
BamlStringToken.TokenType.ChildPlaceHolder,
UnescapeString(input.Substring(tokenStart + 1, i - tokenStart - 1))
UnescapeString(input.AsSpan(tokenStart + 1, i - tokenStart - 1))
)
);

Expand All @@ -180,25 +239,25 @@ internal static BamlStringToken[] ParseChildPlaceholder(string input)
if (inPlaceHolder)
{
// at the end of the string, all child placeholder must be closed
return null; // error
return ReadOnlySpan<BamlStringToken>.Empty; // error
}

if (tokenStart < input.Length)
{
tokens.Add(
new BamlStringToken(
BamlStringToken.TokenType.Text,
UnescapeString(input.Substring(tokenStart))
UnescapeString(input.AsSpan(tokenStart))
)
);
}

return tokens.ToArray();
return CollectionsMarshal.AsSpan(tokens);
}
}


internal struct BamlStringToken
internal readonly struct BamlStringToken
{
internal readonly TokenType Type;
internal readonly string Value;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -613,9 +613,9 @@ private static bool GetBamlTreeNodeFromText(
IList<BamlTreeNode> newChildrenList // list of new children
)
{
BamlStringToken[] tokens = BamlResourceContentUtil.ParseChildPlaceholder(content);
ReadOnlySpan<BamlStringToken> tokens = BamlResourceContentUtil.ParseChildPlaceholder(content);

if (tokens == null)
if (tokens.IsEmpty)
{
bamlTreeMap.Resolver.RaiseErrorNotifyEvent(
new BamlLocalizerErrorNotifyEventArgs(
Expand All @@ -627,19 +627,19 @@ private static bool GetBamlTreeNodeFromText(
}

bool succeed = true;
for (int i = 0; i < tokens.Length; i++)
foreach (BamlStringToken token in tokens)
{
switch (tokens[i].Type)
switch (token.Type)
{
case BamlStringToken.TokenType.Text:
{
BamlTreeNode node = new BamlTextNode(tokens[i].Value);
BamlTreeNode node = new BamlTextNode(token.Value);
newChildrenList.Add(node);
break;
}
case BamlStringToken.TokenType.ChildPlaceHolder:
{
BamlTreeNode node = bamlTreeMap.MapUidToBamlTreeElementNode(tokens[i].Value);
BamlTreeNode node = bamlTreeMap.MapUidToBamlTreeElementNode(token.Value);

// The value will be null if there is no uid-matching node in the tree.
if (node != null)
Expand All @@ -651,7 +651,7 @@ private static bool GetBamlTreeNodeFromText(
bamlTreeMap.Resolver.RaiseErrorNotifyEvent(
new BamlLocalizerErrorNotifyEventArgs(
new BamlLocalizableResourceKey(
tokens[i].Value,
token.Value,
string.Empty,
string.Empty
),
Expand Down

0 comments on commit ccd4a80

Please sign in to comment.