Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add full Unicode support for Javascript identifiers #3647

Open
wants to merge 30 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
5ad0744
Added support for UnicodeCombiningMark, fixes #3639.
ctjlewis Jul 19, 2020
93ed59e
Removed unnecessary test script, moved scripts to utils/.
ctjlewis Jul 19, 2020
7d2ea29
Added newlines for formatting
ctjlewis Jul 19, 2020
3c5ee21
Patched isCombiningMark to check Unicode space manually rather than r…
ctjlewis Jul 20, 2020
b3a720c
Added bazel output symlinks to .gitignore and removed from tree.
ctjlewis Jul 20, 2020
590f2dc
Removed unneeded java.lang.Character import.
ctjlewis Jul 20, 2020
1ecfd75
Removed unneeded test.js.
ctjlewis Jul 20, 2020
e1d5129
Merge remote-tracking branch 'upstream/master'
ctjlewis Jul 20, 2020
696e201
Adding support for non-ASCII Unicode.
ctjlewis Jul 21, 2020
0d25c89
Added .vscode to .gitignore.
ctjlewis Jul 21, 2020
f2f02e3
Added note about regexpu.
ctjlewis Jul 21, 2020
a8e9e95
Fixed test re. U+FF3F, which is now supported.
ctjlewis Jul 21, 2020
813797b
Added more notes on regex patterns used.
ctjlewis Jul 21, 2020
8fefd7d
Inlined calls to UnicodeMatch utils.
ctjlewis Aug 12, 2020
78700f2
UnicodeMatch to public class, prevent build failure
ctjlewis Aug 12, 2020
da68979
Merge branch 'master' into unicode
ctjlewis Aug 12, 2020
ad71a90
Changed isIdentifierPart and isIdentifierStart arg types from char to…
ctjlewis Aug 12, 2020
4372212
Added comment about branch prediction misses
ctjlewis Aug 12, 2020
066eaba
Clarified comments and added `fastPathAscii`
ctjlewis Aug 12, 2020
8c7c040
Re-added fastPath Greek/Latin checks until further advisement
ctjlewis Aug 12, 2020
72d6ce8
Reverting fastPathAscii check until further advisement
ctjlewis Aug 12, 2020
7e83d4f
Refactored fast paths
ctjlewis Aug 12, 2020
2f69388
Removed debugging utils/ directory
ctjlewis Aug 12, 2020
dc275d2
Comment formatting
ctjlewis Aug 12, 2020
bd26958
Fixed string/char error
ctjlewis Aug 12, 2020
5fb993f
Comment formatting
ctjlewis Aug 12, 2020
8ab20b7
Committing changes before push to remote.
ctjlewis Aug 12, 2020
cb50fe1
Explicit typecast to char for `Character.toString()`
ctjlewis Aug 12, 2020
469265f
Updated fast paths for ASCII optimization
ctjlewis Aug 12, 2020
6d9f04e
Comment formatting
ctjlewis Aug 12, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,12 @@ target/
/yarn.lock
/node_modules
/.metadata/

# Bazel stuff
/bazel-bin
/bazel-closure-compiler
/bazel-out
/bazel-testlogs

# IDE
.vscode
58 changes: 6 additions & 52 deletions src/com/google/javascript/jscomp/parsing/parser/Scanner.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
import com.google.javascript.jscomp.parsing.parser.util.ErrorReporter;
import com.google.javascript.jscomp.parsing.parser.util.SourcePosition;
import com.google.javascript.jscomp.parsing.parser.util.SourceRange;
import com.google.javascript.jscomp.parsing.parser.util.UnicodeMatch;

import java.util.ArrayList;
import javax.annotation.Nullable;

Expand Down Expand Up @@ -140,7 +142,7 @@ public LiteralToken nextRegularExpressionLiteralToken() {
nextChar();

// flags
while (isIdentifierPart(peekChar())) {
while (UnicodeMatch.isJavascriptIdentifierPart(peekChar())) {
nextChar();
}

Expand Down Expand Up @@ -764,7 +766,7 @@ private Token scanIdentifierOrKeyword(int beginToken, char ch) {
int unicodeEscapeLen = containsUnicodeEscape ? 1 : 0;

ch = peekChar();
while (isIdentifierPart(ch)
while (UnicodeMatch.isJavascriptIdentifierPart(ch)
|| ch == '\\'
|| (ch == '{' && unicodeEscapeLen == 2)
|| (ch == '}' && bracedUnicodeEscape)) {
Expand Down Expand Up @@ -804,7 +806,7 @@ private Token scanIdentifierOrKeyword(int beginToken, char ch) {
// Check to make sure the first character (or the unicode escape at the
// beginning of the identifier) is a valid identifier start character.
char start = value.charAt(0);
if (!isIdentifierStart(start)) {
if (!UnicodeMatch.isJavascriptIdentifierStart(start)) {
reportError(
getPosition(beginToken),
"Character '%c' (U+%04X) is not a valid identifier start char",
Expand Down Expand Up @@ -858,7 +860,7 @@ private static String processUnicodeEscapes(String value) {
}
// TODO(mattloring): Allow code points >= 0xFFFF (greater than the size of a char).
char ch = (char) Integer.parseInt(hexDigits, 0x10);
if (!isIdentifierPart(ch)) {
if (!UnicodeMatch.isJavascriptIdentifierPart(ch)) {
return null;
}
value = value.substring(0, escapeStart) + ch + value.substring(escapeEnd);
Expand All @@ -869,54 +871,6 @@ private static String processUnicodeEscapes(String value) {
return value;
}

@SuppressWarnings("ShortCircuitBoolean") // Intentional to minimize branches in this code
private static boolean isIdentifierStart(char ch) {
// Most code is written in pure ASCII, so create a fast path here.
if (ch <= 127) {
// Intentionally avoiding short circuiting behavior of "||" and "&&".
// This minimizes branches in this code which minimizes branch prediction misses.
return ((ch >= 'A' & ch <= 'Z') | (ch >= 'a' & ch <= 'z') | (ch == '_' | ch == '$'));
}

// Handle non-ASCII characters.
// TODO(tjgq): This should include all characters with the ID_Start property.
if (Character.isLetter(ch)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you feel this is insufficient for the native Java builds? Where do you believe you need the JavaScript version?

Copy link
Contributor Author

@ctjlewis ctjlewis Aug 12, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@concavelenz I believe the reason I removed Character.isLetter was because the GWT build would not work for non-ASCII unicode, due to limitations of J2CL / Guava implementations of those methods. See: google/j2cl#103

Apologies for the lack of specificity, this issue is about a month old and is not super fresh in my mind. Thank you for your reviews, John.

return true;
}

// Workaround for b/36459436.
// When running under GWT/J2CL, Character.isLetter only handles ASCII.
// Angular relies heavily on Latin Small Letter Barred O and Greek Capital Letter Delta.
// Greek letters are occasionally found in math code.
// Latin letters are found in our own tests.
return (ch >= 0x00C0 & ch <= 0x00D6) // Latin letters
// 0x00D7 = multiplication sign, not a letter
| (ch >= 0x00D8 & ch <= 0x00F6) // Latin letters
// 0x00F7 = division sign, not a letter
| (ch >= 0x00F8 & ch <= 0x00FF) // Latin letters
| ch == 0x0275 // Latin Barred O
| (ch >= 0x0391 & ch <= 0x03A1) // Greek uppercase letters
// 0x03A2 = unassigned
| (ch >= 0x03A3 & ch <= 0x03A9) // Remaining Greek uppercase letters
| (ch >= 0x03B1 & ch <= 0x03C9); // Greek lowercase letters
}

@SuppressWarnings("ShortCircuitBoolean") // Intentional to minimize branches in this code
private static boolean isIdentifierPart(char ch) {
// Most code is written in pure ASCII, so create a fast path here.
if (ch <= 127) {
return ((ch >= 'A' & ch <= 'Z')
| (ch >= 'a' & ch <= 'z')
| (ch >= '0' & ch <= '9')
| (ch == '_' | ch == '$'));
}

// Handle non-ASCII characters.
// TODO(tjgq): This should include all characters with the ID_Continue property, plus
// Zero Width Non-Joiner and Zero Width Joiner.
return isIdentifierStart(ch) || Character.isDigit(ch);
}

private Token scanStringLiteral(int beginIndex, char terminator) {
// String literals might span multiple lines.
SourcePosition startingPosition = getPosition(beginIndex);
Expand Down
174 changes: 174 additions & 0 deletions src/com/google/javascript/jscomp/parsing/parser/util/UnicodeMatch.java

Large diffs are not rendered by default.

25 changes: 3 additions & 22 deletions test/com/google/javascript/jscomp/deps/ClosureBundlerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -210,33 +210,14 @@ public void testCommentsAndFormattingRemovedWithTranspilation() throws Exception
}

@Test
public void testFullWidthLowLineWithDefaultTranspilerIsOkay() throws Exception {
// The last character is something the compiler doesn't handle correctly
public void testFullWidthLowLine() throws Exception {
String input = "var aesthetic_";
ClosureBundler bundler = new ClosureBundler();
StringBuilder sb = new StringBuilder();
bundler.appendTo(
sb,
SimpleDependencyInfo.builder("", "").build(),
input);
assertThat(sb.toString()).isEqualTo(input);
}

// TODO(johnplaisted): If / when the compiler can parse full width low line in identifiers
// this should be okay to be transpiled.
@Test
public void testFullWidthLowLineInTranspiledCodeIsError() throws Exception {
// The last character is something the compiler doesn't handle correctly
String input = "let aesthetic_";
ClosureBundler bundler = new ClosureBundler(BaseTranspiler.ES5_TRANSPILER);
StringBuilder sb = new StringBuilder();
try {
bundler.appendTo(sb, SimpleDependencyInfo.builder("", "").build(), input);
assertWithMessage("Expected an exception").fail();
} catch (TranspilationException e) {
assertThat(e)
.hasMessageThat()
.contains("Parse error. Character '_' (U+FF3F) is not a valid identifier start char");
}
assertThat(sb.toString()).isEqualTo("var \uff41\uff45\uff53\uff54\uff48\uff45\uff54\uff49\uff43\uff3f");
}
}
}
4 changes: 4 additions & 0 deletions travis_util/test_npm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ if [ -z $TRAVIS_BUILD_DIR ]; then
TRAVIS_BUILD_DIR=`pwd`
fi

if [ -z $TRAVIS_BUILD_DIR ]; then
TRAVIS_BUILD_DIR=`pwd`
fi

# Run yarn install so that dev dependencies are available
yarn install && cd ${TRAVIS_BUILD_DIR}/node_modules/closure-compiler-npm && yarn install

Expand Down