From 67c1fc9db6452f3d95a8f7c8c34d375a46a36acb Mon Sep 17 00:00:00 2001
From: Dave Holoway <adelphes@gmail.com>
Date: Sun, 7 Jun 2020 23:50:51 +0100
Subject: [PATCH] allow unicode characters, $ and _ in identifiers

---
 langserver/java/body-parser3.js | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/langserver/java/body-parser3.js b/langserver/java/body-parser3.js
index 15b416f..cea4566 100644
--- a/langserver/java/body-parser3.js
+++ b/langserver/java/body-parser3.js
@@ -2646,7 +2646,7 @@ class ResolvedIdent {
  *   ".*       unterminated string literal
  *   '\\?.?'?  character literal - possibly unterminated and/or with invalid escape
  *   \.?\d     number literal (start) - further processing extracts the value
- *   \w+       word - keyword or identifier
+ *   [\p{L}\p{N}_$]+       word - keyword or identifier
  *   [;,?:(){}\[\]]   single-character symbols and operators
  *   \.(\.\.)?    . ...
  * 
@@ -2670,7 +2670,7 @@ class ResolvedIdent {
  */
 function tokenize(source, offset = 0, length = source.length) {
     const text = source.slice(offset, offset + length);
-    const raw_token_re = /(\s+|\/\/.*|\/\*[\d\D]*?\*\/)|("[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*")|(".*)|('\\?.?'?)|(\.?\d)|(\w+)|(\()|([;,?:(){}\[\]@]|\.(?:\.\.)?)|([!=/%*^]=?|<<?=?|>>?>?=?|&[&=]?|\|[|=]?|(\+\+|--)|[+-]=?|~)|$/g;
+    const raw_token_re = /(\s+|\/\/.*|\/\*[\d\D]*?\*\/)|("[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*")|(".*)|('\\?.?'?)|(\.?\d)|([\p{L}\p{N}$_]+)|(\()|([;,?:(){}\[\]@]|\.(?:\.\.)?)|([!=/%*^]=?|<<?=?|>>?>?=?|&[&=]?|\|[|=]?|(\+\+|--)|[+-]=?|~)|$/gu;
     const raw_token_types = [
         'wsc',
         'string-literal',
@@ -2696,7 +2696,7 @@ function tokenize(source, offset = 0, length = source.length) {
      * \w+    word
      * ```
      */
-    const word_re = /(?:(true|false)|(this|super|null)|(int|long|short|byte|float|double|char|boolean|void)|(new)|(instanceof)|(public|private|protected|static|final|abstract|native|volatile|transient)|(if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized)|(class|enum|interface)|(package|import)|(\w+))\b/g;
+    const word_re = /^(?:(true|false)|(this|super|null)|(int|long|short|byte|float|double|char|boolean|void)|(new)|(instanceof)|(public|private|protected|static|final|abstract|native|volatile|transient)|(if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized)|(class|enum|interface)|(package|import)|(.+))$/;
     const word_token_types = [
         'boolean-literal',
         'object-literal',
@@ -2753,9 +2753,8 @@ function tokenize(source, offset = 0, length = source.length) {
                 break;
             case 'word':
                 // we need to work out what kind of keyword, literal or ident this is
-                word_re.lastIndex = m.index;
-                m = word_re.exec(text);
-                idx = m.findIndex((match,i) => i && match) - 1;
+                let word_m = m[0].match(word_re);
+                idx = word_m.findIndex((match,i) => i && match) - 1;
                 tokentype = word_token_types[idx];        
                 break;
             case 'operator':