mirror of
https://github.com/adelphes/android-dev-ext.git
synced 2025-12-22 17:39:19 +00:00
make type parser and body parser use same tokenizer
This commit is contained in:
@@ -8,7 +8,7 @@ const { JavaType, CEIType, PrimitiveType, ArrayType, UnresolvedType, WildcardTyp
|
||||
const { SourceMethod, SourceConstructor } = require('./source-type');
|
||||
const ResolvedImport = require('./parsetypes/resolved-import');
|
||||
const ParseProblem = require('./parsetypes/parse-problem');
|
||||
const { TextBlock, BlockRange } = require('./parsetypes/textblock');
|
||||
const { getOperatorType, tokenize, Token } = require('./tokenizer');
|
||||
|
||||
/**
|
||||
* @typedef {SourceMethod|SourceConstructor} SourceMC
|
||||
@@ -2654,312 +2654,4 @@ class ResolvedIdent {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* \s+ whitespace
|
||||
* \/\/.* single-line comment (slc)
|
||||
* \/\*[\d\D]*?\*\/ multi-line comment (mlc)
|
||||
* "[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*" string literal - correctly terminated but may contain invalid escapes
|
||||
* ".* unterminated string literal
|
||||
* '\\?.?'? character literal - possibly unterminated and/or with invalid escape
|
||||
* \.?\d number literal (start) - further processing extracts the value
|
||||
* [\p{L}\p{N}_$]* word - keyword or identifier
|
||||
* [;,?:(){}\[\]] single-character symbols and operators
|
||||
* \.(\.\.)? . ...
|
||||
*
|
||||
* the operators: [!=/%*^]=?|<<?=?|>>?[>=]?|&[&=]?|\|[|=]?|\+(=|\++)?|\-+=?
|
||||
* [!=/%*^]=? ! = / % * ^ != == /= %= *= ^=
|
||||
* <<?=? < << <= <<=
|
||||
* >>?[>=]? > >> >= >>> >>=
|
||||
* &[&=]? & && &=
|
||||
* \|[|=]? | || |=
|
||||
* (\+\+|--) ++ -- postfix inc - only matches if immediately preceded by a word or a ]
|
||||
* [+-]=? + - += -=
|
||||
*
|
||||
*
|
||||
*
|
||||
*/
|
||||
/**
 * Scans a range of Java source text and produces the corresponding list of tokens.
 *
 * The scan is driven by a single regex (`raw_token_re`) whose alternatives each
 * capture one broad token category; the index of the first non-empty capture
 * group selects the entry in `raw_token_types`. Number and word matches are then
 * refined by a second pass (`number_re` / `word_re`). Any text appearing between
 * two regex matches cannot be tokenized and is emitted as an 'invalid' token,
 * so the returned tokens always cover the entire scanned range.
 *
 * @param {string} source complete source text
 * @param {number} [offset] index of the first character to scan
 * @param {number} [length] number of characters to scan
 */
function tokenize(source, offset = 0, length = source.length) {
    const text = source.slice(offset, offset + length);
    // one alternative per token category - see the category notes above.
    // the trailing |$ guarantees a final zero-length match at end-of-input.
    const raw_token_re = /(\s+|\/\/.*|\/\*[\d\D]*?\*\/)|("[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*")|(".*)|('\\u[\da-fA-F]{0,4}'?|'\\?.?'?)|(\.?\d)|([\p{L}\p{N}$_]+)|(\()|([;,?:(){}\[\]@]|\.(?:\.\.)?)|([!=/%*^]=?|<<?=?|>>?>?=?|&[&=]?|\|[|=]?|(\+\+|--)|[+-]=?|~)|$/gu;
    // token kinds indexed by (capture group - 1) of raw_token_re.
    // note: the (\+\+|--) group nested inside the operator alternative can only
    // be non-empty when the operator group itself matched, so it never selects
    // an entry here on its own.
    const raw_token_types = [
        'wsc',
        'string-literal',
        'unterminated-string-literal',
        'char-literal',
        'number-literal',
        'word',
        'open-bracket',
        'symbol',
        'operator',
    ];
    /**
     * ```
     * true|false boolean
     * this|null object
     * int|long|short|byte|float|double|char|boolean|void primitive type
     * new
     * instanceof
     * public|private|protected|static|final|abstract|native|volatile|transient modifier
     * if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized statement keyword
     * class|enum|interface type keyword
     * package|import package keyword
     * \w+ word
     * ```
     */
    const word_re = /^(?:(true|false)|(this|super|null)|(int|long|short|byte|float|double|char|boolean|void)|(new)|(instanceof)|(public|private|protected|static|final|abstract|native|volatile|transient)|(if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized|assert)|(class|enum|interface)|(package|import)|(.+))$/;
    // token kinds indexed by (capture group - 1) of word_re; the (.+) catch-all
    // makes the final 'ident' entry the fallback for any non-keyword word
    const word_token_types = [
        'boolean-literal',
        'object-literal',
        'primitive-type',
        'new-operator',
        'instanceof-operator',
        'modifier',
        'statement-kw',
        'type-kw',
        'package-kw',
        'ident'
    ]
    /**
     * ```
     * \d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]? decimal exponent: 1e0, 1.5e+10, 0.123E-20d
     * (?:\d+\.\d*|\.\d+)[fFdD]? decimal number: 0.1, 12.34f, 7.D, .3
     * 0x[\da-fA-F]*[lL]? hex integer: 0x1, 0xaBc, 0x, 0x7L
     * \d+[fFdDlL]? integer: 0, 123, 234f, 345L
     * ```
     * todo - underscore seperators
     */
    const number_re = /((?:\d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]?)|((?:\d+\.\d*|\.\d+)[fFdD]?)|(0x[\da-fA-F]*[lL]?)|(\d+[fFdDlL]?)/g;
    // token kinds indexed by (capture group - 1) of number_re
    const number_token_types = [
        'dec-exp-number-literal',
        'dec-number-literal',
        'hex-number-literal',
        'int-number-literal',
    ]
    const tokens = [];
    let lastindex = 0, m;
    while (m = raw_token_re.exec(text)) {
        // any text appearing between two matches is invalid
        if (m.index > lastindex) {
            tokens.push(new Token(source, offset + lastindex, m.index - lastindex, 'invalid'));
        }
        lastindex = m.index + m[0].length;
        if (m.index >= text.length) {
            // end of input
            break;
        }

        // the first non-empty capture group determines the broad token category
        let idx = m.findIndex((match,i) => i && match) - 1;
        let tokentype = raw_token_types[idx];

        switch(tokentype) {
            case 'number-literal':
                // we need to extract the exact number part - raw_token_re only
                // matched the leading (\.?\d), so re-scan from the same position
                // to consume the complete literal
                number_re.lastIndex = m.index;
                m = number_re.exec(text);
                idx = m.findIndex((match,i) => i && match) - 1;
                tokentype = number_token_types[idx];
                // update the raw_token_re position based on the length of the extracted number
                raw_token_re.lastIndex = lastindex = number_re.lastIndex;
                break;
            case 'word':
                // we need to work out what kind of keyword, literal or ident this is
                let word_m = m[0].match(word_re);
                idx = word_m.findIndex((match,i) => i && match) - 1;
                tokentype = word_token_types[idx];
                break;
            case 'operator':
                // find the operator-type
                tokentype = getOperatorType(m[0]);
                break;
        }
        tokens.push(new Token(source, offset + m.index, m[0].length, tokentype));
    }

    return tokens;
}
|
||||
|
||||
|
||||
/**
 * Matches a complete operator token and classifies it via capture groups:
 * ```
 * =|[/%*&|^+-]=|>>>?=|<<= assignment
 * \+\+|-- inc
 * [!=]= equality
 * [<>]=? comparison
 * [&|^] bitwise
 * <<|>>>? shift
 * &&|[|][|] logical
 * [*%/] muldiv
 * [+-] plumin
 * [~!] unary
 * ```
 */
const operator_re = /^(?:(=|[/%*&|^+-]=|>>>?=|<<=)|(\+\+|--)|([!=]=)|([<>]=?)|([&|^])|(<<|>>>?)|(&&|[|][|])|([*%/])|([+-])|([~!]))$/;
/**
 * @typedef {
'assignment-operator'|
'inc-operator'|
'equality-operator'|
'comparison-operator'|
'bitwise-operator'|
'shift-operator'|
'logical-operator'|
'muldiv-operator'|
'plumin-operator'|
'unary-operator'} OperatorKind
*/
/**
 * Operator kinds in the same order as the capture groups of operator_re.
 * @type {OperatorKind[]}
 */
const operator_token_types = [
    'assignment-operator',
    'inc-operator',
    'equality-operator',
    'comparison-operator',
    'bitwise-operator',
    'shift-operator',
    'logical-operator',
    'muldiv-operator',
    'plumin-operator',
    'unary-operator',
]
/**
 * Categorises a complete operator token.
 * @param {string} value the exact operator text, e.g '+=' or '>>'
 */
function getOperatorType(value) {
    const groups = value.match(operator_re);
    // capture groups appear in the same order as operator_token_types,
    // so the first populated group (1-based) selects the kind
    const group_idx = groups.findIndex((g, i) => i > 0 && g);
    // @ts-ignore
    return operator_token_types[group_idx - 1];
}
|
||||
|
||||
/**
 * A single token scanned from Java source text.
 */
class Token extends TextBlock {

    /**
     * @param {string} text complete source text the token belongs to
     * @param {number} start offset of the token's first character
     * @param {number} length number of characters in the token
     * @param {string} kind classification assigned by the tokenizer, e.g 'ident'
     */
    constructor(text, start, length, kind) {
        super(new BlockRange(text, start, length), null);
        this.kind = kind;
    }

    /** The exact source text covered by this token. */
    get value() {
        return this.source;
    }
}
|
||||
|
||||
/**
 * Self-test for the tokenizer: runs a table of small source snippets through
 * tokenize() and reports any mismatch in token count, value or kind.
 * Failures are logged to the console as JSON lines rather than thrown.
 */
function testTokenize() {
    // each entry: src = source text, r = expected tokens (value + kind) in order
    const tests = [
        // the basics
        { src: 'i', r: [{value: 'i', kind:'ident'}] },
        { src: '0', r: [{value: '0', kind:'int-number-literal'}] },
        { src: `""`, r: [{value: `""`, kind:'string-literal'}] },
        { src: `'x'`, r: [{value: `'x'`, kind:'char-literal'}] },
        { src: `(`, r: [{value: `(`, kind:'open-bracket'}] },
        ...'. , [ ] ? : @'.split(' ').map(symbol => ({ src: symbol, r: [{value: symbol, kind: 'symbol'}] })),
        ...'= += -= *= /= %= >>= <<= &= |= ^='.split(' ').map(op => ({ src: op, r: [{value: op, kind:'assignment-operator'}] })),
        ...'+ -'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'plumin-operator'}] })),
        ...'* / %'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'muldiv-operator'}] })),
        ...'# ¬'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'invalid'}] })),

        // numbers - decimal with exponent
        ...'0.0e+0 0.0E+0 0e+0 0e0 .0e0 0e0f 0e0d'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })),
        // numbers - decimal with partial exponent
        ...'0.0e+ 0.0E+ 0e+ 0e .0e 0ef 0ed'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })),
        // numbers - not decimal exponent
        { src: '0.0ea', r: [{value: '0.0e', kind:'dec-exp-number-literal'}, {value: 'a', kind:'ident'}] },

        // numbers - decimal (no exponent)
        ...'0.123 0. 0.f 0.0D .0 .0f .123D'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-number-literal'}] })),
        // numbers - not decimal
        { src: '0.a', r: [{value: '0.', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] },
        { src: '0.0a', r: [{value: '0.0', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] },

        // numbers - hex
        ...'0x0 0x123456789abcdef 0xABCDEF 0xabcdefl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })),
        // numbers - partial hex
        ...'0x 0xl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })),

        // numbers - decimal
        ...'0 123456789 0l'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'int-number-literal'}] })),

        // strings
        ...[`"abc"`, `"\\n"`, `"\\""`].map(num => ({ src: num, r: [{value: num, kind:'string-literal'}] })),
        // unterminated strings
        ...[`"abc`, `"\\n`, `"\\"`, `"`].map(num => ({ src: num, r: [{value: num, kind:'unterminated-string-literal'}] })),
        // strings cannot cross newlines
        { src: `"abc\n`, r: [{value: `"abc`, kind:'unterminated-string-literal'}, {value: '\n', kind:'wsc'}] },

        // characters
        ...[`'a'`, `'\\n'`, `'\\''`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })),
        // unterminated/invalid characters
        ...[`'a`, `'\\n`, `'\\'`, `''`, `'`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })),
        // characters cannot cross newlines
        { src: `'\n`, r: [{value: `'`, kind:'char-literal'}, {value: '\n', kind:'wsc'}] },

        // arity symbol
        { src: `int...x`, r: [
            {value: `int`, kind:'primitive-type'},
            {value: `...`, kind:'symbol'},
            {value: `x`, kind:'ident'},
        ],},

        // complex inc - the javac compiler doesn't bother to try and sensibly separate +++ - it just appears to
        // prioritise ++ in every case, assuming that the developer will insert spaces as required.
        // e.g this first one fails to compile with javac
        { src: '++abc+++def', r: [
            {value: '++', kind:'inc-operator'},
            {value: 'abc', kind:'ident'},
            {value: '++', kind:'inc-operator'},
            {value: '+', kind:'plumin-operator'},
            {value: 'def', kind:'ident'},
        ] },
        // this should be ok
        { src: '++abc+ ++def', r: [
            {value: '++', kind:'inc-operator'},
            {value: 'abc', kind:'ident'},
            {value: '+', kind:'plumin-operator'},
            {value: ' ', kind:'wsc'},
            {value: '++', kind:'inc-operator'},
            {value: 'def', kind:'ident'},
        ] },
    ]
    // log a failed expectation - one JSON line per problem
    const report = (test, msg) => {
        console.log(JSON.stringify({test, msg}));
    }
    tests.forEach(t => {
        const tokens = tokenize(t.src);
        // a count mismatch makes per-token comparison meaningless - stop here
        if (tokens.length !== t.r.length) {
            report(t, `Wrong token count. Expected ${t.r.length}, got ${tokens.length}`);
            return;
        }
        for (let i=0; i < tokens.length; i++) {
            if (tokens[i].value !== t.r[i].value)
                report(t, `Wrong token value. Expected ${t.r[i].value}, got ${tokens[i].value}`);
            if (tokens[i].kind !== t.r[i].kind)
                report(t, `Wrong token kind. Expected ${t.r[i].kind}, got ${tokens[i].kind}`);
        }
    })
}
|
||||
|
||||
|
||||
// run the tokenizer self-tests when this module is loaded
testTokenize();

// ad-hoc round-trip/performance check against a large real-world source file -
// the concatenated token values must reproduce the input exactly
// const s = require('fs').readFileSync('/home/dave/dev/vscode/android-dev-ext/langserver/tests/java-files/View-25.java', 'utf8');
// console.time();
// const tokens = tokenize(s);
// console.timeEnd();
// if (tokens.map(t => t.value).join('') !== s) {
//     console.log('mismatch');
// }

// testTokenize();

exports.parseBody = parseBody;
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
const { TextBlock, TextBlockArray } = require('./parsetypes/textblock');
|
||||
const { tokenize, Token } = require('./tokenizer');
|
||||
|
||||
/**
|
||||
* Normalises comments, whitespace, string and character literals.
|
||||
@@ -205,109 +206,6 @@ function parse2(source) {
|
||||
|
||||
}
|
||||
|
||||
/**
 * Splits Java source text into a list of normalised text blocks.
 *
 * Comments and whitespace are blanked out, string/char/number literals are
 * masked and keywords/identifiers are abbreviated, with every replacement
 * padded to the original length so parse errors report correct locations.
 *
 * @param {string} source complete Java source text
 */
function tokenize(source) {
    const blocks = [];
    // keyword classification tables - hoisted out of the scan loop so the
    // regexes are built once per call instead of once per matched word
    const KEYWORDS = /^(assert|break|case|catch|class|const|continue|do|else|enum|extends|finally|for|goto|if|implements|import|interface|new|package|return|super|switch|throw|throws|try|while)$/;
    const MODIFIER_KEYWORDS = /^(abstract|final|native|private|protected|public|static|strictfp|synchronized|transient|volatile|default)$/;
    const PRIMITIVE_TYPE_KEYWORDS = /^(int|boolean|byte|char|double|float|long|short|void)$/;
    const LITERAL_VALUE_KEYWORDS = /^(this|true|false|null)$/;
    const OPERATOR_KEYWORDS = /^(instanceof)$/;
    // one alternative per token category; the final (.) consumes any otherwise
    // unmatched character, so the regex always advances through the input
    const re = /(\/\*[\d\D]*?\*\/)|(\/\/.*)|(\s+)|([a-zA-Z_]\w*)|("[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*")|('\\?.')|(\d\w*)|(::|\.{3}|[(){}\[\];,.@])|([=!~*/%^]=?|[?:]|>>?>?=?|<<?<?=?|[&][&=]?|[|][|=]?|\+[+=]?|-[-=>]?)|(.)|$/g;
    let lastIndex = 0;
    for (let m; m = re.exec(source);) {
        if (m.index > lastIndex) {
            // should be unreachable: the (.) alternative matches any character.
            // throw a real Error (not a bare string) so callers get a stack trace.
            blocks.push(TextBlock.from(source, lastIndex, m.index-lastIndex));
            throw new Error(`tokenize: unmatched source text at index ${lastIndex}`);
        }
        lastIndex = m.index + m[0].length;
        const len = m[0].length;
        if (m[1]) {
            // mlc
            // - MLCs are replaced with tab instead of space. This makes them easy to differentiate (for JavaDocs)
            // whilst still being treated as general whitespace.
            // (/./ never matches \r or \n, so line breaks inside the comment survive)
            const mlc = TextBlock.from(source, m.index, len, m[0].replace(/./g, '\t'));
            blocks.push(mlc);
            continue;
        }
        if (m[2]) {
            // slc - blanked out with spaces
            const slc = TextBlock.from(source, m.index, len, m[0].replace(/./g, ' '));
            blocks.push(slc);
            continue;
        }
        if (m[3]) {
            // whitespace (other than space and newline) - normalised to spaces
            const ws = TextBlock.from(source, m.index, len, m[0].replace(/./g, ' '));
            blocks.push(ws);
            continue;
        }
        if (m[4]) {
            // ident or keyword
            let simplified;
            let space = ' '.repeat(len-1);
            if (KEYWORDS.test(m[0])) {
                // statement/declaration keywords keep their original text
                // (simplified intentionally left undefined)
            } else if (MODIFIER_KEYWORDS.test(m[0])) {
                simplified = 'M' + space;
            } else if (PRIMITIVE_TYPE_KEYWORDS.test(m[0])) {
                simplified = 'P' + space;
            } else if (LITERAL_VALUE_KEYWORDS.test(m[0])) {
                simplified = 'W' + space;
            } else if (OPERATOR_KEYWORDS.test(m[0])) {
                simplified = m[0];
            } else {
                // anything else is a plain identifier
                simplified = 'W' + space;
            }
            const word = TextBlock.from(source, m.index, len, simplified);
            blocks.push(word);
            continue;
        }
        if (m[5]) {
            // string literal - keep the quotes, mask the contents
            const str = TextBlock.from(source, m.index, len, `"${'#'.repeat(m[0].length - 2)}"`);
            blocks.push(str);
            continue;
        }
        if (m[6]) {
            // char literal
            // NOTE(review): escaped literals like '\n' are 4 characters but are
            // normalised to the 3-character '#' - confirm TextBlock.from accepts
            // a replacement of a different length
            const char = TextBlock.from(source, m.index, len, `'#'`);
            blocks.push(char);
            continue;
        }
        if (m[7]) {
            // number literal - normalised to 0 padded to the original length
            const number = TextBlock.from(source, m.index, len, `0${' '.repeat(m[0].length-1)}`);
            blocks.push(number);
            continue;
        }
        if (m[8]) {
            // separator - kept as-is
            const separator = TextBlock.from(source, m.index, m[0].length);
            blocks.push(separator);
            continue;
        }
        if (m[9]) {
            // operator - kept as-is
            const operator = TextBlock.from(source, m.index, m[0].length);
            blocks.push(operator);
            continue;
        }
        if (m[10]) {
            // invalid source char - kept as-is
            const invalid = TextBlock.from(source, m.index, m[0].length);
            blocks.push(invalid);
            continue;
        }
        // end of file - the |$ alternative matched
        break;
    }

    return blocks;
}
|
||||
|
||||
const markers = {
|
||||
arrayQualifier: 'A',
|
||||
blocks: 'B',
|
||||
@@ -331,6 +229,7 @@ const markers = {
|
||||
typeArgs: 'T',
|
||||
enumvalues: 'U',
|
||||
varDecl: 'V',
|
||||
ident: 'W',
|
||||
typeDecl: 'Z',
|
||||
error: ' ',
|
||||
}
|
||||
|
||||
246
langserver/java/tokenizer.js
Normal file
246
langserver/java/tokenizer.js
Normal file
@@ -0,0 +1,246 @@
|
||||
const { TextBlock, BlockRange } = require('./parsetypes/textblock');
|
||||
|
||||
/**
 * Convert a token to its simplified form for easier declaration parsing.
 *
 * - Whitespace, comments, strings and character literals are normalised.
 * - Modifier keywords and identifers are abbreviated.
 * - Any invalid text is replaced with spaces.
 *
 * Every simplified value is padded to the same length as the original text so
 * that parse errors are reported at the correct source location.
 * @param {string} text
 * @param {number} start
 * @param {number} length
 * @param {string} kind
 */
function tokenKindToSimplified(text, start, length, kind) {
    const original = text.slice(start, start + length);
    switch (kind) {
        case 'wsc':
            // blank out comments/whitespace but keep line breaks so
            // line numbering is preserved
            return original.replace(/[^\r\n]/g, ' ');
        case 'string-literal':
        case 'char-literal': {
            // an empty or single-character literal has no interior to mask
            if (original.length <= 2) {
                return original;
            }
            const quote = kind === 'string-literal' ? '"' : "'";
            return `${quote}${'#'.repeat(original.length - 2)}${quote}`;
        }
        case 'primitive-type':
            return 'P'.padEnd(original.length);
        case 'modifier':
            return 'M'.padEnd(original.length);
        case 'ident':
            return 'W'.padEnd(original.length);
        case 'invalid':
            return ''.padEnd(original.length);
    }
    // all other kinds (keywords, operators, numbers, ...) keep their text
    return original;
}
|
||||
|
||||
/**
 * A single token scanned from Java source, carrying both its exact source
 * range and a length-preserving simplified form.
 */
class Token extends TextBlock {

    /**
     * @param {string} text complete source text the token belongs to
     * @param {number} start offset of the token's first character
     * @param {number} length number of characters in the token
     * @param {string} kind classification assigned by the tokenizer
     */
    constructor(text, start, length, kind) {
        // the simplified form is computed once up-front and stored with the range
        super(new BlockRange(text, start, length), tokenKindToSimplified(text, start, length, kind));
        this.kind = kind;
    }

    /** The exact source text covered by this token. */
    get value() {
        return this.source;
    }
}
|
||||
|
||||
|
||||
/**
|
||||
* \s+ whitespace
|
||||
* \/\/.* single-line comment (slc)
|
||||
* \/\*[\d\D]*?\*\/ multi-line comment (mlc)
|
||||
* "[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*" string literal - correctly terminated but may contain invalid escapes
|
||||
* ".* unterminated string literal
|
||||
* '\\?.?'? character literal - possibly unterminated and/or with invalid escape
|
||||
* \.?\d number literal (start) - further processing extracts the value
|
||||
* [\p{L}\p{N}_$]* word - keyword or identifier
|
||||
* [;,?:(){}\[\]] single-character symbols and operators
|
||||
* \.(\.\.)? . ...
|
||||
*
|
||||
* the operators: [!=/%*^]=?|<<?=?|>>?[>=]?|&[&=]?|\|[|=]?|\+(=|\++)?|\-+=?
|
||||
* [!=/%*^]=? ! = / % * ^ != == /= %= *= ^=
|
||||
* <<?=? < << <= <<=
|
||||
* >>?[>=]? > >> >= >>> >>=
|
||||
* &[&=]? & && &=
|
||||
* \|[|=]? | || |=
|
||||
* (\+\+|--) ++ -- postfix inc - only matches if immediately preceded by a word or a ]
|
||||
* [+-]=? + - += -=
|
||||
*
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
/**
 * Scans a range of Java source text and produces the corresponding list of tokens.
 *
 * A single regex (`raw_token_re`) matches one broad token category per
 * alternative; the index of the first non-empty capture group selects the
 * entry in `raw_token_types`. Number and word matches are then refined by a
 * second pass (`number_re` / `word_re`). Any text appearing between two
 * matches is emitted as an 'invalid' token, so the returned tokens always
 * cover the entire scanned range.
 *
 * @param {string} source complete source text
 * @param {number} [offset] index of the first character to scan
 * @param {number} [length] number of characters to scan
 */
function tokenize(source, offset = 0, length = source.length) {
    const text = source.slice(offset, offset + length);
    // one alternative per token category - see the regex notes above.
    // the trailing |$ guarantees a final zero-length match at end-of-input.
    const raw_token_re = /(\s+|\/\/.*|\/\*[\d\D]*?\*\/|\/\*[\d\D]*)|("[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*"|".*)|('\\u[\da-fA-F]{0,4}'?|'\\?.?'?)|(\.?\d)|([\p{L}\p{N}$_]+)|(\()|([;,?:(){}\[\]@]|\.(?:\.\.)?)|([!=/%*^]=?|<<?=?|>>?>?=?|&[&=]?|\|[|=]?|(\+\+|--)|[+-]=?|~)|$/gu;
    // token kinds indexed by (capture group - 1) of raw_token_re.
    // terminated and unterminated strings share a single group and kind here.
    const raw_token_types = [
        'wsc',
        'string-literal',
        'char-literal',
        'number-literal',
        'word',
        'open-bracket',
        'symbol',
        'operator',
    ];
    /**
     * ```
     * true|false boolean
     * this|null object
     * int|long|short|byte|float|double|char|boolean|void primitive type
     * new
     * instanceof
     * public|private|protected|static|final|abstract|native|volatile|transient modifier
     * if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized statement keyword
     * class|enum|interface type keyword
     * extends|implements extends/implements keyword
     * package|import package keyword
     * \w+ word
     * ```
     */
    const word_re = /^(?:(true|false)|(this|super|null)|(int|long|short|byte|float|double|char|boolean|void)|(new)|(instanceof)|(public|private|protected|static|final|abstract|native|volatile|transient)|(if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized|assert)|(class|enum|interface)|(extends|implements)|(package|import)|(.+))$/;
    // token kinds indexed by (capture group - 1) of word_re.
    // bugfix: 'extimp-kw' must precede 'package-kw' to mirror the group order
    // of word_re, where (extends|implements) comes before (package|import) -
    // the previous ordering classified 'extends' as 'package-kw' and
    // 'package' as 'extimp-kw'.
    const word_token_types = [
        'boolean-literal',
        'object-literal',
        'primitive-type',
        'new-operator',
        'instanceof-operator',
        'modifier',
        'statement-kw',
        'type-kw',
        'extimp-kw',
        'package-kw',
        'ident'
    ]
    /**
     * ```
     * \d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]? decimal exponent: 1e0, 1.5e+10, 0.123E-20d
     * (?:\d+\.\d*|\.\d+)[fFdD]? decimal number: 0.1, 12.34f, 7.D, .3
     * 0x[\da-fA-F]*[lL]? hex integer: 0x1, 0xaBc, 0x, 0x7L
     * \d+[fFdDlL]? integer: 0, 123, 234f, 345L
     * ```
     * todo - underscore seperators
     */
    const number_re = /((?:\d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]?)|((?:\d+\.\d*|\.\d+)[fFdD]?)|(0x[\da-fA-F]*[lL]?)|(\d+[fFdDlL]?)/g;
    // token kinds indexed by (capture group - 1) of number_re
    const number_token_types = [
        'dec-exp-number-literal',
        'dec-number-literal',
        'hex-number-literal',
        'int-number-literal',
    ]
    const tokens = [];
    let lastindex = 0, m;
    while (m = raw_token_re.exec(text)) {
        // any text appearing between two matches is invalid
        if (m.index > lastindex) {
            tokens.push(new Token(source, offset + lastindex, m.index - lastindex, 'invalid'));
        }
        lastindex = m.index + m[0].length;
        if (m.index >= text.length) {
            // end of input
            break;
        }

        // the first non-empty capture group determines the broad token category
        let idx = m.findIndex((match,i) => i && match) - 1;
        let tokentype = raw_token_types[idx];

        switch(tokentype) {
            case 'number-literal':
                // we need to extract the exact number part - raw_token_re only
                // matched the leading (\.?\d), so re-scan from the same position
                number_re.lastIndex = m.index;
                m = number_re.exec(text);
                idx = m.findIndex((match,i) => i && match) - 1;
                tokentype = number_token_types[idx];
                // update the raw_token_re position based on the length of the extracted number
                raw_token_re.lastIndex = lastindex = number_re.lastIndex;
                break;
            case 'word':
                // we need to work out what kind of keyword, literal or ident this is
                let word_m = m[0].match(word_re);
                idx = word_m.findIndex((match,i) => i && match) - 1;
                tokentype = word_token_types[idx];
                break;
            case 'operator':
                // find the operator-type
                tokentype = getOperatorType(m[0]);
                break;
        }
        tokens.push(new Token(source, offset + m.index, m[0].length, tokentype));
    }

    return tokens;
}
|
||||
|
||||
|
||||
/**
 * Matches a complete operator token; each capture group is one category:
 * ```
 * =|[/%*&|^+-]=|>>>?=|<<= assignment
 * \+\+|-- inc
 * [!=]= equality
 * [<>]=? comparison
 * [&|^] bitwise
 * <<|>>>? shift
 * &&|[|][|] logical
 * [*%/] muldiv
 * [+-] plumin
 * [~!] unary
 * ```
 */
const operator_re = /^(?:(=|[/%*&|^+-]=|>>>?=|<<=)|(\+\+|--)|([!=]=)|([<>]=?)|([&|^])|(<<|>>>?)|(&&|[|][|])|([*%/])|([+-])|([~!]))$/;
/**
 * @typedef {
'assignment-operator'|
'inc-operator'|
'equality-operator'|
'comparison-operator'|
'bitwise-operator'|
'shift-operator'|
'logical-operator'|
'muldiv-operator'|
'plumin-operator'|
'unary-operator'} OperatorKind
*/
/**
 * Operator kinds, ordered to match the capture groups of operator_re.
 * @type {OperatorKind[]}
 */
const operator_token_types = [
    'assignment-operator',
    'inc-operator',
    'equality-operator',
    'comparison-operator',
    'bitwise-operator',
    'shift-operator',
    'logical-operator',
    'muldiv-operator',
    'plumin-operator',
    'unary-operator',
]
/**
 * Categorises a complete operator token.
 * @param {string} value the exact operator text, e.g '+=' or '>>'
 */
function getOperatorType(value) {
    const m = value.match(operator_re);
    // locate the first populated capture group; group n maps to entry n-1.
    // the alternation is exhaustive, so a successful match always sets one group.
    let i = 1;
    while (!m[i]) {
        i += 1;
    }
    // @ts-ignore
    return operator_token_types[i - 1];
}
|
||||
|
||||
|
||||
// public API of the tokenizer module
exports.getOperatorType = getOperatorType;
exports.tokenize = tokenize;
exports.Token = Token;
|
||||
98
langserver/tests/test-tokenizer.js
Normal file
98
langserver/tests/test-tokenizer.js
Normal file
@@ -0,0 +1,98 @@
|
||||
const { tokenize } = require('../java/tokenizer');
|
||||
|
||||
/**
 * Self-test for the tokenizer: runs a table of small source snippets through
 * tokenize() and reports any mismatch in token count, value or kind.
 * Failures are logged to the console as JSON lines rather than thrown.
 */
function testTokenize() {
    // each entry: src = source text, r = expected tokens (value + kind) in order
    const tests = [
        // the basics
        { src: 'i', r: [{value: 'i', kind:'ident'}] },
        { src: '0', r: [{value: '0', kind:'int-number-literal'}] },
        { src: `""`, r: [{value: `""`, kind:'string-literal'}] },
        { src: `'x'`, r: [{value: `'x'`, kind:'char-literal'}] },
        { src: `(`, r: [{value: `(`, kind:'open-bracket'}] },
        ...'. , [ ] ? : @'.split(' ').map(symbol => ({ src: symbol, r: [{value: symbol, kind: 'symbol'}] })),
        ...'= += -= *= /= %= >>= <<= &= |= ^='.split(' ').map(op => ({ src: op, r: [{value: op, kind:'assignment-operator'}] })),
        ...'+ -'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'plumin-operator'}] })),
        ...'* / %'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'muldiv-operator'}] })),
        ...'# ¬'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'invalid'}] })),

        // numbers - decimal with exponent
        ...'0.0e+0 0.0E+0 0e+0 0e0 .0e0 0e0f 0e0d'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })),
        // numbers - decimal with partial exponent
        ...'0.0e+ 0.0E+ 0e+ 0e .0e 0ef 0ed'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })),
        // numbers - not decimal exponent
        { src: '0.0ea', r: [{value: '0.0e', kind:'dec-exp-number-literal'}, {value: 'a', kind:'ident'}] },

        // numbers - decimal (no exponent)
        ...'0.123 0. 0.f 0.0D .0 .0f .123D'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-number-literal'}] })),
        // numbers - not decimal
        { src: '0.a', r: [{value: '0.', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] },
        { src: '0.0a', r: [{value: '0.0', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] },

        // numbers - hex
        ...'0x0 0x123456789abcdef 0xABCDEF 0xabcdefl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })),
        // numbers - partial hex
        ...'0x 0xl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })),

        // numbers - decimal
        ...'0 123456789 0l'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'int-number-literal'}] })),

        // strings
        ...[`"abc"`, `"\\n"`, `"\\""`].map(num => ({ src: num, r: [{value: num, kind:'string-literal'}] })),
        // unterminated strings
        // NOTE(review): the tokenizer's raw_token_types has no
        // 'unterminated-string-literal' entry (terminated and unterminated
        // strings share one capture group) - confirm these expectations
        ...[`"abc`, `"\\n`, `"\\"`, `"`].map(num => ({ src: num, r: [{value: num, kind:'unterminated-string-literal'}] })),
        // strings cannot cross newlines
        { src: `"abc\n`, r: [{value: `"abc`, kind:'unterminated-string-literal'}, {value: '\n', kind:'wsc'}] },

        // characters
        ...[`'a'`, `'\\n'`, `'\\''`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })),
        // unterminated/invalid characters
        ...[`'a`, `'\\n`, `'\\'`, `''`, `'`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })),
        // characters cannot cross newlines
        { src: `'\n`, r: [{value: `'`, kind:'char-literal'}, {value: '\n', kind:'wsc'}] },

        // arity symbol
        { src: `int...x`, r: [
            {value: `int`, kind:'primitive-type'},
            {value: `...`, kind:'symbol'},
            {value: `x`, kind:'ident'},
        ],},

        // complex inc - the javac compiler doesn't bother to try and sensibly separate +++ - it just appears to
        // prioritise ++ in every case, assuming that the developer will insert spaces as required.
        // e.g this first one fails to compile with javac
        { src: '++abc+++def', r: [
            {value: '++', kind:'inc-operator'},
            {value: 'abc', kind:'ident'},
            {value: '++', kind:'inc-operator'},
            {value: '+', kind:'plumin-operator'},
            {value: 'def', kind:'ident'},
        ] },
        // this should be ok
        { src: '++abc+ ++def', r: [
            {value: '++', kind:'inc-operator'},
            {value: 'abc', kind:'ident'},
            {value: '+', kind:'plumin-operator'},
            {value: ' ', kind:'wsc'},
            {value: '++', kind:'inc-operator'},
            {value: 'def', kind:'ident'},
        ] },
    ]
    // log a failed expectation - one JSON line per problem
    const report = (test, msg) => {
        console.log(JSON.stringify({test, msg}));
    }
    tests.forEach(t => {
        const tokens = tokenize(t.src);
        // a count mismatch makes per-token comparison meaningless - stop here
        if (tokens.length !== t.r.length) {
            report(t, `Wrong token count. Expected ${t.r.length}, got ${tokens.length}`);
            return;
        }
        for (let i=0; i < tokens.length; i++) {
            if (tokens[i].value !== t.r[i].value)
                report(t, `Wrong token value. Expected ${t.r[i].value}, got ${tokens[i].value}`);
            if (tokens[i].kind !== t.r[i].kind)
                report(t, `Wrong token kind. Expected ${t.r[i].kind}, got ${tokens[i].kind}`);
        }
    })
}


// run the tokenizer self-tests on load
testTokenize();
|
||||
Reference in New Issue
Block a user