https://github.com/adelphes/android-dev-ext.git

make type parser and body parser use same tokenizer
@@ -8,7 +8,7 @@ const { JavaType, CEIType, PrimitiveType, ArrayType, UnresolvedType, WildcardTyp
 const { SourceMethod, SourceConstructor } = require('./source-type');
 const ResolvedImport = require('./parsetypes/resolved-import');
 const ParseProblem = require('./parsetypes/parse-problem');
-const { TextBlock, BlockRange } = require('./parsetypes/textblock');
+const { getOperatorType, tokenize, Token } = require('./tokenizer');
 
 /**
  * @typedef {SourceMethod|SourceConstructor} SourceMC
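
With this change the body parser pulls tokenize, Token and getOperatorType from the shared tokenizer module instead of defining its own copies (which the next hunk removes). A minimal sketch of the call pattern this enables - the wrapper name and its parameters are illustrative, not part of the commit:

    const { tokenize } = require('./tokenizer');

    // Tokenize only a method body while keeping absolute file offsets:
    // tokenize(source, offset, length) slices internally, but each Token is
    // created with offset + match.index, so positions map back to the full file.
    function tokenizeMethodBody(fileText, bodyStart, bodyLength) { // hypothetical helper
        return tokenize(fileText, bodyStart, bodyLength);
    }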
@@ -2654,312 +2654,4 @@ class ResolvedIdent {
 
 }
-
-/**
- * \s+                                 whitespace
- * \/\/.*                              single-line comment (slc)
- * \/\*[\d\D]*?\*\/                    multi-line comment (mlc)
- * "[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*"    string literal - correctly terminated but may contain invalid escapes
- * ".*                                 unterminated string literal
- * '\\?.?'?                            character literal - possibly unterminated and/or with invalid escape
- * \.?\d                               number literal (start) - further processing extracts the value
- * [\p{L}\p{N}_$]*                     word - keyword or identifier
- * [;,?:(){}\[\]]                      single-character symbols and operators
- * \.(\.\.)?                           . ...
- *
- * the operators: [!=/%*^]=?|<<?=?|>>?[>=]?|&[&=]?|\|[|=]?|\+(=|\++)?|\-+=?
- *   [!=/%*^]=?    ! = / % * ^ != == /= %= *= ^=
- *   <<?=?         < << <= <<=
- *   >>?[>=]?      > >> >= >>> >>=
- *   &[&=]?        & && &=
- *   \|[|=]?       | || |=
- *   (\+\+|--)     ++ -- postfix inc - only matches if immediately preceded by a word or a ]
- *   [+-]=?        + - += -=
- */
-/**
- * @param {string} source
- * @param {number} [offset]
- * @param {number} [length]
- */
-function tokenize(source, offset = 0, length = source.length) {
-    const text = source.slice(offset, offset + length);
-    const raw_token_re = /(\s+|\/\/.*|\/\*[\d\D]*?\*\/)|("[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*")|(".*)|('\\u[\da-fA-F]{0,4}'?|'\\?.?'?)|(\.?\d)|([\p{L}\p{N}$_]+)|(\()|([;,?:(){}\[\]@]|\.(?:\.\.)?)|([!=/%*^]=?|<<?=?|>>?>?=?|&[&=]?|\|[|=]?|(\+\+|--)|[+-]=?|~)|$/gu;
-    const raw_token_types = [
-        'wsc',
-        'string-literal',
-        'unterminated-string-literal',
-        'char-literal',
-        'number-literal',
-        'word',
-        'open-bracket',
-        'symbol',
-        'operator',
-    ];
-    /**
-     * ```
-     * true|false                                           boolean
-     * this|super|null                                      object
-     * int|long|short|byte|float|double|char|boolean|void   primitive type
-     * new
-     * instanceof
-     * public|private|protected|static|final|abstract|native|volatile|transient   modifier
-     * if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized|assert   statement keyword
-     * class|enum|interface                                 type keyword
-     * package|import                                       package keyword
-     * \w+                                                  word
-     * ```
-     */
-    const word_re = /^(?:(true|false)|(this|super|null)|(int|long|short|byte|float|double|char|boolean|void)|(new)|(instanceof)|(public|private|protected|static|final|abstract|native|volatile|transient)|(if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized|assert)|(class|enum|interface)|(package|import)|(.+))$/;
-    const word_token_types = [
-        'boolean-literal',
-        'object-literal',
-        'primitive-type',
-        'new-operator',
-        'instanceof-operator',
-        'modifier',
-        'statement-kw',
-        'type-kw',
-        'package-kw',
-        'ident'
-    ]
-    /**
-     * ```
-     * (?:\d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]?   decimal exponent: 1e0, 1.5e+10, 0.123E-20d
-     * (?:\d+\.\d*|\.\d+)[fFdD]?                     decimal number: 0.1, 12.34f, 7.D, .3
-     * 0x[\da-fA-F]*[lL]?                            hex integer: 0x1, 0xaBc, 0x, 0x7L
-     * \d+[fFdDlL]?                                  integer: 0, 123, 234f, 345L
-     * ```
-     * todo - underscore separators
-     */
-    const number_re = /((?:\d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]?)|((?:\d+\.\d*|\.\d+)[fFdD]?)|(0x[\da-fA-F]*[lL]?)|(\d+[fFdDlL]?)/g;
-    const number_token_types = [
-        'dec-exp-number-literal',
-        'dec-number-literal',
-        'hex-number-literal',
-        'int-number-literal',
-    ]
-    const tokens = [];
-    let lastindex = 0, m;
-    while (m = raw_token_re.exec(text)) {
-        // any text appearing between two matches is invalid
-        if (m.index > lastindex) {
-            tokens.push(new Token(source, offset + lastindex, m.index - lastindex, 'invalid'));
-        }
-        lastindex = m.index + m[0].length;
-        if (m.index >= text.length) {
-            // end of input
-            break;
-        }
-
-        let idx = m.findIndex((match,i) => i && match) - 1;
-        let tokentype = raw_token_types[idx];
-
-        switch(tokentype) {
-            case 'number-literal':
-                // we need to extract the exact number part
-                number_re.lastIndex = m.index;
-                m = number_re.exec(text);
-                idx = m.findIndex((match,i) => i && match) - 1;
-                tokentype = number_token_types[idx];
-                // update the raw_token_re position based on the length of the extracted number
-                raw_token_re.lastIndex = lastindex = number_re.lastIndex;
-                break;
-            case 'word':
-                // we need to work out what kind of keyword, literal or ident this is
-                let word_m = m[0].match(word_re);
-                idx = word_m.findIndex((match,i) => i && match) - 1;
-                tokentype = word_token_types[idx];
-                break;
-            case 'operator':
-                // find the operator-type
-                tokentype = getOperatorType(m[0]);
-                break;
-        }
-        tokens.push(new Token(source, offset + m.index, m[0].length, tokentype));
-    }
-
-    return tokens;
-}
-
-/**
- * ```
- * =|[/%*&|^+-]=|>>>?=|<<=   assignment
- * \+\+|--                   inc
- * [!=]=                     equality
- * [<>]=?                    comparison
- * [&|^]                     bitwise
- * <<|>>>?                   shift
- * &&|[|][|]                 logical
- * [*%/]                     muldiv
- * [+-]                      plumin
- * [~!]                      unary
- * ```
- */
-const operator_re = /^(?:(=|[/%*&|^+-]=|>>>?=|<<=)|(\+\+|--)|([!=]=)|([<>]=?)|([&|^])|(<<|>>>?)|(&&|[|][|])|([*%/])|([+-])|([~!]))$/;
-/**
- * @typedef {
-        'assignment-operator'|
-        'inc-operator'|
-        'equality-operator'|
-        'comparison-operator'|
-        'bitwise-operator'|
-        'shift-operator'|
-        'logical-operator'|
-        'muldiv-operator'|
-        'plumin-operator'|
-        'unary-operator'} OperatorKind
- */
-/** @type {OperatorKind[]} */
-const operator_token_types = [
-    'assignment-operator',
-    'inc-operator',
-    'equality-operator',
-    'comparison-operator',
-    'bitwise-operator',
-    'shift-operator',
-    'logical-operator',
-    'muldiv-operator',
-    'plumin-operator',
-    'unary-operator',
-]
-/**
- * @param {string} value
- */
-function getOperatorType(value) {
-    const op_match = value.match(operator_re);
-    const idx = op_match.findIndex((match,i) => i && match) - 1;
-    // @ts-ignore
-    return operator_token_types[idx];
-}
-
-class Token extends TextBlock {
-
-    /**
-     * @param {string} text
-     * @param {number} start
-     * @param {number} length
-     * @param {string} kind
-     */
-    constructor(text, start, length, kind) {
-        super(new BlockRange(text, start, length), null);
-        this.kind = kind;
-    }
-
-    get value() {
-        return this.source;
-    }
-}
-
-function testTokenize() {
-    const tests = [
-        // the basics
-        { src: 'i', r: [{value: 'i', kind:'ident'}] },
-        { src: '0', r: [{value: '0', kind:'int-number-literal'}] },
-        { src: `""`, r: [{value: `""`, kind:'string-literal'}] },
-        { src: `'x'`, r: [{value: `'x'`, kind:'char-literal'}] },
-        { src: `(`, r: [{value: `(`, kind:'open-bracket'}] },
-        ...'. , [ ] ? : @'.split(' ').map(symbol => ({ src: symbol, r: [{value: symbol, kind: 'symbol'}] })),
-        ...'= += -= *= /= %= >>= <<= &= |= ^='.split(' ').map(op => ({ src: op, r: [{value: op, kind:'assignment-operator'}] })),
-        ...'+ -'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'plumin-operator'}] })),
-        ...'* / %'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'muldiv-operator'}] })),
-        ...'# ¬'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'invalid'}] })),
-
-        // numbers - decimal with exponent
-        ...'0.0e+0 0.0E+0 0e+0 0e0 .0e0 0e0f 0e0d'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })),
-        // numbers - decimal with partial exponent
-        ...'0.0e+ 0.0E+ 0e+ 0e .0e 0ef 0ed'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })),
-        // numbers - not decimal exponent
-        { src: '0.0ea', r: [{value: '0.0e', kind:'dec-exp-number-literal'}, {value: 'a', kind:'ident'}] },
-
-        // numbers - decimal (no exponent)
-        ...'0.123 0. 0.f 0.0D .0 .0f .123D'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-number-literal'}] })),
-        // numbers - not decimal
-        { src: '0.a', r: [{value: '0.', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] },
-        { src: '0.0a', r: [{value: '0.0', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] },
-
-        // numbers - hex
-        ...'0x0 0x123456789abcdef 0xABCDEF 0xabcdefl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })),
-        // numbers - partial hex
-        ...'0x 0xl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })),
-
-        // numbers - decimal
-        ...'0 123456789 0l'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'int-number-literal'}] })),
-
-        // strings
-        ...[`"abc"`, `"\\n"`, `"\\""`].map(num => ({ src: num, r: [{value: num, kind:'string-literal'}] })),
-        // unterminated strings
-        ...[`"abc`, `"\\n`, `"\\"`, `"`].map(num => ({ src: num, r: [{value: num, kind:'unterminated-string-literal'}] })),
-        // strings cannot cross newlines
-        { src: `"abc\n`, r: [{value: `"abc`, kind:'unterminated-string-literal'}, {value: '\n', kind:'wsc'}] },
-
-        // characters
-        ...[`'a'`, `'\\n'`, `'\\''`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })),
-        // unterminated/invalid characters
-        ...[`'a`, `'\\n`, `'\\'`, `''`, `'`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })),
-        // characters cannot cross newlines
-        { src: `'\n`, r: [{value: `'`, kind:'char-literal'}, {value: '\n', kind:'wsc'}] },
-
-        // arity symbol
-        { src: `int...x`, r: [
-            {value: `int`, kind:'primitive-type'},
-            {value: `...`, kind:'symbol'},
-            {value: `x`, kind:'ident'},
-        ] },
-
-        // complex inc - the javac compiler doesn't bother to try and sensibly separate +++ - it just appears to
-        // prioritise ++ in every case, assuming that the developer will insert spaces as required.
-        // e.g. this first one fails to compile with javac
-        { src: '++abc+++def', r: [
-            {value: '++', kind:'inc-operator'},
-            {value: 'abc', kind:'ident'},
-            {value: '++', kind:'inc-operator'},
-            {value: '+', kind:'plumin-operator'},
-            {value: 'def', kind:'ident'},
-        ] },
-        // this should be ok
-        { src: '++abc+ ++def', r: [
-            {value: '++', kind:'inc-operator'},
-            {value: 'abc', kind:'ident'},
-            {value: '+', kind:'plumin-operator'},
-            {value: ' ', kind:'wsc'},
-            {value: '++', kind:'inc-operator'},
-            {value: 'def', kind:'ident'},
-        ] },
-    ]
-    const report = (test, msg) => {
-        console.log(JSON.stringify({test, msg}));
-    }
-    tests.forEach(t => {
-        const tokens = tokenize(t.src);
-        if (tokens.length !== t.r.length) {
-            report(t, `Wrong token count. Expected ${t.r.length}, got ${tokens.length}`);
-            return;
-        }
-        for (let i=0; i < tokens.length; i++) {
-            if (tokens[i].value !== t.r[i].value)
-                report(t, `Wrong token value. Expected ${t.r[i].value}, got ${tokens[i].value}`);
-            if (tokens[i].kind !== t.r[i].kind)
-                report(t, `Wrong token kind. Expected ${t.r[i].kind}, got ${tokens[i].kind}`);
-        }
-    })
-}
-
-testTokenize();
-
-// const s = require('fs').readFileSync('/home/dave/dev/vscode/android-dev-ext/langserver/tests/java-files/View-25.java', 'utf8');
-// console.time();
-// const tokens = tokenize(s);
-// console.timeEnd();
-// if (tokens.map(t => t.value).join('') !== s) {
-//     console.log('mismatch');
-// }
-
-// testTokenize();
 
 exports.parseBody = parseBody;
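
Both the code removed above and the new tokenizer.js added later in this commit rely on the same dispatch idiom: the master regex is an alternation of capture groups, and the index of the first non-empty group selects the entry in a parallel kinds array. A self-contained sketch of just that trick (toy regex and kinds array, not taken from the commit):

    // Group order mirrors the kinds array; m[0] is the whole match, so skip index 0.
    const re = /(\s+)|(\d+)|([a-z]+)/g;
    const kinds = ['wsc', 'number', 'word'];

    function kindOf(m) {
        const idx = m.findIndex((group, i) => i && group !== undefined) - 1;
        return kinds[idx];
    }

    let m;
    while (m = re.exec('abc 123')) {
        console.log(JSON.stringify(m[0]), kindOf(m)); // "abc" word, " " wsc, "123" number
    }

The nested (\+\+|--) group inside the operator alternative still works with this scheme because findIndex returns the outer operator group before the nested one.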

@@ -1,4 +1,5 @@
 const { TextBlock, TextBlockArray } = require('./parsetypes/textblock');
+const { tokenize, Token } = require('./tokenizer');
 
 /**
  * Normalises comments, whitespace, string and character literals.
@@ -205,109 +206,6 @@ function parse2(source) {
 
 }
 
-/**
- * @param {string} source
- */
-function tokenize(source) {
-    const blocks = [];
-    const re = /(\/\*[\d\D]*?\*\/)|(\/\/.*)|(\s+)|([a-zA-Z_]\w*)|("[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*")|('\\?.')|(\d\w*)|(::|\.{3}|[(){}\[\];,.@])|([=!~*/%^]=?|[?:]|>>?>?=?|<<?<?=?|[&][&=]?|[|][|=]?|\+[+=]?|-[-=>]?)|(.)|$/g;
-    let lastIndex = 0;
-    for (let m; m = re.exec(source);) {
-        if (m.index > lastIndex) {
-            blocks.push(TextBlock.from(source, lastIndex, m.index-lastIndex));
-            throw "er"
-        }
-        lastIndex = m.index + m[0].length;
-        const len = m[0].length;
-        if (m[1]) {
-            // mlc
-            // - MLCs are replaced with tab instead of space. This makes them easy to differentiate (for JavaDocs)
-            // whilst still being treated as general whitespace.
-            const mlc = TextBlock.from(source, m.index, len, m[0].replace(/./g, '\t'));
-            blocks.push(mlc);
-            continue;
-        }
-        if (m[2]) {
-            // slc
-            const slc = TextBlock.from(source, m.index, len, m[0].replace(/./g, ' '));
-            blocks.push(slc);
-            continue;
-        }
-        if (m[3]) {
-            // whitespace (other than space and newline)
-            const ws = TextBlock.from(source, m.index, len, m[0].replace(/./g, ' '));
-            blocks.push(ws);
-            continue;
-        }
-        if (m[4]) {
-            // ident or keyword
-            const KEYWORDS = /^(assert|break|case|catch|class|const|continue|do|else|enum|extends|finally|for|goto|if|implements|import|interface|new|package|return|super|switch|throw|throws|try|while)$/;
-            const MODIFIER_KEYWORDS = /^(abstract|final|native|private|protected|public|static|strictfp|synchronized|transient|volatile|default)$/;
-            const PRIMITIVE_TYPE_KEYWORDS = /^(int|boolean|byte|char|double|float|long|short|void)$/
-            const LITERAL_VALUE_KEYWORDS = /^(this|true|false|null)$/;
-            const OPERATOR_KEYWORDS = /^(instanceof)$/;
-            let simplified;
-            let space = ' '.repeat(len-1);
-            if (KEYWORDS.test(m[0])) {
-
-            } else if (MODIFIER_KEYWORDS.test(m[0])) {
-                simplified = 'M' + space;
-            } else if (PRIMITIVE_TYPE_KEYWORDS.test(m[0])) {
-                simplified = 'P' + space;
-            } else if (LITERAL_VALUE_KEYWORDS.test(m[0])) {
-                simplified = 'W' + space;
-            } else if (OPERATOR_KEYWORDS.test(m[0])) {
-                simplified = m[0];
-            } else {
-                simplified = 'W' + space;
-            }
-            const word = TextBlock.from(source, m.index, len, simplified);
-            blocks.push(word);
-            continue;
-        }
-        if (m[5]) {
-            // string literal
-            const str = TextBlock.from(source, m.index, len, `"${'#'.repeat(m[0].length - 2)}"`);
-            blocks.push(str);
-            continue;
-        }
-        if (m[6]) {
-            // char literal
-            const char = TextBlock.from(source, m.index, len, `'#'`);
-            blocks.push(char);
-            continue;
-        }
-        if (m[7]) {
-            // number literal
-            const number = TextBlock.from(source, m.index, len, `0${' '.repeat(m[0].length-1)}`);
-            blocks.push(number);
-            continue;
-        }
-        if (m[8]) {
-            // separator
-            const separator = TextBlock.from(source, m.index, m[0].length);
-            blocks.push(separator);
-            continue;
-        }
-        if (m[9]) {
-            // operator
-            const operator = TextBlock.from(source, m.index, m[0].length);
-            blocks.push(operator);
-            continue;
-        }
-        if (m[10]) {
-            // invalid source char
-            const invalid = TextBlock.from(source, m.index, m[0].length);
-            blocks.push(invalid);
-            continue;
-        }
-        // end of file
-        break;
-    }
-
-    return blocks;
-}
-
 const markers = {
     arrayQualifier: 'A',
     blocks: 'B',
@@ -331,6 +229,7 @@ const markers = {
     typeArgs: 'T',
     enumvalues: 'U',
     varDecl: 'V',
+    ident: 'W',
     typeDecl: 'Z',
     error: ' ',
 }
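
The simplified strings built above are what the declaration parser pattern-matches against: each token collapses to a marker character ('M' modifier, 'P' primitive type, 'W' word/literal value) padded with spaces to the token's original length, so an offset into the simplified text is also an offset into the real source. A rough illustration of what the removed tokenize produced (the input line is arbitrary; the new tokenKindToSimplified in tokenizer.js keeps the same padding rule):

    const src        = 'public static int count = 42;';
    const simplified = 'M      M      P   W     = 0 ;';
    // Same length, character for character - a match position in `simplified`
    // maps directly back into `src` when reporting parse problems.
    console.log(src.length === simplified.length); // true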

langserver/java/tokenizer.js (new file, 246 lines)
@@ -0,0 +1,246 @@
+const { TextBlock, BlockRange } = require('./parsetypes/textblock');
+
+/**
+ * Convert a token to its simplified form for easier declaration parsing.
+ *
+ * - Whitespace, comments, strings and character literals are normalised.
+ * - Modifier keywords and identifiers are abbreviated.
+ * - Any invalid text is replaced with spaces.
+ *
+ * Abbreviated and normalised values are padded to occupy the same space
+ * as the original text - this ensures any parse errors are reported in the
+ * correct location.
+ * @param {string} text
+ * @param {number} start
+ * @param {number} length
+ * @param {string} kind
+ */
+function tokenKindToSimplified(text, start, length, kind) {
+    const chunk = text.slice(start, start + length);
+    switch (kind) {
+        case 'wsc':
+            return chunk.replace(/[^\r\n]/g, ' ');
+        case 'string-literal':
+            if (chunk.length <= 2) return chunk;
+            return `"${'#'.repeat(chunk.length - 2)}"`;
+        case 'char-literal':
+            if (chunk.length <= 2) return chunk;
+            return `'${'#'.repeat(chunk.length - 2)}'`;
+        case 'primitive-type':
+            return `P${' '.repeat(chunk.length - 1)}`;
+        case 'modifier':
+            return `M${' '.repeat(chunk.length - 1)}`;
+        case 'ident':
+            return `W${' '.repeat(chunk.length - 1)}`;
+        case 'invalid':
+            return ' '.repeat(chunk.length);
+    }
+    return chunk;
+}
+
+class Token extends TextBlock {
+
+    /**
+     * @param {string} text
+     * @param {number} start
+     * @param {number} length
+     * @param {string} kind
+     */
+    constructor(text, start, length, kind) {
+        super(new BlockRange(text, start, length), tokenKindToSimplified(text, start, length, kind));
+        this.kind = kind;
+    }
+
+    get value() {
+        return this.source;
+    }
+}
+
+/**
+ * \s+                                 whitespace
+ * \/\/.*                              single-line comment (slc)
+ * \/\*[\d\D]*?\*\/                    multi-line comment (mlc)
+ * "[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*"    string literal - correctly terminated but may contain invalid escapes
+ * ".*                                 unterminated string literal
+ * '\\?.?'?                            character literal - possibly unterminated and/or with invalid escape
+ * \.?\d                               number literal (start) - further processing extracts the value
+ * [\p{L}\p{N}_$]*                     word - keyword or identifier
+ * [;,?:(){}\[\]]                      single-character symbols and operators
+ * \.(\.\.)?                           . ...
+ *
+ * the operators: [!=/%*^]=?|<<?=?|>>?[>=]?|&[&=]?|\|[|=]?|\+(=|\++)?|\-+=?
+ *   [!=/%*^]=?    ! = / % * ^ != == /= %= *= ^=
+ *   <<?=?         < << <= <<=
+ *   >>?[>=]?      > >> >= >>> >>=
+ *   &[&=]?        & && &=
+ *   \|[|=]?       | || |=
+ *   (\+\+|--)     ++ -- postfix inc - only matches if immediately preceded by a word or a ]
+ *   [+-]=?        + - += -=
+ */
+/**
+ * @param {string} source
+ * @param {number} [offset]
+ * @param {number} [length]
+ */
+function tokenize(source, offset = 0, length = source.length) {
+    const text = source.slice(offset, offset + length);
+    const raw_token_re = /(\s+|\/\/.*|\/\*[\d\D]*?\*\/|\/\*[\d\D]*)|("[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*"|".*)|('\\u[\da-fA-F]{0,4}'?|'\\?.?'?)|(\.?\d)|([\p{L}\p{N}$_]+)|(\()|([;,?:(){}\[\]@]|\.(?:\.\.)?)|([!=/%*^]=?|<<?=?|>>?>?=?|&[&=]?|\|[|=]?|(\+\+|--)|[+-]=?|~)|$/gu;
+    const raw_token_types = [
+        'wsc',
+        'string-literal',
+        'char-literal',
+        'number-literal',
+        'word',
+        'open-bracket',
+        'symbol',
+        'operator',
+    ];
+    /**
+     * ```
+     * true|false                                           boolean
+     * this|super|null                                      object
+     * int|long|short|byte|float|double|char|boolean|void   primitive type
+     * new
+     * instanceof
+     * public|private|protected|static|final|abstract|native|volatile|transient   modifier
+     * if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized|assert   statement keyword
+     * class|enum|interface                                 type keyword
+     * extends|implements                                   extends/implements keyword
+     * package|import                                       package keyword
+     * \w+                                                  word
+     * ```
+     */
+    const word_re = /^(?:(true|false)|(this|super|null)|(int|long|short|byte|float|double|char|boolean|void)|(new)|(instanceof)|(public|private|protected|static|final|abstract|native|volatile|transient)|(if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized|assert)|(class|enum|interface)|(extends|implements)|(package|import)|(.+))$/;
+    const word_token_types = [
+        'boolean-literal',
+        'object-literal',
+        'primitive-type',
+        'new-operator',
+        'instanceof-operator',
+        'modifier',
+        'statement-kw',
+        'type-kw',
+        'extimp-kw',
+        'package-kw',
+        'ident'
+    ]
+    /**
+     * ```
+     * (?:\d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]?   decimal exponent: 1e0, 1.5e+10, 0.123E-20d
+     * (?:\d+\.\d*|\.\d+)[fFdD]?                     decimal number: 0.1, 12.34f, 7.D, .3
+     * 0x[\da-fA-F]*[lL]?                            hex integer: 0x1, 0xaBc, 0x, 0x7L
+     * \d+[fFdDlL]?                                  integer: 0, 123, 234f, 345L
+     * ```
+     * todo - underscore separators
+     */
+    const number_re = /((?:\d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]?)|((?:\d+\.\d*|\.\d+)[fFdD]?)|(0x[\da-fA-F]*[lL]?)|(\d+[fFdDlL]?)/g;
+    const number_token_types = [
+        'dec-exp-number-literal',
+        'dec-number-literal',
+        'hex-number-literal',
+        'int-number-literal',
+    ]
+    const tokens = [];
+    let lastindex = 0, m;
+    while (m = raw_token_re.exec(text)) {
+        // any text appearing between two matches is invalid
+        if (m.index > lastindex) {
+            tokens.push(new Token(source, offset + lastindex, m.index - lastindex, 'invalid'));
+        }
+        lastindex = m.index + m[0].length;
+        if (m.index >= text.length) {
+            // end of input
+            break;
+        }
+
+        let idx = m.findIndex((match,i) => i && match) - 1;
+        let tokentype = raw_token_types[idx];
+
+        switch(tokentype) {
+            case 'number-literal':
+                // we need to extract the exact number part
+                number_re.lastIndex = m.index;
+                m = number_re.exec(text);
+                idx = m.findIndex((match,i) => i && match) - 1;
+                tokentype = number_token_types[idx];
+                // update the raw_token_re position based on the length of the extracted number
+                raw_token_re.lastIndex = lastindex = number_re.lastIndex;
+                break;
+            case 'word':
+                // we need to work out what kind of keyword, literal or ident this is
+                let word_m = m[0].match(word_re);
+                idx = word_m.findIndex((match,i) => i && match) - 1;
+                tokentype = word_token_types[idx];
+                break;
+            case 'operator':
+                // find the operator-type
+                tokentype = getOperatorType(m[0]);
+                break;
+        }
+        tokens.push(new Token(source, offset + m.index, m[0].length, tokentype));
+    }
+
+    return tokens;
+}
+
+/**
+ * ```
+ * =|[/%*&|^+-]=|>>>?=|<<=   assignment
+ * \+\+|--                   inc
+ * [!=]=                     equality
+ * [<>]=?                    comparison
+ * [&|^]                     bitwise
+ * <<|>>>?                   shift
+ * &&|[|][|]                 logical
+ * [*%/]                     muldiv
+ * [+-]                      plumin
+ * [~!]                      unary
+ * ```
+ */
+const operator_re = /^(?:(=|[/%*&|^+-]=|>>>?=|<<=)|(\+\+|--)|([!=]=)|([<>]=?)|([&|^])|(<<|>>>?)|(&&|[|][|])|([*%/])|([+-])|([~!]))$/;
+/**
+ * @typedef {
+        'assignment-operator'|
+        'inc-operator'|
+        'equality-operator'|
+        'comparison-operator'|
+        'bitwise-operator'|
+        'shift-operator'|
+        'logical-operator'|
+        'muldiv-operator'|
+        'plumin-operator'|
+        'unary-operator'} OperatorKind
+ */
+/** @type {OperatorKind[]} */
+const operator_token_types = [
+    'assignment-operator',
+    'inc-operator',
+    'equality-operator',
+    'comparison-operator',
+    'bitwise-operator',
+    'shift-operator',
+    'logical-operator',
+    'muldiv-operator',
+    'plumin-operator',
+    'unary-operator',
+]
+/**
+ * @param {string} value
+ */
+function getOperatorType(value) {
+    const op_match = value.match(operator_re);
+    const idx = op_match.findIndex((match,i) => i && match) - 1;
+    // @ts-ignore
+    return operator_token_types[idx];
+}
+
+exports.getOperatorType = getOperatorType;
+exports.tokenize = tokenize;
+exports.Token = Token;
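
A short usage sketch of the new module - the input is an arbitrary Java snippet, and the expected kinds follow from the regexes above:

    const { tokenize } = require('./tokenizer'); // path relative to langserver/java

    const tokens = tokenize('int x = 1;');
    console.log(tokens.map(t => `${JSON.stringify(t.value)}:${t.kind}`).join(' '));
    // "int":primitive-type " ":wsc "x":ident " ":wsc "=":assignment-operator
    // " ":wsc "1":int-number-literal ";":symbol

    // Every character lands in some token (invalid text included), so the values
    // concatenate back to the original source - the round-trip property checked
    // by the commented-out benchmark removed from the body parser.
    console.assert(tokens.map(t => t.value).join('') === 'int x = 1;');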

langserver/tests/test-tokenizer.js (new file, 98 lines)
@@ -0,0 +1,98 @@
+const { tokenize } = require('../java/tokenizer');
+
+function testTokenize() {
+    const tests = [
+        // the basics
+        { src: 'i', r: [{value: 'i', kind:'ident'}] },
+        { src: '0', r: [{value: '0', kind:'int-number-literal'}] },
+        { src: `""`, r: [{value: `""`, kind:'string-literal'}] },
+        { src: `'x'`, r: [{value: `'x'`, kind:'char-literal'}] },
+        { src: `(`, r: [{value: `(`, kind:'open-bracket'}] },
+        ...'. , [ ] ? : @'.split(' ').map(symbol => ({ src: symbol, r: [{value: symbol, kind: 'symbol'}] })),
+        ...'= += -= *= /= %= >>= <<= &= |= ^='.split(' ').map(op => ({ src: op, r: [{value: op, kind:'assignment-operator'}] })),
+        ...'+ -'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'plumin-operator'}] })),
+        ...'* / %'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'muldiv-operator'}] })),
+        ...'# ¬'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'invalid'}] })),
+
+        // numbers - decimal with exponent
+        ...'0.0e+0 0.0E+0 0e+0 0e0 .0e0 0e0f 0e0d'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })),
+        // numbers - decimal with partial exponent
+        ...'0.0e+ 0.0E+ 0e+ 0e .0e 0ef 0ed'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })),
+        // numbers - not decimal exponent
+        { src: '0.0ea', r: [{value: '0.0e', kind:'dec-exp-number-literal'}, {value: 'a', kind:'ident'}] },
+
+        // numbers - decimal (no exponent)
+        ...'0.123 0. 0.f 0.0D .0 .0f .123D'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-number-literal'}] })),
+        // numbers - not decimal
+        { src: '0.a', r: [{value: '0.', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] },
+        { src: '0.0a', r: [{value: '0.0', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] },
+
+        // numbers - hex
+        ...'0x0 0x123456789abcdef 0xABCDEF 0xabcdefl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })),
+        // numbers - partial hex
+        ...'0x 0xl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })),
+
+        // numbers - decimal
+        ...'0 123456789 0l'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'int-number-literal'}] })),
+
+        // strings
+        ...[`"abc"`, `"\\n"`, `"\\""`].map(num => ({ src: num, r: [{value: num, kind:'string-literal'}] })),
+        // unterminated strings
+        ...[`"abc`, `"\\n`, `"\\"`, `"`].map(num => ({ src: num, r: [{value: num, kind:'unterminated-string-literal'}] })),
+        // strings cannot cross newlines
+        { src: `"abc\n`, r: [{value: `"abc`, kind:'unterminated-string-literal'}, {value: '\n', kind:'wsc'}] },
+
+        // characters
+        ...[`'a'`, `'\\n'`, `'\\''`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })),
+        // unterminated/invalid characters
+        ...[`'a`, `'\\n`, `'\\'`, `''`, `'`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })),
+        // characters cannot cross newlines
+        { src: `'\n`, r: [{value: `'`, kind:'char-literal'}, {value: '\n', kind:'wsc'}] },
+
+        // arity symbol
+        { src: `int...x`, r: [
+            {value: `int`, kind:'primitive-type'},
+            {value: `...`, kind:'symbol'},
+            {value: `x`, kind:'ident'},
+        ] },
+
+        // complex inc - the javac compiler doesn't bother to try and sensibly separate +++ - it just appears to
+        // prioritise ++ in every case, assuming that the developer will insert spaces as required.
+        // e.g. this first one fails to compile with javac
+        { src: '++abc+++def', r: [
+            {value: '++', kind:'inc-operator'},
+            {value: 'abc', kind:'ident'},
+            {value: '++', kind:'inc-operator'},
+            {value: '+', kind:'plumin-operator'},
+            {value: 'def', kind:'ident'},
+        ] },
+        // this should be ok
+        { src: '++abc+ ++def', r: [
+            {value: '++', kind:'inc-operator'},
+            {value: 'abc', kind:'ident'},
+            {value: '+', kind:'plumin-operator'},
+            {value: ' ', kind:'wsc'},
+            {value: '++', kind:'inc-operator'},
+            {value: 'def', kind:'ident'},
+        ] },
+    ]
+    const report = (test, msg) => {
+        console.log(JSON.stringify({test, msg}));
+    }
+    tests.forEach(t => {
+        const tokens = tokenize(t.src);
+        if (tokens.length !== t.r.length) {
+            report(t, `Wrong token count. Expected ${t.r.length}, got ${tokens.length}`);
+            return;
+        }
+        for (let i=0; i < tokens.length; i++) {
+            if (tokens[i].value !== t.r[i].value)
+                report(t, `Wrong token value. Expected ${t.r[i].value}, got ${tokens[i].value}`);
+            if (tokens[i].kind !== t.r[i].kind)
+                report(t, `Wrong token kind. Expected ${t.r[i].kind}, got ${tokens[i].kind}`);
+        }
+    })
+}
+
+testTokenize();
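
The harness only prints failures (as JSON lines via `report`), so a clean run produces no output. It can be run directly with Node, e.g. `node langserver/tests/test-tokenizer.js` from the repository root - the path is taken from the file header above.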