make type parser and body parser use same tokenizer

This commit is contained in:
Dave Holoway
2020-06-08 11:45:01 +01:00
parent 930237359e
commit bbc6007338
4 changed files with 347 additions and 412 deletions

View File

@@ -8,7 +8,7 @@ const { JavaType, CEIType, PrimitiveType, ArrayType, UnresolvedType, WildcardTyp
const { SourceMethod, SourceConstructor } = require('./source-type');
const ResolvedImport = require('./parsetypes/resolved-import');
const ParseProblem = require('./parsetypes/parse-problem');
const { TextBlock, BlockRange } = require('./parsetypes/textblock');
const { getOperatorType, tokenize, Token } = require('./tokenizer');
/**
* @typedef {SourceMethod|SourceConstructor} SourceMC
@@ -2654,312 +2654,4 @@ class ResolvedIdent {
}
/**
* \s+ whitespace
* \/\/.* single-line comment (slc)
* \/\*[\d\D]*?\*\/ multi-line comment (mlc)
* "[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*" string literal - correctly terminated but may contain invalid escapes
* ".* unterminated string literal
* '\\?.?'? character literal - possibly unterminated and/or with invalid escape
* \.?\d number literal (start) - further processing extracts the value
 * [\p{L}\p{N}_$]+ word - keyword or identifier
* [;,?:(){}\[\]] single-character symbols and operators
* \.(\.\.)? . ...
*
 * the operators: [!=/%*^]=?|<<?=?|>>?>?=?|&[&=]?|\|[|=]?|(\+\+|--)|[+-]=?|~
 * [!=/%*^]=? ! = / % * ^ != == /= %= *= ^=
 * <<?=? < << <= <<=
 * >>?>?=? > >> >= >>> >>= >>>=
 * &[&=]? & && &=
 * \|[|=]? | || |=
 * (\+\+|--) ++ -- matched before +/-, so inc/dec always binds greedily (see the +++ test case)
 * [+-]=? + - += -=
 * ~ ~
*
*
*
*/
/**
 * Tokenizes a span of Java source text.
 *
 * A coarse regex (raw_token_re) splits the text into broad categories; number,
 * word and operator matches are then refined into more specific kinds via
 * secondary regexes. Any text that matches nothing is emitted as an 'invalid'
 * token, so joining the values of the returned tokens reproduces the input
 * span exactly.
 *
 * @param {string} source complete source text
 * @param {number} [offset] start index of the span to tokenize (default 0)
 * @param {number} [length] length of the span (defaults to the remainder of source)
 */
function tokenize(source, offset = 0, length = source.length) {
    const text = source.slice(offset, offset + length);
    // one top-level capture group per entry in raw_token_types (in order).
    // the trailing |$ forces a final empty match at end-of-input so any
    // trailing unmatchable text is flushed as 'invalid' before the loop exits.
    const raw_token_re = /(\s+|\/\/.*|\/\*[\d\D]*?\*\/)|("[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*")|(".*)|('\\u[\da-fA-F]{0,4}'?|'\\?.?'?)|(\.?\d)|([\p{L}\p{N}$_]+)|(\()|([;,?:(){}\[\]@]|\.(?:\.\.)?)|([!=/%*^]=?|<<?=?|>>?>?=?|&[&=]?|\|[|=]?|(\+\+|--)|[+-]=?|~)|$/gu;
    const raw_token_types = [
        'wsc',
        'string-literal',
        'unterminated-string-literal',
        'char-literal',
        'number-literal',
        'word',
        'open-bracket',
        'symbol',
        'operator',
    ];
    /**
     * Classifies a 'word' match as a keyword, literal or identifier.
     * One capture group per entry in word_token_types; the final (.+)
     * catch-all makes any non-keyword an 'ident'.
     * ```
     * true|false boolean
     * this|super|null object
     * int|long|short|byte|float|double|char|boolean|void primitive type
     * new
     * instanceof
     * public|private|protected|static|final|abstract|native|volatile|transient modifier
     * if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized|assert statement keyword
     * class|enum|interface type keyword
     * package|import package keyword
     * .+ word
     * ```
     */
    const word_re = /^(?:(true|false)|(this|super|null)|(int|long|short|byte|float|double|char|boolean|void)|(new)|(instanceof)|(public|private|protected|static|final|abstract|native|volatile|transient)|(if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized|assert)|(class|enum|interface)|(package|import)|(.+))$/;
    const word_token_types = [
        'boolean-literal',
        'object-literal',
        'primitive-type',
        'new-operator',
        'instanceof-operator',
        'modifier',
        'statement-kw',
        'type-kw',
        'package-kw',
        'ident'
    ]
    /**
     * Extracts the full number literal once raw_token_re has matched its
     * first character. One capture group per entry in number_token_types.
     * ```
     * (?:\d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]? decimal exponent: 1e0, 1.5e+10, 0.123E-20d
     * (?:\d+\.\d*|\.\d+)[fFdD]? decimal number: 0.1, 12.34f, 7.D, .3
     * 0x[\da-fA-F]*[lL]? hex integer: 0x1, 0xaBc, 0x, 0x7L
     * \d+[fFdDlL]? integer: 0, 123, 234f, 345L
     * ```
     * todo - underscore separators
     */
    const number_re = /((?:\d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]?)|((?:\d+\.\d*|\.\d+)[fFdD]?)|(0x[\da-fA-F]*[lL]?)|(\d+[fFdDlL]?)/g;
    const number_token_types = [
        'dec-exp-number-literal',
        'dec-number-literal',
        'hex-number-literal',
        'int-number-literal',
    ]
    const tokens = [];
    let lastindex = 0, m;
    while (m = raw_token_re.exec(text)) {
        // any text appearing between two matches is invalid
        if (m.index > lastindex) {
            tokens.push(new Token(source, offset + lastindex, m.index - lastindex, 'invalid'));
        }
        lastindex = m.index + m[0].length;
        if (m.index >= text.length) {
            // end of input - the only position where the empty $ alternative can match
            break;
        }
        // index of the first participating capture group selects the broad token type
        let idx = m.findIndex((match,i) => i && match) - 1;
        let tokentype = raw_token_types[idx];
        switch(tokentype) {
            case 'number-literal':
                // raw_token_re only matched the literal's first character(s);
                // re-match from the same position to extract the exact number
                number_re.lastIndex = m.index;
                m = number_re.exec(text);
                idx = m.findIndex((match,i) => i && match) - 1;
                tokentype = number_token_types[idx];
                // update the raw_token_re position based on the length of the extracted number
                raw_token_re.lastIndex = lastindex = number_re.lastIndex;
                break;
            case 'word':
                // we need to work out what kind of keyword, literal or ident this is
                let word_m = m[0].match(word_re);
                idx = word_m.findIndex((match,i) => i && match) - 1;
                tokentype = word_token_types[idx];
                break;
            case 'operator':
                // find the operator-type
                tokentype = getOperatorType(m[0]);
                break;
        }
        tokens.push(new Token(source, offset + m.index, m[0].length, tokentype));
    }
    return tokens;
}
/**
 * Classifies a complete operator lexeme. One capture group per entry in
 * operator_token_types, tried in order:
 * ```
 * =|[/%*&|^+-]=|>>>?=|<<= assignment
 * \+\+|-- inc
 * [!=]= equality
 * [<>]=? comparison
 * [&|^] bitwise
 * <<|>>>? shift
 * &&|[|][|] logical
 * [*%/] muldiv
 * [+-] plumin
 * [~!] unary
 * ```
 */
const operator_re = /^(?:(=|[/%*&|^+-]=|>>>?=|<<=)|(\+\+|--)|([!=]=)|([<>]=?)|([&|^])|(<<|>>>?)|(&&|[|][|])|([*%/])|([+-])|([~!]))$/;

/**
 * @typedef {
'assignment-operator'|
'inc-operator'|
'equality-operator'|
'comparison-operator'|
'bitwise-operator'|
'shift-operator'|
'logical-operator'|
'muldiv-operator'|
'plumin-operator'|
'unary-operator'} OperatorKind
 */

/** @type {OperatorKind[]} one entry per capture group in operator_re, in group order */
const operator_token_types = [
    'assignment-operator',
    'inc-operator',
    'equality-operator',
    'comparison-operator',
    'bitwise-operator',
    'shift-operator',
    'logical-operator',
    'muldiv-operator',
    'plumin-operator',
    'unary-operator',
]

/**
 * Maps an operator lexeme (e.g '+=' or '>>>') to its OperatorKind.
 * The capture group that participated in the operator_re match selects the kind.
 * @param {string} value operator text - must be one of the operators listed above
 */
function getOperatorType(value) {
    const groups = operator_re.exec(value);
    // scan for the first capture group that participated in the match;
    // its position selects the corresponding entry in operator_token_types
    let group = 0;
    while (!groups[group + 1]) {
        group += 1;
    }
    // @ts-ignore
    return operator_token_types[group];
}
/**
 * A single lexical token - a classified slice of the source text.
 */
class Token extends TextBlock {
    /**
     * @param {string} text the complete source text the token is a slice of
     * @param {number} start offset of the token within text
     * @param {number} length number of characters in the token
     * @param {string} kind token classification (e.g 'ident', 'string-literal')
     */
    constructor(text, start, length, kind) {
        super(new BlockRange(text, start, length), null);
        this.kind = kind;
    }

    /** The raw text of the token */
    get value() {
        return this.source;
    }
}
/**
 * Self-test for tokenize(). Runs a table of small source snippets and logs
 * (via console.log) any snippet whose token values or kinds differ from the
 * expected list. Produces no output when everything passes.
 */
function testTokenize() {
    // each entry: { src: input text, r: expected tokens as [{value, kind}, ...] }
    const tests = [
        // the basics
        { src: 'i', r: [{value: 'i', kind:'ident'}] },
        { src: '0', r: [{value: '0', kind:'int-number-literal'}] },
        { src: `""`, r: [{value: `""`, kind:'string-literal'}] },
        { src: `'x'`, r: [{value: `'x'`, kind:'char-literal'}] },
        { src: `(`, r: [{value: `(`, kind:'open-bracket'}] },
        ...'. , [ ] ? : @'.split(' ').map(symbol => ({ src: symbol, r: [{value: symbol, kind: 'symbol'}] })),
        ...'= += -= *= /= %= >>= <<= &= |= ^='.split(' ').map(op => ({ src: op, r: [{value: op, kind:'assignment-operator'}] })),
        ...'+ -'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'plumin-operator'}] })),
        ...'* / %'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'muldiv-operator'}] })),
        // characters that match no token pattern at all
        ...'# ¬'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'invalid'}] })),
        // numbers - decimal with exponent
        ...'0.0e+0 0.0E+0 0e+0 0e0 .0e0 0e0f 0e0d'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })),
        // numbers - decimal with partial exponent
        ...'0.0e+ 0.0E+ 0e+ 0e .0e 0ef 0ed'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })),
        // numbers - not decimal exponent
        { src: '0.0ea', r: [{value: '0.0e', kind:'dec-exp-number-literal'}, {value: 'a', kind:'ident'}] },
        // numbers - decimal (no exponent)
        ...'0.123 0. 0.f 0.0D .0 .0f .123D'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-number-literal'}] })),
        // numbers - not decimal
        { src: '0.a', r: [{value: '0.', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] },
        { src: '0.0a', r: [{value: '0.0', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] },
        // numbers - hex
        ...'0x0 0x123456789abcdef 0xABCDEF 0xabcdefl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })),
        // numbers - partial hex
        ...'0x 0xl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })),
        // numbers - decimal
        ...'0 123456789 0l'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'int-number-literal'}] })),
        // strings
        ...[`"abc"`, `"\\n"`, `"\\""`].map(num => ({ src: num, r: [{value: num, kind:'string-literal'}] })),
        // unterminated strings
        ...[`"abc`, `"\\n`, `"\\"`, `"`].map(num => ({ src: num, r: [{value: num, kind:'unterminated-string-literal'}] })),
        // strings cannot cross newlines
        { src: `"abc\n`, r: [{value: `"abc`, kind:'unterminated-string-literal'}, {value: '\n', kind:'wsc'}] },
        // characters
        ...[`'a'`, `'\\n'`, `'\\''`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })),
        // unterminated/invalid characters
        ...[`'a`, `'\\n`, `'\\'`, `''`, `'`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })),
        // characters cannot cross newlines
        { src: `'\n`, r: [{value: `'`, kind:'char-literal'}, {value: '\n', kind:'wsc'}] },
        // arity symbol
        { src: `int...x`, r: [
            {value: `int`, kind:'primitive-type'},
            {value: `...`, kind:'symbol'},
            {value: `x`, kind:'ident'},
        ],},
        // complex inc - the javac compiler doesn't bother to try and sensibly separate +++ - it just appears to
        // prioritise ++ in every case, assuming that the developer will insert spaces as required.
        // e.g this first one fails to compile with javac
        { src: '++abc+++def', r: [
            {value: '++', kind:'inc-operator'},
            {value: 'abc', kind:'ident'},
            {value: '++', kind:'inc-operator'},
            {value: '+', kind:'plumin-operator'},
            {value: 'def', kind:'ident'},
        ] },
        // this should be ok
        { src: '++abc+ ++def', r: [
            {value: '++', kind:'inc-operator'},
            {value: 'abc', kind:'ident'},
            {value: '+', kind:'plumin-operator'},
            {value: ' ', kind:'wsc'},
            {value: '++', kind:'inc-operator'},
            {value: 'def', kind:'ident'},
        ] },
    ]
    // failures are logged as JSON, one line per problem
    const report = (test, msg) => {
        console.log(JSON.stringify({test, msg}));
    }
    tests.forEach(t => {
        const tokens = tokenize(t.src);
        if (tokens.length !== t.r.length) {
            report(t, `Wrong token count. Expected ${t.r.length}, got ${tokens.length}`);
            return;
        }
        // count matches - compare each token's value and kind against the expected entry
        for (let i=0; i < tokens.length; i++) {
            if (tokens[i].value !== t.r[i].value)
                report(t, `Wrong token value. Expected ${t.r[i].value}, got ${tokens[i].value}`);
            if (tokens[i].kind !== t.r[i].kind)
                report(t, `Wrong token kind. Expected ${t.r[i].kind}, got ${tokens[i].kind}`);
        }
    })
}
// NOTE(review): runs the tokenizer self-tests on every require() of this
// module - presumably temporary development scaffolding; confirm before release
testTokenize();
// ad-hoc benchmark snippet, kept for reference:
// const s = require('fs').readFileSync('/home/dave/dev/vscode/android-dev-ext/langserver/tests/java-files/View-25.java', 'utf8');
// console.time();
// const tokens = tokenize(s);
// console.timeEnd();
// if (tokens.map(t => t.value).join('') !== s) {
// console.log('mismatch');
// }
// testTokenize();
// parseBody is defined earlier in this file (outside this chunk)
exports.parseBody = parseBody;