make type parser and body parser use same tokenizer

This commit is contained in:
Dave Holoway
2020-06-08 11:45:01 +01:00
parent 930237359e
commit bbc6007338
4 changed files with 347 additions and 412 deletions

View File

@@ -8,7 +8,7 @@ const { JavaType, CEIType, PrimitiveType, ArrayType, UnresolvedType, WildcardTyp
const { SourceMethod, SourceConstructor } = require('./source-type');
const ResolvedImport = require('./parsetypes/resolved-import');
const ParseProblem = require('./parsetypes/parse-problem');
const { TextBlock, BlockRange } = require('./parsetypes/textblock');
const { getOperatorType, tokenize, Token } = require('./tokenizer');
/**
* @typedef {SourceMethod|SourceConstructor} SourceMC
@@ -2654,312 +2654,4 @@ class ResolvedIdent {
}
/**
* \s+ whitespace
* \/\/.* single-line comment (slc)
* \/\*[\d\D]*?\*\/ multi-line comment (mlc)
* "[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*" string literal - correctly terminated but may contain invalid escapes
* ".* unterminated string literal
* '\\?.?'? character literal - possibly unterminated and/or with invalid escape
* \.?\d number literal (start) - further processing extracts the value
 * [\p{L}\p{N}_$]+ word - keyword or identifier
* [;,?:(){}\[\]] single-character symbols and operators
* \.(\.\.)? . ...
*
 * the operators: [!=/%*^]=?|<<?=?|>>?>?=?|&[&=]?|\|[|=]?|(\+\+|--)|[+-]=?|~
 * [!=/%*^]=? ! = / % * ^ != == /= %= *= ^=
 * <<?=? < << <= <<=
 * >>?>?=? > >> >= >>> >>= >>>=
 * &[&=]? & && &=
 * \|[|=]? | || |=
 * (\+\+|--) ++ -- matched before +/-, so inc/dec always binds greedily (see the +++ test case)
 * [+-]=? + - += -=
 * ~ ~
*
*
*
*/
/**
 * Tokenizes a span of Java source text.
 *
 * A coarse regex (raw_token_re) splits the text into broad categories; number,
 * word and operator matches are then refined into more specific kinds via
 * secondary regexes. Any text that matches nothing is emitted as an 'invalid'
 * token, so joining the values of the returned tokens reproduces the input
 * span exactly.
 *
 * @param {string} source complete source text
 * @param {number} [offset] start index of the span to tokenize (default 0)
 * @param {number} [length] length of the span (defaults to the remainder of source)
 */
function tokenize(source, offset = 0, length = source.length) {
    const text = source.slice(offset, offset + length);
    // one top-level capture group per entry in raw_token_types (in order).
    // the trailing |$ forces a final empty match at end-of-input so any
    // trailing unmatchable text is flushed as 'invalid' before the loop exits.
    const raw_token_re = /(\s+|\/\/.*|\/\*[\d\D]*?\*\/)|("[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*")|(".*)|('\\u[\da-fA-F]{0,4}'?|'\\?.?'?)|(\.?\d)|([\p{L}\p{N}$_]+)|(\()|([;,?:(){}\[\]@]|\.(?:\.\.)?)|([!=/%*^]=?|<<?=?|>>?>?=?|&[&=]?|\|[|=]?|(\+\+|--)|[+-]=?|~)|$/gu;
    const raw_token_types = [
        'wsc',
        'string-literal',
        'unterminated-string-literal',
        'char-literal',
        'number-literal',
        'word',
        'open-bracket',
        'symbol',
        'operator',
    ];
    /**
     * Classifies a 'word' match as a keyword, literal or identifier.
     * One capture group per entry in word_token_types; the final (.+)
     * catch-all makes any non-keyword an 'ident'.
     * ```
     * true|false boolean
     * this|super|null object
     * int|long|short|byte|float|double|char|boolean|void primitive type
     * new
     * instanceof
     * public|private|protected|static|final|abstract|native|volatile|transient modifier
     * if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized|assert statement keyword
     * class|enum|interface type keyword
     * package|import package keyword
     * .+ word
     * ```
     */
    const word_re = /^(?:(true|false)|(this|super|null)|(int|long|short|byte|float|double|char|boolean|void)|(new)|(instanceof)|(public|private|protected|static|final|abstract|native|volatile|transient)|(if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized|assert)|(class|enum|interface)|(package|import)|(.+))$/;
    const word_token_types = [
        'boolean-literal',
        'object-literal',
        'primitive-type',
        'new-operator',
        'instanceof-operator',
        'modifier',
        'statement-kw',
        'type-kw',
        'package-kw',
        'ident'
    ]
    /**
     * Extracts the full number literal once raw_token_re has matched its
     * first character. One capture group per entry in number_token_types.
     * ```
     * (?:\d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]? decimal exponent: 1e0, 1.5e+10, 0.123E-20d
     * (?:\d+\.\d*|\.\d+)[fFdD]? decimal number: 0.1, 12.34f, 7.D, .3
     * 0x[\da-fA-F]*[lL]? hex integer: 0x1, 0xaBc, 0x, 0x7L
     * \d+[fFdDlL]? integer: 0, 123, 234f, 345L
     * ```
     * todo - underscore separators
     */
    const number_re = /((?:\d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]?)|((?:\d+\.\d*|\.\d+)[fFdD]?)|(0x[\da-fA-F]*[lL]?)|(\d+[fFdDlL]?)/g;
    const number_token_types = [
        'dec-exp-number-literal',
        'dec-number-literal',
        'hex-number-literal',
        'int-number-literal',
    ]
    const tokens = [];
    let lastindex = 0, m;
    while (m = raw_token_re.exec(text)) {
        // any text appearing between two matches is invalid
        if (m.index > lastindex) {
            tokens.push(new Token(source, offset + lastindex, m.index - lastindex, 'invalid'));
        }
        lastindex = m.index + m[0].length;
        if (m.index >= text.length) {
            // end of input - the only position where the empty $ alternative can match
            break;
        }
        // index of the first participating capture group selects the broad token type
        let idx = m.findIndex((match,i) => i && match) - 1;
        let tokentype = raw_token_types[idx];
        switch(tokentype) {
            case 'number-literal':
                // raw_token_re only matched the literal's first character(s);
                // re-match from the same position to extract the exact number
                number_re.lastIndex = m.index;
                m = number_re.exec(text);
                idx = m.findIndex((match,i) => i && match) - 1;
                tokentype = number_token_types[idx];
                // update the raw_token_re position based on the length of the extracted number
                raw_token_re.lastIndex = lastindex = number_re.lastIndex;
                break;
            case 'word':
                // we need to work out what kind of keyword, literal or ident this is
                let word_m = m[0].match(word_re);
                idx = word_m.findIndex((match,i) => i && match) - 1;
                tokentype = word_token_types[idx];
                break;
            case 'operator':
                // find the operator-type
                tokentype = getOperatorType(m[0]);
                break;
        }
        tokens.push(new Token(source, offset + m.index, m[0].length, tokentype));
    }
    return tokens;
}
/**
 * Classifies a complete operator lexeme. One capture group per entry in
 * operator_token_types, tried in order:
 * ```
 * =|[/%*&|^+-]=|>>>?=|<<= assignment
 * \+\+|-- inc
 * [!=]= equality
 * [<>]=? comparison
 * [&|^] bitwise
 * <<|>>>? shift
 * &&|[|][|] logical
 * [*%/] muldiv
 * [+-] plumin
 * [~!] unary
 * ```
 */
const operator_re = /^(?:(=|[/%*&|^+-]=|>>>?=|<<=)|(\+\+|--)|([!=]=)|([<>]=?)|([&|^])|(<<|>>>?)|(&&|[|][|])|([*%/])|([+-])|([~!]))$/;

/**
 * @typedef {
'assignment-operator'|
'inc-operator'|
'equality-operator'|
'comparison-operator'|
'bitwise-operator'|
'shift-operator'|
'logical-operator'|
'muldiv-operator'|
'plumin-operator'|
'unary-operator'} OperatorKind
 */

/** @type {OperatorKind[]} one entry per capture group in operator_re, in group order */
const operator_token_types = [
    'assignment-operator',
    'inc-operator',
    'equality-operator',
    'comparison-operator',
    'bitwise-operator',
    'shift-operator',
    'logical-operator',
    'muldiv-operator',
    'plumin-operator',
    'unary-operator',
]

/**
 * Maps an operator lexeme (e.g '+=' or '>>>') to its OperatorKind.
 * The capture group that participated in the operator_re match selects the kind.
 * @param {string} value operator text - must be one of the operators listed above
 */
function getOperatorType(value) {
    const groups = operator_re.exec(value);
    // scan for the first capture group that participated in the match;
    // its position selects the corresponding entry in operator_token_types
    let group = 0;
    while (!groups[group + 1]) {
        group += 1;
    }
    // @ts-ignore
    return operator_token_types[group];
}
/**
 * A single lexical token - a classified slice of the source text.
 */
class Token extends TextBlock {
    /**
     * @param {string} text the complete source text the token is a slice of
     * @param {number} start offset of the token within text
     * @param {number} length number of characters in the token
     * @param {string} kind token classification (e.g 'ident', 'string-literal')
     */
    constructor(text, start, length, kind) {
        super(new BlockRange(text, start, length), null);
        this.kind = kind;
    }

    /** The raw text of the token */
    get value() {
        return this.source;
    }
}
/**
 * Self-test for tokenize(). Runs a table of small source snippets and logs
 * (via console.log) any snippet whose token values or kinds differ from the
 * expected list. Produces no output when everything passes.
 */
function testTokenize() {
    // each entry: { src: input text, r: expected tokens as [{value, kind}, ...] }
    const tests = [
        // the basics
        { src: 'i', r: [{value: 'i', kind:'ident'}] },
        { src: '0', r: [{value: '0', kind:'int-number-literal'}] },
        { src: `""`, r: [{value: `""`, kind:'string-literal'}] },
        { src: `'x'`, r: [{value: `'x'`, kind:'char-literal'}] },
        { src: `(`, r: [{value: `(`, kind:'open-bracket'}] },
        ...'. , [ ] ? : @'.split(' ').map(symbol => ({ src: symbol, r: [{value: symbol, kind: 'symbol'}] })),
        ...'= += -= *= /= %= >>= <<= &= |= ^='.split(' ').map(op => ({ src: op, r: [{value: op, kind:'assignment-operator'}] })),
        ...'+ -'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'plumin-operator'}] })),
        ...'* / %'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'muldiv-operator'}] })),
        // characters that match no token pattern at all
        ...'# ¬'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'invalid'}] })),
        // numbers - decimal with exponent
        ...'0.0e+0 0.0E+0 0e+0 0e0 .0e0 0e0f 0e0d'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })),
        // numbers - decimal with partial exponent
        ...'0.0e+ 0.0E+ 0e+ 0e .0e 0ef 0ed'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })),
        // numbers - not decimal exponent
        { src: '0.0ea', r: [{value: '0.0e', kind:'dec-exp-number-literal'}, {value: 'a', kind:'ident'}] },
        // numbers - decimal (no exponent)
        ...'0.123 0. 0.f 0.0D .0 .0f .123D'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-number-literal'}] })),
        // numbers - not decimal
        { src: '0.a', r: [{value: '0.', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] },
        { src: '0.0a', r: [{value: '0.0', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] },
        // numbers - hex
        ...'0x0 0x123456789abcdef 0xABCDEF 0xabcdefl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })),
        // numbers - partial hex
        ...'0x 0xl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })),
        // numbers - decimal
        ...'0 123456789 0l'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'int-number-literal'}] })),
        // strings
        ...[`"abc"`, `"\\n"`, `"\\""`].map(num => ({ src: num, r: [{value: num, kind:'string-literal'}] })),
        // unterminated strings
        ...[`"abc`, `"\\n`, `"\\"`, `"`].map(num => ({ src: num, r: [{value: num, kind:'unterminated-string-literal'}] })),
        // strings cannot cross newlines
        { src: `"abc\n`, r: [{value: `"abc`, kind:'unterminated-string-literal'}, {value: '\n', kind:'wsc'}] },
        // characters
        ...[`'a'`, `'\\n'`, `'\\''`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })),
        // unterminated/invalid characters
        ...[`'a`, `'\\n`, `'\\'`, `''`, `'`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })),
        // characters cannot cross newlines
        { src: `'\n`, r: [{value: `'`, kind:'char-literal'}, {value: '\n', kind:'wsc'}] },
        // arity symbol
        { src: `int...x`, r: [
            {value: `int`, kind:'primitive-type'},
            {value: `...`, kind:'symbol'},
            {value: `x`, kind:'ident'},
        ],},
        // complex inc - the javac compiler doesn't bother to try and sensibly separate +++ - it just appears to
        // prioritise ++ in every case, assuming that the developer will insert spaces as required.
        // e.g this first one fails to compile with javac
        { src: '++abc+++def', r: [
            {value: '++', kind:'inc-operator'},
            {value: 'abc', kind:'ident'},
            {value: '++', kind:'inc-operator'},
            {value: '+', kind:'plumin-operator'},
            {value: 'def', kind:'ident'},
        ] },
        // this should be ok
        { src: '++abc+ ++def', r: [
            {value: '++', kind:'inc-operator'},
            {value: 'abc', kind:'ident'},
            {value: '+', kind:'plumin-operator'},
            {value: ' ', kind:'wsc'},
            {value: '++', kind:'inc-operator'},
            {value: 'def', kind:'ident'},
        ] },
    ]
    // failures are logged as JSON, one line per problem
    const report = (test, msg) => {
        console.log(JSON.stringify({test, msg}));
    }
    tests.forEach(t => {
        const tokens = tokenize(t.src);
        if (tokens.length !== t.r.length) {
            report(t, `Wrong token count. Expected ${t.r.length}, got ${tokens.length}`);
            return;
        }
        // count matches - compare each token's value and kind against the expected entry
        for (let i=0; i < tokens.length; i++) {
            if (tokens[i].value !== t.r[i].value)
                report(t, `Wrong token value. Expected ${t.r[i].value}, got ${tokens[i].value}`);
            if (tokens[i].kind !== t.r[i].kind)
                report(t, `Wrong token kind. Expected ${t.r[i].kind}, got ${tokens[i].kind}`);
        }
    })
}
// NOTE(review): runs the tokenizer self-tests on every require() of this
// module - presumably temporary development scaffolding; confirm before release
testTokenize();
// ad-hoc benchmark snippet, kept for reference:
// const s = require('fs').readFileSync('/home/dave/dev/vscode/android-dev-ext/langserver/tests/java-files/View-25.java', 'utf8');
// console.time();
// const tokens = tokenize(s);
// console.timeEnd();
// if (tokens.map(t => t.value).join('') !== s) {
// console.log('mismatch');
// }
// testTokenize();
// parseBody is defined earlier in this file (outside this chunk)
exports.parseBody = parseBody;