https://github.com/adelphes/android-dev-ext.git

make type parser and body parser use same tokenizer
@@ -8,7 +8,7 @@ const { JavaType, CEIType, PrimitiveType, ArrayType, UnresolvedType, WildcardTyp
 const { SourceMethod, SourceConstructor } = require('./source-type');
 const ResolvedImport = require('./parsetypes/resolved-import');
 const ParseProblem = require('./parsetypes/parse-problem');
-const { TextBlock, BlockRange } = require('./parsetypes/textblock');
+const { getOperatorType, tokenize, Token } = require('./tokenizer');
 
 /**
  * @typedef {SourceMethod|SourceConstructor} SourceMC
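
With this change the body parser pulls tokenize, Token and getOperatorType from the shared tokenizer module instead of defining its own copies (which the next hunk removes). A minimal sketch of the call pattern this enables - the wrapper name and its parameters are illustrative, not part of the commit:

    const { tokenize } = require('./tokenizer');

    // Tokenize only a method body while keeping absolute file offsets:
    // tokenize(source, offset, length) slices internally, but each Token is
    // created with offset + match.index, so positions map back to the full file.
    function tokenizeMethodBody(fileText, bodyStart, bodyLength) { // hypothetical helper
        return tokenize(fileText, bodyStart, bodyLength);
    }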
@@ -2654,312 +2654,4 @@ class ResolvedIdent {
 
 }
-
-/**
- * \s+                                 whitespace
- * \/\/.*                              single-line comment (slc)
- * \/\*[\d\D]*?\*\/                    multi-line comment (mlc)
- * "[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*"    string literal - correctly terminated but may contain invalid escapes
- * ".*                                 unterminated string literal
- * '\\?.?'?                            character literal - possibly unterminated and/or with invalid escape
- * \.?\d                               number literal (start) - further processing extracts the value
- * [\p{L}\p{N}_$]*                     word - keyword or identifier
- * [;,?:(){}\[\]]                      single-character symbols and operators
- * \.(\.\.)?                           . ...
- *
- * the operators: [!=/%*^]=?|<<?=?|>>?[>=]?|&[&=]?|\|[|=]?|\+(=|\++)?|\-+=?
- *   [!=/%*^]=?    ! = / % * ^ != == /= %= *= ^=
- *   <<?=?         < << <= <<=
- *   >>?[>=]?      > >> >= >>> >>=
- *   &[&=]?        & && &=
- *   \|[|=]?       | || |=
- *   (\+\+|--)     ++ -- postfix inc - only matches if immediately preceded by a word or a ]
- *   [+-]=?        + - += -=
- */
-/**
- * @param {string} source
- * @param {number} [offset]
- * @param {number} [length]
- */
-function tokenize(source, offset = 0, length = source.length) {
-    const text = source.slice(offset, offset + length);
-    const raw_token_re = /(\s+|\/\/.*|\/\*[\d\D]*?\*\/)|("[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*")|(".*)|('\\u[\da-fA-F]{0,4}'?|'\\?.?'?)|(\.?\d)|([\p{L}\p{N}$_]+)|(\()|([;,?:(){}\[\]@]|\.(?:\.\.)?)|([!=/%*^]=?|<<?=?|>>?>?=?|&[&=]?|\|[|=]?|(\+\+|--)|[+-]=?|~)|$/gu;
-    const raw_token_types = [
-        'wsc',
-        'string-literal',
-        'unterminated-string-literal',
-        'char-literal',
-        'number-literal',
-        'word',
-        'open-bracket',
-        'symbol',
-        'operator',
-    ];
-    /**
-     * ```
-     * true|false                                           boolean
-     * this|super|null                                      object
-     * int|long|short|byte|float|double|char|boolean|void   primitive type
-     * new
-     * instanceof
-     * public|private|protected|static|final|abstract|native|volatile|transient   modifier
-     * if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized|assert   statement keyword
-     * class|enum|interface                                 type keyword
-     * package|import                                       package keyword
-     * \w+                                                  word
-     * ```
-     */
-    const word_re = /^(?:(true|false)|(this|super|null)|(int|long|short|byte|float|double|char|boolean|void)|(new)|(instanceof)|(public|private|protected|static|final|abstract|native|volatile|transient)|(if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized|assert)|(class|enum|interface)|(package|import)|(.+))$/;
-    const word_token_types = [
-        'boolean-literal',
-        'object-literal',
-        'primitive-type',
-        'new-operator',
-        'instanceof-operator',
-        'modifier',
-        'statement-kw',
-        'type-kw',
-        'package-kw',
-        'ident'
-    ]
-    /**
-     * ```
-     * (?:\d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]?   decimal exponent: 1e0, 1.5e+10, 0.123E-20d
-     * (?:\d+\.\d*|\.\d+)[fFdD]?                     decimal number: 0.1, 12.34f, 7.D, .3
-     * 0x[\da-fA-F]*[lL]?                            hex integer: 0x1, 0xaBc, 0x, 0x7L
-     * \d+[fFdDlL]?                                  integer: 0, 123, 234f, 345L
-     * ```
-     * todo - underscore separators
-     */
-    const number_re = /((?:\d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]?)|((?:\d+\.\d*|\.\d+)[fFdD]?)|(0x[\da-fA-F]*[lL]?)|(\d+[fFdDlL]?)/g;
-    const number_token_types = [
-        'dec-exp-number-literal',
-        'dec-number-literal',
-        'hex-number-literal',
-        'int-number-literal',
-    ]
-    const tokens = [];
-    let lastindex = 0, m;
-    while (m = raw_token_re.exec(text)) {
-        // any text appearing between two matches is invalid
-        if (m.index > lastindex) {
-            tokens.push(new Token(source, offset + lastindex, m.index - lastindex, 'invalid'));
-        }
-        lastindex = m.index + m[0].length;
-        if (m.index >= text.length) {
-            // end of input
-            break;
-        }
-
-        let idx = m.findIndex((match,i) => i && match) - 1;
-        let tokentype = raw_token_types[idx];
-
-        switch(tokentype) {
-            case 'number-literal':
-                // we need to extract the exact number part
-                number_re.lastIndex = m.index;
-                m = number_re.exec(text);
-                idx = m.findIndex((match,i) => i && match) - 1;
-                tokentype = number_token_types[idx];
-                // update the raw_token_re position based on the length of the extracted number
-                raw_token_re.lastIndex = lastindex = number_re.lastIndex;
-                break;
-            case 'word':
-                // we need to work out what kind of keyword, literal or ident this is
-                let word_m = m[0].match(word_re);
-                idx = word_m.findIndex((match,i) => i && match) - 1;
-                tokentype = word_token_types[idx];
-                break;
-            case 'operator':
-                // find the operator-type
-                tokentype = getOperatorType(m[0]);
-                break;
-        }
-        tokens.push(new Token(source, offset + m.index, m[0].length, tokentype));
-    }
-
-    return tokens;
-}
-
-/**
- * ```
- * =|[/%*&|^+-]=|>>>?=|<<=   assignment
- * \+\+|--                   inc
- * [!=]=                     equality
- * [<>]=?                    comparison
- * [&|^]                     bitwise
- * <<|>>>?                   shift
- * &&|[|][|]                 logical
- * [*%/]                     muldiv
- * [+-]                      plumin
- * [~!]                      unary
- * ```
- */
-const operator_re = /^(?:(=|[/%*&|^+-]=|>>>?=|<<=)|(\+\+|--)|([!=]=)|([<>]=?)|([&|^])|(<<|>>>?)|(&&|[|][|])|([*%/])|([+-])|([~!]))$/;
-/**
- * @typedef {
-        'assignment-operator'|
-        'inc-operator'|
-        'equality-operator'|
-        'comparison-operator'|
-        'bitwise-operator'|
-        'shift-operator'|
-        'logical-operator'|
-        'muldiv-operator'|
-        'plumin-operator'|
-        'unary-operator'} OperatorKind
- */
-/** @type {OperatorKind[]} */
-const operator_token_types = [
-    'assignment-operator',
-    'inc-operator',
-    'equality-operator',
-    'comparison-operator',
-    'bitwise-operator',
-    'shift-operator',
-    'logical-operator',
-    'muldiv-operator',
-    'plumin-operator',
-    'unary-operator',
-]
-/**
- * @param {string} value
- */
-function getOperatorType(value) {
-    const op_match = value.match(operator_re);
-    const idx = op_match.findIndex((match,i) => i && match) - 1;
-    // @ts-ignore
-    return operator_token_types[idx];
-}
-
-class Token extends TextBlock {
-
-    /**
-     * @param {string} text
-     * @param {number} start
-     * @param {number} length
-     * @param {string} kind
-     */
-    constructor(text, start, length, kind) {
-        super(new BlockRange(text, start, length), null);
-        this.kind = kind;
-    }
-
-    get value() {
-        return this.source;
-    }
-}
-
-function testTokenize() {
-    const tests = [
-        // the basics
-        { src: 'i', r: [{value: 'i', kind:'ident'}] },
-        { src: '0', r: [{value: '0', kind:'int-number-literal'}] },
-        { src: `""`, r: [{value: `""`, kind:'string-literal'}] },
-        { src: `'x'`, r: [{value: `'x'`, kind:'char-literal'}] },
-        { src: `(`, r: [{value: `(`, kind:'open-bracket'}] },
-        ...'. , [ ] ? : @'.split(' ').map(symbol => ({ src: symbol, r: [{value: symbol, kind: 'symbol'}] })),
-        ...'= += -= *= /= %= >>= <<= &= |= ^='.split(' ').map(op => ({ src: op, r: [{value: op, kind:'assignment-operator'}] })),
-        ...'+ -'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'plumin-operator'}] })),
-        ...'* / %'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'muldiv-operator'}] })),
-        ...'# ¬'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'invalid'}] })),
-
-        // numbers - decimal with exponent
-        ...'0.0e+0 0.0E+0 0e+0 0e0 .0e0 0e0f 0e0d'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })),
-        // numbers - decimal with partial exponent
-        ...'0.0e+ 0.0E+ 0e+ 0e .0e 0ef 0ed'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })),
-        // numbers - not decimal exponent
-        { src: '0.0ea', r: [{value: '0.0e', kind:'dec-exp-number-literal'}, {value: 'a', kind:'ident'}] },
-
-        // numbers - decimal (no exponent)
-        ...'0.123 0. 0.f 0.0D .0 .0f .123D'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-number-literal'}] })),
-        // numbers - not decimal
-        { src: '0.a', r: [{value: '0.', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] },
-        { src: '0.0a', r: [{value: '0.0', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] },
-
-        // numbers - hex
-        ...'0x0 0x123456789abcdef 0xABCDEF 0xabcdefl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })),
-        // numbers - partial hex
-        ...'0x 0xl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })),
-
-        // numbers - decimal
-        ...'0 123456789 0l'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'int-number-literal'}] })),
-
-        // strings
-        ...[`"abc"`, `"\\n"`, `"\\""`].map(num => ({ src: num, r: [{value: num, kind:'string-literal'}] })),
-        // unterminated strings
-        ...[`"abc`, `"\\n`, `"\\"`, `"`].map(num => ({ src: num, r: [{value: num, kind:'unterminated-string-literal'}] })),
-        // strings cannot cross newlines
-        { src: `"abc\n`, r: [{value: `"abc`, kind:'unterminated-string-literal'}, {value: '\n', kind:'wsc'}] },
-
-        // characters
-        ...[`'a'`, `'\\n'`, `'\\''`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })),
-        // unterminated/invalid characters
-        ...[`'a`, `'\\n`, `'\\'`, `''`, `'`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })),
-        // characters cannot cross newlines
-        { src: `'\n`, r: [{value: `'`, kind:'char-literal'}, {value: '\n', kind:'wsc'}] },
-
-        // arity symbol
-        { src: `int...x`, r: [
-            {value: `int`, kind:'primitive-type'},
-            {value: `...`, kind:'symbol'},
-            {value: `x`, kind:'ident'},
-        ] },
-
-        // complex inc - the javac compiler doesn't bother to try and sensibly separate +++ - it just appears to
-        // prioritise ++ in every case, assuming that the developer will insert spaces as required.
-        // e.g. this first one fails to compile with javac
-        { src: '++abc+++def', r: [
-            {value: '++', kind:'inc-operator'},
-            {value: 'abc', kind:'ident'},
-            {value: '++', kind:'inc-operator'},
-            {value: '+', kind:'plumin-operator'},
-            {value: 'def', kind:'ident'},
-        ] },
-        // this should be ok
-        { src: '++abc+ ++def', r: [
-            {value: '++', kind:'inc-operator'},
-            {value: 'abc', kind:'ident'},
-            {value: '+', kind:'plumin-operator'},
-            {value: ' ', kind:'wsc'},
-            {value: '++', kind:'inc-operator'},
-            {value: 'def', kind:'ident'},
-        ] },
-    ]
-    const report = (test, msg) => {
-        console.log(JSON.stringify({test, msg}));
-    }
-    tests.forEach(t => {
-        const tokens = tokenize(t.src);
-        if (tokens.length !== t.r.length) {
-            report(t, `Wrong token count. Expected ${t.r.length}, got ${tokens.length}`);
-            return;
-        }
-        for (let i=0; i < tokens.length; i++) {
-            if (tokens[i].value !== t.r[i].value)
-                report(t, `Wrong token value. Expected ${t.r[i].value}, got ${tokens[i].value}`);
-            if (tokens[i].kind !== t.r[i].kind)
-                report(t, `Wrong token kind. Expected ${t.r[i].kind}, got ${tokens[i].kind}`);
-        }
-    })
-}
-
-testTokenize();
-
-// const s = require('fs').readFileSync('/home/dave/dev/vscode/android-dev-ext/langserver/tests/java-files/View-25.java', 'utf8');
-// console.time();
-// const tokens = tokenize(s);
-// console.timeEnd();
-// if (tokens.map(t => t.value).join('') !== s) {
-//     console.log('mismatch');
-// }
-
-// testTokenize();
 
 exports.parseBody = parseBody;
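
Both the code removed above and the new tokenizer.js added later in this commit rely on the same dispatch idiom: the master regex is an alternation of capture groups, and the index of the first non-empty group selects the entry in a parallel kinds array. A self-contained sketch of just that trick (toy regex and kinds array, not taken from the commit):

    // Group order mirrors the kinds array; m[0] is the whole match, so skip index 0.
    const re = /(\s+)|(\d+)|([a-z]+)/g;
    const kinds = ['wsc', 'number', 'word'];

    function kindOf(m) {
        const idx = m.findIndex((group, i) => i && group !== undefined) - 1;
        return kinds[idx];
    }

    let m;
    while (m = re.exec('abc 123')) {
        console.log(JSON.stringify(m[0]), kindOf(m)); // "abc" word, " " wsc, "123" number
    }

The nested (\+\+|--) group inside the operator alternative still works with this scheme because findIndex returns the outer operator group before the nested one.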

@@ -1,4 +1,5 @@
 const { TextBlock, TextBlockArray } = require('./parsetypes/textblock');
+const { tokenize, Token } = require('./tokenizer');
 
 /**
  * Normalises comments, whitespace, string and character literals.
@@ -205,109 +206,6 @@ function parse2(source) {
 
 }
 
-/**
- * @param {string} source
- */
-function tokenize(source) {
-    const blocks = [];
-    const re = /(\/\*[\d\D]*?\*\/)|(\/\/.*)|(\s+)|([a-zA-Z_]\w*)|("[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*")|('\\?.')|(\d\w*)|(::|\.{3}|[(){}\[\];,.@])|([=!~*/%^]=?|[?:]|>>?>?=?|<<?<?=?|[&][&=]?|[|][|=]?|\+[+=]?|-[-=>]?)|(.)|$/g;
-    let lastIndex = 0;
-    for (let m; m = re.exec(source);) {
-        if (m.index > lastIndex) {
-            blocks.push(TextBlock.from(source, lastIndex, m.index-lastIndex));
-            throw "er"
-        }
-        lastIndex = m.index + m[0].length;
-        const len = m[0].length;
-        if (m[1]) {
-            // mlc
-            // - MLCs are replaced with tab instead of space. This makes them easy to differentiate (for JavaDocs)
-            // whilst still being treated as general whitespace.
-            const mlc = TextBlock.from(source, m.index, len, m[0].replace(/./g, '\t'));
-            blocks.push(mlc);
-            continue;
-        }
-        if (m[2]) {
-            // slc
-            const slc = TextBlock.from(source, m.index, len, m[0].replace(/./g, ' '));
-            blocks.push(slc);
-            continue;
-        }
-        if (m[3]) {
-            // whitespace (other than space and newline)
-            const ws = TextBlock.from(source, m.index, len, m[0].replace(/./g, ' '));
-            blocks.push(ws);
-            continue;
-        }
-        if (m[4]) {
-            // ident or keyword
-            const KEYWORDS = /^(assert|break|case|catch|class|const|continue|do|else|enum|extends|finally|for|goto|if|implements|import|interface|new|package|return|super|switch|throw|throws|try|while)$/;
-            const MODIFIER_KEYWORDS = /^(abstract|final|native|private|protected|public|static|strictfp|synchronized|transient|volatile|default)$/;
-            const PRIMITIVE_TYPE_KEYWORDS = /^(int|boolean|byte|char|double|float|long|short|void)$/
-            const LITERAL_VALUE_KEYWORDS = /^(this|true|false|null)$/;
-            const OPERATOR_KEYWORDS = /^(instanceof)$/;
-            let simplified;
-            let space = ' '.repeat(len-1);
-            if (KEYWORDS.test(m[0])) {
-
-            } else if (MODIFIER_KEYWORDS.test(m[0])) {
-                simplified = 'M' + space;
-            } else if (PRIMITIVE_TYPE_KEYWORDS.test(m[0])) {
-                simplified = 'P' + space;
-            } else if (LITERAL_VALUE_KEYWORDS.test(m[0])) {
-                simplified = 'W' + space;
-            } else if (OPERATOR_KEYWORDS.test(m[0])) {
-                simplified = m[0];
-            } else {
-                simplified = 'W' + space;
-            }
-            const word = TextBlock.from(source, m.index, len, simplified);
-            blocks.push(word);
-            continue;
-        }
-        if (m[5]) {
-            // string literal
-            const str = TextBlock.from(source, m.index, len, `"${'#'.repeat(m[0].length - 2)}"`);
-            blocks.push(str);
-            continue;
-        }
-        if (m[6]) {
-            // char literal
-            const char = TextBlock.from(source, m.index, len, `'#'`);
-            blocks.push(char);
-            continue;
-        }
-        if (m[7]) {
-            // number literal
-            const number = TextBlock.from(source, m.index, len, `0${' '.repeat(m[0].length-1)}`);
-            blocks.push(number);
-            continue;
-        }
-        if (m[8]) {
-            // separator
-            const separator = TextBlock.from(source, m.index, m[0].length);
-            blocks.push(separator);
-            continue;
-        }
-        if (m[9]) {
-            // operator
-            const operator = TextBlock.from(source, m.index, m[0].length);
-            blocks.push(operator);
-            continue;
-        }
-        if (m[10]) {
-            // invalid source char
-            const invalid = TextBlock.from(source, m.index, m[0].length);
-            blocks.push(invalid);
-            continue;
-        }
-        // end of file
-        break;
-    }
-
-    return blocks;
-}
-
 const markers = {
     arrayQualifier: 'A',
     blocks: 'B',
@@ -331,6 +229,7 @@ const markers = {
     typeArgs: 'T',
     enumvalues: 'U',
     varDecl: 'V',
+    ident: 'W',
     typeDecl: 'Z',
     error: ' ',
 }
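
The simplified strings built above are what the declaration parser pattern-matches against: each token collapses to a marker character ('M' modifier, 'P' primitive type, 'W' word/literal value) padded with spaces to the token's original length, so an offset into the simplified text is also an offset into the real source. A rough illustration of what the removed tokenize produced (the input line is arbitrary; the new tokenKindToSimplified in tokenizer.js keeps the same padding rule):

    const src        = 'public static int count = 42;';
    const simplified = 'M      M      P   W     = 0 ;';
    // Same length, character for character - a match position in `simplified`
    // maps directly back into `src` when reporting parse problems.
    console.log(src.length === simplified.length); // true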

langserver/java/tokenizer.js (new file, 246 lines)
@@ -0,0 +1,246 @@
+const { TextBlock, BlockRange } = require('./parsetypes/textblock');
+
+/**
+ * Convert a token to its simplified form for easier declaration parsing.
+ *
+ * - Whitespace, comments, strings and character literals are normalised.
+ * - Modifier keywords and identifiers are abbreviated.
+ * - Any invalid text is replaced with spaces.
+ *
+ * Abbreviated and normalised values are padded to occupy the same space
+ * as the original text - this ensures any parse errors are reported in the
+ * correct location.
+ * @param {string} text
+ * @param {number} start
+ * @param {number} length
+ * @param {string} kind
+ */
+function tokenKindToSimplified(text, start, length, kind) {
+    const chunk = text.slice(start, start + length);
+    switch (kind) {
+        case 'wsc':
+            return chunk.replace(/[^\r\n]/g, ' ');
+        case 'string-literal':
+            if (chunk.length <= 2) return chunk;
+            return `"${'#'.repeat(chunk.length - 2)}"`;
+        case 'char-literal':
+            if (chunk.length <= 2) return chunk;
+            return `'${'#'.repeat(chunk.length - 2)}'`;
+        case 'primitive-type':
+            return `P${' '.repeat(chunk.length - 1)}`;
+        case 'modifier':
+            return `M${' '.repeat(chunk.length - 1)}`;
+        case 'ident':
+            return `W${' '.repeat(chunk.length - 1)}`;
+        case 'invalid':
+            return ' '.repeat(chunk.length);
+    }
+    return chunk;
+}
+
+class Token extends TextBlock {
+
+    /**
+     * @param {string} text
+     * @param {number} start
+     * @param {number} length
+     * @param {string} kind
+     */
+    constructor(text, start, length, kind) {
+        super(new BlockRange(text, start, length), tokenKindToSimplified(text, start, length, kind));
+        this.kind = kind;
+    }
+
+    get value() {
+        return this.source;
+    }
+}
+
+/**
+ * \s+                                 whitespace
+ * \/\/.*                              single-line comment (slc)
+ * \/\*[\d\D]*?\*\/                    multi-line comment (mlc)
+ * "[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*"    string literal - correctly terminated but may contain invalid escapes
+ * ".*                                 unterminated string literal
+ * '\\?.?'?                            character literal - possibly unterminated and/or with invalid escape
+ * \.?\d                               number literal (start) - further processing extracts the value
+ * [\p{L}\p{N}_$]*                     word - keyword or identifier
+ * [;,?:(){}\[\]]                      single-character symbols and operators
+ * \.(\.\.)?                           . ...
+ *
+ * the operators: [!=/%*^]=?|<<?=?|>>?[>=]?|&[&=]?|\|[|=]?|\+(=|\++)?|\-+=?
+ *   [!=/%*^]=?    ! = / % * ^ != == /= %= *= ^=
+ *   <<?=?         < << <= <<=
+ *   >>?[>=]?      > >> >= >>> >>=
+ *   &[&=]?        & && &=
+ *   \|[|=]?       | || |=
+ *   (\+\+|--)     ++ -- postfix inc - only matches if immediately preceded by a word or a ]
+ *   [+-]=?        + - += -=
+ */
+/**
+ * @param {string} source
+ * @param {number} [offset]
+ * @param {number} [length]
+ */
+function tokenize(source, offset = 0, length = source.length) {
+    const text = source.slice(offset, offset + length);
+    const raw_token_re = /(\s+|\/\/.*|\/\*[\d\D]*?\*\/|\/\*[\d\D]*)|("[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*"|".*)|('\\u[\da-fA-F]{0,4}'?|'\\?.?'?)|(\.?\d)|([\p{L}\p{N}$_]+)|(\()|([;,?:(){}\[\]@]|\.(?:\.\.)?)|([!=/%*^]=?|<<?=?|>>?>?=?|&[&=]?|\|[|=]?|(\+\+|--)|[+-]=?|~)|$/gu;
+    const raw_token_types = [
+        'wsc',
+        'string-literal',
+        'char-literal',
+        'number-literal',
+        'word',
+        'open-bracket',
+        'symbol',
+        'operator',
+    ];
+    /**
+     * ```
+     * true|false                                           boolean
+     * this|super|null                                      object
+     * int|long|short|byte|float|double|char|boolean|void   primitive type
+     * new
+     * instanceof
+     * public|private|protected|static|final|abstract|native|volatile|transient   modifier
+     * if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized|assert   statement keyword
+     * class|enum|interface                                 type keyword
+     * extends|implements                                   extends/implements keyword
+     * package|import                                       package keyword
+     * \w+                                                  word
+     * ```
+     */
+    const word_re = /^(?:(true|false)|(this|super|null)|(int|long|short|byte|float|double|char|boolean|void)|(new)|(instanceof)|(public|private|protected|static|final|abstract|native|volatile|transient)|(if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized|assert)|(class|enum|interface)|(extends|implements)|(package|import)|(.+))$/;
+    const word_token_types = [
+        'boolean-literal',
+        'object-literal',
+        'primitive-type',
+        'new-operator',
+        'instanceof-operator',
+        'modifier',
+        'statement-kw',
+        'type-kw',
+        'extimp-kw',
+        'package-kw',
+        'ident'
+    ]
+    /**
+     * ```
+     * (?:\d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]?   decimal exponent: 1e0, 1.5e+10, 0.123E-20d
+     * (?:\d+\.\d*|\.\d+)[fFdD]?                     decimal number: 0.1, 12.34f, 7.D, .3
+     * 0x[\da-fA-F]*[lL]?                            hex integer: 0x1, 0xaBc, 0x, 0x7L
+     * \d+[fFdDlL]?                                  integer: 0, 123, 234f, 345L
+     * ```
+     * todo - underscore separators
+     */
+    const number_re = /((?:\d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]?)|((?:\d+\.\d*|\.\d+)[fFdD]?)|(0x[\da-fA-F]*[lL]?)|(\d+[fFdDlL]?)/g;
+    const number_token_types = [
+        'dec-exp-number-literal',
+        'dec-number-literal',
+        'hex-number-literal',
+        'int-number-literal',
+    ]
+    const tokens = [];
+    let lastindex = 0, m;
+    while (m = raw_token_re.exec(text)) {
+        // any text appearing between two matches is invalid
+        if (m.index > lastindex) {
+            tokens.push(new Token(source, offset + lastindex, m.index - lastindex, 'invalid'));
+        }
+        lastindex = m.index + m[0].length;
+        if (m.index >= text.length) {
+            // end of input
+            break;
+        }
+
+        let idx = m.findIndex((match,i) => i && match) - 1;
+        let tokentype = raw_token_types[idx];
+
+        switch(tokentype) {
+            case 'number-literal':
+                // we need to extract the exact number part
+                number_re.lastIndex = m.index;
+                m = number_re.exec(text);
+                idx = m.findIndex((match,i) => i && match) - 1;
+                tokentype = number_token_types[idx];
+                // update the raw_token_re position based on the length of the extracted number
+                raw_token_re.lastIndex = lastindex = number_re.lastIndex;
+                break;
+            case 'word':
+                // we need to work out what kind of keyword, literal or ident this is
+                let word_m = m[0].match(word_re);
+                idx = word_m.findIndex((match,i) => i && match) - 1;
+                tokentype = word_token_types[idx];
+                break;
+            case 'operator':
+                // find the operator-type
+                tokentype = getOperatorType(m[0]);
+                break;
+        }
+        tokens.push(new Token(source, offset + m.index, m[0].length, tokentype));
+    }
+
+    return tokens;
+}
+
+/**
+ * ```
+ * =|[/%*&|^+-]=|>>>?=|<<=   assignment
+ * \+\+|--                   inc
+ * [!=]=                     equality
+ * [<>]=?                    comparison
+ * [&|^]                     bitwise
+ * <<|>>>?                   shift
+ * &&|[|][|]                 logical
+ * [*%/]                     muldiv
+ * [+-]                      plumin
+ * [~!]                      unary
+ * ```
+ */
+const operator_re = /^(?:(=|[/%*&|^+-]=|>>>?=|<<=)|(\+\+|--)|([!=]=)|([<>]=?)|([&|^])|(<<|>>>?)|(&&|[|][|])|([*%/])|([+-])|([~!]))$/;
+/**
+ * @typedef {
+        'assignment-operator'|
+        'inc-operator'|
+        'equality-operator'|
+        'comparison-operator'|
+        'bitwise-operator'|
+        'shift-operator'|
+        'logical-operator'|
+        'muldiv-operator'|
+        'plumin-operator'|
+        'unary-operator'} OperatorKind
+ */
+/** @type {OperatorKind[]} */
+const operator_token_types = [
+    'assignment-operator',
+    'inc-operator',
+    'equality-operator',
+    'comparison-operator',
+    'bitwise-operator',
+    'shift-operator',
+    'logical-operator',
+    'muldiv-operator',
+    'plumin-operator',
+    'unary-operator',
+]
+/**
+ * @param {string} value
+ */
+function getOperatorType(value) {
+    const op_match = value.match(operator_re);
+    const idx = op_match.findIndex((match,i) => i && match) - 1;
+    // @ts-ignore
+    return operator_token_types[idx];
+}
+
+exports.getOperatorType = getOperatorType;
+exports.tokenize = tokenize;
+exports.Token = Token;
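
A short usage sketch of the new module - the input is an arbitrary Java snippet, and the expected kinds follow from the regexes above:

    const { tokenize } = require('./tokenizer'); // path relative to langserver/java

    const tokens = tokenize('int x = 1;');
    console.log(tokens.map(t => `${JSON.stringify(t.value)}:${t.kind}`).join(' '));
    // "int":primitive-type " ":wsc "x":ident " ":wsc "=":assignment-operator
    // " ":wsc "1":int-number-literal ";":symbol

    // Every character lands in some token (invalid text included), so the values
    // concatenate back to the original source - the round-trip property checked
    // by the commented-out benchmark removed from the body parser.
    console.assert(tokens.map(t => t.value).join('') === 'int x = 1;');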

langserver/tests/test-tokenizer.js (new file, 98 lines)
@@ -0,0 +1,98 @@
+const { tokenize } = require('../java/tokenizer');
+
+function testTokenize() {
+    const tests = [
+        // the basics
+        { src: 'i', r: [{value: 'i', kind:'ident'}] },
+        { src: '0', r: [{value: '0', kind:'int-number-literal'}] },
+        { src: `""`, r: [{value: `""`, kind:'string-literal'}] },
+        { src: `'x'`, r: [{value: `'x'`, kind:'char-literal'}] },
+        { src: `(`, r: [{value: `(`, kind:'open-bracket'}] },
+        ...'. , [ ] ? : @'.split(' ').map(symbol => ({ src: symbol, r: [{value: symbol, kind: 'symbol'}] })),
+        ...'= += -= *= /= %= >>= <<= &= |= ^='.split(' ').map(op => ({ src: op, r: [{value: op, kind:'assignment-operator'}] })),
+        ...'+ -'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'plumin-operator'}] })),
+        ...'* / %'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'muldiv-operator'}] })),
+        ...'# ¬'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'invalid'}] })),
+
+        // numbers - decimal with exponent
+        ...'0.0e+0 0.0E+0 0e+0 0e0 .0e0 0e0f 0e0d'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })),
+        // numbers - decimal with partial exponent
+        ...'0.0e+ 0.0E+ 0e+ 0e .0e 0ef 0ed'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })),
+        // numbers - not decimal exponent
+        { src: '0.0ea', r: [{value: '0.0e', kind:'dec-exp-number-literal'}, {value: 'a', kind:'ident'}] },
+
+        // numbers - decimal (no exponent)
+        ...'0.123 0. 0.f 0.0D .0 .0f .123D'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-number-literal'}] })),
+        // numbers - not decimal
+        { src: '0.a', r: [{value: '0.', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] },
+        { src: '0.0a', r: [{value: '0.0', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] },
+
+        // numbers - hex
+        ...'0x0 0x123456789abcdef 0xABCDEF 0xabcdefl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })),
+        // numbers - partial hex
+        ...'0x 0xl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })),
+
+        // numbers - decimal
+        ...'0 123456789 0l'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'int-number-literal'}] })),
+
+        // strings
+        ...[`"abc"`, `"\\n"`, `"\\""`].map(num => ({ src: num, r: [{value: num, kind:'string-literal'}] })),
+        // unterminated strings
+        ...[`"abc`, `"\\n`, `"\\"`, `"`].map(num => ({ src: num, r: [{value: num, kind:'unterminated-string-literal'}] })),
+        // strings cannot cross newlines
+        { src: `"abc\n`, r: [{value: `"abc`, kind:'unterminated-string-literal'}, {value: '\n', kind:'wsc'}] },
+
+        // characters
+        ...[`'a'`, `'\\n'`, `'\\''`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })),
+        // unterminated/invalid characters
+        ...[`'a`, `'\\n`, `'\\'`, `''`, `'`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })),
+        // characters cannot cross newlines
+        { src: `'\n`, r: [{value: `'`, kind:'char-literal'}, {value: '\n', kind:'wsc'}] },
+
+        // arity symbol
+        { src: `int...x`, r: [
+            {value: `int`, kind:'primitive-type'},
+            {value: `...`, kind:'symbol'},
+            {value: `x`, kind:'ident'},
+        ] },
+
+        // complex inc - the javac compiler doesn't bother to try and sensibly separate +++ - it just appears to
+        // prioritise ++ in every case, assuming that the developer will insert spaces as required.
+        // e.g. this first one fails to compile with javac
+        { src: '++abc+++def', r: [
+            {value: '++', kind:'inc-operator'},
+            {value: 'abc', kind:'ident'},
+            {value: '++', kind:'inc-operator'},
+            {value: '+', kind:'plumin-operator'},
+            {value: 'def', kind:'ident'},
+        ] },
+        // this should be ok
+        { src: '++abc+ ++def', r: [
+            {value: '++', kind:'inc-operator'},
+            {value: 'abc', kind:'ident'},
+            {value: '+', kind:'plumin-operator'},
+            {value: ' ', kind:'wsc'},
+            {value: '++', kind:'inc-operator'},
+            {value: 'def', kind:'ident'},
+        ] },
+    ]
+    const report = (test, msg) => {
+        console.log(JSON.stringify({test, msg}));
+    }
+    tests.forEach(t => {
+        const tokens = tokenize(t.src);
+        if (tokens.length !== t.r.length) {
+            report(t, `Wrong token count. Expected ${t.r.length}, got ${tokens.length}`);
+            return;
+        }
+        for (let i=0; i < tokens.length; i++) {
+            if (tokens[i].value !== t.r[i].value)
+                report(t, `Wrong token value. Expected ${t.r[i].value}, got ${tokens[i].value}`);
+            if (tokens[i].kind !== t.r[i].kind)
+                report(t, `Wrong token kind. Expected ${t.r[i].kind}, got ${tokens[i].kind}`);
+        }
+    })
+}
+
+testTokenize();
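
The harness only prints failures (as JSON lines via `report`), so a clean run produces no output. It can be run directly with Node, e.g. `node langserver/tests/test-tokenizer.js` from the repository root - the path is taken from the file header above.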