diff --git a/langserver/java/body-parser3.js b/langserver/java/body-parser3.js index ab6fd55..b7cee26 100644 --- a/langserver/java/body-parser3.js +++ b/langserver/java/body-parser3.js @@ -8,7 +8,7 @@ const { JavaType, CEIType, PrimitiveType, ArrayType, UnresolvedType, WildcardTyp const { SourceMethod, SourceConstructor } = require('./source-type'); const ResolvedImport = require('./parsetypes/resolved-import'); const ParseProblem = require('./parsetypes/parse-problem'); -const { TextBlock, BlockRange } = require('./parsetypes/textblock'); +const { getOperatorType, tokenize, Token } = require('./tokenizer'); /** * @typedef {SourceMethod|SourceConstructor} SourceMC @@ -2654,312 +2654,4 @@ class ResolvedIdent { } -/** - * \s+ whitespace - * \/\/.* single-line comment (slc) - * \/\*[\d\D]*?\*\/ multi-line comment (mlc) - * "[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*" string literal - correctly terminated but may contain invalid escapes - * ".* unterminated string literal - * '\\?.?'? character literal - possibly unterminated and/or with invalid escape - * \.?\d number literal (start) - further processing extracts the value - * [\p{L}\p{N}_$]* word - keyword or identifier - * [;,?:(){}\[\]] single-character symbols and operators - * \.(\.\.)? . ... - * - * the operators: [!=/%*^]=?|<>?[>=]?|&[&=]?|\|[|=]?|\+(=|\++)?|\-+=? - * [!=/%*^]=? ! = / % * ^ != == /= %= *= ^= - * <>?[>=]? > >> >= >>> >>= - * &[&=]? & && &= - * \|[|=]? | || |= - * (\+\+|--) ++ -- postfix inc - only matches if immediately preceded by a word or a ] - * [+-]=? 
+ - += -= - * - * - * - */ -/** - * - * @param {string} source - * @param {number} [offset] - * @param {number} [length] - */ -function tokenize(source, offset = 0, length = source.length) { - const text = source.slice(offset, offset + length); - const raw_token_re = /(\s+|\/\/.*|\/\*[\d\D]*?\*\/)|("[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*")|(".*)|('\\u[\da-fA-F]{0,4}'?|'\\?.?'?)|(\.?\d)|([\p{L}\p{N}$_]+)|(\()|([;,?:(){}\[\]@]|\.(?:\.\.)?)|([!=/%*^]=?|<>?>?=?|&[&=]?|\|[|=]?|(\+\+|--)|[+-]=?|~)|$/gu; - const raw_token_types = [ - 'wsc', - 'string-literal', - 'unterminated-string-literal', - 'char-literal', - 'number-literal', - 'word', - 'open-bracket', - 'symbol', - 'operator', - ]; - /** - * ``` - * true|false boolean - * this|null object - * int|long|short|byte|float|double|char|boolean|void primitive type - * new - * instanceof - * public|private|protected|static|final|abstract|native|volatile|transient modifier - * if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized statement keyword - * class|enum|interface type keyword - * package|import package keyword - * \w+ word - * ``` - */ - const word_re = /^(?:(true|false)|(this|super|null)|(int|long|short|byte|float|double|char|boolean|void)|(new)|(instanceof)|(public|private|protected|static|final|abstract|native|volatile|transient)|(if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized|assert)|(class|enum|interface)|(package|import)|(.+))$/; - const word_token_types = [ - 'boolean-literal', - 'object-literal', - 'primitive-type', - 'new-operator', - 'instanceof-operator', - 'modifier', - 'statement-kw', - 'type-kw', - 'package-kw', - 'ident' - ] - /** - * ``` - * \d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]? decimal exponent: 1e0, 1.5e+10, 0.123E-20d - * (?:\d+\.\d*|\.\d+)[fFdD]? decimal number: 0.1, 12.34f, 7.D, .3 - * 0x[\da-fA-F]*[lL]? hex integer: 0x1, 0xaBc, 0x, 0x7L - * \d+[fFdDlL]? 
integer: 0, 123, 234f, 345L - * ``` - * todo - underscore seperators - */ - const number_re = /((?:\d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]?)|((?:\d+\.\d*|\.\d+)[fFdD]?)|(0x[\da-fA-F]*[lL]?)|(\d+[fFdDlL]?)/g; - const number_token_types = [ - 'dec-exp-number-literal', - 'dec-number-literal', - 'hex-number-literal', - 'int-number-literal', - ] - const tokens = []; - let lastindex = 0, m; - while (m = raw_token_re.exec(text)) { - // any text appearing between two matches is invalid - if (m.index > lastindex) { - tokens.push(new Token(source, offset + lastindex, m.index - lastindex, 'invalid')); - } - lastindex = m.index + m[0].length; - if (m.index >= text.length) { - // end of input - break; - } - - let idx = m.findIndex((match,i) => i && match) - 1; - let tokentype = raw_token_types[idx]; - - switch(tokentype) { - case 'number-literal': - // we need to extract the exact number part - number_re.lastIndex = m.index; - m = number_re.exec(text); - idx = m.findIndex((match,i) => i && match) - 1; - tokentype = number_token_types[idx]; - // update the raw_token_re position based on the length of the extracted number - raw_token_re.lastIndex = lastindex = number_re.lastIndex; - break; - case 'word': - // we need to work out what kind of keyword, literal or ident this is - let word_m = m[0].match(word_re); - idx = word_m.findIndex((match,i) => i && match) - 1; - tokentype = word_token_types[idx]; - break; - case 'operator': - // find the operator-type - tokentype = getOperatorType(m[0]); - break; - } - tokens.push(new Token(source, offset + m.index, m[0].length, tokentype)); - } - - return tokens; -} - - -/** - * ``` - * =|[/%*&|^+-]=|>>>?=|<<= assignment - * \+\+|-- inc - * [!=]= equality - * [<>]=? comparison - * [&|^] bitwise - * <<|>>>? shift - * &&|[|][|] logical - * [*%/] muldiv - * [+-] plumin - * [~!] 
unary - * ``` - */ -const operator_re = /^(?:(=|[/%*&|^+-]=|>>>?=|<<=)|(\+\+|--)|([!=]=)|([<>]=?)|([&|^])|(<<|>>>?)|(&&|[|][|])|([*%/])|([+-])|([~!]))$/; -/** - * @typedef { - 'assignment-operator'| - 'inc-operator'| - 'equality-operator'| - 'comparison-operator'| - 'bitwise-operator'| - 'shift-operator'| - 'logical-operator'| - 'muldiv-operator'| - 'plumin-operator'| - 'unary-operator'} OperatorKind - */ -/** @type {OperatorKind[]} */ -const operator_token_types = [ - 'assignment-operator', - 'inc-operator', - 'equality-operator', - 'comparison-operator', - 'bitwise-operator', - 'shift-operator', - 'logical-operator', - 'muldiv-operator', - 'plumin-operator', - 'unary-operator', -] -/** - * @param {string} value - */ -function getOperatorType(value) { - const op_match = value.match(operator_re); - const idx = op_match.findIndex((match,i) => i && match) - 1; - // @ts-ignore - return operator_token_types[idx]; -} - -class Token extends TextBlock { - - /** - * - * @param {string} text - * @param {number} start - * @param {number} length - * @param {string} kind - */ - constructor(text, start, length, kind) { - super(new BlockRange(text, start, length), null); - this.kind = kind; - } - - get value() { - return this.source; - } -} - -function testTokenize() { - const tests = [ - // the basics - { src: 'i', r: [{value: 'i', kind:'ident'}] }, - { src: '0', r: [{value: '0', kind:'int-number-literal'}] }, - { src: `""`, r: [{value: `""`, kind:'string-literal'}] }, - { src: `'x'`, r: [{value: `'x'`, kind:'char-literal'}] }, - { src: `(`, r: [{value: `(`, kind:'open-bracket'}] }, - ...'. , [ ] ? 
: @'.split(' ').map(symbol => ({ src: symbol, r: [{value: symbol, kind: 'symbol'}] })), - ...'= += -= *= /= %= >>= <<= &= |= ^='.split(' ').map(op => ({ src: op, r: [{value: op, kind:'assignment-operator'}] })), - ...'+ -'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'plumin-operator'}] })), - ...'* / %'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'muldiv-operator'}] })), - ...'# ¬'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'invalid'}] })), - - // numbers - decimal with exponent - ...'0.0e+0 0.0E+0 0e+0 0e0 .0e0 0e0f 0e0d'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })), - // numbers - decimal with partial exponent - ...'0.0e+ 0.0E+ 0e+ 0e .0e 0ef 0ed'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })), - // numbers - not decimal exponent - { src: '0.0ea', r: [{value: '0.0e', kind:'dec-exp-number-literal'}, {value: 'a', kind:'ident'}] }, - - // numbers - decimal (no exponent) - ...'0.123 0. 
0.f 0.0D .0 .0f .123D'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-number-literal'}] })), - // numbers - not decimal - { src: '0.a', r: [{value: '0.', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] }, - { src: '0.0a', r: [{value: '0.0', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] }, - - // numbers - hex - ...'0x0 0x123456789abcdef 0xABCDEF 0xabcdefl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })), - // numbers - partial hex - ...'0x 0xl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })), - - // numbers - decimal - ...'0 123456789 0l'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'int-number-literal'}] })), - - // strings - ...[`"abc"`, `"\\n"`, `"\\""`].map(num => ({ src: num, r: [{value: num, kind:'string-literal'}] })), - // unterminated strings - ...[`"abc`, `"\\n`, `"\\"`, `"`].map(num => ({ src: num, r: [{value: num, kind:'unterminated-string-literal'}] })), - // strings cannot cross newlines - { src: `"abc\n`, r: [{value: `"abc`, kind:'unterminated-string-literal'}, {value: '\n', kind:'wsc'}] }, - - // characters - ...[`'a'`, `'\\n'`, `'\\''`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })), - // unterminated/invalid characters - ...[`'a`, `'\\n`, `'\\'`, `''`, `'`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })), - // characters cannot cross newlines - { src: `'\n`, r: [{value: `'`, kind:'char-literal'}, {value: '\n', kind:'wsc'}] }, - - // arity symbol - { src: `int...x`, r: [ - {value: `int`, kind:'primitive-type'}, - {value: `...`, kind:'symbol'}, - {value: `x`, kind:'ident'}, - ],}, - - // complex inc - the javac compiler doesn't bother to try and sensibly separate +++ - it just appears to - // prioritise ++ in every case, assuming that the developer will insert spaces as required. 
- // e.g this first one fails to compile with javac - { src: '++abc+++def', r: [ - {value: '++', kind:'inc-operator'}, - {value: 'abc', kind:'ident'}, - {value: '++', kind:'inc-operator'}, - {value: '+', kind:'plumin-operator'}, - {value: 'def', kind:'ident'}, - ] }, - // this should be ok - { src: '++abc+ ++def', r: [ - {value: '++', kind:'inc-operator'}, - {value: 'abc', kind:'ident'}, - {value: '+', kind:'plumin-operator'}, - {value: ' ', kind:'wsc'}, - {value: '++', kind:'inc-operator'}, - {value: 'def', kind:'ident'}, - ] }, - ] - const report = (test, msg) => { - console.log(JSON.stringify({test, msg})); - } - tests.forEach(t => { - const tokens = tokenize(t.src); - if (tokens.length !== t.r.length) { - report(t, `Wrong token count. Expected ${t.r.length}, got ${tokens.length}`); - return; - } - for (let i=0; i < tokens.length; i++) { - if (tokens[i].value !== t.r[i].value) - report(t, `Wrong token value. Expected ${t.r[i].value}, got ${tokens[i].value}`); - if (tokens[i].kind !== t.r[i].kind) - report(t, `Wrong token kind. Expected ${t.r[i].kind}, got ${tokens[i].kind}`); - } - }) -} - - -testTokenize(); - -// const s = require('fs').readFileSync('/home/dave/dev/vscode/android-dev-ext/langserver/tests/java-files/View-25.java', 'utf8'); -// console.time(); -// const tokens = tokenize(s); -// console.timeEnd(); -// if (tokens.map(t => t.value).join('') !== s) { -// console.log('mismatch'); -// } - -// testTokenize(); - exports.parseBody = parseBody; diff --git a/langserver/java/parser9.js b/langserver/java/parser9.js index 9cf9f9f..2c1a031 100644 --- a/langserver/java/parser9.js +++ b/langserver/java/parser9.js @@ -1,4 +1,5 @@ const { TextBlock, TextBlockArray } = require('./parsetypes/textblock'); +const { tokenize, Token } = require('./tokenizer'); /** * Normalises comments, whitespace, string and character literals. 
@@ -205,109 +206,6 @@ function parse2(source) { } -/** - * @param {string} source - */ -function tokenize(source) { - const blocks = []; - const re = /(\/\*[\d\D]*?\*\/)|(\/\/.*)|(\s+)|([a-zA-Z_]\w*)|("[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*")|('\\?.')|(\d\w*)|(::|\.{3}|[(){}\[\];,.@])|([=!~*/%^]=?|[?:]|>>?>?=?|<]?)|(.)|$/g; - let lastIndex = 0; - for (let m; m = re.exec(source);) { - if (m.index > lastIndex) { - blocks.push(TextBlock.from(source, lastIndex, m.index-lastIndex)); - throw "er" - } - lastIndex = m.index + m[0].length; - const len = m[0].length; - if (m[1]) { - // mlc - // - MLCs are replaced with tab instead of space. This makes them easy to differentiate (for JavaDocs) - // whilst still being treated as general whitespace. - const mlc = TextBlock.from(source, m.index, len, m[0].replace(/./g, '\t')); - blocks.push(mlc); - continue; - } - if (m[2]) { - // slc - const slc = TextBlock.from(source, m.index, len, m[0].replace(/./g, ' ')); - blocks.push(slc); - continue; - } - if (m[3]) { - // whitespace (other than space and newline) - const ws = TextBlock.from(source, m.index, len, m[0].replace(/./g, ' ')); - blocks.push(ws); - continue; - } - if (m[4]) { - // ident or keyword - const KEYWORDS = /^(assert|break|case|catch|class|const|continue|do|else|enum|extends|finally|for|goto|if|implements|import|interface|new|package|return|super|switch|throw|throws|try|while)$/; - const MODIFIER_KEYWORDS = /^(abstract|final|native|private|protected|public|static|strictfp|synchronized|transient|volatile|default)$/; - const PRIMITIVE_TYPE_KEYWORDS = /^(int|boolean|byte|char|double|float|long|short|void)$/ - const LITERAL_VALUE_KEYWORDS = /^(this|true|false|null)$/; - const OPERATOR_KEYWORDS = /^(instanceof)$/; - let simplified; - let space = ' '.repeat(len-1); - if (KEYWORDS.test(m[0])) { - - } else if (MODIFIER_KEYWORDS.test(m[0])) { - simplified = 'M' + space; - } else if (PRIMITIVE_TYPE_KEYWORDS.test(m[0])) { - simplified = 'P' + space; - } else if 
(LITERAL_VALUE_KEYWORDS.test(m[0])) { - simplified = 'W' + space; - } else if (OPERATOR_KEYWORDS.test(m[0])) { - simplified = m[0]; - } else { - simplified = 'W' + space; - } - const word = TextBlock.from(source, m.index, len, simplified); - blocks.push(word); - continue; - } - if (m[5]) { - // string literal - const str = TextBlock.from(source, m.index, len, `"${'#'.repeat(m[0].length - 2)}"`); - blocks.push(str); - continue; - } - if (m[6]) { - // char literal - const char = TextBlock.from(source, m.index, len, `'#'`); - blocks.push(char); - continue; - } - if (m[7]) { - // number literal - const number = TextBlock.from(source, m.index, len, `0${' '.repeat(m[0].length-1)}`); - blocks.push(number); - continue; - } - if (m[8]) { - // separator - const separator = TextBlock.from(source, m.index, m[0].length); - blocks.push(separator); - continue; - } - if (m[9]) { - // operator - const operator = TextBlock.from(source, m.index, m[0].length); - blocks.push(operator); - continue; - } - if (m[10]) { - // invalid source char - const invalid = TextBlock.from(source, m.index, m[0].length); - blocks.push(invalid); - continue; - } - // end of file - break; - } - - return blocks; -} - const markers = { arrayQualifier: 'A', blocks: 'B', @@ -331,6 +229,7 @@ const markers = { typeArgs: 'T', enumvalues: 'U', varDecl: 'V', + ident: 'W', typeDecl: 'Z', error: ' ', } diff --git a/langserver/java/tokenizer.js b/langserver/java/tokenizer.js new file mode 100644 index 0000000..6227efd --- /dev/null +++ b/langserver/java/tokenizer.js @@ -0,0 +1,246 @@ +const { TextBlock, BlockRange } = require('./parsetypes/textblock'); + +/** + * Convert a token to its simplified form for easier declaration parsing. + * + * - Whitespace, comments, strings and character literals are normalised. + * - Modifier keywords and identifers are abbreviated. + * - Any invalid text is replaced with spaces. 
+ * + * Abbreviated and normalised values are padded to occupy the same space + * as the original text - this ensures any parse errors are reported in the + * correct location. + * @param {string} text + * @param {number} start + * @param {number} length + * @param {string} kind + */ +function tokenKindToSimplified(text, start, length, kind) { + const chunk = text.slice(start, start + length); + switch (kind) { + case 'wsc': + return chunk.replace(/[^\r\n]/g, ' '); + case 'string-literal': + if (chunk.length <= 2) return chunk; + return `"${'#'.repeat(chunk.length - 2)}"`; + case 'char-literal': + if (chunk.length <= 2) return chunk; + return `'${'#'.repeat(chunk.length - 2)}'`; + case 'primitive-type': + return `P${' '.repeat(chunk.length - 1)}`; + case 'modifier': + return `M${' '.repeat(chunk.length - 1)}`; + case 'ident': + return `W${' '.repeat(chunk.length - 1)}`; + case 'invalid': + return ' '.repeat(chunk.length); + } + return chunk; +} + +class Token extends TextBlock { + + /** + * @param {string} text + * @param {number} start + * @param {number} length + * @param {string} kind + */ + constructor(text, start, length, kind) { + super(new BlockRange(text, start, length), tokenKindToSimplified(text, start, length, kind)); + this.kind = kind; + } + + get value() { + return this.source; + } +} + + +/** + * \s+ whitespace + * \/\/.* single-line comment (slc) + * \/\*[\d\D]*?\*\/ multi-line comment (mlc) + * "[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*" string literal - correctly terminated but may contain invalid escapes + * ".* unterminated string literal + * '\\?.?'? character literal - possibly unterminated and/or with invalid escape + * \.?\d number literal (start) - further processing extracts the value + * [\p{L}\p{N}_$]* word - keyword or identifier + * [;,?:(){}\[\]] single-character symbols and operators + * \.(\.\.)? . ... + * + * the operators: [!=/%*^]=?|<>?[>=]?|&[&=]?|\|[|=]?|\+(=|\++)?|\-+=? + * [!=/%*^]=? ! = / % * ^ != == /= %= *= ^= + * <>?[>=]? 
> >> >= >>> >>= + * &[&=]? & && &= + * \|[|=]? | || |= + * (\+\+|--) ++ -- postfix inc - only matches if immediately preceded by a word or a ] + * [+-]=? + - += -= + * + * + * + */ + +/** + * + * @param {string} source + * @param {number} [offset] + * @param {number} [length] + */ +function tokenize(source, offset = 0, length = source.length) { + const text = source.slice(offset, offset + length); + const raw_token_re = /(\s+|\/\/.*|\/\*[\d\D]*?\*\/|\/\*[\d\D]*)|("[^\r\n\\"]*(?:\\.[^\r\n\\"]*)*"|".*)|('\\u[\da-fA-F]{0,4}'?|'\\?.?'?)|(\.?\d)|([\p{L}\p{N}$_]+)|(\()|([;,?:(){}\[\]@]|\.(?:\.\.)?)|([!=/%*^]=?|<>?>?=?|&[&=]?|\|[|=]?|(\+\+|--)|[+-]=?|~)|$/gu; + const raw_token_types = [ + 'wsc', + 'string-literal', + 'char-literal', + 'number-literal', + 'word', + 'open-bracket', + 'symbol', + 'operator', + ]; + /** + * ``` + * true|false boolean + * this|null object + * int|long|short|byte|float|double|char|boolean|void primitive type + * new + * instanceof + * public|private|protected|static|final|abstract|native|volatile|transient modifier + * if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized statement keyword + * class|enum|interface type keyword + * package|import package keyword + * \w+ word + * ``` + */ + const word_re = /^(?:(true|false)|(this|super|null)|(int|long|short|byte|float|double|char|boolean|void)|(new)|(instanceof)|(public|private|protected|static|final|abstract|native|volatile|transient)|(if|else|while|for|do|try|catch|finally|switch|case|default|return|break|continue|throw|synchronized|assert)|(class|enum|interface)|(extends|implements)|(package|import)|(.+))$/; + const word_token_types = [ + 'boolean-literal', + 'object-literal', + 'primitive-type', + 'new-operator', + 'instanceof-operator', + 'modifier', + 'statement-kw', + 'type-kw', + 'package-kw', + 'extimp-kw', + 'ident' + ] + /** + * ``` + * \d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]? 
decimal exponent: 1e0, 1.5e+10, 0.123E-20d + * (?:\d+\.\d*|\.\d+)[fFdD]? decimal number: 0.1, 12.34f, 7.D, .3 + * 0x[\da-fA-F]*[lL]? hex integer: 0x1, 0xaBc, 0x, 0x7L + * \d+[fFdDlL]? integer: 0, 123, 234f, 345L + * ``` + * todo - underscore separators + */ + const number_re = /((?:\d+(?:\.?\d*)?|\.\d+)[eE][+-]?\d*[fFdD]?)|((?:\d+\.\d*|\.\d+)[fFdD]?)|(0x[\da-fA-F]*[lL]?)|(\d+[fFdDlL]?)/g; + const number_token_types = [ + 'dec-exp-number-literal', + 'dec-number-literal', + 'hex-number-literal', + 'int-number-literal', + ] + const tokens = []; + let lastindex = 0, m; + while (m = raw_token_re.exec(text)) { + // any text appearing between two matches is invalid + if (m.index > lastindex) { + tokens.push(new Token(source, offset + lastindex, m.index - lastindex, 'invalid')); + } + lastindex = m.index + m[0].length; + if (m.index >= text.length) { + // end of input + break; + } + + let idx = m.findIndex((match,i) => i && match) - 1; + let tokentype = raw_token_types[idx]; + + switch(tokentype) { + case 'number-literal': + // we need to extract the exact number part + number_re.lastIndex = m.index; + m = number_re.exec(text); + idx = m.findIndex((match,i) => i && match) - 1; + tokentype = number_token_types[idx]; + // update the raw_token_re position based on the length of the extracted number + raw_token_re.lastIndex = lastindex = number_re.lastIndex; + break; + case 'word': + // we need to work out what kind of keyword, literal or ident this is + let word_m = m[0].match(word_re); + idx = word_m.findIndex((match,i) => i && match) - 1; + tokentype = word_token_types[idx]; + break; + case 'operator': + // find the operator-type + tokentype = getOperatorType(m[0]); + break; + } + tokens.push(new Token(source, offset + m.index, m[0].length, tokentype)); + } + + return tokens; +} + + +/** + * ``` + * =|[/%*&|^+-]=|>>>?=|<<= assignment + * \+\+|-- inc + * [!=]= equality + * [<>]=? comparison + * [&|^] bitwise + * <<|>>>?
shift + * &&|[|][|] logical + * [*%/] muldiv + * [+-] plumin + * [~!] unary + * ``` + */ +const operator_re = /^(?:(=|[/%*&|^+-]=|>>>?=|<<=)|(\+\+|--)|([!=]=)|([<>]=?)|([&|^])|(<<|>>>?)|(&&|[|][|])|([*%/])|([+-])|([~!]))$/; +/** + * @typedef { + 'assignment-operator'| + 'inc-operator'| + 'equality-operator'| + 'comparison-operator'| + 'bitwise-operator'| + 'shift-operator'| + 'logical-operator'| + 'muldiv-operator'| + 'plumin-operator'| + 'unary-operator'} OperatorKind + */ +/** @type {OperatorKind[]} */ +const operator_token_types = [ + 'assignment-operator', + 'inc-operator', + 'equality-operator', + 'comparison-operator', + 'bitwise-operator', + 'shift-operator', + 'logical-operator', + 'muldiv-operator', + 'plumin-operator', + 'unary-operator', +] +/** + * @param {string} value + */ +function getOperatorType(value) { + const op_match = value.match(operator_re); + const idx = op_match.findIndex((match,i) => i && match) - 1; + // @ts-ignore + return operator_token_types[idx]; +} + + +exports.getOperatorType = getOperatorType; +exports.tokenize = tokenize; +exports.Token = Token; diff --git a/langserver/tests/test-tokenizer.js b/langserver/tests/test-tokenizer.js new file mode 100644 index 0000000..0a6828a --- /dev/null +++ b/langserver/tests/test-tokenizer.js @@ -0,0 +1,98 @@ +const { tokenize } = require('../java/tokenizer'); + +function testTokenize() { + const tests = [ + // the basics + { src: 'i', r: [{value: 'i', kind:'ident'}] }, + { src: '0', r: [{value: '0', kind:'int-number-literal'}] }, + { src: `""`, r: [{value: `""`, kind:'string-literal'}] }, + { src: `'x'`, r: [{value: `'x'`, kind:'char-literal'}] }, + { src: `(`, r: [{value: `(`, kind:'open-bracket'}] }, + ...'. , [ ] ? 
: @'.split(' ').map(symbol => ({ src: symbol, r: [{value: symbol, kind: 'symbol'}] })), + ...'= += -= *= /= %= >>= <<= &= |= ^='.split(' ').map(op => ({ src: op, r: [{value: op, kind:'assignment-operator'}] })), + ...'+ -'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'plumin-operator'}] })), + ...'* / %'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'muldiv-operator'}] })), + ...'# ¬'.split(' ').map(op => ({ src: op, r: [{value: op, kind:'invalid'}] })), + + // numbers - decimal with exponent + ...'0.0e+0 0.0E+0 0e+0 0e0 .0e0 0e0f 0e0d'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })), + // numbers - decimal with partial exponent + ...'0.0e+ 0.0E+ 0e+ 0e .0e 0ef 0ed'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-exp-number-literal'}] })), + // numbers - not decimal exponent + { src: '0.0ea', r: [{value: '0.0e', kind:'dec-exp-number-literal'}, {value: 'a', kind:'ident'}] }, + + // numbers - decimal (no exponent) + ...'0.123 0. 
0.f 0.0D .0 .0f .123D'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'dec-number-literal'}] })), + // numbers - not decimal + { src: '0.a', r: [{value: '0.', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] }, + { src: '0.0a', r: [{value: '0.0', kind:'dec-number-literal'}, {value: 'a', kind:'ident'}] }, + + // numbers - hex + ...'0x0 0x123456789abcdef 0xABCDEF 0xabcdefl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })), + // numbers - partial hex + ...'0x 0xl'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'hex-number-literal'}] })), + + // numbers - decimal + ...'0 123456789 0l'.split(' ').map(num => ({ src: num, r: [{value: num, kind:'int-number-literal'}] })), + + // strings + ...[`"abc"`, `"\\n"`, `"\\""`].map(num => ({ src: num, r: [{value: num, kind:'string-literal'}] })), + // unterminated strings + ...[`"abc`, `"\\n`, `"\\"`, `"`].map(num => ({ src: num, r: [{value: num, kind:'unterminated-string-literal'}] })), + // strings cannot cross newlines + { src: `"abc\n`, r: [{value: `"abc`, kind:'unterminated-string-literal'}, {value: '\n', kind:'wsc'}] }, + + // characters + ...[`'a'`, `'\\n'`, `'\\''`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })), + // unterminated/invalid characters + ...[`'a`, `'\\n`, `'\\'`, `''`, `'`].map(num => ({ src: num, r: [{value: num, kind:'char-literal'}] })), + // characters cannot cross newlines + { src: `'\n`, r: [{value: `'`, kind:'char-literal'}, {value: '\n', kind:'wsc'}] }, + + // arity symbol + { src: `int...x`, r: [ + {value: `int`, kind:'primitive-type'}, + {value: `...`, kind:'symbol'}, + {value: `x`, kind:'ident'}, + ],}, + + // complex inc - the javac compiler doesn't bother to try and sensibly separate +++ - it just appears to + // prioritise ++ in every case, assuming that the developer will insert spaces as required. 
+ // e.g. this first one fails to compile with javac + { src: '++abc+++def', r: [ + {value: '++', kind:'inc-operator'}, + {value: 'abc', kind:'ident'}, + {value: '++', kind:'inc-operator'}, + {value: '+', kind:'plumin-operator'}, + {value: 'def', kind:'ident'}, + ] }, + // this should be ok + { src: '++abc+ ++def', r: [ + {value: '++', kind:'inc-operator'}, + {value: 'abc', kind:'ident'}, + {value: '+', kind:'plumin-operator'}, + {value: ' ', kind:'wsc'}, + {value: '++', kind:'inc-operator'}, + {value: 'def', kind:'ident'}, + ] }, + ] + const report = (test, msg) => { + console.log(JSON.stringify({test, msg})); + } + tests.forEach(t => { + const tokens = tokenize(t.src); + if (tokens.length !== t.r.length) { + report(t, `Wrong token count. Expected ${t.r.length}, got ${tokens.length}`); + return; + } + for (let i=0; i < tokens.length; i++) { + if (tokens[i].value !== t.r[i].value) + report(t, `Wrong token value. Expected ${t.r[i].value}, got ${tokens[i].value}`); + if (tokens[i].kind !== t.r[i].kind) + report(t, `Wrong token kind. Expected ${t.r[i].kind}, got ${tokens[i].kind}`); + } + }) +} + + +testTokenize();