add new method body parser to use direct linear parsing

This commit is contained in:
Dave Holoway
2020-06-05 14:36:42 +01:00
parent 4f70cb0128
commit c43ba6ccf1
5 changed files with 2622 additions and 197 deletions

View File

@@ -36,193 +36,6 @@ class LocalVariable {
}
}
/**
 * Parse one expression out of `text`, beginning at offset `index`.
 *
 * @param {string} text source text containing the expression
 * @param {number} index offset in `text` at which parsing starts
 * @returns an object with the `parsed` expression and an `index` into `text`
 */
function extractExpression(text, index = 0) {
    const remaining = text.slice(index);
    const expression_text = new ExpressionText(remaining);
    const parsed = parse_expression(expression_text);
    // presumably parse_expression leaves the unconsumed text in `expr` - TODO confirm against ExpressionText
    const unparsed_offset = remaining.lastIndexOf(expression_text.expr);
    return {
        parsed,
        index: index + unparsed_offset,
    };
}
/**
 * Parse a method body
 *
 * The parser is an inside-out parser.
 * It works by tokenizing at the lowest level (comments, whitespace, identifiers, literals and symbols)
 * and works its way outward, grouping tokens together in larger and larger chunks that it recognises.
 *
 * Each stage is forgiving on what it accepts and syntax errors (unexpected or missing tokens) are noted along the way.
 * The final parse stage matches a set of statements - the highest-level concept of a method body.
 *
 * Once the parse is complete, all the complete expressions in the body can be type-resolved and checked.
 *
 * @param {string} text source text of the method body
 * @param {number} text_index offset added to match positions when Tokens are created
 * @returns the TextBlockArray holding the tokenized and grouped body
 */
function parseBody(text, text_index = 0) {
    const tokens = new TextBlockArray('body');
    // preprocess - replace each comment or whitespace run with a single space, blank out
    // the content of strings and add a newline after every semicolon
    text = text.replace(/(\/\/.*|\/\*[\D\d]*?\*\/|\s+)|(".+?")/g, (_,comment,str) =>
        str ?
        `"${' '.repeat(str.length-2)}"`
        : ' '
    ).replace(/;/g,';\n');
    const re = /(\s+)|(["'\d]|\b(?:true|false|null)\b)|\b(if|switch|while|else|for|case|default|do|try|finally|catch|return|break|continue)\b|(\bnew\b)|(\w+|\d+(?:\.\d*)?[eE][+-]?\w*|[!~+-])|([;{}():])|(.)/g;
    for (let m; m = re.exec(text);) {
        if (m[1]) {
            // ignore ws + comments
            continue;
        }
        if (m[2]) {
            // string, character, number, boolean or null literal - parse as an expression
            const { parsed, index } = extractExpression(text, m.index);
            tokens.blocks.push(new ParsedExpressionBlock(text, m.index, index - m.index, parsed));
            re.lastIndex = index;
            continue;
        }
        if (m[3]) {
            // statement keyword
            tokens.blocks.push(TextBlock.from(text, m.index, m[0].length));
            continue;
        }
        if (m[4]) {
            // new keyword - need extra handling because of anonymous types
            const { parsed, index } = extractExpression(text, m.index);
            tokens.blocks.push(new ParsedExpressionBlock(text, m.index, index - m.index, parsed));
            re.lastIndex = index;
            // without this the match falls through to the invalid-text case below
            continue;
        }
        if (m[5]) {
            // word - first check if this looks like a variable declaration
            const local_var_re = /(final +)?(\w+(?: *\. *\w+)*(?: *<.*?>)?(?: *\[ *\])*)( +)(\w+)( *\[ *\])*/g;
            local_var_re.lastIndex = m.index;
            const local_var_match = local_var_re.exec(text);
            if (local_var_match && local_var_match.index === m.index) {
                // it looks like a local variable declaration
                // groups: 1 = 'final ' prefix, 2 = type, 3 = spacing, 4 = name, 5 = trailing array brackets
                const decl = local_var_match;
                const final_length = (decl[1]||'').length;
                // the type begins after any 'final ' prefix
                const typeident = new TypeIdent([new Token(text_index + decl.index + final_length, decl[2], '', null)]);
                const local_var_decl = new LocalVariableDeclaration([], typeident);
                let name_token = new Token(text_index + decl.index + final_length + decl[2].length + decl[3].length, decl[4], '', null);
                // only create the post-array token when the name is followed by array brackets
                let postarray_token = decl[5] ? new Token(name_token.source_idx + decl[4].length, decl[5], '', null) : null;
                // vars is built newest-first (unshift) so vars[0] is always the latest variable
                const vars = [new LocalVariable(local_var_decl, name_token, postarray_token)];
                const next = /( *= *)|( *, *)(\w+)( *\[ *\])*/g;
                let lastIndex = local_var_re.lastIndex;
                for (;;) {
                    next.lastIndex = lastIndex;
                    const nm = next.exec(text);
                    // stop unless '=' or ',' follows immediately
                    if (!nm || nm.index !== lastIndex) {
                        break;
                    }
                    lastIndex = next.lastIndex;
                    if (nm[1]) {
                        vars[0].equals = new Token(text_index + nm.index + nm[0].indexOf('='), '=', '', null);
                        // variable initialiser
                        const { parsed, index } = extractExpression(text, next.lastIndex);
                        lastIndex = index;
                        vars[0].expression = parsed;
                    } else {
                        // another variable
                        vars[0].comma = new Token(text_index + nm.index + nm[0].indexOf(','), ',', '', null);
                        name_token = new Token(text_index + nm.index + nm[2].length, nm[3], '', null);
                        postarray_token = nm[4] ? new Token(name_token.source_idx + nm[3].length, nm[4], '', null) : null;
                        vars.unshift(new LocalVariable(local_var_decl, name_token, postarray_token));
                    }
                }
                // restore declaration order
                local_var_decl.vars = vars.reverse();
                tokens.blocks.push(new LocalVariableDeclBlock(text, local_var_match.index, lastIndex, local_var_decl));
                re.lastIndex = lastIndex;
                continue;
            }
            const { parsed, index } = extractExpression(text, m.index);
            tokens.blocks.push(new ParsedExpressionBlock(text, m.index, index - m.index, parsed));
            re.lastIndex = index;
            continue;
        }
        if (m[6]) {
            // brackets, scopes or semicolon
            tokens.blocks.push(TextBlock.from(text, m.index, m[0].length));
            continue;
        }
        // anything else is invalid
        tokens.blocks.push(new InvalidTextBlock(text, m.index, m[0].length));
    }
    // convert the tokens to their simplified form for grouping
    let sourcemap = tokens.sourcemap();
    // convert simple statements and expressions
    let chunks = [
        /(for)(\(?)([XY]?)(;?)(X?)(;?)(X?)(\)?)/g, // for -> F
        /(if)(\(?)(X?)(\)?)/g, // if -> I
        /(while)(\(?)(X?)(\)?)/g, // while -> W
        /(switch)(\(?)(X?)(\)?)/g, // switch -> P
        /(catch)(\(?)(V?)(\)?)/g, // catch -> C
        /(case)(X?)(:?)/g, // single case -> Q
        /(default)(:?)/g, // default case -> Q
        /(return|break|continue)(X?)(;?)/g, // return/break/continue -> S
        /(finally)/g, // finally block -> N
        /(else)/g, // else statement -> L
        /Y(;?)/g, // variable declaration -> V
        /X(;?)/g, // statement expression -> E
    ]
    let replacements = 'FIWPCQQSNLVE';
    let ids = 'for_hdr if_hdr while_hdr switch_hdr catch case default rbc finally else localvar expr'.split(' ');
    chunks.forEach((re,idx) => {
        re.lastIndex = 0;
        for (let m; m = re.exec(sourcemap.simplified);) {
            let start = sourcemap.map[m.index];
            let end = sourcemap.map[m.index + m[0].length];
            tokens.shrink(ids[idx], start, end - start, m, replacements[idx]);
            // the simplified text changed, so rebuild it and rescan from the start
            sourcemap = tokens.sourcemap();
            re.lastIndex = 0;
        }
    })
    chunks = [
        /\{([SBVE;]*)(\})/g, // statement block -> B
        /I([SBVE;])(L[SBVE;])?/g, // if (Expression) Statement/Block Else -> S
        /F[SBVE;]/g, // for loop -> S
        /P(\{)(Q+[SBVE]*)*(\}?)/g, // switch(Expression){ Q(caseblock),... } -> S
        /try(B)(C?B?)(N?B?)/g, // try, Block, catch/finally -> S
        /do(B)(W?)(;?)/g, // do Block While -> S
        /(?<!\})W[SVBE;]/g, // While -> S - this needs the no-pre-brace check to allow do-while to pair correctly
    ]
    replacements = 'BSSSSSS';
    ids = 'block if for switch try dowhile while'.split(' ');
    // keep applying the grouping patterns until a full pass changes nothing
    for (;;) {
        let old = sourcemap.simplified;
        chunks.forEach((re,idx) => {
            re.lastIndex = 0;
            for (let m; m = re.exec(sourcemap.simplified);) {
                let start = sourcemap.map[m.index];
                let end = sourcemap.map[m.index + m[0].length];
                tokens.shrink(ids[idx], start, end - start, m, replacements[idx]);
                sourcemap = tokens.sourcemap();
                re.lastIndex = 0;
            }
        })
        if (old === sourcemap.simplified) break;
    }
    return tokens;
}
class ParsedExpressionBlock extends TextBlock {
/**
* @param {string} source
@@ -261,6 +74,221 @@ class InvalidTextBlock extends TextBlock {
}
/**
 * Parse one expression out of `text`, beginning at offset `index`.
 *
 * @param {string} text source text containing the expression
 * @param {number} index offset in `text` at which parsing starts
 * @returns an object with the `parsed` expression and `nextIndex`, the length of
 * `text` minus the length of the text left in the ExpressionText's `expr`
 */
function extractExpression(text, index = 0) {
    const expression_text = new ExpressionText(text.slice(index));
    const parsed = parse_expression(expression_text);
    // presumably parse_expression leaves the unconsumed tail in `expr` - TODO confirm against ExpressionText
    const unparsed_length = expression_text.expr.length;
    return {
        parsed,
        nextIndex: text.length - unparsed_length,
    };
}
/**
 * Build a LocalVariableDeclaration from a regex match of `[final] Type name[]` and
 * continue scanning `text` for initialisers (`= expression`) and further
 * comma-separated variables.
 *
 * @param {RegExpExecArray} local_var_match match whose groups are:
 *   1 = optional 'final ' prefix, 2 = type, 3 = spacing, 4 = variable name, 5 = optional array brackets
 * @param {number} lastIndex index in `text` at which to look for the first `=` or `,`
 * @param {string} text the text being parsed
 * @param {number} text_index offset added to positions in `text` when Tokens are created
 * @returns an object with the `local_var_decl` and `nextIndex`, the index in `text` where scanning stopped
 */
function extractLocalVariableDeclaration(local_var_match, lastIndex, text, text_index) {
    const decl = local_var_match;
    const final_length = (decl[1]||'').length;
    // the type begins after any 'final ' prefix
    const typeident = new TypeIdent([new Token(text_index + decl.index + final_length, decl[2], '', null)]);
    const local_var_decl = new LocalVariableDeclaration([], typeident);
    let name_token = new Token(text_index + decl.index + final_length + decl[2].length + decl[3].length, decl[4], '', null);
    // only create the post-array token when the name is followed by array brackets
    let postarray_token = decl[5] ? new Token(name_token.source_idx + decl[4].length, decl[5], '', null) : null;
    // vars is built newest-first (unshift) so vars[0] is always the latest variable
    const vars = [
        new LocalVariable(local_var_decl, name_token, postarray_token)
    ];
    const next_variable_re = /(\s*=\s*)|(\s*,\s*)(\w+)(\s*\[\s*\])*/g;
    for (;;) {
        next_variable_re.lastIndex = lastIndex;
        const m = next_variable_re.exec(text);
        // stop unless '=' or ',' follows immediately
        if (!m || m.index !== lastIndex) {
            break;
        }
        lastIndex = next_variable_re.lastIndex;
        if (m[1]) {
            vars[0].equals = new Token(text_index + m.index + m[0].indexOf('='), '=', '', null);
            // variable initialiser
            const { parsed, nextIndex } = extractExpression(text, next_variable_re.lastIndex);
            lastIndex = nextIndex;
            vars[0].expression = parsed;
        } else {
            // another variable
            vars[0].comma = new Token(text_index + m.index + m[0].indexOf(','), ',', '', null);
            name_token = new Token(text_index + m.index + m[2].length, m[3], '', null);
            postarray_token = m[4] ? new Token(name_token.source_idx + m[3].length, m[4], '', null) : null;
            vars.unshift(new LocalVariable(local_var_decl, name_token, postarray_token));
        }
    }
    // restore declaration order
    local_var_decl.vars = vars.reverse();
    return {
        local_var_decl,
        nextIndex: lastIndex,
    };
}
/**
 * Parse a method body
 *
 * The parser is an inside-out parser.
 * It works by tokenizing at the lowest level (comments, whitespace, identifiers, literals and symbols)
 * and works its way outward, grouping tokens together in larger and larger chunks that it recognises.
 *
 * Each stage is forgiving on what it accepts and syntax errors (unexpected or missing tokens) are noted along the way.
 * The final parse stage should match a set of statements - the highest-level concept of a method body.
 *
 * Once the parse is complete, all the complete expressions in the body can be type-resolved and validated.
 *
 * @param {string} text source text of the method body
 * @param {number} text_index offset passed on to the local variable declaration parser
 * @returns the TextBlockArray holding the tokenized and grouped body
 */
function parseBody(text, text_index = 0) {
    const tokens = new TextBlockArray('body');
    // preprocess - blank out comments and the content of strings.
    // Every replacement has the same length as the text it replaces, so offsets are unchanged
    text = text.replace(/(\/\/.*|\/\*[\D\d]*?\*\/)|(".+?")/g, (_,comment,str) =>
        str ?
        `"${' '.repeat(str.length-2)}"`
        : comment.replace(/./g, ' ')
    );
    const re = /(\s+)|(["'\d]|\.\d|\b(?:true|false|null|new)\b)|(\()|\b(if|switch|while|else|for|catch|case|default|do|try|finally|return|break|continue|throw)\b|(\w+|\d+(?:\.\d*)?[eE][+-]?\w*|[!~+-])|([;{}():])|(.)/g;
    // sticky: a match can only begin exactly at the lastIndex set before each exec()
    const local_var_re = /(final +)?(\w+(?: *\. *\w+)*(?: *<(?:[a-zA-Z_]\w*|[<>\[\],.\s])*?>)?(?: *\[ *\])*)( +)(\w+)( *\[ *\])*/y;
    // anchored so that only a block which is exactly one of these keywords counts
    const branch_keyword_re = /^(?:if|for|while|switch|catch)$/;

    // parse an expression starting at `start`, add it as a block and resume tokenizing after it
    const addExpression = (start) => {
        const { parsed, nextIndex } = extractExpression(text, start);
        tokens.blocks.push(new ParsedExpressionBlock(text, start, nextIndex - start, parsed));
        re.lastIndex = nextIndex;
    };

    for (let m; m = re.exec(text);) {
        const [, whitespace, literal, open_bracket, keyword, word, punctuation] = m;
        if (whitespace) {
            // ignore ws + comments
            continue;
        }
        if (literal) {
            // string, character, number, boolean, null or new - parse as an expression
            addExpression(m.index);
            continue;
        }
        if (open_bracket) {
            // bracket - if the previous element was a branch keyword, tokenize it
            // otherwise parse it as an expression
            const prev = tokens.blocks[tokens.blocks.length - 1];
            if (prev && branch_keyword_re.test(prev.source)) {
                tokens.blocks.push(TextBlock.from(text, m.index, m[0].length));
            } else {
                addExpression(m.index);
            }
            continue;
        }
        if (keyword) {
            // statement keyword
            tokens.blocks.push(TextBlock.from(text, m.index, m[0].length));
            continue;
        }
        if (word) {
            // word - first check if this looks like a variable declaration
            local_var_re.lastIndex = m.index;
            const local_var_match = local_var_re.exec(text);
            // 'x instanceof Y' has the same shape as 'Type name' but is an expression
            if (local_var_match && local_var_match[4] !== 'instanceof') {
                const { local_var_decl, nextIndex } = extractLocalVariableDeclaration(local_var_match, local_var_re.lastIndex, text, text_index);
                tokens.blocks.push(new LocalVariableDeclBlock(text, local_var_match.index, nextIndex, local_var_decl));
                re.lastIndex = nextIndex;
                continue;
            }
            addExpression(m.index);
            continue;
        }
        if (punctuation) {
            // brackets, scopes or semicolon
            tokens.blocks.push(TextBlock.from(text, m.index, m[0].length));
            continue;
        }
        // anything else is invalid
        tokens.blocks.push(new InvalidTextBlock(text, m.index, m[0].length));
    }
    // convert the tokens to their simplified form for grouping
    let sourcemap = tokens.sourcemap();
    // convert simple statements and expressions
    let chunks = [
        // for-iterables must match up to the ':' - otherwise, they're treated as normal for-loops
        /(for)(\()(Y)(:)(X?)(\)?)/g, // for-iterable -> G
        /(for)(\(?)([XY]?)(;?)(X?)(;?)(X?)(\)?)/g, // for -> F
        /(if)(\(?)(X?)(\)?)/g, // if -> I
        /(while)(\(?)(X?)(\)?)/g, // while -> W
        /(switch)(\(?)(X?)(\)?)/g, // switch -> P
        /(catch)(\(?)(Y?)(\)?)/g, // catch -> C
        /(case)(X?)(:?)/g, // single case -> Q
        /(default)(:?)/g, // default case -> Q
        /(return|break|continue|throw)(X?)(;?)/g, // return/break/continue -> S
        /(finally)/g, // finally block -> N
        /(else)/g, // else statement -> L
        /Y(;?)/g, // variable declaration -> V
        /X(;?)/g, // statement expression -> E
    ]
    let replacements = 'GFIWPCQQSNLVE';
    let ids = 'fit_hdr for_hdr if_hdr while_hdr switch_hdr catch case default rbct finally else localvar expr'.split(' ');
    chunks.forEach((re,idx) => {
        re.lastIndex = 0;
        for (let m; m = re.exec(sourcemap.simplified);) {
            let start = sourcemap.map[m.index];
            let end = sourcemap.map[m.index + m[0].length];
            tokens.shrink(ids[idx], start, end - start, m, replacements[idx], null, false);
            // the simplified text changed, so rebuild it and rescan from the start
            sourcemap = tokens.sourcemap();
            re.lastIndex = 0;
        }
    })
    chunks = [
        /\{([SBVE;]*)(\})/g, // statement block -> B
        /I([SBVE;])(?!L)/g, // if (Expression) Statement -> S
        /I([SBVE;])(L[SBVE;])/g, // if (Expression) Statement/Block Else -> S
        /G[SBVE;]/g, // for-iterable loop -> S
        /F[SBVE;]/g, // for loop -> S
        /P(\{)(Q+[SBVE]*)*(\})/g, // switch(Expression){ Q(caseblock),... } -> S
        /try(B)(CB?)?(NB?)?/g, // try, Block, catch/finally -> S
        /do(B)(W?)(;?)/g, // do Block While -> S
        /(?<!\})W[SVBE;]/g, // While -> S - this needs the no-pre-brace check to allow do-while to pair correctly
    ]
    replacements = 'BSSSSSSSS';
    ids = 'block if ifelse fit for switch try dowhile while'.split(' ');
    // try the patterns in order; after any match restart from the first pattern.
    // The loop ends once no pattern matches the simplified text
    for (let i=0; i < chunks.length; ) {
        let re = chunks[i];
        re.lastIndex = 0;
        let m = re.exec(sourcemap.simplified);
        if (m) {
            let start = sourcemap.map[m.index];
            let end = sourcemap.map[m.index + m[0].length];
            tokens.shrink(ids[i], start, end - start, m, replacements[i], null, false);
            sourcemap = tokens.sourcemap();
            i = 0;
            continue;
        }
        i++;
    }
    return tokens;
}
module.exports = {
parseBody,
}

File diff suppressed because it is too large Load Diff

View File

@@ -114,11 +114,14 @@ class TextBlockArray {
* @param {RegExpMatchArray} match
* @param {string} marker
* @param {*} [parseClass]
* @param {boolean} [pad]
*/
shrink(id, start_block_idx, block_count, match, marker, parseClass) {
shrink(id, start_block_idx, block_count, match, marker, parseClass, pad=true) {
if (block_count <= 0) return;
const collapsed = new TextBlockArray(id, this.blocks.splice(start_block_idx, block_count, null));
const simplified = collapsed.source.replace(/./g, ' ').replace(/^./, marker);
const simplified = pad
? collapsed.source.replace(/./g, ' ').replace(/^./, marker)
: marker;
return this.blocks[start_block_idx] = parseClass
? new parseClass(collapsed, simplified, match)
: new TextBlock(collapsed, simplified);

View File

@@ -1,5 +1,5 @@
const { JavaType, CEIType, Method, Field, Parameter, TypeVariable, UnresolvedType } = require('java-mti');
const { ModuleBlock, TypeDeclBlock, FieldBlock, MethodBlock, ParameterBlock, TextBlock } = require('./parser9');
const { JavaType, CEIType, Constructor, Method, Field, Parameter, TypeVariable, UnresolvedType } = require('java-mti');
const { ModuleBlock, TypeDeclBlock, FieldBlock, ConstructorBlock, MethodBlock, ParameterBlock, TextBlock, TextBlockArray } = require('./parser9');
/**
*
@@ -44,17 +44,15 @@ class SourceType extends CEIType {
constructor(mod, type, qualified_type_name) {
super(type.shortSignature, type.kind(), mapmods(type), type.docs);
this._decl = type;
super.packageName = mod.packageName;
super.simpleTypeName = type.simpleName;
super.dottedTypeName = qualified_type_name.replace(/\$/g, '.');
super.fullyDottedRawName = type.fullyDottedName;
this._dottedTypeName = qualified_type_name.replace(/\$/g, '.');
this.extends_types = type.extends_decl ? extractTypeList(type.extends_decl) : []
this.extends_types = type.extends_decl ? extractTypeList(type.extends_decl) : [];
this.implements_types = type.implements_decl ? extractTypeList(type.implements_decl) : [];
this.implicit_extend = !this.extends_types.length && !this.implements_types.length ? [new ResolvableType({type: 'java.lang.Object', typeTokens:[]})] : [];
this.fields = type.fields.map(f => new SourceField(this, f));
this.methods = type.methods.map(m => new SourceMethod(this, m));
this.constructors = type.constructors.map(c => new SourceConstructor(this, c));
super.typevars = type.typevars.map(tv => {
const typevar = new TypeVariable(tv.name);
// automatically add the Object bound
@@ -63,6 +61,18 @@ class SourceType extends CEIType {
});
}
/**
* Returns the value stored in `_dottedTypeName`, which the constructor sets to
* the qualified type name with every `$` replaced by `.`
*/
get dottedTypeName() {
return this._dottedTypeName;
}
/**
* Returns the `fullyDottedName` of the type declaration held in `_decl`
*/
get fullyDottedRawName() {
return this._decl.fullyDottedName;
}
/**
* Returns the `fullyDottedName` of the type declaration held in `_decl`
* (the same value that `fullyDottedRawName` returns)
*/
get fullyDottedTypeName() {
return this._decl.fullyDottedName;
}
get supers() {
return [
...this.implicit_extend.map(t => t.resolved),
@@ -108,6 +118,38 @@ class SourceField extends Field {
}
}
class SourceConstructor extends Constructor {
    /**
     * @param {SourceType} owner the type this constructor belongs to
     * @param {ConstructorBlock} decl the constructor declaration
     */
    constructor(owner, decl) {
        const modifiers = mapmods(decl);
        super(modifiers, decl.docs);
        this._owner = owner;
        this._decl = decl;
        // one SourceParameter per declared parameter
        this._parameters = decl.parameters.map(param => new SourceParameter(param));
    }

    /**
     * The parameter type signatures joined together inside brackets, followed by `V`
     */
    get methodSignature() {
        const param_signatures = this._parameters.map(({ type }) => type.typeSignature);
        return `(${param_signatures.join('')})V`;
    }

    /**
     * @returns {SourceParameter[]}
     */
    get parameters() {
        return this._parameters;
    }

    /**
     * The return type of a constructor is its owning type
     * @returns {SourceType}
     */
    get returnType() {
        return this._owner;
    }
}
class SourceMethod extends Method {
/**
* @param {SourceType} owner
@@ -180,3 +222,7 @@ class ResolvableType extends UnresolvedType {
}
exports.SourceType = SourceType;
exports.SourceField = SourceField;
exports.SourceMethod = SourceMethod;
exports.SourceParameter = SourceParameter;
exports.SourceConstructor = SourceConstructor;

View File

@@ -1,9 +1,10 @@
const { JavaType } = require('java-mti');
const { ModuleBlock, TypeDeclBlock } = require('./parser9');
const { resolveImports } = require('../java/import-resolver');
const ResolvedImport = require('../java/parsetypes/resolved-import');
const { resolveType } = require('../java/type-resolver');
const { SourceType } = require('./source-type');
const { JavaType } = require('java-mti');
const { parseBody } = require('./body-parser3');
/**
@@ -55,6 +56,22 @@ function validate(mod, androidLibrary) {
resolveResolvableTypes(t, imports.resolved, imports.typemap);
});
let probs = [];
source_types.forEach(t => {
t.constructors.forEach(c => {
console.log(c.label);
const parsed = parseBody(c._owner._decl.mod.source, c, imports.resolved, androidLibrary);
if (parsed)
probs = probs.concat(parsed.problems)
})
t.methods.forEach(m => {
console.log(m.label);
const parsed = parseBody(m._owner._decl.mod.source, m, imports.resolved, androidLibrary);
if (parsed)
probs = probs.concat(parsed.problems)
})
})
const module_validaters = [
require('./validation/multiple-package-decls'),
require('./validation/unit-decl-order'),
@@ -71,6 +88,7 @@ function validate(mod, androidLibrary) {
];
let problems = [
module_validaters.map(v => v(mod, imports, source_types)),
...probs,
];
console.timeEnd('validation');