Skip to content

Commit

Permalink
Replace pure function invocations in path expressions with their result
Browse files Browse the repository at this point in the history
In path expressions, we generally need to evaluate functions against every
node that we consider for the result set. For example, in the path
expression /files/etc/hosts/*[ipaddr =~ regexp('127\\.')], the regexp
function was evaluated against every entry in /etc/hosts. Evaluating that
function requires the construction and compilation of a new regexp. Because
of how memory is managed during evaluation of path expressions, the memory
used by all these copies of the same regexp is only freed after we are done
evaluating the path expression. This causes unacceptable memory usage in
large files (see #569).

To avoid these issues, we now distinguish between pure and impure functions
in the path expression interpreter. When we encounter a pure function, we
change the AST for the path expression so that the function invocation is
replaced with the result of invoking the function. With the example above,
that means we only construct and compile the regexp '127\\.' once,
regardless of how many nodes it gets checked against. That leads to a
dramatic reduction in the memory required to evaluate path expressions with
such constructs against large files.

Fixes #569
  • Loading branch information
lutter committed Aug 22, 2018
1 parent aab2069 commit bbf31f7
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 14 deletions.
2 changes: 2 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
- General changes/additions
* augmatch: add a --quiet option; make the exit status useful to tell
whether there was a match or not
* Drastically reduce the amount of memory needed to evaluate complex
path expressions against large files (Issue #569)
- API changes
* aug_source did not in fact return the source; it always returned
NULL instead. That has been fixed.
Expand Down
55 changes: 41 additions & 14 deletions src/pathx.c
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,10 @@ struct expr {
struct { /* E_APP */
const struct func *func;
struct expr **args;
/* If fold is true, replace this function invocation
* with its value after the first time we evaluate this
* expression */
bool fold;
};
};
};
Expand Down Expand Up @@ -282,6 +286,7 @@ struct func {
const char *name;
unsigned int arity;
enum type type;
bool pure; /* Result only depends on args */
const enum type *arg_types;
func_impl_t impl;
};
Expand All @@ -304,40 +309,40 @@ static const enum type arg_types_nodeset_string[] = { T_NODESET, T_STRING };

static const struct func builtin_funcs[] = {
{ .name = "last", .arity = 0, .type = T_NUMBER, .arg_types = NULL,
.impl = func_last },
.impl = func_last, .pure = false },
{ .name = "position", .arity = 0, .type = T_NUMBER, .arg_types = NULL,
.impl = func_position },
.impl = func_position, .pure = false },
{ .name = "label", .arity = 0, .type = T_STRING, .arg_types = NULL,
.impl = func_label },
.impl = func_label, .pure = false },
{ .name = "count", .arity = 1, .type = T_NUMBER,
.arg_types = arg_types_nodeset,
.impl = func_count },
.impl = func_count, .pure = false },
{ .name = "regexp", .arity = 1, .type = T_REGEXP,
.arg_types = arg_types_string,
.impl = func_regexp },
.impl = func_regexp, .pure = true },
{ .name = "regexp", .arity = 1, .type = T_REGEXP,
.arg_types = arg_types_nodeset,
.impl = func_regexp },
.impl = func_regexp, .pure = true },
{ .name = "regexp", .arity = 2, .type = T_REGEXP,
.arg_types = arg_types_string_string,
.impl = func_regexp_flag },
.impl = func_regexp_flag, .pure = true },
{ .name = "regexp", .arity = 2, .type = T_REGEXP,
.arg_types = arg_types_nodeset_string,
.impl = func_regexp_flag },
.impl = func_regexp_flag, .pure = true },
{ .name = "glob", .arity = 1, .type = T_REGEXP,
.arg_types = arg_types_string,
.impl = func_glob },
.impl = func_glob, .pure = true },
{ .name = "glob", .arity = 1, .type = T_REGEXP,
.arg_types = arg_types_nodeset,
.impl = func_glob },
.impl = func_glob, .pure = true },
{ .name = "int", .arity = 1, .type = T_NUMBER,
.arg_types = arg_types_string, .impl = func_int },
.arg_types = arg_types_string, .impl = func_int, .pure = false },
{ .name = "int", .arity = 1, .type = T_NUMBER,
.arg_types = arg_types_nodeset, .impl = func_int },
.arg_types = arg_types_nodeset, .impl = func_int, .pure = false },
{ .name = "int", .arity = 1, .type = T_NUMBER,
.arg_types = arg_types_bool, .impl = func_int },
.arg_types = arg_types_bool, .impl = func_int, .pure = false },
{ .name = "not", .arity = 1, .type = T_BOOLEAN,
.arg_types = arg_types_bool, .impl = func_not }
.arg_types = arg_types_bool, .impl = func_not, .pure = true }
};

#define RET_ON_ERROR \
Expand Down Expand Up @@ -1409,6 +1414,16 @@ static void eval_expr(struct expr *expr, struct state *state) {
break;
case E_APP:
eval_app(expr, state);
if (expr->fold) {
/* Do constant folding: replace the function application with
* a reference to the value that resulted from evaluating it */
for (int i=0; i < expr->func->arity; i++)
free_expr(expr->args[i]);
free(expr->args);
value_ind_t vind = state->values_used - 1;
expr->tag = E_VALUE;
expr->value_ind = state->values[vind];
}
break;
default:
assert(0);
Expand Down Expand Up @@ -1493,6 +1508,18 @@ static void check_app(struct expr *expr, struct state *state) {
if (f < ARRAY_CARDINALITY(builtin_funcs)) {
expr->func = builtin_funcs + f;
expr->type = expr->func->type;
expr->fold = expr->func->pure;
if (expr->fold) {
/* We only do constant folding for invocations of pure functions
* whose arguments are literal values. That misses opportunities
* for constant folding, e.g., "regexp('foo' + 'bar')" but is
* a bit simpler than doing full tracking of constants
*/
for (int i=0; i < expr->func->arity; i++) {
if (expr->args[i]->tag != E_VALUE)
expr->fold = false;
}
}
} else {
STATE_ERROR(state, PATHX_ETYPE);
}
Expand Down

0 comments on commit bbf31f7

Please sign in to comment.