Skip to content

Commit

Permalink
Merge pull request #2 from 8dcc/better-reader
Browse files Browse the repository at this point in the history
Better reader
  • Loading branch information
8dcc authored Oct 24, 2024
2 parents e07be00 + f79b78e commit 0ee2027
Show file tree
Hide file tree
Showing 2 changed files with 218 additions and 102 deletions.
318 changes: 217 additions & 101 deletions src/read.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,154 +14,270 @@
*
* You should have received a copy of the GNU General Public License along with
* SL. If not, see <https://www.gnu.org/licenses/>.
*
* ---------------------------------------------------------------------------
*
* TODO: Rename these functions to "scanner", or something other than
* reader? Since Lisp's `read' also parses the input into an `Expr'.
*/

#include <stdbool.h>
#include <stdio.h>
#include <ctype.h>

#include "include/util.h"
#include "include/read.h"
#include "include/lexer.h" /* is_token_separator() */

#define READ_BUFSZ 100

/*----------------------------------------------------------------------------*/

/*
* Is the specified character a comment start/end delimiter?
*/
#define IS_COMMENT_START(C) ((c) == ';')
#define IS_COMMENT_END(C) ((c) == '\n')

/*----------------------------------------------------------------------------*/
/* True if the character `C' is the one that opens a comment (';'). */
#define IS_COMMENT_START(C) ((C) == ';')
/* True if the character `C' is the one that closes a comment (newline). */
#define IS_COMMENT_END(C) ((C) == '\n')

/*
* Did the character at `str[pos]' just open/close a string with a double-quote?
* Checks if it was escaped:
*
* (...") -> true
* (...\\\") -> false (odd '\')
* (...\\\\") -> true (even '\')
* Get the incoming character from `fp'. Calls `fgetc' and, if it's not EOF,
* calls `ungetc'.
*/
static bool just_toggled_string_state(const char* str, int pos) {
if (str[pos] != '\"')
return false;
pos--;

/* Every consecutive backslash from the end, toggle the variable to store if
* the number is odd or even. */
bool odd_backslashes = true;
for (; pos >= 0 && str[pos] == '\\'; pos--)
odd_backslashes = !odd_backslashes;

return odd_backslashes;
static int get_incoming(FILE* fp) {
    /*
     * Peek at the stream: consume one character and push it straight back.
     * At end of input there is nothing to push back, so EOF is returned
     * without calling `ungetc'.
     */
    const int peeked = fgetc(fp);
    if (peeked == EOF)
        return EOF;

    ungetc(peeked, fp);
    return peeked;
}

/* Return the first non-comment character from `fp' */
static int get_next_non_comment(FILE* fp) {
int c = fgetc(fp);
/*
 * Discard comments from `fp' for as long as the incoming character (as
 * reported by `get_incoming') starts one. Each comment is consumed up to, and
 * including, its end delimiter. The first non-comment character is left
 * unread in the stream.
 *
 * Returns false if EOF was encountered while inside a comment. Returns true
 * otherwise; note that this includes the case where the incoming character is
 * EOF outside of a comment, so callers still have to check what they read.
 */
static bool read_until_incoming_non_comment(FILE* fp) {
    while (IS_COMMENT_START(get_incoming(fp))) {
        /* Skip comment contents, along with comment end */
        int c;
        do {
            c = fgetc(fp);

            /* Make sure we never ignore EOF, even in comments */
            if (c == EOF)
                return false;
        } while (!IS_COMMENT_END(c));
    }

    return true;
}

/*
 * Consume and return the next character of `fp' that is not part of a
 * comment. Comments are skipped with `read_until_incoming_non_comment'; if
 * that function reports EOF inside a comment, EOF is returned. Otherwise, the
 * result of `fgetc' is returned.
 */
static int get_next_non_comment(FILE* fp) {
    return read_until_incoming_non_comment(fp) ? fgetc(fp) : EOF;
}

/*----------------------------------------------------------------------------*/

char* read_expr(FILE* fp) {
/* Will increase when encountering '(' and decrease with ')' */
int nesting_level = 0;
/*
 * Read a double-quote-terminated string into the `*dst' buffer, updating its
 * size (`*dst_sz') and write position (`*dst_pos'). Assumes the opening
 * double-quote has just been written to `*dst'; and reads up to the final
 * non-escaped double-quote, included.
 *
 * The buffer is grown in steps of `READ_BUFSZ' when needed, so that on return
 * there is always room for the null terminator at `(*dst)[*dst_pos]'. The
 * terminator itself is not written here.
 *
 * If EOF is found before the closing double-quote, the function stops reading
 * and returns with whatever was stored so far; EOF itself is never stored.
 */
static void read_user_string(FILE* fp, char** dst, size_t* dst_sz,
                             size_t* dst_pos) {
    /*
     * Important notes:
     *   - There are no comments (starting with ';') in strings.
     *   - Null bytes are handled in the lexer, not here.
     *   - A backslash is used to escape. In this "read" stage, we don't have
     *     to check what is being escaped. Just store it literally.
     *   - A string ends as soon as a non-escaped double-quote is found.
     */
    int c = 0;
    while (c != '\"') {
        /*
         * One iteration stores at most two characters (a backslash and the
         * character it escapes). This check leaves room for both; the loop
         * can't exit after storing two without running the check again.
         */
        if (*dst_pos + 1 >= *dst_sz) {
            *dst_sz += READ_BUFSZ;
            sl_safe_realloc(*dst, *dst_sz);
        }

        c = fgetc(fp);
        if (c == EOF)
            break;
        (*dst)[(*dst_pos)++] = c;

        if (c == '\\') {
            /*
             * Store the escaped character without inspecting it, so an
             * escaped double-quote doesn't terminate the loop. The EOF check
             * must be done on the character that was just read, not on `c'
             * (which is known to be a backslash at this point).
             */
            const int escaped = fgetc(fp);
            if (escaped == EOF)
                break;
            (*dst)[(*dst_pos)++] = escaped;
        }
    }
}

size_t result_sz = READ_BUFSZ;
char* result = sl_safe_malloc(result_sz);
size_t i = 0;
/*----------------------------------------------------------------------------*/

for (;;) {
if (i >= result_sz - 1) {
/*
 * Read a user list with the form "(...)" from `fp'. Assumes the incoming
 * character is the opening parenthesis, which hasn't been consumed yet.
 *
 * Comments are skipped (through `get_next_non_comment'), and the contents of
 * double-quoted strings are copied verbatim by `read_user_string', so
 * parentheses inside strings don't affect the nesting level.
 *
 * Returns a null-terminated buffer obtained from `sl_safe_malloc' containing
 * the list, from the opening parenthesis to its matching closing parenthesis.
 * If EOF is reached before the list is closed, the buffer is freed and NULL
 * is returned.
 */
static char* read_user_list(FILE* fp) {
    size_t result_pos = 0;
    size_t result_sz  = READ_BUFSZ;
    char* result      = sl_safe_malloc(result_sz);

    /* Will increase when encountering '(' and decrease with ')' */
    int nesting_level = 1;

    /* Store the opening parenthesis that the caller saw */
    SL_ASSERT(get_incoming(fp) == '(');
    result[result_pos++] = get_next_non_comment(fp);

    while (nesting_level > 0) {
        /* Keep room for the character below, and for the null terminator */
        if (result_pos + 1 >= result_sz) {
            result_sz += READ_BUFSZ;
            sl_safe_realloc(result, result_sz);
        }

        /* An unterminated list is not returned to the caller */
        const int c = get_next_non_comment(fp);
        if (c == EOF) {
            free(result);
            return NULL;
        }

        result[result_pos++] = c;

        /*
         * If we encounter an opening or closing parenthesis, simply increase
         * or decrease the nesting level, respectively.
         *
         * If we encounter a double-quote, the rest of the string is read by
         * `read_user_string', since parentheses should be ignored inside
         * strings, escaped quotes should be handled, etc.
         */
        switch (c) {
            case '(':
                nesting_level++;
                break;

            case ')':
                nesting_level--;
                break;

            case '\"':
                read_user_string(fp, &result, &result_sz, &result_pos);
                break;

            default:
                break;
        }
    }

    result[result_pos] = '\0';
    return result;
}

/*
 * Read a user string that is not inside a list. Assumes the incoming
 * character of `fp' is the opening double-quote, which hasn't been consumed
 * yet.
 *
 * The opening double-quote is stored first, the rest of the string is read by
 * `read_user_string' (shared with the list reader), and the null terminator is
 * appended at the end. Returns the buffer obtained from `sl_safe_malloc'.
 */
static char* read_isolated_user_string(FILE* fp) {
    size_t length   = 0;
    size_t capacity = READ_BUFSZ;
    char* buffer    = sl_safe_malloc(capacity);

    /* Opening double-quote */
    SL_ASSERT(get_incoming(fp) == '\"');
    buffer[length] = get_next_non_comment(fp);
    length++;

    /* String contents, up to the closing double-quote */
    read_user_string(fp, &buffer, &capacity, &length);

    buffer[length] = '\0';
    return buffer;
}

/*
 * Read an isolated atom from `fp': characters are consumed and stored until
 * the incoming character is a token separator or EOF. The terminating
 * character is left unread in the stream.
 *
 * Returns a null-terminated buffer obtained from `sl_safe_malloc'. The buffer
 * is empty if the very first incoming character already terminates the atom.
 */
static char* read_isolated_atom(FILE* fp) {
    size_t result_pos = 0;
    size_t result_sz  = READ_BUFSZ;
    char* result      = sl_safe_malloc(result_sz);

    /*
     * Read until the incoming character is a token separator. This includes
     * spaces, but also parentheses, for example. The `is_token_separator'
     * function is declared in <lexer.h>.
     */
    for (;;) {
        /* Check for EOF first, so it is never passed to the lexer function */
        const int incoming = get_incoming(fp);
        if (incoming == EOF || is_token_separator(incoming))
            break;

        if (result_pos + 1 >= result_sz) {
            result_sz += READ_BUFSZ;
            sl_safe_realloc(result, result_sz);
        }

        /*
         * The character that is consumed might not be `incoming': if it was
         * a comment start, the comment is skipped and EOF might be reached.
         * Never store EOF in the result.
         */
        const int c = get_next_non_comment(fp);
        if (c == EOF)
            break;

        result[result_pos++] = c;
    }

    result[result_pos] = '\0';
    return result;
}

/*----------------------------------------------------------------------------*/

/*
 * Read the text of the next expression from `fp', without parsing it.
 *
 * Returns a null-terminated buffer produced by one of the static readers in
 * this file (list, string or atom), or NULL if there is no more input. The
 * list reader also returns NULL if EOF is found before the list is closed.
 */
char* read_expr(FILE* fp) {
    int incoming;

    /*
     * Skip leading spaces or comments, if any.
     *
     * Comments are skipped as a whole *before* looking at the incoming
     * character. Consuming the comment start by hand would make the comment
     * contents look like an expression.
     */
    for (;;) {
        /* EOF inside a comment: there is no more user input */
        if (!read_until_incoming_non_comment(fp))
            return NULL;

        incoming = get_incoming(fp);
        if (incoming == EOF || !isspace(incoming))
            break;

        /* Consume the space and check for comments again */
        fgetc(fp);
    }

    /*
     * The first character will indicate where we should stop reading the
     * current expression:
     *
     *   - If we are opening a parenthesis, we stop at the closing parenthesis
     *     at that same level (that is not inside a string).
     *   - If we are opening a double-quoted string, we stop at the first
     *     non-escaped double-quote.
     *   - Otherwise, we are reading an isolated atom.
     *
     * We also check for some special characters:
     *
     *   - If we encountered a closing parenthesis in level 0, it is
     *     unmatched. Report it, discard it, and read the next expression.
     *   - If the next character is EOF, inform the caller that there is no
     *     more user input.
     */
    switch (incoming) {
        case EOF:
            return NULL;

        case '(':
            return read_user_list(fp);

        case '\"':
            return read_isolated_user_string(fp);

        case ')':
            SL_ERR("Encountered unmatched ')'.");
            get_next_non_comment(fp);
            return read_expr(fp);

        default:
            return read_isolated_atom(fp);
    }
}
2 changes: 1 addition & 1 deletion test/run-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ for file in $(ls "$SCRIPT_DIR"/*.lisp); do

input_str=""
if [ "$(basename "$file")" == "io.lisp" ]; then
input_str+="123 "
input_str+="123"
input_str+="(+ 1 2 3 (- 5 4))"
input_str+="User string...\n"
input_str+="Another delimited line. EXTRA"
Expand Down

0 comments on commit 0ee2027

Please sign in to comment.