slash

slash is a simple type-oriented programming language
Log | Files | Refs | README | LICENSE

commit 67caaf3025f80fef4af72cba3dac2dfb0dcf13af
parent 039616ce0bc89c6ee3bf968e32d2a707ed942c86
Author: Mario Rosell R. Martinez <mario@mariorosell.es>
Date:   Sat,  4 Apr 2026 11:27:16 +0200

lex: add simple lexer

Works using a simple DFA and tables. I still have to implement the pipeline,
though.

This lexer is still just a prototype, OK? Don't take it as "final" or anything.

Diffstat:
M Makefile |   2 +-
A lex.c    | 257 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A lex.h    |  87 ++++++++++++++++++++
3 files changed, 345 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
@@ -2,7 +2,7 @@
 # Makefile -- build system
 #
 
-SRCS := slash.c cli.c stat.c
+SRCS := slash.c cli.c stat.c lex.c
 OBJS := ${SRCS:.c=.o}
 
 CC ?= cc
diff --git a/lex.c b/lex.c
@@ -0,0 +1,257 @@
+/*
+ * lex.c -- simple hand-written lexer
+ */
+
+#include "lex.h"
+
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Reserved words and the token type each one maps to. */
+static const keyword_t kws[] = {
+	{"use", TOK_USE},
+	{"class", TOK_CLASS},
+	{"fc", TOK_FC},
+	{"as", TOK_AS},
+
+	{"f32", TOK_F32},
+	{"f64", TOK_F64},
+
+	{"i8", TOK_I8},
+	{"i16", TOK_I16},
+	{"i32", TOK_I32},
+	{"i64", TOK_I64},
+
+	{"u8", TOK_U8},
+	{"u16", TOK_U16},
+	{"u32", TOK_U32},
+	{"u64", TOK_U64},
+};
+
+static const size_t nkeyws = sizeof(kws) / sizeof(kws[0]);
+
+/*
+ * lu_kw -- look str up in the keyword table.
+ *
+ * Returns the keyword's token type, or TOK_IDENT when str is not a
+ * reserved word.
+ */
+toktype_t
+lu_kw(const char *str)
+{
+	size_t k;
+
+	/* k rather than i: i is the global cursor declared in lex.h */
+	for (k = 0; k < nkeyws; k++) {
+		if (strcmp(str, kws[k].name) == 0)
+			return kws[k].type;
+	}
+	return TOK_IDENT;
+}
+
+/* state: the NUL-terminated input and the cursor into it */
+const char *src;
+size_t i = 0;
+
+/* peek -- return the character under the cursor without advancing. */
+char
+peek(void)
+{
+	return src[i];
+}
+
+/*
+ * consume -- return the character under the cursor and step past it.
+ *
+ * The cursor never moves beyond the terminating NUL, so calling this at
+ * end of input keeps returning '\0' instead of running off the buffer.
+ */
+char
+consume(void)
+{
+	char c;
+
+	c = src[i];
+	if (c != '\0')
+		i++;
+	return c;
+}
+
+/* s_ws -- skip a run of whitespace. */
+void
+s_ws(void)
+{
+	/* ctype functions need an unsigned char value, not a raw char */
+	while (isspace((unsigned char)peek()))
+		consume();
+}
+
+/*
+ * s_lc -- skip the rest of the current line.
+ *
+ * The newline itself is left in place for s_ws() to eat.
+ */
+void
+s_lc(void)
+{
+	while (peek() != '\0' && peek() != '\n')
+		consume();
+}
+
+/*
+ * s_bc -- skip a block comment.
+ *
+ * Call it with the cursor on the comment's opening slash.  An
+ * unterminated comment silently runs to end of input.
+ */
+void
+s_bc(void)
+{
+	/* step over the two-character opener */
+	consume();
+	consume();
+
+	while (peek() != '\0') {
+		if (peek() == '*' && src[i + 1] == '/') {
+			consume();
+			consume();
+			return;
+		}
+		consume();
+	}
+}
+
+/*
+ * id -- scan an identifier or keyword starting at the cursor.
+ *
+ * Consumes a run of alphanumerics and underscores.  The whole run is
+ * consumed even when it does not fit in the lexeme buffer; the excess
+ * characters are dropped.
+ */
+tok_t
+id(void)
+{
+	tok_t t;
+	size_t len;
+
+	len = 0;
+	while (isalnum((unsigned char)peek()) || peek() == '_') {
+		char c = consume();
+
+		/* keep room for the terminating NUL */
+		if (len < sizeof(t.lexeme) - 1)
+			t.lexeme[len++] = c;
+	}
+
+	t.lexeme[len] = '\0';
+	t.type = lu_kw(t.lexeme);
+	return t;
+}
+
+/* punct -- token type of the single-character token c. */
+static toktype_t
+punct(char c)
+{
+	switch (c) {
+	case '{': return TOK_LBRACE;
+	case '}': return TOK_RBRACE;
+	case '(': return TOK_LPAREN;
+	case ')': return TOK_RPAREN;
+	case ',': return TOK_COMMA;
+	case ';': return TOK_SEMI;
+	case '.': return TOK_DOT;
+	default:  return TOK_UNKNOWN;
+	}
+}
+
+/*
+ * nexttok -- return the next token from src, skipping whitespace and
+ * comments.
+ *
+ * At end of input it returns TOK_EOF, and keeps doing so on further
+ * calls.  A character that starts no token, including a lone slash, is
+ * returned as TOK_UNKNOWN with that character as its lexeme.
+ */
+tok_t
+nexttok(void)
+{
+	tok_t t;
+	char c;
+
+	for (;;) {
+		s_ws();
+		c = peek();
+		if (c != '/')
+			break;
+
+		/*
+		 * Look ahead instead of consuming the slash: s_bc()
+		 * expects the cursor to still be on the opener.
+		 */
+		if (src[i + 1] == '/')
+			s_lc();
+		else if (src[i + 1] == '*')
+			s_bc();
+		else
+			break;
+	}
+
+	if (c == '\0') {
+		t.type = TOK_EOF;
+		t.lexeme[0] = '\0';
+		return t;
+	}
+
+	if (isalpha((unsigned char)c) || c == '_')
+		return id();
+
+	consume();
+	t.type = punct(c);
+	t.lexeme[0] = c;
+	t.lexeme[1] = '\0';
+	return t;
+}
+
+/*
+ * tokname -- printable name of token type t.
+ *
+ * Returns a string literal; values outside the enum map to "UNKNOWN".
+ */
+const char *
+tokname(toktype_t t)
+{
+	switch (t) {
+	case TOK_EOF:     return "EOF";
+
+	case TOK_CLASS:   return "CLASS";
+	case TOK_FC:      return "FC";
+	case TOK_USE:     return "USE";
+	case TOK_AS:      return "AS";
+
+	case TOK_F32:     return "F32";
+	case TOK_F64:     return "F64";
+
+	case TOK_I8:      return "I8";
+	case TOK_I16:     return "I16";
+	case TOK_I32:     return "I32";
+	case TOK_I64:     return "I64";
+
+	case TOK_U8:      return "U8";
+	case TOK_U16:     return "U16";
+	case TOK_U32:     return "U32";
+	case TOK_U64:     return "U64";
+
+	case TOK_IDENT:   return "IDENT";
+	case TOK_DOT:     return "DOT";
+	case TOK_LBRACE:  return "LBRACE";
+	case TOK_RBRACE:  return "RBRACE";
+	case TOK_LPAREN:  return "LPAREN";
+	case TOK_RPAREN:  return "RPAREN";
+	case TOK_COMMA:   return "COMMA";
+	case TOK_SEMI:    return "SEMI";
+
+	case TOK_UNKNOWN:
+	default:          return "UNKNOWN";
+	}
+}
diff --git a/lex.h b/lex.h
@@ -0,0 +1,87 @@
+/*
+ * lex.h -- lexer interface
+ */
+
+#ifndef LEX_H
+#define LEX_H
+
+#include <stddef.h>
+
+/* Kinds of token produced by nexttok(). */
+typedef enum toktype {
+	TOK_EOF,
+
+	/* keywords */
+	TOK_CLASS,
+	TOK_FC,
+	TOK_USE,
+	TOK_AS,
+
+	/* numeric type keywords */
+	TOK_F32,
+	TOK_F64,
+	TOK_I8,
+	TOK_I16,
+	TOK_I32,
+	TOK_I64,
+	TOK_U8,
+	TOK_U16,
+	TOK_U32,
+	TOK_U64,
+
+	TOK_IDENT,
+
+	/* punctuation */
+	TOK_DOT,
+	TOK_LBRACE,
+	TOK_RBRACE,
+	TOK_LPAREN,
+	TOK_RPAREN,
+	TOK_COMMA,
+	TOK_SEMI,
+
+	TOK_UNKNOWN
+} toktype_t;
+
+/*
+ * A single token.  lexeme holds the NUL-terminated token text, cut
+ * short if it does not fit; it is empty for TOK_EOF.
+ */
+typedef struct tok {
+	toktype_t type;
+	char lexeme[64];
+} tok_t;
+
+/* One row of the keyword table: spelling and token type. */
+typedef struct {
+	const char *name;
+	toktype_t type;
+} keyword_t;
+
+/*
+ * Lexer state, defined in lex.c.  Point src at a NUL-terminated buffer
+ * and set i to 0 before the first call to nexttok().
+ */
+extern const char *src;
+extern size_t i;
+
+/* Token type of the keyword str, or TOK_IDENT if it is not a keyword. */
+toktype_t lu_kw(const char *str);
+
+/* Character under the cursor: peek() leaves it, consume() steps past it. */
+char peek(void);
+char consume(void);
+
+/* Skip whitespace, the rest of a line, or a block comment at the cursor. */
+void s_ws(void);
+void s_lc(void);
+void s_bc(void);
+
+/* Scan an identifier or keyword (id) or the next token of any kind. */
+tok_t id(void);
+tok_t nexttok(void);
+
+/* Printable name of token type t. */
+const char *tokname(toktype_t t);
+
+#endif /* LEX_H */