optimize lexer

This commit is contained in:
ValKmjolnir 2022-11-28 21:16:39 +08:00
parent be318abb2e
commit 9455a83df0
2 changed files with 85 additions and 61 deletions

View File

@ -25,7 +25,7 @@ enum class tok:u32 {
rif, // condition expression keyword if rif, // condition expression keyword if
elsif, // condition expression keyword elsif elsif, // condition expression keyword elsif
relse, // condition expression keyword else relse, // condition expression keyword else
nil, // nil literal tknil, // nil literal
lcurve, // ( lcurve, // (
rcurve, // ) rcurve, // )
lbracket, // [ lbracket, // [
@ -91,7 +91,7 @@ private:
{"if" ,tok::rif }, {"if" ,tok::rif },
{"elsif" ,tok::elsif }, {"elsif" ,tok::elsif },
{"else" ,tok::relse }, {"else" ,tok::relse },
{"nil" ,tok::nil }, {"nil" ,tok::tknil },
{"(" ,tok::lcurve }, {"(" ,tok::lcurve },
{")" ,tok::rcurve }, {")" ,tok::rcurve },
{"[" ,tok::lbracket}, {"[" ,tok::lbracket},
@ -135,11 +135,18 @@ private:
bool is_str(char); bool is_str(char);
bool is_single_opr(char); bool is_single_opr(char);
bool is_calc_opr(char); bool is_calc_opr(char);
void skip_note();
void err_char();
void open(const string&); void open(const string&);
string utf8_gen(); string utf8_gen();
string id_gen(); token id_gen();
string num_gen(); token num_gen();
string str_gen(); token str_gen();
token single_opr();
token dots();
token calc_opr();
public: public:
lexer(error& e): line(1),column(0),ptr(0),res(""),err(e) {} lexer(error& e): line(1),column(0),ptr(0),res(""),err(e) {}
const error& scan(const string&); const error& scan(const string&);
@ -188,6 +195,18 @@ bool lexer::is_calc_opr(char c) {
); );
} }
void lexer::skip_note() {
    // comment runs to end of line; leave ptr on the '\n' itself so the
    // main scan loop's whitespace handling bumps the line counter
    ++ptr;
    while(ptr<res.size() && res[ptr]!='\n') {
        ++ptr;
    }
}
void lexer::err_char() {
    // report an unrecognized byte at the current position and abort the scan
    ++column;
    const char c=res[ptr];
    ++ptr;
    err.err("lexer",line,column,1,"invalid character 0x"+chrhex(c));
    err.fatal("lexer","fatal error occurred, stop");
}
void lexer::open(const string& file) { void lexer::open(const string& file) {
struct stat buffer; struct stat buffer;
if (stat(file.c_str(),&buffer)==0 && !S_ISREG(buffer.st_mode)) { if (stat(file.c_str(),&buffer)==0 && !S_ISREG(buffer.st_mode)) {
@ -240,7 +259,7 @@ string lexer::utf8_gen() {
return str; return str;
} }
string lexer::id_gen() { token lexer::id_gen() {
string str=""; string str="";
while(ptr<res.size() && (is_id(res[ptr])||is_dec(res[ptr]))) { while(ptr<res.size() && (is_id(res[ptr])||is_dec(res[ptr]))) {
if (res[ptr]<0) { // utf-8 if (res[ptr]<0) { // utf-8
@ -250,10 +269,11 @@ string lexer::id_gen() {
++column; ++column;
} }
} }
return str; tok type=get_type(str);
return {line,column,(type!=tok::null)?type:tok::id,str};
} }
string lexer::num_gen() { token lexer::num_gen() {
// generate hex number // generate hex number
if (ptr+1<res.size() && res[ptr]=='0' && res[ptr+1]=='x') { if (ptr+1<res.size() && res[ptr]=='0' && res[ptr+1]=='x') {
string str="0x"; string str="0x";
@ -265,7 +285,7 @@ string lexer::num_gen() {
if (str.length()<3) { // "0x" if (str.length()<3) { // "0x"
err.err("lexer",line,column,str.length(),"invalid number `"+str+"`"); err.err("lexer",line,column,str.length(),"invalid number `"+str+"`");
} }
return str; return {line,column,tok::num,str};
} else if (ptr+1<res.size() && res[ptr]=='0' && res[ptr+1]=='o') { // generate oct number } else if (ptr+1<res.size() && res[ptr]=='0' && res[ptr+1]=='o') { // generate oct number
string str="0o"; string str="0o";
ptr+=2; ptr+=2;
@ -281,7 +301,7 @@ string lexer::num_gen() {
if (str.length()==2 || erfmt) { if (str.length()==2 || erfmt) {
err.err("lexer",line,column,str.length(),"invalid number `"+str+"`"); err.err("lexer",line,column,str.length(),"invalid number `"+str+"`");
} }
return str; return {line,column,tok::num,str};
} }
// generate dec number // generate dec number
// dec number -> [0~9][0~9]*(.[0~9]*)(e|E(+|-)0|[1~9][0~9]*) // dec number -> [0~9][0~9]*(.[0~9]*)(e|E(+|-)0|[1~9][0~9]*)
@ -298,7 +318,7 @@ string lexer::num_gen() {
if (str.back()=='.') { if (str.back()=='.') {
column+=str.length(); column+=str.length();
err.err("lexer",line,column,str.length(),"invalid number `"+str+"`"); err.err("lexer",line,column,str.length(),"invalid number `"+str+"`");
return "0"; return {line,column,tok::num,"0"};
} }
} }
if (ptr<res.size() && (res[ptr]=='e' || res[ptr]=='E')) { if (ptr<res.size() && (res[ptr]=='e' || res[ptr]=='E')) {
@ -313,14 +333,14 @@ string lexer::num_gen() {
if (str.back()=='e' || str.back()=='E' || str.back()=='-' || str.back()=='+') { if (str.back()=='e' || str.back()=='E' || str.back()=='-' || str.back()=='+') {
column+=str.length(); column+=str.length();
err.err("lexer",line,column,str.length(),"invalid number `"+str+"`"); err.err("lexer",line,column,str.length(),"invalid number `"+str+"`");
return "0"; return {line,column,tok::num,"0"};
} }
} }
column+=str.length(); column+=str.length();
return str; return {line,column,tok::num,str};
} }
string lexer::str_gen() { token lexer::str_gen() {
string str=""; string str="";
const char begin=res[ptr]; const char begin=res[ptr];
++column; ++column;
@ -356,13 +376,44 @@ string lexer::str_gen() {
// check if this string ends with a " or ' // check if this string ends with a " or '
if (ptr++>=res.size()) { if (ptr++>=res.size()) {
err.err("lexer",line,column,1,"get EOF when generating string"); err.err("lexer",line,column,1,"get EOF when generating string");
return str; return {line,column,tok::str,str};
} }
++column; ++column;
if (begin=='`' && str.length()!=1) { if (begin=='`' && str.length()!=1) {
err.err("lexer",line,column,1,"\'`\' is used for string that includes one character"); err.err("lexer",line,column,1,"\'`\' is used for string that includes one character");
} }
return str; return {line,column,tok::str,str};
}
token lexer::single_opr() {
    // consume one single-character operator; unknown characters that reached
    // here (type==tok::null) are reported but a token is still produced
    string opr(1,res[ptr]);
    ++column;
    const tok type=get_type(opr);
    if (type==tok::null) {
        err.err("lexer",line,column,opr.length(),"invalid operator `"+opr+"`");
    }
    ++ptr;
    return {line,column,type,opr};
}
token lexer::dots() {
    // a '.' is either a member access dot or the "..." ellipsis;
    // only three consecutive dots form the ellipsis token
    const bool ellipsis=
        ptr+2<res.size() && res[ptr+1]=='.' && res[ptr+2]=='.';
    string str=ellipsis? "...":".";
    ptr+=str.length();
    column+=str.length();
    return {line,column,get_type(str),str};
}
token lexer::calc_opr() {
    // calculation operator, optionally fused with a trailing '='
    // to form a compound-assignment operator (e.g. "+=")
    string str(1,res[ptr]);
    ++ptr;
    if (ptr<res.size() && res[ptr]=='=') {
        str+='=';
        ++ptr;
    }
    column+=str.length();
    return {line,column,get_type(str),str};
}
} }
const error& lexer::scan(const string& file) { const error& lexer::scan(const string& file) {
@ -371,7 +422,6 @@ const error& lexer::scan(const string& file) {
ptr=0; ptr=0;
open(file); open(file);
string str;
while(ptr<res.size()) { while(ptr<res.size()) {
while(ptr<res.size() && skip(res[ptr])) { while(ptr<res.size() && skip(res[ptr])) {
// these characters will be ignored, and '\n' will cause ++line // these characters will be ignored, and '\n' will cause ++line
@ -385,47 +435,21 @@ const error& lexer::scan(const string& file) {
break; break;
} }
if (is_id(res[ptr])) { if (is_id(res[ptr])) {
str=id_gen(); toks.push_back(id_gen());
tok type=get_type(str);
toks.push_back({line,column,(type!=tok::null)?type:tok::id,str});
} else if (is_dec(res[ptr])) { } else if (is_dec(res[ptr])) {
str=num_gen(); // make sure column is correct toks.push_back(num_gen());
toks.push_back({line,column,tok::num,str});
} else if (is_str(res[ptr])) { } else if (is_str(res[ptr])) {
str=str_gen(); // make sure column is correct toks.push_back(str_gen());
toks.push_back({line,column,tok::str,str});
} else if (is_single_opr(res[ptr])) { } else if (is_single_opr(res[ptr])) {
str=res[ptr]; toks.push_back(single_opr());
++column;
tok type=get_type(str);
if (type==tok::null) {
err.err("lexer",line,column,str.length(),"invalid operator `"+str+"`");
}
toks.push_back({line,column,type,str});
++ptr;
} else if (res[ptr]=='.') { } else if (res[ptr]=='.') {
str="."; toks.push_back(dots());
if (ptr+2<res.size() && res[ptr+1]=='.' && res[ptr+2]=='.') {
str+="..";
}
ptr+=str.length();
column+=str.length();
toks.push_back({line,column,get_type(str),str});
} else if (is_calc_opr(res[ptr])) { } else if (is_calc_opr(res[ptr])) {
// get calculation operator toks.push_back(calc_opr());
str=res[ptr++]; } else if (res[ptr]=='#') {
if (ptr<res.size() && res[ptr]=='=') { skip_note();
str+=res[ptr++];
}
column+=str.length();
toks.push_back({line,column,get_type(str),str});
} else if (res[ptr]=='#') { // avoid note, after this process ptr will point to a '\n', so next loop line counter+1
while(++ptr<res.size() && res[ptr]!='\n') {}
} else { } else {
++column; err_char();
char c=res[ptr++];
err.err("lexer",line,column,1,"invalid character 0x"+chrhex(c));
err.fatal("lexer","fatal error occurred, stop");
} }
} }
toks.push_back({line,column,tok::eof,"<eof>"}); toks.push_back({line,column,tok::eof,"<eof>"});

View File

@ -61,7 +61,7 @@ private:
{tok::rif ,"if" }, {tok::rif ,"if" },
{tok::elsif ,"elsif" }, {tok::elsif ,"elsif" },
{tok::relse ,"else" }, {tok::relse ,"else" },
{tok::nil ,"nil" }, {tok::tknil ,"nil" },
{tok::lcurve ,"(" }, {tok::lcurve ,"(" },
{tok::rcurve ,")" }, {tok::rcurve ,")" },
{tok::lbracket,"[" }, {tok::lbracket,"[" },
@ -338,7 +338,7 @@ ast parse::vec() {
// array end with tok::null=0 // array end with tok::null=0
const tok panic[]={ const tok panic[]={
tok::id,tok::str,tok::num, tok::id,tok::str,tok::num,
tok::opnot,tok::sub,tok::nil, tok::opnot,tok::sub,tok::tknil,
tok::func,tok::var,tok::lcurve, tok::func,tok::var,tok::lcurve,
tok::lbrace,tok::lbracket,tok::null tok::lbrace,tok::lbracket,tok::null
}; };
@ -451,7 +451,7 @@ ast parse::expr()
die(thisline,thiscol,thislen,"must use return in functions"); die(thisline,thiscol,thislen,"must use return in functions");
} }
switch(type) { switch(type) {
case tok::nil: case tok::tknil:
case tok::num: case tok::num:
case tok::str: case tok::str:
case tok::id: case tok::id:
@ -607,9 +607,9 @@ ast parse::unary() {
ast parse::scalar() { ast parse::scalar() {
ast node(toks[ptr].line,toks[ptr].col,ast_null); ast node(toks[ptr].line,toks[ptr].col,ast_null);
if (lookahead(tok::nil)) { if (lookahead(tok::tknil)) {
node=nil(); node=nil();
match(tok::nil); match(tok::tknil);
} else if (lookahead(tok::num)) { } else if (lookahead(tok::num)) {
node=num(); node=num();
} else if (lookahead(tok::str)) { } else if (lookahead(tok::str)) {
@ -673,7 +673,7 @@ ast parse::callv() {
// array end with tok::null=0 // array end with tok::null=0
const tok panic[]={ const tok panic[]={
tok::id,tok::str,tok::num, tok::id,tok::str,tok::num,
tok::opnot,tok::sub,tok::nil, tok::opnot,tok::sub,tok::tknil,
tok::func,tok::var,tok::lcurve, tok::func,tok::var,tok::lcurve,
tok::lbrace,tok::lbracket,tok::colon, tok::lbrace,tok::lbracket,tok::colon,
tok::null tok::null
@ -703,7 +703,7 @@ ast parse::callf() {
// array end with tok::null=0 // array end with tok::null=0
const tok panic[]={ const tok panic[]={
tok::id,tok::str,tok::num, tok::id,tok::str,tok::num,
tok::opnot,tok::sub,tok::nil, tok::opnot,tok::sub,tok::tknil,
tok::func,tok::var,tok::lcurve, tok::func,tok::var,tok::lcurve,
tok::lbrace,tok::lbracket,tok::null tok::lbrace,tok::lbracket,tok::null
}; };
@ -794,7 +794,7 @@ ast parse::multi_scalar() {
// if check_call_memory is true,we will check if value called here can reach a memory space // if check_call_memory is true,we will check if value called here can reach a memory space
const tok panic[]={ const tok panic[]={
tok::id,tok::str,tok::num, tok::id,tok::str,tok::num,
tok::opnot,tok::sub,tok::nil, tok::opnot,tok::sub,tok::tknil,
tok::func,tok::var,tok::lcurve, tok::func,tok::var,tok::lcurve,
tok::lbrace,tok::lbracket,tok::null tok::lbrace,tok::lbracket,tok::null
}; };
@ -979,7 +979,7 @@ ast parse::ret_expr() {
ast node(toks[ptr].line,toks[ptr].col,ast_ret); ast node(toks[ptr].line,toks[ptr].col,ast_ret);
match(tok::ret); match(tok::ret);
tok type=toks[ptr].type; tok type=toks[ptr].type;
if (type==tok::nil || type==tok::num || type==tok::str || type==tok::id || if (type==tok::tknil || type==tok::num || type==tok::str || type==tok::id ||
type==tok::func || type==tok::sub || type==tok::opnot || type==tok::lcurve || type==tok::func || type==tok::sub || type==tok::opnot || type==tok::lcurve ||
type==tok::lbracket || type==tok::lbrace type==tok::lbracket || type==tok::lbrace
) { ) {