⚡ optimize lexer

2022-11-28 21:16:39 +08:00 · 2022-11-28 21:16:39 +08:00 · 9455a83df0
parent be318abb2e
commit 9455a83df0
2 changed files with 85 additions and 61 deletions
--- a/nasal_lexer.h
+++ b/nasal_lexer.h
@ -25,7 +25,7 @@ enum class tok:u32 {
    rif,      // condition expression keyword if
    elsif,    // condition expression keyword elsif
    relse,    // condition expression keyword else
-    nil,      // nil literal
+    tknil,    // nil literal
    lcurve,   // (
    rcurve,   // )
    lbracket, // [
@ -91,7 +91,7 @@ private:
        {"if"      ,tok::rif     },
        {"elsif"   ,tok::elsif   },
        {"else"    ,tok::relse   },
-        {"nil"     ,tok::nil     },
+        {"nil"     ,tok::tknil   },
        {"("       ,tok::lcurve  },
        {")"       ,tok::rcurve  },
        {"["       ,tok::lbracket},
@ -135,11 +135,18 @@ private:
    bool is_str(char);
    bool is_single_opr(char);
    bool is_calc_opr(char);
+
+    void skip_note();
+    void err_char();
+
    void open(const string&);
    string utf8_gen();
-    string id_gen();
-    string num_gen();
-    string str_gen();
+    token id_gen();
+    token num_gen();
+    token str_gen();
+    token single_opr();
+    token dots();
+    token calc_opr();
 public:
    lexer(error& e): line(1),column(0),ptr(0),res(""),err(e) {}
    const error& scan(const string&);
@ -188,6 +195,18 @@ bool lexer::is_calc_opr(char c) {
    );
 }

+void lexer::skip_note() {
+    // avoid note, after this process ptr will point to a '\n', so next loop line counter+1
+    while(++ptr<res.size() && res[ptr]!='\n') {}
+}
+
+void lexer::err_char() {
+    ++column;
+    char c=res[ptr++];
+    err.err("lexer",line,column,1,"invalid character 0x"+chrhex(c));
+    err.fatal("lexer","fatal error occurred, stop");
+}
+
 void lexer::open(const string& file) {
    struct stat buffer;
    if (stat(file.c_str(),&buffer)==0 && !S_ISREG(buffer.st_mode)) {
@ -240,7 +259,7 @@ string lexer::utf8_gen() {
    return str;
 }

-string lexer::id_gen() {
+token lexer::id_gen() {
    string str="";
    while(ptr<res.size() && (is_id(res[ptr])||is_dec(res[ptr]))) {
        if (res[ptr]<0) { // utf-8
@ -250,10 +269,11 @@ string lexer::id_gen() {
            ++column;
        }
    }
-    return str;
+    tok type=get_type(str);
+    return {line,column,(type!=tok::null)?type:tok::id,str};
 }

-string lexer::num_gen() {
+token lexer::num_gen() {
    // generate hex number
    if (ptr+1<res.size() && res[ptr]=='0' && res[ptr+1]=='x') {
        string str="0x";
@ -265,7 +285,7 @@ string lexer::num_gen() {
        if (str.length()<3) { // "0x"
            err.err("lexer",line,column,str.length(),"invalid number `"+str+"`");
        }
-        return str;
+        return {line,column,tok::num,str};
    } else if (ptr+1<res.size() && res[ptr]=='0' && res[ptr+1]=='o') { // generate oct number
        string str="0o";
        ptr+=2;
@ -281,7 +301,7 @@ string lexer::num_gen() {
        if (str.length()==2 || erfmt) {
            err.err("lexer",line,column,str.length(),"invalid number `"+str+"`");
        }
-        return str;
+        return {line,column,tok::num,str};
    }
    // generate dec number
    // dec number -> [0~9][0~9]*(.[0~9]*)(e|E(+|-)0|[1~9][0~9]*)
@ -298,7 +318,7 @@ string lexer::num_gen() {
        if (str.back()=='.') {
            column+=str.length();
            err.err("lexer",line,column,str.length(),"invalid number `"+str+"`");
-            return "0";
+            return {line,column,tok::num,"0"};
        }
    }
    if (ptr<res.size() && (res[ptr]=='e' || res[ptr]=='E')) {
@ -313,14 +333,14 @@ string lexer::num_gen() {
        if (str.back()=='e' || str.back()=='E' || str.back()=='-' || str.back()=='+') {
            column+=str.length();
            err.err("lexer",line,column,str.length(),"invalid number `"+str+"`");
-            return "0";
+            return {line,column,tok::num,"0"};
        }
    }
    column+=str.length();
-    return str;
+    return {line,column,tok::num,str};
 }

-string lexer::str_gen() {
+token lexer::str_gen() {
    string str="";
    const char begin=res[ptr];
    ++column;
@ -356,13 +376,44 @@ string lexer::str_gen() {
    // check if this string ends with a " or '
    if (ptr++>=res.size()) {
        err.err("lexer",line,column,1,"get EOF when generating string");
-        return str;
+        return {line,column,tok::str,str};
    }
    ++column;
    if (begin=='`' && str.length()!=1) {
        err.err("lexer",line,column,1,"\'`\' is used for string that includes one character");
    }
-    return str;
+    return {line,column,tok::str,str};
+}
+
+token lexer::single_opr() {
+    string str(1,res[ptr]);
+    ++column;
+    tok type=get_type(str);
+    if (type==tok::null) {
+        err.err("lexer",line,column,str.length(),"invalid operator `"+str+"`");
+    }
+    ++ptr;
+    return {line,column,type,str};
+}
+
+token lexer::dots() {
+    string str=".";
+    if (ptr+2<res.size() && res[ptr+1]=='.' && res[ptr+2]=='.') {
+        str+="..";
+    }
+    ptr+=str.length();
+    column+=str.length();
+    return {line,column,get_type(str),str};
+}
+
+token lexer::calc_opr() {
+    // get calculation operator
+    string str(1,res[ptr++]);
+    if (ptr<res.size() && res[ptr]=='=') {
+        str+=res[ptr++];
+    }
+    column+=str.length();
+    return {line,column,get_type(str),str};
 }

 const error& lexer::scan(const string& file) {
@ -371,7 +422,6 @@ const error& lexer::scan(const string& file) {
    ptr=0;
    open(file);

-    string str;
    while(ptr<res.size()) {
        while(ptr<res.size() && skip(res[ptr])) {
            // these characters will be ignored, and '\n' will cause ++line
@ -385,47 +435,21 @@ const error& lexer::scan(const string& file) {
            break;
        }
        if (is_id(res[ptr])) {
-            str=id_gen();
-            tok type=get_type(str);
-            toks.push_back({line,column,(type!=tok::null)?type:tok::id,str});
+            toks.push_back(id_gen());
        } else if (is_dec(res[ptr])) {
-            str=num_gen(); // make sure column is correct
-            toks.push_back({line,column,tok::num,str});
+            toks.push_back(num_gen());
        } else if (is_str(res[ptr])) {
-            str=str_gen(); // make sure column is correct
-            toks.push_back({line,column,tok::str,str});
+            toks.push_back(str_gen());
        } else if (is_single_opr(res[ptr])) {
-            str=res[ptr];
-            ++column;
-            tok type=get_type(str);
-            if (type==tok::null) {
-                err.err("lexer",line,column,str.length(),"invalid operator `"+str+"`");
-            }
-            toks.push_back({line,column,type,str});
-            ++ptr;
+            toks.push_back(single_opr());
        } else if (res[ptr]=='.') {
-            str=".";
-            if (ptr+2<res.size() && res[ptr+1]=='.' && res[ptr+2]=='.') {
-                str+="..";
-            }
-            ptr+=str.length();
-            column+=str.length();
-            toks.push_back({line,column,get_type(str),str});
+            toks.push_back(dots());
        } else if (is_calc_opr(res[ptr])) {
-            // get calculation operator
-            str=res[ptr++];
-            if (ptr<res.size() && res[ptr]=='=') {
-                str+=res[ptr++];
-            }
-            column+=str.length();
-            toks.push_back({line,column,get_type(str),str});
-        } else if (res[ptr]=='#') { // avoid note, after this process ptr will point to a '\n', so next loop line counter+1
-            while(++ptr<res.size() && res[ptr]!='\n') {}
+            toks.push_back(calc_opr());
+        } else if (res[ptr]=='#') {
+            skip_note();
        } else {
-            ++column;
-            char c=res[ptr++];
-            err.err("lexer",line,column,1,"invalid character 0x"+chrhex(c));
-            err.fatal("lexer","fatal error occurred, stop");
+            err_char();
        }
    }
    toks.push_back({line,column,tok::eof,"<eof>"});
--- a/nasal_parse.h
+++ b/nasal_parse.h
@ -61,7 +61,7 @@ private:
        {tok::rif     ,"if"      },
        {tok::elsif   ,"elsif"   },
        {tok::relse   ,"else"    },
-        {tok::nil     ,"nil"     },
+        {tok::tknil   ,"nil"     },
        {tok::lcurve  ,"("       },
        {tok::rcurve  ,")"       },
        {tok::lbracket,"["       },
@ -338,7 +338,7 @@ ast parse::vec() {
    // array end with tok::null=0
    const tok panic[]={
        tok::id,tok::str,tok::num,
-        tok::opnot,tok::sub,tok::nil,
+        tok::opnot,tok::sub,tok::tknil,
        tok::func,tok::var,tok::lcurve,
        tok::lbrace,tok::lbracket,tok::null
    };
@ -451,7 +451,7 @@ ast parse::expr()
        die(thisline,thiscol,thislen,"must use return in functions");
    }
    switch(type) {
-        case tok::nil:
+        case tok::tknil:
        case tok::num:
        case tok::str:
        case tok::id:
@ -607,9 +607,9 @@ ast parse::unary() {

 ast parse::scalar() {
    ast node(toks[ptr].line,toks[ptr].col,ast_null);
-    if (lookahead(tok::nil)) {
+    if (lookahead(tok::tknil)) {
        node=nil();
-        match(tok::nil);
+        match(tok::tknil);
    } else if (lookahead(tok::num)) {
        node=num();
    } else if (lookahead(tok::str)) {
@ -673,7 +673,7 @@ ast parse::callv() {
    // array end with tok::null=0
    const tok panic[]={
        tok::id,tok::str,tok::num,
-        tok::opnot,tok::sub,tok::nil,
+        tok::opnot,tok::sub,tok::tknil,
        tok::func,tok::var,tok::lcurve,
        tok::lbrace,tok::lbracket,tok::colon,
        tok::null
@ -703,7 +703,7 @@ ast parse::callf() {
    // array end with tok::null=0
    const tok panic[]={
        tok::id,tok::str,tok::num,
-        tok::opnot,tok::sub,tok::nil,
+        tok::opnot,tok::sub,tok::tknil,
        tok::func,tok::var,tok::lcurve,
        tok::lbrace,tok::lbracket,tok::null
    };
@ -794,7 +794,7 @@ ast parse::multi_scalar() {
    // if check_call_memory is true,we will check if value called here can reach a memory space
    const tok panic[]={
        tok::id,tok::str,tok::num,
-        tok::opnot,tok::sub,tok::nil,
+        tok::opnot,tok::sub,tok::tknil,
        tok::func,tok::var,tok::lcurve,
        tok::lbrace,tok::lbracket,tok::null
    };
@ -979,7 +979,7 @@ ast parse::ret_expr() {
    ast node(toks[ptr].line,toks[ptr].col,ast_ret);
    match(tok::ret);
    tok type=toks[ptr].type;
-    if (type==tok::nil || type==tok::num || type==tok::str || type==tok::id ||
+    if (type==tok::tknil || type==tok::num || type==tok::str || type==tok::id ||
       type==tok::func || type==tok::sub || type==tok::opnot || type==tok::lcurve ||
       type==tok::lbracket || type==tok::lbrace
    ) {