#pragma once

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <vector>
#include <sys/stat.h>

#ifdef _MSC_VER
#define S_ISREG(m) (((m)&0xF000)==0x8000)
#endif

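// Note: u32, usize, string, error, utf8_hdchk, chrhex and rawstr are not defined
// in this header; they are assumed to be provided by the project's other headers,
// which must be included before this one.
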
enum tok:u32{
    tok_null=0,  // null token (default token type)
    tok_num,     // number literal
    tok_str,     // string literal
    tok_id,      // identifier
    tok_for,     // loop keyword for
    tok_forindex,// loop keyword forindex
    tok_foreach, // loop keyword foreach
    tok_while,   // loop keyword while
    tok_var,     // keyword for variable definition
    tok_func,    // keyword for function definition
    tok_break,   // loop keyword break
    tok_continue,// loop keyword continue
    tok_ret,     // function keyword return
    tok_if,      // condition expression keyword if
    tok_elsif,   // condition expression keyword elsif
    tok_else,    // condition expression keyword else
    tok_nil,     // nil literal
    tok_lcurve,  // (
    tok_rcurve,  // )
    tok_lbracket,// [
    tok_rbracket,// ]
    tok_lbrace,  // {
    tok_rbrace,  // }
    tok_semi,    // ;
    tok_and,     // operator and
    tok_or,      // operator or
    tok_comma,   // ,
    tok_dot,     // .
    tok_ellipsis,// ...
    tok_quesmark,// ?
    tok_colon,   // :
    tok_add,     // operator +
    tok_sub,     // operator -
    tok_mult,    // operator *
    tok_div,     // operator /
    tok_link,    // operator ~
    tok_not,     // operator !
    tok_eq,      // operator =
    tok_addeq,   // operator +=
    tok_subeq,   // operator -=
    tok_multeq,  // operator *=
    tok_diveq,   // operator /=
    tok_lnkeq,   // operator ~=
    tok_cmpeq,   // operator ==
    tok_neq,     // operator !=
    tok_less,    // operator <
    tok_leq,     // operator <=
    tok_grt,     // operator >
    tok_geq,     // operator >=
    tok_eof      // <eof> end of token list
};

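// mapping from keyword/operator spelling to token type, terminated by a nullptr entry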
struct{
    const char* str;
    const u32 type;
}tok_table[]={
    {"for"     ,tok_for     },
    {"forindex",tok_forindex},
    {"foreach" ,tok_foreach },
    {"while"   ,tok_while   },
    {"var"     ,tok_var     },
    {"func"    ,tok_func    },
    {"break"   ,tok_break   },
    {"continue",tok_continue},
    {"return"  ,tok_ret     },
    {"if"      ,tok_if      },
    {"elsif"   ,tok_elsif   },
    {"else"    ,tok_else    },
    {"nil"     ,tok_nil     },
    {"("       ,tok_lcurve  },
    {")"       ,tok_rcurve  },
    {"["       ,tok_lbracket},
    {"]"       ,tok_rbracket},
    {"{"       ,tok_lbrace  },
    {"}"       ,tok_rbrace  },
    {";"       ,tok_semi    },
    {"and"     ,tok_and     },
    {"or"      ,tok_or      },
    {","       ,tok_comma   },
    {"."       ,tok_dot     },
    {"..."     ,tok_ellipsis},
    {"?"       ,tok_quesmark},
    {":"       ,tok_colon   },
    {"+"       ,tok_add     },
    {"-"       ,tok_sub     },
    {"*"       ,tok_mult    },
    {"/"       ,tok_div     },
    {"~"       ,tok_link    },
    {"!"       ,tok_not     },
    {"="       ,tok_eq      },
    {"+="      ,tok_addeq   },
    {"-="      ,tok_subeq   },
    {"*="      ,tok_multeq  },
    {"/="      ,tok_diveq   },
    {"~="      ,tok_lnkeq   },
    {"=="      ,tok_cmpeq   },
    {"!="      ,tok_neq     },
    {"<"       ,tok_less    },
    {"<="      ,tok_leq     },
    {">"       ,tok_grt     },
    {">="      ,tok_geq     },
    {nullptr   ,0           }
};

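// token: source position (line, column), token type and raw text of one lexeme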
struct token
{
    u32 line;
    u32 col;
    u32 type;
    string str;
    token(u32 l=0,u32 c=0,u32 t=tok_null,const string& s=""):str(s)
    {
        line=l;
        col=c;
        type=t;
    }
};

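// lexer: loads a source file into `res` and splits it into the token sequence
// returned by result(); lexical errors are reported through the `error` object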
class lexer
{
private:
    u32 line;
    u32 column;
    usize ptr;
    string res;
    error& err;
    std::vector<token> tokens;

    u32 get_type(const string&);
    bool is_id(char);
    bool is_hex(char);
    bool is_oct(char);
    bool is_dec(char);
    bool is_str(char);
    bool is_single_opr(char);
    bool is_calc_opr(char);
    void die(const string& info){err.err("lexer",line,column,info);}
    void open(const string&);
    string utf8_gen();
    string id_gen();
    string num_gen();
    string str_gen();
public:
    lexer(error& e):
        line(1),column(0),
        ptr(0),res(""),
        err(e){}
    void scan(const string&);
    void print();
    const std::vector<token>& result() const {return tokens;}
};

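// identifier characters: '_', ASCII letters, or any byte with the high bit set
// (negative here, since char is treated as signed), which is how utf-8
// multibyte sequences get accepted into identifiers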
bool lexer::is_id(char c)
{
    return (c=='_')||('a'<=c&&c<='z')||('A'<=c&&c<='Z')||(c<0);
}

bool lexer::is_hex(char c)
{
    return ('0'<=c&&c<='9')||('a'<=c&&c<='f')||('A'<=c&&c<='F');
}

bool lexer::is_oct(char c)
{
    return '0'<=c&&c<='7';
}

bool lexer::is_dec(char c)
{
    return '0'<=c&&c<='9';
}

bool lexer::is_str(char c)
{
    return c=='\''||c=='\"'||c=='`';
}

bool lexer::is_single_opr(char c)
{
    return (
        c=='('||c==')'||c=='['||c==']'||
        c=='{'||c=='}'||c==','||c==';'||
        c=='|'||c==':'||c=='?'||c=='`'||
        c=='&'||c=='@'||c=='%'||c=='$'||
        c=='^'||c=='\\'
    );
}

bool lexer::is_calc_opr(char c)
{
    return c=='='||c=='+'||c=='-'||c=='*'||c=='!'||c=='/'||c=='<'||c=='>'||c=='~';
}

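// open: reads the whole file into `res`; reports an error if the path is not
// a regular file or cannot be opened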
void lexer::open(const string& file)
{
    struct stat buffer;
    if(stat(file.c_str(),&buffer)==0 && !S_ISREG(buffer.st_mode))
    {
        err.err("lexer","<"+file+"> is not a regular file");
        err.chkerr();
    }
    std::ifstream fin(file,std::ios::binary);
    if(fin.fail())
        err.err("lexer","failed to open <"+file+">");
    else
        err.load(file);
    std::stringstream ss;
    ss<<fin.rdbuf();
    res=ss.str();
}

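// get_type: looks up a lexeme in tok_table; returns tok_null if it is not a
// keyword or operator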
u32 lexer::get_type(const string& str)
{
    for(u32 i=0;tok_table[i].str;++i)
        if(str==tok_table[i].str)
            return tok_table[i].type;
    return tok_null;
}

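// utf8_gen: consumes a run of utf-8 multibyte characters starting at `ptr`;
// a header byte without its expected continuation bytes is fatal, and bytes
// that are not valid utf-8 header bytes are skipped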
string lexer::utf8_gen()
{
    string str="";
    while(ptr<res.size() && res[ptr]<0)
    {
        string tmp="";
        u32 nbytes=utf8_hdchk(res[ptr]);
        if(nbytes)
        {
            tmp+=res[ptr++];
            for(u32 i=0;i<nbytes;++i,++ptr)
                if(ptr<res.size() && (res[ptr]&0xc0)==0x80)
                    tmp+=res[ptr];
            if(tmp.length()!=1+nbytes)
            {
                ++column;
                string utf_info="0x"+chrhex(tmp[0]);
                for(u32 i=1;i<tmp.size();++i)
                    utf_info+=" 0x"+chrhex(tmp[i]);
                die("invalid utf-8 character `"+utf_info+"`, make sure this is a utf-8 encoded text file");
                std::exit(1);
            }
            str+=tmp;
            column+=2; // may be inaccurate: not every unicode character occupies 2 columns
        }
        else
        {
            ++ptr;
            ++column;
        }
    }
    return str;
}

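// id_gen: consumes an identifier made of ascii id characters, digits and
// utf-8 multibyte characters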
string lexer::id_gen()
{
    string str="";
    while(ptr<res.size() && (is_id(res[ptr])||is_dec(res[ptr])))
    {
        if(res[ptr]<0) // utf-8
            str+=utf8_gen();
        else // ascii
        {
            str+=res[ptr++];
            ++column;
        }
    }
    return str;
}

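// num_gen: consumes a number literal: hexadecimal ("0x..."), octal ("0o...")
// or decimal with optional fraction and exponent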
string lexer::num_gen()
{
    // generate hex number
    if(ptr+1<res.size() && res[ptr]=='0' && res[ptr+1]=='x')
    {
        string str="0x";
        ptr+=2;
        while(ptr<res.size() && is_hex(res[ptr]))
            str+=res[ptr++];
        column+=str.length();
        if(str.length()<3) // only "0x"
            die("invalid number `"+str+"`");
        return str;
    }
    // generate oct number
    else if(ptr+1<res.size() && res[ptr]=='0' && res[ptr+1]=='o')
    {
        string str="0o";
        ptr+=2;
        while(ptr<res.size() && is_oct(res[ptr]))
            str+=res[ptr++];
        column+=str.length();
        if(str.length()<3) // only "0o"
            die("invalid number `"+str+"`");
        return str;
    }
    // generate dec number
    // dec number -> [0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?
    string str="";
    while(ptr<res.size() && is_dec(res[ptr]))
        str+=res[ptr++];
    if(ptr<res.size() && res[ptr]=='.')
    {
        str+=res[ptr++];
        while(ptr<res.size() && is_dec(res[ptr]))
            str+=res[ptr++];
        // "xxxx." is not a correct number
        if(str.back()=='.')
        {
            column+=str.length();
            die("invalid number `"+str+"`");
            return "0";
        }
    }
    if(ptr<res.size() && (res[ptr]=='e' || res[ptr]=='E'))
    {
        str+=res[ptr++];
        if(ptr<res.size() && (res[ptr]=='-' || res[ptr]=='+'))
            str+=res[ptr++];
        while(ptr<res.size() && is_dec(res[ptr]))
            str+=res[ptr++];
        // "xxxe(-|+)" is not a correct number
        if(str.back()=='e' || str.back()=='E' || str.back()=='-' || str.back()=='+')
        {
            column+=str.length();
            die("invalid number `"+str+"`");
            return "0";
        }
    }
    column+=str.length();
    return str;
}

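// str_gen: consumes a string literal delimited by ', " or `, handling escape
// sequences; a backtick-quoted string must contain exactly one character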
string lexer::str_gen()
{
    string str="";
    const char begin=res[ptr];
    ++column;
    while(++ptr<res.size() && res[ptr]!=begin)
    {
        ++column;
        if(res[ptr]=='\n')
        {
            column=0;
            ++line;
        }
        if(res[ptr]=='\\' && ptr+1<res.size())
        {
            ++column;
            ++ptr;
            switch(res[ptr])
            {
                case '0': str+='\0';    break;
                case 'a': str+='\a';    break;
                case 'b': str+='\b';    break;
                case 'e': str+='\033';  break;
                case 't': str+='\t';    break;
                case 'n': str+='\n';    break;
                case 'v': str+='\v';    break;
                case 'f': str+='\f';    break;
                case 'r': str+='\r';    break;
                case '?': str+='\?';    break;
                case '\\':str+='\\';    break;
                case '\'':str+='\'';    break;
                case '\"':str+='\"';    break;
                default:  str+=res[ptr];break;
            }
            continue;
        }
        str+=res[ptr];
    }
    // check whether the string is terminated before reaching EOF
    if(ptr++>=res.size())
    {
        die("got EOF when generating string");
        return str;
    }
    ++column;
    if(begin=='`' && str.length()!=1)
        die("'`' is only used for strings containing exactly one character");
    return str;
}

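// scan: the main entry point; tokenizes `file` and appends a tok_eof token,
// then clears the buffer and checks for accumulated errors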
void lexer::scan(const string& file)
{
    line=1;
    column=0;
    ptr=0;
    open(file);

    string str;
    while(ptr<res.size())
    {
        while(ptr<res.size() && (res[ptr]==' ' || res[ptr]=='\n' || res[ptr]=='\t' || res[ptr]=='\r' || res[ptr]==0))
        {
            // these characters are ignored, and '\n' increments the line counter
            ++column;
            if(res[ptr++]=='\n')
            {
                ++line;
                column=0;
            }
        }
        if(ptr>=res.size()) break;
        if(is_id(res[ptr]))
        {
            str=id_gen();
            u32 type=get_type(str);
            tokens.push_back({line,column,type?type:tok_id,str});
        }
        else if(is_dec(res[ptr]))
        {
            str=num_gen(); // make sure column is correct
            tokens.push_back({line,column,tok_num,str});
        }
        else if(is_str(res[ptr]))
        {
            str=str_gen(); // make sure column is correct
            tokens.push_back({line,column,tok_str,str});
        }
        else if(is_single_opr(res[ptr]))
        {
            str=res[ptr];
            ++column;
            u32 type=get_type(str);
            if(!type)
                die("invalid operator `"+str+"`");
            tokens.push_back({line,column,type,str});
            ++ptr;
        }
        else if(res[ptr]=='.')
        {
            str=".";
            if(ptr+2<res.size() && res[ptr+1]=='.' && res[ptr+2]=='.')
                str+="..";
            ptr+=str.length();
            column+=str.length();
            tokens.push_back({line,column,get_type(str),str});
        }
        else if(is_calc_opr(res[ptr]))
        {
            // get calculation operator
            str=res[ptr++];
            if(ptr<res.size() && res[ptr]=='=')
                str+=res[ptr++];
            column+=str.length();
            tokens.push_back({line,column,get_type(str),str});
        }
        else if(res[ptr]=='#') // skip comment; ptr then points at the '\n', so the next loop iteration increments the line counter
            while(++ptr<res.size() && res[ptr]!='\n');
        else
        {
            ++column;
            char c=res[ptr++];
            die("invalid character 0x"+chrhex(c));
        }
    }
    tokens.push_back({line,column,tok_eof,"<eof>"});
    res="";
    err.chkerr();
}

void lexer::print()
{
    for(auto& tok:tokens)
        std::cout<<"("<<tok.line<<" | "<<rawstr(tok.str,128)<<")\n";
}
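
// Usage sketch (not part of the original header): assuming the project's `error`
// class can be default-constructed in addition to the err/load/chkerr members
// used above, a minimal driver might look like this:
//
//     error err;
//     lexer lex(err);
//     lex.scan("test.nas");                        // tokenize a source file
//     lex.print();                                 // dump (line | token) pairs
//     const std::vector<token>& toks=lex.result(); // hand the tokens to the parser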