Nasal-Interpreter/nasal_lexer.h

394 lines
11 KiB
C++

#ifndef __NASAL_LEXER_H__
#define __NASAL_LEXER_H__
#define IS_IDENTIFIER_HEAD(c) ((c=='_')||('a'<=c && c<='z')||('A'<=c&&c<='Z'))
#define IS_IDENTIFIER_BODY(c) ((c=='_')||('a'<=c && c<='z')||('A'<=c&&c<='Z')||('0'<=c&&c<='9'))
#define IS_HEX_NUMBER(c) (('0'<=c&&c<='9')||('a'<=c&&c<='f')||('A'<=c && c<='F'))
#define IS_OCT_NUMEBR(c) ('0'<=c&&c<='7')
#define IS_DIGIT(c) ('0'<=c&&c<='9')
#define IS_STRING_HEAD(c) (c=='\''||c=='\"')
// single operators have only one character
#define IS_SINGLE_OPRATOR(c) (c=='('||c==')'||c=='['||c==']'||c=='{'||c=='}'||c==','||c==';'||c=='|'||c==':'||\
c=='?'||c=='`'||c=='&'||c=='@'||c=='%'||c=='$'||c=='^'||c=='\\')
// calculation operators may have two chars, for example: += -= *= /= ~= != == >= <=
#define IS_CALC_OPERATOR(c) (c=='='||c=='+'||c=='-'||c=='*'||c=='!'||c=='/'||c=='<'||c=='>'||c=='~')
#define IS_NOTE_HEAD(c) (c=='#')
#ifndef TOKEN_TABLE_SIZE
#define TOKEN_TABLE_SIZE 45
struct token_table
{
std::string str;
int tok_type;
}tok_tbl[TOKEN_TABLE_SIZE]=
{
{"for" ,tok_for },
{"forindex",tok_forindex },
{"foreach" ,tok_foreach },
{"while" ,tok_while },
{"var" ,tok_var },
{"func" ,tok_func },
{"break" ,tok_break },
{"continue",tok_continue },
{"return" ,tok_return },
{"if" ,tok_if },
{"elsif" ,tok_elsif },
{"else" ,tok_else },
{"nil" ,tok_nil },
{"(" ,tok_left_curve },
{")" ,tok_right_curve },
{"[" ,tok_left_bracket },
{"]" ,tok_right_bracket},
{"{" ,tok_left_brace },
{"}" ,tok_right_brace },
{";" ,tok_semi },
{"and" ,tok_and },
{"or" ,tok_or },
{"," ,tok_comma },
{"." ,tok_dot },
{"..." ,tok_ellipsis },
{"?" ,tok_quesmark },
{":" ,tok_colon },
{"+" ,tok_add },
{"-" ,tok_sub },
{"*" ,tok_mult },
{"/" ,tok_div },
{"~" ,tok_link },
{"!" ,tok_not },
{"=" ,tok_equal },
{"+=" ,tok_add_equal },
{"-=" ,tok_sub_equal },
{"*=" ,tok_mult_equal },
{"/=" ,tok_div_equal },
{"~=" ,tok_link_equal },
{"==" ,tok_cmp_equal },
{"!=" ,tok_cmp_not_equal},
{"<" ,tok_less_than },
{">" ,tok_greater_than },
{"<=" ,tok_less_equal },
{">=" ,tok_greater_equal},
};
#endif
struct token
{
int line;
int type;
std::string str;
};
class nasal_lexer
{
private:
int error;
std::vector<token> token_list;
std::string identifier_gen(std::vector<char>&,int&,int&);
void generate_number_error(int,std::string);
std::string number_gen(std::vector<char>&,int&,int&);
std::string string_gen(std::vector<char>&,int&,int&);
public:
void clear();
void scanner(std::vector<char>&);
void print_token();
int get_error();
std::vector<token>& get_token_list();
};
void nasal_lexer::clear()
{
token_list.clear();
return;
}
std::string nasal_lexer::identifier_gen(std::vector<char>& res,int& ptr,int& line)
{
int res_size=res.size();
std::string token_str="";
while(ptr<res_size && IS_IDENTIFIER_BODY(res[ptr]))
token_str+=res[ptr++];
return token_str;
// after running this process, ptr will point to the next token's beginning character
}
void nasal_lexer::generate_number_error(int line,std::string token_str)
{
++error;
std::cout<<">> [lexer] line "<<line<<": \""<<token_str<<"\" is not a correct number.\n";
return;
}
std::string nasal_lexer::number_gen(std::vector<char>& res,int& ptr,int& line)
{
int res_size=res.size();
bool scientific_notation=false;// numbers like 1e8 are scientific_notation
std::string token_str="";
// generate hex number
if(res[ptr]=='0' && ptr+1<res_size && res[ptr+1]=='x')
{
token_str="0x";
ptr+=2;
while(ptr<res_size && IS_HEX_NUMBER(res[ptr]))
token_str+=res[ptr++];
if(token_str=="0x")
{
generate_number_error(line,token_str);
return "0";
}
return token_str;
}
// generate oct number
else if(res[ptr]=='0' && ptr+1<res_size && res[ptr+1]=='o')
{
token_str="0o";
ptr+=2;
while(ptr<res_size && IS_OCT_NUMEBR(res[ptr]))
token_str+=res[ptr++];
if(token_str=="0o")
{
generate_number_error(line,token_str);
return "0";
}
return token_str;
}
// generate dec number
// dec number -> 0|[1~9][0~9]*(.[0~9]*)(e|E(+|-)0|[1~9][0~9]*)
if(ptr<res_size && res[ptr]=='0')
token_str+=res[ptr++];
while(ptr<res_size && IS_DIGIT(res[ptr]))
token_str+=res[ptr++];
if(ptr<res_size && res[ptr]=='.')
{
token_str+=res[ptr++];
// "xxxx." is not a correct number
if(ptr>=res_size)
{
generate_number_error(line,token_str);
return "0";
}
while(ptr<res_size && IS_DIGIT(res[ptr]))
token_str+=res[ptr++];
// "xxxx." is not a correct number
if(token_str.back()=='.')
{
generate_number_error(line,token_str);
return "0";
}
}
if(ptr<res_size && (res[ptr]=='e' || res[ptr]=='E'))
{
token_str+=res[ptr++];
// "xxxe" is not a correct number
if(ptr>=res_size)
{
generate_number_error(line,token_str);
return "0";
}
if(ptr<res_size && (res[ptr]=='-' || res[ptr]=='+'))
token_str+=res[ptr++];
if(ptr>=res_size)
{
generate_number_error(line,token_str);
return "0";
}
if(ptr<res_size && res[ptr]=='0')
token_str+=res[ptr++];
while(ptr<res_size && IS_DIGIT(res[ptr]))
token_str+=res[ptr++];
// "xxxe(-|+)" is not a correct number
if(token_str.back()=='e' || token_str.back()=='E' || token_str.back()=='-' || token_str.back()=='+')
{
generate_number_error(line,token_str);
return "0";
}
}
return token_str;
}
std::string nasal_lexer::string_gen(std::vector<char>& res,int& ptr,int& line)
{
int res_size=res.size();
std::string token_str="";
char str_begin=res[ptr++];
if(ptr>=res_size) return token_str;
while(ptr<res_size && res[ptr]!=str_begin)
{
if(res[ptr]=='\n') ++line;
if(res[ptr]=='\\' && ptr+1<res.size())
{
++ptr;
switch(res[ptr])
{
case 'a':token_str.push_back('\a');break;
case 'b':token_str.push_back('\b');break;
case 'f':token_str.push_back('\f');break;
case 'n':token_str.push_back('\n');break;
case 'r':token_str.push_back('\r');break;
case 't':token_str.push_back('\t');break;
case 'v':token_str.push_back('\v');break;
case '?':token_str.push_back('\?');break;
case '0':token_str.push_back('\0');break;
case '\\':token_str.push_back('\\');break;
case '\'':token_str.push_back('\'');break;
case '\"':token_str.push_back('\"');break;
default: token_str.push_back(res[ptr]);break;
}
}
else
token_str+=res[ptr];
++ptr;
}
// check if this string ends with a " or '
if(ptr>=res_size)
{
++error;
std::cout<<">> [lexer] line "<<line<<": get EOF when generating string.\n";
}
++ptr;
return token_str;
}
void nasal_lexer::scanner(std::vector<char>& res)
{
error=0;
token_list.clear();
int line=1,ptr=0,res_size=res.size();
std::string token_str;
while(ptr<res_size)
{
while(ptr<res_size && (res[ptr]==' ' || res[ptr]=='\n' || res[ptr]=='\t' || res[ptr]=='\r' || res[ptr]<0))
{
// these characters will be ignored, and '\n' will cause ++line
if(res[ptr]=='\n') ++line;
++ptr;
}
if(ptr>=res_size) break;
if(IS_IDENTIFIER_HEAD(res[ptr]))
{
token_str=identifier_gen(res,ptr,line);
token new_token;
new_token.line=line;
new_token.str=token_str;
new_token.type=0;
for(int i=0;i<TOKEN_TABLE_SIZE;++i)
if(token_str==tok_tbl[i].str)
{
new_token.type=tok_tbl[i].tok_type;
break;
}
if(!new_token.type)
new_token.type=tok_identifier;
token_list.push_back(new_token);
}
else if(IS_DIGIT(res[ptr]))
{
token_str=number_gen(res,ptr,line);
token new_token;
new_token.line=line;
new_token.str=token_str;
new_token.type=tok_number;
token_list.push_back(new_token);
}
else if(IS_STRING_HEAD(res[ptr]))
{
token_str=string_gen(res,ptr,line);
token new_token;
new_token.line=line;
new_token.type=tok_string;
new_token.str=token_str;
token_list.push_back(new_token);
}
else if(IS_SINGLE_OPRATOR(res[ptr]))
{
token_str="";
token_str+=res[ptr];
token new_token;
new_token.line=line;
new_token.str=token_str;
for(int i=0;i<TOKEN_TABLE_SIZE;++i)
if(token_str==tok_tbl[i].str)
{
new_token.type=tok_tbl[i].tok_type;
break;
}
token_list.push_back(new_token);
++ptr;
}
else if(res[ptr]=='.')
{
if(ptr+2<res_size && res[ptr+1]=='.' && res[ptr+2]=='.')
{
token_str="...";
ptr+=3;
}
else
{
token_str=".";
++ptr;
}
token new_token;
new_token.line=line;
new_token.str=token_str;
for(int i=0;i<TOKEN_TABLE_SIZE;++i)
if(token_str==tok_tbl[i].str)
{
new_token.type=tok_tbl[i].tok_type;
break;
}
token_list.push_back(new_token);
}
else if(IS_CALC_OPERATOR(res[ptr]))
{
// get calculation operator
token_str="";
token_str+=res[ptr];
++ptr;
if(ptr<res.size() && res[ptr]=='=')
{
token_str+=res[ptr];
++ptr;
}
token new_token;
new_token.line=line;
new_token.str=token_str;
for(int i=0;i<TOKEN_TABLE_SIZE;++i)
if(token_str==tok_tbl[i].str)
{
new_token.type=tok_tbl[i].tok_type;
break;
}
token_list.push_back(new_token);
}
else if(IS_NOTE_HEAD(res[ptr]))
{
// avoid note
while(ptr<res_size && res[ptr]!='\n') ++ptr;
// after this process ptr will point to a '\n'
// don't ++ptr then the counter for line can work correctly
}
else
{
++error;
std::cout<<">> [lexer] line "<<line<<": unknown char "<<(int)res[ptr]<<'.'<<std::endl;
++ptr;
}
}
return;
}
void nasal_lexer::print_token()
{
int size=token_list.size();
for(int i=0;i<size;++i)
std::cout<<"("<<token_list[i].line<<" | "<<token_list[i].str<<")\n";
return;
}
int nasal_lexer::get_error()
{
return error;
}
std::vector<token>& nasal_lexer::get_token_list()
{
return token_list;
}
#endif