Revise DFA for parsing language models.

This commit is contained in:
Lukhnos Liu 2012-10-31 21:55:13 -07:00
parent 31a1a042fe
commit e68845381c
1 changed files with 105 additions and 52 deletions

View File

@ -71,121 +71,174 @@ bool FastLM::open(const char *path)
return false; return false;
} }
// we have 4 states, plus end and error
// 0
// end -> end
// lf -> forward, go to 0
// space -> error
// other -> record ptr, forward, goto 1
// 1
// end -> error
// lf -> error
// space -> zero out, check length, record ptr, go to 2
// other -> forward, go to 1
// 2
// lf -> error
// end -> error
// space -> zero out, check length, record ptr, go to 3
// other -> forward, go to 2
// 3
// end -> error
// lf -> save row, zero out, go to 0
// space -> error
// other -> forward, go to 3
// Regular expression for parsing:
// (\n*\w\w*\s\w\w*\s\w\w*\n)*\n*$
//
// Expanded as DFA (in Graphviz):
//
// digraph finite_state_machine {
// rankdir = LR;
// size = "10";
//
// node [shape = doublecircle]; End;
// node [shape = circle];
//
// Start -> End [ label = "EOF"];
// Start -> Error [ label = "\\s" ];
// Start -> Start [ label = "\\n" ];
// Start -> 1 [ label = "\\w" ];
//
// 1 -> Error [ label = "\\n, EOF" ];
// 1 -> 2 [ label = "\\s" ];
// 1 -> 1 [ label = "\\w" ];
//
// 2 -> Error [ label = "\\n, \\s, EOF" ];
// 2 -> 3 [ label = "\\w" ];
//
// 3 -> Error [ label = "\\n, EOF" ];
// 3 -> 4 [ label = "\\s" ];
// 3 -> 3 [ label = "\\w" ];
//
// 4 -> Error [ label = "\\n, \\s, EOF" ];
// 4 -> 5 [ label = "\\w" ];
//
// 5 -> Error [ label = "\\s, EOF" ];
// 5 -> Start [ label = "\\n" ];
// 5 -> 5 [ label = "\\w" ];
// }
char *head = (char *)data; char *head = (char *)data;
char *end = (char *)data + length; char *end = (char *)data + length;
char c; char c;
Row row; Row row;
state0: start:
// EOF -> end
if (head == end) { if (head == end) {
goto end; goto end;
} }
c = *head; c = *head;
if (c == '\n') { // \s -> error
head++; if (c == ' ') {
goto state0;
}
else if (c == ' ') {
goto error; goto error;
} }
// \n -> start
else if (c == '\n') {
head++;
goto start;
}
// \w -> record column start, state1
row.value = head; row.value = head;
head++; head++;
// fall through state 1 // fall through to state 1
state1: state1:
// EOF -> error
if (head == end) { if (head == end) {
goto error; goto error;
} }
c = *head; c = *head;
// \n -> error
if (c == '\n') { if (c == '\n') {
goto error; goto error;
} }
// \s -> state2 + zero out ending + record column start
else if (c == ' ') { else if (c == ' ') {
if (row.value == head) {
goto error;
}
*head = 0; *head = 0;
head++; head++;
row.key = head; row.key = head;
goto state2; goto state2;
} }
// \w -> state1
head++; head++;
goto state1; goto state1;
state2: state2:
// eof -> error
if (head == end) { if (head == end) {
goto error; goto error;
} }
c = *head; c = *head;
if (c == '\n') { // \n, \s -> error
goto error; if (c == '\n' || c == ' ') {
}
else if (c == ' ') {
if (row.key == head) {
goto error; goto error;
} }
// \w -> state3
head++;
// fall through to state 3
state3:
// eof -> error
if (head == end) {
goto error;
}
c = *head;
// \n -> error
if (c == '\n') {
goto error;
}
// \s -> state4 + zero out ending + record column start
else if (c == ' ') {
*head = 0; *head = 0;
head++; head++;
row.logProbability = head; row.logProbability = head;
goto state3; goto state4;
} }
// \w -> state3
head++; head++;
goto state2; goto state3;
state3: state4:
// eof -> error
if (head == end) { if (head == end) {
goto error; goto error;
} }
c = *head; c = *head;
if (c == '\n') { // \n, \s -> error
if (row.logProbability == head) { if (c == '\n' || c == ' ') {
goto error; goto error;
} }
*head = 0; // \w -> state5
head++; head++;
keyRowMap[row.key].push_back(row);
goto state0; // fall through to state 5
state5:
// eof -> error
if (head == end) {
goto error;
} }
c = *head;
// \s -> error
if (c == ' ') { if (c == ' ') {
goto error; goto error;
} }
// \n -> start
else if (c == '\n') {
*head = 0;
head++; head++;
goto state3; keyRowMap[row.key].push_back(row);
goto start;
}
// \w -> state 5
head++;
goto state5;
error: error:
close(); close();