Import the test code from Formosana

This commit is contained in:
Lukhnos Liu 2022-02-19 09:25:18 -08:00
parent 305c874003
commit d4d8d14004
1 changed file with 214 additions and 2 deletions

View File

@ -22,7 +22,219 @@
// OTHER DEALINGS IN THE SOFTWARE.
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include <algorithm>
#include <iostream>
#include <map>
#include <vector>
#include <cstdlib>
#include <sstream>
#include "Gramambular.h"
// Sample text fed to SimpleLM by the tests in this file. SimpleLM skips empty
// lines and lines whose first character is '#', and splits every other line on
// whitespace into three columns (two key/value columns and a score parsed with
// atof).
// NOTE(review): most rows below show only a score, sometimes preceded by a tone
// mark; the first two columns appear to be missing in this view -- confirm
// against the original data before relying on it.
TEST(GramambularTest, Trivial) { const char* SampleData = R"(
ASSERT_EQ(1, 1); #
# The sample is from libtabe (http://sourceforge.net/projects/libtabe/)
# last updated in 2002. The project was originally initiated by
# Pai-Hsiang Hsiao in 1999.
#
# Libtabe is a frequency table of Taiwanese Mandarin words. The database
# itself is, according to the tar file, released under the BSD License.
#
-9.495858
-9.006414
-99.000000
-8.091803
-99.000000
-13.513987
-12.259095
-7.171551
-10.574273
-11.504072
-10.450457
-7.171052
-99.000000
-11.928720
-13.624335
-12.390804
˙ -3.516024
ˊ -3.516024
ˋ -3.516024
-5.809297
˙ -7.427179
-8.381971
-8.501463
ˋ -99.000000
-8.034095
-8.858181
ˋ -7.608341
ˋ -99.000000
-7.290109
ˋ -10.939895
-99.000000
ˋ -99.000000
ˋ -99.000000
-99.000000
ˋ -9.715317
ˋ -7.926683
ˋ -8.373022
-9.877580
-10.711079
-7.877973
-7.822167
-99.000000
-99.000000
-99.000000
-9.685671
ˋ -10.425662
-99.000000
-99.000000
ˋ -8.888722
ˋ -10.204425
-11.378321
-99.000000
ˋ -99.000000
ˋ -8.450826
-11.074890
-99.000000
ˋ -12.045357
-99.000000
ˋ -99.000000
ˋ -9.517568
ˋ -12.021587
-99.000000
-12.784206
ˊ -6.086515
ˇ -9.164384
ˇ -8.690941
ˇ -10.127828
ˊ -11.336864
ˊ -11.285740
ˇ -12.492933
-6.299461
ˋ -6.736613
ˋ -13.336653
ˇ -10.344678
ˊ -11.668947
ˊ -11.373044
ˋ -9.842421
)";
using namespace std;
using namespace Formosa::Gramambular;
class SimpleLM : public LanguageModel
{
public:
SimpleLM(const char* input, bool swapKeyValue = false)
{
stringstream sstream(input);
while (sstream.good()) {
string line;
getline(sstream, line);
if (!line.size() || (line.size() && line[0] == '#')) {
continue;
}
stringstream linestream(line);
string col0;
string col1;
string col2;
linestream >> col0;
linestream >> col1;
linestream >> col2;
Unigram u;
if (swapKeyValue) {
u.keyValue.key = col1;
u.keyValue.value = col0;
}
else {
u.keyValue.key = col0;
u.keyValue.value = col1;
}
u.score = atof(col2.c_str());
m_db[u.keyValue.key].push_back(u);
}
}
const vector<Bigram> bigramsForKeys(const string &preceedingKey, const string& key) override
{
return vector<Bigram>();
}
const vector<Unigram> unigramsForKey(const string &key) override
{
map<string, vector<Unigram> >::const_iterator f = m_db.find(key);
return f == m_db.end() ? vector<Unigram>() : (*f).second;
}
bool hasUnigramsForKey(const string& key) override
{
map<string, vector<Unigram> >::const_iterator f = m_db.find(key);
return f != m_db.end();
}
protected:
map<string, vector<Unigram> > m_db;
};
// Edits a reading sequence with cursor moves, insertions and one deletion,
// then checks the values of the nodes obtained by walking the builder's grid.
TEST(GramambularTest, InputTest) {
    SimpleLM lm(SampleData);
    BlockReadingBuilder builder(&lm);

    // Insert two readings, then move the cursor to index 1 for the third.
    builder.insertReadingAtCursor("ㄍㄠ");
    builder.insertReadingAtCursor("ㄐㄧˋ");
    builder.setCursorIndex(1);
    builder.insertReadingAtCursor("ㄎㄜ");

    // Move to index 0, delete after the cursor, and insert the same text again.
    builder.setCursorIndex(0);
    builder.deleteReadingAfterCursor();
    builder.insertReadingAtCursor("ㄍㄠ");

    // Move the cursor to builder.length() and insert the rest in order.
    // NOTE(review): the second entry is an empty string, as is the third item
    // of the expected result; this may be text lost from the file -- confirm.
    builder.setCursorIndex(builder.length());
    const char* const remaining[] = {"ㄍㄨㄥ", "", "ㄉㄜ˙", "ㄋㄧㄢˊ", "ㄓㄨㄥ", "ㄐㄧㄤˇ", "ㄐㄧㄣ"};
    for (const char* reading : remaining) {
        builder.insertReadingAtCursor(reading);
    }

    // reverseWalk's result is reversed before the node values are collected.
    Walker walker(&builder.grid());
    vector<NodeAnchor> path = walker.reverseWalk(builder.grid().width(), 0.0);
    reverse(path.begin(), path.end());

    vector<string> composed;
    for (const NodeAnchor& anchor : path) {
        composed.push_back(anchor.node->currentKeyValue().value);
    }
    ASSERT_EQ(composed, (vector<string>{"高科技", "公司", "", "年中", "獎金"}));
}
// Builds a SimpleLM with key and value swapped (second constructor argument is
// true), inserts ten readings one at a time, and checks the keys of the nodes
// obtained by walking the builder's grid.
// NOTE(review): all ten reading literals below are empty in this view while the
// expected result holds multi-character strings; the literals look truncated --
// confirm against the original test before trusting this case.
TEST(GramambularTest, WordSegmentationTest) {
SimpleLM lm2(SampleData, true);
BlockReadingBuilder builder2(&lm2);
builder2.insertReadingAtCursor("");
builder2.insertReadingAtCursor("");
builder2.insertReadingAtCursor("");
builder2.insertReadingAtCursor("");
builder2.insertReadingAtCursor("");
builder2.insertReadingAtCursor("");
builder2.insertReadingAtCursor("");
builder2.insertReadingAtCursor("");
builder2.insertReadingAtCursor("");
builder2.insertReadingAtCursor("");
Walker walker2(&builder2.grid());
// reverseWalk's result is reversed before the node keys are collected.
vector<NodeAnchor> walked = walker2.reverseWalk(builder2.grid().width(), 0.0);
reverse(walked.begin(), walked.end());
vector<string> segmented;
for (vector<NodeAnchor>::iterator wi = walked.begin(); wi != walked.end(); ++wi) {
// The model was built with key and value swapped, so this test reads the key.
segmented.push_back((*wi).node->currentKeyValue().key);
}
ASSERT_EQ(segmented, (vector<string>{"高科技", "公司", "", "年終", "獎金"}));
} }