Load Plain Bopomofo data.

This commit is contained in:
Lukhnos Liu 2012-09-10 10:14:32 -07:00
parent 10ff94e141
commit 7b4568e152
3 changed files with 44 additions and 20 deletions

View File

@ -45,6 +45,7 @@
6ACA41FD15FC1D9000935EF6 /* MainMenu.xib in Resources */ = {isa = PBXBuildFile; fileRef = 6ACA41F015FC1D9000935EF6 /* MainMenu.xib */; }; 6ACA41FD15FC1D9000935EF6 /* MainMenu.xib in Resources */ = {isa = PBXBuildFile; fileRef = 6ACA41F015FC1D9000935EF6 /* MainMenu.xib */; };
6ACA41FF15FC1D9000935EF6 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6ACA41F415FC1D9000935EF6 /* main.m */; }; 6ACA41FF15FC1D9000935EF6 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6ACA41F415FC1D9000935EF6 /* main.m */; };
6ACA420215FC1E5200935EF6 /* McBopomofo.app in Resources */ = {isa = PBXBuildFile; fileRef = 6A0D4EA215FC0D2D00ABF4B3 /* McBopomofo.app */; }; 6ACA420215FC1E5200935EF6 /* McBopomofo.app in Resources */ = {isa = PBXBuildFile; fileRef = 6A0D4EA215FC0D2D00ABF4B3 /* McBopomofo.app */; };
6AD7CBC815FE555000691B5B /* data-plain-bpmf.txt in Resources */ = {isa = PBXBuildFile; fileRef = 6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */; };
6AE210B215FC63CC003659FE /* PlainBopomofo.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */; }; 6AE210B215FC63CC003659FE /* PlainBopomofo.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */; };
6AE210B315FC63CC003659FE /* PlainBopomofo@2x.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */; }; 6AE210B315FC63CC003659FE /* PlainBopomofo@2x.tiff in Resources */ = {isa = PBXBuildFile; fileRef = 6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */; };
/* End PBXBuildFile section */ /* End PBXBuildFile section */
@ -205,6 +206,7 @@
6ACA41F615FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text.rtf; name = "zh-Hant"; path = "zh-Hant.lproj/License.rtf"; sourceTree = "<group>"; }; 6ACA41F615FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text.rtf; name = "zh-Hant"; path = "zh-Hant.lproj/License.rtf"; sourceTree = "<group>"; };
6ACA41F715FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hant"; path = "zh-Hant.lproj/Localizable.strings"; sourceTree = "<group>"; }; 6ACA41F715FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hant"; path = "zh-Hant.lproj/Localizable.strings"; sourceTree = "<group>"; };
6ACA41F815FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = "zh-Hant"; path = "zh-Hant.lproj/MainMenu.xib"; sourceTree = "<group>"; }; 6ACA41F815FC1D9000935EF6 /* zh-Hant */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = "zh-Hant"; path = "zh-Hant.lproj/MainMenu.xib"; sourceTree = "<group>"; };
6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = "data-plain-bpmf.txt"; sourceTree = "<group>"; };
6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = PlainBopomofo.tiff; sourceTree = "<group>"; }; 6AE210B015FC63CC003659FE /* PlainBopomofo.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = PlainBopomofo.tiff; sourceTree = "<group>"; };
6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = "PlainBopomofo@2x.tiff"; sourceTree = "<group>"; }; 6AE210B115FC63CC003659FE /* PlainBopomofo@2x.tiff */ = {isa = PBXFileReference; lastKnownFileType = image.tiff; path = "PlainBopomofo@2x.tiff"; sourceTree = "<group>"; };
/* End PBXFileReference section */ /* End PBXFileReference section */
@ -425,6 +427,7 @@
6A38BBF415FC117A00A8A51F /* BPMFMappings.txt */, 6A38BBF415FC117A00A8A51F /* BPMFMappings.txt */,
6A38BBF515FC117A00A8A51F /* BPMFPunctuations.txt */, 6A38BBF515FC117A00A8A51F /* BPMFPunctuations.txt */,
6A38BBF615FC117A00A8A51F /* data.txt */, 6A38BBF615FC117A00A8A51F /* data.txt */,
6AD7CBC715FE555000691B5B /* data-plain-bpmf.txt */,
6A38BBF715FC117A00A8A51F /* heterophony1.list */, 6A38BBF715FC117A00A8A51F /* heterophony1.list */,
6A38BBF815FC117A00A8A51F /* heterophony2.list */, 6A38BBF815FC117A00A8A51F /* heterophony2.list */,
6A38BBF915FC117A00A8A51F /* heterophony3.list */, 6A38BBF915FC117A00A8A51F /* heterophony3.list */,
@ -581,6 +584,7 @@
6A719D0415FC5FD200C8B8E3 /* McBopomofo.iconset in Resources */, 6A719D0415FC5FD200C8B8E3 /* McBopomofo.iconset in Resources */,
6AE210B215FC63CC003659FE /* PlainBopomofo.tiff in Resources */, 6AE210B215FC63CC003659FE /* PlainBopomofo.tiff in Resources */,
6AE210B315FC63CC003659FE /* PlainBopomofo@2x.tiff in Resources */, 6AE210B315FC63CC003659FE /* PlainBopomofo@2x.tiff in Resources */,
6AD7CBC815FE555000691B5B /* data-plain-bpmf.txt in Resources */,
); );
runOnlyForDeploymentPostprocessing = 0; runOnlyForDeploymentPostprocessing = 0;
}; };

View File

@ -36,13 +36,17 @@
#import <InputMethodKit/InputMethodKit.h> #import <InputMethodKit/InputMethodKit.h>
#import "Mandarin.h" #import "Mandarin.h"
#import "Gramambular.h" #import "Gramambular.h"
#import "SimpleLM.h"
@interface McBopomofoInputMethodController : IMKInputController @interface McBopomofoInputMethodController : IMKInputController
{ {
@private @private
// the reading buffer that takes user input // the reading buffer that takes user input
Formosa::Mandarin::BopomofoReadingBuffer* _bpmfReadingBuffer; Formosa::Mandarin::BopomofoReadingBuffer* _bpmfReadingBuffer;
// language model
Formosa::Gramambular::SimpleLM *_languageModel;
// the grid (lattice) builder for the unigrams (and bigrams) // the grid (lattice) builder for the unigrams (and bigrams)
Formosa::Gramambular::BlockReadingBuilder* _builder; Formosa::Gramambular::BlockReadingBuilder* _builder;

View File

@ -36,7 +36,6 @@
#import <fstream> #import <fstream>
#import <iostream> #import <iostream>
#import <set> #import <set>
#import "SimpleLM.h"
#import "OVStringHelper.h" #import "OVStringHelper.h"
#import "OVUTF8Helper.h" #import "OVUTF8Helper.h"
#import "AppDelegate.h" #import "AppDelegate.h"
@ -109,6 +108,7 @@ static NSString *const kGraphVizOutputfile = @"/tmp/McBopomofo-visualization.dot
// shared language model object that stores our phrase-term probability database // shared language model object that stores our phrase-term probability database
SimpleLM gLanguageModel; SimpleLM gLanguageModel;
SimpleLM gLanguageModelPlainBopomofo;
// private methods // private methods
@interface McBopomofoInputMethodController () <VTCandidateControllerDelegate> @interface McBopomofoInputMethodController () <VTCandidateControllerDelegate>
@ -172,7 +172,8 @@ public:
_bpmfReadingBuffer = new BopomofoReadingBuffer(BopomofoKeyboardLayout::StandardLayout()); _bpmfReadingBuffer = new BopomofoReadingBuffer(BopomofoKeyboardLayout::StandardLayout());
// create the lattice builder // create the lattice builder
_builder = new BlockReadingBuilder(&gLanguageModel); _languageModel = &gLanguageModel;
_builder = new BlockReadingBuilder(_languageModel);
// each Mandarin syllable is separated by a hyphen // each Mandarin syllable is separated by a hyphen
_builder->setJoinSeparator("-"); _builder->setJoinSeparator("-");
@ -309,9 +310,11 @@ public:
{ {
if ([value isKindOfClass:[NSString class]] && [value isEqual:kPlainBopomofoModeIdentifier]) { if ([value isKindOfClass:[NSString class]] && [value isEqual:kPlainBopomofoModeIdentifier]) {
_inputMode = kPlainBopomofoModeIdentifier; _inputMode = kPlainBopomofoModeIdentifier;
_languageModel = &gLanguageModelPlainBopomofo;
} }
else { else {
_inputMode = kBopomofoModeIdentifier; _inputMode = kBopomofoModeIdentifier;
_languageModel = &gLanguageModel;
} }
NSString *basisKeyboardLayoutID = [[NSUserDefaults standardUserDefaults] stringForKey:kBasisKeyboardLayoutPreferenceKey]; NSString *basisKeyboardLayoutID = [[NSUserDefaults standardUserDefaults] stringForKey:kBasisKeyboardLayoutPreferenceKey];
@ -329,6 +332,12 @@ public:
if ([_composingBuffer length] > 0) { if ([_composingBuffer length] > 0) {
[self commitComposition:sender]; [self commitComposition:sender];
} }
if (_builder) {
delete _builder;
_builder = new BlockReadingBuilder(_languageModel);
_builder->setJoinSeparator("-");
}
} }
#pragma mark - IMKServerInput protocol methods #pragma mark - IMKServerInput protocol methods
@ -744,7 +753,7 @@ public:
string reading = _bpmfReadingBuffer->syllable().composedString(); string reading = _bpmfReadingBuffer->syllable().composedString();
// see if we have a unigram for this // see if we have a unigram for this
if (!gLanguageModel.hasUnigramsForKey(reading)) { if (!_languageModel->hasUnigramsForKey(reading)) {
[self beep]; [self beep];
[self updateClientComposingBuffer:client]; [self updateClientComposingBuffer:client];
return YES; return YES;
@ -789,7 +798,7 @@ public:
[self commitComposition:client]; [self commitComposition:client];
_bpmfReadingBuffer->clear(); _bpmfReadingBuffer->clear();
} }
else if (gLanguageModel.hasUnigramsForKey(" ")) { else if (_languageModel->hasUnigramsForKey(" ")) {
_builder->insertReadingAtCursor(" "); _builder->insertReadingAtCursor(" ");
[self popOverflowComposingTextAndWalk:client]; [self popOverflowComposingTextAndWalk:client];
[self updateClientComposingBuffer:client]; [self updateClientComposingBuffer:client];
@ -908,7 +917,7 @@ public:
// punctuation list // punctuation list
if ((char)charCode == '`') { if ((char)charCode == '`') {
if (gLanguageModel.hasUnigramsForKey(string("_punctuation_list"))) { if (_languageModel->hasUnigramsForKey(string("_punctuation_list"))) {
if (_bpmfReadingBuffer->isEmpty()) { if (_bpmfReadingBuffer->isEmpty()) {
_builder->insertReadingAtCursor(string("_punctuation_list")); _builder->insertReadingAtCursor(string("_punctuation_list"));
[self popOverflowComposingTextAndWalk:client]; [self popOverflowComposingTextAndWalk:client];
@ -945,7 +954,7 @@ public:
} }
string customPunctuation = string("_punctuation_") + layout + string(1, (char)charCode); string customPunctuation = string("_punctuation_") + layout + string(1, (char)charCode);
if (gLanguageModel.hasUnigramsForKey(customPunctuation)) { if (_languageModel->hasUnigramsForKey(customPunctuation)) {
if (_bpmfReadingBuffer->isEmpty()) { if (_bpmfReadingBuffer->isEmpty()) {
_builder->insertReadingAtCursor(customPunctuation); _builder->insertReadingAtCursor(customPunctuation);
[self popOverflowComposingTextAndWalk:client]; [self popOverflowComposingTextAndWalk:client];
@ -964,7 +973,7 @@ public:
// if nothing is matched, see if it's a punctuation key // if nothing is matched, see if it's a punctuation key
string punctuation = string("_punctuation_") + string(1, (char)charCode); string punctuation = string("_punctuation_") + string(1, (char)charCode);
if (gLanguageModel.hasUnigramsForKey(punctuation)) { if (_languageModel->hasUnigramsForKey(punctuation)) {
if (_bpmfReadingBuffer->isEmpty()) { if (_bpmfReadingBuffer->isEmpty()) {
_builder->insertReadingAtCursor(punctuation); _builder->insertReadingAtCursor(punctuation);
[self popOverflowComposingTextAndWalk:client]; [self popOverflowComposingTextAndWalk:client];
@ -1302,34 +1311,42 @@ public:
@end @end
static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, SimpleLM &lm)
void LTLoadLanguageModel()
{ {
// load the language model; the performance of this function can be greatly improved // load the language model; the performance of this function can be greatly improved
// with better loading/parsing methods // with better loading/parsing methods
NSDate *__unused startTime = [NSDate date]; NSDate *__unused startTime = [NSDate date];
NSString *dataPath = [[NSBundle bundleForClass:[McBopomofoInputMethodController class]] pathForResource:@"data" ofType:@"txt"]; NSString *dataPath = [[NSBundle bundleForClass:[McBopomofoInputMethodController class]] pathForResource:filenameWithoutExtension ofType:@"txt"];
ifstream ifs; ifstream ifs;
ifs.open([dataPath UTF8String]); ifs.open([dataPath UTF8String]);
while (ifs.good()) { while (ifs.good()) {
string line; string line;
getline(ifs, line); getline(ifs, line);
if (!line.size() || (line.size() && line[0] == '#')) { if (!line.size() || (line.size() && line[0] == '#')) {
continue; continue;
} }
vector<string> p = OVStringHelper::SplitBySpacesOrTabs(line); vector<string> p = OVStringHelper::SplitBySpacesOrTabs(line);
if (p.size() == 3) { if (p.size() == 3) {
gLanguageModel.add(p[1], p[0], atof(p[2].c_str())); lm.add(p[1], p[0], atof(p[2].c_str()));
} }
} }
ifs.close(); ifs.close();
gLanguageModel.add(" ", " ", 0.0);
// insert an empty entry for BOS/EOS markers
lm.add(" ", " ", 0.0);
}
void LTLoadLanguageModel()
{
LTLoadLanguageModelFile(@"data", gLanguageModel);
LTLoadLanguageModelFile(@"data-plain-bpmf", gLanguageModelPlainBopomofo);
// initialize the singleton learning dictionary // initialize the singleton learning dictionary
// putting singleton in @synchronized is the standard way in Objective-C // putting singleton in @synchronized is the standard way in Objective-C
@ -1344,7 +1361,6 @@ void LTLoadLanguageModel()
} }
NSString *appSupportPath = [paths objectAtIndex:0]; NSString *appSupportPath = [paths objectAtIndex:0];
NSString *userDictPath = [appSupportPath stringByAppendingPathComponent:@"McBopomofo"]; NSString *userDictPath = [appSupportPath stringByAppendingPathComponent:@"McBopomofo"];
BOOL isDir = NO; BOOL isDir = NO;