diff --git a/McBopomofo.xcodeproj/project.pbxproj b/McBopomofo.xcodeproj/project.pbxproj index 4d0bf37e..70883d91 100644 --- a/McBopomofo.xcodeproj/project.pbxproj +++ b/McBopomofo.xcodeproj/project.pbxproj @@ -38,6 +38,7 @@ 6AFF97F2253B299E007F1C49 /* NonModalAlertWindowController.xib in Resources */ = {isa = PBXBuildFile; fileRef = 6AFF97F0253B299E007F1C49 /* NonModalAlertWindowController.xib */; }; D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */ = {isa = PBXBuildFile; fileRef = D41355D7278D7409005E5CBD /* LanguageModelManager.mm */; }; D41355DB278E6D17005E5CBD /* McBopomofoLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */; }; + D41355DE278EA3ED005E5CBD /* UserPhrasesLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */; }; D427A9C125ED28CC005D43E0 /* OpenCCBridge.swift in Sources */ = {isa = PBXBuildFile; fileRef = D427A9C025ED28CC005D43E0 /* OpenCCBridge.swift */; }; D427F76A278C9E29004A2160 /* CandidateUI in Frameworks */ = {isa = PBXBuildFile; productRef = D427F769278C9E29004A2160 /* CandidateUI */; }; D427F76C278CA2B0004A2160 /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = D427F76B278CA1BA004A2160 /* AppDelegate.swift */; }; @@ -161,6 +162,8 @@ D41355D7278D7409005E5CBD /* LanguageModelManager.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = LanguageModelManager.mm; sourceTree = ""; }; D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = McBopomofoLM.cpp; sourceTree = ""; }; D41355DA278E6D17005E5CBD /* McBopomofoLM.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = McBopomofoLM.h; sourceTree = ""; }; + D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = UserPhrasesLM.cpp; sourceTree = ""; }; + D41355DD278EA3ED005E5CBD /* 
UserPhrasesLM.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = UserPhrasesLM.h; sourceTree = ""; }; D427A9BF25ED28CC005D43E0 /* McBopomofo-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "McBopomofo-Bridging-Header.h"; sourceTree = ""; }; D427A9C025ED28CC005D43E0 /* OpenCCBridge.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpenCCBridge.swift; sourceTree = ""; }; D427F768278C9D0D004A2160 /* CandidateUI */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = CandidateUI; path = Packages/CandidateUI; sourceTree = ""; }; @@ -268,10 +271,12 @@ 6A0D4F2215FC0EB100ABF4B3 /* OpenVanilla */, 6A0421A615FEF3F50061ED63 /* FastLM.cpp */, 6A0421A715FEF3F50061ED63 /* FastLM.h */, - D47F7DD2278C1263002F9DD7 /* UserOverrideModel.cpp */, - D47F7DD1278C1263002F9DD7 /* UserOverrideModel.h */, + D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */, + D41355DD278EA3ED005E5CBD /* UserPhrasesLM.h */, D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */, D41355DA278E6D17005E5CBD /* McBopomofoLM.h */, + D47F7DD2278C1263002F9DD7 /* UserOverrideModel.cpp */, + D47F7DD1278C1263002F9DD7 /* UserOverrideModel.h */, ); path = Engine; sourceTree = ""; @@ -561,6 +566,7 @@ D41355DB278E6D17005E5CBD /* McBopomofoLM.cpp in Sources */, D47F7DD3278C1263002F9DD7 /* UserOverrideModel.cpp in Sources */, 6A0D4F4515FC0EB100ABF4B3 /* Mandarin.cpp in Sources */, + D41355DE278EA3ED005E5CBD /* UserPhrasesLM.cpp in Sources */, 6A0421A815FEF3F50061ED63 /* FastLM.cpp in Sources */, D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */, ); diff --git a/Source/Engine/McBopomofoLM.h b/Source/Engine/McBopomofoLM.h index 292fbf61..521d86dc 100644 --- a/Source/Engine/McBopomofoLM.h +++ b/Source/Engine/McBopomofoLM.h @@ -3,6 +3,7 @@ #include #include "FastLM.h" +#include "UserPhrasesLM.h" namespace McBopomofo { @@ -23,8 +24,8 @@ public: protected: FastLM m_languageModel; - FastLM m_userPhrases; - FastLM 
m_excludedPhrases; + UserPhrasesLM m_userPhrases; + UserPhrasesLM m_excludedPhrases; }; };
diff --git a/Source/Engine/UserPhrasesLM.cpp b/Source/Engine/UserPhrasesLM.cpp
new file mode 100644
index 00000000..30e7b240
--- /dev/null
+++ b/Source/Engine/UserPhrasesLM.cpp
@@ -0,0 +1,224 @@
+#include "UserPhrasesLM.h"
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+
+using namespace Formosa::Gramambular;
+using namespace McBopomofo;
+
+// Reads up to size bytes from fd into buffer, retrying after EINTR and after
+// short reads. Returns the number of bytes read (less than size only when
+// end-of-file comes early), or -1 if read() fails.
+static ssize_t ReadUpTo(int fd, char *buffer, size_t size)
+{
+    size_t total = 0;
+    while (total < size) {
+        ssize_t n = ::read(fd, buffer + total, size - total);
+        if (n > 0) {
+            total += (size_t)n;
+        } else if (n == 0) {
+            break;
+        } else if (errno != EINTR) {
+            return -1;
+        }
+    }
+    return (ssize_t)total;
+}
+
+UserPhrasesLM::UserPhrasesLM()
+    : fd(-1)
+    , data(0)
+    , length(0)
+{
+}
+
+UserPhrasesLM::~UserPhrasesLM()
+{
+    if (data) {
+        close();
+    }
+}
+
+// Loads the phrase file at path and indexes its rows by key.
+//
+// Every non-blank line is "value key", optionally followed by a space and
+// further text that is skipped up to the end of the line. A line that starts
+// with a space, or that has a value but no key column, fails the whole load.
+// Returns false when a file is already loaded, when the file cannot be read or
+// when it is malformed; returns true otherwise (an empty file is accepted).
+//
+// The contents are copied into a heap buffer one byte longer than the file, so
+// each field can be NUL-terminated in place even when the last key ends
+// exactly at end-of-file. Writing that terminator into a mapping of the file
+// itself would touch a byte past the mapping, which faults when the file size
+// is a multiple of the page size.
+bool UserPhrasesLM::open(const char *path)
+{
+    if (data) {
+        return false;
+    }
+
+    fd = ::open(path, O_RDONLY);
+    if (fd == -1) {
+        printf("open:: file not exist");
+        return false;
+    }
+
+    struct stat sb;
+    if (fstat(fd, &sb) == -1) {
+        printf("open:: cannot open file");
+        // Do not leak the descriptor on this early exit.
+        ::close(fd);
+        fd = -1;
+        return false;
+    }
+
+    size_t fileSize = (size_t)sb.st_size;
+    char *buffer = (char *)malloc(fileSize + 1);
+    ssize_t bytesRead = buffer ? ReadUpTo(fd, buffer, fileSize) : -1;
+
+    // The buffer now holds everything needed; the descriptor is not kept open.
+    ::close(fd);
+    fd = -1;
+
+    if (bytesRead < 0) {
+        free(buffer);
+        return false;
+    }
+
+    length = (size_t)bytesRead;
+    buffer[length] = 0;
+    data = buffer;
+
+    char *head = buffer;
+    char *end = buffer + length;
+
+    while (head != end) {
+        // Blank line: nothing to record.
+        if (*head == '\n') {
+            head++;
+            continue;
+        }
+
+        // A line must not start with a space.
+        if (*head == ' ') {
+            close();
+            return false;
+        }
+
+        // First column: the value, terminated by a space.
+        Row row;
+        row.value = head;
+        while (head != end && *head != ' ' && *head != '\n') {
+            head++;
+        }
+        if (head == end || *head == '\n') {
+            // The line (or the file) ended before a key column was found.
+            close();
+            return false;
+        }
+        *head = 0;
+        head++;
+
+        // Second column: the key, terminated by a space, a newline or EOF.
+        row.key = head;
+        while (head != end && *head != ' ' && *head != '\n') {
+            head++;
+        }
+        bool atEOF = (head == end);
+        bool hasTrailingText = !atEOF && *head == ' ';
+        *head = 0; // at EOF this is the spare byte after the file contents
+        keyRowMap[row.key].push_back(row);
+        if (atEOF) {
+            break;
+        }
+        head++;
+
+        // Skip anything after the key. The row is already stored, so reaching
+        // EOF here must not store it a second time.
+        if (hasTrailingText) {
+            while (head != end && *head != '\n') {
+                head++;
+            }
+        }
+    }
+
+    // Entry for the single-space key. The literal has static storage, so these
+    // pointers do not depend on the file buffer.
+    static const char *space = " ";
+    Row emptyRow;
+    emptyRow.key = space;
+    emptyRow.value = space;
+    keyRowMap[space].push_back(emptyRow);
+
+    return true;
+}
+
+// Releases the loaded contents and forgets every row. Safe to call repeatedly.
+void UserPhrasesLM::close()
+{
+    // Clear the index first: its keys point into the buffer freed below.
+    keyRowMap.clear();
+
+    if (data) {
+        free(data);
+        data = 0;
+    }
+    if (fd != -1) {
+        ::close(fd);
+        fd = -1;
+    }
+    length = 0;
+}
+
+// Writes every stored row to stderr as "key value", one row per line.
+void UserPhrasesLM::dump()
+{
+    for (map<const char *, vector<Row>, CStringCmp>::const_iterator i = keyRowMap.begin(), e = keyRowMap.end(); i != e; ++i) {
+        const vector<Row>& r = (*i).second;
+        for (vector<Row>::const_iterator ri = r.begin(), re = r.end(); ri != re; ++ri) {
+            const Row& row = *ri;
+            cerr << row.key << " " << row.value << "\n";
+        }
+    }
+}
+
+// Always returns an empty vector: this model serves unigrams only.
+const vector<Bigram> UserPhrasesLM::bigramsForKeys(const string& preceedingKey, const string& key)
+{
+    return vector<Bigram>();
+}
+
+// Returns one unigram with score 0.0 per row stored under key, in the order
+// the rows were loaded; empty when the key is unknown.
+const vector<Unigram> UserPhrasesLM::unigramsForKey(const string& key)
+{
+    vector<Unigram> v;
+    map<const char *, vector<Row>, CStringCmp>::const_iterator i = keyRowMap.find(key.c_str());
+
+    if (i != keyRowMap.end()) {
+        for (vector<Row>::const_iterator ri = (*i).second.begin(), re = (*i).second.end(); ri != re; ++ri) {
+            Unigram g;
+            const Row& r = *ri;
+            g.keyValue.key = r.key;
+            g.keyValue.value = r.value;
+            g.score = 0.0;
+            v.push_back(g);
+        }
+    }
+
+    return v;
+}
+
+// Returns true when at least one row is stored under key.
+bool UserPhrasesLM::hasUnigramsForKey(const string& key)
+{
+    return keyRowMap.find(key.c_str()) != keyRowMap.end();
+}
+
diff --git a/Source/Engine/UserPhrasesLM.h b/Source/Engine/UserPhrasesLM.h
new file mode 100644
index 00000000..4dc81d66
--- /dev/null
+++ b/Source/Engine/UserPhrasesLM.h
@@ -0,0 +1,51 @@
+#ifndef USERPHRASESLM_H
+#define USERPHRASESLM_H
+
+#include <cstring>
+
+#include <map>
+#include <string>
+#include <vector>
+#include "LanguageModel.h"
+
+namespace McBopomofo {
+
+using namespace Formosa::Gramambular;
+
+class UserPhrasesLM :
public LanguageModel
+{
+public:
+    UserPhrasesLM();
+    ~UserPhrasesLM();
+
+    bool open(const char *path); // loads the "value key" rows of the file at path; false on failure
+    void close();                // releases the loaded contents and clears every row
+    void dump();                 // prints every stored row to stderr
+
+    virtual const vector<Bigram> bigramsForKeys(const string& preceedingKey, const string& key); // always empty
+    virtual const vector<Unigram> unigramsForKey(const string& key); // one unigram (score 0.0) per row stored under key
+    virtual bool hasUnigramsForKey(const string& key); // true when any row is stored under key
+
+protected:
+    struct CStringCmp // orders C strings by content (strcmp) rather than by pointer value
+    {
+        bool operator()(const char* s1, const char* s2) const
+        {
+            return strcmp(s1, s2) < 0;
+        }
+    };
+
+    struct Row {
+        const char *key;   // second column of a line; used as the lookup key
+        const char *value; // first column of a line
+    };
+
+    map<const char *, vector<Row>, CStringCmp> keyRowMap; // rows grouped by key
+    int fd;        // descriptor used by open(); initialised to -1
+    void *data;    // start of the loaded file contents; 0 when nothing is loaded
+    size_t length; // size in bytes of the loaded contents
+};
+
+}
+
+#endif
diff --git a/Source/InputMethodController.mm b/Source/InputMethodController.mm
index d17ad3e6..a7fe79a0 100644
--- a/Source/InputMethodController.mm
+++ b/Source/InputMethodController.mm
@@ -610,13 +610,11 @@ NS_INLINE size_t max(size_t a, size_t b) { return a > b ? a : b; }
     [string appendString:reading];
     [string appendString:@" "];
     NSMutableArray *readingsArray = [[NSMutableArray alloc] init];
-    vector<std::string> v = _builder->readingsAtRange(begin,end);
+    vector<std::string> v = _builder->readingsAtRange(begin, end);
     for(vector<std::string>::iterator it_i=v.begin(); it_i!=v.end(); ++it_i) {
         [readingsArray addObject:[NSString stringWithUTF8String:it_i->c_str()]];
     }
     [string appendString:[readingsArray componentsJoinedByString:@"-"]];
-    [string appendString:@" "];
-    [string appendString:@"-1.0"];
     return string;
 }
 
diff --git a/Source/LanguageModelManager.mm b/Source/LanguageModelManager.mm
index 0af6d6eb..51951041 100644
--- a/Source/LanguageModelManager.mm
+++ b/Source/LanguageModelManager.mm
@@ -98,17 +98,45 @@ static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, McBopomo
         return NO;
     }
 
-    NSString *currentMarkedPhrase = [userPhrase stringByAppendingString:@"\n"];
-
+    BOOL shouldAddLineBreakAtFront = NO;
     NSString *path = [self userPhrasesDataPathMcBopomofo];
-    NSFileHandle *file = [NSFileHandle fileHandleForUpdatingAtPath:path];
-    if (!file) {
+
+    // If the file is non-empty and does not end with a line break, the new
+    // phrase must start on a fresh line instead of being glued to the last one.
+    if ([[NSFileManager defaultManager] fileExistsAtPath:path]) {
+        NSDictionary *attr = [[NSFileManager defaultManager] attributesOfItemAtPath:path error:nil];
+        // attr is nil when the lookup fails, which makes fileSize 0 below.
+        unsigned long long fileSize = [attr fileSize];
+        if (fileSize > 0) {
+            NSFileHandle *readFile = [NSFileHandle fileHandleForReadingAtPath:path];
+            if (readFile) {
+                [readFile seekToFileOffset:fileSize - 1];
+                NSData *lastByte = [readFile readDataToEndOfFile];
+                const char *bytes = (const char *)[lastByte bytes];
+                // The read can come back empty; never dereference in that case.
+                if ([lastByte length] > 0 && bytes[0] != '\n') {
+                    shouldAddLineBreakAtFront = YES;
+                }
+                [readFile closeFile];
+            }
+        }
+    }
+
+    NSMutableString *currentMarkedPhrase = [NSMutableString string];
+    if (shouldAddLineBreakAtFront) {
+        [currentMarkedPhrase appendString:@"\n"];
+    }
+    [currentMarkedPhrase appendString:userPhrase];
+    [currentMarkedPhrase appendString:@"\n"];
+
+    NSFileHandle *writeFile = [NSFileHandle fileHandleForUpdatingAtPath:path];
+    if (!writeFile) {
         return NO;
     }
-    [file seekToEndOfFile];
+    [writeFile seekToEndOfFile];
     NSData *data = [currentMarkedPhrase dataUsingEncoding:NSUTF8StringEncoding];
-    [file writeData:data];
-    [file closeFile];
+    [writeFile writeData:data];
+    [writeFile closeFile];
 
     [self loadUserPhrasesModel];
     return YES;
diff --git a/Source/McBopomofo-Bridging-Header.h b/Source/McBopomofo-Bridging-Header.h
index 51d288b9..084274e7 100644
--- a/Source/McBopomofo-Bridging-Header.h
+++ b/Source/McBopomofo-Bridging-Header.h
@@ -7,5 +7,4 @@
 @interface LanguageModelManager : NSObject
 + (void)loadDataModels;
 + (void)loadUserPhrasesModel;
-+ (BOOL)checkIfUserLanguageModelFilesExist;
 @end