diff --git a/Source/Engine/LanguageModel/UserPhrasesLM.cpp b/Source/Engine/LanguageModel/UserPhrasesLM.cpp new file mode 100644 index 00000000..7c88f9e9 --- /dev/null +++ b/Source/Engine/LanguageModel/UserPhrasesLM.cpp @@ -0,0 +1,236 @@ +// +// UserPhraseLM.cpp +// +// Copyright (c) 2011-2022 The OpenVanilla Project. +// +// Contributors: +// Weizhong Yang (@zonble) @ OpenVanilla +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+//
+
+#include "UserPhrasesLM.h"
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+
+using namespace Formosa::Gramambular;
+using namespace vChewing;
+
+// Reads exactly `size` bytes from `descriptor` into `buffer`, retrying after
+// short reads and EINTR. Returns false on an I/O error or when the file ends
+// before `size` bytes were delivered.
+static bool readFully(int descriptor, char *buffer, size_t size)
+{
+    size_t done = 0;
+    while (done < size) {
+        const ssize_t got = ::read(descriptor, buffer + done, size - done);
+        if (got > 0) {
+            done += static_cast<size_t>(got);
+        } else if (got == 0) {
+            // The file is shorter than fstat() reported.
+            return false;
+        } else if (errno != EINTR) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Creates a closed model: no descriptor, no buffer, no rows.
+UserPhrasesLM::UserPhrasesLM()
+    : fd(-1)
+    , data(0)
+    , length(0)
+{
+}
+
+// Releases whatever open() acquired. close() is safe to call on a closed model.
+UserPhrasesLM::~UserPhrasesLM()
+{
+    close();
+}
+
+// Loads the phrase file at `path` and indexes its rows by key.
+//
+// File format: one entry per line, "<value> <key>". Anything after a second
+// space is ignored up to the end of the line, and blank lines are skipped.
+// A line that begins with a space, or that has a value but no key, is
+// malformed and makes the whole load fail.
+//
+// Returns false, leaving the current contents untouched, when a file is
+// already loaded. Returns false with the model closed and empty when the file
+// cannot be opened or read, or when it is malformed. Returns true otherwise;
+// an empty file is valid and yields only the " " placeholder row.
+bool UserPhrasesLM::open(const char *path)
+{
+    if (data) {
+        return false;
+    }
+
+    fd = ::open(path, O_RDONLY);
+    if (fd == -1) {
+        printf("open:: file not exist");
+        return false;
+    }
+
+    struct stat sb;
+    if (fstat(fd, &sb) == -1) {
+        printf("open:: cannot open file");
+        // Do not leak the descriptor on this path.
+        ::close(fd);
+        fd = -1;
+        return false;
+    }
+
+    length = static_cast<size_t>(sb.st_size);
+
+    // The parser terminates tokens in place and the last token may run up to
+    // the final byte of the file, so the contents are loaded into a buffer
+    // that is one byte larger than the file. Mapping the file directly leaves
+    // no room for that terminator when the size is a multiple of the page
+    // size, and cannot represent an empty file at all.
+    char *buffer = static_cast<char *>(malloc(length + 1));
+    const bool loaded = buffer && readFully(fd, buffer, length);
+
+    // The descriptor is only needed while reading.
+    ::close(fd);
+    fd = -1;
+
+    if (!loaded) {
+        free(buffer);
+        length = 0;
+        return false;
+    }
+    buffer[length] = 0;
+    data = buffer;
+
+    char *cursor = buffer;
+    char *const end = buffer + length;
+
+    while (cursor != end) {
+        if (*cursor == '\n') {
+            // Blank line.
+            ++cursor;
+            continue;
+        }
+        if (*cursor == ' ') {
+            // A line must not begin with a space.
+            close();
+            return false;
+        }
+
+        // First column: the value, terminated by a single space.
+        Row row;
+        row.value = cursor++;
+        while (cursor != end && *cursor != ' ' && *cursor != '\n') {
+            ++cursor;
+        }
+        if (cursor == end || *cursor == '\n') {
+            // Value without a key.
+            close();
+            return false;
+        }
+        *cursor++ = 0;
+
+        // Second column: the key, terminated by a space, a newline or the end
+        // of the file. At the end of the file the spare byte appended above
+        // already terminates it.
+        row.key = cursor;
+        while (cursor != end && *cursor != ' ' && *cursor != '\n') {
+            ++cursor;
+        }
+        const bool hasTrailingColumns = (cursor != end && *cursor == ' ');
+        if (cursor != end) {
+            *cursor++ = 0;
+        }
+        keyRowMap[row.key].push_back(row);
+
+        // Ignore everything else on the line; the newline itself is consumed
+        // by the blank-line branch on the next iteration.
+        if (hasTrailingColumns) {
+            while (cursor != end && *cursor != '\n') {
+                ++cursor;
+            }
+        }
+    }
+
+    // Every successfully loaded model also carries a " " -> " " row.
+    static const char *space = " ";
+    Row emptyRow;
+    emptyRow.key = space;
+    emptyRow.value = space;
+    keyRowMap[space].push_back(emptyRow);
+
+    return true;
+}
+
+// Drops the index and releases the buffer. Safe to call repeatedly.
+void UserPhrasesLM::close()
+{
+    // The rows point into the buffer, so they go first.
+    keyRowMap.clear();
+
+    if (data) {
+        free(data);
+        data = 0;
+    }
+    if (fd != -1) {
+        ::close(fd);
+        fd = -1;
+    }
+    length = 0;
+}
+
+// Writes every row to stderr as "<key> <value>", one per line.
+void UserPhrasesLM::dump()
+{
+    // Spell out the comparator so the iterator type matches keyRowMap exactly.
+    typedef map<const char *, vector<Row>, CStringCmp> KeyRowMap;
+
+    for (KeyRowMap::const_iterator entry = keyRowMap.begin(), last = keyRowMap.end(); entry != last; ++entry) {
+        const vector<Row>& rows = entry->second;
+        for (vector<Row>::const_iterator row = rows.begin(), rowsEnd = rows.end(); row != rowsEnd; ++row) {
+            cerr << row->key << " " << row->value << "\n";
+        }
+    }
+}
+
+// User phrases carry no bigram data; the result is always empty.
+const vector<Bigram> UserPhrasesLM::bigramsForKeys(const string& preceedingKey, const string& key)
+{
+    return vector<Bigram>();
+}
+
+// Returns one unigram, with score 0.0, for each row stored under `key`, in
+// file order. The result is empty when the key is unknown.
+const vector<Unigram> UserPhrasesLM::unigramsForKey(const string& key)
+{
+    typedef map<const char *, vector<Row>, CStringCmp> KeyRowMap;
+
+    vector<Unigram> v;
+    KeyRowMap::const_iterator found = keyRowMap.find(key.c_str());
+    if (found == keyRowMap.end()) {
+        return v;
+    }
+
+    const vector<Row>& rows = found->second;
+    for (vector<Row>::const_iterator row = rows.begin(), rowsEnd = rows.end(); row != rowsEnd; ++row) {
+        Unigram g;
+        g.keyValue.key = row->key;
+        g.keyValue.value = row->value;
+        g.score = 0.0;
+        v.push_back(g);
+    }
+
+    return v;
+}
+
+// Reports whether at least one row is stored under `key`.
+bool UserPhrasesLM::hasUnigramsForKey(const string& key)
+{
+    return keyRowMap.find(key.c_str()) != keyRowMap.end();
+}
diff --git a/Source/Engine/LanguageModel/UserPhrasesLM.h b/Source/Engine/LanguageModel/UserPhrasesLM.h
new file mode 100644
index 00000000..2ff27a30
--- /dev/null
+++ b/Source/Engine/LanguageModel/UserPhrasesLM.h
@@ -0,0 +1,81 @@
+//
+// UserPhraseLM.h
+//
+// Copyright (c) 2011-2022 The OpenVanilla Project.
+// +// Contributors: +// Weizhong Yang (@zonble) @ OpenVanilla +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+//
+
+#ifndef USERPHRASESLM_H
+#define USERPHRASESLM_H
+
+#include <stdio.h>
+
+#include <cstring>
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+#include "LanguageModel.h"
+
+namespace vChewing {
+
+using namespace Formosa::Gramambular;
+
+// Language model backed by a plain-text user phrase file.
+//
+// open() loads a file of "<value> <key>" lines and indexes the rows by key;
+// the LanguageModel queries are then answered from that index. Every unigram
+// is reported with score 0.0 and no bigrams are ever reported.
+class UserPhrasesLM : public LanguageModel
+{
+public:
+    UserPhrasesLM();
+    ~UserPhrasesLM();
+
+    // Loads the phrase file at `path`. Returns false when a file is already
+    // loaded, or when the file cannot be opened or is malformed.
+    bool open(const char *path);
+    // Releases the loaded file and empties the index.
+    void close();
+    // Writes every row to stderr as "<key> <value>".
+    void dump();
+
+    // Always returns an empty list.
+    virtual const vector<Bigram> bigramsForKeys(const string& preceedingKey, const string& key);
+    // Returns one unigram per row stored under `key`.
+    virtual const vector<Unigram> unigramsForKey(const string& key);
+    // Reports whether any row is stored under `key`.
+    virtual bool hasUnigramsForKey(const string& key);
+
+protected:
+    // Orders C strings by their contents instead of by pointer value, so that
+    // lookups with a caller-supplied string find keys stored in the file data.
+    struct CStringCmp
+    {
+        bool operator()(const char* s1, const char* s2) const
+        {
+            return strcmp(s1, s2) < 0;
+        }
+    };
+
+    // One parsed line. Both pointers refer to NUL-terminated text that this
+    // object does not copy, so a Row is only valid while the file stays loaded.
+    struct Row {
+        const char *key;
+        const char *value;
+    };
+
+    // Rows grouped by key, in file order within each key.
+    map<const char *, vector<Row>, CStringCmp> keyRowMap;
+    // Descriptor used by open(); initialised to -1.
+    int fd;
+    // Start of the loaded file contents; 0 while nothing is loaded.
+    void *data;
+    // Size in bytes of the loaded file contents.
+    size_t length;
+
+private:
+    // Not copyable: the object owns `fd` and `data`, and a member-wise copy
+    // would release both twice. Declared but intentionally left undefined.
+    UserPhrasesLM(const UserPhrasesLM&);
+    UserPhrasesLM& operator=(const UserPhrasesLM&);
+};
+
+}
+
+#endif
diff --git a/Source/Engine/LanguageModel/vChewingLM.h b/Source/Engine/LanguageModel/vChewingLM.h index 29fb1587..0d57fcea 100644 --- a/Source/Engine/LanguageModel/vChewingLM.h +++ b/Source/Engine/LanguageModel/vChewingLM.h @@ -39,6 +39,7 @@ #include #include "FastLM.h" +#include "UserPhrasesLM.h" namespace vChewing { @@ -59,8 +60,8 @@ public: protected: FastLM m_languageModel; - FastLM m_userPhrases; - FastLM m_excludedPhrases; + UserPhrasesLM m_userPhrases; + UserPhrasesLM m_excludedPhrases; }; }; diff --git a/Source/InputMethodController.mm b/Source/InputMethodController.mm index c07c518d..efb15c5b 100644 --- a/Source/InputMethodController.mm +++ b/Source/InputMethodController.mm @@ -618,8 +618,6 @@ NS_INLINE size_t max(size_t a, size_t b) { return a > b ? 
a : b; } [readingsArray addObject:[NSString stringWithUTF8String:it_i->c_str()]]; } [string appendString:[readingsArray componentsJoinedByString:@"-"]]; - [string appendString:@" "]; - [string appendString:@"-1.0"]; return string; } diff --git a/Source/LanguageModelManager.mm b/Source/LanguageModelManager.mm index a955bf97..9ceed6c1 100644 --- a/Source/LanguageModelManager.mm +++ b/Source/LanguageModelManager.mm @@ -134,17 +134,42 @@ static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, vChewing return NO; } - NSString *currentMarkedPhrase = [userPhrase stringByAppendingString:@"\n"]; - + BOOL shuoldAddLineBreakAtFront = NO; NSString *path = [self userPhrasesDataPathBopomofo]; - NSFileHandle *file = [NSFileHandle fileHandleForUpdatingAtPath:path]; - if (!file) { + + if ([[NSFileManager defaultManager] fileExistsAtPath:path]) { + NSError *error = nil; + NSDictionary *attr = [[NSFileManager defaultManager] attributesOfItemAtPath:path error:&error]; + unsigned long long fileSize = [attr fileSize]; + if (!error && fileSize) { + NSFileHandle *readFile = [NSFileHandle fileHandleForReadingAtPath:path]; + if (readFile) { + [readFile seekToFileOffset:fileSize - 1]; + NSData *data = [readFile readDataToEndOfFile]; + const void *bytes = [data bytes]; + if (*(char *)bytes != '\n') { + shuoldAddLineBreakAtFront = YES; + } + [readFile closeFile]; + } + } + } + + NSMutableString *currentMarkedPhrase = [NSMutableString string]; + if (shuoldAddLineBreakAtFront) { + [currentMarkedPhrase appendString:@"\n"]; + } + [currentMarkedPhrase appendString:userPhrase]; + [currentMarkedPhrase appendString:@"\n"]; + + NSFileHandle *writeFile = [NSFileHandle fileHandleForUpdatingAtPath:path]; + if (!writeFile) { return NO; } - [file seekToEndOfFile]; + [writeFile seekToEndOfFile]; NSData *data = [currentMarkedPhrase dataUsingEncoding:NSUTF8StringEncoding]; - [file writeData:data]; - [file closeFile]; + [writeFile writeData:data]; + [writeFile closeFile]; [self 
loadUserPhrasesModel]; return YES; diff --git a/Source/Shit4Migration.txt b/Source/Shit4Migration.txt deleted file mode 100644 index 9d4fea3f..00000000 --- a/Source/Shit4Migration.txt +++ /dev/null @@ -1,71 +0,0 @@ -// shared language model object that stores our phrase-term probability database -FastLM gLanguageModelCHT; -FastLM gLanguageModelCHS; -FastLM gUserPhraseLanguageModelCHT; -FastLM gUserPhraseLanguageModelCHS; - -static const int kUserOverrideModelCapacity = 500; -static const double kObservedOverrideHalflife = 5400.0; // 1.5 hr. -vChewing::UserOverrideModel gUserOverrideModelCHT(kUserOverrideModelCapacity, kObservedOverrideHalflife); -vChewing::UserOverrideModel gUserOverrideModelCHS(kUserOverrideModelCapacity, kObservedOverrideHalflife); - -static NSString *LTUserDataFolderPath() -{ - NSArray *paths = NSSearchPathForDirectoriesInDomains(NSApplicationSupportDirectory, NSUserDirectory, YES); - NSString *appSupportPath = [paths objectAtIndex:0]; - NSString *userDictPath = [appSupportPath stringByAppendingPathComponent:@"vChewing"]; - return userDictPath; -} - -static NSString *LTUserPhrasesDataPathCHT() -{ - return [LTUserDataFolderPath() stringByAppendingPathComponent:@"userdata-cht.txt"]; -} - -static NSString *LTUserPhrasesDataPathCHS() -{ - return [LTUserDataFolderPath() stringByAppendingPathComponent:@"userdata-chs.txt"]; -} - -static BOOL LTCheckIfUserLanguageModelFileExists() { - - NSString *folderPath = LTUserDataFolderPath(); - BOOL isFolder = NO; - BOOL folderExist = [[NSFileManager defaultManager] fileExistsAtPath:folderPath isDirectory:&isFolder]; - if (folderExist && !isFolder) { - NSError *error = nil; - [[NSFileManager defaultManager] removeItemAtPath:folderPath error:&error]; - if (error) { - NSLog(@"Failed to remove folder %@", error); - return NO; - } - folderExist = NO; - } - if (!folderExist) { - NSError *error = nil; - [[NSFileManager defaultManager] createDirectoryAtPath:folderPath withIntermediateDirectories:YES attributes:nil 
error:&error]; - if (error) { - NSLog(@"Failed to create folder %@", error); - return NO; - } - } - NSString *filePathCHS = LTUserPhrasesDataPathCHS(); - if (![[NSFileManager defaultManager] fileExistsAtPath:filePathCHS]) { - BOOL result = [[@"" dataUsingEncoding:NSUTF8StringEncoding] writeToFile:filePathCHS atomically:YES]; - if (!result) { - NSLog(@"Failed to write userdict CHS file"); - return NO; - } - } - NSString *filePathCHT = LTUserPhrasesDataPathCHT(); - if (![[NSFileManager defaultManager] fileExistsAtPath:filePathCHT]) { - BOOL result = [[@"" dataUsingEncoding:NSUTF8StringEncoding] writeToFile:filePathCHT atomically:YES]; - if (!result) { - NSLog(@"Failed to write userdict CHT file"); - return NO; - } - } - return YES; -} - - diff --git a/Source/vChewing-Bridging-Header.h b/Source/vChewing-Bridging-Header.h index 95c5f4f2..e4c62d3f 100644 --- a/Source/vChewing-Bridging-Header.h +++ b/Source/vChewing-Bridging-Header.h @@ -7,5 +7,4 @@ @interface LanguageModelManager : NSObject + (void)loadDataModels; + (void)loadUserPhrasesModel; -+ (BOOL)checkIfUserLanguageModelFilesExist; @end diff --git a/vChewing.xcodeproj/project.pbxproj b/vChewing.xcodeproj/project.pbxproj index bb2e0901..e1d75a29 100644 --- a/vChewing.xcodeproj/project.pbxproj +++ b/vChewing.xcodeproj/project.pbxproj @@ -14,6 +14,7 @@ 5B58E87F278413E7003EA2AD /* MITLicense.txt in Resources */ = {isa = PBXBuildFile; fileRef = 5B58E87D278413E7003EA2AD /* MITLicense.txt */; }; 5B5F4F8E27928F9300922DC2 /* vChewingLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F8D27928F9300922DC2 /* vChewingLM.cpp */; }; 5B5F4F93279294A300922DC2 /* LanguageModelManager.mm in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F92279294A300922DC2 /* LanguageModelManager.mm */; }; + 5B5F4F972792A4EA00922DC2 /* UserPhrasesLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F962792A4EA00922DC2 /* UserPhrasesLM.cpp */; }; 5BC3EE1B278FC48C00F5E44C /* VerticalCandidateController.swift in Sources */ = {isa = 
PBXBuildFile; fileRef = 5BC3EE18278FC48C00F5E44C /* VerticalCandidateController.swift */; }; 5BC3EE1C278FC48C00F5E44C /* VTCandidateController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC3EE19278FC48C00F5E44C /* VTCandidateController.swift */; }; 5BC3EE1D278FC48C00F5E44C /* HorizontalCandidateController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC3EE1A278FC48C00F5E44C /* HorizontalCandidateController.swift */; }; @@ -89,7 +90,8 @@ 5B5F4F8D27928F9300922DC2 /* vChewingLM.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = vChewingLM.cpp; sourceTree = ""; }; 5B5F4F91279294A300922DC2 /* LanguageModelManager.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LanguageModelManager.h; sourceTree = ""; }; 5B5F4F92279294A300922DC2 /* LanguageModelManager.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = LanguageModelManager.mm; sourceTree = ""; }; - 5B5F4F9427929ADC00922DC2 /* Shit4Migration.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = Shit4Migration.txt; sourceTree = ""; }; + 5B5F4F952792A4EA00922DC2 /* UserPhrasesLM.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = UserPhrasesLM.h; sourceTree = ""; }; + 5B5F4F962792A4EA00922DC2 /* UserPhrasesLM.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = UserPhrasesLM.cpp; sourceTree = ""; }; 5B9781D32763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "zh-Hans.lproj/InfoPlist.strings"; sourceTree = ""; }; 5B9781D52763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "zh-Hans.lproj/Localizable.strings"; sourceTree = ""; }; 5B9781D72763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "Source/zh-Hans.lproj/InfoPlist.strings"; sourceTree = ""; }; 
@@ -245,6 +247,8 @@ 6A0421A715FEF3F50061ED63 /* FastLM.h */, 5B42B63E27876FDC00BB9B9F /* UserOverrideModel.cpp */, 5B42B63F27876FDC00BB9B9F /* UserOverrideModel.h */, + 5B5F4F962792A4EA00922DC2 /* UserPhrasesLM.cpp */, + 5B5F4F952792A4EA00922DC2 /* UserPhrasesLM.h */, ); path = LanguageModel; sourceTree = ""; @@ -292,7 +296,6 @@ 5BF4A6FC27844738007DC6E7 /* frmAboutWindow.m */, 6A0D4EC615FC0D6400ABF4B3 /* InputMethodController.h */, 6A0D4EC715FC0D6400ABF4B3 /* InputMethodController.mm */, - 5B5F4F9427929ADC00922DC2 /* Shit4Migration.txt */, 5B5F4F91279294A300922DC2 /* LanguageModelManager.h */, 5B5F4F92279294A300922DC2 /* LanguageModelManager.mm */, 6A0D4EC815FC0D6400ABF4B3 /* main.m */, @@ -642,6 +645,7 @@ 6A0D4ED215FC0D6400ABF4B3 /* InputMethodController.mm in Sources */, 6A0D4ED315FC0D6400ABF4B3 /* main.m in Sources */, 5BF4A6FE27844738007DC6E7 /* frmAboutWindow.m in Sources */, + 5B5F4F972792A4EA00922DC2 /* UserPhrasesLM.cpp in Sources */, 5B5F4F8E27928F9300922DC2 /* vChewingLM.cpp in Sources */, 5BDF2D062791DFF200838ADB /* AppDelegate.swift in Sources */, 5BC3EE1B278FC48C00F5E44C /* VerticalCandidateController.swift in Sources */,