Zonble: mgrLanguageModel // Adds UserPhrasesLM for user phrases.

Co-Authored-By: Weizhong Yang a.k.a zonble <zonble@gmail.com>
This commit is contained in:
ShikiSuen 2022-01-15 14:56:53 +08:00
parent e4aa9de5ab
commit 404539e33f
8 changed files with 358 additions and 85 deletions

View File

@ -0,0 +1,236 @@
//
// UserPhrasesLM.cpp
//
// Copyright (c) 2011-2022 The OpenVanilla Project.
//
// Contributors:
// Weizhong Yang (@zonble) @ OpenVanilla
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
#include "UserPhrasesLM.h"
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <fstream>
#include <unistd.h>
using namespace Formosa::Gramambular;
using namespace vChewing;
// Creates a model with nothing loaded: no descriptor (-1), no data pointer
// and a zero length.
UserPhrasesLM::UserPhrasesLM() : fd(-1), data(0), length(0) {}
// Releases whatever open() acquired, if a file is still loaded.
UserPhrasesLM::~UserPhrasesLM()
{
    if (!data) {
        return;
    }
    close();
}
// Loads a user-phrase file into memory and indexes its rows by key.
//
// File format, one entry per line:
//     VALUE<space>KEY[<space>anything...]<newline>
//   - empty lines are skipped;
//   - a line that starts with a space, or that ends before a space follows
//     the value, is a format error;
//   - anything after a second space is ignored up to the end of the line;
//   - the last line does not need a trailing newline.
//
// The file is copied into a private, zero-filled, writable mapping that is
// one byte larger than the file. That spare byte guarantees that the last
// key can be NUL-terminated even when the file does not end with '\n'.
// Separators are overwritten with NUL in place, and the Row pointers stored
// in keyRowMap point into this buffer until close() is called.
//
// `length` holds the size of that buffer (file size + 1), which is the size
// close() hands to munmap(). The descriptor stays open until close().
//
// Returns true on success. Returns false, with nothing left loaded, when a
// file is already loaded, the file cannot be opened / inspected / read, the
// buffer cannot be allocated, or the contents are malformed.
bool UserPhrasesLM::open(const char *path)
{
    // Refuse to load on top of an already-loaded file.
    if (data) {
        return false;
    }

    fd = ::open(path, O_RDONLY);
    if (fd == -1) {
        printf("open:: file not exist");
        return false;
    }

    struct stat sb;
    if (fstat(fd, &sb) == -1) {
        printf("open:: cannot open file");
        // Do not leak the descriptor on this early exit.
        ::close(fd);
        fd = -1;
        return false;
    }

    const size_t fileSize = static_cast<size_t>(sb.st_size);
    const size_t bufferSize = fileSize + 1; // +1: room for a final NUL

    // Anonymous mappings are zero-filled, so the spare byte is already NUL.
    // mmap reports failure with MAP_FAILED, never with a null pointer.
    void *buffer = mmap(NULL, bufferSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
    if (buffer == MAP_FAILED) {
        ::close(fd);
        fd = -1;
        return false;
    }
    data = buffer;
    length = bufferSize;

    // Copy the file into the buffer; read() may return short counts.
    char *begin = static_cast<char *>(data);
    size_t loaded = 0;
    while (loaded < fileSize) {
        const ssize_t got = ::read(fd, begin + loaded, fileSize - loaded);
        if (got < 0) {
            close();
            return false;
        }
        if (got == 0) {
            break; // the file shrank after fstat(); parse what we have
        }
        loaded += static_cast<size_t>(got);
    }

    char *head = begin;
    char *end = begin + loaded; // *end is always a readable NUL byte

    while (head != end) {
        // Start of a line: skip empty lines, reject a leading space.
        if (*head == '\n') {
            ++head;
            continue;
        }
        if (*head == ' ') {
            close();
            return false;
        }

        // Value column: runs up to the first space. Hitting a newline or
        // the end of the data first means the key column is missing.
        Row row;
        row.value = head;
        ++head;
        while (head != end && *head != ' ' && *head != '\n') {
            ++head;
        }
        if (head == end || *head == '\n') {
            close();
            return false;
        }
        *head = 0; // terminate the value
        ++head;

        // Key column: runs up to the next space, newline, or end of data.
        row.key = head;
        while (head != end && *head != ' ' && *head != '\n') {
            ++head;
        }
        if (head == end) {
            // The byte at `end` is the spare NUL, so the key is terminated.
            keyRowMap[row.key].push_back(row);
            break;
        }
        const char delimiter = *head;
        *head = 0; // terminate the key
        ++head;
        keyRowMap[row.key].push_back(row);

        // Trailing columns: ignore everything up to the end of the line.
        // The newline itself is consumed by the next loop iteration.
        if (delimiter == ' ') {
            while (head != end && *head != '\n') {
                ++head;
            }
        }
    }

    // Always register a row that maps a single space to a single space.
    static const char *space = " ";
    Row emptyRow;
    emptyRow.key = space;
    emptyRow.value = space;
    keyRowMap[space].push_back(emptyRow);
    return true;
}
// Unloads the current file, if any, and empties the index.
//
// The index is cleared first because its keys and values point into the
// mapping. All bookkeeping members are then reset so no stale descriptor or
// length survives the call. Calling close() when nothing is loaded is safe.
void UserPhrasesLM::close()
{
    keyRowMap.clear();
    if (data) {
        munmap(data, length);
        ::close(fd);
        data = 0;
        length = 0;
        fd = -1;
    }
}
void UserPhrasesLM::dump()
{
size_t rows = 0;
for (map<const char *, vector<Row> >::const_iterator i = keyRowMap.begin(), e = keyRowMap.end(); i != e; ++i) {
const vector<Row>& r = (*i).second;
for (vector<Row>::const_iterator ri = r.begin(), re = r.end(); ri != re; ++ri) {
const Row& row = *ri;
cerr << row.key << " " << row.value << "\n";
rows++;
}
}
}
// Always returns an empty list; both arguments are ignored.
const vector<Bigram> UserPhrasesLM::bigramsForKeys(const string& preceedingKey, const string& key)
{
    vector<Bigram> noBigrams;
    return noBigrams;
}
// Returns one Unigram per row stored under `key`, in file order, each with
// a score of 0.0. Returns an empty list when the key is unknown.
const vector<Unigram> UserPhrasesLM::unigramsForKey(const string& key)
{
    // Spell out the comparator: the iterator type must belong to the same
    // map specialization as the keyRowMap member.
    typedef map<const char *, vector<Row>, CStringCmp> KeyRowMap;

    vector<Unigram> result;
    KeyRowMap::const_iterator found = keyRowMap.find(key.c_str());
    if (found == keyRowMap.end()) {
        return result;
    }

    const vector<Row>& rows = found->second;
    result.reserve(rows.size());
    for (vector<Row>::const_iterator row = rows.begin(), last = rows.end(); row != last; ++row) {
        Unigram g;
        g.keyValue.key = row->key;
        g.keyValue.value = row->value;
        g.score = 0.0;
        result.push_back(g);
    }
    return result;
}
// Tells whether at least one row is stored under `key`.
bool UserPhrasesLM::hasUnigramsForKey(const string& key)
{
    const char *wanted = key.c_str();
    return keyRowMap.count(wanted) != 0;
}

View File

@ -0,0 +1,81 @@
//
// UserPhrasesLM.h
//
// Copyright (c) 2011-2022 The OpenVanilla Project.
//
// Contributors:
// Weizhong Yang (@zonble) @ OpenVanilla
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
#ifndef USERPHRASESLM_H
#define USERPHRASESLM_H
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <map>
#include <string>
#include <vector>
#include "LanguageModel.h"
namespace vChewing {
using namespace Formosa::Gramambular;
// Language model that serves unigrams from a plain-text user phrase file.
//
// open() loads a file and fills keyRowMap; the C strings referenced by the
// map live in the memory block that `data` points to, so they are only valid
// between a successful open() and the next close().
//
// The object owns a memory mapping and a file descriptor, so it must not be
// copied: a copy would release both a second time. Copying is disabled below.
class UserPhrasesLM : public LanguageModel
{
public:
    UserPhrasesLM();
    ~UserPhrasesLM();

    // Loads the file at `path`; returns false on failure or if already open.
    bool open(const char *path);
    // Unloads the current file (if any) and empties the index.
    void close();
    // Prints every stored row to cerr.
    void dump();

    virtual const vector<Bigram> bigramsForKeys(const string& preceedingKey, const string& key);
    virtual const vector<Unigram> unigramsForKey(const string& key);
    virtual bool hasUnigramsForKey(const string& key);

protected:
    // Orders map keys by string content rather than by pointer value.
    struct CStringCmp
    {
        bool operator()(const char* s1, const char* s2) const
        {
            return strcmp(s1, s2) < 0;
        }
    };

    // One parsed entry; both pointers are non-owning C strings.
    struct Row {
        const char *key;
        const char *value;
    };

    // key -> all rows that share that key.
    map<const char *, vector<Row>, CStringCmp> keyRowMap;
    // Descriptor of the loaded file; the constructor sets it to -1.
    int fd;
    // Start of the loaded memory block, or 0 when nothing is loaded.
    void *data;
    // Byte count handed to munmap() by close().
    size_t length;

private:
    // Not copyable (declared, intentionally never defined).
    UserPhrasesLM(const UserPhrasesLM&);
    UserPhrasesLM& operator=(const UserPhrasesLM&);
};
}
#endif

View File

@ -39,6 +39,7 @@
#include <stdio.h>
#include "FastLM.h"
#include "UserPhrasesLM.h"
namespace vChewing {
@ -59,8 +60,8 @@ public:
protected:
FastLM m_languageModel;
FastLM m_userPhrases;
FastLM m_excludedPhrases;
UserPhrasesLM m_userPhrases;
UserPhrasesLM m_excludedPhrases;
};
};

View File

@ -618,8 +618,6 @@ NS_INLINE size_t max(size_t a, size_t b) { return a > b ? a : b; }
[readingsArray addObject:[NSString stringWithUTF8String:it_i->c_str()]];
}
[string appendString:[readingsArray componentsJoinedByString:@"-"]];
[string appendString:@" "];
[string appendString:@"-1.0"];
return string;
}

View File

@ -134,17 +134,42 @@ static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, vChewing
return NO;
}
NSString *currentMarkedPhrase = [userPhrase stringByAppendingString:@"\n"];
BOOL shuoldAddLineBreakAtFront = NO;
NSString *path = [self userPhrasesDataPathBopomofo];
NSFileHandle *file = [NSFileHandle fileHandleForUpdatingAtPath:path];
if (!file) {
if ([[NSFileManager defaultManager] fileExistsAtPath:path]) {
NSError *error = nil;
NSDictionary *attr = [[NSFileManager defaultManager] attributesOfItemAtPath:path error:&error];
unsigned long long fileSize = [attr fileSize];
if (!error && fileSize) {
NSFileHandle *readFile = [NSFileHandle fileHandleForReadingAtPath:path];
if (readFile) {
[readFile seekToFileOffset:fileSize - 1];
NSData *data = [readFile readDataToEndOfFile];
const void *bytes = [data bytes];
if (*(char *)bytes != '\n') {
shuoldAddLineBreakAtFront = YES;
}
[readFile closeFile];
}
}
}
NSMutableString *currentMarkedPhrase = [NSMutableString string];
if (shuoldAddLineBreakAtFront) {
[currentMarkedPhrase appendString:@"\n"];
}
[currentMarkedPhrase appendString:userPhrase];
[currentMarkedPhrase appendString:@"\n"];
NSFileHandle *writeFile = [NSFileHandle fileHandleForUpdatingAtPath:path];
if (!writeFile) {
return NO;
}
[file seekToEndOfFile];
[writeFile seekToEndOfFile];
NSData *data = [currentMarkedPhrase dataUsingEncoding:NSUTF8StringEncoding];
[file writeData:data];
[file closeFile];
[writeFile writeData:data];
[writeFile closeFile];
[self loadUserPhrasesModel];
return YES;

View File

@ -1,71 +0,0 @@
// shared language model object that stores our phrase-term probability database
FastLM gLanguageModelCHT;
FastLM gLanguageModelCHS;
// Second pair of FastLM instances; by their names these hold the user's own
// phrases for each variant (CHT / CHS) — confirm against the loading code.
FastLM gUserPhraseLanguageModelCHT;
FastLM gUserPhraseLanguageModelCHS;
// The two constants below are the constructor arguments shared by both
// UserOverrideModel instances.
static const int kUserOverrideModelCapacity = 500;
// 5400.0 matches the "1.5 hr" note if the unit is seconds — presumably so; verify in UserOverrideModel.
static const double kObservedOverrideHalflife = 5400.0; // 1.5 hr.
vChewing::UserOverrideModel gUserOverrideModelCHT(kUserOverrideModelCapacity, kObservedOverrideHalflife);
vChewing::UserOverrideModel gUserOverrideModelCHS(kUserOverrideModelCapacity, kObservedOverrideHalflife);
// Returns "<user Application Support directory>/vChewing".
//
// The second argument of NSSearchPathForDirectoriesInDomains is a domain
// mask, so it must be NSUserDomainMask. NSUserDirectory is a directory
// constant and, used as a mask, silently selects several domains at once.
static NSString *LTUserDataFolderPath()
{
    NSArray *paths = NSSearchPathForDirectoriesInDomains(NSApplicationSupportDirectory, NSUserDomainMask, YES);
    NSString *appSupportPath = [paths objectAtIndex:0];
    return [appSupportPath stringByAppendingPathComponent:@"vChewing"];
}
// Path of "userdata-cht.txt" inside the user data folder.
static NSString *LTUserPhrasesDataPathCHT()
{
    NSString *folder = LTUserDataFolderPath();
    return [folder stringByAppendingPathComponent:@"userdata-cht.txt"];
}
// Path of "userdata-chs.txt" inside the user data folder.
static NSString *LTUserPhrasesDataPathCHS()
{
    NSString *folder = LTUserDataFolderPath();
    return [folder stringByAppendingPathComponent:@"userdata-chs.txt"];
}
// Makes sure the user data folder and both user phrase files exist.
//
// Steps:
//   1. If something that is not a directory sits at the folder path, remove it.
//   2. Create the folder (with intermediate directories) if it is missing.
//   3. Create an empty CHS file and an empty CHT file if they are missing.
//
// Returns YES when everything is in place, NO as soon as one step fails.
// Success is judged by each call's BOOL result; the NSError is only read
// after a failure, for logging.
static BOOL LTCheckIfUserLanguageModelFileExists() {
    NSFileManager *fileManager = [NSFileManager defaultManager];
    NSString *folderPath = LTUserDataFolderPath();
    BOOL isFolder = NO;
    BOOL folderExist = [fileManager fileExistsAtPath:folderPath isDirectory:&isFolder];

    if (folderExist && !isFolder) {
        NSError *error = nil;
        if (![fileManager removeItemAtPath:folderPath error:&error]) {
            NSLog(@"Failed to remove folder %@", error);
            return NO;
        }
        folderExist = NO;
    }

    if (!folderExist) {
        NSError *error = nil;
        if (![fileManager createDirectoryAtPath:folderPath withIntermediateDirectories:YES attributes:nil error:&error]) {
            NSLog(@"Failed to create folder %@", error);
            return NO;
        }
    }

    NSString *filePathCHS = LTUserPhrasesDataPathCHS();
    if (![fileManager fileExistsAtPath:filePathCHS]) {
        BOOL result = [[@"" dataUsingEncoding:NSUTF8StringEncoding] writeToFile:filePathCHS atomically:YES];
        if (!result) {
            NSLog(@"Failed to write userdict CHS file");
            return NO;
        }
    }

    NSString *filePathCHT = LTUserPhrasesDataPathCHT();
    if (![fileManager fileExistsAtPath:filePathCHT]) {
        BOOL result = [[@"" dataUsingEncoding:NSUTF8StringEncoding] writeToFile:filePathCHT atomically:YES];
        if (!result) {
            NSLog(@"Failed to write userdict CHT file");
            return NO;
        }
    }

    return YES;
}

View File

@ -7,5 +7,4 @@
// Declares class-level (`+`) methods only; no instance methods or properties
// appear in this interface. The implementations are not part of this excerpt.
@interface LanguageModelManager : NSObject
+ (void)loadDataModels;
+ (void)loadUserPhrasesModel;
// Returns a BOOL; presumably YES when the user model files exist or could be created — TODO confirm in the .mm file.
+ (BOOL)checkIfUserLanguageModelFilesExist;
@end

View File

@ -14,6 +14,7 @@
5B58E87F278413E7003EA2AD /* MITLicense.txt in Resources */ = {isa = PBXBuildFile; fileRef = 5B58E87D278413E7003EA2AD /* MITLicense.txt */; };
5B5F4F8E27928F9300922DC2 /* vChewingLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F8D27928F9300922DC2 /* vChewingLM.cpp */; };
5B5F4F93279294A300922DC2 /* LanguageModelManager.mm in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F92279294A300922DC2 /* LanguageModelManager.mm */; };
5B5F4F972792A4EA00922DC2 /* UserPhrasesLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5B5F4F962792A4EA00922DC2 /* UserPhrasesLM.cpp */; };
5BC3EE1B278FC48C00F5E44C /* VerticalCandidateController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC3EE18278FC48C00F5E44C /* VerticalCandidateController.swift */; };
5BC3EE1C278FC48C00F5E44C /* VTCandidateController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC3EE19278FC48C00F5E44C /* VTCandidateController.swift */; };
5BC3EE1D278FC48C00F5E44C /* HorizontalCandidateController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BC3EE1A278FC48C00F5E44C /* HorizontalCandidateController.swift */; };
@ -89,7 +90,8 @@
5B5F4F8D27928F9300922DC2 /* vChewingLM.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = vChewingLM.cpp; sourceTree = "<group>"; };
5B5F4F91279294A300922DC2 /* LanguageModelManager.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LanguageModelManager.h; sourceTree = "<group>"; };
5B5F4F92279294A300922DC2 /* LanguageModelManager.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = LanguageModelManager.mm; sourceTree = "<group>"; };
5B5F4F9427929ADC00922DC2 /* Shit4Migration.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = Shit4Migration.txt; sourceTree = "<group>"; };
5B5F4F952792A4EA00922DC2 /* UserPhrasesLM.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = UserPhrasesLM.h; sourceTree = "<group>"; };
5B5F4F962792A4EA00922DC2 /* UserPhrasesLM.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = UserPhrasesLM.cpp; sourceTree = "<group>"; };
5B9781D32763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "zh-Hans.lproj/InfoPlist.strings"; sourceTree = "<group>"; };
5B9781D52763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "zh-Hans.lproj/Localizable.strings"; sourceTree = "<group>"; };
5B9781D72763850700897999 /* zh-Hans */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = "zh-Hans"; path = "Source/zh-Hans.lproj/InfoPlist.strings"; sourceTree = "<group>"; };
@ -245,6 +247,8 @@
6A0421A715FEF3F50061ED63 /* FastLM.h */,
5B42B63E27876FDC00BB9B9F /* UserOverrideModel.cpp */,
5B42B63F27876FDC00BB9B9F /* UserOverrideModel.h */,
5B5F4F962792A4EA00922DC2 /* UserPhrasesLM.cpp */,
5B5F4F952792A4EA00922DC2 /* UserPhrasesLM.h */,
);
path = LanguageModel;
sourceTree = "<group>";
@ -292,7 +296,6 @@
5BF4A6FC27844738007DC6E7 /* frmAboutWindow.m */,
6A0D4EC615FC0D6400ABF4B3 /* InputMethodController.h */,
6A0D4EC715FC0D6400ABF4B3 /* InputMethodController.mm */,
5B5F4F9427929ADC00922DC2 /* Shit4Migration.txt */,
5B5F4F91279294A300922DC2 /* LanguageModelManager.h */,
5B5F4F92279294A300922DC2 /* LanguageModelManager.mm */,
6A0D4EC815FC0D6400ABF4B3 /* main.m */,
@ -642,6 +645,7 @@
6A0D4ED215FC0D6400ABF4B3 /* InputMethodController.mm in Sources */,
6A0D4ED315FC0D6400ABF4B3 /* main.m in Sources */,
5BF4A6FE27844738007DC6E7 /* frmAboutWindow.m in Sources */,
5B5F4F972792A4EA00922DC2 /* UserPhrasesLM.cpp in Sources */,
5B5F4F8E27928F9300922DC2 /* vChewingLM.cpp in Sources */,
5BDF2D062791DFF200838ADB /* AppDelegate.swift in Sources */,
5BC3EE1B278FC48C00F5E44C /* VerticalCandidateController.swift in Sources */,