Adds UserPhrasesLM for user phrases.

Since there is no probability information for users' custom phrases,
they should be stored in a format that differs from data.txt. Reusing the
same format and FastLM to parse user phrases was done only out of
laziness; it is not the right way.

The pull request adds a new language model class to parse user phrases.
It also updates the input method controller to adopt the new user phrase
format.
This commit is contained in:
zonble 2022-01-12 16:53:51 +08:00
parent fb513f51b0
commit d590d748f8
7 changed files with 302 additions and 15 deletions

View File

@ -38,6 +38,7 @@
6AFF97F2253B299E007F1C49 /* NonModalAlertWindowController.xib in Resources */ = {isa = PBXBuildFile; fileRef = 6AFF97F0253B299E007F1C49 /* NonModalAlertWindowController.xib */; }; 6AFF97F2253B299E007F1C49 /* NonModalAlertWindowController.xib in Resources */ = {isa = PBXBuildFile; fileRef = 6AFF97F0253B299E007F1C49 /* NonModalAlertWindowController.xib */; };
D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */ = {isa = PBXBuildFile; fileRef = D41355D7278D7409005E5CBD /* LanguageModelManager.mm */; }; D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */ = {isa = PBXBuildFile; fileRef = D41355D7278D7409005E5CBD /* LanguageModelManager.mm */; };
D41355DB278E6D17005E5CBD /* McBopomofoLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */; }; D41355DB278E6D17005E5CBD /* McBopomofoLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */; };
D41355DE278EA3ED005E5CBD /* UserPhrasesLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */; };
D427A9C125ED28CC005D43E0 /* OpenCCBridge.swift in Sources */ = {isa = PBXBuildFile; fileRef = D427A9C025ED28CC005D43E0 /* OpenCCBridge.swift */; }; D427A9C125ED28CC005D43E0 /* OpenCCBridge.swift in Sources */ = {isa = PBXBuildFile; fileRef = D427A9C025ED28CC005D43E0 /* OpenCCBridge.swift */; };
D427F76A278C9E29004A2160 /* CandidateUI in Frameworks */ = {isa = PBXBuildFile; productRef = D427F769278C9E29004A2160 /* CandidateUI */; }; D427F76A278C9E29004A2160 /* CandidateUI in Frameworks */ = {isa = PBXBuildFile; productRef = D427F769278C9E29004A2160 /* CandidateUI */; };
D427F76C278CA2B0004A2160 /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = D427F76B278CA1BA004A2160 /* AppDelegate.swift */; }; D427F76C278CA2B0004A2160 /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = D427F76B278CA1BA004A2160 /* AppDelegate.swift */; };
@ -161,6 +162,8 @@
D41355D7278D7409005E5CBD /* LanguageModelManager.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = LanguageModelManager.mm; sourceTree = "<group>"; }; D41355D7278D7409005E5CBD /* LanguageModelManager.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = LanguageModelManager.mm; sourceTree = "<group>"; };
D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = McBopomofoLM.cpp; sourceTree = "<group>"; }; D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = McBopomofoLM.cpp; sourceTree = "<group>"; };
D41355DA278E6D17005E5CBD /* McBopomofoLM.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = McBopomofoLM.h; sourceTree = "<group>"; }; D41355DA278E6D17005E5CBD /* McBopomofoLM.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = McBopomofoLM.h; sourceTree = "<group>"; };
D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = UserPhrasesLM.cpp; sourceTree = "<group>"; };
D41355DD278EA3ED005E5CBD /* UserPhrasesLM.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = UserPhrasesLM.h; sourceTree = "<group>"; };
D427A9BF25ED28CC005D43E0 /* McBopomofo-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "McBopomofo-Bridging-Header.h"; sourceTree = "<group>"; }; D427A9BF25ED28CC005D43E0 /* McBopomofo-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "McBopomofo-Bridging-Header.h"; sourceTree = "<group>"; };
D427A9C025ED28CC005D43E0 /* OpenCCBridge.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpenCCBridge.swift; sourceTree = "<group>"; }; D427A9C025ED28CC005D43E0 /* OpenCCBridge.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpenCCBridge.swift; sourceTree = "<group>"; };
D427F768278C9D0D004A2160 /* CandidateUI */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = CandidateUI; path = Packages/CandidateUI; sourceTree = "<group>"; }; D427F768278C9D0D004A2160 /* CandidateUI */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = CandidateUI; path = Packages/CandidateUI; sourceTree = "<group>"; };
@ -268,10 +271,12 @@
6A0D4F2215FC0EB100ABF4B3 /* OpenVanilla */, 6A0D4F2215FC0EB100ABF4B3 /* OpenVanilla */,
6A0421A615FEF3F50061ED63 /* FastLM.cpp */, 6A0421A615FEF3F50061ED63 /* FastLM.cpp */,
6A0421A715FEF3F50061ED63 /* FastLM.h */, 6A0421A715FEF3F50061ED63 /* FastLM.h */,
D47F7DD2278C1263002F9DD7 /* UserOverrideModel.cpp */, D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */,
D47F7DD1278C1263002F9DD7 /* UserOverrideModel.h */, D41355DD278EA3ED005E5CBD /* UserPhrasesLM.h */,
D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */, D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */,
D41355DA278E6D17005E5CBD /* McBopomofoLM.h */, D41355DA278E6D17005E5CBD /* McBopomofoLM.h */,
D47F7DD2278C1263002F9DD7 /* UserOverrideModel.cpp */,
D47F7DD1278C1263002F9DD7 /* UserOverrideModel.h */,
); );
path = Engine; path = Engine;
sourceTree = "<group>"; sourceTree = "<group>";
@ -561,6 +566,7 @@
D41355DB278E6D17005E5CBD /* McBopomofoLM.cpp in Sources */, D41355DB278E6D17005E5CBD /* McBopomofoLM.cpp in Sources */,
D47F7DD3278C1263002F9DD7 /* UserOverrideModel.cpp in Sources */, D47F7DD3278C1263002F9DD7 /* UserOverrideModel.cpp in Sources */,
6A0D4F4515FC0EB100ABF4B3 /* Mandarin.cpp in Sources */, 6A0D4F4515FC0EB100ABF4B3 /* Mandarin.cpp in Sources */,
D41355DE278EA3ED005E5CBD /* UserPhrasesLM.cpp in Sources */,
6A0421A815FEF3F50061ED63 /* FastLM.cpp in Sources */, 6A0421A815FEF3F50061ED63 /* FastLM.cpp in Sources */,
D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */, D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */,
); );

View File

@ -3,6 +3,7 @@
#include <stdio.h> #include <stdio.h>
#include "FastLM.h" #include "FastLM.h"
#include "UserPhrasesLM.h"
namespace McBopomofo { namespace McBopomofo {
@ -23,8 +24,8 @@ public:
protected: protected:
FastLM m_languageModel; FastLM m_languageModel;
FastLM m_userPhrases; UserPhrasesLM m_userPhrases;
FastLM m_excludedPhrases; UserPhrasesLM m_excludedPhrases;
}; };
}; };

View File

@ -0,0 +1,207 @@
#include "UserPhrasesLM.h"
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <fstream>
#include <unistd.h>
using namespace Formosa::Gramambular;
using namespace McBopomofo;
// Constructs a model with nothing loaded: no descriptor (-1), no buffer and
// a zero buffer length. Call open() to load a phrase file.
UserPhrasesLM::UserPhrasesLM()
    : fd(-1), data(NULL), length(0)
{
}
// Releases the loaded phrase data, if any, by delegating to close().
UserPhrasesLM::~UserPhrasesLM()
{
    // Nothing was loaded (or it was already closed): nothing to release.
    if (!data) {
        return;
    }
    close();
}
// Loads the user phrase file at `path` and indexes its rows by key.
//
// File format, one entry per line:
//
//     <value> <key>[ <ignored...>]\n
//
// The value and the key are separated by a single space. Anything after a
// second space is skipped up to the end of the line. Empty lines are ignored.
// A line that starts with a space, or a value that is not followed by a
// space, makes the whole file invalid.
//
// The file contents are copied into a private, writable, anonymous mapping
// that is one byte larger than the file. The parser turns separators into
// NUL bytes in place so that Row::key / Row::value can point straight into
// the buffer; the extra byte guarantees that a key on a last line without a
// trailing newline is still NUL-terminated without writing past the buffer.
//
// Returns true on success. Returns false when a file is already loaded, when
// the file cannot be opened/inspected/read, when the buffer cannot be
// allocated, or when the contents are malformed. On failure nothing stays
// loaded and the descriptor is closed.
bool UserPhrasesLM::open(const char *path)
{
    // Refuse to load twice; the caller must close() first.
    if (data) {
        return false;
    }

    fd = ::open(path, O_RDONLY);
    if (fd == -1) {
        printf("open:: file not exist");
        return false;
    }

    struct stat sb;
    if (fstat(fd, &sb) == -1) {
        printf("open:: cannot open file");
        // Do not leak the descriptor on this failure path.
        ::close(fd);
        fd = -1;
        return false;
    }

    const size_t fileSize = (size_t)sb.st_size;

    // One spare byte for the final NUL terminator (see above). `length` is
    // the size of the mapping so that close() unmaps exactly what was mapped.
    length = fileSize + 1;
    data = mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
    // mmap() reports failure with MAP_FAILED, not with a null pointer.
    if (data == MAP_FAILED) {
        data = 0;
        length = 0;
        ::close(fd);
        fd = -1;
        return false;
    }

    char *head = (char *)data;

    // Copy the file into the buffer. read() may return fewer bytes than
    // requested, so keep reading until the file is consumed.
    size_t filled = 0;
    while (filled < fileSize) {
        ssize_t got = ::read(fd, head + filled, fileSize - filled);
        if (got < 0) {
            close();
            return false;
        }
        if (got == 0) {
            // The file became shorter after fstat(); parse what we have.
            break;
        }
        filled += (size_t)got;
    }

    char *end = head + filled;
    // Anonymous mappings are zero-filled, but make the terminator explicit.
    *end = 0;

    Row row;
    while (head != end) {
        char c = *head;

        // Empty line (or the newline left over from the previous entry).
        if (c == '\n') {
            head++;
            continue;
        }

        // A line must not start with a space.
        if (c == ' ') {
            close();
            return false;
        }

        // Value column: runs up to the first space.
        row.value = head;
        head++;
        while (head != end && *head != ' ' && *head != '\n') {
            head++;
        }
        // A value with no key after it is malformed.
        if (head == end || *head == '\n') {
            close();
            return false;
        }
        *head = 0;
        head++;

        // Key column: runs up to the next space, newline or end of data.
        row.key = head;
        while (head != end && *head != ' ' && *head != '\n') {
            head++;
        }
        if (head == end) {
            // Last line without a newline; the spare byte terminates the key.
            keyRowMap[row.key].push_back(row);
            break;
        }

        c = *head;
        *head = 0;
        head++;
        keyRowMap[row.key].push_back(row);

        // Extra columns after the key are ignored up to the end of the line.
        if (c == ' ') {
            while (head != end && *head != '\n') {
                head++;
            }
        }
    }

    // Always register a row whose key and value are a single space.
    static const char *space = " ";
    Row emptyRow;
    emptyRow.key = space;
    emptyRow.value = space;
    keyRowMap[space].push_back(emptyRow);

    return true;
}
// Releases the loaded phrase buffer and its file descriptor, and drops every
// indexed row. Safe to call when nothing is loaded; in that case only the
// row index is cleared.
void UserPhrasesLM::close()
{
    if (data) {
        // `data` may hold MAP_FAILED if a mapping attempt failed; that value
        // must never be handed to munmap().
        if (data != MAP_FAILED) {
            munmap(data, length);
        }
        ::close(fd);

        // Return to the freshly constructed state so that no stale
        // descriptor number or buffer length lingers after release.
        data = 0;
        fd = -1;
        length = 0;
    }

    // Rows point into the buffer released above, so they must go as well.
    keyRowMap.clear();
}
void UserPhrasesLM::dump()
{
size_t rows = 0;
for (map<const char *, vector<Row> >::const_iterator i = keyRowMap.begin(), e = keyRowMap.end(); i != e; ++i) {
const vector<Row>& r = (*i).second;
for (vector<Row>::const_iterator ri = r.begin(), re = r.end(); ri != re; ++ri) {
const Row& row = *ri;
cerr << row.key << " " << row.value << "\n";
rows++;
}
}
}
// User phrases carry no bigram information, so the result is always empty
// regardless of the keys passed in.
const vector<Bigram> UserPhrasesLM::bigramsForKeys(const string& preceedingKey, const string& key)
{
    vector<Bigram> noBigrams;
    return noBigrams;
}
// Returns one unigram per row stored under `key`, in the order the rows were
// added. Every unigram gets a score of 0.0 because user phrases carry no
// probability information. Returns an empty vector when the key is unknown.
const vector<Unigram> UserPhrasesLM::unigramsForKey(const string& key)
{
    // The iterator type must name the same comparator as keyRowMap itself;
    // leaving CStringCmp out only compiles on standard libraries whose map
    // iterators happen not to depend on the comparator.
    typedef map<const char *, vector<Row>, CStringCmp> KeyRowMap;

    vector<Unigram> v;

    KeyRowMap::const_iterator i = keyRowMap.find(key.c_str());
    if (i == keyRowMap.end()) {
        return v;
    }

    const vector<Row>& rows = i->second;
    v.reserve(rows.size());
    for (vector<Row>::const_iterator ri = rows.begin(), re = rows.end(); ri != re; ++ri) {
        Unigram g;
        g.keyValue.key = ri->key;
        g.keyValue.value = ri->value;
        g.score = 0.0;
        v.push_back(g);
    }
    return v;
}
// Reports whether at least one row is stored under `key`.
bool UserPhrasesLM::hasUnigramsForKey(const string& key)
{
    const char *lookupKey = key.c_str();
    return keyRowMap.count(lookupKey) > 0;
}

View File

@ -0,0 +1,51 @@
#ifndef USERPHRASESLM_H
#define USERPHRASESLM_H
#include <stdio.h>
#include <string>
#include <map>
#include <iostream>
#include "LanguageModel.h"
namespace McBopomofo {
using namespace Formosa::Gramambular;
// Language model backed by a plain-text user phrase file.
//
// Rows are looked up by key. Because user phrases have no probability
// information, every unigram is reported with a score of 0.0 and no bigrams
// are ever returned.
class UserPhrasesLM : public LanguageModel
{
public:
    UserPhrasesLM();
    ~UserPhrasesLM();

    // Loads and indexes the phrase file at `path`. Returns false when a file
    // is already loaded, when the file cannot be opened, or when its contents
    // are malformed.
    bool open(const char *path);

    // Releases the loaded data and forgets every indexed row.
    void close();

    // Prints every indexed row to stderr; for debugging.
    void dump();

    // Always returns an empty vector.
    virtual const vector<Bigram> bigramsForKeys(const string& preceedingKey, const string& key);

    // Returns the unigrams stored under `key` (empty when there are none).
    virtual const vector<Unigram> unigramsForKey(const string& key);

    // Returns true when at least one row is stored under `key`.
    virtual bool hasUnigramsForKey(const string& key);

protected:
    // One parsed entry. Both fields are NUL-terminated C strings that are
    // not owned by the row.
    struct Row {
        const char *key;
        const char *value;
    };

    // Orders C strings by content rather than by pointer value, so lookups
    // with a different pointer to an equal string find the same entry.
    struct CStringCmp
    {
        bool operator()(const char* lhs, const char* rhs) const
        {
            return strcmp(lhs, rhs) < 0;
        }
    };

    // Rows grouped by key.
    map<const char *, vector<Row>, CStringCmp> keyRowMap;

    // Descriptor of the loaded file; -1 on a freshly constructed object.
    int fd;

    // Buffer holding the loaded file contents; null when nothing is loaded.
    void *data;

    // Size in bytes passed to munmap() when the buffer is released.
    size_t length;
};
}
#endif

View File

@ -610,13 +610,11 @@ NS_INLINE size_t max(size_t a, size_t b) { return a > b ? a : b; }
[string appendString:reading]; [string appendString:reading];
[string appendString:@" "]; [string appendString:@" "];
NSMutableArray *readingsArray = [[NSMutableArray alloc] init]; NSMutableArray *readingsArray = [[NSMutableArray alloc] init];
vector<std::string> v = _builder->readingsAtRange(begin,end); vector<std::string> v = _builder->readingsAtRange(begin, end);
for(vector<std::string>::iterator it_i=v.begin(); it_i!=v.end(); ++it_i) { for(vector<std::string>::iterator it_i=v.begin(); it_i!=v.end(); ++it_i) {
[readingsArray addObject:[NSString stringWithUTF8String:it_i->c_str()]]; [readingsArray addObject:[NSString stringWithUTF8String:it_i->c_str()]];
} }
[string appendString:[readingsArray componentsJoinedByString:@"-"]]; [string appendString:[readingsArray componentsJoinedByString:@"-"]];
[string appendString:@" "];
[string appendString:@"-1.0"];
return string; return string;
} }

View File

@ -98,17 +98,42 @@ static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, McBopomo
return NO; return NO;
} }
NSString *currentMarkedPhrase = [userPhrase stringByAppendingString:@"\n"]; BOOL shuoldAddLineBreakAtFront = NO;
NSString *path = [self userPhrasesDataPathMcBopomofo]; NSString *path = [self userPhrasesDataPathMcBopomofo];
NSFileHandle *file = [NSFileHandle fileHandleForUpdatingAtPath:path];
if (!file) { if ([[NSFileManager defaultManager] fileExistsAtPath:path]) {
NSError *error = nil;
NSDictionary *attr = [[NSFileManager defaultManager] attributesOfItemAtPath:path error:&error];
unsigned long long fileSize = [attr fileSize];
if (!error && fileSize) {
NSFileHandle *readFile = [NSFileHandle fileHandleForReadingAtPath:path];
if (readFile) {
[readFile seekToFileOffset:fileSize - 1];
NSData *data = [readFile readDataToEndOfFile];
const void *bytes = [data bytes];
if (*(char *)bytes != '\n') {
shuoldAddLineBreakAtFront = YES;
}
[readFile closeFile];
}
}
}
NSMutableString *currentMarkedPhrase = [NSMutableString string];
if (shuoldAddLineBreakAtFront) {
[currentMarkedPhrase appendString:@"\n"];
}
[currentMarkedPhrase appendString:userPhrase];
[currentMarkedPhrase appendString:@"\n"];
NSFileHandle *writeFile = [NSFileHandle fileHandleForUpdatingAtPath:path];
if (!writeFile) {
return NO; return NO;
} }
[file seekToEndOfFile]; [writeFile seekToEndOfFile];
NSData *data = [currentMarkedPhrase dataUsingEncoding:NSUTF8StringEncoding]; NSData *data = [currentMarkedPhrase dataUsingEncoding:NSUTF8StringEncoding];
[file writeData:data]; [writeFile writeData:data];
[file closeFile]; [writeFile closeFile];
[self loadUserPhrasesModel]; [self loadUserPhrasesModel];
return YES; return YES;

View File

@ -7,5 +7,4 @@
@interface LanguageModelManager : NSObject @interface LanguageModelManager : NSObject
+ (void)loadDataModels; + (void)loadDataModels;
+ (void)loadUserPhrasesModel; + (void)loadUserPhrasesModel;
+ (BOOL)checkIfUserLanguageModelFilesExist;
@end @end