Adds UserPhrasesLM for user phrases.
Since there is no probability information for users' custom phrases, they should be stored in a format that differs from data.txt. Reusing the same format and FastLM to parse user phrases was done only out of laziness, and it is not the right approach. This pull request adds a new language model class to parse user phrases. It also updates the input method controller to adopt the new user phrase format.
This commit is contained in:
parent
fb513f51b0
commit
d590d748f8
|
@ -38,6 +38,7 @@
|
|||
6AFF97F2253B299E007F1C49 /* NonModalAlertWindowController.xib in Resources */ = {isa = PBXBuildFile; fileRef = 6AFF97F0253B299E007F1C49 /* NonModalAlertWindowController.xib */; };
|
||||
D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */ = {isa = PBXBuildFile; fileRef = D41355D7278D7409005E5CBD /* LanguageModelManager.mm */; };
|
||||
D41355DB278E6D17005E5CBD /* McBopomofoLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */; };
|
||||
D41355DE278EA3ED005E5CBD /* UserPhrasesLM.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */; };
|
||||
D427A9C125ED28CC005D43E0 /* OpenCCBridge.swift in Sources */ = {isa = PBXBuildFile; fileRef = D427A9C025ED28CC005D43E0 /* OpenCCBridge.swift */; };
|
||||
D427F76A278C9E29004A2160 /* CandidateUI in Frameworks */ = {isa = PBXBuildFile; productRef = D427F769278C9E29004A2160 /* CandidateUI */; };
|
||||
D427F76C278CA2B0004A2160 /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = D427F76B278CA1BA004A2160 /* AppDelegate.swift */; };
|
||||
|
@ -161,6 +162,8 @@
|
|||
D41355D7278D7409005E5CBD /* LanguageModelManager.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = LanguageModelManager.mm; sourceTree = "<group>"; };
|
||||
D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = McBopomofoLM.cpp; sourceTree = "<group>"; };
|
||||
D41355DA278E6D17005E5CBD /* McBopomofoLM.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = McBopomofoLM.h; sourceTree = "<group>"; };
|
||||
D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = UserPhrasesLM.cpp; sourceTree = "<group>"; };
|
||||
D41355DD278EA3ED005E5CBD /* UserPhrasesLM.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = UserPhrasesLM.h; sourceTree = "<group>"; };
|
||||
D427A9BF25ED28CC005D43E0 /* McBopomofo-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "McBopomofo-Bridging-Header.h"; sourceTree = "<group>"; };
|
||||
D427A9C025ED28CC005D43E0 /* OpenCCBridge.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpenCCBridge.swift; sourceTree = "<group>"; };
|
||||
D427F768278C9D0D004A2160 /* CandidateUI */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = CandidateUI; path = Packages/CandidateUI; sourceTree = "<group>"; };
|
||||
|
@ -268,10 +271,12 @@
|
|||
6A0D4F2215FC0EB100ABF4B3 /* OpenVanilla */,
|
||||
6A0421A615FEF3F50061ED63 /* FastLM.cpp */,
|
||||
6A0421A715FEF3F50061ED63 /* FastLM.h */,
|
||||
D47F7DD2278C1263002F9DD7 /* UserOverrideModel.cpp */,
|
||||
D47F7DD1278C1263002F9DD7 /* UserOverrideModel.h */,
|
||||
D41355DC278EA3ED005E5CBD /* UserPhrasesLM.cpp */,
|
||||
D41355DD278EA3ED005E5CBD /* UserPhrasesLM.h */,
|
||||
D41355D9278E6D17005E5CBD /* McBopomofoLM.cpp */,
|
||||
D41355DA278E6D17005E5CBD /* McBopomofoLM.h */,
|
||||
D47F7DD2278C1263002F9DD7 /* UserOverrideModel.cpp */,
|
||||
D47F7DD1278C1263002F9DD7 /* UserOverrideModel.h */,
|
||||
);
|
||||
path = Engine;
|
||||
sourceTree = "<group>";
|
||||
|
@ -561,6 +566,7 @@
|
|||
D41355DB278E6D17005E5CBD /* McBopomofoLM.cpp in Sources */,
|
||||
D47F7DD3278C1263002F9DD7 /* UserOverrideModel.cpp in Sources */,
|
||||
6A0D4F4515FC0EB100ABF4B3 /* Mandarin.cpp in Sources */,
|
||||
D41355DE278EA3ED005E5CBD /* UserPhrasesLM.cpp in Sources */,
|
||||
6A0421A815FEF3F50061ED63 /* FastLM.cpp in Sources */,
|
||||
D41355D8278D74B5005E5CBD /* LanguageModelManager.mm in Sources */,
|
||||
);
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
#include <stdio.h>
|
||||
#include "FastLM.h"
|
||||
#include "UserPhrasesLM.h"
|
||||
|
||||
namespace McBopomofo {
|
||||
|
||||
|
@ -23,8 +24,8 @@ public:
|
|||
|
||||
protected:
|
||||
FastLM m_languageModel;
|
||||
FastLM m_userPhrases;
|
||||
FastLM m_excludedPhrases;
|
||||
UserPhrasesLM m_userPhrases;
|
||||
UserPhrasesLM m_excludedPhrases;
|
||||
};
|
||||
};
|
||||
|
||||
|
|
|
@ -0,0 +1,207 @@
|
|||
#include "UserPhrasesLM.h"
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <fstream>
|
||||
#include <unistd.h>
|
||||
|
||||
using namespace Formosa::Gramambular;
|
||||
using namespace McBopomofo;
|
||||
|
||||
// Creates a model with nothing loaded: invalid descriptor, no buffer, zero length.
UserPhrasesLM::UserPhrasesLM() : fd(-1), data(0), length(0) {}
|
||||
|
||||
// Releases the loaded phrase data, if any, by delegating to close().
UserPhrasesLM::~UserPhrasesLM()
{
    if (!data) {
        // Nothing was ever loaded (or it was already closed).
        return;
    }

    close();
}
|
||||
|
||||
bool UserPhrasesLM::open(const char *path)
|
||||
{
|
||||
if (data) {
|
||||
return false;
|
||||
}
|
||||
|
||||
fd = ::open(path, O_RDONLY);
|
||||
if (fd == -1) {
|
||||
printf("open:: file not exist");
|
||||
return false;
|
||||
}
|
||||
|
||||
struct stat sb;
|
||||
if (fstat(fd, &sb) == -1) {
|
||||
printf("open:: cannot open file");
|
||||
return false;
|
||||
}
|
||||
|
||||
length = (size_t)sb.st_size;
|
||||
|
||||
data = mmap(NULL, length, PROT_WRITE, MAP_PRIVATE, fd, 0);
|
||||
if (!data) {
|
||||
::close(fd);
|
||||
return false;
|
||||
}
|
||||
|
||||
char *head = (char *)data;
|
||||
char *end = (char *)data + length;
|
||||
char c;
|
||||
Row row;
|
||||
|
||||
start:
|
||||
// EOF -> end
|
||||
if (head == end) {
|
||||
goto end;
|
||||
}
|
||||
|
||||
c = *head;
|
||||
// \s -> error
|
||||
if (c == ' ') {
|
||||
goto error;
|
||||
}
|
||||
// \n -> start
|
||||
else if (c == '\n') {
|
||||
head++;
|
||||
goto start;
|
||||
}
|
||||
|
||||
// \w -> record column star, state1
|
||||
row.value = head;
|
||||
head++;
|
||||
// fall through to state 1
|
||||
|
||||
state1:
|
||||
// EOF -> error
|
||||
if (head == end) {
|
||||
goto error;
|
||||
}
|
||||
|
||||
c = *head;
|
||||
// \n -> error
|
||||
if (c == '\n') {
|
||||
goto error;
|
||||
}
|
||||
// \s -> state2 + zero out ending + record column start
|
||||
else if (c == ' ') {
|
||||
*head = 0;
|
||||
head++;
|
||||
row.key = head;
|
||||
goto state2;
|
||||
}
|
||||
|
||||
// \w -> state1
|
||||
head++;
|
||||
goto state1;
|
||||
|
||||
state2:
|
||||
if (head == end) {
|
||||
*head = 0;
|
||||
head++;
|
||||
keyRowMap[row.key].push_back(row);
|
||||
goto end;
|
||||
}
|
||||
|
||||
c = *head;
|
||||
// \s -> error
|
||||
if (c == ' ' || c == '\n') {
|
||||
*head = 0;
|
||||
head++;
|
||||
keyRowMap[row.key].push_back(row);
|
||||
if (c == ' ') {
|
||||
goto state3;
|
||||
}
|
||||
goto start;
|
||||
}
|
||||
|
||||
// \w -> state 2
|
||||
head++;
|
||||
goto state2;
|
||||
|
||||
state3:
|
||||
if (head == end) {
|
||||
*head = 0;
|
||||
head++;
|
||||
keyRowMap[row.key].push_back(row);
|
||||
goto end;
|
||||
}
|
||||
|
||||
c = *head;
|
||||
if (c == '\n') {
|
||||
goto start;
|
||||
}
|
||||
|
||||
head++;
|
||||
goto state3;
|
||||
|
||||
error:
|
||||
close();
|
||||
return false;
|
||||
|
||||
end:
|
||||
static const char *space = " ";
|
||||
Row emptyRow;
|
||||
emptyRow.key = space;
|
||||
emptyRow.value = space;
|
||||
keyRowMap[space].push_back(emptyRow);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void UserPhrasesLM::close()
|
||||
{
|
||||
if (data) {
|
||||
munmap(data, length);
|
||||
::close(fd);
|
||||
data = 0;
|
||||
}
|
||||
|
||||
keyRowMap.clear();
|
||||
}
|
||||
|
||||
void UserPhrasesLM::dump()
|
||||
{
|
||||
size_t rows = 0;
|
||||
for (map<const char *, vector<Row> >::const_iterator i = keyRowMap.begin(), e = keyRowMap.end(); i != e; ++i) {
|
||||
const vector<Row>& r = (*i).second;
|
||||
for (vector<Row>::const_iterator ri = r.begin(), re = r.end(); ri != re; ++ri) {
|
||||
const Row& row = *ri;
|
||||
cerr << row.key << " " << row.value << "\n";
|
||||
rows++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Always yields an empty list; both arguments are ignored.
const vector<Bigram> UserPhrasesLM::bigramsForKeys(const string& preceedingKey, const string& key)
{
    vector<Bigram> noBigrams;
    return noBigrams;
}
|
||||
|
||||
// Returns one unigram per row stored under `key`, in stored order, each with
// a score of 0.0. Returns an empty vector when the key is unknown.
const vector<Unigram> UserPhrasesLM::unigramsForKey(const string& key)
{
    // Use the map's real type (including its comparator) for the iterator.
    typedef map<const char *, vector<Row>, CStringCmp> KeyRowMap;

    vector<Unigram> v;
    KeyRowMap::const_iterator found = keyRowMap.find(key.c_str());
    if (found == keyRowMap.end()) {
        return v;
    }

    const vector<Row>& rowsForKey = found->second;
    for (vector<Row>::const_iterator ri = rowsForKey.begin(), re = rowsForKey.end(); ri != re; ++ri) {
        Unigram g;
        g.keyValue.key = ri->key;
        g.keyValue.value = ri->value;
        g.score = 0.0;
        v.push_back(g);
    }

    return v;
}
|
||||
|
||||
bool UserPhrasesLM::hasUnigramsForKey(const string& key)
|
||||
{
|
||||
return keyRowMap.find(key.c_str()) != keyRowMap.end();
|
||||
}
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
#ifndef USERPHRASESLM_H
#define USERPHRASESLM_H

#include <stdio.h>
#include <string.h>

#include <iostream>
#include <map>
#include <string>
#include <vector>
#include "LanguageModel.h"

namespace McBopomofo {

using namespace Formosa::Gramambular;

// Language model backed by a plain-text user phrases file.
//
// Rows carry no probability information: every unigram produced by this model
// has a score of 0.0, and no bigrams are ever produced.
//
// NOTE(review): instances own a file descriptor and a memory buffer that are
// released in the destructor, but copying is not disabled; copying an opened
// instance would release the same resources twice. Confirm no caller copies.
class UserPhrasesLM : public LanguageModel
{
public:
    UserPhrasesLM();
    ~UserPhrasesLM();

    // Loads and indexes the file at `path`. Returns false if a file is
    // already loaded, or if the file cannot be opened or parsed.
    bool open(const char *path);
    // Releases the loaded data and empties the index.
    void close();
    // Writes every loaded row to stderr.
    void dump();

    // Always returns an empty vector.
    virtual const vector<Bigram> bigramsForKeys(const string& preceedingKey, const string& key);
    // Returns the rows stored under `key` as unigrams with score 0.0.
    virtual const vector<Unigram> unigramsForKey(const string& key);
    // Returns true if at least one row is stored under `key`.
    virtual bool hasUnigramsForKey(const string& key);

protected:
    // Orders C strings by content (strcmp) rather than by pointer value.
    struct CStringCmp
    {
        bool operator()(const char* s1, const char* s2) const
        {
            return strcmp(s1, s2) < 0;
        }
    };

    // One parsed line; both pointers refer to NUL-terminated strings that the
    // Row does not own.
    struct Row {
        const char *key;
        const char *value;
    };

    // Rows grouped by key, compared by string content.
    map<const char *, vector<Row>, CStringCmp> keyRowMap;
    // Descriptor of the backing file, or -1.
    int fd;
    // Buffer holding the file contents, or 0 when nothing is loaded.
    void *data;
    // Size in bytes of the buffer pointed to by `data`.
    size_t length;
};

}

#endif
|
|
@ -610,13 +610,11 @@ NS_INLINE size_t max(size_t a, size_t b) { return a > b ? a : b; }
|
|||
[string appendString:reading];
|
||||
[string appendString:@" "];
|
||||
NSMutableArray *readingsArray = [[NSMutableArray alloc] init];
|
||||
vector<std::string> v = _builder->readingsAtRange(begin,end);
|
||||
vector<std::string> v = _builder->readingsAtRange(begin, end);
|
||||
for(vector<std::string>::iterator it_i=v.begin(); it_i!=v.end(); ++it_i) {
|
||||
[readingsArray addObject:[NSString stringWithUTF8String:it_i->c_str()]];
|
||||
}
|
||||
[string appendString:[readingsArray componentsJoinedByString:@"-"]];
|
||||
[string appendString:@" "];
|
||||
[string appendString:@"-1.0"];
|
||||
return string;
|
||||
}
|
||||
|
||||
|
|
|
@ -98,17 +98,42 @@ static void LTLoadLanguageModelFile(NSString *filenameWithoutExtension, McBopomo
|
|||
return NO;
|
||||
}
|
||||
|
||||
NSString *currentMarkedPhrase = [userPhrase stringByAppendingString:@"\n"];
|
||||
|
||||
BOOL shuoldAddLineBreakAtFront = NO;
|
||||
NSString *path = [self userPhrasesDataPathMcBopomofo];
|
||||
NSFileHandle *file = [NSFileHandle fileHandleForUpdatingAtPath:path];
|
||||
if (!file) {
|
||||
|
||||
if ([[NSFileManager defaultManager] fileExistsAtPath:path]) {
|
||||
NSError *error = nil;
|
||||
NSDictionary *attr = [[NSFileManager defaultManager] attributesOfItemAtPath:path error:&error];
|
||||
unsigned long long fileSize = [attr fileSize];
|
||||
if (!error && fileSize) {
|
||||
NSFileHandle *readFile = [NSFileHandle fileHandleForReadingAtPath:path];
|
||||
if (readFile) {
|
||||
[readFile seekToFileOffset:fileSize - 1];
|
||||
NSData *data = [readFile readDataToEndOfFile];
|
||||
const void *bytes = [data bytes];
|
||||
if (*(char *)bytes != '\n') {
|
||||
shuoldAddLineBreakAtFront = YES;
|
||||
}
|
||||
[readFile closeFile];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
NSMutableString *currentMarkedPhrase = [NSMutableString string];
|
||||
if (shuoldAddLineBreakAtFront) {
|
||||
[currentMarkedPhrase appendString:@"\n"];
|
||||
}
|
||||
[currentMarkedPhrase appendString:userPhrase];
|
||||
[currentMarkedPhrase appendString:@"\n"];
|
||||
|
||||
NSFileHandle *writeFile = [NSFileHandle fileHandleForUpdatingAtPath:path];
|
||||
if (!writeFile) {
|
||||
return NO;
|
||||
}
|
||||
[file seekToEndOfFile];
|
||||
[writeFile seekToEndOfFile];
|
||||
NSData *data = [currentMarkedPhrase dataUsingEncoding:NSUTF8StringEncoding];
|
||||
[file writeData:data];
|
||||
[file closeFile];
|
||||
[writeFile writeData:data];
|
||||
[writeFile closeFile];
|
||||
|
||||
[self loadUserPhrasesModel];
|
||||
return YES;
|
||||
|
|
|
@ -7,5 +7,4 @@
|
|||
@interface LanguageModelManager : NSObject
|
||||
+ (void)loadDataModels;
|
||||
+ (void)loadUserPhrasesModel;
|
||||
+ (BOOL)checkIfUserLanguageModelFilesExist;
|
||||
@end
|
||||
|
|
Loading…
Reference in New Issue