LMs // +LMCoreEX, the Swift successor of ParselessLM (WIP).
Co-Authored-By: ix4n33 <16833681+isaacxen@users.noreply.github.com>
This commit is contained in:
parent
ed5fe63b2e
commit
6c66fd26c0
|
@ -0,0 +1,154 @@
|
||||||
|
// Copyright (c) 2021 and onwards The vChewing Project (MIT-NTL License).
|
||||||
|
// StringView Ranges extension by (c) 2022 and onwards Isaac Xen (MIT License).
|
||||||
|
/*
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||||
|
this software and associated documentation files (the "Software"), to deal in
|
||||||
|
the Software without restriction, including without limitation the rights to
|
||||||
|
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
||||||
|
the Software, and to permit persons to whom the Software is furnished to do so,
|
||||||
|
subject to the following conditions:
|
||||||
|
|
||||||
|
1. The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
2. No trademark license is granted to use the trade names, trademarks, service
|
||||||
|
marks, or product names of Contributor, except as required to fulfill notice
|
||||||
|
requirements above.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||||
|
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||||
|
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||||
|
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||||
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/// 與之前的 LMCore 不同,LMCoreEX 不在辭典內記錄實體,而是記錄 range 範圍。
|
||||||
|
/// 需要資料的時候,直接拿 range 去 strData 取資料。
|
||||||
|
/// 資料記錄原理與上游 C++ 的 ParselessLM 差不多,但用的是 Swift 原生手段。
|
||||||
|
/// 主要時間消耗仍在 For 迴圈,但這個算法可以顯著減少記憶體佔用。
|
||||||
|
|
||||||
|
import Foundation
|
||||||
|
|
||||||
|
extension vChewing {
  /// A language-model store that keeps the whole data file in `strData` and,
  /// for each key, records only the ranges of the lines belonging to that key.
  /// Line contents are re-read from `strData` through those ranges on demand.
  @frozen public struct LMCoreEX {
    /// Maps a key to the ranges (inside `strData`) of every line filed under it.
    var rangeMap: [String: [Range<String.Index>]] = [:]
    /// The normalized text of the opened file; all ranges in `rangeMap` index into it.
    var strData: String = ""
    /// When `true`, the second column of a line is the key and the first is the value.
    var shouldReverse: Bool = false
    /// When `true`, `open(_:)` runs `LMConsolidator` on the file before reading it.
    var allowConsolidation: Bool = false
    /// Score used when a line carries no parsable score column.
    var defaultScore: Double = 0
    /// When `true`, the score column is ignored and `defaultScore` is always used.
    var shouldForceDefaultScore: Bool = false

    /// The number of distinct keys currently recorded.
    public var count: Int {
      rangeMap.count
    }

    /// Creates an empty, unloaded instance.
    /// - Parameters:
    ///   - reverse: Treat the second column as the key instead of the first.
    ///   - consolidate: Run `LMConsolidator` on the file when opening it.
    ///   - scoreDefault: Fallback score for lines without a usable score column.
    ///   - forceDefaultScore: Always use the fallback score.
    public init(
      reverse: Bool = false, consolidate: Bool = false, defaultScore scoreDefault: Double = 0,
      forceDefaultScore: Bool = false
    ) {
      rangeMap = [:]
      allowConsolidation = consolidate
      shouldReverse = reverse
      defaultScore = scoreDefault
      shouldForceDefaultScore = forceDefaultScore
    }

    /// Returns `true` when at least one key has been recorded.
    public func isLoaded() -> Bool {
      !rangeMap.isEmpty
    }

    /// Reads the file at `path` and indexes its lines by key.
    ///
    /// Lines are split on spaces (tabs are converted to spaces first). A line is
    /// indexed only when it has at least two columns, both of the first two
    /// columns are non-empty, and the chosen key does not start with `#`.
    /// - Parameter path: Path of the UTF-8 text file to read.
    /// - Returns: `false` if data is already loaded or the file cannot be read;
    ///   `true` otherwise.
    @discardableResult public mutating func open(_ path: String) -> Bool {
      if isLoaded() {
        return false
      }

      if allowConsolidation {
        LMConsolidator.fixEOF(path: path)
        LMConsolidator.consolidate(path: path, pragma: true)
      }

      do {
        // "\r\n" is a single Character in Swift and does not compare equal to
        // "\n", so CRLF line endings must be normalized before splitting on "\n".
        strData = try String(contentsOfFile: path, encoding: .utf8)
          .replacingOccurrences(of: "\r\n", with: "\n")
          .replacingOccurrences(of: "\t", with: " ")
        for lineRange in strData.ranges(splitBy: "\n") {
          let neta = strData[lineRange].components(separatedBy: " ")
          guard neta.count >= 2, !neta[0].isEmpty, !neta[1].isEmpty else { continue }
          let theKey = shouldReverse ? neta[1] : neta[0]
          guard theKey.first != "#" else { continue }
          rangeMap[theKey, default: []].append(lineRange)
        }
      } catch {
        IME.prtDebugIntel("\(error)")
        IME.prtDebugIntel("↑ Exception happened when reading data at: \(path).")
        return false
      }

      return true
    }

    /// Discards all indexed ranges and releases the backing file text.
    public mutating func close() {
      rangeMap.removeAll()
      // The ranges above were the only users of `strData`; drop it as well so
      // the file contents do not stay resident after closing.
      strData.removeAll()
    }

    // MARK: - Advanced features

    /// Sends every indexed line, one per row, to `IME.prtDebugIntel`.
    public func dump() {
      var strDump = ""
      for netaRanges in rangeMap.values {
        for netaRange in netaRanges {
          strDump.append(contentsOf: strData[netaRange])
          strDump.append("\n")
        }
      }
      IME.prtDebugIntel(strDump)
    }

    /// Always returns an empty array.
    public func bigramsForKeys(precedingKey: String, key: String) -> [Megrez.Bigram] {
      // The redundant ternary is deliberate: without it the Swift formatting
      // tool mangles this function body. A plain `[Megrez.Bigram]()` would
      // otherwise be enough.
      precedingKey == key ? [Megrez.Bigram]() : [Megrez.Bigram]()
    }

    /// Builds the unigrams recorded for `key`.
    /// - Parameter key: The key to look up.
    /// - Returns: One unigram per indexed line of `key`, or an empty array when
    ///   the key is unknown. The score comes from the third column when present,
    ///   parsable as `Double`, and not overridden by `shouldForceDefaultScore`;
    ///   otherwise `defaultScore` is used.
    public func unigramsFor(key: String) -> [Megrez.Unigram] {
      guard let arrRangeRecords = rangeMap[key] else { return [] }
      var grams: [Megrez.Unigram] = []
      grams.reserveCapacity(arrRangeRecords.count)
      for netaRange in arrRangeRecords {
        let neta = strData[netaRange].components(separatedBy: " ")
        // `open(_:)` only stores lines with two or more columns, but the stored
        // state is mutable from elsewhere in the module, so never index blindly.
        guard neta.count >= 2 else { continue }
        let theValue: String = shouldReverse ? neta[0] : neta[1]
        let kvPair = Megrez.KeyValuePair(key: key, value: theValue)
        var theScore = defaultScore
        if neta.count >= 3, !shouldForceDefaultScore {
          theScore = .init(neta[2]) ?? defaultScore
        }
        grams.append(Megrez.Unigram(keyValue: kvPair, score: theScore))
      }
      return grams
    }

    /// Returns `true` when at least one line has been indexed under `key`.
    public func hasUnigramsFor(key: String) -> Bool {
      rangeMap[key] != nil
    }
  }
}
|
||||||
|
|
||||||
|
// MARK: - StringView Ranges Extension (by Isaac Xen)
|
||||||
|
|
||||||
|
extension String {
  /// Returns the range of every non-empty piece of the string delimited by `separator`.
  ///
  /// The pieces are those produced by `split(separator:)`, so empty pieces
  /// (for example between two adjacent separators) yield no range.
  /// - Parameter separator: The character that delimits the pieces.
  /// - Returns: Ranges, in order of appearance, that index into this string.
  fileprivate func ranges(splitBy separator: Element) -> [Range<String.Index>] {
    // A Substring shares its indices with the string it was sliced from, so its
    // bounds can be used directly instead of searching for each piece again.
    split(separator: separator).map { $0.startIndex..<$0.endIndex }
  }
}
|
|
@ -48,6 +48,7 @@
|
||||||
5B782EC4280C243C007276DE /* KeyHandler_HandleCandidate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B782EC3280C243C007276DE /* KeyHandler_HandleCandidate.swift */; };
|
5B782EC4280C243C007276DE /* KeyHandler_HandleCandidate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B782EC3280C243C007276DE /* KeyHandler_HandleCandidate.swift */; };
|
||||||
5B7BC4B027AFFBE800F66C24 /* frmPrefWindow.xib in Resources */ = {isa = PBXBuildFile; fileRef = 5B7BC4AE27AFFBE800F66C24 /* frmPrefWindow.xib */; };
|
5B7BC4B027AFFBE800F66C24 /* frmPrefWindow.xib in Resources */ = {isa = PBXBuildFile; fileRef = 5B7BC4AE27AFFBE800F66C24 /* frmPrefWindow.xib */; };
|
||||||
5B7F225D2808501000DDD3CB /* KeyHandler_HandleInput.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B7F225C2808501000DDD3CB /* KeyHandler_HandleInput.swift */; };
|
5B7F225D2808501000DDD3CB /* KeyHandler_HandleInput.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B7F225C2808501000DDD3CB /* KeyHandler_HandleInput.swift */; };
|
||||||
|
5B887F302826AEA400B6651E /* lmCoreEX.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B887F2F2826AEA400B6651E /* lmCoreEX.swift */; };
|
||||||
5B949BD92816DC5400D87B5D /* LineReader.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B949BD82816DC5400D87B5D /* LineReader.swift */; };
|
5B949BD92816DC5400D87B5D /* LineReader.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B949BD82816DC5400D87B5D /* LineReader.swift */; };
|
||||||
5B949BDB2816DDBC00D87B5D /* LMConsolidator.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B949BDA2816DDBC00D87B5D /* LMConsolidator.swift */; };
|
5B949BDB2816DDBC00D87B5D /* LMConsolidator.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B949BDA2816DDBC00D87B5D /* LMConsolidator.swift */; };
|
||||||
5BA0DF312817857D009E73BB /* lmUserOverride.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BA0DF2E2817857D009E73BB /* lmUserOverride.swift */; };
|
5BA0DF312817857D009E73BB /* lmUserOverride.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BA0DF2E2817857D009E73BB /* lmUserOverride.swift */; };
|
||||||
|
@ -225,6 +226,7 @@
|
||||||
5B7BC4AF27AFFBE800F66C24 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = Base; path = Base.lproj/frmPrefWindow.xib; sourceTree = "<group>"; };
|
5B7BC4AF27AFFBE800F66C24 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = Base; path = Base.lproj/frmPrefWindow.xib; sourceTree = "<group>"; };
|
||||||
5B7BC4B227AFFC0B00F66C24 /* en */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = en; path = en.lproj/frmPrefWindow.strings; sourceTree = "<group>"; };
|
5B7BC4B227AFFC0B00F66C24 /* en */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = en; path = en.lproj/frmPrefWindow.strings; sourceTree = "<group>"; };
|
||||||
5B7F225C2808501000DDD3CB /* KeyHandler_HandleInput.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; indentWidth = 2; lineEnding = 0; path = KeyHandler_HandleInput.swift; sourceTree = "<group>"; tabWidth = 2; usesTabs = 0; };
|
5B7F225C2808501000DDD3CB /* KeyHandler_HandleInput.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; indentWidth = 2; lineEnding = 0; path = KeyHandler_HandleInput.swift; sourceTree = "<group>"; tabWidth = 2; usesTabs = 0; };
|
||||||
|
5B887F2F2826AEA400B6651E /* lmCoreEX.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = lmCoreEX.swift; sourceTree = "<group>"; usesTabs = 0; };
|
||||||
5B949BD82816DC5400D87B5D /* LineReader.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; lineEnding = 0; path = LineReader.swift; sourceTree = "<group>"; usesTabs = 0; };
|
5B949BD82816DC5400D87B5D /* LineReader.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; lineEnding = 0; path = LineReader.swift; sourceTree = "<group>"; usesTabs = 0; };
|
||||||
5B949BDA2816DDBC00D87B5D /* LMConsolidator.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; lineEnding = 0; path = LMConsolidator.swift; sourceTree = "<group>"; usesTabs = 0; };
|
5B949BDA2816DDBC00D87B5D /* LMConsolidator.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; lineEnding = 0; path = LMConsolidator.swift; sourceTree = "<group>"; usesTabs = 0; };
|
||||||
5BA0DF2E2817857D009E73BB /* lmUserOverride.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; lineEnding = 0; path = lmUserOverride.swift; sourceTree = "<group>"; usesTabs = 0; };
|
5BA0DF2E2817857D009E73BB /* lmUserOverride.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; lineEnding = 0; path = lmUserOverride.swift; sourceTree = "<group>"; usesTabs = 0; };
|
||||||
|
@ -394,6 +396,7 @@
|
||||||
children = (
|
children = (
|
||||||
5B407309281672610023DFFF /* lmAssociates.swift */,
|
5B407309281672610023DFFF /* lmAssociates.swift */,
|
||||||
5BA0DF2F2817857D009E73BB /* lmCore.swift */,
|
5BA0DF2F2817857D009E73BB /* lmCore.swift */,
|
||||||
|
5B887F2F2826AEA400B6651E /* lmCoreEX.swift */,
|
||||||
5B00A22F282011980058E5DB /* lmLite.swift */,
|
5B00A22F282011980058E5DB /* lmLite.swift */,
|
||||||
5B40730A281672610023DFFF /* lmReplacements.swift */,
|
5B40730A281672610023DFFF /* lmReplacements.swift */,
|
||||||
5BA0DF2E2817857D009E73BB /* lmUserOverride.swift */,
|
5BA0DF2E2817857D009E73BB /* lmUserOverride.swift */,
|
||||||
|
@ -1088,6 +1091,7 @@
|
||||||
5BA9FD4827FEF3C9002DE248 /* PreferencesWindowController.swift in Sources */,
|
5BA9FD4827FEF3C9002DE248 /* PreferencesWindowController.swift in Sources */,
|
||||||
5BD0113B28180D6100609769 /* LMInstantiator.swift in Sources */,
|
5BD0113B28180D6100609769 /* LMInstantiator.swift in Sources */,
|
||||||
D4E569DC27A34D0E00AC2CEF /* CTools.m in Sources */,
|
D4E569DC27A34D0E00AC2CEF /* CTools.m in Sources */,
|
||||||
|
5B887F302826AEA400B6651E /* lmCoreEX.swift in Sources */,
|
||||||
5BA9FD4627FEF3C9002DE248 /* Container.swift in Sources */,
|
5BA9FD4627FEF3C9002DE248 /* Container.swift in Sources */,
|
||||||
D47F7DD0278C0897002F9DD7 /* ctlNonModalAlertWindow.swift in Sources */,
|
D47F7DD0278C0897002F9DD7 /* ctlNonModalAlertWindow.swift in Sources */,
|
||||||
5B38F5A2281E2E49007D5F5D /* 0_Megrez.swift in Sources */,
|
5B38F5A2281E2E49007D5F5D /* 0_Megrez.swift in Sources */,
|
||||||
|
|
Loading…
Reference in New Issue