From 6c66fd26c03bb5b5c8efeb4c5caaa88a822c2a45 Mon Sep 17 00:00:00 2001
From: ShikiSuen <shikisuen@outlook.com>
Date: Sun, 8 May 2022 09:23:42 +0800
Subject: [PATCH] LMs // +LMCoreEX, the Swift successor of ParselessLM (WIP).

Co-Authored-By: ix4n33 <16833681+isaacxen@users.noreply.github.com>
---
 .../LangModelRelated/SubLMs/lmCoreEX.swift    | 154 ++++++++++++++++++
 vChewing.xcodeproj/project.pbxproj            |   4 +
 2 files changed, 158 insertions(+)
 create mode 100644 Source/Modules/LangModelRelated/SubLMs/lmCoreEX.swift

diff --git a/Source/Modules/LangModelRelated/SubLMs/lmCoreEX.swift b/Source/Modules/LangModelRelated/SubLMs/lmCoreEX.swift
new file mode 100644
index 00000000..de69e9ef
--- /dev/null
+++ b/Source/Modules/LangModelRelated/SubLMs/lmCoreEX.swift
@@ -0,0 +1,154 @@
+// Copyright (c) 2021 and onwards The vChewing Project (MIT-NTL License).
+// StringView Ranges extension by (c) 2022 and onwards Isaac Xen (MIT License).
+/*
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+1. The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+2. No trademark license is granted to use the trade names, trademarks, service
+marks, or product names of Contributor, except as required to fulfill notice
+requirements above.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+/// 與之前的 LMCore 不同，LMCoreEX 不在辭典內記錄實體，而是記錄 range 範圍。
+/// 需要資料的時候，直接拿 range 去 strData 取資料。
+/// 資料記錄原理與上游 C++ 的 ParselessLM 差不多，但用的是 Swift 原生手段。
+/// 主要時間消耗仍在 For 迴圈，但這個算法可以顯著減少記憶體佔用。
+
+import Foundation
+
+extension vChewing {
+  /// A memory-lean language model that indexes the lines of a text file by range.
+  ///
+  /// Each dictionary value is a list of ranges into `strData`, one per matching line;
+  /// the fields of a line are split out of `strData` again only when requested.
+  @frozen public struct LMCoreEX {
+    /// Key → ranges (into `strData`) of every indexed line filed under that key.
+    var rangeMap: [String: [Range<String.Index>]] = [:]
+    /// File content with tabs turned into spaces; all ranges in `rangeMap` point here.
+    var strData: String = ""
+    /// When `true`, the 2nd field of a line is its key and the 1st is its value.
+    var shouldReverse: Bool = false
+    /// When `true`, `open(_:)` runs `LMConsolidator` on the file before reading it.
+    var allowConsolidation: Bool = false
+    /// Score used when a line has no parsable 3rd field, or when scores are forced.
+    var defaultScore: Double = 0
+    /// When `true`, the 3rd field is ignored and `defaultScore` is always used.
+    var shouldForceDefaultScore: Bool = false
+
+    /// Number of distinct keys currently indexed.
+    public var count: Int { rangeMap.count }
+
+    /// Creates an empty model; nothing is read until `open(_:)` is called.
+    public init(
+      reverse: Bool = false, consolidate: Bool = false, defaultScore scoreDefault: Double = 0,
+      forceDefaultScore: Bool = false
+    ) {
+      allowConsolidation = consolidate
+      shouldReverse = reverse
+      defaultScore = scoreDefault
+      shouldForceDefaultScore = forceDefaultScore
+    }
+
+    /// Whether at least one key has been indexed.
+    public func isLoaded() -> Bool { !rangeMap.isEmpty }
+
+    /// Reads the file at `path` and indexes every usable line under its key field.
+    /// - Returns: `false` if data is already loaded or the file cannot be read as UTF-8.
+    @discardableResult public mutating func open(_ path: String) -> Bool {
+      guard !isLoaded() else { return false }
+      if allowConsolidation {
+        LMConsolidator.fixEOF(path: path)
+        LMConsolidator.consolidate(path: path, pragma: true)
+      }
+      do {
+        strData = try String(contentsOfFile: path, encoding: .utf8).replacingOccurrences(of: "\t", with: " ")
+      } catch {
+        IME.prtDebugIntel("\(error)")
+        IME.prtDebugIntel("↑ Exception happened when reading data at: \(path).")
+        return false
+      }
+      for lineRange in strData.ranges(splitBy: "\n") {
+        let neta = strData[lineRange].components(separatedBy: " ")
+        guard neta.count >= 2, !neta[0].isEmpty, !neta[1].isEmpty else { continue }
+        // A comment line is one whose first field starts with "#". Checking the key
+        // alone let such lines be indexed as entries whenever `shouldReverse` is set.
+        let theKey = shouldReverse ? neta[1] : neta[0]
+        guard neta[0].first != "#", theKey.first != "#" else { continue }
+        rangeMap[theKey, default: []].append(lineRange)
+      }
+      return true
+    }
+
+    /// Drops the index and the text behind it so that their memory can be reclaimed.
+    public mutating func close() {
+      rangeMap.removeAll()
+      // Clearing only the index would keep the whole file content resident.
+      strData = ""
+    }
+
+    // MARK: - Advanced features
+
+    /// Sends every indexed line, one per row, to the debug log.
+    public func dump() {
+      var strDump = ""
+      for netaRange in rangeMap.values.joined() {
+        strDump += strData[netaRange]
+        strDump += "\n"
+      }
+      IME.prtDebugIntel(strDump)
+    }
+
+    /// Bigrams are not provided by this model; the result is always empty.
+    public func bigramsForKeys(precedingKey: String, key: String) -> [Megrez.Bigram] {
+      // The redundant comparison only keeps the Swift formatter from mangling this
+      // function body; a bare `[Megrez.Bigram]()` would otherwise be enough.
+      precedingKey == key ? [Megrez.Bigram]() : [Megrez.Bigram]()
+    }
+
+    /// Builds the unigrams recorded for `key`, in the order their lines were indexed.
+    /// - Returns: One unigram per indexed line; empty if `key` is unknown.
+    public func unigramsFor(key: String) -> [Megrez.Unigram] {
+      (rangeMap[key] ?? []).map { netaRange -> Megrez.Unigram in
+        let neta = strData[netaRange].components(separatedBy: " ")
+        // `open(_:)` only indexes lines with at least two fields, so both exist.
+        let kvPair = Megrez.KeyValuePair(key: key, value: shouldReverse ? neta[0] : neta[1])
+        var theScore = defaultScore
+        if neta.count >= 3, !shouldForceDefaultScore {
+          theScore = Double(neta[2]) ?? defaultScore
+        }
+        return Megrez.Unigram(keyValue: kvPair, score: theScore)
+      }
+    }
+
+    /// Whether any line was indexed under `key`.
+    public func hasUnigramsFor(key: String) -> Bool { rangeMap[key] != nil }
+  }
+}
+
+// MARK: - StringView Ranges Extension (by Isaac Xen)
+
+extension String {
+  /// Returns the range of every non-empty piece of this string between `separator`s.
+  ///
+  /// Substrings produced by `split` share the indices of the string they come from, so
+  /// their bounds are used directly instead of re-searching the text for each piece.
+  /// - Parameter separator: The delimiting character; it is excluded from all ranges.
+  /// - Returns: Ranges into `self`, in order of appearance.
+  fileprivate func ranges(splitBy separator: Element) -> [Range<String.Index>] {
+    split(separator: separator).map { $0.startIndex..<$0.endIndex }
+  }
+}
diff --git a/vChewing.xcodeproj/project.pbxproj b/vChewing.xcodeproj/project.pbxproj
index 3beae7a4..5685aff1 100644
--- a/vChewing.xcodeproj/project.pbxproj
+++ b/vChewing.xcodeproj/project.pbxproj
@@ -48,6 +48,7 @@
 		5B782EC4280C243C007276DE /* KeyHandler_HandleCandidate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B782EC3280C243C007276DE /* KeyHandler_HandleCandidate.swift */; };
 		5B7BC4B027AFFBE800F66C24 /* frmPrefWindow.xib in Resources */ = {isa = PBXBuildFile; fileRef = 5B7BC4AE27AFFBE800F66C24 /* frmPrefWindow.xib */; };
 		5B7F225D2808501000DDD3CB /* KeyHandler_HandleInput.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B7F225C2808501000DDD3CB /* KeyHandler_HandleInput.swift */; };
+		5B887F302826AEA400B6651E /* lmCoreEX.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B887F2F2826AEA400B6651E /* lmCoreEX.swift */; };
 		5B949BD92816DC5400D87B5D /* LineReader.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B949BD82816DC5400D87B5D /* LineReader.swift */; };
 		5B949BDB2816DDBC00D87B5D /* LMConsolidator.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5B949BDA2816DDBC00D87B5D /* LMConsolidator.swift */; };
 		5BA0DF312817857D009E73BB /* lmUserOverride.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5BA0DF2E2817857D009E73BB /* lmUserOverride.swift */; };
@@ -225,6 +226,7 @@
 		5B7BC4AF27AFFBE800F66C24 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = Base; path = Base.lproj/frmPrefWindow.xib; sourceTree = "<group>"; };
 		5B7BC4B227AFFC0B00F66C24 /* en */ = {isa = PBXFileReference; lastKnownFileType = text.plist.strings; name = en; path = en.lproj/frmPrefWindow.strings; sourceTree = "<group>"; };
 		5B7F225C2808501000DDD3CB /* KeyHandler_HandleInput.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; indentWidth = 2; lineEnding = 0; path = KeyHandler_HandleInput.swift; sourceTree = "<group>"; tabWidth = 2; usesTabs = 0; };
+		5B887F2F2826AEA400B6651E /* lmCoreEX.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = lmCoreEX.swift; sourceTree = "<group>"; usesTabs = 0; };
 		5B949BD82816DC5400D87B5D /* LineReader.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; lineEnding = 0; path = LineReader.swift; sourceTree = "<group>"; usesTabs = 0; };
 		5B949BDA2816DDBC00D87B5D /* LMConsolidator.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; lineEnding = 0; path = LMConsolidator.swift; sourceTree = "<group>"; usesTabs = 0; };
 		5BA0DF2E2817857D009E73BB /* lmUserOverride.swift */ = {isa = PBXFileReference; explicitFileType = sourcecode.swift; fileEncoding = 4; lineEnding = 0; path = lmUserOverride.swift; sourceTree = "<group>"; usesTabs = 0; };
@@ -394,6 +396,7 @@
 			children = (
 				5B407309281672610023DFFF /* lmAssociates.swift */,
 				5BA0DF2F2817857D009E73BB /* lmCore.swift */,
+				5B887F2F2826AEA400B6651E /* lmCoreEX.swift */,
 				5B00A22F282011980058E5DB /* lmLite.swift */,
 				5B40730A281672610023DFFF /* lmReplacements.swift */,
 				5BA0DF2E2817857D009E73BB /* lmUserOverride.swift */,
@@ -1088,6 +1091,7 @@
 				5BA9FD4827FEF3C9002DE248 /* PreferencesWindowController.swift in Sources */,
 				5BD0113B28180D6100609769 /* LMInstantiator.swift in Sources */,
 				D4E569DC27A34D0E00AC2CEF /* CTools.m in Sources */,
+				5B887F302826AEA400B6651E /* lmCoreEX.swift in Sources */,
 				5BA9FD4627FEF3C9002DE248 /* Container.swift in Sources */,
 				D47F7DD0278C0897002F9DD7 /* ctlNonModalAlertWindow.swift in Sources */,
 				5B38F5A2281E2E49007D5F5D /* 0_Megrez.swift in Sources */,