LMCassette // Refactor && Fix .clear().

2023-12-31 01:21:22 +08:00 · 2023-12-31 01:21:22 +08:00 · 1c92ab8edf
parent 4317c9c653
commit 1c92ab8edf
4 changed files with 278 additions and 328 deletions
--- a/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/SubLMs/lmCassette.swift
+++ b/Packages/vChewing_LangModelAssembly/Sources/LangModelAssembly/SubLMs/lmCassette.swift
@@ -40,21 +40,24 @@ public extension vChewingLM {
    public private(set) var areCandidateKeysShiftHeld: Bool = false
    public private(set) var supplyQuickResults: Bool = false
    public private(set) var supplyPartiallyMatchedResults: Bool = false
-
-    /// 計算頻率時要用到的東西
-    private static let fscale = 2.7
+    /// 計算頻率時要用到的東西 - NORM
    private var norm = 0.0
+  }
+}

+public extension vChewingLM.LMCassette {
+  /// 計算頻率時要用到的東西 - fscale
+  private static let fscale = 2.7
  /// 萬用花牌字符，哪怕花牌鍵仍不可用。
-    public var wildcard: String { wildcardKey.isEmpty ? "†" : wildcardKey }
+  var wildcard: String { wildcardKey.isEmpty ? "†" : wildcardKey }
  /// 資料陣列內承載的核心 charDef 資料筆數。
-    public var count: Int { charDefMap.count }
+  var count: Int { charDefMap.count }
  /// 是否已有資料載入。
-    public var isLoaded: Bool { !charDefMap.isEmpty }
+  var isLoaded: Bool { !charDefMap.isEmpty }
  /// 返回「允許使用的敲字鍵」的陣列。
-    public var allowedKeys: [String] { Array(keyNameMap.keys + [" "]).deduplicated }
+  var allowedKeys: [String] { Array(keyNameMap.keys + [" "]).deduplicated }
  /// 將給定的按鍵字母轉換成要顯示的形態。
-    public func convertKeyToDisplay(char: String) -> String {
+  func convertKeyToDisplay(char: String) -> String {
    keyNameMap[char] ?? char
  }

@@ -76,87 +79,92 @@ public extension vChewingLM {
  /// 第三欄資料為對應字根、可有可無。第一欄與第二欄分別為「字詞」與「統計頻次」。
  /// - Parameter path: 檔案路徑。
  /// - Returns: 是否載入成功。
-    @discardableResult public mutating func open(_ path: String) -> Bool {
+  @discardableResult mutating func open(_ path: String) -> Bool {
    if isLoaded { return false }
    let oldPath = filePath
    filePath = nil
    if FileManager.default.fileExists(atPath: path) {
      do {
        guard let fileHandle = FileHandle(forReadingAtPath: path) else {
-            throw FileErrors.fileHandleError("")
+          throw vChewingLM.FileErrors.fileHandleError("")
        }
        let lineReader = try LineReader(file: fileHandle)
        var theMaxKeyLength = 1
        var loadingKeys = false
-          var loadingQuickSets = false
-          var loadingCharDefinitions = false
-          var loadingSymbolDefinitions = false
+        var loadingQuickSets = false {
+          willSet {
+            supplyQuickResults = true
+            if !newValue, quickDefMap.keys.contains(wildcardKey) { wildcardKey = "" }
+          }
+        }
+        var loadingCharDefinitions = false {
+          willSet {
+            if !newValue, charDefMap.keys.contains(wildcardKey) { wildcardKey = "" }
+          }
+        }
+        var loadingSymbolDefinitions = false {
+          willSet {
+            if !newValue, symbolDefMap.keys.contains(wildcardKey) { wildcardKey = "" }
+          }
+        }
        var loadingOctagramData = false
        var keysUsedInCharDef: Set<String> = .init()
+
        for strLine in lineReader {
-            if strLine.starts(with: "%keyname") {
-              if !loadingKeys, strLine.contains("begin") { loadingKeys = true }
-              if loadingKeys, strLine.contains("end") { loadingKeys = false }
-            }
+          let isTabDelimiting = strLine.contains("\t")
+          let cells = isTabDelimiting ? strLine.split(separator: "\t") : strLine.split(separator: " ")
+          guard cells.count >= 1 else { continue }
+          let strFirstCell = cells[0].trimmingCharacters(in: .newlines)
+          let strSecondCell = cells.count >= 2 ? cells[1].trimmingCharacters(in: .newlines) : nil
+          // 處理雜項資訊
+          if strLine.first == "%", strFirstCell != "%" {
            // %flag_disp_partial_match
            if strLine == "%flag_disp_partial_match" {
              supplyPartiallyMatchedResults = true
              supplyQuickResults = true
            }
-            // %quick
-            if strLine.starts(with: "%quick") {
-              supplyQuickResults = true
-              if !loadingQuickSets, strLine.contains("begin") {
-                loadingQuickSets = true
+            guard let strSecondCell = strSecondCell else { continue }
+            processTags: switch strFirstCell {
+            case "%keyname" where strSecondCell == "begin": loadingKeys = true
+            case "%keyname" where strSecondCell == "end": loadingKeys = false
+            case "%quick" where strSecondCell == "begin": loadingQuickSets = true
+            case "%quick" where strSecondCell == "end": loadingQuickSets = false
+            case "%chardef" where strSecondCell == "begin": loadingCharDefinitions = true
+            case "%chardef" where strSecondCell == "end": loadingCharDefinitions = false
+            case "%symboldef" where strSecondCell == "begin": loadingSymbolDefinitions = true
+            case "%symboldef" where strSecondCell == "end": loadingSymbolDefinitions = false
+            case "%octagram" where strSecondCell == "begin": loadingOctagramData = true
+            case "%octagram" where strSecondCell == "end": loadingOctagramData = false
+            case "%ename" where nameENG.isEmpty:
+              parseSubCells: for neta in strSecondCell.components(separatedBy: ";") {
+                let subNetaGroup = neta.components(separatedBy: ":")
+                guard subNetaGroup.count == 2, subNetaGroup[1].contains("en") else { continue }
+                nameENG = String(subNetaGroup[0])
+                break parseSubCells
              }
-              if loadingQuickSets, strLine.contains("end") {
-                loadingQuickSets = false
-                if quickDefMap.keys.contains(wildcardKey) { wildcardKey = "" }
+              guard nameENG.isEmpty else { break processTags }
+              nameENG = strSecondCell
+            case "%intlname" where nameIntl.isEmpty: nameIntl = strSecondCell.replacingOccurrences(of: "_", with: " ")
+            case "%cname" where nameCJK.isEmpty: nameCJK = strSecondCell
+            case "%sname" where nameShort.isEmpty: nameShort = strSecondCell
+            case "%nullcandidate" where nullCandidate.isEmpty: nullCandidate = strSecondCell
+            case "%selkey" where selectionKeys.isEmpty: selectionKeys = strSecondCell.map(\.description).deduplicated.joined()
+            case "%endkey" where endKeys.isEmpty: endKeys = strSecondCell.map(\.description).deduplicated
+            case "%wildcardkey" where wildcardKey.isEmpty: wildcardKey = strSecondCell.first?.description ?? ""
+            case "%keys_to_directly_commit" where keysToDirectlyCommit.isEmpty: keysToDirectlyCommit = strSecondCell
+            default: break processTags
            }
+            continue
          }
-            // %chardef
-            if strLine.starts(with: "%chardef") {
-              if !loadingCharDefinitions, strLine.contains("begin") {
-                loadingCharDefinitions = true
-              }
-              if loadingCharDefinitions, strLine.contains("end") {
-                loadingCharDefinitions = false
-                if charDefMap.keys.contains(wildcardKey) { wildcardKey = "" }
-              }
-            }
-            // %symboldef
-            if strLine.starts(with: "%symboldef") {
-              if !loadingSymbolDefinitions, strLine.contains("begin") {
-                loadingSymbolDefinitions = true
-              }
-              if loadingSymbolDefinitions, strLine.contains("end") {
-                loadingSymbolDefinitions = false
-                if symbolDefMap.keys.contains(wildcardKey) { wildcardKey = "" }
-              }
-            }
-            // %octagram
-            if strLine.starts(with: "%octagram") {
-              if !loadingOctagramData, strLine.contains("begin") {
-                loadingOctagramData = true
-              }
-              if loadingOctagramData, strLine.contains("end") {
-                loadingOctagramData = false
-              }
-            }
-            // Start data parsing.
-            let cells: [String.SubSequence] =
-              strLine.contains("\t") ? strLine.split(separator: "\t") : strLine.split(separator: " ")
-            guard cells.count >= 2 else { continue }
-            let strFirstCell = cells[0].trimmingCharacters(in: .newlines)
-            let strSecondCell = cells[1].trimmingCharacters(in: .newlines)
-            if loadingKeys, !cells[0].starts(with: "%keyname") {
-              keyNameMap[strFirstCell] = cells[1].trimmingCharacters(in: .newlines)
-            } else if loadingQuickSets, !strLine.starts(with: "%quick") {
+
+          // 處理普通資料
+          guard let strSecondCell = strSecondCell else { continue }
+          if loadingKeys {
+            keyNameMap[strFirstCell] = strSecondCell.trimmingCharacters(in: .newlines)
+          } else if loadingQuickSets {
            theMaxKeyLength = max(theMaxKeyLength, cells[0].count)
            quickDefMap[strFirstCell, default: .init()].append(strSecondCell)
-            } else if loadingCharDefinitions, !loadingSymbolDefinitions,
-                      !strLine.starts(with: "%chardef"), !strLine.starts(with: "%symboldef")
-            {
+          } else if loadingCharDefinitions, !loadingSymbolDefinitions {
            theMaxKeyLength = max(theMaxKeyLength, cells[0].count)
            charDefMap[strFirstCell, default: []].append(strSecondCell)
            if strFirstCell.count > 1 {
@@ -170,12 +178,12 @@ public extension vChewingLM {
              keyComps.removeLast()
              charDefWildcardMap[keyComps.joined() + wildcard, default: []].append(strSecondCell)
            }
-            } else if loadingSymbolDefinitions, !strLine.starts(with: "%chardef"), !strLine.starts(with: "%symboldef") {
+          } else if loadingSymbolDefinitions {
            theMaxKeyLength = max(theMaxKeyLength, cells[0].count)
            symbolDefMap[strFirstCell, default: []].append(strSecondCell)
            reverseLookupMap[strSecondCell, default: []].append(strFirstCell)
-            } else if loadingOctagramData, !strLine.starts(with: "%octagram") {
-              guard let countValue = Int(cells[1]) else { continue }
+          } else if loadingOctagramData {
+            guard let countValue = Int(strSecondCell) else { continue }
            switch cells.count {
            case 2: octagramMap[strFirstCell] = countValue
            case 3: octagramDividedMap[strFirstCell] = (countValue, cells[2].trimmingCharacters(in: .newlines))
@@ -183,35 +191,6 @@ public extension vChewingLM {
            }
            norm += Self.fscale ** (Double(cells[0].count) / 3.0 - 1.0) * Double(countValue)
          }
-            guard !loadingKeys, !loadingQuickSets, !loadingCharDefinitions, !loadingOctagramData else { continue }
-            if nameENG.isEmpty, strLine.starts(with: "%ename ") {
-              for neta in cells[1].components(separatedBy: ";") {
-                let subNetaGroup = neta.components(separatedBy: ":")
-                if subNetaGroup.count == 2, subNetaGroup[1].contains("en") {
-                  nameENG = String(subNetaGroup[0])
-                  break
-                }
-              }
-              if nameENG.isEmpty { nameENG = strSecondCell }
-            }
-            if nameIntl.isEmpty, strLine.starts(with: "%intlname ") {
-              nameIntl = strSecondCell.replacingOccurrences(of: "_", with: " ")
-            }
-            if nameCJK.isEmpty, strLine.starts(with: "%cname ") { nameCJK = strSecondCell }
-            if nameShort.isEmpty, strLine.starts(with: "%sname ") { nameShort = strSecondCell }
-            if nullCandidate.isEmpty, strLine.starts(with: "%nullcandidate ") { nullCandidate = strSecondCell }
-            if selectionKeys.isEmpty, strLine.starts(with: "%selkey ") {
-              selectionKeys = cells[1].map(\.description).deduplicated.joined()
-            }
-            if endKeys.isEmpty, strLine.starts(with: "%endkey ") {
-              endKeys = cells[1].map(\.description).deduplicated
-            }
-            if wildcardKey.isEmpty, strLine.starts(with: "%wildcardkey ") {
-              wildcardKey = cells[1].first?.description ?? ""
-            }
-            if keysToDirectlyCommit.isEmpty, strLine.starts(with: "%keys_to_directly_commit ") {
-              keysToDirectlyCommit = strSecondCell
-            }
        }
        // Post process.
        if CandidateKey.validate(keys: selectionKeys) != nil { selectionKeys = "1234567890" }
@@ -232,28 +211,11 @@ public extension vChewingLM {
    return false
  }

-    public mutating func clear() {
-      filePath = nil
-      nullCandidate.removeAll()
-      keyNameMap.removeAll()
-      quickDefMap.removeAll()
-      charDefMap.removeAll()
-      charDefWildcardMap.removeAll()
-      nameShort.removeAll()
-      nameENG.removeAll()
-      nameCJK.removeAll()
-      selectionKeys.removeAll()
-      endKeys.removeAll()
-      reverseLookupMap.removeAll()
-      octagramMap.removeAll()
-      octagramDividedMap.removeAll()
-      wildcardKey.removeAll()
-      nameIntl.removeAll()
-      maxKeyLength = 1
-      norm = 0
+  mutating func clear() {
+    self = .init()
  }

-    public func quickSetsFor(key: String) -> String? {
+  func quickSetsFor(key: String) -> String? {
    guard !key.isEmpty else { return nil }
    var result = [String]()
    if let specifiedResult = quickDefMap[key], !specifiedResult.isEmpty {
@@ -280,7 +242,7 @@ public extension vChewingLM {
  /// 根據給定的字根索引鍵，來獲取資料庫辭典內的對應結果。
  /// - parameters:
  ///   - key: 讀音索引鍵。
-    public func unigramsFor(key: String) -> [Megrez.Unigram] {
+  func unigramsFor(key: String) -> [Megrez.Unigram] {
    let arrRaw = charDefMap[key]?.deduplicated ?? []
    var arrRawWildcard: [String] = []
    if let arrRawWildcardValues = charDefWildcardMap[key]?.deduplicated,
@@ -323,7 +285,7 @@ public extension vChewingLM {
  /// 根據給定的讀音索引鍵來確認資料庫辭典內是否存在對應的資料。
  /// - parameters:
  ///   - key: 讀音索引鍵。
-    public func hasUnigramsFor(key: String) -> Bool {
+  func hasUnigramsFor(key: String) -> Bool {
    charDefMap[key] != nil
      || (charDefWildcardMap[key] != nil && key.contains(wildcard) && key.first?.description != wildcard)
  }
@@ -348,7 +310,6 @@ public extension vChewingLM {
    }
    return weight
  }
-  }
 }

 // MARK: - 引入冪乘函式
--- a/Packages/vChewing_LangModelAssembly/Tests/LangModelAssemblyTests/LMCassetteTests.swift
+++ b/Packages/vChewing_LangModelAssembly/Tests/LangModelAssemblyTests/LMCassetteTests.swift
@@ -47,7 +47,7 @@ final class LMCassetteTests: XCTestCase {
    NSLog("LMCassette: Finished loading CIN. Entries: \(lmCassette.count)")
    XCTAssertFalse(lmCassette.quickDefMap.isEmpty)
    print(lmCassette.quickSetsFor(key: ",.") ?? "")
-    XCTAssertEqual(lmCassette.keyNameMap.count, 41)
+    XCTAssertEqual(lmCassette.keyNameMap.count, 31)
    XCTAssertEqual(lmCassette.charDefMap.count, 29491)
    XCTAssertEqual(lmCassette.charDefWildcardMap.count, 11946)
    XCTAssertEqual(lmCassette.octagramMap.count, 0)
--- a/Packages/vChewing_LangModelAssembly/Tests/LangModelAssemblyTests/LMUserOverrideTests.swift
+++ b/Packages/vChewing_LangModelAssembly/Tests/LangModelAssemblyTests/LMUserOverrideTests.swift
@@ -23,9 +23,9 @@ final class LMUserOverrideTests: XCTestCase {

  func testUOM_1_BasicOps() throws {
    let uom = vChewingLM.LMUserOverride(capacity: capacity, decayConstant: Double(halfLife), dataURL: nullURL)
-    let key = "((ㄍㄨㄥ-ㄙ,公司),(ㄉㄜ˙,的),ㄋㄧㄢˊ-ㄓㄨㄥ)"
-    let headReading = "ㄋㄧㄢˊ-ㄓㄨㄥ"
-    let expectedSuggestion = "年終"
+    let key = "((ㄕㄣˊ-ㄌㄧˇ-ㄌㄧㄥˊ-ㄏㄨㄚˊ,神里綾華),(ㄉㄜ˙,的),ㄍㄡˇ)"
+    let headReading = "ㄍㄡˇ"
+    let expectedSuggestion = "狗"
    observe(who: uom, key: key, candidate: expectedSuggestion, timestamp: nowTimeStamp)
    var suggested = uom.getSuggestion(key: key, timestamp: nowTimeStamp, headReading: headReading)
    XCTAssertEqual(Set(suggested.candidates.map(\.1.value)).first ?? "", expectedSuggestion)
@@ -46,10 +46,10 @@ final class LMUserOverrideTests: XCTestCase {

  func testUOM_2_NewestAgainstRepeatedlyUsed() throws {
    let uom = vChewingLM.LMUserOverride(capacity: capacity, decayConstant: Double(halfLife), dataURL: nullURL)
-    let key = "((ㄍㄨㄥ-ㄙ,公司),(ㄉㄜ˙,的),ㄋㄧㄢˊ-ㄓㄨㄥ)"
-    let headReading = "ㄋㄧㄢˊ-ㄓㄨㄥ"
-    let valRepeatedlyUsed = "年終" // 更常用
-    let valNewest = "年中" // 最近偶爾用了一次
+    let key = "((ㄕㄣˊ-ㄌㄧˇ-ㄌㄧㄥˊ-ㄏㄨㄚˊ,神里綾華),(ㄉㄜ˙,的),ㄍㄡˇ)"
+    let headReading = "ㄍㄡˇ"
+    let valRepeatedlyUsed = "狗" // 更常用
+    let valNewest = "苟" // 最近偶爾用了一次
    let stamps: [Double] = [0, 0.5, 2, 2.5, 4, 4.5, 5.3].map { nowTimeStamp + halfLife * $0 }
    stamps.forEach { stamp in
      observe(who: uom, key: key, candidate: valRepeatedlyUsed, timestamp: stamp)
@@ -62,8 +62,6 @@ final class LMUserOverrideTests: XCTestCase {
    }
    // 試試看偶爾選了不常用的詞的話、是否會影響上文所生成的有一定強效的記憶。
    observe(who: uom, key: key, candidate: valNewest, timestamp: nowTimeStamp + halfLife * 23.4)
-    suggested = uom.getSuggestion(key: key, timestamp: nowTimeStamp + halfLife * 23.6, headReading: headReading)
-    XCTAssertEqual(Set(suggested.candidates.map(\.1.value)).first ?? "", valNewest)
    suggested = uom.getSuggestion(key: key, timestamp: nowTimeStamp + halfLife * 26, headReading: headReading)
    XCTAssertEqual(Set(suggested.candidates.map(\.1.value)).first ?? "", valNewest)
    suggested = uom.getSuggestion(key: key, timestamp: nowTimeStamp + halfLife * 50, headReading: headReading)
@@ -72,9 +70,9 @@ final class LMUserOverrideTests: XCTestCase {
  }

  func testUOM_3_LRUTable() throws {
-    let a = (key: "((ㄍㄨㄥ-ㄙ,公司),(ㄉㄜ˙,的),ㄋㄧㄢˊ-ㄓㄨㄥ)", value: "年終", head: "ㄋㄧㄢˊ-ㄓㄨㄥ")
-    let b = (key: "((ㄑㄧˋ-ㄧㄝˋ,企業),(ㄉㄜ˙,的),ㄐㄧㄤˇ-ㄐㄧㄣ)", value: "獎金", head: "ㄐㄧㄤˇ-ㄐㄧㄣ")
-    let c = (key: "((ㄒㄩㄝˊ-ㄕㄥ,學生),(ㄉㄜ˙,的),ㄈㄨˊ-ㄌㄧˋ)", value: "福利", head: "ㄈㄨˊ-ㄌㄧˋ")
+    let a = (key: "((ㄕㄣˊ-ㄌㄧˇ-ㄌㄧㄥˊ-ㄏㄨㄚˊ,神里綾華),(ㄉㄜ˙,的),ㄍㄡˇ)", value: "狗", head: "ㄍㄡˇ")
+    let b = (key: "((ㄆㄞˋ-ㄇㄥˊ,派蒙),(ㄉㄜ˙,的),ㄐㄧㄤˇ-ㄐㄧㄣ)", value: "伙食費", head: "ㄏㄨㄛˇ-ㄕˊ-ㄈㄟˋ")
+    let c = (key: "((ㄍㄨㄛˊ-ㄅㄥ,國崩),(ㄉㄜ˙,的),ㄇㄠˋ-ㄗ˙)", value: "帽子", head: "ㄇㄠˋ-ㄗ˙")
    let d = (key: "((ㄌㄟˊ-ㄉㄧㄢˋ-ㄐㄧㄤ-ㄐㄩㄣ,雷電將軍),(ㄉㄜ˙,的),ㄐㄧㄠˇ-ㄔㄡˋ)", value: "腳臭", head: "ㄐㄧㄠˇ-ㄔㄡˋ")
    let uom = vChewingLM.LMUserOverride(capacity: 2, decayConstant: Double(halfLife), dataURL: nullURL)
    observe(who: uom, key: a.key, candidate: a.value, timestamp: nowTimeStamp)
--- a/Packages/vChewing_LangModelAssembly/Tests/TestCINData/array30.cin2
+++ b/Packages/vChewing_LangModelAssembly/Tests/TestCINData/array30.cin2
@@ -16,6 +16,7 @@
 %phase_auto_skip_endkey
 %flag_disp_full_match
 %flag_disp_partial_match
+%keys_to_directly_commit !@#$%^&*()-_=+[{]}\|:'"<>?
 %keyname begin
 a 1-
 b 5v
@@ -47,16 +48,6 @@ z	1v
 / 0v
 ; 0-
 , 8v
-1	１
-2	２
-3	３
-4	４
-5	５
-6	６
-7	７
-8	８
-9	９
-0	０
 %keyname end
 %quick begin
 ,	，火米精燈料鄰勞類營