LMCassette // Partial CIN2 Support (%wildcardKey).

- Also disable wildcardKey if it is included in the %chardef section.
This commit is contained in:
ShikiSuen 2022-10-18 23:15:51 +08:00
parent d6f47b6213
commit 2cdfce640f
3 changed files with 56 additions and 10 deletions

View File

@ -21,8 +21,10 @@ extension vChewingLM {
public private(set) var maxKeyLength: Int = 1
public private(set) var selectionKeys: [String] = []
public private(set) var endKeys: [String] = []
public private(set) var wildcardKey: String = ""
public private(set) var keyNameMap: [String: String] = [:]
public private(set) var charDefMap: [String: [String]] = [:]
public private(set) var charDefWildcardMap: [String: [String]] = [:]
/// Octagram frequency table: [phrase: occurrence count].
public private(set) var octagramMap: [String: Int] = [:]
/// Octagram frequency table bound to a specific key: [phrase: (occurrence count, key)].
@ -50,6 +52,7 @@ extension vChewingLM {
/// - `%encoding` Swift UTF-8
/// - `%selkey`
/// - `%endkey`
/// - `%wildcardkey`
/// - `%keyname begin` `%keyname end` Swift
/// - `%chardef begin` `%chardef end`
/// - `%octagram begin` `%octagram end`
@ -72,22 +75,33 @@ extension vChewingLM {
if !loadingKeys, strLine.contains("%keyname begin") { loadingKeys = true }
if loadingKeys, strLine.contains("%keyname end") { loadingKeys = false }
if !loadingCharDefinitions, strLine.contains("%chardef begin") { loadingCharDefinitions = true }
if loadingCharDefinitions, strLine.contains("%chardef end") { loadingCharDefinitions = false }
if loadingCharDefinitions, strLine.contains("%chardef end") {
loadingCharDefinitions = false
if charDefMap.keys.contains(wildcardKey) { wildcardKey = "" }
}
if !loadingOctagramData, strLine.contains("%octagram begin") { loadingOctagramData = true }
if loadingOctagramData, strLine.contains("%octagram end") { loadingOctagramData = false }
let cells: [String.SubSequence] =
strLine.contains("\t") ? strLine.split(separator: "\t") : strLine.split(separator: " ")
guard cells.count >= 2 else { continue }
let strFirstCell = String(cells[0])
if loadingKeys, !cells[0].contains("%keyname") {
keyNameMap[String(cells[0])] = String(cells[1])
keyNameMap[strFirstCell] = String(cells[1])
} else if loadingCharDefinitions, !strLine.contains("%chardef") {
theMaxKeyLength = max(theMaxKeyLength, cells[0].count)
charDefMap[String(cells[0]), default: []].append(String(cells[1]))
charDefMap[strFirstCell, default: []].append(String(cells[1]))
var keyComps = strFirstCell.charComponents
while !keyComps.isEmpty, !wildcardKey.isEmpty {
keyComps.removeLast()
if !wildcardKey.isEmpty {
charDefWildcardMap[keyComps.joined() + wildcardKey, default: []].append(String(cells[1]))
}
}
} else if loadingOctagramData, !strLine.contains("%octagram") {
guard let countValue = Int(cells[1]) else { continue }
switch cells.count {
case 2: octagramMap[String(cells[0])] = countValue
case 3: octagramDividedMap[String(cells[0])] = (countValue, String(cells[2]))
case 2: octagramMap[strFirstCell] = countValue
case 3: octagramDividedMap[strFirstCell] = (countValue, String(cells[2]))
default: break
}
norm += Self.fscale ** (Double(cells[0].count) / 3.0 - 1.0) * Double(countValue)
@ -110,8 +124,12 @@ extension vChewingLM {
if endKeys.isEmpty, strLine.contains("%endkey ") {
endKeys = cells[1].map { String($0) }.deduplicated
}
if wildcardKey.isEmpty, strLine.contains("%wildcardkey ") {
wildcardKey = cells[1].first?.description ?? ""
}
}
maxKeyLength = theMaxKeyLength
keyNameMap[wildcardKey] = keyNameMap[wildcardKey] ?? ""
return true
} catch {
vCLog("CIN Loading Failed: File Access Error.")
@ -125,12 +143,14 @@ extension vChewingLM {
public func clear() {
  // Descriptive metadata of the loaded table.
  nameENG.removeAll()
  nameCJK.removeAll()

  // Key configuration.
  selectionKeys.removeAll()
  endKeys.removeAll()
  wildcardKey.removeAll()
  maxKeyLength = 1

  // Lookup tables built from the key-name and character-definition sections.
  keyNameMap.removeAll()
  charDefMap.removeAll()
  charDefWildcardMap.removeAll()

  // Frequency data and its accumulated normalization value.
  octagramMap.removeAll()
  octagramDividedMap.removeAll()
  norm = 0
}
@ -139,19 +159,42 @@ extension vChewingLM {
/// - parameters:
/// - key:
public func unigramsFor(key: String) -> [Megrez.Unigram] {
// Collects unigrams for `key`: values stored under the exact key in `charDefMap` come first,
// followed by values stored under the same key in `charDefWildcardMap`.
// NOTE(review): this listing appears to interleave pre-change and post-change lines
// (`arrRaw` is bound twice, there are two `for` headers over `arrRaw`, and the fallback
// closure has two consecutive `return`s) — confirm against the actual file which one is current.
guard let arrRaw = charDefMap[key]?.deduplicated, !arrRaw.isEmpty else { return [] }
let arrRaw = charDefMap[key]?.deduplicated ?? []
var arrRawWildcard: [String] = []
// Wildcard values are only taken when the key contains the wildcard key
// and does not begin with it.
if let arrRawWildcardValues = charDefWildcardMap[key]?.deduplicated,
key.contains(wildcardKey), key.first?.description != wildcardKey
{
arrRawWildcard.append(contentsOf: arrRawWildcardValues)
}
var arrResults = [Megrez.Unigram]()
for (i, neta) in arrRaw.enumerated() {
// Tracks the smallest score assigned to an exact match; starts at 0.
var lowestScore: Double = 0
for neta in arrRaw {
// Score priority: an `octagramDividedMap` entry whose stored key equals `key`,
// then an `octagramMap` count, then a position-based fallback.
let theScore: Double = {
if let freqDataPair = octagramDividedMap[neta], key == freqDataPair.1 {
return calculateWeight(count: freqDataPair.0, phraseLength: neta.count)
} else if let freqData = octagramMap[neta] {
return calculateWeight(count: freqData, phraseLength: neta.count)
}
return Double(i) * -0.001
return Double(arrResults.count) * -0.001 - 9.5
}()
lowestScore = min(theScore, lowestScore)
arrResults.append(.init(value: neta, score: theScore))
}
// Cap the offset so it is never greater than -9.5.
lowestScore = min(-9.5, lowestScore)
if !arrRawWildcard.isEmpty {
for neta in arrRawWildcard {
// Same scoring priority as above, with a -9.7 base for the fallback.
var theScore: Double = {
if let freqDataPair = octagramDividedMap[neta], key == freqDataPair.1 {
return calculateWeight(count: freqDataPair.0, phraseLength: neta.count)
} else if let freqData = octagramMap[neta] {
return calculateWeight(count: freqData, phraseLength: neta.count)
}
return Double(arrResults.count) * -0.001 - 9.7
}()
// Shift every wildcard score by the offset computed above
// (presumably so wildcard matches rank below exact matches — TODO confirm).
theScore += lowestScore
arrResults.append(.init(value: neta, score: theScore))
}
}
return arrResults
}
@ -160,6 +203,7 @@ extension vChewingLM {
/// - key:
public func hasUnigramsFor(key: String) -> Bool {
  // An exact entry in the character-definition table always counts.
  if charDefMap[key] != nil { return true }
  // Wildcard entries are only valid while a wildcard key is active. The loader may reset
  // `wildcardKey` to an empty string after `charDefWildcardMap` has already been filled,
  // and the result of `contains("")` differs between `contains` overloads, so the
  // empty case is rejected explicitly instead of relying on it.
  guard !wildcardKey.isEmpty else { return false }
  // The key must contain the wildcard key but must not begin with it.
  guard key.contains(wildcardKey), key.first?.description != wildcardKey else { return false }
  return charDefWildcardMap[key] != nil
}
// MARK: - Private Functions.

View File

@ -25,8 +25,10 @@ final class LMCassetteTests: XCTestCase {
NSLog("LMCassette: Start loading CIN.")
lmCassette.open(pathCINFile)
NSLog("LMCassette: Finished loading CIN. Entries: \(lmCassette.count)")
XCTAssertEqual(lmCassette.charDefMap.count, 23494)
print(lmCassette.unigramsFor(key: "aaaz"))
XCTAssertEqual(lmCassette.keyNameMap.count, 26)
XCTAssertEqual(lmCassette.charDefMap.count, 23494)
XCTAssertEqual(lmCassette.charDefWildcardMap.count, 8390)
XCTAssertEqual(lmCassette.octagramMap.count, 14616)
XCTAssertEqual(lmCassette.octagramDividedMap.count, 0)
XCTAssertEqual(lmCassette.nameENG, "Wubi")

View File

@ -4,6 +4,7 @@
%cname 五笔
#sname WUBI
%selkey 1234567890
%wildcardkey z
%keyname begin
a 工
b 子
@ -30,7 +31,6 @@ v 女
w 人
x 幺
y 言
z
%keyname end
%chardef begin
aa 式