LMCassette // Partial CIN2 Support (%wildcardkey).

- Also disable wildcardKey if the %chardef section contains an entry whose key is exactly the wildcard key.
This commit is contained in:
ShikiSuen 2022-10-18 23:15:51 +08:00
parent d6f47b6213
commit 2cdfce640f
3 changed files with 56 additions and 10 deletions

View File

@ -21,8 +21,10 @@ extension vChewingLM {
public private(set) var maxKeyLength: Int = 1 public private(set) var maxKeyLength: Int = 1
public private(set) var selectionKeys: [String] = [] public private(set) var selectionKeys: [String] = []
public private(set) var endKeys: [String] = [] public private(set) var endKeys: [String] = []
public private(set) var wildcardKey: String = ""
public private(set) var keyNameMap: [String: String] = [:] public private(set) var keyNameMap: [String: String] = [:]
public private(set) var charDefMap: [String: [String]] = [:] public private(set) var charDefMap: [String: [String]] = [:]
public private(set) var charDefWildcardMap: [String: [String]] = [:]
/// [:] /// [:]
public private(set) var octagramMap: [String: Int] = [:] public private(set) var octagramMap: [String: Int] = [:]
/// [:(, )] /// [:(, )]
@ -50,6 +52,7 @@ extension vChewingLM {
/// - `%encoding` Swift UTF-8 /// - `%encoding` Swift UTF-8
/// - `%selkey` /// - `%selkey`
/// - `%endkey` /// - `%endkey`
/// - `%wildcardkey`
/// - `%keyname begin` `%keyname end` Swift /// - `%keyname begin` `%keyname end` Swift
/// - `%chardef begin` `%chardef end` /// - `%chardef begin` `%chardef end`
/// - `%octagram begin` `%octagram end` /// - `%octagram begin` `%octagram end`
@ -72,22 +75,33 @@ extension vChewingLM {
if !loadingKeys, strLine.contains("%keyname begin") { loadingKeys = true } if !loadingKeys, strLine.contains("%keyname begin") { loadingKeys = true }
if loadingKeys, strLine.contains("%keyname end") { loadingKeys = false } if loadingKeys, strLine.contains("%keyname end") { loadingKeys = false }
if !loadingCharDefinitions, strLine.contains("%chardef begin") { loadingCharDefinitions = true } if !loadingCharDefinitions, strLine.contains("%chardef begin") { loadingCharDefinitions = true }
if loadingCharDefinitions, strLine.contains("%chardef end") { loadingCharDefinitions = false } if loadingCharDefinitions, strLine.contains("%chardef end") {
loadingCharDefinitions = false
if charDefMap.keys.contains(wildcardKey) { wildcardKey = "" }
}
if !loadingOctagramData, strLine.contains("%octagram begin") { loadingOctagramData = true } if !loadingOctagramData, strLine.contains("%octagram begin") { loadingOctagramData = true }
if loadingOctagramData, strLine.contains("%octagram end") { loadingOctagramData = false } if loadingOctagramData, strLine.contains("%octagram end") { loadingOctagramData = false }
let cells: [String.SubSequence] = let cells: [String.SubSequence] =
strLine.contains("\t") ? strLine.split(separator: "\t") : strLine.split(separator: " ") strLine.contains("\t") ? strLine.split(separator: "\t") : strLine.split(separator: " ")
guard cells.count >= 2 else { continue } guard cells.count >= 2 else { continue }
let strFirstCell = String(cells[0])
if loadingKeys, !cells[0].contains("%keyname") { if loadingKeys, !cells[0].contains("%keyname") {
keyNameMap[String(cells[0])] = String(cells[1]) keyNameMap[strFirstCell] = String(cells[1])
} else if loadingCharDefinitions, !strLine.contains("%chardef") { } else if loadingCharDefinitions, !strLine.contains("%chardef") {
theMaxKeyLength = max(theMaxKeyLength, cells[0].count) theMaxKeyLength = max(theMaxKeyLength, cells[0].count)
charDefMap[String(cells[0]), default: []].append(String(cells[1])) charDefMap[strFirstCell, default: []].append(String(cells[1]))
var keyComps = strFirstCell.charComponents
while !keyComps.isEmpty, !wildcardKey.isEmpty {
keyComps.removeLast()
if !wildcardKey.isEmpty {
charDefWildcardMap[keyComps.joined() + wildcardKey, default: []].append(String(cells[1]))
}
}
} else if loadingOctagramData, !strLine.contains("%octagram") { } else if loadingOctagramData, !strLine.contains("%octagram") {
guard let countValue = Int(cells[1]) else { continue } guard let countValue = Int(cells[1]) else { continue }
switch cells.count { switch cells.count {
case 2: octagramMap[String(cells[0])] = countValue case 2: octagramMap[strFirstCell] = countValue
case 3: octagramDividedMap[String(cells[0])] = (countValue, String(cells[2])) case 3: octagramDividedMap[strFirstCell] = (countValue, String(cells[2]))
default: break default: break
} }
norm += Self.fscale ** (Double(cells[0].count) / 3.0 - 1.0) * Double(countValue) norm += Self.fscale ** (Double(cells[0].count) / 3.0 - 1.0) * Double(countValue)
@ -110,8 +124,12 @@ extension vChewingLM {
if endKeys.isEmpty, strLine.contains("%endkey ") { if endKeys.isEmpty, strLine.contains("%endkey ") {
endKeys = cells[1].map { String($0) }.deduplicated endKeys = cells[1].map { String($0) }.deduplicated
} }
if wildcardKey.isEmpty, strLine.contains("%wildcardkey ") {
wildcardKey = cells[1].first?.description ?? ""
}
} }
maxKeyLength = theMaxKeyLength maxKeyLength = theMaxKeyLength
keyNameMap[wildcardKey] = keyNameMap[wildcardKey] ?? ""
return true return true
} catch { } catch {
vCLog("CIN Loading Failed: File Access Error.") vCLog("CIN Loading Failed: File Access Error.")
@ -125,12 +143,14 @@ extension vChewingLM {
public func clear() { public func clear() {
keyNameMap.removeAll() keyNameMap.removeAll()
charDefMap.removeAll() charDefMap.removeAll()
charDefWildcardMap.removeAll()
nameENG.removeAll() nameENG.removeAll()
nameCJK.removeAll() nameCJK.removeAll()
selectionKeys.removeAll() selectionKeys.removeAll()
endKeys.removeAll() endKeys.removeAll()
octagramMap.removeAll() octagramMap.removeAll()
octagramDividedMap.removeAll() octagramDividedMap.removeAll()
wildcardKey.removeAll()
maxKeyLength = 1 maxKeyLength = 1
norm = 0 norm = 0
} }
@ -139,19 +159,42 @@ extension vChewingLM {
/// - parameters: /// - parameters:
/// - key: /// - key:
public func unigramsFor(key: String) -> [Megrez.Unigram] { public func unigramsFor(key: String) -> [Megrez.Unigram] {
guard let arrRaw = charDefMap[key]?.deduplicated, !arrRaw.isEmpty else { return [] } let arrRaw = charDefMap[key]?.deduplicated ?? []
var arrRawWildcard: [String] = []
if let arrRawWildcardValues = charDefWildcardMap[key]?.deduplicated,
key.contains(wildcardKey), key.first?.description != wildcardKey
{
arrRawWildcard.append(contentsOf: arrRawWildcardValues)
}
var arrResults = [Megrez.Unigram]() var arrResults = [Megrez.Unigram]()
for (i, neta) in arrRaw.enumerated() { var lowestScore: Double = 0
for neta in arrRaw {
let theScore: Double = { let theScore: Double = {
if let freqDataPair = octagramDividedMap[neta], key == freqDataPair.1 { if let freqDataPair = octagramDividedMap[neta], key == freqDataPair.1 {
return calculateWeight(count: freqDataPair.0, phraseLength: neta.count) return calculateWeight(count: freqDataPair.0, phraseLength: neta.count)
} else if let freqData = octagramMap[neta] { } else if let freqData = octagramMap[neta] {
return calculateWeight(count: freqData, phraseLength: neta.count) return calculateWeight(count: freqData, phraseLength: neta.count)
} }
return Double(i) * -0.001 return Double(arrResults.count) * -0.001 - 9.5
}() }()
lowestScore = min(theScore, lowestScore)
arrResults.append(.init(value: neta, score: theScore)) arrResults.append(.init(value: neta, score: theScore))
} }
lowestScore = min(-9.5, lowestScore)
if !arrRawWildcard.isEmpty {
for neta in arrRawWildcard {
var theScore: Double = {
if let freqDataPair = octagramDividedMap[neta], key == freqDataPair.1 {
return calculateWeight(count: freqDataPair.0, phraseLength: neta.count)
} else if let freqData = octagramMap[neta] {
return calculateWeight(count: freqData, phraseLength: neta.count)
}
return Double(arrResults.count) * -0.001 - 9.7
}()
theScore += lowestScore
arrResults.append(.init(value: neta, score: theScore))
}
}
return arrResults return arrResults
} }
@ -160,6 +203,7 @@ extension vChewingLM {
/// - key: /// - key:
public func hasUnigramsFor(key: String) -> Bool { public func hasUnigramsFor(key: String) -> Bool {
charDefMap[key] != nil charDefMap[key] != nil
|| (charDefWildcardMap[key] != nil && key.contains(wildcardKey) && key.first?.description != wildcardKey)
} }
// MARK: - Private Functions. // MARK: - Private Functions.

View File

@ -25,8 +25,10 @@ final class LMCassetteTests: XCTestCase {
NSLog("LMCassette: Start loading CIN.") NSLog("LMCassette: Start loading CIN.")
lmCassette.open(pathCINFile) lmCassette.open(pathCINFile)
NSLog("LMCassette: Finished loading CIN. Entries: \(lmCassette.count)") NSLog("LMCassette: Finished loading CIN. Entries: \(lmCassette.count)")
XCTAssertEqual(lmCassette.charDefMap.count, 23494) print(lmCassette.unigramsFor(key: "aaaz"))
XCTAssertEqual(lmCassette.keyNameMap.count, 26) XCTAssertEqual(lmCassette.keyNameMap.count, 26)
XCTAssertEqual(lmCassette.charDefMap.count, 23494)
XCTAssertEqual(lmCassette.charDefWildcardMap.count, 8390)
XCTAssertEqual(lmCassette.octagramMap.count, 14616) XCTAssertEqual(lmCassette.octagramMap.count, 14616)
XCTAssertEqual(lmCassette.octagramDividedMap.count, 0) XCTAssertEqual(lmCassette.octagramDividedMap.count, 0)
XCTAssertEqual(lmCassette.nameENG, "Wubi") XCTAssertEqual(lmCassette.nameENG, "Wubi")

View File

@ -4,6 +4,7 @@
%cname 五笔 %cname 五笔
#sname WUBI #sname WUBI
%selkey 1234567890 %selkey 1234567890
%wildcardkey z
%keyname begin %keyname begin
a 工 a 工
b 子 b 子
@ -30,7 +31,6 @@ v 女
w 人 w 人
x 幺 x 幺
y 言 y 言
z
%keyname end %keyname end
%chardef begin %chardef begin
aa 式 aa 式