LMCassette // Partial CIN2 support (%wildcardkey).
- Also disable wildcardKey (reset it to an empty string) if the wildcard key itself is defined as an entry key in the %chardef section.
This commit is contained in:
parent
d6f47b6213
commit
2cdfce640f
|
@ -21,8 +21,10 @@ extension vChewingLM {
|
||||||
public private(set) var maxKeyLength: Int = 1
|
public private(set) var maxKeyLength: Int = 1
|
||||||
public private(set) var selectionKeys: [String] = []
|
public private(set) var selectionKeys: [String] = []
|
||||||
public private(set) var endKeys: [String] = []
|
public private(set) var endKeys: [String] = []
|
||||||
|
public private(set) var wildcardKey: String = ""
|
||||||
public private(set) var keyNameMap: [String: String] = [:]
|
public private(set) var keyNameMap: [String: String] = [:]
|
||||||
public private(set) var charDefMap: [String: [String]] = [:]
|
public private(set) var charDefMap: [String: [String]] = [:]
|
||||||
|
public private(set) var charDefWildcardMap: [String: [String]] = [:]
|
||||||
/// 字根輸入法專用八股文:[字詞:頻次]。
|
/// 字根輸入法專用八股文:[字詞:頻次]。
|
||||||
public private(set) var octagramMap: [String: Int] = [:]
|
public private(set) var octagramMap: [String: Int] = [:]
|
||||||
/// 音韻輸入法專用八股文:[字詞:(頻次, 讀音)]。
|
/// 音韻輸入法專用八股文:[字詞:(頻次, 讀音)]。
|
||||||
|
@ -50,6 +52,7 @@ extension vChewingLM {
|
||||||
/// - `%encoding` 不處理,因為 Swift 只認 UTF-8。
|
/// - `%encoding` 不處理,因為 Swift 只認 UTF-8。
|
||||||
/// - `%selkey` 不處理,因為威注音輸入法有自己的選字鍵體系。
|
/// - `%selkey` 不處理,因為威注音輸入法有自己的選字鍵體系。
|
||||||
/// - `%endkey` 是會觸發組字事件的按鍵。
|
/// - `%endkey` 是會觸發組字事件的按鍵。
|
||||||
|
/// - `%wildcardkey` 決定磁帶的萬能鍵名稱,只有第一個字元會生效。
|
||||||
/// - `%keyname begin` 至 `%keyname end` 之間是字根翻譯表,先讀取為 Swift 辭典以備用。
|
/// - `%keyname begin` 至 `%keyname end` 之間是字根翻譯表,先讀取為 Swift 辭典以備用。
|
||||||
/// - `%chardef begin` 至 `%chardef end` 之間則是詞庫資料。
|
/// - `%chardef begin` 至 `%chardef end` 之間則是詞庫資料。
|
||||||
/// - `%octagram begin` 至 `%octagram end` 之間則是詞語頻次資料。
|
/// - `%octagram begin` 至 `%octagram end` 之間則是詞語頻次資料。
|
||||||
|
@ -72,22 +75,33 @@ extension vChewingLM {
|
||||||
if !loadingKeys, strLine.contains("%keyname begin") { loadingKeys = true }
|
if !loadingKeys, strLine.contains("%keyname begin") { loadingKeys = true }
|
||||||
if loadingKeys, strLine.contains("%keyname end") { loadingKeys = false }
|
if loadingKeys, strLine.contains("%keyname end") { loadingKeys = false }
|
||||||
if !loadingCharDefinitions, strLine.contains("%chardef begin") { loadingCharDefinitions = true }
|
if !loadingCharDefinitions, strLine.contains("%chardef begin") { loadingCharDefinitions = true }
|
||||||
if loadingCharDefinitions, strLine.contains("%chardef end") { loadingCharDefinitions = false }
|
if loadingCharDefinitions, strLine.contains("%chardef end") {
|
||||||
|
loadingCharDefinitions = false
|
||||||
|
if charDefMap.keys.contains(wildcardKey) { wildcardKey = "" }
|
||||||
|
}
|
||||||
if !loadingOctagramData, strLine.contains("%octagram begin") { loadingOctagramData = true }
|
if !loadingOctagramData, strLine.contains("%octagram begin") { loadingOctagramData = true }
|
||||||
if loadingOctagramData, strLine.contains("%octagram end") { loadingOctagramData = false }
|
if loadingOctagramData, strLine.contains("%octagram end") { loadingOctagramData = false }
|
||||||
let cells: [String.SubSequence] =
|
let cells: [String.SubSequence] =
|
||||||
strLine.contains("\t") ? strLine.split(separator: "\t") : strLine.split(separator: " ")
|
strLine.contains("\t") ? strLine.split(separator: "\t") : strLine.split(separator: " ")
|
||||||
guard cells.count >= 2 else { continue }
|
guard cells.count >= 2 else { continue }
|
||||||
|
let strFirstCell = String(cells[0])
|
||||||
if loadingKeys, !cells[0].contains("%keyname") {
|
if loadingKeys, !cells[0].contains("%keyname") {
|
||||||
keyNameMap[String(cells[0])] = String(cells[1])
|
keyNameMap[strFirstCell] = String(cells[1])
|
||||||
} else if loadingCharDefinitions, !strLine.contains("%chardef") {
|
} else if loadingCharDefinitions, !strLine.contains("%chardef") {
|
||||||
theMaxKeyLength = max(theMaxKeyLength, cells[0].count)
|
theMaxKeyLength = max(theMaxKeyLength, cells[0].count)
|
||||||
charDefMap[String(cells[0]), default: []].append(String(cells[1]))
|
charDefMap[strFirstCell, default: []].append(String(cells[1]))
|
||||||
|
var keyComps = strFirstCell.charComponents
|
||||||
|
while !keyComps.isEmpty, !wildcardKey.isEmpty {
|
||||||
|
keyComps.removeLast()
|
||||||
|
if !wildcardKey.isEmpty {
|
||||||
|
charDefWildcardMap[keyComps.joined() + wildcardKey, default: []].append(String(cells[1]))
|
||||||
|
}
|
||||||
|
}
|
||||||
} else if loadingOctagramData, !strLine.contains("%octagram") {
|
} else if loadingOctagramData, !strLine.contains("%octagram") {
|
||||||
guard let countValue = Int(cells[1]) else { continue }
|
guard let countValue = Int(cells[1]) else { continue }
|
||||||
switch cells.count {
|
switch cells.count {
|
||||||
case 2: octagramMap[String(cells[0])] = countValue
|
case 2: octagramMap[strFirstCell] = countValue
|
||||||
case 3: octagramDividedMap[String(cells[0])] = (countValue, String(cells[2]))
|
case 3: octagramDividedMap[strFirstCell] = (countValue, String(cells[2]))
|
||||||
default: break
|
default: break
|
||||||
}
|
}
|
||||||
norm += Self.fscale ** (Double(cells[0].count) / 3.0 - 1.0) * Double(countValue)
|
norm += Self.fscale ** (Double(cells[0].count) / 3.0 - 1.0) * Double(countValue)
|
||||||
|
@ -110,8 +124,12 @@ extension vChewingLM {
|
||||||
if endKeys.isEmpty, strLine.contains("%endkey ") {
|
if endKeys.isEmpty, strLine.contains("%endkey ") {
|
||||||
endKeys = cells[1].map { String($0) }.deduplicated
|
endKeys = cells[1].map { String($0) }.deduplicated
|
||||||
}
|
}
|
||||||
|
if wildcardKey.isEmpty, strLine.contains("%wildcardkey ") {
|
||||||
|
wildcardKey = cells[1].first?.description ?? ""
|
||||||
|
}
|
||||||
}
|
}
|
||||||
maxKeyLength = theMaxKeyLength
|
maxKeyLength = theMaxKeyLength
|
||||||
|
keyNameMap[wildcardKey] = keyNameMap[wildcardKey] ?? "?"
|
||||||
return true
|
return true
|
||||||
} catch {
|
} catch {
|
||||||
vCLog("CIN Loading Failed: File Access Error.")
|
vCLog("CIN Loading Failed: File Access Error.")
|
||||||
|
@ -125,12 +143,14 @@ extension vChewingLM {
|
||||||
public func clear() {
|
public func clear() {
|
||||||
keyNameMap.removeAll()
|
keyNameMap.removeAll()
|
||||||
charDefMap.removeAll()
|
charDefMap.removeAll()
|
||||||
|
charDefWildcardMap.removeAll()
|
||||||
nameENG.removeAll()
|
nameENG.removeAll()
|
||||||
nameCJK.removeAll()
|
nameCJK.removeAll()
|
||||||
selectionKeys.removeAll()
|
selectionKeys.removeAll()
|
||||||
endKeys.removeAll()
|
endKeys.removeAll()
|
||||||
octagramMap.removeAll()
|
octagramMap.removeAll()
|
||||||
octagramDividedMap.removeAll()
|
octagramDividedMap.removeAll()
|
||||||
|
wildcardKey.removeAll()
|
||||||
maxKeyLength = 1
|
maxKeyLength = 1
|
||||||
norm = 0
|
norm = 0
|
||||||
}
|
}
|
||||||
|
@ -139,19 +159,42 @@ extension vChewingLM {
|
||||||
/// - parameters:
|
/// - parameters:
|
||||||
/// - key: 讀音索引鍵。
|
/// - key: 讀音索引鍵。
|
||||||
public func unigramsFor(key: String) -> [Megrez.Unigram] {
|
public func unigramsFor(key: String) -> [Megrez.Unigram] {
|
||||||
guard let arrRaw = charDefMap[key]?.deduplicated, !arrRaw.isEmpty else { return [] }
|
let arrRaw = charDefMap[key]?.deduplicated ?? []
|
||||||
|
var arrRawWildcard: [String] = []
|
||||||
|
if let arrRawWildcardValues = charDefWildcardMap[key]?.deduplicated,
|
||||||
|
key.contains(wildcardKey), key.first?.description != wildcardKey
|
||||||
|
{
|
||||||
|
arrRawWildcard.append(contentsOf: arrRawWildcardValues)
|
||||||
|
}
|
||||||
var arrResults = [Megrez.Unigram]()
|
var arrResults = [Megrez.Unigram]()
|
||||||
for (i, neta) in arrRaw.enumerated() {
|
var lowestScore: Double = 0
|
||||||
|
for neta in arrRaw {
|
||||||
let theScore: Double = {
|
let theScore: Double = {
|
||||||
if let freqDataPair = octagramDividedMap[neta], key == freqDataPair.1 {
|
if let freqDataPair = octagramDividedMap[neta], key == freqDataPair.1 {
|
||||||
return calculateWeight(count: freqDataPair.0, phraseLength: neta.count)
|
return calculateWeight(count: freqDataPair.0, phraseLength: neta.count)
|
||||||
} else if let freqData = octagramMap[neta] {
|
} else if let freqData = octagramMap[neta] {
|
||||||
return calculateWeight(count: freqData, phraseLength: neta.count)
|
return calculateWeight(count: freqData, phraseLength: neta.count)
|
||||||
}
|
}
|
||||||
return Double(i) * -0.001
|
return Double(arrResults.count) * -0.001 - 9.5
|
||||||
}()
|
}()
|
||||||
|
lowestScore = min(theScore, lowestScore)
|
||||||
arrResults.append(.init(value: neta, score: theScore))
|
arrResults.append(.init(value: neta, score: theScore))
|
||||||
}
|
}
|
||||||
|
lowestScore = min(-9.5, lowestScore)
|
||||||
|
if !arrRawWildcard.isEmpty {
|
||||||
|
for neta in arrRawWildcard {
|
||||||
|
var theScore: Double = {
|
||||||
|
if let freqDataPair = octagramDividedMap[neta], key == freqDataPair.1 {
|
||||||
|
return calculateWeight(count: freqDataPair.0, phraseLength: neta.count)
|
||||||
|
} else if let freqData = octagramMap[neta] {
|
||||||
|
return calculateWeight(count: freqData, phraseLength: neta.count)
|
||||||
|
}
|
||||||
|
return Double(arrResults.count) * -0.001 - 9.7
|
||||||
|
}()
|
||||||
|
theScore += lowestScore
|
||||||
|
arrResults.append(.init(value: neta, score: theScore))
|
||||||
|
}
|
||||||
|
}
|
||||||
return arrResults
|
return arrResults
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -160,6 +203,7 @@ extension vChewingLM {
|
||||||
/// - key: 讀音索引鍵。
|
/// - key: 讀音索引鍵。
|
||||||
public func hasUnigramsFor(key: String) -> Bool {
|
public func hasUnigramsFor(key: String) -> Bool {
|
||||||
charDefMap[key] != nil
|
charDefMap[key] != nil
|
||||||
|
|| (charDefWildcardMap[key] != nil && key.contains(wildcardKey) && key.first?.description != wildcardKey)
|
||||||
}
|
}
|
||||||
|
|
||||||
// MARK: - Private Functions.
|
// MARK: - Private Functions.
|
||||||
|
|
|
@ -25,8 +25,10 @@ final class LMCassetteTests: XCTestCase {
|
||||||
NSLog("LMCassette: Start loading CIN.")
|
NSLog("LMCassette: Start loading CIN.")
|
||||||
lmCassette.open(pathCINFile)
|
lmCassette.open(pathCINFile)
|
||||||
NSLog("LMCassette: Finished loading CIN. Entries: \(lmCassette.count)")
|
NSLog("LMCassette: Finished loading CIN. Entries: \(lmCassette.count)")
|
||||||
XCTAssertEqual(lmCassette.charDefMap.count, 23494)
|
print(lmCassette.unigramsFor(key: "aaaz"))
|
||||||
XCTAssertEqual(lmCassette.keyNameMap.count, 26)
|
XCTAssertEqual(lmCassette.keyNameMap.count, 26)
|
||||||
|
XCTAssertEqual(lmCassette.charDefMap.count, 23494)
|
||||||
|
XCTAssertEqual(lmCassette.charDefWildcardMap.count, 8390)
|
||||||
XCTAssertEqual(lmCassette.octagramMap.count, 14616)
|
XCTAssertEqual(lmCassette.octagramMap.count, 14616)
|
||||||
XCTAssertEqual(lmCassette.octagramDividedMap.count, 0)
|
XCTAssertEqual(lmCassette.octagramDividedMap.count, 0)
|
||||||
XCTAssertEqual(lmCassette.nameENG, "Wubi")
|
XCTAssertEqual(lmCassette.nameENG, "Wubi")
|
||||||
|
|
|
@ -4,6 +4,7 @@
|
||||||
%cname 五笔
|
%cname 五笔
|
||||||
#sname WUBI
|
#sname WUBI
|
||||||
%selkey 1234567890
|
%selkey 1234567890
|
||||||
|
%wildcardkey z
|
||||||
%keyname begin
|
%keyname begin
|
||||||
a 工
|
a 工
|
||||||
b 子
|
b 子
|
||||||
|
@ -30,7 +31,6 @@ v 女
|
||||||
w 人
|
w 人
|
||||||
x 幺
|
x 幺
|
||||||
y 言
|
y 言
|
||||||
z ?
|
|
||||||
%keyname end
|
%keyname end
|
||||||
%chardef begin
|
%chardef begin
|
||||||
aa 式
|
aa 式
|
||||||
|
|
Loading…
Reference in New Issue