LMCassette // Partial CIN2 Support (%octagram).

- Data loading is supported. However, the way LMCassette interacts with Megrez needs to be reworked before frequency data can be used for words consisting of more than one kanji.
This commit is contained in:
ShikiSuen 2022-10-18 20:49:49 +08:00
parent 007928ea6f
commit d6f47b6213
4 changed files with 25653 additions and 19037 deletions

View File

@ -23,6 +23,14 @@ extension vChewingLM {
public private(set) var endKeys: [String] = [] public private(set) var endKeys: [String] = []
public private(set) var keyNameMap: [String: String] = [:] public private(set) var keyNameMap: [String: String] = [:]
public private(set) var charDefMap: [String: [String]] = [:] public private(set) var charDefMap: [String: [String]] = [:]
/// Frequency table loaded from the `%octagram` section: [entry: count].
public private(set) var octagramMap: [String: Int] = [:]
/// Frequency table for three-column `%octagram` rows: [entry: (count, key)].
public private(set) var octagramDividedMap: [String: (Int, String)] = [:]
/// Values used when converting raw frequency counts into weights.
private static let fscale = 2.7
private var norm = 0.0
/// ///
public var count: Int { charDefMap.count } public var count: Int { charDefMap.count }
@ -44,6 +52,8 @@ extension vChewingLM {
/// - `%endkey` /// - `%endkey`
/// - `%keyname begin` `%keyname end` Swift /// - `%keyname begin` `%keyname end` Swift
/// - `%chardef begin` `%chardef end` /// - `%chardef begin` `%chardef end`
/// - `%octagram begin` `%octagram end`
///
/// - Parameter path: /// - Parameter path:
/// - Returns: /// - Returns:
@discardableResult public func open(_ path: String) -> Bool { @discardableResult public func open(_ path: String) -> Bool {
@ -57,21 +67,32 @@ extension vChewingLM {
var theMaxKeyLength = 1 var theMaxKeyLength = 1
var loadingKeys = false var loadingKeys = false
var loadingCharDefinitions = false var loadingCharDefinitions = false
var loadingOctagramData = false
for (_, strLine) in lineReader.enumerated() { for (_, strLine) in lineReader.enumerated() {
if !loadingKeys, strLine.contains("%keyname begin") { loadingKeys = true } if !loadingKeys, strLine.contains("%keyname begin") { loadingKeys = true }
if loadingKeys, strLine.contains("%keyname end") { loadingKeys = false } if loadingKeys, strLine.contains("%keyname end") { loadingKeys = false }
if !loadingCharDefinitions, strLine.contains("%chardef begin") { loadingCharDefinitions = true } if !loadingCharDefinitions, strLine.contains("%chardef begin") { loadingCharDefinitions = true }
if loadingCharDefinitions, strLine.contains("%chardef end") { loadingCharDefinitions = false } if loadingCharDefinitions, strLine.contains("%chardef end") { loadingCharDefinitions = false }
if !loadingOctagramData, strLine.contains("%octagram begin") { loadingOctagramData = true }
if loadingOctagramData, strLine.contains("%octagram end") { loadingOctagramData = false }
let cells: [String.SubSequence] = let cells: [String.SubSequence] =
strLine.contains("\t") ? strLine.split(separator: "\t") : strLine.split(separator: " ") strLine.contains("\t") ? strLine.split(separator: "\t") : strLine.split(separator: " ")
guard cells.count == 2 else { continue } guard cells.count >= 2 else { continue }
if loadingKeys, !cells[0].contains("%keyname") { if loadingKeys, !cells[0].contains("%keyname") {
keyNameMap[String(cells[0])] = String(cells[1]) keyNameMap[String(cells[0])] = String(cells[1])
} else if loadingCharDefinitions, !strLine.contains("%chardef") { } else if loadingCharDefinitions, !strLine.contains("%chardef") {
theMaxKeyLength = max(theMaxKeyLength, cells[0].count) theMaxKeyLength = max(theMaxKeyLength, cells[0].count)
charDefMap[String(cells[0]), default: []].append(String(cells[1])) charDefMap[String(cells[0]), default: []].append(String(cells[1]))
} else if loadingOctagramData, !strLine.contains("%octagram") {
guard let countValue = Int(cells[1]) else { continue }
switch cells.count {
case 2: octagramMap[String(cells[0])] = countValue
case 3: octagramDividedMap[String(cells[0])] = (countValue, String(cells[2]))
default: break
}
norm += Self.fscale ** (Double(cells[0].count) / 3.0 - 1.0) * Double(countValue)
} }
guard !loadingKeys, !loadingCharDefinitions else { continue } guard !loadingKeys, !loadingCharDefinitions, !loadingOctagramData else { continue }
if nameENG.isEmpty, strLine.contains("%ename ") { if nameENG.isEmpty, strLine.contains("%ename ") {
for neta in cells[1].components(separatedBy: ";") { for neta in cells[1].components(separatedBy: ";") {
let subNetaGroup = neta.components(separatedBy: ":") let subNetaGroup = neta.components(separatedBy: ":")
@ -108,17 +129,28 @@ extension vChewingLM {
nameCJK.removeAll() nameCJK.removeAll()
selectionKeys.removeAll() selectionKeys.removeAll()
endKeys.removeAll() endKeys.removeAll()
octagramMap.removeAll()
octagramDividedMap.removeAll()
maxKeyLength = 1 maxKeyLength = 1
norm = 0
} }
/// ///
/// - parameters: /// - parameters:
/// - key: /// - key:
public func unigramsFor(key: String) -> [Megrez.Unigram] { public func unigramsFor(key: String) -> [Megrez.Unigram] {
guard let arrRaw = charDefMap[key]?.deduplicated, !arrRaw.isEmpty else { return [] } guard let arrRaw = charDefMap[key]?.deduplicated, !arrRaw.isEmpty else { return [] }
var arrResults = [Megrez.Unigram]() var arrResults = [Megrez.Unigram]()
for (i, neta) in arrRaw.enumerated() { for (i, neta) in arrRaw.enumerated() {
arrResults.append(.init(value: neta, score: Double(i) * -0.001)) let theScore: Double = {
if let freqDataPair = octagramDividedMap[neta], key == freqDataPair.1 {
return calculateWeight(count: freqDataPair.0, phraseLength: neta.count)
} else if let freqData = octagramMap[neta] {
return calculateWeight(count: freqData, phraseLength: neta.count)
}
return Double(i) * -0.001
}()
arrResults.append(.init(value: neta, score: theScore))
} }
return arrResults return arrResults
} }
@ -129,5 +161,40 @@ extension vChewingLM {
public func hasUnigramsFor(key: String) -> Bool { public func hasUnigramsFor(key: String) -> Bool {
charDefMap[key] != nil charDefMap[key] != nil
} }
// MARK: - Private Functions.
/// Converts a raw frequency count into a log-scale weight.
///
/// The weight is `log10(fscale ** (phraseLength / 3 - 1) * count / norm)`.
/// A count of `0` is treated as `0.25` so that it still yields a finite value.
/// Negative counts cannot be fed to `log10` and receive the fixed floor weight.
///
/// - Parameters:
///   - theCount: Raw count as stored in the frequency table.
///   - phraseLength: Length of the entry, used to scale the count.
/// - Returns: A finite weight; `-13` whenever no meaningful weight can be computed.
private func calculateWeight(count theCount: Int, phraseLength: Int) -> Double {
    // Fixed weight previously returned for the counts -1 and -2.
    let floorWeight: Double = -13
    // Any other negative count would make `log10` return NaN, so all negatives share the floor.
    guard theCount >= 0 else { return floorWeight }
    let effectiveCount = theCount == 0 ? 0.25 : Double(theCount)
    let lengthScale = Self.fscale ** (Double(phraseLength) / 3.0 - 1.0)
    let weight = log10(lengthScale * effectiveCount / norm)
    // `norm` starts at 0 and only grows with positive counts; a zero or negative
    // `norm` would otherwise leak ±infinity or NaN into the returned score.
    return weight.isFinite ? weight : floorWeight
}
} }
} }
// MARK: -
// Ref: https://stackoverflow.com/a/41581695/4162914
/// Precedence for `**`: binds tighter than multiplication and groups
/// right-to-left, so `a ** b ** c` is read as `a ** (b ** c)`.
precedencegroup ExponentiationPrecedence {
    higherThan: MultiplicationPrecedence
    associativity: right
}

infix operator **: ExponentiationPrecedence

/// Raises `radix` to `power` by forwarding to `pow`.
private func ** (_ radix: Double, _ power: Double) -> Double {
    return pow(radix, power)
}

View File

@ -19,33 +19,20 @@ private let packageRootPath = URL(fileURLWithPath: #file).pathComponents.prefix(
private let testDataPath: String = packageRootPath + "/Tests/TestCINData/" private let testDataPath: String = packageRootPath + "/Tests/TestCINData/"
final class LMCassetteTests: XCTestCase { final class LMCassetteTests: XCTestCase {
func testCassetteLoadWubi98() throws {
let pathCINFile = testDataPath + "wubi98.cin"
let lmCassette98 = vChewingLM.LMCassette()
NSLog("LMCassette: Start loading CIN.")
lmCassette98.open(pathCINFile)
NSLog("LMCassette: Finished loading CIN. Entries: \(lmCassette98.count)")
XCTAssertEqual(lmCassette98.charDefMap.count, 21491)
XCTAssertEqual(lmCassette98.keyNameMap.count, 26)
XCTAssertEqual(lmCassette98.nameENG, "Wubi98")
XCTAssertEqual(lmCassette98.nameCJK, "五笔98")
XCTAssertEqual(lmCassette98.maxKeyLength, 4)
XCTAssertEqual(lmCassette98.endKeys.count, 0)
XCTAssertEqual(lmCassette98.selectionKeys.count, 10)
}
func testCassetteLoadWubi86() throws { func testCassetteLoadWubi86() throws {
let pathCINFile = testDataPath + "wubi86.cin" let pathCINFile = testDataPath + "wubi.cin"
let lmCassette86 = vChewingLM.LMCassette() let lmCassette = vChewingLM.LMCassette()
NSLog("LMCassette: Start loading CIN.") NSLog("LMCassette: Start loading CIN.")
lmCassette86.open(pathCINFile) lmCassette.open(pathCINFile)
NSLog("LMCassette: Finished loading CIN. Entries: \(lmCassette86.count)") NSLog("LMCassette: Finished loading CIN. Entries: \(lmCassette.count)")
XCTAssertEqual(lmCassette86.charDefMap.count, 10690) XCTAssertEqual(lmCassette.charDefMap.count, 23494)
XCTAssertEqual(lmCassette86.keyNameMap.count, 26) XCTAssertEqual(lmCassette.keyNameMap.count, 26)
XCTAssertEqual(lmCassette86.nameENG, "Wubi86") XCTAssertEqual(lmCassette.octagramMap.count, 14616)
XCTAssertEqual(lmCassette86.nameCJK, "五笔86") XCTAssertEqual(lmCassette.octagramDividedMap.count, 0)
XCTAssertEqual(lmCassette86.maxKeyLength, 4) XCTAssertEqual(lmCassette.nameENG, "Wubi")
XCTAssertEqual(lmCassette86.endKeys.count, 0) XCTAssertEqual(lmCassette.nameCJK, "五笔")
XCTAssertEqual(lmCassette86.selectionKeys.count, 10) XCTAssertEqual(lmCassette.maxKeyLength, 4)
XCTAssertEqual(lmCassette.endKeys.count, 0)
XCTAssertEqual(lmCassette.selectionKeys.count, 10)
} }
} }

File diff suppressed because it is too large Load Diff