LMCassette // Partial CIN2 Support (%octagram).

- Loading of `%octagram` data is supported. However, the way LMCassette interacts with Megrez needs to be reworked before frequency data for words consisting of more than one kanji can actually be used.
This commit is contained in:
ShikiSuen 2022-10-18 20:49:49 +08:00
parent 007928ea6f
commit d6f47b6213
4 changed files with 25653 additions and 19037 deletions

View File

@ -23,6 +23,14 @@ extension vChewingLM {
public private(set) var endKeys: [String] = []
public private(set) var keyNameMap: [String: String] = [:]
public private(set) var charDefMap: [String: [String]] = [:]
/// Frequency table filled from two-column `%octagram` lines: [candidate text: count].
public private(set) var octagramMap: [String: Int] = [:]
/// Frequency table filled from three-column `%octagram` lines: [candidate text: (count, key)].
/// The string in the tuple is compared against the lookup key when scoring unigrams.
public private(set) var octagramDividedMap: [String: (Int, String)] = [:]
/// Base of the length-dependent scaling factor applied to frequency counts.
private static let fscale = 2.7
/// Sum of the scaled counts accumulated while reading `%octagram` lines; used as the divisor when computing weights.
private var norm = 0.0
/// Number of keys stored in `charDefMap`.
public var count: Int { charDefMap.count }
@ -44,6 +52,8 @@ extension vChewingLM {
/// - `%endkey`
/// - `%keyname begin` `%keyname end` Swift
/// - `%chardef begin` `%chardef end`
/// - `%octagram begin` `%octagram end`
///
/// - Parameter path:
/// - Returns:
@discardableResult public func open(_ path: String) -> Bool {
@ -57,21 +67,32 @@ extension vChewingLM {
var theMaxKeyLength = 1
var loadingKeys = false
var loadingCharDefinitions = false
var loadingOctagramData = false
for (_, strLine) in lineReader.enumerated() {
if !loadingKeys, strLine.contains("%keyname begin") { loadingKeys = true }
if loadingKeys, strLine.contains("%keyname end") { loadingKeys = false }
if !loadingCharDefinitions, strLine.contains("%chardef begin") { loadingCharDefinitions = true }
if loadingCharDefinitions, strLine.contains("%chardef end") { loadingCharDefinitions = false }
if !loadingOctagramData, strLine.contains("%octagram begin") { loadingOctagramData = true }
if loadingOctagramData, strLine.contains("%octagram end") { loadingOctagramData = false }
let cells: [String.SubSequence] =
strLine.contains("\t") ? strLine.split(separator: "\t") : strLine.split(separator: " ")
guard cells.count == 2 else { continue }
guard cells.count >= 2 else { continue }
if loadingKeys, !cells[0].contains("%keyname") {
keyNameMap[String(cells[0])] = String(cells[1])
} else if loadingCharDefinitions, !strLine.contains("%chardef") {
theMaxKeyLength = max(theMaxKeyLength, cells[0].count)
charDefMap[String(cells[0]), default: []].append(String(cells[1]))
} else if loadingOctagramData, !strLine.contains("%octagram") {
guard let countValue = Int(cells[1]) else { continue }
switch cells.count {
case 2: octagramMap[String(cells[0])] = countValue
case 3: octagramDividedMap[String(cells[0])] = (countValue, String(cells[2]))
default: break
}
guard !loadingKeys, !loadingCharDefinitions else { continue }
norm += Self.fscale ** (Double(cells[0].count) / 3.0 - 1.0) * Double(countValue)
}
guard !loadingKeys, !loadingCharDefinitions, !loadingOctagramData else { continue }
if nameENG.isEmpty, strLine.contains("%ename ") {
for neta in cells[1].components(separatedBy: ";") {
let subNetaGroup = neta.components(separatedBy: ":")
@ -108,17 +129,28 @@ extension vChewingLM {
nameCJK.removeAll()
selectionKeys.removeAll()
endKeys.removeAll()
octagramMap.removeAll()
octagramDividedMap.removeAll()
maxKeyLength = 1
norm = 0
}
/// Returns the unigrams registered for the given key.
///
/// Each candidate is scored from the `%octagram` frequency data when such data exists:
/// an `octagramDividedMap` entry whose stored key equals `key` is preferred, then an
/// `octagramMap` entry. Candidates without frequency data receive a small penalty
/// proportional to their position in the candidate list.
///
/// - Parameter key: The key to look up in `charDefMap`.
/// - Returns: Exactly one unigram per deduplicated candidate, or an empty array
///   when nothing is registered for `key`.
public func unigramsFor(key: String) -> [Megrez.Unigram] {
  guard let arrRaw = charDefMap[key]?.deduplicated, !arrRaw.isEmpty else { return [] }
  var arrResults = [Megrez.Unigram]()
  arrResults.reserveCapacity(arrRaw.count)
  for (i, neta) in arrRaw.enumerated() {
    let theScore: Double
    if let freqDataPair = octagramDividedMap[neta], key == freqDataPair.1 {
      // Frequency recorded specifically for this candidate under this key.
      theScore = calculateWeight(count: freqDataPair.0, phraseLength: neta.count)
    } else if let freqData = octagramMap[neta] {
      // Frequency recorded for this candidate regardless of key.
      theScore = calculateWeight(count: freqData, phraseLength: neta.count)
    } else {
      // No frequency data: fall back to a position-based score.
      theScore = Double(i) * -0.001
    }
    // Append once per candidate; the previous unconditional position-based
    // append produced a duplicate unigram for every candidate.
    arrResults.append(.init(value: neta, score: theScore))
  }
  return arrResults
}
@ -129,5 +161,40 @@ extension vChewingLM {
public func hasUnigramsFor(key: String) -> Bool {
  // A key counts as present as soon as `charDefMap` holds any entry for it.
  guard charDefMap[key] != nil else { return false }
  return true
}
// MARK: - Private Functions.
/// Converts a raw frequency count into a base-10 logarithmic weight.
///
/// - Parameters:
///   - theCount: The raw count read from the frequency data.
///   - phraseLength: The length of the phrase the count belongs to.
/// - Returns: `-13` for the special counts `-1` and `-2`; otherwise the
///   base-10 logarithm of the length-scaled count divided by `norm`,
///   where a count of `0` is treated as `0.25`.
private func calculateWeight(count theCount: Int, phraseLength: Int) -> Double {
  // The counts -1 and -2 are special-cased to a fixed weight.
  if theCount == -1 || theCount == -2 { return -13 }
  // Length-dependent scaling factor shared by the remaining cases.
  let lengthFactor = Self.fscale ** (Double(phraseLength) / 3.0 - 1.0)
  // A zero count is replaced by 0.25 so that the logarithm stays finite.
  let effectiveCount: Double = theCount == 0 ? 0.25 : Double(theCount)
  return log10(lengthFactor * effectiveCount / norm)
}
}
}
// MARK: -
// Ref: https://stackoverflow.com/a/41581695/4162914
/// Precedence of the exponentiation operator: binds tighter than
/// multiplication and groups from right to left.
precedencegroup ExponentiationPrecedence {
  higherThan: MultiplicationPrecedence
  associativity: right
}

infix operator **: ExponentiationPrecedence

/// Raises `radix` to the power of `exponent`.
private func ** (_ radix: Double, _ exponent: Double) -> Double {
  let result = pow(radix, exponent)
  return result
}

View File

@ -19,33 +19,20 @@ private let packageRootPath = URL(fileURLWithPath: #file).pathComponents.prefix(
private let testDataPath: String = packageRootPath + "/Tests/TestCINData/"
/// Tests that CIN cassette files are loaded into `vChewingLM.LMCassette` with the expected contents.
final class LMCassetteTests: XCTestCase {
  /// Loads `wubi98.cin` and checks the parsed metadata and table sizes.
  func testCassetteLoadWubi98() throws {
    let pathCINFile = testDataPath + "wubi98.cin"
    let lmCassette98 = vChewingLM.LMCassette()
    NSLog("LMCassette: Start loading CIN.")
    lmCassette98.open(pathCINFile)
    NSLog("LMCassette: Finished loading CIN. Entries: \(lmCassette98.count)")
    XCTAssertEqual(lmCassette98.charDefMap.count, 21491)
    XCTAssertEqual(lmCassette98.keyNameMap.count, 26)
    XCTAssertEqual(lmCassette98.nameENG, "Wubi98")
    XCTAssertEqual(lmCassette98.nameCJK, "五笔98")
    XCTAssertEqual(lmCassette98.maxKeyLength, 4)
    XCTAssertEqual(lmCassette98.endKeys.count, 0)
    XCTAssertEqual(lmCassette98.selectionKeys.count, 10)
  }

  /// Loads `wubi.cin` and checks the parsed metadata, table sizes, and `%octagram` frequency data.
  func testCassetteLoadWubi86() throws {
    // Only one cassette is loaded here; the stale `lmCassette86` lines that
    // redeclared `pathCINFile` and asserted against the old fixture are gone.
    let pathCINFile = testDataPath + "wubi.cin"
    let lmCassette = vChewingLM.LMCassette()
    NSLog("LMCassette: Start loading CIN.")
    lmCassette.open(pathCINFile)
    NSLog("LMCassette: Finished loading CIN. Entries: \(lmCassette.count)")
    XCTAssertEqual(lmCassette.charDefMap.count, 23494)
    XCTAssertEqual(lmCassette.keyNameMap.count, 26)
    XCTAssertEqual(lmCassette.octagramMap.count, 14616)
    XCTAssertEqual(lmCassette.octagramDividedMap.count, 0)
    XCTAssertEqual(lmCassette.nameENG, "Wubi")
    XCTAssertEqual(lmCassette.nameCJK, "五笔")
    XCTAssertEqual(lmCassette.maxKeyLength, 4)
    XCTAssertEqual(lmCassette.endKeys.count, 0)
    XCTAssertEqual(lmCassette.selectionKeys.count, 10)
  }
}

File diff suppressed because it is too large Load Diff