Skip to content

feat: Optimize SmartCn Dictionaries and Add Dictionary Loading Tests #1154

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
internal abstract class AbstractDictionary
{
// LUCENENET specific: cached GB2312 encoding to avoid repeated calls to Encoding.GetEncoding("GB2312")
protected static readonly Encoding gb2312Encoding = Encoding.GetEncoding("GB2312");
protected static readonly Encoding gb2312Encoding = Encoding.GetEncoding("GB2312",
EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);


/// <summary>
/// First Chinese Character in GB2312 (15 * 94)
Expand Down Expand Up @@ -162,7 +164,7 @@ public virtual long Hash1(char c)
/// </summary>
/// <param name="carray">character array</param>
/// <returns>hashcode</returns>
public virtual long Hash1(char[] carray)
public virtual long Hash1(ReadOnlySpan<char> carray)
{
long p = 1099511628211L;
long hash = unchecked((long)0xcbf29ce484222325L);
Expand Down Expand Up @@ -210,7 +212,7 @@ public virtual int Hash2(char c)
/// </summary>
/// <param name="carray">character array</param>
/// <returns>hashcode</returns>
public virtual int Hash2(char[] carray)
public virtual int Hash2(ReadOnlySpan<char> carray)
{
int hash = 5381;

Expand Down
66 changes: 41 additions & 25 deletions src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -254,30 +254,47 @@ private void Load(string dictRoot)
/// <summary>
/// Load the datafile into this <see cref="BigramDictionary"/>
/// </summary>
/// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param>
/// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param>
/// <exception cref="IOException">If there is a low-level I/O error</exception>
public virtual void LoadFromFile(string dctFilePath)
{
int i, cnt, length, total = 0;

// The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
// The 3756th is used (as a header) to store information.
int[]
buffer = new int[3];
byte[] intBuffer = new byte[4];

Span<int> buffer = stackalloc int[3];
string tmpword;

// LUCENENET: Removed intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way.
// LUCENENET specific - refactored constants for clarity

// The 3756th position (using 1-based counting) corresponds to index 3755 (using 0-based indexing)
// This matches the original Java implementation which used 3755 + GB2312_FIRST_CHAR in the condition
const int HEADER_POSITION = 3755;

//using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r"))
using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read);
using var reader = new BinaryReader(dctFile);

// GB2312 characters 0 - 6768
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
{

string currentStr = GetCCByGB2312Id(i);
// if (i == 5231)
// System.out.println(i);
try
{
cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
}
catch (EndOfStreamException)
{
// Test dictionary files contain fewer entries than production files
// Breaking here is normal and expected behavior for test files
break;
}

dctFile.Read(intBuffer, 0, intBuffer.Length);
// the dictionary was developed for C, and byte order must be converted to work with Java
cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
if (cnt <= 0)
{
continue;
Expand All @@ -286,37 +303,37 @@ public virtual void LoadFromFile(string dctFilePath)
int j = 0;
while (j < cnt)
{
dctFile.Read(intBuffer, 0, intBuffer.Length);
buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
.GetInt32();// frequency
dctFile.Read(intBuffer, 0, intBuffer.Length);
buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
.GetInt32();// length
dctFile.Read(intBuffer, 0, intBuffer.Length);
// buffer[2] = ByteBuffer.wrap(intBuffer).order(
// ByteOrder.LITTLE_ENDIAN).getInt();// handle
// LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
buffer[0] = reader.ReadInt32(); // frequency
buffer[1] = reader.ReadInt32(); // length
buffer[2] = reader.ReadInt32(); // Skip handle value (unused)

length = buffer[1];
if (length > 0)
if (length > 0 && dctFile.Position + length <= dctFile.Length)
{
byte[] lchBuffer = new byte[length];
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
byte[] lchBuffer = reader.ReadBytes(length); // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET

//tmpword = new String(lchBuffer, "GB2312");
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
//tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
if (i != 3755 + GB2312_FIRST_CHAR)


if (i != HEADER_POSITION + GB2312_FIRST_CHAR)
{
tmpword = currentStr + tmpword;
}
char[] carray = tmpword.ToCharArray();

ReadOnlySpan<char> carray = tmpword.AsSpan();
long hashId = Hash1(carray);
int index = GetAvaliableIndex(hashId, carray);

if (index != -1)
{
if (bigramHashTable[index] == 0)
{
bigramHashTable[index] = hashId;
// bigramStringTable[index] = tmpword;

}
frequencyTable[index] += buffer[0];
}
Expand All @@ -326,8 +343,7 @@ public virtual void LoadFromFile(string dctFilePath)
}
// log.info("load dictionary done! " + dctFilePath + " total:" + total);
}

private int GetAvaliableIndex(long hashId, char[] carray)
private int GetAvaliableIndex(long hashId, ReadOnlySpan<char> carray)
{
int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH;
Expand Down Expand Up @@ -357,7 +373,7 @@ private int GetAvaliableIndex(long hashId, char[] carray)
/// <summary>
/// lookup the index into the frequency array.
/// </summary>
private int GetBigramItemIndex(char[] carray)
private int GetBigramItemIndex(ReadOnlySpan<char> carray)
{
long hashId = Hash1(carray);
int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
Expand Down Expand Up @@ -388,7 +404,7 @@ private int GetBigramItemIndex(char[] carray)
return -1;
}

public int GetFrequency(char[] carray)
public int GetFrequency(ReadOnlySpan<char> carray)
{
int index = GetBigramItemIndex(carray);
if (index != -1)
Expand Down
45 changes: 23 additions & 22 deletions src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -340,62 +340,62 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
/// <param name="dctFilePath">path to word dictionary (coredict.dct)</param>
/// <returns>number of words read</returns>
/// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param>
/// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
int i, cnt, length, total = 0;
// The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.

// The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760).
// The 3756th is used (as a header) to store information.
int[]
buffer = new int[3];
byte[] intBuffer = new byte[4];

Span<int> buffer = stackalloc int[3];
string tmpword;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please declare tmpword after buffer below, not inline.


// LUCENENET: Removed intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way.
// LUCENENET: Use BinaryReader to simplify endian conversion and stream reading.

using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read))
using (var reader = new BinaryReader(dctFile))
{

// GB2312 characters 0 - 6768
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
{
// if (i == 5231)
// System.out.println(i);

dctFile.Read(intBuffer, 0, intBuffer.Length);
// the dictionary was developed for C, and byte order must be converted to work with Java
cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET

if (cnt <= 0)
{
wordItem_charArrayTable[i] = null;
wordItem_frequencyTable[i] = null;
continue;
}

wordItem_charArrayTable[i] = new char[cnt][];
wordItem_frequencyTable[i] = new int[cnt];
total += cnt;
int j = 0;
while (j < cnt)
{
// wordItemTable[i][j] = new WordItem();
dctFile.Read(intBuffer, 0, intBuffer.Length);
buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
.GetInt32();// frequency
dctFile.Read(intBuffer, 0, intBuffer.Length);
buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
.GetInt32();// length
dctFile.Read(intBuffer, 0, intBuffer.Length);
buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
.GetInt32();// handle

// LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
buffer[0] = reader.ReadInt32(); // frequency
buffer[1] = reader.ReadInt32(); // length
buffer[2] = reader.ReadInt32(); // handle

// wordItemTable[i][j].frequency = buffer[0];

wordItem_frequencyTable[i][j] = buffer[0];

length = buffer[1];
if (length > 0)
{
byte[] lchBuffer = new byte[length];
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
byte[] lchBuffer = reader.ReadBytes(length);
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET: Use cached encoding instance from base class
wordItem_charArrayTable[i][j] = tmpword.ToCharArray();
}
else
Expand All @@ -411,6 +411,7 @@ private int LoadMainDataFromFile(string dctFilePath)
SetTableIndex(str[0], i);
}
}

return total;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,13 @@
<NoWarn>$(NoWarn);1591;1573</NoWarn>
</PropertyGroup>



<ItemGroup>
<EmbeddedResource Include="Hhmm/*.mem" Label="Dict Test Data" />
<EmbeddedResource Include="**/*.txt" Exclude="bin/**/*;obj/**/*" Label="Text Test Data" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\dotnet\Lucene.Net.ICU\Lucene.Net.ICU.csproj" />
<ProjectReference Include="..\dotnet\Lucene.Net.ICU\Lucene.Net.ICU.csproj" />
<ProjectReference Include="..\Lucene.Net\Lucene.Net.csproj" />
<ProjectReference Include="..\Lucene.Net.Analysis.Common\Lucene.Net.Analysis.Common.csproj" />
</ItemGroup>
Expand All @@ -64,4 +62,8 @@
<PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" />
</ItemGroup>

<ItemGroup>
<InternalsVisibleTo Include="Lucene.Net.Tests.Analysis.SmartCn" />
</ItemGroup>

</Project>
108 changes: 108 additions & 0 deletions src/Lucene.Net.Tests.Analysis.SmartCn/Hhmm/TestBuildDictionary.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
using J2N;
using Lucene.Net.Analysis.Cn.Smart;
using Lucene.Net.Analysis.Cn.Smart.Hhmm;
using Lucene.Net.Attributes;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
using System.IO;
using Assert = Lucene.Net.TestFramework.Assert;

namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

[LuceneNetSpecific]
public class TestBuildDictionary : LuceneTestCase
{
    // Working directory that holds the unzipped test dictionary files (.dct inputs).
    private DirectoryInfo tempDir;

    /// <summary>
    /// Unzips the bundled <c>custom-dictionary-input.zip</c> into a fresh temp
    /// directory and points <see cref="AnalyzerProfile.ANALYSIS_DATA_DIR"/> at it,
    /// so the dictionary singletons load this test data instead of production files.
    /// </summary>
    public override void OneTimeSetUp()
    {
        base.OneTimeSetUp();

        tempDir = CreateTempDir("smartcn-data");
        AnalyzerProfile.ANALYSIS_DATA_DIR = tempDir.FullName;

        using var zipStream = typeof(TestBuildDictionary).FindAndGetManifestResourceStream("custom-dictionary-input.zip");
        TestUtil.Unzip(zipStream, tempDir);
    }

    /// <summary>
    /// Resets <see cref="AnalyzerProfile.ANALYSIS_DATA_DIR"/> so other test
    /// fixtures do not accidentally load this fixture's dictionary data.
    /// </summary>
    public override void OneTimeTearDown()
    {
        AnalyzerProfile.ANALYSIS_DATA_DIR = null; // Ensure this test data is not loaded for other tests
        base.OneTimeTearDown();
    }

    [Test]
    public void TestBigramDictionary()
    {
        // Pass 1: the dictionary is built by parsing the .dct file.
        BigramDictionary bigramDict = BigramDictionary.GetInstance();
        CheckBigramDictionary(bigramDict);

        // The first load should have serialized a .mem cache next to the input.
        string memPath = System.IO.Path.Combine(tempDir.FullName, "bigramdict.mem");
        Assert.IsTrue(File.Exists(memPath), "Memory file should be created after first load");

        // Remove the .dct so the next load cannot re-parse it.
        string dctPath = System.IO.Path.Combine(tempDir.FullName, "bigramdict.dct");
        Assert.IsTrue(File.Exists(dctPath), $"{dctPath} does not exist.");
        File.Delete(dctPath);

        // Pass 2: the dictionary must now come from the .mem cache alone.
        bigramDict = BigramDictionary.GetInstance();
        CheckBigramDictionary(bigramDict);
    }

    // Verifies the two bigram entries known to exist in the test dictionary.
    private static void CheckBigramDictionary(BigramDictionary dictionary)
    {
        Assert.AreEqual(10, dictionary.GetFrequency("啊hello".AsSpan()), "Frequency for '啊hello' is incorrect.");
        Assert.AreEqual(20, dictionary.GetFrequency("阿world".AsSpan()), "Frequency for '阿world' is incorrect.");
    }

    [Test]
    public void TestWordDictionary()
    {
        // Pass 1: the dictionary is built by parsing the .dct file.
        WordDictionary wordDict = WordDictionary.GetInstance();
        CheckWordDictionary(wordDict);

        // The first load should have serialized a .mem cache next to the input.
        string memPath = System.IO.Path.Combine(tempDir.FullName, "coredict.mem");
        Assert.IsTrue(File.Exists(memPath), "Memory file should be created after first load");

        // Remove the .dct so the next load cannot re-parse it.
        string dctPath = System.IO.Path.Combine(tempDir.FullName, "coredict.dct");
        Assert.IsTrue(File.Exists(dctPath), $"{dctPath} does not exist.");
        File.Delete(dctPath);

        // Pass 2: the dictionary must now come from the .mem cache alone.
        wordDict = WordDictionary.GetInstance();
        CheckWordDictionary(wordDict);
    }

    // Verifies a known word frequency plus the zero-frequency path for an unknown word.
    private static void CheckWordDictionary(WordDictionary dictionary)
    {
        Assert.AreEqual(30, dictionary.GetFrequency("尼".ToCharArray()), "Frequency for '尼' is incorrect.");
        Assert.AreEqual(0, dictionary.GetFrequency("missing".ToCharArray()), "Expected frequency 0 for unknown word.");
    }
}
}
Binary file not shown.
Loading