Skip to content

feat: Optimize SmartCn Dictionaries and Add Dictionary Loading Tests #1154

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
internal abstract class AbstractDictionary
{
// LUCENENET specific: cached GB2312 encoding to avoid repeated calls to Encoding.GetEncoding("GB2312")
protected static readonly Encoding gb2312Encoding = Encoding.GetEncoding("GB2312");
protected static readonly Encoding gb2312Encoding = Encoding.GetEncoding("GB2312",
EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);


/// <summary>
/// First Chinese Character in GB2312 (15 * 94)
Expand Down Expand Up @@ -162,7 +164,7 @@ public virtual long Hash1(char c)
/// </summary>
/// <param name="carray">character array</param>
/// <returns>hashcode</returns>
public virtual long Hash1(char[] carray)
public virtual long Hash1(ReadOnlySpan<char> carray)
{
long p = 1099511628211L;
long hash = unchecked((long)0xcbf29ce484222325L);
Expand Down Expand Up @@ -210,7 +212,7 @@ public virtual int Hash2(char c)
/// </summary>
/// <param name="carray">character array</param>
/// <returns>hashcode</returns>
public virtual int Hash2(char[] carray)
public virtual int Hash2(ReadOnlySpan<char> carray)
{
int hash = 5381;

Expand Down
66 changes: 41 additions & 25 deletions src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -254,30 +254,47 @@ private void Load(string dictRoot)
/// <summary>
/// Load the datafile into this <see cref="BigramDictionary"/>
/// </summary>
/// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param>
/// <param name="dctFilePath">Path to the Bigramdictionary (bigramdict.dct)</param>
/// <exception cref="IOException">If there is a low-level I/O error</exception>
public virtual void LoadFromFile(string dctFilePath)
{
int i, cnt, length, total = 0;

// The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
// The 3756th is used (as a header) to store information.
int[]
buffer = new int[3];
byte[] intBuffer = new byte[4];

Span<int> buffer = stackalloc int[3];
string tmpword;

// LUCENENET: Removed intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way.
// LUCENENET specific - refactored constants for clarity

// The 3756th position (using 1-based counting) corresponds to index 3755 (using 0-based indexing)
// This matches the original Java implementation which used 3755 + GB2312_FIRST_CHAR in the condition
const int HEADER_POSITION = 3755;

//using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r"))
using var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read);
using var reader = new BinaryReader(dctFile);

// GB2312 characters 0 - 6768
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
{

string currentStr = GetCCByGB2312Id(i);
// if (i == 5231)
// System.out.println(i);
try
{
cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
}
catch (EndOfStreamException)
{
// Test dictionary files contain fewer entries than production files
// Breaking here is normal and expected behavior for test files
break;
}

dctFile.Read(intBuffer, 0, intBuffer.Length);
// the dictionary was developed for C, and byte order must be converted to work with Java
cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
if (cnt <= 0)
{
continue;
Expand All @@ -286,37 +303,37 @@ public virtual void LoadFromFile(string dctFilePath)
int j = 0;
while (j < cnt)
{
dctFile.Read(intBuffer, 0, intBuffer.Length);
buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
.GetInt32();// frequency
dctFile.Read(intBuffer, 0, intBuffer.Length);
buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
.GetInt32();// length
dctFile.Read(intBuffer, 0, intBuffer.Length);
// buffer[2] = ByteBuffer.wrap(intBuffer).order(
// ByteOrder.LITTLE_ENDIAN).getInt();// handle
// LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
buffer[0] = reader.ReadInt32(); // frequency
buffer[1] = reader.ReadInt32(); // length
buffer[2] = reader.ReadInt32(); // Skip handle value (unused)

length = buffer[1];
if (length > 0)
if (length > 0 && dctFile.Position + length <= dctFile.Length)
{
byte[] lchBuffer = new byte[length];
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
byte[] lchBuffer = reader.ReadBytes(length); // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET

//tmpword = new String(lchBuffer, "GB2312");
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
//tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
if (i != 3755 + GB2312_FIRST_CHAR)


if (i != HEADER_POSITION + GB2312_FIRST_CHAR)
{
tmpword = currentStr + tmpword;
}
char[] carray = tmpword.ToCharArray();

ReadOnlySpan<char> carray = tmpword.AsSpan();
long hashId = Hash1(carray);
int index = GetAvaliableIndex(hashId, carray);

if (index != -1)
{
if (bigramHashTable[index] == 0)
{
bigramHashTable[index] = hashId;
// bigramStringTable[index] = tmpword;

}
frequencyTable[index] += buffer[0];
}
Expand All @@ -326,8 +343,7 @@ public virtual void LoadFromFile(string dctFilePath)
}
// log.info("load dictionary done! " + dctFilePath + " total:" + total);
}

private int GetAvaliableIndex(long hashId, char[] carray)
private int GetAvaliableIndex(long hashId, ReadOnlySpan<char> carray)
{
int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH;
Expand Down Expand Up @@ -357,7 +373,7 @@ private int GetAvaliableIndex(long hashId, char[] carray)
/// <summary>
/// lookup the index into the frequency array.
/// </summary>
private int GetBigramItemIndex(char[] carray)
private int GetBigramItemIndex(ReadOnlySpan<char> carray)
{
long hashId = Hash1(carray);
int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
Expand Down Expand Up @@ -388,7 +404,7 @@ private int GetBigramItemIndex(char[] carray)
return -1;
}

public int GetFrequency(char[] carray)
public int GetFrequency(ReadOnlySpan<char> carray)
{
int index = GetBigramItemIndex(carray);
if (index != -1)
Expand Down
45 changes: 23 additions & 22 deletions src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -340,62 +340,62 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
/// <param name="dctFilePath">path to word dictionary (coredict.dct)</param>
/// <returns>number of words read</returns>
/// <param name="dctFilePath">Path to word dictionary (coredict.dct)</param>
/// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
int i, cnt, length, total = 0;
// The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.

// The file only counted 6763 Chinese characters plus 5 reserved slots (3756~3760).
// The 3756th is used (as a header) to store information.
int[]
buffer = new int[3];
byte[] intBuffer = new byte[4];

Span<int> buffer = stackalloc int[3];
string tmpword;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please declare tmpword after buffer below, not inline.


// LUCENENET: Removed intBuffer arrays since BinaryReader handles reading values directly in a more type-safe and readable way.
// LUCENENET: Use BinaryReader to simplify endian conversion and stream reading.

using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read))
using (var reader = new BinaryReader(dctFile))
{

// GB2312 characters 0 - 6768
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
{
// if (i == 5231)
// System.out.println(i);

dctFile.Read(intBuffer, 0, intBuffer.Length);
// the dictionary was developed for C, and byte order must be converted to work with Java
cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET

if (cnt <= 0)
{
wordItem_charArrayTable[i] = null;
wordItem_frequencyTable[i] = null;
continue;
}

wordItem_charArrayTable[i] = new char[cnt][];
wordItem_frequencyTable[i] = new int[cnt];
total += cnt;
int j = 0;
while (j < cnt)
{
// wordItemTable[i][j] = new WordItem();
dctFile.Read(intBuffer, 0, intBuffer.Length);
buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
.GetInt32();// frequency
dctFile.Read(intBuffer, 0, intBuffer.Length);
buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
.GetInt32();// length
dctFile.Read(intBuffer, 0, intBuffer.Length);
buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
.GetInt32();// handle

// LUCENENET: Use BinaryReader to decode little endian instead of ByteBuffer, since this is the default in .NET
buffer[0] = reader.ReadInt32(); // frequency
buffer[1] = reader.ReadInt32(); // length
buffer[2] = reader.ReadInt32(); // handle

// wordItemTable[i][j].frequency = buffer[0];

wordItem_frequencyTable[i][j] = buffer[0];

length = buffer[1];
if (length > 0)
{
byte[] lchBuffer = new byte[length];
dctFile.Read(lchBuffer, 0, lchBuffer.Length);
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET specific: use cached encoding instance from base class
byte[] lchBuffer = reader.ReadBytes(length);
tmpword = gb2312Encoding.GetString(lchBuffer); // LUCENENET: Use cached encoding instance from base class
wordItem_charArrayTable[i][j] = tmpword.ToCharArray();
}
else
Expand All @@ -411,6 +411,7 @@ private int LoadMainDataFromFile(string dctFilePath)
SetTableIndex(str[0], i);
}
}

return total;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,13 @@
<NoWarn>$(NoWarn);1591;1573</NoWarn>
</PropertyGroup>



<ItemGroup>
<EmbeddedResource Include="Hhmm/*.mem" Label="Dict Test Data" />
<EmbeddedResource Include="**/*.txt" Exclude="bin/**/*;obj/**/*" Label="Text Test Data" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\dotnet\Lucene.Net.ICU\Lucene.Net.ICU.csproj" />
<ProjectReference Include="..\dotnet\Lucene.Net.ICU\Lucene.Net.ICU.csproj" />
<ProjectReference Include="..\Lucene.Net\Lucene.Net.csproj" />
<ProjectReference Include="..\Lucene.Net.Analysis.Common\Lucene.Net.Analysis.Common.csproj" />
</ItemGroup>
Expand All @@ -64,4 +62,8 @@
<PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" />
</ItemGroup>

<ItemGroup>
<InternalsVisibleTo Include="Lucene.Net.Tests.Analysis.SmartCn" />
</ItemGroup>

</Project>
108 changes: 108 additions & 0 deletions src/Lucene.Net.Tests.Analysis.SmartCn/Hhmm/TestBuildDictionary.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
using J2N;
using Lucene.Net.Analysis.Cn.Smart;
using Lucene.Net.Analysis.Cn.Smart.Hhmm;
using Lucene.Net.Attributes;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
using System.IO;
using Assert = Lucene.Net.TestFramework.Assert;

namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

[LuceneNetSpecific]
public class TestBuildDictionary : LuceneTestCase
{
    // Working directory that holds the unzipped test dictionary files (.dct inputs).
    private DirectoryInfo tempDir;

    /// <summary>
    /// Unzips the bundled <c>custom-dictionary-input.zip</c> into a fresh temp
    /// directory and points <see cref="AnalyzerProfile.ANALYSIS_DATA_DIR"/> at it,
    /// so the dictionary singletons load this test data instead of production files.
    /// </summary>
    public override void OneTimeSetUp()
    {
        base.OneTimeSetUp();

        tempDir = CreateTempDir("smartcn-data");
        AnalyzerProfile.ANALYSIS_DATA_DIR = tempDir.FullName;

        using var zipStream = typeof(TestBuildDictionary).FindAndGetManifestResourceStream("custom-dictionary-input.zip");
        TestUtil.Unzip(zipStream, tempDir);
    }

    /// <summary>
    /// Resets <see cref="AnalyzerProfile.ANALYSIS_DATA_DIR"/> so other test
    /// fixtures do not accidentally load this fixture's dictionary data.
    /// </summary>
    public override void OneTimeTearDown()
    {
        AnalyzerProfile.ANALYSIS_DATA_DIR = null; // Ensure this test data is not loaded for other tests
        base.OneTimeTearDown();
    }

    [Test]
    public void TestBigramDictionary()
    {
        // Pass 1: the dictionary is built by parsing the .dct file.
        BigramDictionary bigramDict = BigramDictionary.GetInstance();
        CheckBigramDictionary(bigramDict);

        // The first load should have serialized a .mem cache next to the input.
        string memPath = System.IO.Path.Combine(tempDir.FullName, "bigramdict.mem");
        Assert.IsTrue(File.Exists(memPath), "Memory file should be created after first load");

        // Remove the .dct so the next load cannot re-parse it.
        string dctPath = System.IO.Path.Combine(tempDir.FullName, "bigramdict.dct");
        Assert.IsTrue(File.Exists(dctPath), $"{dctPath} does not exist.");
        File.Delete(dctPath);

        // Pass 2: the dictionary must now come from the .mem cache alone.
        bigramDict = BigramDictionary.GetInstance();
        CheckBigramDictionary(bigramDict);
    }

    // Verifies the two bigram entries known to exist in the test dictionary.
    private static void CheckBigramDictionary(BigramDictionary dictionary)
    {
        Assert.AreEqual(10, dictionary.GetFrequency("啊hello".AsSpan()), "Frequency for '啊hello' is incorrect.");
        Assert.AreEqual(20, dictionary.GetFrequency("阿world".AsSpan()), "Frequency for '阿world' is incorrect.");
    }

    [Test]
    public void TestWordDictionary()
    {
        // Pass 1: the dictionary is built by parsing the .dct file.
        WordDictionary wordDict = WordDictionary.GetInstance();
        CheckWordDictionary(wordDict);

        // The first load should have serialized a .mem cache next to the input.
        string memPath = System.IO.Path.Combine(tempDir.FullName, "coredict.mem");
        Assert.IsTrue(File.Exists(memPath), "Memory file should be created after first load");

        // Remove the .dct so the next load cannot re-parse it.
        string dctPath = System.IO.Path.Combine(tempDir.FullName, "coredict.dct");
        Assert.IsTrue(File.Exists(dctPath), $"{dctPath} does not exist.");
        File.Delete(dctPath);

        // Pass 2: the dictionary must now come from the .mem cache alone.
        wordDict = WordDictionary.GetInstance();
        CheckWordDictionary(wordDict);
    }

    // Verifies a known word frequency plus the zero-frequency path for an unknown word.
    private static void CheckWordDictionary(WordDictionary dictionary)
    {
        Assert.AreEqual(30, dictionary.GetFrequency("尼".ToCharArray()), "Frequency for '尼' is incorrect.");
        Assert.AreEqual(0, dictionary.GetFrequency("missing".ToCharArray()), "Expected frequency 0 for unknown word.");
    }
}
}
Binary file not shown.
Loading