Visual Studio Reformat: Emby.Server.Implementations Part T-T
This commit is contained in:
parent
0efc699e3d
commit
25f0315e91
|
@ -1,14 +1,14 @@
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Linq;
|
||||||
|
using MediaBrowser.Controller.Configuration;
|
||||||
|
using MediaBrowser.Controller.Dto;
|
||||||
using MediaBrowser.Controller.Entities;
|
using MediaBrowser.Controller.Entities;
|
||||||
using MediaBrowser.Controller.Entities.TV;
|
using MediaBrowser.Controller.Entities.TV;
|
||||||
using MediaBrowser.Controller.Library;
|
using MediaBrowser.Controller.Library;
|
||||||
using MediaBrowser.Controller.TV;
|
using MediaBrowser.Controller.TV;
|
||||||
using MediaBrowser.Model.Entities;
|
using MediaBrowser.Model.Entities;
|
||||||
using MediaBrowser.Model.Querying;
|
using MediaBrowser.Model.Querying;
|
||||||
using System;
|
|
||||||
using System.Collections.Generic;
|
|
||||||
using System.Linq;
|
|
||||||
using MediaBrowser.Controller.Configuration;
|
|
||||||
using MediaBrowser.Controller.Dto;
|
|
||||||
|
|
||||||
namespace Emby.Server.Implementations.TV
|
namespace Emby.Server.Implementations.TV
|
||||||
{
|
{
|
||||||
|
|
|
@ -1,10 +1,8 @@
|
||||||
using System;
|
using System;
|
||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
using System.IO;
|
|
||||||
using System.IO.Compression;
|
|
||||||
using NLangDetect.Core.Utils;
|
|
||||||
using MediaBrowser.Model.Serialization;
|
|
||||||
using System.Linq;
|
using System.Linq;
|
||||||
|
using MediaBrowser.Model.Serialization;
|
||||||
|
using NLangDetect.Core.Utils;
|
||||||
|
|
||||||
namespace NLangDetect.Core
|
namespace NLangDetect.Core
|
||||||
{
|
{
|
||||||
|
|
|
@ -1,15 +1,15 @@
|
||||||
namespace NLangDetect.Core
|
namespace NLangDetect.Core
|
||||||
{
|
{
|
||||||
public enum ErrorCode
|
public enum ErrorCode
|
||||||
{
|
{
|
||||||
NoTextError,
|
NoTextError,
|
||||||
FormatError,
|
FormatError,
|
||||||
FileLoadError,
|
FileLoadError,
|
||||||
DuplicateLangError,
|
DuplicateLangError,
|
||||||
NeedLoadProfileError,
|
NeedLoadProfileError,
|
||||||
CantDetectError,
|
CantDetectError,
|
||||||
CantOpenTrainData,
|
CantOpenTrainData,
|
||||||
TrainDataFormatError,
|
TrainDataFormatError,
|
||||||
InitParamError,
|
InitParamError,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,13 +2,13 @@
|
||||||
|
|
||||||
namespace NLangDetect.Core.Extensions
|
namespace NLangDetect.Core.Extensions
|
||||||
{
|
{
|
||||||
public static class CharExtensions
|
public static class CharExtensions
|
||||||
{
|
{
|
||||||
private const int MIN_CODE_POINT = 0x000000;
|
private const int MIN_CODE_POINT = 0x000000;
|
||||||
private const int MAX_CODE_POINT = 0x10ffff;
|
private const int MAX_CODE_POINT = 0x10ffff;
|
||||||
|
|
||||||
private static readonly int[] _unicodeBlockStarts =
|
private static readonly int[] _unicodeBlockStarts =
|
||||||
{
|
{
|
||||||
#region Unicode block starts
|
#region Unicode block starts
|
||||||
|
|
||||||
0x0000, // Basic Latin
|
0x0000, // Basic Latin
|
||||||
|
@ -165,8 +165,8 @@ namespace NLangDetect.Core.Extensions
|
||||||
#endregion
|
#endregion
|
||||||
};
|
};
|
||||||
|
|
||||||
private static readonly UnicodeBlock?[] _unicodeBlocks =
|
private static readonly UnicodeBlock?[] _unicodeBlocks =
|
||||||
{
|
{
|
||||||
#region Unicode blocks
|
#region Unicode blocks
|
||||||
UnicodeBlock.BasicLatin,
|
UnicodeBlock.BasicLatin,
|
||||||
UnicodeBlock.Latin1Supplement,
|
UnicodeBlock.Latin1Supplement,
|
||||||
|
@ -322,53 +322,53 @@ namespace NLangDetect.Core.Extensions
|
||||||
#endregion
|
#endregion
|
||||||
};
|
};
|
||||||
|
|
||||||
#region Public methods
|
#region Public methods
|
||||||
|
|
||||||
/// <remarks>
|
/// <remarks>
|
||||||
/// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
|
/// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
|
||||||
/// </remarks>
|
/// </remarks>
|
||||||
public static UnicodeBlock? GetUnicodeBlock(this char ch)
|
public static UnicodeBlock? GetUnicodeBlock(this char ch)
|
||||||
{
|
|
||||||
int codePoint = ch;
|
|
||||||
|
|
||||||
if (!IsValidCodePoint(codePoint))
|
|
||||||
{
|
|
||||||
throw new ArgumentException("Argument is not a valid code point.", nameof(ch));
|
|
||||||
}
|
|
||||||
|
|
||||||
int top, bottom, current;
|
|
||||||
|
|
||||||
bottom = 0;
|
|
||||||
top = _unicodeBlockStarts.Length;
|
|
||||||
current = top / 2;
|
|
||||||
|
|
||||||
// invariant: top > current >= bottom && codePoint >= unicodeBlockStarts[bottom]
|
|
||||||
while (top - bottom > 1)
|
|
||||||
{
|
|
||||||
if (codePoint >= _unicodeBlockStarts[current])
|
|
||||||
{
|
{
|
||||||
bottom = current;
|
int codePoint = ch;
|
||||||
}
|
|
||||||
else
|
if (!IsValidCodePoint(codePoint))
|
||||||
{
|
{
|
||||||
top = current;
|
throw new ArgumentException("Argument is not a valid code point.", nameof(ch));
|
||||||
|
}
|
||||||
|
|
||||||
|
int top, bottom, current;
|
||||||
|
|
||||||
|
bottom = 0;
|
||||||
|
top = _unicodeBlockStarts.Length;
|
||||||
|
current = top / 2;
|
||||||
|
|
||||||
|
// invariant: top > current >= bottom && codePoint >= unicodeBlockStarts[bottom]
|
||||||
|
while (top - bottom > 1)
|
||||||
|
{
|
||||||
|
if (codePoint >= _unicodeBlockStarts[current])
|
||||||
|
{
|
||||||
|
bottom = current;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
top = current;
|
||||||
|
}
|
||||||
|
|
||||||
|
current = (top + bottom) / 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
return _unicodeBlocks[current];
|
||||||
}
|
}
|
||||||
|
|
||||||
current = (top + bottom) / 2;
|
#endregion
|
||||||
}
|
|
||||||
|
|
||||||
return _unicodeBlocks[current];
|
#region Private helper methods
|
||||||
|
|
||||||
|
private static bool IsValidCodePoint(int codePoint)
|
||||||
|
{
|
||||||
|
return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
}
|
}
|
||||||
|
|
||||||
#endregion
|
|
||||||
|
|
||||||
#region Private helper methods
|
|
||||||
|
|
||||||
private static bool IsValidCodePoint(int codePoint)
|
|
||||||
{
|
|
||||||
return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endregion
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,50 +2,50 @@
|
||||||
|
|
||||||
namespace NLangDetect.Core.Extensions
|
namespace NLangDetect.Core.Extensions
|
||||||
{
|
{
|
||||||
public static class RandomExtensions
|
public static class RandomExtensions
|
||||||
{
|
|
||||||
private const double _Epsilon = 2.22044604925031E-15;
|
|
||||||
|
|
||||||
private static readonly object _mutex = new object();
|
|
||||||
|
|
||||||
private static double _nextNextGaussian;
|
|
||||||
private static bool _hasNextNextGaussian;
|
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
/// Returns the next pseudorandom, Gaussian ("normally") distributed double value with mean 0.0 and standard deviation 1.0 from this random number generator's sequence.
|
|
||||||
/// The general contract of nextGaussian is that one double value, chosen from (approximately) the usual normal distribution with mean 0.0 and standard deviation 1.0, is pseudorandomly generated and returned.
|
|
||||||
/// </summary>
|
|
||||||
/// <remarks>
|
|
||||||
/// Taken from: http://download.oracle.com/javase/6/docs/api/java/util/Random.html (nextGaussian())
|
|
||||||
/// </remarks>
|
|
||||||
public static double NextGaussian(this Random random)
|
|
||||||
{
|
{
|
||||||
lock (_mutex)
|
private const double _Epsilon = 2.22044604925031E-15;
|
||||||
{
|
|
||||||
if (_hasNextNextGaussian)
|
private static readonly object _mutex = new object();
|
||||||
|
|
||||||
|
private static double _nextNextGaussian;
|
||||||
|
private static bool _hasNextNextGaussian;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Returns the next pseudorandom, Gaussian ("normally") distributed double value with mean 0.0 and standard deviation 1.0 from this random number generator's sequence.
|
||||||
|
/// The general contract of nextGaussian is that one double value, chosen from (approximately) the usual normal distribution with mean 0.0 and standard deviation 1.0, is pseudorandomly generated and returned.
|
||||||
|
/// </summary>
|
||||||
|
/// <remarks>
|
||||||
|
/// Taken from: http://download.oracle.com/javase/6/docs/api/java/util/Random.html (nextGaussian())
|
||||||
|
/// </remarks>
|
||||||
|
public static double NextGaussian(this Random random)
|
||||||
{
|
{
|
||||||
_hasNextNextGaussian = false;
|
lock (_mutex)
|
||||||
|
{
|
||||||
|
if (_hasNextNextGaussian)
|
||||||
|
{
|
||||||
|
_hasNextNextGaussian = false;
|
||||||
|
|
||||||
return _nextNextGaussian;
|
return _nextNextGaussian;
|
||||||
|
}
|
||||||
|
|
||||||
|
double v1, v2, s;
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
v1 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
|
||||||
|
v2 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
|
||||||
|
s = v1 * v1 + v2 * v2;
|
||||||
|
}
|
||||||
|
while (s >= 1.0 || Math.Abs(s - 0.0) < _Epsilon);
|
||||||
|
|
||||||
|
double multiplier = Math.Sqrt(-2.0 * Math.Log(s) / s);
|
||||||
|
|
||||||
|
_nextNextGaussian = v2 * multiplier;
|
||||||
|
_hasNextNextGaussian = true;
|
||||||
|
|
||||||
|
return v1 * multiplier;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
double v1, v2, s;
|
|
||||||
|
|
||||||
do
|
|
||||||
{
|
|
||||||
v1 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
|
|
||||||
v2 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
|
|
||||||
s = v1 * v1 + v2 * v2;
|
|
||||||
}
|
|
||||||
while (s >= 1.0 || Math.Abs(s - 0.0) < _Epsilon);
|
|
||||||
|
|
||||||
double multiplier = Math.Sqrt(-2.0 * Math.Log(s) / s);
|
|
||||||
|
|
||||||
_nextNextGaussian = v2 * multiplier;
|
|
||||||
_hasNextNextGaussian = true;
|
|
||||||
|
|
||||||
return v1 * multiplier;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,131 +1,131 @@
|
||||||
namespace NLangDetect.Core.Extensions
|
namespace NLangDetect.Core.Extensions
|
||||||
{
|
{
|
||||||
public enum UnicodeBlock
|
public enum UnicodeBlock
|
||||||
{
|
{
|
||||||
BasicLatin,
|
BasicLatin,
|
||||||
Latin1Supplement,
|
Latin1Supplement,
|
||||||
LatinExtendedA,
|
LatinExtendedA,
|
||||||
LatinExtendedB,
|
LatinExtendedB,
|
||||||
IpaExtensions,
|
IpaExtensions,
|
||||||
SpacingModifierLetters,
|
SpacingModifierLetters,
|
||||||
CombiningDiacriticalMarks,
|
CombiningDiacriticalMarks,
|
||||||
Greek,
|
Greek,
|
||||||
Cyrillic,
|
Cyrillic,
|
||||||
CyrillicSupplementary,
|
CyrillicSupplementary,
|
||||||
Armenian,
|
Armenian,
|
||||||
Hebrew,
|
Hebrew,
|
||||||
Arabic,
|
Arabic,
|
||||||
Syriac,
|
Syriac,
|
||||||
Thaana,
|
Thaana,
|
||||||
Devanagari,
|
Devanagari,
|
||||||
Bengali,
|
Bengali,
|
||||||
Gurmukhi,
|
Gurmukhi,
|
||||||
Gujarati,
|
Gujarati,
|
||||||
Oriya,
|
Oriya,
|
||||||
Tamil,
|
Tamil,
|
||||||
Telugu,
|
Telugu,
|
||||||
Kannada,
|
Kannada,
|
||||||
Malayalam,
|
Malayalam,
|
||||||
Sinhala,
|
Sinhala,
|
||||||
Thai,
|
Thai,
|
||||||
Lao,
|
Lao,
|
||||||
Tibetan,
|
Tibetan,
|
||||||
Myanmar,
|
Myanmar,
|
||||||
Georgian,
|
Georgian,
|
||||||
HangulJamo,
|
HangulJamo,
|
||||||
Ethiopic,
|
Ethiopic,
|
||||||
Cherokee,
|
Cherokee,
|
||||||
UnifiedCanadianAboriginalSyllabics,
|
UnifiedCanadianAboriginalSyllabics,
|
||||||
Ogham,
|
Ogham,
|
||||||
Runic,
|
Runic,
|
||||||
Tagalog,
|
Tagalog,
|
||||||
Hanunoo,
|
Hanunoo,
|
||||||
Buhid,
|
Buhid,
|
||||||
Tagbanwa,
|
Tagbanwa,
|
||||||
Khmer,
|
Khmer,
|
||||||
Mongolian,
|
Mongolian,
|
||||||
Limbu,
|
Limbu,
|
||||||
TaiLe,
|
TaiLe,
|
||||||
KhmerSymbols,
|
KhmerSymbols,
|
||||||
PhoneticExtensions,
|
PhoneticExtensions,
|
||||||
LatinExtendedAdditional,
|
LatinExtendedAdditional,
|
||||||
GreekExtended,
|
GreekExtended,
|
||||||
GeneralPunctuation,
|
GeneralPunctuation,
|
||||||
SuperscriptsAndSubscripts,
|
SuperscriptsAndSubscripts,
|
||||||
CurrencySymbols,
|
CurrencySymbols,
|
||||||
CombiningMarksForSymbols,
|
CombiningMarksForSymbols,
|
||||||
LetterlikeSymbols,
|
LetterlikeSymbols,
|
||||||
NumberForms,
|
NumberForms,
|
||||||
Arrows,
|
Arrows,
|
||||||
MathematicalOperators,
|
MathematicalOperators,
|
||||||
MiscellaneousTechnical,
|
MiscellaneousTechnical,
|
||||||
ControlPictures,
|
ControlPictures,
|
||||||
OpticalCharacterRecognition,
|
OpticalCharacterRecognition,
|
||||||
EnclosedAlphanumerics,
|
EnclosedAlphanumerics,
|
||||||
BoxDrawing,
|
BoxDrawing,
|
||||||
BlockElements,
|
BlockElements,
|
||||||
GeometricShapes,
|
GeometricShapes,
|
||||||
MiscellaneousSymbols,
|
MiscellaneousSymbols,
|
||||||
Dingbats,
|
Dingbats,
|
||||||
MiscellaneousMathematicalSymbolsA,
|
MiscellaneousMathematicalSymbolsA,
|
||||||
SupplementalArrowsA,
|
SupplementalArrowsA,
|
||||||
BraillePatterns,
|
BraillePatterns,
|
||||||
SupplementalArrowsB,
|
SupplementalArrowsB,
|
||||||
MiscellaneousMathematicalSymbolsB,
|
MiscellaneousMathematicalSymbolsB,
|
||||||
SupplementalMathematicalOperators,
|
SupplementalMathematicalOperators,
|
||||||
MiscellaneousSymbolsAndArrows,
|
MiscellaneousSymbolsAndArrows,
|
||||||
CjkRadicalsSupplement,
|
CjkRadicalsSupplement,
|
||||||
KangxiRadicals,
|
KangxiRadicals,
|
||||||
IdeographicDescriptionCharacters,
|
IdeographicDescriptionCharacters,
|
||||||
CjkSymbolsAndPunctuation,
|
CjkSymbolsAndPunctuation,
|
||||||
Hiragana,
|
Hiragana,
|
||||||
Katakana,
|
Katakana,
|
||||||
Bopomofo,
|
Bopomofo,
|
||||||
HangulCompatibilityJamo,
|
HangulCompatibilityJamo,
|
||||||
Kanbun,
|
Kanbun,
|
||||||
BopomofoExtended,
|
BopomofoExtended,
|
||||||
KatakanaPhoneticExtensions,
|
KatakanaPhoneticExtensions,
|
||||||
EnclosedCjkLettersAndMonths,
|
EnclosedCjkLettersAndMonths,
|
||||||
CjkCompatibility,
|
CjkCompatibility,
|
||||||
CjkUnifiedIdeographsExtensionA,
|
CjkUnifiedIdeographsExtensionA,
|
||||||
YijingHexagramSymbols,
|
YijingHexagramSymbols,
|
||||||
CjkUnifiedIdeographs,
|
CjkUnifiedIdeographs,
|
||||||
YiSyllables,
|
YiSyllables,
|
||||||
YiRadicals,
|
YiRadicals,
|
||||||
HangulSyllables,
|
HangulSyllables,
|
||||||
HighSurrogates,
|
HighSurrogates,
|
||||||
HighPrivateUseSurrogates,
|
HighPrivateUseSurrogates,
|
||||||
LowSurrogates,
|
LowSurrogates,
|
||||||
PrivateUseArea,
|
PrivateUseArea,
|
||||||
CjkCompatibilityIdeographs,
|
CjkCompatibilityIdeographs,
|
||||||
AlphabeticPresentationForms,
|
AlphabeticPresentationForms,
|
||||||
ArabicPresentationFormsA,
|
ArabicPresentationFormsA,
|
||||||
VariationSelectors,
|
VariationSelectors,
|
||||||
CombiningHalfMarks,
|
CombiningHalfMarks,
|
||||||
CjkCompatibilityForms,
|
CjkCompatibilityForms,
|
||||||
SmallFormVariants,
|
SmallFormVariants,
|
||||||
ArabicPresentationFormsB,
|
ArabicPresentationFormsB,
|
||||||
HalfwidthAndFullwidthForms,
|
HalfwidthAndFullwidthForms,
|
||||||
Specials,
|
Specials,
|
||||||
LinearBSyllabary,
|
LinearBSyllabary,
|
||||||
LinearBIdeograms,
|
LinearBIdeograms,
|
||||||
AegeanNumbers,
|
AegeanNumbers,
|
||||||
OldItalic,
|
OldItalic,
|
||||||
Gothic,
|
Gothic,
|
||||||
Ugaritic,
|
Ugaritic,
|
||||||
Deseret,
|
Deseret,
|
||||||
Shavian,
|
Shavian,
|
||||||
Osmanya,
|
Osmanya,
|
||||||
CypriotSyllabary,
|
CypriotSyllabary,
|
||||||
ByzantineMusicalSymbols,
|
ByzantineMusicalSymbols,
|
||||||
MusicalSymbols,
|
MusicalSymbols,
|
||||||
TaiXuanJingSymbols,
|
TaiXuanJingSymbols,
|
||||||
MathematicalAlphanumericSymbols,
|
MathematicalAlphanumericSymbols,
|
||||||
CjkUnifiedIdeographsExtensionB,
|
CjkUnifiedIdeographsExtensionB,
|
||||||
CjkCompatibilityIdeographsSupplement,
|
CjkCompatibilityIdeographsSupplement,
|
||||||
Tags,
|
Tags,
|
||||||
VariationSelectorsSupplement,
|
VariationSelectorsSupplement,
|
||||||
SupplementaryPrivateUseAreaA,
|
SupplementaryPrivateUseAreaA,
|
||||||
SupplementaryPrivateUseAreaB,
|
SupplementaryPrivateUseAreaB,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,67 +1,67 @@
|
||||||
using System;
|
using System;
|
||||||
|
using System.IO;
|
||||||
using System.IO.Compression;
|
using System.IO.Compression;
|
||||||
using System.Xml;
|
using System.Xml;
|
||||||
using NLangDetect.Core.Utils;
|
using NLangDetect.Core.Utils;
|
||||||
using System.IO;
|
|
||||||
|
|
||||||
namespace NLangDetect.Core
|
namespace NLangDetect.Core
|
||||||
{
|
{
|
||||||
// TODO IMM HI: xml reader not tested
|
// TODO IMM HI: xml reader not tested
|
||||||
public static class GenProfile
|
public static class GenProfile
|
||||||
{
|
|
||||||
#region Public methods
|
|
||||||
|
|
||||||
public static LangProfile load(string lang, string file)
|
|
||||||
{
|
{
|
||||||
LangProfile profile = new LangProfile(lang);
|
#region Public methods
|
||||||
TagExtractor tagextractor = new TagExtractor("abstract", 100);
|
|
||||||
Stream inputStream = null;
|
|
||||||
|
|
||||||
try
|
public static LangProfile load(string lang, string file)
|
||||||
{
|
|
||||||
inputStream = File.OpenRead(file);
|
|
||||||
|
|
||||||
string extension = Path.GetExtension(file) ?? "";
|
|
||||||
|
|
||||||
if (extension.ToUpper() == ".GZ")
|
|
||||||
{
|
{
|
||||||
inputStream = new GZipStream(inputStream, CompressionMode.Decompress);
|
LangProfile profile = new LangProfile(lang);
|
||||||
}
|
TagExtractor tagextractor = new TagExtractor("abstract", 100);
|
||||||
|
Stream inputStream = null;
|
||||||
|
|
||||||
using (XmlReader xmlReader = XmlReader.Create(inputStream))
|
try
|
||||||
{
|
|
||||||
while (xmlReader.Read())
|
|
||||||
{
|
|
||||||
switch (xmlReader.NodeType)
|
|
||||||
{
|
{
|
||||||
case XmlNodeType.Element:
|
inputStream = File.OpenRead(file);
|
||||||
tagextractor.SetTag(xmlReader.Name);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case XmlNodeType.Text:
|
string extension = Path.GetExtension(file) ?? "";
|
||||||
tagextractor.Add(xmlReader.Value);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case XmlNodeType.EndElement:
|
if (extension.ToUpper() == ".GZ")
|
||||||
tagextractor.CloseTag(profile);
|
{
|
||||||
break;
|
inputStream = new GZipStream(inputStream, CompressionMode.Decompress);
|
||||||
|
}
|
||||||
|
|
||||||
|
using (XmlReader xmlReader = XmlReader.Create(inputStream))
|
||||||
|
{
|
||||||
|
while (xmlReader.Read())
|
||||||
|
{
|
||||||
|
switch (xmlReader.NodeType)
|
||||||
|
{
|
||||||
|
case XmlNodeType.Element:
|
||||||
|
tagextractor.SetTag(xmlReader.Name);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case XmlNodeType.Text:
|
||||||
|
tagextractor.Add(xmlReader.Value);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case XmlNodeType.EndElement:
|
||||||
|
tagextractor.CloseTag(profile);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
if (inputStream != null)
|
||||||
|
{
|
||||||
|
inputStream.Close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
finally
|
|
||||||
{
|
|
||||||
if (inputStream != null)
|
|
||||||
{
|
|
||||||
inputStream.Close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Console.WriteLine(lang + ": " + tagextractor.Count);
|
Console.WriteLine(lang + ": " + tagextractor.Count);
|
||||||
|
|
||||||
return profile;
|
return profile;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
}
|
}
|
||||||
|
|
||||||
#endregion
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,21 +2,21 @@
|
||||||
|
|
||||||
namespace NLangDetect.Core
|
namespace NLangDetect.Core
|
||||||
{
|
{
|
||||||
[Serializable]
|
[Serializable]
|
||||||
public class InternalException : Exception
|
public class InternalException : Exception
|
||||||
{
|
|
||||||
#region Constructor(s)
|
|
||||||
|
|
||||||
public InternalException(string message, Exception innerException)
|
|
||||||
: base(message, innerException)
|
|
||||||
{
|
{
|
||||||
}
|
#region Constructor(s)
|
||||||
|
|
||||||
public InternalException(string message)
|
public InternalException(string message, Exception innerException)
|
||||||
: this(message, null)
|
: base(message, innerException)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
#endregion
|
public InternalException(string message)
|
||||||
}
|
: this(message, null)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,44 +2,44 @@ using System.Globalization;
|
||||||
|
|
||||||
namespace NLangDetect.Core
|
namespace NLangDetect.Core
|
||||||
{
|
{
|
||||||
// TODO IMM HI: name??
|
// TODO IMM HI: name??
|
||||||
public class Language
|
public class Language
|
||||||
{
|
|
||||||
#region Constructor(s)
|
|
||||||
|
|
||||||
public Language(string name, double probability)
|
|
||||||
{
|
{
|
||||||
Name = name;
|
#region Constructor(s)
|
||||||
Probability = probability;
|
|
||||||
|
public Language(string name, double probability)
|
||||||
|
{
|
||||||
|
Name = name;
|
||||||
|
Probability = probability;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
|
||||||
|
#region Object overrides
|
||||||
|
|
||||||
|
public override string ToString()
|
||||||
|
{
|
||||||
|
if (Name == null)
|
||||||
|
{
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
return
|
||||||
|
string.Format(
|
||||||
|
CultureInfo.InvariantCulture.NumberFormat,
|
||||||
|
"{0}:{1:0.000000}",
|
||||||
|
Name,
|
||||||
|
Probability);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
|
||||||
|
#region Properties
|
||||||
|
|
||||||
|
public string Name { get; set; }
|
||||||
|
|
||||||
|
public double Probability { get; set; }
|
||||||
|
|
||||||
|
#endregion
|
||||||
}
|
}
|
||||||
|
|
||||||
#endregion
|
|
||||||
|
|
||||||
#region Object overrides
|
|
||||||
|
|
||||||
public override string ToString()
|
|
||||||
{
|
|
||||||
if (Name == null)
|
|
||||||
{
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
|
|
||||||
return
|
|
||||||
string.Format(
|
|
||||||
CultureInfo.InvariantCulture.NumberFormat,
|
|
||||||
"{0}:{1:0.000000}",
|
|
||||||
Name,
|
|
||||||
Probability);
|
|
||||||
}
|
|
||||||
|
|
||||||
#endregion
|
|
||||||
|
|
||||||
#region Properties
|
|
||||||
|
|
||||||
public string Name { get; set; }
|
|
||||||
|
|
||||||
public double Probability { get; set; }
|
|
||||||
|
|
||||||
#endregion
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,22 +2,22 @@
|
||||||
|
|
||||||
namespace NLangDetect.Core
|
namespace NLangDetect.Core
|
||||||
{
|
{
|
||||||
public class NLangDetectException : Exception
|
public class NLangDetectException : Exception
|
||||||
{
|
|
||||||
#region Constructor(s)
|
|
||||||
|
|
||||||
public NLangDetectException(string message, ErrorCode errorCode)
|
|
||||||
: base(message)
|
|
||||||
{
|
{
|
||||||
ErrorCode = errorCode;
|
#region Constructor(s)
|
||||||
|
|
||||||
|
public NLangDetectException(string message, ErrorCode errorCode)
|
||||||
|
: base(message)
|
||||||
|
{
|
||||||
|
ErrorCode = errorCode;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
|
||||||
|
#region Properties
|
||||||
|
|
||||||
|
public ErrorCode ErrorCode { get; private set; }
|
||||||
|
|
||||||
|
#endregion
|
||||||
}
|
}
|
||||||
|
|
||||||
#endregion
|
|
||||||
|
|
||||||
#region Properties
|
|
||||||
|
|
||||||
public ErrorCode ErrorCode { get; private set; }
|
|
||||||
|
|
||||||
#endregion
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,33 +3,33 @@ using System.Collections.Generic;
|
||||||
|
|
||||||
namespace NLangDetect.Core
|
namespace NLangDetect.Core
|
||||||
{
|
{
|
||||||
public class ProbVector
|
public class ProbVector
|
||||||
{
|
|
||||||
private readonly Dictionary<int, double> _dict = new Dictionary<int, double>();
|
|
||||||
|
|
||||||
public double this[int key]
|
|
||||||
{
|
{
|
||||||
get
|
private readonly Dictionary<int, double> _dict = new Dictionary<int, double>();
|
||||||
{
|
|
||||||
double value;
|
|
||||||
|
|
||||||
return _dict.TryGetValue(key, out value) ? value : 0.0;
|
public double this[int key]
|
||||||
}
|
|
||||||
|
|
||||||
set
|
|
||||||
{
|
|
||||||
if (Math.Abs(value) < double.Epsilon)
|
|
||||||
{
|
{
|
||||||
if (_dict.ContainsKey(key))
|
get
|
||||||
{
|
{
|
||||||
_dict.Remove(key);
|
double value;
|
||||||
}
|
|
||||||
|
|
||||||
return;
|
return _dict.TryGetValue(key, out value) ? value : 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
set
|
||||||
|
{
|
||||||
|
if (Math.Abs(value) < double.Epsilon)
|
||||||
|
{
|
||||||
|
if (_dict.ContainsKey(key))
|
||||||
|
{
|
||||||
|
_dict.Remove(key);
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
_dict[key] = value;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
_dict[key] = value;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,9 @@
|
||||||
|
using System;
|
||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
using System.Globalization;
|
using System.Globalization;
|
||||||
using System.IO;
|
using System.IO;
|
||||||
using System.Reflection;
|
|
||||||
using System.Text.RegularExpressions;
|
|
||||||
using System.Linq;
|
using System.Linq;
|
||||||
using System;
|
using System.Text.RegularExpressions;
|
||||||
|
|
||||||
namespace NLangDetect.Core.Utils
|
namespace NLangDetect.Core.Utils
|
||||||
{
|
{
|
||||||
|
@ -29,7 +28,7 @@ namespace NLangDetect.Core.Utils
|
||||||
|
|
||||||
private static Dictionary<string, string> LoadMessages()
|
private static Dictionary<string, string> LoadMessages()
|
||||||
{
|
{
|
||||||
var manifestName = typeof(Messages).Assembly.GetManifestResourceNames().FirstOrDefault(i => i.IndexOf("messages.properties", StringComparison.Ordinal) != -1) ;
|
var manifestName = typeof(Messages).Assembly.GetManifestResourceNames().FirstOrDefault(i => i.IndexOf("messages.properties", StringComparison.Ordinal) != -1);
|
||||||
|
|
||||||
Stream messagesStream =
|
Stream messagesStream =
|
||||||
typeof(Messages).Assembly
|
typeof(Messages).Assembly
|
||||||
|
|
|
@ -6,14 +6,14 @@ using NLangDetect.Core.Extensions;
|
||||||
|
|
||||||
namespace NLangDetect.Core.Utils
|
namespace NLangDetect.Core.Utils
|
||||||
{
|
{
|
||||||
public class NGram
|
public class NGram
|
||||||
{
|
{
|
||||||
public const int GramsCount = 3;
|
public const int GramsCount = 3;
|
||||||
|
|
||||||
private static readonly string Latin1Excluded = Messages.getString("NGram.LATIN1_EXCLUDE");
|
private static readonly string Latin1Excluded = Messages.getString("NGram.LATIN1_EXCLUDE");
|
||||||
|
|
||||||
private static readonly string[] CjkClass =
|
private static readonly string[] CjkClass =
|
||||||
{
|
{
|
||||||
#region CJK classes
|
#region CJK classes
|
||||||
|
|
||||||
Messages.getString("NGram.KANJI_1_0"),
|
Messages.getString("NGram.KANJI_1_0"),
|
||||||
|
@ -146,185 +146,185 @@ namespace NLangDetect.Core.Utils
|
||||||
#endregion
|
#endregion
|
||||||
};
|
};
|
||||||
|
|
||||||
private static readonly Dictionary<char, char> _cjkMap;
|
private static readonly Dictionary<char, char> _cjkMap;
|
||||||
|
|
||||||
private StringBuilder _grams;
|
private StringBuilder _grams;
|
||||||
private bool _capitalword;
|
private bool _capitalword;
|
||||||
|
|
||||||
#region Constructor(s)
|
#region Constructor(s)
|
||||||
|
|
||||||
static NGram()
|
static NGram()
|
||||||
{
|
|
||||||
_cjkMap = new Dictionary<char, char>();
|
|
||||||
|
|
||||||
foreach (string cjk_list in CjkClass)
|
|
||||||
{
|
|
||||||
char representative = cjk_list[0];
|
|
||||||
|
|
||||||
for (int i = 0; i < cjk_list.Length; i++)
|
|
||||||
{
|
{
|
||||||
_cjkMap.Add(cjk_list[i], representative);
|
_cjkMap = new Dictionary<char, char>();
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public NGram()
|
foreach (string cjk_list in CjkClass)
|
||||||
{
|
|
||||||
_grams = new StringBuilder(" ");
|
|
||||||
_capitalword = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endregion
|
|
||||||
|
|
||||||
#region Public methods
|
|
||||||
|
|
||||||
public static char Normalize(char ch)
|
|
||||||
{
|
|
||||||
UnicodeBlock? unicodeBlock = ch.GetUnicodeBlock();
|
|
||||||
|
|
||||||
if (!unicodeBlock.HasValue)
|
|
||||||
{
|
|
||||||
return ch;
|
|
||||||
}
|
|
||||||
|
|
||||||
switch (unicodeBlock.Value)
|
|
||||||
{
|
|
||||||
case UnicodeBlock.BasicLatin:
|
|
||||||
{
|
|
||||||
if (ch < 'A' || (ch < 'a' && ch > 'Z') || ch > 'z')
|
|
||||||
{
|
{
|
||||||
return ' ';
|
char representative = cjk_list[0];
|
||||||
|
|
||||||
|
for (int i = 0; i < cjk_list.Length; i++)
|
||||||
|
{
|
||||||
|
_cjkMap.Add(cjk_list[i], representative);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
case UnicodeBlock.Latin1Supplement:
|
|
||||||
{
|
|
||||||
if (Latin1Excluded.IndexOf(ch) >= 0)
|
|
||||||
{
|
|
||||||
return ' ';
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
case UnicodeBlock.GeneralPunctuation:
|
|
||||||
{
|
|
||||||
return ' ';
|
|
||||||
}
|
|
||||||
|
|
||||||
case UnicodeBlock.Arabic:
|
|
||||||
{
|
|
||||||
if (ch == '\u06cc')
|
|
||||||
{
|
|
||||||
return '\u064a';
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
case UnicodeBlock.LatinExtendedAdditional:
|
|
||||||
{
|
|
||||||
if (ch >= '\u1ea0')
|
|
||||||
{
|
|
||||||
return '\u1ec3';
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
case UnicodeBlock.Hiragana:
|
|
||||||
{
|
|
||||||
return '\u3042';
|
|
||||||
}
|
|
||||||
|
|
||||||
case UnicodeBlock.Katakana:
|
|
||||||
{
|
|
||||||
return '\u30a2';
|
|
||||||
}
|
|
||||||
|
|
||||||
case UnicodeBlock.Bopomofo:
|
|
||||||
case UnicodeBlock.BopomofoExtended:
|
|
||||||
{
|
|
||||||
return '\u3105';
|
|
||||||
}
|
|
||||||
|
|
||||||
case UnicodeBlock.CjkUnifiedIdeographs:
|
|
||||||
{
|
|
||||||
if (_cjkMap.ContainsKey(ch))
|
|
||||||
{
|
|
||||||
return _cjkMap[ch];
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
case UnicodeBlock.HangulSyllables:
|
|
||||||
{
|
|
||||||
return '\uac00';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return ch;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void AddChar(char ch)
|
|
||||||
{
|
|
||||||
ch = Normalize(ch);
|
|
||||||
char lastchar = _grams[_grams.Length - 1];
|
|
||||||
if (lastchar == ' ')
|
|
||||||
{
|
|
||||||
_grams = new StringBuilder(" ");
|
|
||||||
_capitalword = false;
|
|
||||||
if (ch == ' ') return;
|
|
||||||
}
|
|
||||||
else if (_grams.Length >= GramsCount)
|
|
||||||
{
|
|
||||||
_grams.Remove(0, 1);
|
|
||||||
}
|
|
||||||
_grams.Append(ch);
|
|
||||||
|
|
||||||
if (char.IsUpper(ch))
|
|
||||||
{
|
|
||||||
if (char.IsUpper(lastchar)) _capitalword = true;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
_capitalword = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public string Get(int n)
|
|
||||||
{
|
|
||||||
if (_capitalword)
|
|
||||||
{
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
int len = _grams.Length;
|
|
||||||
|
|
||||||
if (n < 1 || n > 3 || len < n)
|
|
||||||
{
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (n == 1)
|
|
||||||
{
|
|
||||||
char ch = _grams[len - 1];
|
|
||||||
|
|
||||||
if (ch == ' ')
|
|
||||||
{
|
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return ch.ToString();
|
public NGram()
|
||||||
}
|
{
|
||||||
|
_grams = new StringBuilder(" ");
|
||||||
|
_capitalword = false;
|
||||||
|
}
|
||||||
|
|
||||||
// TODO IMM HI: is ToString() here effective?
|
#endregion
|
||||||
return _grams.ToString().SubSequence(len - n, len);
|
|
||||||
|
#region Public methods
|
||||||
|
|
||||||
|
public static char Normalize(char ch)
|
||||||
|
{
|
||||||
|
UnicodeBlock? unicodeBlock = ch.GetUnicodeBlock();
|
||||||
|
|
||||||
|
if (!unicodeBlock.HasValue)
|
||||||
|
{
|
||||||
|
return ch;
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (unicodeBlock.Value)
|
||||||
|
{
|
||||||
|
case UnicodeBlock.BasicLatin:
|
||||||
|
{
|
||||||
|
if (ch < 'A' || (ch < 'a' && ch > 'Z') || ch > 'z')
|
||||||
|
{
|
||||||
|
return ' ';
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case UnicodeBlock.Latin1Supplement:
|
||||||
|
{
|
||||||
|
if (Latin1Excluded.IndexOf(ch) >= 0)
|
||||||
|
{
|
||||||
|
return ' ';
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case UnicodeBlock.GeneralPunctuation:
|
||||||
|
{
|
||||||
|
return ' ';
|
||||||
|
}
|
||||||
|
|
||||||
|
case UnicodeBlock.Arabic:
|
||||||
|
{
|
||||||
|
if (ch == '\u06cc')
|
||||||
|
{
|
||||||
|
return '\u064a';
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case UnicodeBlock.LatinExtendedAdditional:
|
||||||
|
{
|
||||||
|
if (ch >= '\u1ea0')
|
||||||
|
{
|
||||||
|
return '\u1ec3';
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case UnicodeBlock.Hiragana:
|
||||||
|
{
|
||||||
|
return '\u3042';
|
||||||
|
}
|
||||||
|
|
||||||
|
case UnicodeBlock.Katakana:
|
||||||
|
{
|
||||||
|
return '\u30a2';
|
||||||
|
}
|
||||||
|
|
||||||
|
case UnicodeBlock.Bopomofo:
|
||||||
|
case UnicodeBlock.BopomofoExtended:
|
||||||
|
{
|
||||||
|
return '\u3105';
|
||||||
|
}
|
||||||
|
|
||||||
|
case UnicodeBlock.CjkUnifiedIdeographs:
|
||||||
|
{
|
||||||
|
if (_cjkMap.ContainsKey(ch))
|
||||||
|
{
|
||||||
|
return _cjkMap[ch];
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case UnicodeBlock.HangulSyllables:
|
||||||
|
{
|
||||||
|
return '\uac00';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ch;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void AddChar(char ch)
|
||||||
|
{
|
||||||
|
ch = Normalize(ch);
|
||||||
|
char lastchar = _grams[_grams.Length - 1];
|
||||||
|
if (lastchar == ' ')
|
||||||
|
{
|
||||||
|
_grams = new StringBuilder(" ");
|
||||||
|
_capitalword = false;
|
||||||
|
if (ch == ' ') return;
|
||||||
|
}
|
||||||
|
else if (_grams.Length >= GramsCount)
|
||||||
|
{
|
||||||
|
_grams.Remove(0, 1);
|
||||||
|
}
|
||||||
|
_grams.Append(ch);
|
||||||
|
|
||||||
|
if (char.IsUpper(ch))
|
||||||
|
{
|
||||||
|
if (char.IsUpper(lastchar)) _capitalword = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
_capitalword = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public string Get(int n)
|
||||||
|
{
|
||||||
|
if (_capitalword)
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
int len = _grams.Length;
|
||||||
|
|
||||||
|
if (n < 1 || n > 3 || len < n)
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n == 1)
|
||||||
|
{
|
||||||
|
char ch = _grams[len - 1];
|
||||||
|
|
||||||
|
if (ch == ' ')
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ch.ToString();
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO IMM HI: is ToString() here effective?
|
||||||
|
return _grams.ToString().SubSequence(len - n, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
}
|
}
|
||||||
|
|
||||||
#endregion
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,75 +2,75 @@ using System.Text;
|
||||||
|
|
||||||
namespace NLangDetect.Core.Utils
|
namespace NLangDetect.Core.Utils
|
||||||
{
|
{
|
||||||
public class TagExtractor
|
public class TagExtractor
|
||||||
{
|
|
||||||
// TODO IMM HI: do the really need to be internal?
|
|
||||||
internal string Target;
|
|
||||||
internal int Threshold;
|
|
||||||
internal StringBuilder StringBuilder;
|
|
||||||
internal string Tag;
|
|
||||||
|
|
||||||
#region Constructor(s)
|
|
||||||
|
|
||||||
public TagExtractor(string tag, int threshold)
|
|
||||||
{
|
{
|
||||||
Target = tag;
|
// TODO IMM HI: do the really need to be internal?
|
||||||
Threshold = threshold;
|
internal string Target;
|
||||||
Count = 0;
|
internal int Threshold;
|
||||||
Clear();
|
internal StringBuilder StringBuilder;
|
||||||
}
|
internal string Tag;
|
||||||
|
|
||||||
#endregion
|
#region Constructor(s)
|
||||||
|
|
||||||
#region Public methods
|
public TagExtractor(string tag, int threshold)
|
||||||
|
|
||||||
public void Clear()
|
|
||||||
{
|
|
||||||
StringBuilder = new StringBuilder();
|
|
||||||
Tag = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void SetTag(string tag)
|
|
||||||
{
|
|
||||||
Tag = tag;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void Add(string line)
|
|
||||||
{
|
|
||||||
if (Tag == Target && line != null)
|
|
||||||
{
|
|
||||||
StringBuilder.Append(line);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void CloseTag(LangProfile profile)
|
|
||||||
{
|
|
||||||
if (profile != null && Tag == Target && StringBuilder.Length > Threshold)
|
|
||||||
{
|
|
||||||
var gram = new NGram();
|
|
||||||
|
|
||||||
for (int i = 0; i < StringBuilder.Length; i++)
|
|
||||||
{
|
{
|
||||||
gram.AddChar(StringBuilder[i]);
|
Target = tag;
|
||||||
|
Threshold = threshold;
|
||||||
for (int n = 1; n <= NGram.GramsCount; n++)
|
Count = 0;
|
||||||
{
|
Clear();
|
||||||
profile.Add(gram.Get(n));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Count++;
|
#endregion
|
||||||
}
|
|
||||||
|
|
||||||
Clear();
|
#region Public methods
|
||||||
|
|
||||||
|
public void Clear()
|
||||||
|
{
|
||||||
|
StringBuilder = new StringBuilder();
|
||||||
|
Tag = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void SetTag(string tag)
|
||||||
|
{
|
||||||
|
Tag = tag;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void Add(string line)
|
||||||
|
{
|
||||||
|
if (Tag == Target && line != null)
|
||||||
|
{
|
||||||
|
StringBuilder.Append(line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void CloseTag(LangProfile profile)
|
||||||
|
{
|
||||||
|
if (profile != null && Tag == Target && StringBuilder.Length > Threshold)
|
||||||
|
{
|
||||||
|
var gram = new NGram();
|
||||||
|
|
||||||
|
for (int i = 0; i < StringBuilder.Length; i++)
|
||||||
|
{
|
||||||
|
gram.AddChar(StringBuilder[i]);
|
||||||
|
|
||||||
|
for (int n = 1; n <= NGram.GramsCount; n++)
|
||||||
|
{
|
||||||
|
profile.Add(gram.Get(n));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Count++;
|
||||||
|
}
|
||||||
|
|
||||||
|
Clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
|
||||||
|
#region Properties
|
||||||
|
|
||||||
|
public int Count { get; private set; }
|
||||||
|
|
||||||
|
#endregion
|
||||||
}
|
}
|
||||||
|
|
||||||
#endregion
|
|
||||||
|
|
||||||
#region Properties
|
|
||||||
|
|
||||||
public int Count { get; private set; }
|
|
||||||
|
|
||||||
#endregion
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
using System;
|
using System;
|
||||||
using System.Text;
|
using System.Text;
|
||||||
using MediaBrowser.Model.IO;
|
using MediaBrowser.Model.IO;
|
||||||
using Microsoft.Extensions.Logging;
|
|
||||||
using MediaBrowser.Model.Serialization;
|
using MediaBrowser.Model.Serialization;
|
||||||
using MediaBrowser.Model.Text;
|
using MediaBrowser.Model.Text;
|
||||||
|
using Microsoft.Extensions.Logging;
|
||||||
using NLangDetect.Core;
|
using NLangDetect.Core;
|
||||||
using UniversalDetector;
|
using UniversalDetector;
|
||||||
|
|
||||||
|
|
|
@ -100,7 +100,7 @@ namespace UniversalDetector
|
||||||
this.confidence = 0.0f;
|
this.confidence = 0.0f;
|
||||||
base.Reset();
|
base.Reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
public string Charset => charset;
|
public string Charset => charset;
|
||||||
|
|
||||||
public float Confidence => confidence;
|
public float Confidence => confidence;
|
||||||
|
@ -109,9 +109,9 @@ namespace UniversalDetector
|
||||||
{
|
{
|
||||||
this.charset = charset;
|
this.charset = charset;
|
||||||
this.confidence = confidence;
|
this.confidence = confidence;
|
||||||
// if (Finished != null) {
|
// if (Finished != null) {
|
||||||
// Finished(charset, confidence);
|
// Finished(charset, confidence);
|
||||||
// }
|
// }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -57,27 +57,34 @@ namespace UniversalDetector.Core
|
||||||
int codingState = 0;
|
int codingState = 0;
|
||||||
int max = offset + len;
|
int max = offset + len;
|
||||||
|
|
||||||
for (int i = offset; i < max; i++) {
|
for (int i = offset; i < max; i++)
|
||||||
|
{
|
||||||
codingState = codingSM.NextState(buf[i]);
|
codingState = codingSM.NextState(buf[i]);
|
||||||
if (codingState == SMModel.ERROR) {
|
if (codingState == SMModel.ERROR)
|
||||||
|
{
|
||||||
state = ProbingState.NotMe;
|
state = ProbingState.NotMe;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (codingState == SMModel.ITSME) {
|
if (codingState == SMModel.ITSME)
|
||||||
|
{
|
||||||
state = ProbingState.FoundIt;
|
state = ProbingState.FoundIt;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (codingState == SMModel.START) {
|
if (codingState == SMModel.START)
|
||||||
|
{
|
||||||
int charLen = codingSM.CurrentCharLen;
|
int charLen = codingSM.CurrentCharLen;
|
||||||
if (i == offset) {
|
if (i == offset)
|
||||||
|
{
|
||||||
lastChar[1] = buf[offset];
|
lastChar[1] = buf[offset];
|
||||||
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
|
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
|
||||||
} else {
|
}
|
||||||
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
|
else
|
||||||
|
{
|
||||||
|
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
lastChar[0] = buf[max-1];
|
lastChar[0] = buf[max - 1];
|
||||||
|
|
||||||
if (state == ProbingState.Detecting)
|
if (state == ProbingState.Detecting)
|
||||||
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||||
|
|
|
@ -40,20 +40,20 @@ namespace UniversalDetector.Core
|
||||||
{
|
{
|
||||||
public class BitPackage
|
public class BitPackage
|
||||||
{
|
{
|
||||||
public static int INDEX_SHIFT_4BITS = 3;
|
public static int INDEX_SHIFT_4BITS = 3;
|
||||||
public static int INDEX_SHIFT_8BITS = 2;
|
public static int INDEX_SHIFT_8BITS = 2;
|
||||||
public static int INDEX_SHIFT_16BITS = 1;
|
public static int INDEX_SHIFT_16BITS = 1;
|
||||||
|
|
||||||
public static int SHIFT_MASK_4BITS = 7;
|
public static int SHIFT_MASK_4BITS = 7;
|
||||||
public static int SHIFT_MASK_8BITS = 3;
|
public static int SHIFT_MASK_8BITS = 3;
|
||||||
public static int SHIFT_MASK_16BITS = 1;
|
public static int SHIFT_MASK_16BITS = 1;
|
||||||
|
|
||||||
public static int BIT_SHIFT_4BITS = 2;
|
public static int BIT_SHIFT_4BITS = 2;
|
||||||
public static int BIT_SHIFT_8BITS = 3;
|
public static int BIT_SHIFT_8BITS = 3;
|
||||||
public static int BIT_SHIFT_16BITS = 4;
|
public static int BIT_SHIFT_16BITS = 4;
|
||||||
|
|
||||||
public static int UNIT_MASK_4BITS = 0x0000000F;
|
public static int UNIT_MASK_4BITS = 0x0000000F;
|
||||||
public static int UNIT_MASK_8BITS = 0x000000FF;
|
public static int UNIT_MASK_8BITS = 0x000000FF;
|
||||||
public static int UNIT_MASK_16BITS = 0x0000FFFF;
|
public static int UNIT_MASK_16BITS = 0x0000FFFF;
|
||||||
|
|
||||||
private int indexShift;
|
private int indexShift;
|
||||||
|
@ -94,5 +94,5 @@ namespace UniversalDetector.Core
|
||||||
return (data[i >> indexShift] >>
|
return (data[i >> indexShift] >>
|
||||||
((i & shiftMask) << bitShift)) & unitMask;
|
((i & shiftMask) << bitShift)) & unitMask;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -97,9 +97,11 @@ namespace UniversalDetector.Core
|
||||||
{
|
{
|
||||||
//we only care about 2-bytes character in our distribution analysis
|
//we only care about 2-bytes character in our distribution analysis
|
||||||
int order = (charLen == 2) ? GetOrder(buf, offset) : -1;
|
int order = (charLen == 2) ? GetOrder(buf, offset) : -1;
|
||||||
if (order >= 0) {
|
if (order >= 0)
|
||||||
|
{
|
||||||
totalChars++;
|
totalChars++;
|
||||||
if (order < tableSize) { // order is valid
|
if (order < tableSize)
|
||||||
|
{ // order is valid
|
||||||
if (512 > charToFreqOrder[order])
|
if (512 > charToFreqOrder[order])
|
||||||
freqChars++;
|
freqChars++;
|
||||||
}
|
}
|
||||||
|
@ -124,7 +126,8 @@ namespace UniversalDetector.Core
|
||||||
// negative answer
|
// negative answer
|
||||||
if (totalChars <= 0 || freqChars <= MINIMUM_DATA_THRESHOLD)
|
if (totalChars <= 0 || freqChars <= MINIMUM_DATA_THRESHOLD)
|
||||||
return SURE_NO;
|
return SURE_NO;
|
||||||
if (totalChars != freqChars) {
|
if (totalChars != freqChars)
|
||||||
|
{
|
||||||
float r = freqChars / ((totalChars - freqChars) * typicalDistributionRatio);
|
float r = freqChars / ((totalChars - freqChars) * typicalDistributionRatio);
|
||||||
if (r < SURE_YES)
|
if (r < SURE_YES)
|
||||||
return r;
|
return r;
|
||||||
|
@ -610,8 +613,8 @@ namespace UniversalDetector.Core
|
||||||
/// <returns></returns>
|
/// <returns></returns>
|
||||||
public override int GetOrder(byte[] buf, int offset)
|
public override int GetOrder(byte[] buf, int offset)
|
||||||
{
|
{
|
||||||
if (buf[offset] >= 0xB0 && buf[offset+1] >= 0xA1)
|
if (buf[offset] >= 0xB0 && buf[offset + 1] >= 0xA1)
|
||||||
return 94 * (buf[offset] - 0xb0) + buf[offset+1] - 0xA1;
|
return 94 * (buf[offset] - 0xb0) + buf[offset + 1] - 0xA1;
|
||||||
else
|
else
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
@ -1040,7 +1043,7 @@ namespace UniversalDetector.Core
|
||||||
public override int GetOrder(byte[] buf, int offset)
|
public override int GetOrder(byte[] buf, int offset)
|
||||||
{
|
{
|
||||||
if (buf[offset] >= 0xC4)
|
if (buf[offset] >= 0xC4)
|
||||||
return 94 * (buf[offset] - 0xC4) + buf[offset+1] - 0xA1;
|
return 94 * (buf[offset] - 0xC4) + buf[offset + 1] - 0xA1;
|
||||||
else
|
else
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
@ -1048,7 +1051,7 @@ namespace UniversalDetector.Core
|
||||||
|
|
||||||
public class EUCKRDistributionAnalyser : CharDistributionAnalyser
|
public class EUCKRDistributionAnalyser : CharDistributionAnalyser
|
||||||
{
|
{
|
||||||
// Sampling from about 20M text materials include literature and computer technology
|
// Sampling from about 20M text materials include literature and computer technology
|
||||||
/*
|
/*
|
||||||
* 128 --> 0.79
|
* 128 --> 0.79
|
||||||
* 256 --> 0.92
|
* 256 --> 0.92
|
||||||
|
@ -1634,7 +1637,7 @@ namespace UniversalDetector.Core
|
||||||
public override int GetOrder(byte[] buf, int offset)
|
public override int GetOrder(byte[] buf, int offset)
|
||||||
{
|
{
|
||||||
if (buf[offset] >= 0xB0)
|
if (buf[offset] >= 0xB0)
|
||||||
return 94 * (buf[offset] - 0xB0) + buf[offset+1] - 0xA1;
|
return 94 * (buf[offset] - 0xB0) + buf[offset + 1] - 0xA1;
|
||||||
else
|
else
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
@ -2559,12 +2562,15 @@ namespace UniversalDetector.Core
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public override int GetOrder(byte[] buf, int offset)
|
public override int GetOrder(byte[] buf, int offset)
|
||||||
{
|
{
|
||||||
if (buf[offset] >= 0xA4) {
|
if (buf[offset] >= 0xA4)
|
||||||
if (buf[offset+1] >= 0xA1)
|
{
|
||||||
return 157 * (buf[offset] - 0xA4) + buf[offset+1] - 0xA1 + 63;
|
if (buf[offset + 1] >= 0xA1)
|
||||||
|
return 157 * (buf[offset] - 0xA4) + buf[offset + 1] - 0xA1 + 63;
|
||||||
else
|
else
|
||||||
return 157 * (buf[offset] - 0xA4) + buf[offset+1] - 0x40;
|
return 157 * (buf[offset] - 0xA4) + buf[offset + 1] - 0x40;
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3140,9 +3146,9 @@ namespace UniversalDetector.Core
|
||||||
order = 188 * (buf[offset] - 0xE0 + 31);
|
order = 188 * (buf[offset] - 0xE0 + 31);
|
||||||
else
|
else
|
||||||
return -1;
|
return -1;
|
||||||
order += buf[offset+1] - 0x40;
|
order += buf[offset + 1] - 0x40;
|
||||||
|
|
||||||
if (buf[offset+1] > 0x7F)
|
if (buf[offset + 1] > 0x7F)
|
||||||
order--;
|
order--;
|
||||||
return order;
|
return order;
|
||||||
}
|
}
|
||||||
|
@ -3162,7 +3168,7 @@ namespace UniversalDetector.Core
|
||||||
public override int GetOrder(byte[] buf, int offset)
|
public override int GetOrder(byte[] buf, int offset)
|
||||||
{
|
{
|
||||||
if (buf[offset] >= 0xA0)
|
if (buf[offset] >= 0xA0)
|
||||||
return 94 * (buf[offset] - 0xA1) + buf[offset+1] - 0xA1;
|
return 94 * (buf[offset] - 0xA1) + buf[offset + 1] - 0xA1;
|
||||||
else
|
else
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
|
@ -40,7 +40,8 @@ using System.IO;
|
||||||
|
|
||||||
namespace UniversalDetector.Core
|
namespace UniversalDetector.Core
|
||||||
{
|
{
|
||||||
public enum ProbingState {
|
public enum ProbingState
|
||||||
|
{
|
||||||
Detecting = 0, // no sure answer yet, but caller can ask for confidence
|
Detecting = 0, // no sure answer yet, but caller can ask for confidence
|
||||||
FoundIt = 1, // positive answer
|
FoundIt = 1, // positive answer
|
||||||
NotMe = 2 // negative answer
|
NotMe = 2 // negative answer
|
||||||
|
@ -107,21 +108,27 @@ namespace UniversalDetector.Core
|
||||||
{
|
{
|
||||||
byte[] result = null;
|
byte[] result = null;
|
||||||
|
|
||||||
using (MemoryStream ms = new MemoryStream(buf.Length)) {
|
using (MemoryStream ms = new MemoryStream(buf.Length))
|
||||||
|
{
|
||||||
|
|
||||||
bool meetMSB = false;
|
bool meetMSB = false;
|
||||||
int max = offset + len;
|
int max = offset + len;
|
||||||
int prev = offset;
|
int prev = offset;
|
||||||
int cur = offset;
|
int cur = offset;
|
||||||
|
|
||||||
while (cur < max) {
|
while (cur < max)
|
||||||
|
{
|
||||||
byte b = buf[cur];
|
byte b = buf[cur];
|
||||||
|
|
||||||
if ((b & 0x80) != 0) {
|
if ((b & 0x80) != 0)
|
||||||
|
{
|
||||||
meetMSB = true;
|
meetMSB = true;
|
||||||
} else if (b < CAPITAL_A || (b > CAPITAL_Z && b < SMALL_A)
|
}
|
||||||
|| b > SMALL_Z) {
|
else if (b < CAPITAL_A || (b > CAPITAL_Z && b < SMALL_A)
|
||||||
if (meetMSB && cur > prev) {
|
|| b > SMALL_Z)
|
||||||
|
{
|
||||||
|
if (meetMSB && cur > prev)
|
||||||
|
{
|
||||||
ms.Write(buf, prev, cur - prev);
|
ms.Write(buf, prev, cur - prev);
|
||||||
ms.WriteByte(SPACE);
|
ms.WriteByte(SPACE);
|
||||||
meetMSB = false;
|
meetMSB = false;
|
||||||
|
@ -149,14 +156,16 @@ namespace UniversalDetector.Core
|
||||||
{
|
{
|
||||||
byte[] result = null;
|
byte[] result = null;
|
||||||
|
|
||||||
using (MemoryStream ms = new MemoryStream(buf.Length)) {
|
using (MemoryStream ms = new MemoryStream(buf.Length))
|
||||||
|
{
|
||||||
|
|
||||||
bool inTag = false;
|
bool inTag = false;
|
||||||
int max = offset + len;
|
int max = offset + len;
|
||||||
int prev = offset;
|
int prev = offset;
|
||||||
int cur = offset;
|
int cur = offset;
|
||||||
|
|
||||||
while (cur < max) {
|
while (cur < max)
|
||||||
|
{
|
||||||
|
|
||||||
byte b = buf[cur];
|
byte b = buf[cur];
|
||||||
|
|
||||||
|
@ -167,8 +176,10 @@ namespace UniversalDetector.Core
|
||||||
|
|
||||||
// it's ascii, but it's not a letter
|
// it's ascii, but it's not a letter
|
||||||
if ((b & 0x80) == 0 && (b < CAPITAL_A || b > SMALL_Z
|
if ((b & 0x80) == 0 && (b < CAPITAL_A || b > SMALL_Z
|
||||||
|| (b > CAPITAL_Z && b < SMALL_A))) {
|
|| (b > CAPITAL_Z && b < SMALL_A)))
|
||||||
if (cur > prev && !inTag) {
|
{
|
||||||
|
if (cur > prev && !inTag)
|
||||||
|
{
|
||||||
ms.Write(buf, prev, cur - prev);
|
ms.Write(buf, prev, cur - prev);
|
||||||
ms.WriteByte(SPACE);
|
ms.WriteByte(SPACE);
|
||||||
}
|
}
|
||||||
|
|
|
@ -60,7 +60,8 @@ namespace UniversalDetector.Core
|
||||||
// for each byte we get its class, if it is first byte,
|
// for each byte we get its class, if it is first byte,
|
||||||
// we also get byte length
|
// we also get byte length
|
||||||
int byteCls = model.GetClass(b);
|
int byteCls = model.GetClass(b);
|
||||||
if (currentState == SMModel.START) {
|
if (currentState == SMModel.START)
|
||||||
|
{
|
||||||
currentBytePos = 0;
|
currentBytePos = 0;
|
||||||
currentCharLen = model.charLenTable[byteCls];
|
currentCharLen = model.charLenTable[byteCls];
|
||||||
}
|
}
|
||||||
|
|
|
@ -62,29 +62,36 @@ namespace UniversalDetector.Core
|
||||||
int codingState;
|
int codingState;
|
||||||
int max = offset + len;
|
int max = offset + len;
|
||||||
|
|
||||||
for (int i = offset; i < max; i++) {
|
for (int i = offset; i < max; i++)
|
||||||
|
{
|
||||||
codingState = codingSM.NextState(buf[i]);
|
codingState = codingSM.NextState(buf[i]);
|
||||||
if (codingState == SMModel.ERROR) {
|
if (codingState == SMModel.ERROR)
|
||||||
|
{
|
||||||
state = ProbingState.NotMe;
|
state = ProbingState.NotMe;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (codingState == SMModel.ITSME) {
|
if (codingState == SMModel.ITSME)
|
||||||
|
{
|
||||||
state = ProbingState.FoundIt;
|
state = ProbingState.FoundIt;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (codingState == SMModel.START) {
|
if (codingState == SMModel.START)
|
||||||
|
{
|
||||||
int charLen = codingSM.CurrentCharLen;
|
int charLen = codingSM.CurrentCharLen;
|
||||||
if (i == offset) {
|
if (i == offset)
|
||||||
|
{
|
||||||
lastChar[1] = buf[offset];
|
lastChar[1] = buf[offset];
|
||||||
contextAnalyser.HandleOneChar(lastChar, 0, charLen);
|
contextAnalyser.HandleOneChar(lastChar, 0, charLen);
|
||||||
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
|
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
|
||||||
} else {
|
}
|
||||||
contextAnalyser.HandleOneChar(buf, i-1, charLen);
|
else
|
||||||
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
|
{
|
||||||
|
contextAnalyser.HandleOneChar(buf, i - 1, charLen);
|
||||||
|
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
lastChar[0] = buf[max-1];
|
lastChar[0] = buf[max - 1];
|
||||||
if (state == ProbingState.Detecting)
|
if (state == ProbingState.Detecting)
|
||||||
if (contextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
if (contextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||||
state = ProbingState.FoundIt;
|
state = ProbingState.FoundIt;
|
||||||
|
|
|
@ -60,27 +60,34 @@ namespace UniversalDetector.Core
|
||||||
int codingState;
|
int codingState;
|
||||||
int max = offset + len;
|
int max = offset + len;
|
||||||
|
|
||||||
for (int i = offset; i < max; i++) {
|
for (int i = offset; i < max; i++)
|
||||||
|
{
|
||||||
codingState = codingSM.NextState(buf[i]);
|
codingState = codingSM.NextState(buf[i]);
|
||||||
if (codingState == SMModel.ERROR) {
|
if (codingState == SMModel.ERROR)
|
||||||
|
{
|
||||||
state = ProbingState.NotMe;
|
state = ProbingState.NotMe;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (codingState == SMModel.ITSME) {
|
if (codingState == SMModel.ITSME)
|
||||||
|
{
|
||||||
state = ProbingState.FoundIt;
|
state = ProbingState.FoundIt;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (codingState == SMModel.START) {
|
if (codingState == SMModel.START)
|
||||||
|
{
|
||||||
int charLen = codingSM.CurrentCharLen;
|
int charLen = codingSM.CurrentCharLen;
|
||||||
if (i == offset) {
|
if (i == offset)
|
||||||
|
{
|
||||||
lastChar[1] = buf[offset];
|
lastChar[1] = buf[offset];
|
||||||
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
|
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
|
||||||
} else {
|
}
|
||||||
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
|
else
|
||||||
|
{
|
||||||
|
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
lastChar[0] = buf[max-1];
|
lastChar[0] = buf[max - 1];
|
||||||
|
|
||||||
if (state == ProbingState.Detecting)
|
if (state == ProbingState.Detecting)
|
||||||
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||||
|
|
|
@ -56,27 +56,34 @@ namespace UniversalDetector.Core
|
||||||
int codingState;
|
int codingState;
|
||||||
int max = offset + len;
|
int max = offset + len;
|
||||||
|
|
||||||
for (int i = 0; i < max; i++) {
|
for (int i = 0; i < max; i++)
|
||||||
|
{
|
||||||
codingState = codingSM.NextState(buf[i]);
|
codingState = codingSM.NextState(buf[i]);
|
||||||
if (codingState == SMModel.ERROR) {
|
if (codingState == SMModel.ERROR)
|
||||||
|
{
|
||||||
state = ProbingState.NotMe;
|
state = ProbingState.NotMe;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (codingState == SMModel.ITSME) {
|
if (codingState == SMModel.ITSME)
|
||||||
|
{
|
||||||
state = ProbingState.FoundIt;
|
state = ProbingState.FoundIt;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (codingState == SMModel.START) {
|
if (codingState == SMModel.START)
|
||||||
|
{
|
||||||
int charLen = codingSM.CurrentCharLen;
|
int charLen = codingSM.CurrentCharLen;
|
||||||
if (i == offset) {
|
if (i == offset)
|
||||||
|
{
|
||||||
lastChar[1] = buf[offset];
|
lastChar[1] = buf[offset];
|
||||||
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
|
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
|
||||||
} else {
|
}
|
||||||
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
|
else
|
||||||
|
{
|
||||||
|
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
lastChar[0] = buf[max-1];
|
lastChar[0] = buf[max - 1];
|
||||||
|
|
||||||
if (state == ProbingState.Detecting)
|
if (state == ProbingState.Detecting)
|
||||||
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||||
|
|
|
@ -67,22 +67,30 @@ namespace UniversalDetector.Core
|
||||||
{
|
{
|
||||||
int max = offset + len;
|
int max = offset + len;
|
||||||
|
|
||||||
for (int i = offset; i < max && state == ProbingState.Detecting; i++) {
|
for (int i = offset; i < max && state == ProbingState.Detecting; i++)
|
||||||
for (int j = activeSM - 1; j >= 0; j--) {
|
{
|
||||||
|
for (int j = activeSM - 1; j >= 0; j--)
|
||||||
|
{
|
||||||
// byte is feed to all active state machine
|
// byte is feed to all active state machine
|
||||||
int codingState = codingSM[j].NextState(buf[i]);
|
int codingState = codingSM[j].NextState(buf[i]);
|
||||||
if (codingState == SMModel.ERROR) {
|
if (codingState == SMModel.ERROR)
|
||||||
|
{
|
||||||
// got negative answer for this state machine, make it inactive
|
// got negative answer for this state machine, make it inactive
|
||||||
activeSM--;
|
activeSM--;
|
||||||
if (activeSM == 0) {
|
if (activeSM == 0)
|
||||||
|
{
|
||||||
state = ProbingState.NotMe;
|
state = ProbingState.NotMe;
|
||||||
return state;
|
return state;
|
||||||
} else if (j != activeSM) {
|
}
|
||||||
|
else if (j != activeSM)
|
||||||
|
{
|
||||||
CodingStateMachine t = codingSM[activeSM];
|
CodingStateMachine t = codingSM[activeSM];
|
||||||
codingSM[activeSM] = codingSM[j];
|
codingSM[activeSM] = codingSM[j];
|
||||||
codingSM[j] = t;
|
codingSM[j] = t;
|
||||||
}
|
}
|
||||||
} else if (codingState == SMModel.ITSME) {
|
}
|
||||||
|
else if (codingState == SMModel.ITSME)
|
||||||
|
{
|
||||||
state = ProbingState.FoundIt;
|
state = ProbingState.FoundIt;
|
||||||
detectedCharset = codingSM[j].ModelName;
|
detectedCharset = codingSM[j].ModelName;
|
||||||
return state;
|
return state;
|
||||||
|
|
|
@ -87,7 +87,7 @@ namespace UniversalDetector.Core
|
||||||
BitPackage.Pack4bits( 4, ITSME, START, START, START, START, START, START) //28-2f
|
BitPackage.Pack4bits( 4, ITSME, START, START, START, START, START, START) //28-2f
|
||||||
};
|
};
|
||||||
|
|
||||||
private readonly static int[] HZCharLenTable = {0, 0, 0, 0, 0, 0};
|
private readonly static int[] HZCharLenTable = { 0, 0, 0, 0, 0, 0 };
|
||||||
|
|
||||||
public HZSMModel() : base(
|
public HZSMModel() : base(
|
||||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||||
|
@ -153,7 +153,7 @@ namespace UniversalDetector.Core
|
||||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ERROR,START) //38-3f
|
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ERROR,START) //38-3f
|
||||||
};
|
};
|
||||||
|
|
||||||
private readonly static int[] ISO2022CNCharLenTable = {0, 0, 0, 0, 0, 0, 0, 0, 0};
|
private readonly static int[] ISO2022CNCharLenTable = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||||
|
|
||||||
public ISO2022CNSMModel() : base(
|
public ISO2022CNSMModel() : base(
|
||||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||||
|
@ -220,7 +220,7 @@ namespace UniversalDetector.Core
|
||||||
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ITSME,ERROR,START,START) //40-47
|
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ITSME,ERROR,START,START) //40-47
|
||||||
};
|
};
|
||||||
|
|
||||||
private readonly static int[] ISO2022JPCharLenTable = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
private readonly static int[] ISO2022JPCharLenTable = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||||
|
|
||||||
public ISO2022JPSMModel() : base(
|
public ISO2022JPSMModel() : base(
|
||||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||||
|
@ -284,7 +284,7 @@ namespace UniversalDetector.Core
|
||||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,START,START,START,START) //20-27
|
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,START,START,START,START) //20-27
|
||||||
};
|
};
|
||||||
|
|
||||||
private readonly static int[] ISO2022KRCharLenTable = {0, 0, 0, 0, 0, 0};
|
private readonly static int[] ISO2022KRCharLenTable = { 0, 0, 0, 0, 0, 0 };
|
||||||
|
|
||||||
public ISO2022KRSMModel() : base(
|
public ISO2022KRSMModel() : base(
|
||||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||||
|
|
|
@ -64,30 +64,38 @@ namespace UniversalDetector.Core
|
||||||
int codingState = SMModel.START;
|
int codingState = SMModel.START;
|
||||||
int max = offset + len;
|
int max = offset + len;
|
||||||
|
|
||||||
for (int i = offset; i < max; i++) {
|
for (int i = offset; i < max; i++)
|
||||||
|
{
|
||||||
codingState = codingSM.NextState(buf[i]);
|
codingState = codingSM.NextState(buf[i]);
|
||||||
if (codingState == SMModel.ERROR) {
|
if (codingState == SMModel.ERROR)
|
||||||
|
{
|
||||||
state = ProbingState.NotMe;
|
state = ProbingState.NotMe;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (codingState == SMModel.ITSME) {
|
if (codingState == SMModel.ITSME)
|
||||||
|
{
|
||||||
state = ProbingState.FoundIt;
|
state = ProbingState.FoundIt;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (codingState == SMModel.START) {
|
if (codingState == SMModel.START)
|
||||||
|
{
|
||||||
int charLen = codingSM.CurrentCharLen;
|
int charLen = codingSM.CurrentCharLen;
|
||||||
if (i == offset) {
|
if (i == offset)
|
||||||
|
{
|
||||||
lastChar[1] = buf[offset];
|
lastChar[1] = buf[offset];
|
||||||
analyser.HandleOneChar(lastChar, 0, charLen);
|
analyser.HandleOneChar(lastChar, 0, charLen);
|
||||||
} else {
|
}
|
||||||
analyser.HandleOneChar(buf, i-1, charLen);
|
else
|
||||||
|
{
|
||||||
|
analyser.HandleOneChar(buf, i - 1, charLen);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
lastChar[0] = buf[max-1];
|
lastChar[0] = buf[max - 1];
|
||||||
|
|
||||||
if (state == ProbingState.Detecting) {
|
if (state == ProbingState.Detecting)
|
||||||
|
{
|
||||||
if (analyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
if (analyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||||
state = ProbingState.FoundIt;
|
state = ProbingState.FoundIt;
|
||||||
}
|
}
|
||||||
|
|
|
@ -36,7 +36,6 @@
|
||||||
*
|
*
|
||||||
* ***** END LICENSE BLOCK ***** */
|
* ***** END LICENSE BLOCK ***** */
|
||||||
|
|
||||||
using System;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* General ideas of the Hebrew charset recognition
|
* General ideas of the Hebrew charset recognition
|
||||||
|
@ -144,11 +143,11 @@ namespace UniversalDetector.Core
|
||||||
public class HebrewProber : CharsetProber
|
public class HebrewProber : CharsetProber
|
||||||
{
|
{
|
||||||
// windows-1255 / ISO-8859-8 code points of interest
|
// windows-1255 / ISO-8859-8 code points of interest
|
||||||
private const byte FINAL_KAF = 0xEA;
|
private const byte FINAL_KAF = 0xEA;
|
||||||
private const byte NORMAL_KAF = 0xEB;
|
private const byte NORMAL_KAF = 0xEB;
|
||||||
private const byte FINAL_MEM = 0xED;
|
private const byte FINAL_MEM = 0xED;
|
||||||
private const byte NORMAL_MEM = 0xEE;
|
private const byte NORMAL_MEM = 0xEE;
|
||||||
private const byte FINAL_NUN = 0xEF;
|
private const byte FINAL_NUN = 0xEF;
|
||||||
private const byte NORMAL_NUN = 0xF0;
|
private const byte NORMAL_NUN = 0xF0;
|
||||||
private const byte FINAL_PE = 0xF3;
|
private const byte FINAL_PE = 0xF3;
|
||||||
private const byte NORMAL_PE = 0xF4;
|
private const byte NORMAL_PE = 0xF4;
|
||||||
|
@ -217,14 +216,17 @@ namespace UniversalDetector.Core
|
||||||
|
|
||||||
int max = offset + len;
|
int max = offset + len;
|
||||||
|
|
||||||
for (int i = offset; i < max; i++) {
|
for (int i = offset; i < max; i++)
|
||||||
|
{
|
||||||
|
|
||||||
byte b = buf[i];
|
byte b = buf[i];
|
||||||
|
|
||||||
// a word just ended
|
// a word just ended
|
||||||
if (b == 0x20) {
|
if (b == 0x20)
|
||||||
|
{
|
||||||
// *(curPtr-2) was not a space so prev is not a 1 letter word
|
// *(curPtr-2) was not a space so prev is not a 1 letter word
|
||||||
if (beforePrev != 0x20) {
|
if (beforePrev != 0x20)
|
||||||
|
{
|
||||||
// case (1) [-2:not space][-1:final letter][cur:space]
|
// case (1) [-2:not space][-1:final letter][cur:space]
|
||||||
if (IsFinal(prev))
|
if (IsFinal(prev))
|
||||||
finalCharLogicalScore++;
|
finalCharLogicalScore++;
|
||||||
|
@ -233,7 +235,9 @@ namespace UniversalDetector.Core
|
||||||
finalCharVisualScore++;
|
finalCharVisualScore++;
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
// case (3) [-2:space][-1:final letter][cur:not space]
|
// case (3) [-2:space][-1:final letter][cur:not space]
|
||||||
if ((beforePrev == 0x20) && (IsFinal(prev)) && (b != ' '))
|
if ((beforePrev == 0x20) && (IsFinal(prev)) && (b != ' '))
|
||||||
++finalCharVisualScore;
|
++finalCharVisualScore;
|
||||||
|
|
|
@ -160,7 +160,7 @@ namespace UniversalDetector.Core
|
||||||
{
|
{
|
||||||
// This is just one way to calculate confidence. It works well for me.
|
// This is just one way to calculate confidence. It works well for me.
|
||||||
if (totalRel > MINIMUM_DATA_THRESHOLD)
|
if (totalRel > MINIMUM_DATA_THRESHOLD)
|
||||||
return ((float)(totalRel - relSample[0]))/totalRel;
|
return ((float)(totalRel - relSample[0])) / totalRel;
|
||||||
else
|
else
|
||||||
return DONT_KNOW;
|
return DONT_KNOW;
|
||||||
}
|
}
|
||||||
|
@ -181,22 +181,28 @@ namespace UniversalDetector.Core
|
||||||
// to record those bytes as well and analyse the character once it
|
// to record those bytes as well and analyse the character once it
|
||||||
// is complete, but since a character will not make much difference,
|
// is complete, but since a character will not make much difference,
|
||||||
// skipping it will simplify our logic and improve performance.
|
// skipping it will simplify our logic and improve performance.
|
||||||
for (int i = needToSkipCharNum+offset; i < max; ) {
|
for (int i = needToSkipCharNum + offset; i < max;)
|
||||||
|
{
|
||||||
int order = GetOrder(buf, i, out charLen);
|
int order = GetOrder(buf, i, out charLen);
|
||||||
i += charLen;
|
i += charLen;
|
||||||
if (i > max) {
|
if (i > max)
|
||||||
|
{
|
||||||
needToSkipCharNum = i - max;
|
needToSkipCharNum = i - max;
|
||||||
lastCharOrder = -1;
|
lastCharOrder = -1;
|
||||||
} else {
|
}
|
||||||
if (order != -1 && lastCharOrder != -1) {
|
else
|
||||||
totalRel ++;
|
{
|
||||||
if (totalRel > MAX_REL_THRESHOLD) {
|
if (order != -1 && lastCharOrder != -1)
|
||||||
|
{
|
||||||
|
totalRel++;
|
||||||
|
if (totalRel > MAX_REL_THRESHOLD)
|
||||||
|
{
|
||||||
done = true;
|
done = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
relSample[jp2CharContext[lastCharOrder, order]]++;
|
relSample[jp2CharContext[lastCharOrder, order]]++;
|
||||||
}
|
}
|
||||||
lastCharOrder = order;
|
lastCharOrder = order;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -210,7 +216,8 @@ namespace UniversalDetector.Core
|
||||||
|
|
||||||
// Only 2-bytes characters are of our interest
|
// Only 2-bytes characters are of our interest
|
||||||
int order = (charLen == 2) ? GetOrder(buf, offset) : -1;
|
int order = (charLen == 2) ? GetOrder(buf, offset) : -1;
|
||||||
if (order != -1 && lastCharOrder != -1) {
|
if (order != -1 && lastCharOrder != -1)
|
||||||
|
{
|
||||||
totalRel++;
|
totalRel++;
|
||||||
// count this sequence to its category counter
|
// count this sequence to its category counter
|
||||||
relSample[jp2CharContext[lastCharOrder, order]]++;
|
relSample[jp2CharContext[lastCharOrder, order]]++;
|
||||||
|
@ -221,7 +228,8 @@ namespace UniversalDetector.Core
|
||||||
public void Reset()
|
public void Reset()
|
||||||
{
|
{
|
||||||
totalRel = 0;
|
totalRel = 0;
|
||||||
for (int i = 0; i < CATEGORIES_NUM; i++) {
|
for (int i = 0; i < CATEGORIES_NUM; i++)
|
||||||
|
{
|
||||||
relSample[i] = 0;
|
relSample[i] = 0;
|
||||||
needToSkipCharNum = 0;
|
needToSkipCharNum = 0;
|
||||||
lastCharOrder = -1;
|
lastCharOrder = -1;
|
||||||
|
@ -254,8 +262,9 @@ namespace UniversalDetector.Core
|
||||||
charLen = 1;
|
charLen = 1;
|
||||||
|
|
||||||
// return its order if it is hiragana
|
// return its order if it is hiragana
|
||||||
if (buf[offset] == HIRAGANA_FIRST_BYTE) {
|
if (buf[offset] == HIRAGANA_FIRST_BYTE)
|
||||||
byte low = buf[offset+1];
|
{
|
||||||
|
byte low = buf[offset + 1];
|
||||||
if (low >= 0x9F && low <= 0xF1)
|
if (low >= 0x9F && low <= 0xF1)
|
||||||
return low - 0x9F;
|
return low - 0x9F;
|
||||||
}
|
}
|
||||||
|
@ -265,8 +274,9 @@ namespace UniversalDetector.Core
|
||||||
protected override int GetOrder(byte[] buf, int offset)
|
protected override int GetOrder(byte[] buf, int offset)
|
||||||
{
|
{
|
||||||
// We are only interested in Hiragana
|
// We are only interested in Hiragana
|
||||||
if (buf[offset] == HIRAGANA_FIRST_BYTE) {
|
if (buf[offset] == HIRAGANA_FIRST_BYTE)
|
||||||
byte low = buf[offset+1];
|
{
|
||||||
|
byte low = buf[offset + 1];
|
||||||
if (low >= 0x9F && low <= 0xF1)
|
if (low >= 0x9F && low <= 0xF1)
|
||||||
return low - 0x9F;
|
return low - 0x9F;
|
||||||
}
|
}
|
||||||
|
@ -292,8 +302,9 @@ namespace UniversalDetector.Core
|
||||||
charLen = 1;
|
charLen = 1;
|
||||||
|
|
||||||
// return its order if it is hiragana
|
// return its order if it is hiragana
|
||||||
if (high == HIRAGANA_FIRST_BYTE) {
|
if (high == HIRAGANA_FIRST_BYTE)
|
||||||
byte low = buf[offset+1];
|
{
|
||||||
|
byte low = buf[offset + 1];
|
||||||
if (low >= 0xA1 && low <= 0xF3)
|
if (low >= 0xA1 && low <= 0xF3)
|
||||||
return low - 0xA1;
|
return low - 0xA1;
|
||||||
}
|
}
|
||||||
|
@ -303,8 +314,9 @@ namespace UniversalDetector.Core
|
||||||
protected override int GetOrder(byte[] buf, int offset)
|
protected override int GetOrder(byte[] buf, int offset)
|
||||||
{
|
{
|
||||||
// We are only interested in Hiragana
|
// We are only interested in Hiragana
|
||||||
if (buf[offset] == HIRAGANA_FIRST_BYTE) {
|
if (buf[offset] == HIRAGANA_FIRST_BYTE)
|
||||||
byte low = buf[offset+1];
|
{
|
||||||
|
byte low = buf[offset + 1];
|
||||||
if (low >= 0xA1 && low <= 0xF3)
|
if (low >= 0xA1 && low <= 0xF3)
|
||||||
return low - 0xA1;
|
return low - 0xA1;
|
||||||
}
|
}
|
||||||
|
|
|
@ -36,7 +36,6 @@
|
||||||
*
|
*
|
||||||
* ***** END LICENSE BLOCK ***** */
|
* ***** END LICENSE BLOCK ***** */
|
||||||
|
|
||||||
using System;
|
|
||||||
|
|
||||||
namespace UniversalDetector.Core
|
namespace UniversalDetector.Core
|
||||||
{
|
{
|
||||||
|
@ -135,12 +134,14 @@ namespace UniversalDetector.Core
|
||||||
byte[] newbuf = FilterWithEnglishLetters(buf, offset, len);
|
byte[] newbuf = FilterWithEnglishLetters(buf, offset, len);
|
||||||
byte charClass, freq;
|
byte charClass, freq;
|
||||||
|
|
||||||
for (int i = 0; i < newbuf.Length; i++) {
|
for (int i = 0; i < newbuf.Length; i++)
|
||||||
|
{
|
||||||
charClass = Latin1_CharToClass[newbuf[i]];
|
charClass = Latin1_CharToClass[newbuf[i]];
|
||||||
freq = Latin1ClassModel[lastCharClass * CLASS_NUM + charClass];
|
freq = Latin1ClassModel[lastCharClass * CLASS_NUM + charClass];
|
||||||
if (freq == 0) {
|
if (freq == 0)
|
||||||
state = ProbingState.NotMe;
|
{
|
||||||
break;
|
state = ProbingState.NotMe;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
freqCounter[freq]++;
|
freqCounter[freq]++;
|
||||||
lastCharClass = charClass;
|
lastCharClass = charClass;
|
||||||
|
@ -155,13 +156,17 @@ namespace UniversalDetector.Core
|
||||||
|
|
||||||
float confidence = 0.0f;
|
float confidence = 0.0f;
|
||||||
int total = 0;
|
int total = 0;
|
||||||
for (int i = 0; i < FREQ_CAT_NUM; i++) {
|
for (int i = 0; i < FREQ_CAT_NUM; i++)
|
||||||
|
{
|
||||||
total += freqCounter[i];
|
total += freqCounter[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
if (total <= 0) {
|
if (total <= 0)
|
||||||
|
{
|
||||||
confidence = 0.0f;
|
confidence = 0.0f;
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
confidence = freqCounter[3] * 1.0f / total;
|
confidence = freqCounter[3] * 1.0f / total;
|
||||||
confidence -= freqCounter[1] * 20.0f / total;
|
confidence -= freqCounter[1] * 20.0f / total;
|
||||||
}
|
}
|
||||||
|
|
|
@ -36,7 +36,6 @@
|
||||||
*
|
*
|
||||||
* ***** END LICENSE BLOCK ***** */
|
* ***** END LICENSE BLOCK ***** */
|
||||||
|
|
||||||
using System;
|
|
||||||
|
|
||||||
namespace UniversalDetector.Core
|
namespace UniversalDetector.Core
|
||||||
{
|
{
|
||||||
|
@ -67,7 +66,8 @@ namespace UniversalDetector.Core
|
||||||
|
|
||||||
public override string GetCharsetName()
|
public override string GetCharsetName()
|
||||||
{
|
{
|
||||||
if (bestGuess == -1) {
|
if (bestGuess == -1)
|
||||||
|
{
|
||||||
GetConfidence();
|
GetConfidence();
|
||||||
if (bestGuess == -1)
|
if (bestGuess == -1)
|
||||||
bestGuess = 0;
|
bestGuess = 0;
|
||||||
|
@ -78,13 +78,17 @@ namespace UniversalDetector.Core
|
||||||
public override void Reset()
|
public override void Reset()
|
||||||
{
|
{
|
||||||
activeNum = 0;
|
activeNum = 0;
|
||||||
for (int i = 0; i < probers.Length; i++) {
|
for (int i = 0; i < probers.Length; i++)
|
||||||
if (probers[i] != null) {
|
{
|
||||||
probers[i].Reset();
|
if (probers[i] != null)
|
||||||
isActive[i] = true;
|
{
|
||||||
++activeNum;
|
probers[i].Reset();
|
||||||
} else {
|
isActive[i] = true;
|
||||||
isActive[i] = false;
|
++activeNum;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
isActive[i] = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
bestGuess = -1;
|
bestGuess = -1;
|
||||||
|
@ -100,13 +104,18 @@ namespace UniversalDetector.Core
|
||||||
bool keepNext = true;
|
bool keepNext = true;
|
||||||
int max = offset + len;
|
int max = offset + len;
|
||||||
|
|
||||||
for (int i = offset; i < max; i++) {
|
for (int i = offset; i < max; i++)
|
||||||
if ((buf[i] & 0x80) != 0) {
|
{
|
||||||
|
if ((buf[i] & 0x80) != 0)
|
||||||
|
{
|
||||||
highbyteBuf[hptr++] = buf[i];
|
highbyteBuf[hptr++] = buf[i];
|
||||||
keepNext = true;
|
keepNext = true;
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
//if previous is highbyte, keep this even it is a ASCII
|
//if previous is highbyte, keep this even it is a ASCII
|
||||||
if (keepNext) {
|
if (keepNext)
|
||||||
|
{
|
||||||
highbyteBuf[hptr++] = buf[i];
|
highbyteBuf[hptr++] = buf[i];
|
||||||
keepNext = false;
|
keepNext = false;
|
||||||
}
|
}
|
||||||
|
@ -115,18 +124,23 @@ namespace UniversalDetector.Core
|
||||||
|
|
||||||
ProbingState st = ProbingState.NotMe;
|
ProbingState st = ProbingState.NotMe;
|
||||||
|
|
||||||
for (int i = 0; i < probers.Length; i++) {
|
for (int i = 0; i < probers.Length; i++)
|
||||||
|
{
|
||||||
if (!isActive[i])
|
if (!isActive[i])
|
||||||
continue;
|
continue;
|
||||||
st = probers[i].HandleData(highbyteBuf, 0, hptr);
|
st = probers[i].HandleData(highbyteBuf, 0, hptr);
|
||||||
if (st == ProbingState.FoundIt) {
|
if (st == ProbingState.FoundIt)
|
||||||
|
{
|
||||||
bestGuess = i;
|
bestGuess = i;
|
||||||
state = ProbingState.FoundIt;
|
state = ProbingState.FoundIt;
|
||||||
break;
|
break;
|
||||||
} else if (st == ProbingState.NotMe) {
|
}
|
||||||
|
else if (st == ProbingState.NotMe)
|
||||||
|
{
|
||||||
isActive[i] = false;
|
isActive[i] = false;
|
||||||
activeNum--;
|
activeNum--;
|
||||||
if (activeNum <= 0) {
|
if (activeNum <= 0)
|
||||||
|
{
|
||||||
state = ProbingState.NotMe;
|
state = ProbingState.NotMe;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -140,16 +154,23 @@ namespace UniversalDetector.Core
|
||||||
float bestConf = 0.0f;
|
float bestConf = 0.0f;
|
||||||
float cf = 0.0f;
|
float cf = 0.0f;
|
||||||
|
|
||||||
if (state == ProbingState.FoundIt) {
|
if (state == ProbingState.FoundIt)
|
||||||
|
{
|
||||||
return 0.99f;
|
return 0.99f;
|
||||||
} else if (state == ProbingState.NotMe) {
|
}
|
||||||
|
else if (state == ProbingState.NotMe)
|
||||||
|
{
|
||||||
return 0.01f;
|
return 0.01f;
|
||||||
} else {
|
}
|
||||||
for (int i = 0; i < PROBERS_NUM; i++) {
|
else
|
||||||
|
{
|
||||||
|
for (int i = 0; i < PROBERS_NUM; i++)
|
||||||
|
{
|
||||||
if (!isActive[i])
|
if (!isActive[i])
|
||||||
continue;
|
continue;
|
||||||
cf = probers[i].GetConfidence();
|
cf = probers[i].GetConfidence();
|
||||||
if (bestConf < cf) {
|
if (bestConf < cf)
|
||||||
|
{
|
||||||
bestConf = cf;
|
bestConf = cf;
|
||||||
bestGuess = i;
|
bestGuess = i;
|
||||||
}
|
}
|
||||||
|
@ -162,10 +183,14 @@ namespace UniversalDetector.Core
|
||||||
{
|
{
|
||||||
float cf;
|
float cf;
|
||||||
GetConfidence();
|
GetConfidence();
|
||||||
for (int i = 0; i < PROBERS_NUM; i++) {
|
for (int i = 0; i < PROBERS_NUM; i++)
|
||||||
if (!isActive[i]) {
|
{
|
||||||
|
if (!isActive[i])
|
||||||
|
{
|
||||||
//Console.WriteLine(" MBCS inactive: {0} (confidence is too low).", ProberName[i]);
|
//Console.WriteLine(" MBCS inactive: {0} (confidence is too low).", ProberName[i]);
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
cf = probers[i].GetConfidence();
|
cf = probers[i].GetConfidence();
|
||||||
//Console.WriteLine(" MBCS {0}: [{1}]", cf, ProberName[i]);
|
//Console.WriteLine(" MBCS {0}: [{1}]", cf, ProberName[i]);
|
||||||
}
|
}
|
||||||
|
|
|
@ -174,7 +174,7 @@ namespace UniversalDetector.Core
|
||||||
// it is used for frequency analysis only, and we are validating
|
// it is used for frequency analysis only, and we are validating
|
||||||
// each code range there as well. So it is safe to set it to be
|
// each code range there as well. So it is safe to set it to be
|
||||||
// 2 here.
|
// 2 here.
|
||||||
private readonly static int[] GB18030CharLenTable = {0, 1, 1, 1, 1, 1, 2};
|
private readonly static int[] GB18030CharLenTable = { 0, 1, 1, 1, 1, 1, 2 };
|
||||||
|
|
||||||
public GB18030SMModel() : base(
|
public GB18030SMModel() : base(
|
||||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||||
|
@ -235,7 +235,7 @@ namespace UniversalDetector.Core
|
||||||
BitPackage.Pack4bits(ERROR,START,START,START,START,START,START,START) //10-17
|
BitPackage.Pack4bits(ERROR,START,START,START,START,START,START,START) //10-17
|
||||||
};
|
};
|
||||||
|
|
||||||
private readonly static int[] BIG5CharLenTable = {0, 1, 1, 2, 0};
|
private readonly static int[] BIG5CharLenTable = { 0, 1, 1, 2, 0 };
|
||||||
|
|
||||||
public BIG5SMModel() : base(
|
public BIG5SMModel() : base(
|
||||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||||
|
|
|
@ -36,7 +36,6 @@
|
||||||
*
|
*
|
||||||
* ***** END LICENSE BLOCK ***** */
|
* ***** END LICENSE BLOCK ***** */
|
||||||
|
|
||||||
using System;
|
|
||||||
|
|
||||||
namespace UniversalDetector.Core
|
namespace UniversalDetector.Core
|
||||||
{
|
{
|
||||||
|
@ -88,19 +87,24 @@ namespace UniversalDetector.Core
|
||||||
if (newBuf.Length == 0)
|
if (newBuf.Length == 0)
|
||||||
return state; // Nothing to see here, move on.
|
return state; // Nothing to see here, move on.
|
||||||
|
|
||||||
for (int i = 0; i < PROBERS_NUM; i++) {
|
for (int i = 0; i < PROBERS_NUM; i++)
|
||||||
|
{
|
||||||
if (!isActive[i])
|
if (!isActive[i])
|
||||||
continue;
|
continue;
|
||||||
st = probers[i].HandleData(newBuf, 0, newBuf.Length);
|
st = probers[i].HandleData(newBuf, 0, newBuf.Length);
|
||||||
|
|
||||||
if (st == ProbingState.FoundIt) {
|
if (st == ProbingState.FoundIt)
|
||||||
|
{
|
||||||
bestGuess = i;
|
bestGuess = i;
|
||||||
state = ProbingState.FoundIt;
|
state = ProbingState.FoundIt;
|
||||||
break;
|
break;
|
||||||
} else if (st == ProbingState.NotMe) {
|
}
|
||||||
|
else if (st == ProbingState.NotMe)
|
||||||
|
{
|
||||||
isActive[i] = false;
|
isActive[i] = false;
|
||||||
activeNum--;
|
activeNum--;
|
||||||
if (activeNum <= 0) {
|
if (activeNum <= 0)
|
||||||
|
{
|
||||||
state = ProbingState.NotMe;
|
state = ProbingState.NotMe;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -112,24 +116,25 @@ namespace UniversalDetector.Core
|
||||||
public override float GetConfidence()
|
public override float GetConfidence()
|
||||||
{
|
{
|
||||||
float bestConf = 0.0f, cf;
|
float bestConf = 0.0f, cf;
|
||||||
switch (state) {
|
switch (state)
|
||||||
case ProbingState.FoundIt:
|
{
|
||||||
return 0.99f; //sure yes
|
case ProbingState.FoundIt:
|
||||||
case ProbingState.NotMe:
|
return 0.99f; //sure yes
|
||||||
return 0.01f; //sure no
|
case ProbingState.NotMe:
|
||||||
default:
|
return 0.01f; //sure no
|
||||||
for (int i = 0; i < PROBERS_NUM; i++)
|
default:
|
||||||
{
|
for (int i = 0; i < PROBERS_NUM; i++)
|
||||||
if (!isActive[i])
|
|
||||||
continue;
|
|
||||||
cf = probers[i].GetConfidence();
|
|
||||||
if (bestConf < cf)
|
|
||||||
{
|
{
|
||||||
bestConf = cf;
|
if (!isActive[i])
|
||||||
bestGuess = i;
|
continue;
|
||||||
|
cf = probers[i].GetConfidence();
|
||||||
|
if (bestConf < cf)
|
||||||
|
{
|
||||||
|
bestConf = cf;
|
||||||
|
bestGuess = i;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
break;
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
return bestConf;
|
return bestConf;
|
||||||
}
|
}
|
||||||
|
@ -137,8 +142,9 @@ namespace UniversalDetector.Core
|
||||||
public override void DumpStatus()
|
public override void DumpStatus()
|
||||||
{
|
{
|
||||||
float cf = GetConfidence();
|
float cf = GetConfidence();
|
||||||
// Console.WriteLine(" SBCS Group Prober --------begin status");
|
// Console.WriteLine(" SBCS Group Prober --------begin status");
|
||||||
for (int i = 0; i < PROBERS_NUM; i++) {
|
for (int i = 0; i < PROBERS_NUM; i++)
|
||||||
|
{
|
||||||
if (isActive[i])
|
if (isActive[i])
|
||||||
probers[i].DumpStatus();
|
probers[i].DumpStatus();
|
||||||
//else
|
//else
|
||||||
|
@ -148,15 +154,19 @@ namespace UniversalDetector.Core
|
||||||
//Console.WriteLine(" SBCS Group found best match [{0}] confidence {1}.", probers[bestGuess].GetCharsetName(), cf);
|
//Console.WriteLine(" SBCS Group found best match [{0}] confidence {1}.", probers[bestGuess].GetCharsetName(), cf);
|
||||||
}
|
}
|
||||||
|
|
||||||
public override void Reset ()
|
public override void Reset()
|
||||||
{
|
{
|
||||||
int activeNum = 0;
|
int activeNum = 0;
|
||||||
for (int i = 0; i < PROBERS_NUM; i++) {
|
for (int i = 0; i < PROBERS_NUM; i++)
|
||||||
if (probers[i] != null) {
|
{
|
||||||
|
if (probers[i] != null)
|
||||||
|
{
|
||||||
probers[i].Reset();
|
probers[i].Reset();
|
||||||
isActive[i] = true;
|
isActive[i] = true;
|
||||||
activeNum++;
|
activeNum++;
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
isActive[i] = false;
|
isActive[i] = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -167,7 +177,8 @@ namespace UniversalDetector.Core
|
||||||
public override string GetCharsetName()
|
public override string GetCharsetName()
|
||||||
{
|
{
|
||||||
//if we have no answer yet
|
//if we have no answer yet
|
||||||
if (bestGuess == -1) {
|
if (bestGuess == -1)
|
||||||
|
{
|
||||||
GetConfidence();
|
GetConfidence();
|
||||||
//no charset seems positive
|
//no charset seems positive
|
||||||
if (bestGuess == -1)
|
if (bestGuess == -1)
|
||||||
|
|
|
@ -36,7 +36,6 @@
|
||||||
*
|
*
|
||||||
* ***** END LICENSE BLOCK ***** */
|
* ***** END LICENSE BLOCK ***** */
|
||||||
|
|
||||||
using System;
|
|
||||||
|
|
||||||
namespace UniversalDetector.Core
|
namespace UniversalDetector.Core
|
||||||
{
|
{
|
||||||
|
@ -49,7 +48,7 @@ namespace UniversalDetector.Core
|
||||||
private const float NEGATIVE_SHORTCUT_THRESHOLD = 0.05f;
|
private const float NEGATIVE_SHORTCUT_THRESHOLD = 0.05f;
|
||||||
private const int SYMBOL_CAT_ORDER = 250;
|
private const int SYMBOL_CAT_ORDER = 250;
|
||||||
private const int NUMBER_OF_SEQ_CAT = 4;
|
private const int NUMBER_OF_SEQ_CAT = 4;
|
||||||
private const int POSITIVE_CAT = NUMBER_OF_SEQ_CAT-1;
|
private const int POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1;
|
||||||
private const int NEGATIVE_CAT = 0;
|
private const int NEGATIVE_CAT = 0;
|
||||||
|
|
||||||
protected SequenceModel model;
|
protected SequenceModel model;
|
||||||
|
@ -89,28 +88,33 @@ namespace UniversalDetector.Core
|
||||||
{
|
{
|
||||||
int max = offset + len;
|
int max = offset + len;
|
||||||
|
|
||||||
for (int i = offset; i < max; i++) {
|
for (int i = offset; i < max; i++)
|
||||||
|
{
|
||||||
byte order = model.GetOrder(buf[i]);
|
byte order = model.GetOrder(buf[i]);
|
||||||
|
|
||||||
if (order < SYMBOL_CAT_ORDER)
|
if (order < SYMBOL_CAT_ORDER)
|
||||||
totalChar++;
|
totalChar++;
|
||||||
|
|
||||||
if (order < SAMPLE_SIZE) {
|
if (order < SAMPLE_SIZE)
|
||||||
|
{
|
||||||
freqChar++;
|
freqChar++;
|
||||||
|
|
||||||
if (lastOrder < SAMPLE_SIZE) {
|
if (lastOrder < SAMPLE_SIZE)
|
||||||
|
{
|
||||||
totalSeqs++;
|
totalSeqs++;
|
||||||
if (!reversed)
|
if (!reversed)
|
||||||
++(seqCounters[model.GetPrecedence(lastOrder*SAMPLE_SIZE+order)]);
|
++(seqCounters[model.GetPrecedence(lastOrder * SAMPLE_SIZE + order)]);
|
||||||
else // reverse the order of the letters in the lookup
|
else // reverse the order of the letters in the lookup
|
||||||
++(seqCounters[model.GetPrecedence(order*SAMPLE_SIZE+lastOrder)]);
|
++(seqCounters[model.GetPrecedence(order * SAMPLE_SIZE + lastOrder)]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
lastOrder = order;
|
lastOrder = order;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (state == ProbingState.Detecting) {
|
if (state == ProbingState.Detecting)
|
||||||
if (totalSeqs > SB_ENOUGH_REL_THRESHOLD) {
|
{
|
||||||
|
if (totalSeqs > SB_ENOUGH_REL_THRESHOLD)
|
||||||
|
{
|
||||||
float cf = GetConfidence();
|
float cf = GetConfidence();
|
||||||
if (cf > POSITIVE_SHORTCUT_THRESHOLD)
|
if (cf > POSITIVE_SHORTCUT_THRESHOLD)
|
||||||
state = ProbingState.FoundIt;
|
state = ProbingState.FoundIt;
|
||||||
|
@ -139,7 +143,8 @@ namespace UniversalDetector.Core
|
||||||
// POSITIVE_APPROACH
|
// POSITIVE_APPROACH
|
||||||
float r = 0.0f;
|
float r = 0.0f;
|
||||||
|
|
||||||
if (totalSeqs > 0) {
|
if (totalSeqs > 0)
|
||||||
|
{
|
||||||
r = 1.0f * seqCounters[POSITIVE_CAT] / totalSeqs / model.TypicalPositiveRatio;
|
r = 1.0f * seqCounters[POSITIVE_CAT] / totalSeqs / model.TypicalPositiveRatio;
|
||||||
r = r * freqChar / totalChar;
|
r = r * freqChar / totalChar;
|
||||||
if (r >= 1.0f)
|
if (r >= 1.0f)
|
||||||
|
|
|
@ -69,29 +69,36 @@ namespace UniversalDetector.Core
|
||||||
int codingState;
|
int codingState;
|
||||||
int max = offset + len;
|
int max = offset + len;
|
||||||
|
|
||||||
for (int i = offset; i < max; i++) {
|
for (int i = offset; i < max; i++)
|
||||||
|
{
|
||||||
codingState = codingSM.NextState(buf[i]);
|
codingState = codingSM.NextState(buf[i]);
|
||||||
if (codingState == SMModel.ERROR) {
|
if (codingState == SMModel.ERROR)
|
||||||
|
{
|
||||||
state = ProbingState.NotMe;
|
state = ProbingState.NotMe;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (codingState == SMModel.ITSME) {
|
if (codingState == SMModel.ITSME)
|
||||||
|
{
|
||||||
state = ProbingState.FoundIt;
|
state = ProbingState.FoundIt;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (codingState == SMModel.START) {
|
if (codingState == SMModel.START)
|
||||||
|
{
|
||||||
int charLen = codingSM.CurrentCharLen;
|
int charLen = codingSM.CurrentCharLen;
|
||||||
if (i == offset) {
|
if (i == offset)
|
||||||
|
{
|
||||||
lastChar[1] = buf[offset];
|
lastChar[1] = buf[offset];
|
||||||
contextAnalyser.HandleOneChar(lastChar, 2-charLen, charLen);
|
contextAnalyser.HandleOneChar(lastChar, 2 - charLen, charLen);
|
||||||
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
|
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
|
||||||
} else {
|
}
|
||||||
contextAnalyser.HandleOneChar(buf, i+1-charLen, charLen);
|
else
|
||||||
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
|
{
|
||||||
|
contextAnalyser.HandleOneChar(buf, i + 1 - charLen, charLen);
|
||||||
|
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
lastChar[0] = buf[max-1];
|
lastChar[0] = buf[max - 1];
|
||||||
if (state == ProbingState.Detecting)
|
if (state == ProbingState.Detecting)
|
||||||
if (contextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
if (contextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||||
state = ProbingState.FoundIt;
|
state = ProbingState.FoundIt;
|
||||||
|
|
|
@ -36,7 +36,6 @@
|
||||||
*
|
*
|
||||||
* ***** END LICENSE BLOCK ***** */
|
* ***** END LICENSE BLOCK ***** */
|
||||||
|
|
||||||
using System;
|
|
||||||
|
|
||||||
namespace UniversalDetector.Core
|
namespace UniversalDetector.Core
|
||||||
{
|
{
|
||||||
|
@ -54,7 +53,7 @@ namespace UniversalDetector.Core
|
||||||
public int[] charLenTable;
|
public int[] charLenTable;
|
||||||
|
|
||||||
private string name;
|
private string name;
|
||||||
|
|
||||||
public string Name => name;
|
public string Name => name;
|
||||||
|
|
||||||
private int classFactor;
|
private int classFactor;
|
||||||
|
|
|
@ -36,7 +36,6 @@
|
||||||
*
|
*
|
||||||
* ***** END LICENSE BLOCK ***** */
|
* ***** END LICENSE BLOCK ***** */
|
||||||
|
|
||||||
using System;
|
|
||||||
|
|
||||||
namespace UniversalDetector.Core
|
namespace UniversalDetector.Core
|
||||||
{
|
{
|
||||||
|
@ -51,12 +50,12 @@ namespace UniversalDetector.Core
|
||||||
|
|
||||||
// freqSeqs / totalSeqs
|
// freqSeqs / totalSeqs
|
||||||
protected float typicalPositiveRatio;
|
protected float typicalPositiveRatio;
|
||||||
|
|
||||||
public float TypicalPositiveRatio => typicalPositiveRatio;
|
public float TypicalPositiveRatio => typicalPositiveRatio;
|
||||||
|
|
||||||
// not used
|
// not used
|
||||||
protected bool keepEnglishLetter;
|
protected bool keepEnglishLetter;
|
||||||
|
|
||||||
public bool KeepEnglishLetter => keepEnglishLetter;
|
public bool KeepEnglishLetter => keepEnglishLetter;
|
||||||
|
|
||||||
protected string charsetName;
|
protected string charsetName;
|
||||||
|
|
|
@ -51,7 +51,8 @@ namespace UniversalDetector.Core
|
||||||
Reset();
|
Reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
public override string GetCharsetName() {
|
public override string GetCharsetName()
|
||||||
|
{
|
||||||
return "UTF-8";
|
return "UTF-8";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -67,21 +68,25 @@ namespace UniversalDetector.Core
|
||||||
int codingState = SMModel.START;
|
int codingState = SMModel.START;
|
||||||
int max = offset + len;
|
int max = offset + len;
|
||||||
|
|
||||||
for (int i = offset; i < max; i++) {
|
for (int i = offset; i < max; i++)
|
||||||
|
{
|
||||||
|
|
||||||
codingState = codingSM.NextState(buf[i]);
|
codingState = codingSM.NextState(buf[i]);
|
||||||
|
|
||||||
if (codingState == SMModel.ERROR) {
|
if (codingState == SMModel.ERROR)
|
||||||
|
{
|
||||||
state = ProbingState.NotMe;
|
state = ProbingState.NotMe;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (codingState == SMModel.ITSME) {
|
if (codingState == SMModel.ITSME)
|
||||||
|
{
|
||||||
state = ProbingState.FoundIt;
|
state = ProbingState.FoundIt;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (codingState == SMModel.START) {
|
if (codingState == SMModel.START)
|
||||||
|
{
|
||||||
if (codingSM.CurrentCharLen >= 2)
|
if (codingSM.CurrentCharLen >= 2)
|
||||||
numOfMBChar++;
|
numOfMBChar++;
|
||||||
}
|
}
|
||||||
|
@ -98,11 +103,14 @@ namespace UniversalDetector.Core
|
||||||
float unlike = 0.99f;
|
float unlike = 0.99f;
|
||||||
float confidence = 0.0f;
|
float confidence = 0.0f;
|
||||||
|
|
||||||
if (numOfMBChar < 6) {
|
if (numOfMBChar < 6)
|
||||||
|
{
|
||||||
for (int i = 0; i < numOfMBChar; i++)
|
for (int i = 0; i < numOfMBChar; i++)
|
||||||
unlike *= ONE_CHAR_PROB;
|
unlike *= ONE_CHAR_PROB;
|
||||||
confidence = 1.0f - unlike;
|
confidence = 1.0f - unlike;
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
confidence = 0.99f;
|
confidence = 0.99f;
|
||||||
}
|
}
|
||||||
return confidence;
|
return confidence;
|
||||||
|
|
|
@ -39,7 +39,7 @@
|
||||||
namespace UniversalDetector.Core
|
namespace UniversalDetector.Core
|
||||||
{
|
{
|
||||||
|
|
||||||
enum InputState { PureASCII=0, EscASCII=1, Highbyte=2 };
|
enum InputState { PureASCII = 0, EscASCII = 1, Highbyte = 2 };
|
||||||
|
|
||||||
public abstract class UniversalDetector
|
public abstract class UniversalDetector
|
||||||
{
|
{
|
||||||
|
@ -70,7 +70,8 @@ namespace UniversalDetector.Core
|
||||||
protected CharsetProber escCharsetProber;
|
protected CharsetProber escCharsetProber;
|
||||||
protected string detectedCharset;
|
protected string detectedCharset;
|
||||||
|
|
||||||
public UniversalDetector(int languageFilter) {
|
public UniversalDetector(int languageFilter)
|
||||||
|
{
|
||||||
this.start = true;
|
this.start = true;
|
||||||
this.inputState = InputState.PureASCII;
|
this.inputState = InputState.PureASCII;
|
||||||
this.lastChar = 0x00;
|
this.lastChar = 0x00;
|
||||||
|
@ -80,7 +81,8 @@ namespace UniversalDetector.Core
|
||||||
|
|
||||||
public virtual void Feed(byte[] buf, int offset, int len)
|
public virtual void Feed(byte[] buf, int offset, int len)
|
||||||
{
|
{
|
||||||
if (done) {
|
if (done)
|
||||||
|
{
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -88,52 +90,60 @@ namespace UniversalDetector.Core
|
||||||
gotData = true;
|
gotData = true;
|
||||||
|
|
||||||
// If the data starts with BOM, we know it is UTF
|
// If the data starts with BOM, we know it is UTF
|
||||||
if (start) {
|
if (start)
|
||||||
|
{
|
||||||
start = false;
|
start = false;
|
||||||
if (len > 3) {
|
if (len > 3)
|
||||||
switch (buf[0]) {
|
{
|
||||||
case 0xEF:
|
switch (buf[0])
|
||||||
if (0xBB == buf[1] && 0xBF == buf[2])
|
{
|
||||||
detectedCharset = "UTF-8";
|
case 0xEF:
|
||||||
break;
|
if (0xBB == buf[1] && 0xBF == buf[2])
|
||||||
case 0xFE:
|
detectedCharset = "UTF-8";
|
||||||
if (0xFF == buf[1] && 0x00 == buf[2] && 0x00 == buf[3])
|
break;
|
||||||
// FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
case 0xFE:
|
||||||
detectedCharset = "X-ISO-10646-UCS-4-3412";
|
if (0xFF == buf[1] && 0x00 == buf[2] && 0x00 == buf[3])
|
||||||
else if (0xFF == buf[1])
|
// FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
||||||
detectedCharset = "UTF-16BE";
|
detectedCharset = "X-ISO-10646-UCS-4-3412";
|
||||||
break;
|
else if (0xFF == buf[1])
|
||||||
case 0x00:
|
detectedCharset = "UTF-16BE";
|
||||||
if (0x00 == buf[1] && 0xFE == buf[2] && 0xFF == buf[3])
|
break;
|
||||||
detectedCharset = "UTF-32BE";
|
case 0x00:
|
||||||
else if (0x00 == buf[1] && 0xFF == buf[2] && 0xFE == buf[3])
|
if (0x00 == buf[1] && 0xFE == buf[2] && 0xFF == buf[3])
|
||||||
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
detectedCharset = "UTF-32BE";
|
||||||
detectedCharset = "X-ISO-10646-UCS-4-2143";
|
else if (0x00 == buf[1] && 0xFF == buf[2] && 0xFE == buf[3])
|
||||||
break;
|
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
||||||
case 0xFF:
|
detectedCharset = "X-ISO-10646-UCS-4-2143";
|
||||||
if (0xFE == buf[1] && 0x00 == buf[2] && 0x00 == buf[3])
|
break;
|
||||||
detectedCharset = "UTF-32LE";
|
case 0xFF:
|
||||||
else if (0xFE == buf[1])
|
if (0xFE == buf[1] && 0x00 == buf[2] && 0x00 == buf[3])
|
||||||
detectedCharset = "UTF-16LE";
|
detectedCharset = "UTF-32LE";
|
||||||
break;
|
else if (0xFE == buf[1])
|
||||||
|
detectedCharset = "UTF-16LE";
|
||||||
|
break;
|
||||||
} // switch
|
} // switch
|
||||||
}
|
}
|
||||||
if (detectedCharset != null) {
|
if (detectedCharset != null)
|
||||||
|
{
|
||||||
done = true;
|
done = true;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < len; i++) {
|
for (int i = 0; i < len; i++)
|
||||||
|
{
|
||||||
|
|
||||||
// other than 0xa0, if every other character is ascii, the page is ascii
|
// other than 0xa0, if every other character is ascii, the page is ascii
|
||||||
if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0) {
|
if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0)
|
||||||
|
{
|
||||||
// we got a non-ascii byte (high-byte)
|
// we got a non-ascii byte (high-byte)
|
||||||
if (inputState != InputState.Highbyte) {
|
if (inputState != InputState.Highbyte)
|
||||||
|
{
|
||||||
inputState = InputState.Highbyte;
|
inputState = InputState.Highbyte;
|
||||||
|
|
||||||
// kill EscCharsetProber if it is active
|
// kill EscCharsetProber if it is active
|
||||||
if (escCharsetProber != null) {
|
if (escCharsetProber != null)
|
||||||
|
{
|
||||||
escCharsetProber = null;
|
escCharsetProber = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -145,9 +155,12 @@ namespace UniversalDetector.Core
|
||||||
if (charsetProbers[2] == null)
|
if (charsetProbers[2] == null)
|
||||||
charsetProbers[2] = new Latin1Prober();
|
charsetProbers[2] = new Latin1Prober();
|
||||||
}
|
}
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
if (inputState == InputState.PureASCII &&
|
if (inputState == InputState.PureASCII &&
|
||||||
(buf[i] == 0x33 || (buf[i] == 0x7B && lastChar == 0x7E))) {
|
(buf[i] == 0x33 || (buf[i] == 0x7B && lastChar == 0x7E)))
|
||||||
|
{
|
||||||
// found escape character or HZ "~{"
|
// found escape character or HZ "~{"
|
||||||
inputState = InputState.EscASCII;
|
inputState = InputState.EscASCII;
|
||||||
}
|
}
|
||||||
|
@ -157,25 +170,31 @@ namespace UniversalDetector.Core
|
||||||
|
|
||||||
ProbingState st = ProbingState.NotMe;
|
ProbingState st = ProbingState.NotMe;
|
||||||
|
|
||||||
switch (inputState) {
|
switch (inputState)
|
||||||
|
{
|
||||||
case InputState.EscASCII:
|
case InputState.EscASCII:
|
||||||
if (escCharsetProber == null) {
|
if (escCharsetProber == null)
|
||||||
|
{
|
||||||
escCharsetProber = new EscCharsetProber();
|
escCharsetProber = new EscCharsetProber();
|
||||||
}
|
}
|
||||||
st = escCharsetProber.HandleData(buf, offset, len);
|
st = escCharsetProber.HandleData(buf, offset, len);
|
||||||
if (st == ProbingState.FoundIt) {
|
if (st == ProbingState.FoundIt)
|
||||||
|
{
|
||||||
done = true;
|
done = true;
|
||||||
detectedCharset = escCharsetProber.GetCharsetName();
|
detectedCharset = escCharsetProber.GetCharsetName();
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case InputState.Highbyte:
|
case InputState.Highbyte:
|
||||||
for (int i = 0; i < PROBERS_NUM; i++) {
|
for (int i = 0; i < PROBERS_NUM; i++)
|
||||||
if (charsetProbers[i] != null) {
|
{
|
||||||
|
if (charsetProbers[i] != null)
|
||||||
|
{
|
||||||
st = charsetProbers[i].HandleData(buf, offset, len);
|
st = charsetProbers[i].HandleData(buf, offset, len);
|
||||||
#if DEBUG
|
#if DEBUG
|
||||||
charsetProbers[i].DumpStatus();
|
charsetProbers[i].DumpStatus();
|
||||||
#endif
|
#endif
|
||||||
if (st == ProbingState.FoundIt) {
|
if (st == ProbingState.FoundIt)
|
||||||
|
{
|
||||||
done = true;
|
done = true;
|
||||||
detectedCharset = charsetProbers[i].GetCharsetName();
|
detectedCharset = charsetProbers[i].GetCharsetName();
|
||||||
return;
|
return;
|
||||||
|
@ -195,38 +214,47 @@ namespace UniversalDetector.Core
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public virtual void DataEnd()
|
public virtual void DataEnd()
|
||||||
{
|
{
|
||||||
if (!gotData) {
|
if (!gotData)
|
||||||
|
{
|
||||||
// we haven't got any data yet, return immediately
|
// we haven't got any data yet, return immediately
|
||||||
// caller program sometimes call DataEnd before anything has
|
// caller program sometimes call DataEnd before anything has
|
||||||
// been sent to detector
|
// been sent to detector
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (detectedCharset != null) {
|
if (detectedCharset != null)
|
||||||
|
{
|
||||||
done = true;
|
done = true;
|
||||||
Report(detectedCharset, 1.0f);
|
Report(detectedCharset, 1.0f);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (inputState == InputState.Highbyte) {
|
if (inputState == InputState.Highbyte)
|
||||||
|
{
|
||||||
float proberConfidence = 0.0f;
|
float proberConfidence = 0.0f;
|
||||||
float maxProberConfidence = 0.0f;
|
float maxProberConfidence = 0.0f;
|
||||||
int maxProber = 0;
|
int maxProber = 0;
|
||||||
for (int i = 0; i < PROBERS_NUM; i++) {
|
for (int i = 0; i < PROBERS_NUM; i++)
|
||||||
if (charsetProbers[i] != null) {
|
{
|
||||||
|
if (charsetProbers[i] != null)
|
||||||
|
{
|
||||||
proberConfidence = charsetProbers[i].GetConfidence();
|
proberConfidence = charsetProbers[i].GetConfidence();
|
||||||
if (proberConfidence > maxProberConfidence) {
|
if (proberConfidence > maxProberConfidence)
|
||||||
|
{
|
||||||
maxProberConfidence = proberConfidence;
|
maxProberConfidence = proberConfidence;
|
||||||
maxProber = i;
|
maxProber = i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (maxProberConfidence > MINIMUM_THRESHOLD) {
|
if (maxProberConfidence > MINIMUM_THRESHOLD)
|
||||||
|
{
|
||||||
Report(charsetProbers[maxProber].GetCharsetName(), maxProberConfidence);
|
Report(charsetProbers[maxProber].GetCharsetName(), maxProberConfidence);
|
||||||
}
|
}
|
||||||
|
|
||||||
} else if (inputState == InputState.PureASCII) {
|
}
|
||||||
|
else if (inputState == InputState.PureASCII)
|
||||||
|
{
|
||||||
Report("ASCII", 1.0f);
|
Report("ASCII", 1.0f);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user