Merge pull request #598 from cvium/remove_textencoding

Remove MediaBrowser.Text for license violations and hackiness
This commit is contained in:
Andrew Rabert 2019-01-20 10:33:59 -05:00 committed by GitHub
commit acb9afd908
129 changed files with 81 additions and 12177 deletions

View File

@ -22,7 +22,6 @@ using System.Collections.Generic;
using System.IO;
using System.Linq;
using MediaBrowser.Model.IO;
using MediaBrowser.Model.Text;
namespace BDInfo
{
@ -72,8 +71,7 @@ namespace BDInfo
public event OnPlaylistFileScanError PlaylistFileScanError;
public BDROM(
string path, IFileSystem fileSystem, ITextEncoding textEncoding)
public BDROM(string path, IFileSystem fileSystem)
{
if (string.IsNullOrEmpty(path))
{
@ -167,7 +165,7 @@ namespace BDInfo
foreach (var file in files)
{
PlaylistFiles.Add(
file.Name.ToUpper(), new TSPlaylistFile(this, file, _fileSystem, textEncoding));
file.Name.ToUpper(), new TSPlaylistFile(this, file, _fileSystem));
}
}
@ -187,7 +185,7 @@ namespace BDInfo
foreach (var file in files)
{
StreamClipFiles.Add(
file.Name.ToUpper(), new TSStreamClipFile(file, _fileSystem, textEncoding));
file.Name.ToUpper(), new TSStreamClipFile(file, _fileSystem));
}
}

View File

@ -1,4 +1,4 @@
//============================================================================
//============================================================================
// BDInfo - Blu-ray Video and Audio Analysis Tool
// Copyright © 2010 Cinema Squid
//
@ -21,15 +21,14 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using MediaBrowser.Model.IO;
using MediaBrowser.Model.Text;
namespace BDInfo
{
public class TSPlaylistFile
{
private readonly IFileSystem _fileSystem;
private readonly ITextEncoding _textEncoding;
private FileSystemMetadata FileInfo = null;
public string FileType = null;
public bool IsInitialized = false;
@ -64,26 +63,22 @@ namespace BDInfo
public List<TSGraphicsStream> GraphicsStreams =
new List<TSGraphicsStream>();
public TSPlaylistFile(
BDROM bdrom,
FileSystemMetadata fileInfo, IFileSystem fileSystem, ITextEncoding textEncoding)
public TSPlaylistFile(BDROM bdrom,
FileSystemMetadata fileInfo, IFileSystem fileSystem)
{
BDROM = bdrom;
FileInfo = fileInfo;
_fileSystem = fileSystem;
_textEncoding = textEncoding;
Name = fileInfo.Name.ToUpper();
}
public TSPlaylistFile(
BDROM bdrom,
public TSPlaylistFile(BDROM bdrom,
string name,
List<TSStreamClip> clips, IFileSystem fileSystem, ITextEncoding textEncoding)
List<TSStreamClip> clips, IFileSystem fileSystem)
{
BDROM = bdrom;
Name = name;
_fileSystem = fileSystem;
_textEncoding = textEncoding;
IsCustom = true;
foreach (var clip in clips)
{
@ -1245,8 +1240,7 @@ namespace BDInfo
int count,
ref int pos)
{
string val =
_textEncoding.GetASCIIEncoding().GetString(data, pos, count);
string val = Encoding.ASCII.GetString(data, pos, count);
pos += count;

View File

@ -1,4 +1,4 @@
//============================================================================
//============================================================================
// BDInfo - Blu-ray Video and Audio Analysis Tool
// Copyright © 2010 Cinema Squid
//
@ -21,15 +21,14 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using MediaBrowser.Model.IO;
using MediaBrowser.Model.Text;
namespace BDInfo
{
public class TSStreamClipFile
{
private readonly IFileSystem _fileSystem;
private readonly ITextEncoding _textEncoding;
public FileSystemMetadata FileInfo = null;
public string FileType = null;
public bool IsValid = false;
@ -38,12 +37,10 @@ namespace BDInfo
public Dictionary<ushort, TSStream> Streams =
new Dictionary<ushort, TSStream>();
public TSStreamClipFile(
FileSystemMetadata fileInfo, IFileSystem fileSystem, ITextEncoding textEncoding)
public TSStreamClipFile(FileSystemMetadata fileInfo, IFileSystem fileSystem)
{
FileInfo = fileInfo;
_fileSystem = fileSystem;
_textEncoding = textEncoding;
Name = fileInfo.Name.ToUpper();
}
@ -69,7 +66,7 @@ namespace BDInfo
byte[] fileType = new byte[8];
Array.Copy(data, 0, fileType, 0, fileType.Length);
FileType = _textEncoding.GetASCIIEncoding().GetString(fileType, 0, fileType.Length);
FileType = Encoding.ASCII.GetString(fileType, 0, fileType.Length);
if (FileType != "HDMV0100" &&
FileType != "HDMV0200")
{
@ -165,8 +162,7 @@ namespace BDInfo
byte[] languageBytes = new byte[3];
Array.Copy(clipData, streamOffset + 3,
languageBytes, 0, languageBytes.Length);
string languageCode =
_textEncoding.GetASCIIEncoding().GetString(languageBytes, 0, languageBytes.Length);
string languageCode = Encoding.ASCII.GetString(languageBytes, 0, languageBytes.Length);
var channelLayout = (TSChannelLayout)
(clipData[streamOffset + 2] >> 4);
@ -196,8 +192,7 @@ namespace BDInfo
byte[] languageBytes = new byte[3];
Array.Copy(clipData, streamOffset + 2,
languageBytes, 0, languageBytes.Length);
string languageCode =
_textEncoding.GetASCIIEncoding().GetString(languageBytes, 0, languageBytes.Length);
string languageCode = Encoding.ASCII.GetString(languageBytes, 0, languageBytes.Length);
stream = new TSGraphicsStream();
stream.LanguageCode = languageCode;
@ -216,8 +211,7 @@ namespace BDInfo
byte[] languageBytes = new byte[3];
Array.Copy(clipData, streamOffset + 3,
languageBytes, 0, languageBytes.Length);
string languageCode =
_textEncoding.GetASCIIEncoding().GetString(languageBytes, 0, languageBytes.Length);
string languageCode = Encoding.ASCII.GetString(languageBytes, 0, languageBytes.Length);
#if DEBUG
Debug.WriteLine(string.Format(
"\t{0} {1} {2}",

View File

@ -99,7 +99,6 @@ using MediaBrowser.Model.Serialization;
using MediaBrowser.Model.Services;
using MediaBrowser.Model.System;
using MediaBrowser.Model.Tasks;
using MediaBrowser.Model.Text;
using MediaBrowser.Model.Threading;
using MediaBrowser.Model.Updates;
using MediaBrowser.Model.Xml;
@ -113,6 +112,7 @@ using ServiceStack;
using ServiceStack.Text.Jsv;
using StringExtensions = MediaBrowser.Controller.Extensions.StringExtensions;
using X509Certificate = System.Security.Cryptography.X509Certificates.X509Certificate;
using UtfUnknown;
namespace Emby.Server.Implementations
{
@ -309,7 +309,6 @@ namespace Emby.Server.Implementations
private IEncodingManager EncodingManager { get; set; }
private IChannelManager ChannelManager { get; set; }
protected ITextEncoding TextEncoding { get; private set; }
/// <summary>
/// Gets or sets the user data repository.
@ -826,9 +825,7 @@ namespace Emby.Server.Implementations
StringExtensions.LocalizationManager = LocalizationManager;
RegisterSingleInstance(LocalizationManager);
TextEncoding = new TextEncoding.TextEncoding(FileSystemManager, LoggerFactory.CreateLogger("TextEncoding"), JsonSerializer);
RegisterSingleInstance(TextEncoding);
BlurayExaminer = new BdInfoExaminer(FileSystemManager, TextEncoding);
BlurayExaminer = new BdInfoExaminer(FileSystemManager);
RegisterSingleInstance(BlurayExaminer);
RegisterSingleInstance<IXmlReaderSettingsFactory>(new XmlReaderSettingsFactory());
@ -873,7 +870,6 @@ namespace Emby.Server.Implementations
ServerConfigurationManager,
"web/index.html",
NetworkManager,
TextEncoding,
JsonSerializer,
XmlSerializer,
GetParseFn);
@ -950,7 +946,7 @@ namespace Emby.Server.Implementations
AuthService = new AuthService(UserManager, authContext, ServerConfigurationManager, SessionManager, NetworkManager);
RegisterSingleInstance(AuthService);
SubtitleEncoder = new MediaBrowser.MediaEncoding.Subtitles.SubtitleEncoder(LibraryManager, LoggerFactory.CreateLogger("SubtitleEncoder"), ApplicationPaths, FileSystemManager, MediaEncoder, JsonSerializer, HttpClient, MediaSourceManager, ProcessFactory, TextEncoding);
SubtitleEncoder = new MediaBrowser.MediaEncoding.Subtitles.SubtitleEncoder(LibraryManager, LoggerFactory.CreateLogger("SubtitleEncoder"), ApplicationPaths, FileSystemManager, MediaEncoder, JsonSerializer, HttpClient, MediaSourceManager, ProcessFactory);
RegisterSingleInstance(SubtitleEncoder);
RegisterSingleInstance(CreateResourceFileManager());

View File

@ -27,6 +27,7 @@
<PackageReference Include="SimpleInjector" Version="4.4.2" />
<PackageReference Include="SQLitePCL.pretty.core" Version="1.1.8" />
<PackageReference Include="SQLitePCLRaw.core" Version="1.1.11" />
<PackageReference Include="UTF.Unknown" Version="1.0.0-beta1" />
</ItemGroup>
<ItemGroup>
@ -42,8 +43,6 @@
<EmbeddedResource Include="Localization\iso6392.txt" />
<EmbeddedResource Include="Localization\countries.json" />
<EmbeddedResource Include="Localization\Core\*.json" />
<EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\*" />
<EmbeddedResource Include="TextEncoding\NLangDetect\Utils\messages.properties" />
<EmbeddedResource Include="Localization\Ratings\*.txt" />
</ItemGroup>

View File

@ -19,7 +19,6 @@ using MediaBrowser.Model.Events;
using MediaBrowser.Model.Extensions;
using MediaBrowser.Model.Serialization;
using MediaBrowser.Model.Services;
using MediaBrowser.Model.Text;
using Microsoft.Extensions.Logging;
namespace Emby.Server.Implementations.HttpServer
@ -37,11 +36,7 @@ namespace Emby.Server.Implementations.HttpServer
private readonly IServerConfigurationManager _config;
private readonly INetworkManager _networkManager;
private readonly IServerApplicationHost _appHost;
private readonly ITextEncoding _textEncoding;
private readonly IJsonSerializer _jsonSerializer;
private readonly IXmlSerializer _xmlSerializer;
private readonly Func<Type, Func<string, object>> _funcParseFn;
@ -60,7 +55,6 @@ namespace Emby.Server.Implementations.HttpServer
IServerConfigurationManager config,
string defaultRedirectPath,
INetworkManager networkManager,
ITextEncoding textEncoding,
IJsonSerializer jsonSerializer,
IXmlSerializer xmlSerializer,
Func<Type, Func<string, object>> funcParseFn)
@ -70,7 +64,6 @@ namespace Emby.Server.Implementations.HttpServer
_config = config;
DefaultRedirectPath = defaultRedirectPath;
_networkManager = networkManager;
_textEncoding = textEncoding;
_jsonSerializer = jsonSerializer;
_xmlSerializer = xmlSerializer;
_funcParseFn = funcParseFn;
@ -147,7 +140,7 @@ namespace Emby.Server.Implementations.HttpServer
return;
}
var connection = new WebSocketConnection(e.WebSocket, e.Endpoint, _jsonSerializer, _logger, _textEncoding)
var connection = new WebSocketConnection(e.WebSocket, e.Endpoint, _jsonSerializer, _logger)
{
OnReceive = ProcessWebSocketMessageReceived,
Url = e.Url,

View File

@ -1,4 +1,4 @@
using System;
using System;
using System.Net.WebSockets;
using System.Text;
using System.Threading;
@ -8,8 +8,8 @@ using MediaBrowser.Controller.Net;
using MediaBrowser.Model.Net;
using MediaBrowser.Model.Serialization;
using MediaBrowser.Model.Services;
using MediaBrowser.Model.Text;
using Microsoft.Extensions.Logging;
using UtfUnknown;
namespace Emby.Server.Implementations.HttpServer
{
@ -68,7 +68,6 @@ namespace Emby.Server.Implementations.HttpServer
/// </summary>
/// <value>The query string.</value>
public QueryParamCollection QueryString { get; set; }
private readonly ITextEncoding _textEncoding;
/// <summary>
/// Initializes a new instance of the <see cref="WebSocketConnection" /> class.
@ -78,7 +77,7 @@ namespace Emby.Server.Implementations.HttpServer
/// <param name="jsonSerializer">The json serializer.</param>
/// <param name="logger">The logger.</param>
/// <exception cref="ArgumentNullException">socket</exception>
public WebSocketConnection(IWebSocket socket, string remoteEndPoint, IJsonSerializer jsonSerializer, ILogger logger, ITextEncoding textEncoding)
public WebSocketConnection(IWebSocket socket, string remoteEndPoint, IJsonSerializer jsonSerializer, ILogger logger)
{
if (socket == null)
{
@ -110,7 +109,6 @@ namespace Emby.Server.Implementations.HttpServer
RemoteEndPoint = remoteEndPoint;
_logger = logger;
_textEncoding = textEncoding;
socket.Closed += socket_Closed;
}
@ -132,8 +130,7 @@ namespace Emby.Server.Implementations.HttpServer
{
return;
}
var charset = _textEncoding.GetDetectedEncodingName(bytes, bytes.Length, null, false);
var charset = CharsetDetector.DetectFromBytes(bytes).Detected?.EncodingName;
if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
{
@ -141,7 +138,7 @@ namespace Emby.Server.Implementations.HttpServer
}
else
{
OnReceiveInternal(_textEncoding.GetASCIIEncoding().GetString(bytes, 0, bytes.Length));
OnReceiveInternal(Encoding.ASCII.GetString(bytes, 0, bytes.Length));
}
}
@ -161,7 +158,7 @@ namespace Emby.Server.Implementations.HttpServer
var bytes = memory.Slice(0, length).ToArray();
var charset = _textEncoding.GetDetectedEncodingName(bytes, bytes.Length, null, false);
var charset = CharsetDetector.DetectFromBytes(bytes).Detected?.EncodingName;
if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
{
@ -169,7 +166,7 @@ namespace Emby.Server.Implementations.HttpServer
}
else
{
OnReceiveInternal(_textEncoding.GetASCIIEncoding().GetString(bytes, 0, bytes.Length));
OnReceiveInternal(Encoding.ASCII.GetString(bytes, 0, bytes.Length));
}
}

View File

@ -1,371 +0,0 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using NLangDetect.Core.Extensions;
using NLangDetect.Core.Utils;
namespace NLangDetect.Core
{
public class Detector
{
private const double _AlphaDefault = 0.5;
private const double _AlphaWidth = 0.05;
private const int _IterationLimit = 1000;
private const double _ProbThreshold = 0.1;
private const double _ConvThreshold = 0.99999;
private const int _BaseFreq = 10000;
private static readonly Regex _UrlRegex = new Regex("https?://[-_.?&~;+=/#0-9A-Za-z]+", RegexOptions.Compiled);
private static readonly Regex _MailRegex = new Regex("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+", RegexOptions.Compiled);
private readonly Dictionary<string, ProbVector> _wordLangProbMap;
private readonly List<string> _langlist;
private StringBuilder _text;
private double[] _langprob;
private double _alpha = _AlphaDefault;
private const int _trialsCount = 7;
private int _maxTextLength = 10000;
private double[] _priorMap;
private int? _seed;
#region Constructor(s)
public Detector(DetectorFactory factory)
{
_wordLangProbMap = factory.WordLangProbMap;
_langlist = factory.Langlist;
_text = new StringBuilder();
_seed = factory.Seed;
}
#endregion
#region Public methods
public void SetAlpha(double alpha)
{
_alpha = alpha;
}
public void SetPriorMap(Dictionary<string, double> priorMap)
{
_priorMap = new double[_langlist.Count];
double sump = 0;
for (int i = 0; i < _priorMap.Length; i++)
{
string lang = _langlist[i];
if (priorMap.ContainsKey(lang))
{
double p = priorMap[lang];
if (p < 0)
{
throw new NLangDetectException("Prior probability must be non-negative.", ErrorCode.InitParamError);
}
_priorMap[i] = p;
sump += p;
}
}
if (sump <= 0)
{
throw new NLangDetectException("More one of prior probability must be non-zero.", ErrorCode.InitParamError);
}
for (int i = 0; i < _priorMap.Length; i++)
{
_priorMap[i] /= sump;
}
}
public void SetMaxTextLength(int max_text_length)
{
_maxTextLength = max_text_length;
}
// TODO IMM HI: TextReader?
public void Append(StreamReader streamReader)
{
var buf = new char[_maxTextLength / 2];
while (_text.Length < _maxTextLength && !streamReader.EndOfStream)
{
int length = streamReader.Read(buf, 0, buf.Length);
Append(new string(buf, 0, length));
}
}
public void Append(string text)
{
text = _UrlRegex.Replace(text, " ");
text = _MailRegex.Replace(text, " ");
char pre = '\0';
for (int i = 0; i < text.Length && i < _maxTextLength; i++)
{
char c = NGram.Normalize(text[i]);
if (c != ' ' || pre != ' ')
{
_text.Append(c);
}
pre = c;
}
}
private void CleanText()
{
int latinCount = 0, nonLatinCount = 0;
for (int i = 0; i < _text.Length; i++)
{
char c = _text[i];
if (c <= 'z' && c >= 'A')
{
latinCount++;
}
else if (c >= '\u0300' && c.GetUnicodeBlock() != UnicodeBlock.LatinExtendedAdditional)
{
nonLatinCount++;
}
}
if (latinCount * 2 < nonLatinCount)
{
var textWithoutLatin = new StringBuilder();
for (int i = 0; i < _text.Length; i++)
{
char c = _text[i];
if (c > 'z' || c < 'A')
{
textWithoutLatin.Append(c);
}
}
_text = textWithoutLatin;
}
}
public string Detect()
{
List<Language> probabilities = GetProbabilities();
return
probabilities.Count > 0
? probabilities[0].Name
: null;
}
public List<Language> GetProbabilities()
{
if (_langprob == null)
{
DetectBlock();
}
var list = SortProbability(_langprob);
return list;
}
#endregion
#region Private helper methods
private static double NormalizeProb(double[] probs)
{
double maxp = 0, sump = 0;
sump += probs.Sum();
for (int i = 0; i < probs.Length; i++)
{
double p = probs[i] / sump;
if (maxp < p)
{
maxp = p;
}
probs[i] = p;
}
return maxp;
}
private static string UnicodeEncode(string word)
{
var resultSb = new StringBuilder();
foreach (char ch in word)
{
if (ch >= '\u0080')
{
string st = string.Format("{0:x}", 0x10000 + ch);
while (st.Length < 4)
{
st = "0" + st;
}
resultSb
.Append("\\u")
.Append(st.SubSequence(1, 5));
}
else
{
resultSb.Append(ch);
}
}
return resultSb.ToString();
}
private void DetectBlock()
{
CleanText();
List<string> ngrams = ExtractNGrams();
if (ngrams.Count == 0)
{
throw new NLangDetectException("no features in text", ErrorCode.CantDetectError);
}
_langprob = new double[_langlist.Count];
var rand = (_seed.HasValue ? new Random(_seed.Value) : new Random());
for (int t = 0; t < _trialsCount; t++)
{
double[] prob = InitProbability();
// TODO IMM HI: verify it works
double alpha = _alpha + rand.NextGaussian() * _AlphaWidth;
for (int i = 0; ; i++)
{
int r = rand.Next(ngrams.Count);
UpdateLangProb(prob, ngrams[r], alpha);
if (i % 5 == 0)
{
if (NormalizeProb(prob) > _ConvThreshold || i >= _IterationLimit)
{
break;
}
}
}
for (int j = 0; j < _langprob.Length; j++)
{
_langprob[j] += prob[j] / _trialsCount;
}
}
}
private double[] InitProbability()
{
var prob = new double[_langlist.Count];
if (_priorMap != null)
{
for (int i = 0; i < prob.Length; i++)
{
prob[i] = _priorMap[i];
}
}
else
{
for (int i = 0; i < prob.Length; i++)
{
prob[i] = 1.0 / _langlist.Count;
}
}
return prob;
}
private List<string> ExtractNGrams()
{
var list = new List<string>();
var ngram = new NGram();
for (int i = 0; i < _text.Length; i++)
{
ngram.AddChar(_text[i]);
for (int n = 1; n <= NGram.GramsCount; n++)
{
string w = ngram.Get(n);
if (w != null && _wordLangProbMap.ContainsKey(w))
{
list.Add(w);
}
}
}
return list;
}
private void UpdateLangProb(double[] prob, string word, double alpha)
{
if (word == null || !_wordLangProbMap.ContainsKey(word))
{
return;
}
ProbVector langProbMap = _wordLangProbMap[word];
double weight = alpha / _BaseFreq;
for (int i = 0; i < prob.Length; i++)
{
prob[i] *= weight + langProbMap[i];
}
}
private List<Language> SortProbability(double[] prob)
{
var list = new List<Language>();
for (int j = 0; j < prob.Length; j++)
{
double p = prob[j];
if (p > _ProbThreshold)
{
for (int i = 0; i <= list.Count; i++)
{
if (i == list.Count || list[i].Probability < p)
{
list.Insert(i, new Language(_langlist[j], p));
break;
}
}
}
}
return list;
}
#endregion
}
}

View File

@ -1,125 +0,0 @@
using System;
using System.Collections.Generic;
using System.Linq;
using MediaBrowser.Model.Serialization;
using NLangDetect.Core.Utils;
namespace NLangDetect.Core
{
public class DetectorFactory
{
public Dictionary<string, ProbVector> WordLangProbMap;
public List<string> Langlist;
private static readonly DetectorFactory _instance = new DetectorFactory();
#region Constructor(s)
private DetectorFactory()
{
WordLangProbMap = new Dictionary<string, ProbVector>();
Langlist = new List<string>();
}
#endregion
#region Public methods
public static void LoadProfiles(IJsonSerializer json)
{
var assembly = typeof(DetectorFactory).Assembly;
var names = assembly.GetManifestResourceNames()
.Where(i => i.IndexOf("NLangDetect.Profiles", StringComparison.Ordinal) != -1)
.ToList();
var index = 0;
foreach (var name in names)
{
using (var stream = assembly.GetManifestResourceStream(name))
{
var langProfile = (LangProfile)json.DeserializeFromStream(stream, typeof(LangProfile));
AddProfile(langProfile, index);
}
index++;
}
}
public static Detector Create()
{
return CreateDetector();
}
public static Detector Create(double alpha)
{
var detector = CreateDetector();
detector.SetAlpha(alpha);
return detector;
}
public static void SetSeed(int? seed)
{
_instance.Seed = seed;
}
#endregion
#region Internal methods
internal static void AddProfile(LangProfile profile, int index)
{
var lang = profile.name;
if (_instance.Langlist.Contains(lang))
{
throw new NLangDetectException("duplicate the same language profile", ErrorCode.DuplicateLangError);
}
_instance.Langlist.Add(lang);
foreach (string word in profile.freq.Keys)
{
if (!_instance.WordLangProbMap.ContainsKey(word))
{
_instance.WordLangProbMap.Add(word, new ProbVector());
}
double prob = (double)profile.freq[word] / profile.n_words[word.Length - 1];
_instance.WordLangProbMap[word][index] = prob;
}
}
internal static void Clear()
{
_instance.Langlist.Clear();
_instance.WordLangProbMap.Clear();
}
#endregion
#region Private helper methods
private static Detector CreateDetector()
{
if (_instance.Langlist.Count == 0)
{
throw new NLangDetectException("need to load profiles", ErrorCode.NeedLoadProfileError);
}
return new Detector(_instance);
}
#endregion
#region Properties
public int? Seed { get; private set; }
#endregion
}
}

View File

@ -1,15 +0,0 @@
namespace NLangDetect.Core
{
public enum ErrorCode
{
NoTextError,
FormatError,
FileLoadError,
DuplicateLangError,
NeedLoadProfileError,
CantDetectError,
CantOpenTrainData,
TrainDataFormatError,
InitParamError,
}
}

View File

@ -1,374 +0,0 @@
using System;
namespace NLangDetect.Core.Extensions
{
public static class CharExtensions
{
private const int MIN_CODE_POINT = 0x000000;
private const int MAX_CODE_POINT = 0x10ffff;
private static readonly int[] _unicodeBlockStarts =
{
#region Unicode block starts
0x0000, // Basic Latin
0x0080, // Latin-1 Supplement
0x0100, // Latin Extended-A
0x0180, // Latin Extended-B
0x0250, // IPA Extensions
0x02B0, // Spacing Modifier Letters
0x0300, // Combining Diacritical Marks
0x0370, // Greek and Coptic
0x0400, // Cyrillic
0x0500, // Cyrillic Supplementary
0x0530, // Armenian
0x0590, // Hebrew
0x0600, // Arabic
0x0700, // Syriac
0x0750, // unassigned
0x0780, // Thaana
0x07C0, // unassigned
0x0900, // Devanagari
0x0980, // Bengali
0x0A00, // Gurmukhi
0x0A80, // Gujarati
0x0B00, // Oriya
0x0B80, // Tamil
0x0C00, // Telugu
0x0C80, // Kannada
0x0D00, // Malayalam
0x0D80, // Sinhala
0x0E00, // Thai
0x0E80, // Lao
0x0F00, // Tibetan
0x1000, // Myanmar
0x10A0, // Georgian
0x1100, // Hangul Jamo
0x1200, // Ethiopic
0x1380, // unassigned
0x13A0, // Cherokee
0x1400, // Unified Canadian Aboriginal Syllabics
0x1680, // Ogham
0x16A0, // Runic
0x1700, // Tagalog
0x1720, // Hanunoo
0x1740, // Buhid
0x1760, // Tagbanwa
0x1780, // Khmer
0x1800, // Mongolian
0x18B0, // unassigned
0x1900, // Limbu
0x1950, // Tai Le
0x1980, // unassigned
0x19E0, // Khmer Symbols
0x1A00, // unassigned
0x1D00, // Phonetic Extensions
0x1D80, // unassigned
0x1E00, // Latin Extended Additional
0x1F00, // Greek Extended
0x2000, // General Punctuation
0x2070, // Superscripts and Subscripts
0x20A0, // Currency Symbols
0x20D0, // Combining Diacritical Marks for Symbols
0x2100, // Letterlike Symbols
0x2150, // Number Forms
0x2190, // Arrows
0x2200, // Mathematical Operators
0x2300, // Miscellaneous Technical
0x2400, // Control Pictures
0x2440, // Optical Character Recognition
0x2460, // Enclosed Alphanumerics
0x2500, // Box Drawing
0x2580, // Block Elements
0x25A0, // Geometric Shapes
0x2600, // Miscellaneous Symbols
0x2700, // Dingbats
0x27C0, // Miscellaneous Mathematical Symbols-A
0x27F0, // Supplemental Arrows-A
0x2800, // Braille Patterns
0x2900, // Supplemental Arrows-B
0x2980, // Miscellaneous Mathematical Symbols-B
0x2A00, // Supplemental Mathematical Operators
0x2B00, // Miscellaneous Symbols and Arrows
0x2C00, // unassigned
0x2E80, // CJK Radicals Supplement
0x2F00, // Kangxi Radicals
0x2FE0, // unassigned
0x2FF0, // Ideographic Description Characters
0x3000, // CJK Symbols and Punctuation
0x3040, // Hiragana
0x30A0, // Katakana
0x3100, // Bopomofo
0x3130, // Hangul Compatibility Jamo
0x3190, // Kanbun
0x31A0, // Bopomofo Extended
0x31C0, // unassigned
0x31F0, // Katakana Phonetic Extensions
0x3200, // Enclosed CJK Letters and Months
0x3300, // CJK Compatibility
0x3400, // CJK Unified Ideographs Extension A
0x4DC0, // Yijing Hexagram Symbols
0x4E00, // CJK Unified Ideographs
0xA000, // Yi Syllables
0xA490, // Yi Radicals
0xA4D0, // unassigned
0xAC00, // Hangul Syllables
0xD7B0, // unassigned
0xD800, // High Surrogates
0xDB80, // High Private Use Surrogates
0xDC00, // Low Surrogates
0xE000, // Private Use
0xF900, // CJK Compatibility Ideographs
0xFB00, // Alphabetic Presentation Forms
0xFB50, // Arabic Presentation Forms-A
0xFE00, // Variation Selectors
0xFE10, // unassigned
0xFE20, // Combining Half Marks
0xFE30, // CJK Compatibility Forms
0xFE50, // Small Form Variants
0xFE70, // Arabic Presentation Forms-B
0xFF00, // Halfwidth and Fullwidth Forms
0xFFF0, // Specials
0x10000, // Linear B Syllabary
0x10080, // Linear B Ideograms
0x10100, // Aegean Numbers
0x10140, // unassigned
0x10300, // Old Italic
0x10330, // Gothic
0x10350, // unassigned
0x10380, // Ugaritic
0x103A0, // unassigned
0x10400, // Deseret
0x10450, // Shavian
0x10480, // Osmanya
0x104B0, // unassigned
0x10800, // Cypriot Syllabary
0x10840, // unassigned
0x1D000, // Byzantine Musical Symbols
0x1D100, // Musical Symbols
0x1D200, // unassigned
0x1D300, // Tai Xuan Jing Symbols
0x1D360, // unassigned
0x1D400, // Mathematical Alphanumeric Symbols
0x1D800, // unassigned
0x20000, // CJK Unified Ideographs Extension B
0x2A6E0, // unassigned
0x2F800, // CJK Compatibility Ideographs Supplement
0x2FA20, // unassigned
0xE0000, // Tags
0xE0080, // unassigned
0xE0100, // Variation Selectors Supplement
0xE01F0, // unassigned
0xF0000, // Supplementary Private Use Area-A
0x100000, // Supplementary Private Use Area-B
#endregion
};
private static readonly UnicodeBlock?[] _unicodeBlocks =
{
#region Unicode blocks
UnicodeBlock.BasicLatin,
UnicodeBlock.Latin1Supplement,
UnicodeBlock.LatinExtendedA,
UnicodeBlock.LatinExtendedB,
UnicodeBlock.IpaExtensions,
UnicodeBlock.SpacingModifierLetters,
UnicodeBlock.CombiningDiacriticalMarks,
UnicodeBlock.Greek,
UnicodeBlock.Cyrillic,
UnicodeBlock.CyrillicSupplementary,
UnicodeBlock.Armenian,
UnicodeBlock.Hebrew,
UnicodeBlock.Arabic,
UnicodeBlock.Syriac,
null,
UnicodeBlock.Thaana,
null,
UnicodeBlock.Devanagari,
UnicodeBlock.Bengali,
UnicodeBlock.Gurmukhi,
UnicodeBlock.Gujarati,
UnicodeBlock.Oriya,
UnicodeBlock.Tamil,
UnicodeBlock.Telugu,
UnicodeBlock.Kannada,
UnicodeBlock.Malayalam,
UnicodeBlock.Sinhala,
UnicodeBlock.Thai,
UnicodeBlock.Lao,
UnicodeBlock.Tibetan,
UnicodeBlock.Myanmar,
UnicodeBlock.Georgian,
UnicodeBlock.HangulJamo,
UnicodeBlock.Ethiopic,
null,
UnicodeBlock.Cherokee,
UnicodeBlock.UnifiedCanadianAboriginalSyllabics,
UnicodeBlock.Ogham,
UnicodeBlock.Runic,
UnicodeBlock.Tagalog,
UnicodeBlock.Hanunoo,
UnicodeBlock.Buhid,
UnicodeBlock.Tagbanwa,
UnicodeBlock.Khmer,
UnicodeBlock.Mongolian,
null,
UnicodeBlock.Limbu,
UnicodeBlock.TaiLe,
null,
UnicodeBlock.KhmerSymbols,
null,
UnicodeBlock.PhoneticExtensions,
null,
UnicodeBlock.LatinExtendedAdditional,
UnicodeBlock.GreekExtended,
UnicodeBlock.GeneralPunctuation,
UnicodeBlock.SuperscriptsAndSubscripts,
UnicodeBlock.CurrencySymbols,
UnicodeBlock.CombiningMarksForSymbols,
UnicodeBlock.LetterlikeSymbols,
UnicodeBlock.NumberForms,
UnicodeBlock.Arrows,
UnicodeBlock.MathematicalOperators,
UnicodeBlock.MiscellaneousTechnical,
UnicodeBlock.ControlPictures,
UnicodeBlock.OpticalCharacterRecognition,
UnicodeBlock.EnclosedAlphanumerics,
UnicodeBlock.BoxDrawing,
UnicodeBlock.BlockElements,
UnicodeBlock.GeometricShapes,
UnicodeBlock.MiscellaneousSymbols,
UnicodeBlock.Dingbats,
UnicodeBlock.MiscellaneousMathematicalSymbolsA,
UnicodeBlock.SupplementalArrowsA,
UnicodeBlock.BraillePatterns,
UnicodeBlock.SupplementalArrowsB,
UnicodeBlock.MiscellaneousMathematicalSymbolsB,
UnicodeBlock.SupplementalMathematicalOperators,
UnicodeBlock.MiscellaneousSymbolsAndArrows,
null,
UnicodeBlock.CjkRadicalsSupplement,
UnicodeBlock.KangxiRadicals,
null,
UnicodeBlock.IdeographicDescriptionCharacters,
UnicodeBlock.CjkSymbolsAndPunctuation,
UnicodeBlock.Hiragana,
UnicodeBlock.Katakana,
UnicodeBlock.Bopomofo,
UnicodeBlock.HangulCompatibilityJamo,
UnicodeBlock.Kanbun,
UnicodeBlock.BopomofoExtended,
null,
UnicodeBlock.KatakanaPhoneticExtensions,
UnicodeBlock.EnclosedCjkLettersAndMonths,
UnicodeBlock.CjkCompatibility,
UnicodeBlock.CjkUnifiedIdeographsExtensionA,
UnicodeBlock.YijingHexagramSymbols,
UnicodeBlock.CjkUnifiedIdeographs,
UnicodeBlock.YiSyllables,
UnicodeBlock.YiRadicals,
null,
UnicodeBlock.HangulSyllables,
null,
UnicodeBlock.HighSurrogates,
UnicodeBlock.HighPrivateUseSurrogates,
UnicodeBlock.LowSurrogates,
UnicodeBlock.PrivateUseArea,
UnicodeBlock.CjkCompatibilityIdeographs,
UnicodeBlock.AlphabeticPresentationForms,
UnicodeBlock.ArabicPresentationFormsA,
UnicodeBlock.VariationSelectors,
null,
UnicodeBlock.CombiningHalfMarks,
UnicodeBlock.CjkCompatibilityForms,
UnicodeBlock.SmallFormVariants,
UnicodeBlock.ArabicPresentationFormsB,
UnicodeBlock.HalfwidthAndFullwidthForms,
UnicodeBlock.Specials,
UnicodeBlock.LinearBSyllabary,
UnicodeBlock.LinearBIdeograms,
UnicodeBlock.AegeanNumbers,
null,
UnicodeBlock.OldItalic,
UnicodeBlock.Gothic,
null,
UnicodeBlock.Ugaritic,
null,
UnicodeBlock.Deseret,
UnicodeBlock.Shavian,
UnicodeBlock.Osmanya,
null,
UnicodeBlock.CypriotSyllabary,
null,
UnicodeBlock.ByzantineMusicalSymbols,
UnicodeBlock.MusicalSymbols,
null,
UnicodeBlock.TaiXuanJingSymbols,
null,
UnicodeBlock.MathematicalAlphanumericSymbols,
null,
UnicodeBlock.CjkUnifiedIdeographsExtensionB,
null,
UnicodeBlock.CjkCompatibilityIdeographsSupplement,
null,
UnicodeBlock.Tags,
null,
UnicodeBlock.VariationSelectorsSupplement,
null,
UnicodeBlock.SupplementaryPrivateUseAreaA,
UnicodeBlock.SupplementaryPrivateUseAreaB,
#endregion
};
#region Public methods
/// <remarks>
/// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
/// </remarks>
public static UnicodeBlock? GetUnicodeBlock(this char ch)
{
int codePoint = ch;
if (!IsValidCodePoint(codePoint))
{
throw new ArgumentException("Argument is not a valid code point.", nameof(ch));
}
int top, bottom, current;
bottom = 0;
top = _unicodeBlockStarts.Length;
current = top / 2;
// invariant: top > current >= bottom && codePoint >= unicodeBlockStarts[bottom]
while (top - bottom > 1)
{
if (codePoint >= _unicodeBlockStarts[current])
{
bottom = current;
}
else
{
top = current;
}
current = (top + bottom) / 2;
}
return _unicodeBlocks[current];
}
#endregion
#region Private helper methods
private static bool IsValidCodePoint(int codePoint)
{
return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT;
}
#endregion
}
}

View File

@ -1,51 +0,0 @@
using System;
namespace NLangDetect.Core.Extensions
{
public static class RandomExtensions
{
private const double _Epsilon = 2.22044604925031E-15;
private static readonly object _mutex = new object();
private static double _nextNextGaussian;
private static bool _hasNextNextGaussian;
/// <summary>
/// Returns the next pseudorandom, Gaussian ("normally") distributed double value with mean 0.0 and standard deviation 1.0 from this random number generator's sequence.
/// The general contract of nextGaussian is that one double value, chosen from (approximately) the usual normal distribution with mean 0.0 and standard deviation 1.0, is pseudorandomly generated and returned.
/// </summary>
/// <remarks>
/// Taken from: http://download.oracle.com/javase/6/docs/api/java/util/Random.html (nextGaussian())
/// </remarks>
public static double NextGaussian(this Random random)
{
lock (_mutex)
{
if (_hasNextNextGaussian)
{
_hasNextNextGaussian = false;
return _nextNextGaussian;
}
double v1, v2, s;
do
{
v1 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
v2 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
s = v1 * v1 + v2 * v2;
}
while (s >= 1.0 || Math.Abs(s - 0.0) < _Epsilon);
double multiplier = Math.Sqrt(-2.0 * Math.Log(s) / s);
_nextNextGaussian = v2 * multiplier;
_hasNextNextGaussian = true;
return v1 * multiplier;
}
}
}
}

View File

@ -1,25 +0,0 @@
using System;
namespace NLangDetect.Core.Extensions
{
public static class StringExtensions
{
/// <summary>
/// Returns a new character sequence that is a subsequence of this sequence. The subsequence starts with the character at the specified index and ends with the character at index end - 1. The length of the returned sequence is end - start, so if start == end then an empty sequence is returned.
/// </summary>
/// <param name="s"></param>
/// <param name="start">the start index, inclusive</param>
/// <param name="end">the end index, exclusive</param>
/// <returns>the specified subsequence</returns>
/// <exception cref="IndexOutOfRangeException"> if start or end are negative, if end is greater than length(), or if start is greater than end</exception>
public static string SubSequence(this string s, int start, int end)
{
if (start < 0) throw new ArgumentOutOfRangeException(nameof(start), "Argument must not be negative.");
if (end < 0) throw new ArgumentOutOfRangeException(nameof(end), "Argument must not be negative.");
if (end > s.Length) throw new ArgumentOutOfRangeException(nameof(end), "Argument must not be greater than the input string's length.");
if (start > end) throw new ArgumentOutOfRangeException(nameof(start), "Argument must not be greater than the 'end' argument.");
return s.Substring(start, end - start);
}
}
}

View File

@ -1,131 +0,0 @@
namespace NLangDetect.Core.Extensions
{
public enum UnicodeBlock
{
BasicLatin,
Latin1Supplement,
LatinExtendedA,
LatinExtendedB,
IpaExtensions,
SpacingModifierLetters,
CombiningDiacriticalMarks,
Greek,
Cyrillic,
CyrillicSupplementary,
Armenian,
Hebrew,
Arabic,
Syriac,
Thaana,
Devanagari,
Bengali,
Gurmukhi,
Gujarati,
Oriya,
Tamil,
Telugu,
Kannada,
Malayalam,
Sinhala,
Thai,
Lao,
Tibetan,
Myanmar,
Georgian,
HangulJamo,
Ethiopic,
Cherokee,
UnifiedCanadianAboriginalSyllabics,
Ogham,
Runic,
Tagalog,
Hanunoo,
Buhid,
Tagbanwa,
Khmer,
Mongolian,
Limbu,
TaiLe,
KhmerSymbols,
PhoneticExtensions,
LatinExtendedAdditional,
GreekExtended,
GeneralPunctuation,
SuperscriptsAndSubscripts,
CurrencySymbols,
CombiningMarksForSymbols,
LetterlikeSymbols,
NumberForms,
Arrows,
MathematicalOperators,
MiscellaneousTechnical,
ControlPictures,
OpticalCharacterRecognition,
EnclosedAlphanumerics,
BoxDrawing,
BlockElements,
GeometricShapes,
MiscellaneousSymbols,
Dingbats,
MiscellaneousMathematicalSymbolsA,
SupplementalArrowsA,
BraillePatterns,
SupplementalArrowsB,
MiscellaneousMathematicalSymbolsB,
SupplementalMathematicalOperators,
MiscellaneousSymbolsAndArrows,
CjkRadicalsSupplement,
KangxiRadicals,
IdeographicDescriptionCharacters,
CjkSymbolsAndPunctuation,
Hiragana,
Katakana,
Bopomofo,
HangulCompatibilityJamo,
Kanbun,
BopomofoExtended,
KatakanaPhoneticExtensions,
EnclosedCjkLettersAndMonths,
CjkCompatibility,
CjkUnifiedIdeographsExtensionA,
YijingHexagramSymbols,
CjkUnifiedIdeographs,
YiSyllables,
YiRadicals,
HangulSyllables,
HighSurrogates,
HighPrivateUseSurrogates,
LowSurrogates,
PrivateUseArea,
CjkCompatibilityIdeographs,
AlphabeticPresentationForms,
ArabicPresentationFormsA,
VariationSelectors,
CombiningHalfMarks,
CjkCompatibilityForms,
SmallFormVariants,
ArabicPresentationFormsB,
HalfwidthAndFullwidthForms,
Specials,
LinearBSyllabary,
LinearBIdeograms,
AegeanNumbers,
OldItalic,
Gothic,
Ugaritic,
Deseret,
Shavian,
Osmanya,
CypriotSyllabary,
ByzantineMusicalSymbols,
MusicalSymbols,
TaiXuanJingSymbols,
MathematicalAlphanumericSymbols,
CjkUnifiedIdeographsExtensionB,
CjkCompatibilityIdeographsSupplement,
Tags,
VariationSelectorsSupplement,
SupplementaryPrivateUseAreaA,
SupplementaryPrivateUseAreaB,
}
}

View File

@ -1,67 +0,0 @@
using System;
using System.IO;
using System.IO.Compression;
using System.Xml;
using NLangDetect.Core.Utils;
namespace NLangDetect.Core
{
// TODO IMM HI: xml reader not tested
public static class GenProfile
{
#region Public methods
public static LangProfile load(string lang, string file)
{
var profile = new LangProfile(lang);
var tagextractor = new TagExtractor("abstract", 100);
Stream inputStream = null;
try
{
inputStream = File.OpenRead(file);
string extension = Path.GetExtension(file) ?? "";
if (extension.ToUpper() == ".GZ")
{
inputStream = new GZipStream(inputStream, CompressionMode.Decompress);
}
using (var xmlReader = XmlReader.Create(inputStream))
{
while (xmlReader.Read())
{
switch (xmlReader.NodeType)
{
case XmlNodeType.Element:
tagextractor.SetTag(xmlReader.Name);
break;
case XmlNodeType.Text:
tagextractor.Add(xmlReader.Value);
break;
case XmlNodeType.EndElement:
tagextractor.CloseTag(profile);
break;
}
}
}
}
finally
{
if (inputStream != null)
{
inputStream.Close();
}
}
Console.WriteLine(lang + ": " + tagextractor.Count);
return profile;
}
#endregion
}
}

View File

@ -1,22 +0,0 @@
using System;
namespace NLangDetect.Core
{
[Serializable]
public class InternalException : Exception
{
#region Constructor(s)
public InternalException(string message, Exception innerException)
: base(message, innerException)
{
}
public InternalException(string message)
: this(message, null)
{
}
#endregion
}
}

View File

@ -1,45 +0,0 @@
using System.Globalization;
namespace NLangDetect.Core
{
// TODO IMM HI: name??
public class Language
{
#region Constructor(s)
public Language(string name, double probability)
{
Name = name;
Probability = probability;
}
#endregion
#region Object overrides
public override string ToString()
{
if (Name == null)
{
return "";
}
return
string.Format(
CultureInfo.InvariantCulture.NumberFormat,
"{0}:{1:0.000000}",
Name,
Probability);
}
#endregion
#region Properties
public string Name { get; set; }
public double Probability { get; set; }
#endregion
}
}

View File

@ -1,37 +0,0 @@
using System;
using MediaBrowser.Model.Serialization;
namespace NLangDetect.Core
{
// TODO IMM HI: change to non-static class
// TODO IMM HI: hide other, unnecassary classes via internal?
public static class LanguageDetector
{
private const double _DefaultAlpha = 0.5;
#region Public methods
public static void Initialize(IJsonSerializer json)
{
DetectorFactory.LoadProfiles(json);
}
public static void Release()
{
DetectorFactory.Clear();
}
public static string DetectLanguage(string plainText)
{
if (string.IsNullOrEmpty(plainText)) { throw new ArgumentException("Argument can't be null nor empty.", nameof(plainText)); }
var detector = DetectorFactory.Create(_DefaultAlpha);
detector.Append(plainText);
return detector.Detect();
}
#endregion
}
}

View File

@ -1,23 +0,0 @@
using System;
namespace NLangDetect.Core
{
public class NLangDetectException : Exception
{
#region Constructor(s)
public NLangDetectException(string message, ErrorCode errorCode)
: base(message)
{
ErrorCode = errorCode;
}
#endregion
#region Properties
public ErrorCode ErrorCode { get; private set; }
#endregion
}
}

View File

@ -1,33 +0,0 @@
using System;
using System.Collections.Generic;
namespace NLangDetect.Core
{
public class ProbVector
{
private readonly Dictionary<int, double> _dict = new Dictionary<int, double>();
public double this[int key]
{
get
{
return _dict.TryGetValue(key, out var value) ? value : 0.0;
}
set
{
if (Math.Abs(value) < double.Epsilon)
{
if (_dict.ContainsKey(key))
{
_dict.Remove(key);
}
return;
}
_dict[key] = value;
}
}
}
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,118 +0,0 @@
using System.Collections.Generic;
using System.Text.RegularExpressions;
namespace NLangDetect.Core.Utils
{
public class LangProfile
{
private const int MinimumFreq = 2;
private const int LessFreqRatio = 100000;
public string name { get; set; }
public Dictionary<string, int> freq { get; set; }
public int[] n_words { get; set; }
#region Constructor(s)
public LangProfile()
{
freq = new Dictionary<string, int>();
n_words = new int[NGram.GramsCount];
}
public LangProfile(string name)
{
this.name = name;
freq = new Dictionary<string, int>();
n_words = new int[NGram.GramsCount];
}
#endregion
#region Public methods
public void Add(string gram)
{
if (name == null || gram == null) return; // Illegal
int len = gram.Length;
if (len < 1 || len > NGram.GramsCount) return; // Illegal
n_words[len - 1]++;
if (freq.ContainsKey(gram))
{
freq[gram] = freq[gram] + 1;
}
else
{
freq.Add(gram, 1);
}
}
public void OmitLessFreq()
{
if (name == null) return; // Illegal
int threshold = n_words[0] / LessFreqRatio;
if (threshold < MinimumFreq) threshold = MinimumFreq;
ICollection<string> keys = freq.Keys;
int roman = 0;
// TODO IMM HI: move up?
var regex1 = new Regex("^[A-Za-z]$", RegexOptions.Compiled);
var keysToRemove = new List<string>();
foreach (string key in keys)
{
int count = freq[key];
if (count <= threshold)
{
n_words[key.Length - 1] -= count;
keysToRemove.Add(key);
}
else
{
if (regex1.IsMatch(key))
{
roman += count;
}
}
}
foreach (string keyToRemove in keysToRemove)
{
freq.Remove(keyToRemove);
}
// roman check
keysToRemove = new List<string>();
if (roman < n_words[0] / 3)
{
ICollection<string> keys2 = freq.Keys;
// TODO IMM HI: move up?
var regex2 = new Regex(".*[A-Za-z].*", RegexOptions.Compiled);
foreach (string key in keys2)
{
int count = freq[key];
if (regex2.IsMatch(key))
{
n_words[key.Length - 1] -= count;
keysToRemove.Add(key);
}
}
foreach (string keyToRemove in keysToRemove)
{
freq.Remove(keyToRemove);
}
}
}
#endregion
}
}

View File

@ -1,88 +0,0 @@
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
namespace NLangDetect.Core.Utils
{
public static class Messages
{
private static readonly Dictionary<string, string> _messages;
static Messages()
{
_messages = LoadMessages();
}
public static string getString(string key)
{
return
_messages.TryGetValue(key, out var value)
? value
: string.Format("!{0}!", key);
}
private static Dictionary<string, string> LoadMessages()
{
var manifestName = typeof(Messages).Assembly.GetManifestResourceNames().FirstOrDefault(i => i.IndexOf("messages.properties", StringComparison.Ordinal) != -1);
var messagesStream =
typeof(Messages).Assembly
.GetManifestResourceStream(manifestName);
if (messagesStream == null)
{
throw new InternalException(string.Format("Couldn't get embedded resource named '{0}'.", manifestName));
}
using (messagesStream)
using (var sr = new StreamReader(messagesStream))
{
var messages = new Dictionary<string, string>();
while (!sr.EndOfStream)
{
string line = sr.ReadLine();
if (string.IsNullOrEmpty(line))
{
continue;
}
string[] keyValue = line.Split('=');
if (keyValue.Length != 2)
{
throw new InternalException(string.Format("Invalid format of the 'Messages.properties' resource. Offending line: '{0}'.", line.Trim()));
}
string key = keyValue[0];
string value = UnescapeUnicodeString(keyValue[1]);
messages.Add(key, value);
}
return messages;
}
}
/// <remarks>
/// Taken from: http://stackoverflow.com/questions/1615559/converting-unicode-strings-to-escaped-ascii-string/1615860#1615860
/// </remarks>
private static string UnescapeUnicodeString(string s)
{
if (s == null)
{
return null;
}
return
Regex.Replace(
s,
@"\\u(?<Value>[a-zA-Z0-9]{4})",
match => ((char)int.Parse(match.Groups["Value"].Value, NumberStyles.HexNumber)).ToString());
}
}
}

View File

@ -1,330 +0,0 @@
// TODO IMM HI: check which classes can be made internal?
using System.Collections.Generic;
using System.Text;
using NLangDetect.Core.Extensions;
namespace NLangDetect.Core.Utils
{
public class NGram
{
public const int GramsCount = 3;
private static readonly string Latin1Excluded = Messages.getString("NGram.LATIN1_EXCLUDE");
private static readonly string[] CjkClass =
{
#region CJK classes
Messages.getString("NGram.KANJI_1_0"),
Messages.getString("NGram.KANJI_1_2"),
Messages.getString("NGram.KANJI_1_4"),
Messages.getString("NGram.KANJI_1_8"),
Messages.getString("NGram.KANJI_1_11"),
Messages.getString("NGram.KANJI_1_12"),
Messages.getString("NGram.KANJI_1_13"),
Messages.getString("NGram.KANJI_1_14"),
Messages.getString("NGram.KANJI_1_16"),
Messages.getString("NGram.KANJI_1_18"),
Messages.getString("NGram.KANJI_1_22"),
Messages.getString("NGram.KANJI_1_27"),
Messages.getString("NGram.KANJI_1_29"),
Messages.getString("NGram.KANJI_1_31"),
Messages.getString("NGram.KANJI_1_35"),
Messages.getString("NGram.KANJI_2_0"),
Messages.getString("NGram.KANJI_2_1"),
Messages.getString("NGram.KANJI_2_4"),
Messages.getString("NGram.KANJI_2_9"),
Messages.getString("NGram.KANJI_2_10"),
Messages.getString("NGram.KANJI_2_11"),
Messages.getString("NGram.KANJI_2_12"),
Messages.getString("NGram.KANJI_2_13"),
Messages.getString("NGram.KANJI_2_15"),
Messages.getString("NGram.KANJI_2_16"),
Messages.getString("NGram.KANJI_2_18"),
Messages.getString("NGram.KANJI_2_21"),
Messages.getString("NGram.KANJI_2_22"),
Messages.getString("NGram.KANJI_2_23"),
Messages.getString("NGram.KANJI_2_28"),
Messages.getString("NGram.KANJI_2_29"),
Messages.getString("NGram.KANJI_2_30"),
Messages.getString("NGram.KANJI_2_31"),
Messages.getString("NGram.KANJI_2_32"),
Messages.getString("NGram.KANJI_2_35"),
Messages.getString("NGram.KANJI_2_36"),
Messages.getString("NGram.KANJI_2_37"),
Messages.getString("NGram.KANJI_2_38"),
Messages.getString("NGram.KANJI_3_1"),
Messages.getString("NGram.KANJI_3_2"),
Messages.getString("NGram.KANJI_3_3"),
Messages.getString("NGram.KANJI_3_4"),
Messages.getString("NGram.KANJI_3_5"),
Messages.getString("NGram.KANJI_3_8"),
Messages.getString("NGram.KANJI_3_9"),
Messages.getString("NGram.KANJI_3_11"),
Messages.getString("NGram.KANJI_3_12"),
Messages.getString("NGram.KANJI_3_13"),
Messages.getString("NGram.KANJI_3_15"),
Messages.getString("NGram.KANJI_3_16"),
Messages.getString("NGram.KANJI_3_18"),
Messages.getString("NGram.KANJI_3_19"),
Messages.getString("NGram.KANJI_3_22"),
Messages.getString("NGram.KANJI_3_23"),
Messages.getString("NGram.KANJI_3_27"),
Messages.getString("NGram.KANJI_3_29"),
Messages.getString("NGram.KANJI_3_30"),
Messages.getString("NGram.KANJI_3_31"),
Messages.getString("NGram.KANJI_3_32"),
Messages.getString("NGram.KANJI_3_35"),
Messages.getString("NGram.KANJI_3_36"),
Messages.getString("NGram.KANJI_3_37"),
Messages.getString("NGram.KANJI_3_38"),
Messages.getString("NGram.KANJI_4_0"),
Messages.getString("NGram.KANJI_4_9"),
Messages.getString("NGram.KANJI_4_10"),
Messages.getString("NGram.KANJI_4_16"),
Messages.getString("NGram.KANJI_4_17"),
Messages.getString("NGram.KANJI_4_18"),
Messages.getString("NGram.KANJI_4_22"),
Messages.getString("NGram.KANJI_4_24"),
Messages.getString("NGram.KANJI_4_28"),
Messages.getString("NGram.KANJI_4_34"),
Messages.getString("NGram.KANJI_4_39"),
Messages.getString("NGram.KANJI_5_10"),
Messages.getString("NGram.KANJI_5_11"),
Messages.getString("NGram.KANJI_5_12"),
Messages.getString("NGram.KANJI_5_13"),
Messages.getString("NGram.KANJI_5_14"),
Messages.getString("NGram.KANJI_5_18"),
Messages.getString("NGram.KANJI_5_26"),
Messages.getString("NGram.KANJI_5_29"),
Messages.getString("NGram.KANJI_5_34"),
Messages.getString("NGram.KANJI_5_39"),
Messages.getString("NGram.KANJI_6_0"),
Messages.getString("NGram.KANJI_6_3"),
Messages.getString("NGram.KANJI_6_9"),
Messages.getString("NGram.KANJI_6_10"),
Messages.getString("NGram.KANJI_6_11"),
Messages.getString("NGram.KANJI_6_12"),
Messages.getString("NGram.KANJI_6_16"),
Messages.getString("NGram.KANJI_6_18"),
Messages.getString("NGram.KANJI_6_20"),
Messages.getString("NGram.KANJI_6_21"),
Messages.getString("NGram.KANJI_6_22"),
Messages.getString("NGram.KANJI_6_23"),
Messages.getString("NGram.KANJI_6_25"),
Messages.getString("NGram.KANJI_6_28"),
Messages.getString("NGram.KANJI_6_29"),
Messages.getString("NGram.KANJI_6_30"),
Messages.getString("NGram.KANJI_6_32"),
Messages.getString("NGram.KANJI_6_34"),
Messages.getString("NGram.KANJI_6_35"),
Messages.getString("NGram.KANJI_6_37"),
Messages.getString("NGram.KANJI_6_39"),
Messages.getString("NGram.KANJI_7_0"),
Messages.getString("NGram.KANJI_7_3"),
Messages.getString("NGram.KANJI_7_6"),
Messages.getString("NGram.KANJI_7_7"),
Messages.getString("NGram.KANJI_7_9"),
Messages.getString("NGram.KANJI_7_11"),
Messages.getString("NGram.KANJI_7_12"),
Messages.getString("NGram.KANJI_7_13"),
Messages.getString("NGram.KANJI_7_16"),
Messages.getString("NGram.KANJI_7_18"),
Messages.getString("NGram.KANJI_7_19"),
Messages.getString("NGram.KANJI_7_20"),
Messages.getString("NGram.KANJI_7_21"),
Messages.getString("NGram.KANJI_7_23"),
Messages.getString("NGram.KANJI_7_25"),
Messages.getString("NGram.KANJI_7_28"),
Messages.getString("NGram.KANJI_7_29"),
Messages.getString("NGram.KANJI_7_32"),
Messages.getString("NGram.KANJI_7_33"),
Messages.getString("NGram.KANJI_7_35"),
Messages.getString("NGram.KANJI_7_37"),
#endregion
};
private static readonly Dictionary<char, char> _cjkMap;
private StringBuilder _grams;
private bool _capitalword;
#region Constructor(s)
static NGram()
{
_cjkMap = new Dictionary<char, char>();
foreach (string cjk_list in CjkClass)
{
char representative = cjk_list[0];
for (int i = 0; i < cjk_list.Length; i++)
{
_cjkMap.Add(cjk_list[i], representative);
}
}
}
public NGram()
{
_grams = new StringBuilder(" ");
_capitalword = false;
}
#endregion
#region Public methods
public static char Normalize(char ch)
{
UnicodeBlock? unicodeBlock = ch.GetUnicodeBlock();
if (!unicodeBlock.HasValue)
{
return ch;
}
switch (unicodeBlock.Value)
{
case UnicodeBlock.BasicLatin:
{
if (ch < 'A' || (ch < 'a' && ch > 'Z') || ch > 'z')
{
return ' ';
}
break;
}
case UnicodeBlock.Latin1Supplement:
{
if (Latin1Excluded.IndexOf(ch) >= 0)
{
return ' ';
}
break;
}
case UnicodeBlock.GeneralPunctuation:
{
return ' ';
}
case UnicodeBlock.Arabic:
{
if (ch == '\u06cc')
{
return '\u064a';
}
break;
}
case UnicodeBlock.LatinExtendedAdditional:
{
if (ch >= '\u1ea0')
{
return '\u1ec3';
}
break;
}
case UnicodeBlock.Hiragana:
{
return '\u3042';
}
case UnicodeBlock.Katakana:
{
return '\u30a2';
}
case UnicodeBlock.Bopomofo:
case UnicodeBlock.BopomofoExtended:
{
return '\u3105';
}
case UnicodeBlock.CjkUnifiedIdeographs:
{
if (_cjkMap.ContainsKey(ch))
{
return _cjkMap[ch];
}
break;
}
case UnicodeBlock.HangulSyllables:
{
return '\uac00';
}
}
return ch;
}
public void AddChar(char ch)
{
ch = Normalize(ch);
char lastchar = _grams[_grams.Length - 1];
if (lastchar == ' ')
{
_grams = new StringBuilder(" ");
_capitalword = false;
if (ch == ' ') return;
}
else if (_grams.Length >= GramsCount)
{
_grams.Remove(0, 1);
}
_grams.Append(ch);
if (char.IsUpper(ch))
{
if (char.IsUpper(lastchar)) _capitalword = true;
}
else
{
_capitalword = false;
}
}
public string Get(int n)
{
if (_capitalword)
{
return null;
}
int len = _grams.Length;
if (n < 1 || n > 3 || len < n)
{
return null;
}
if (n == 1)
{
char ch = _grams[len - 1];
if (ch == ' ')
{
return null;
}
return ch.ToString();
}
// TODO IMM HI: is ToString() here effective?
return _grams.ToString().SubSequence(len - n, len);
}
#endregion
}
}

View File

@ -1,76 +0,0 @@
using System.Text;
namespace NLangDetect.Core.Utils
{
public class TagExtractor
{
// TODO IMM HI: do the really need to be internal?
internal string Target;
internal int Threshold;
internal StringBuilder StringBuilder;
internal string Tag;
#region Constructor(s)
public TagExtractor(string tag, int threshold)
{
Target = tag;
Threshold = threshold;
Count = 0;
Clear();
}
#endregion
#region Public methods
public void Clear()
{
StringBuilder = new StringBuilder();
Tag = null;
}
public void SetTag(string tag)
{
Tag = tag;
}
public void Add(string line)
{
if (Tag == Target && line != null)
{
StringBuilder.Append(line);
}
}
public void CloseTag(LangProfile profile)
{
if (profile != null && Tag == Target && StringBuilder.Length > Threshold)
{
var gram = new NGram();
for (int i = 0; i < StringBuilder.Length; i++)
{
gram.AddChar(StringBuilder[i]);
for (int n = 1; n <= NGram.GramsCount; n++)
{
profile.Add(gram.Get(n));
}
}
Count++;
}
Clear();
}
#endregion
#region Properties
public int Count { get; private set; }
#endregion
}
}

View File

@ -1,128 +0,0 @@
NGram.CJK_KANJI_EXCLUDE=\u0020\uFF08\uFF09
NGram.LATIN1_EXCLUDE=\u00A0\u00AB\u00B0\u00BB
NGram.KANJI_1_0=\u4F7C\u6934
NGram.KANJI_1_2=\u88CF\u95B2
NGram.KANJI_1_4=\u7027\u7DCB
NGram.KANJI_1_8=\u4E80\u4E9C\u4EEE\u5263\u5264\u5270\u52C5\u52E7\u52F2\u53B3\u5449\u58CA\u58CC\u5968\u59C9\u59EB\u5D8B\u5DE3\u5E30\u6075\u622F\u623B\u6255\u629C\u629E\u62DD\u62E1\u633F\u635C\u63FA\u6442\u6589\u658E\u6669\u66A6\u66FD\u6804\u685C\u6B69\u6B6F\u6BBB\u6C37\u6C5A\u6D44\u6E09\u6E0B\u6E13\u6EDD\u713C\u72A0\u731F\u7363\u7A32\u7A42\u7A93\u7ADC\u7C8B\u7C9B\u7DD1\u7E01\u7E04\u7E26\u7E4A\u7E4B\u7E70\u8074\u8107\u8133\u81D3\u820E\u8217\u8358\u83D3\u85AC\u8987\u899A\u8B21\u8B72\u8B83\u8CDB\u9045\u90F7\u91C8\u9271\u9283\u92AD\u9665\u967A\u96A0\u96A3\u96B7\u970A\u983C\u9854\u9855\u99C6\u9A12\u9ED9\u9F62
NGram.KANJI_1_11=\u67D8\u831C
NGram.KANJI_1_12=\u5742\u57FC\u5800
NGram.KANJI_1_13=\u4E3C\u4E98\u4FE3\u4FF5\u5072\u51A8\u53A9\u5451\u546A\u5504\u5516\u55A9\u55B0\u5618\u5642\u565B\u567A\u56A2\u57F4\u5840\u5841\u58F1\u59F6\u5A2F\u5B22\u5B8D\u5DCC\u5EFB\u5F10\u60A9\u60E3\u61D0\u62F6\u63B4\u63BB\u63C3\u6681\u685F\u6955\u6962\u696F\u698A\u698E\u69FB\u6A2B\u6A7F\u6B53\u6BD8\u6D99\u6E07\u7460\u7473\u7560\u7573\u758E\u7690\u7815\u783A\u7962\u7A4F\u7A63\u7AEA\u7BED\u7CA7\u7D18\u7D3A\u7E4D\u8061\u8218\u8276\u82C5\u8597\u85AB\u86CD\u874B\u88FE\u8ACF\u8B90\u8D0B\u8FBF\u9013\u9061\u914E\u9154\u918D\u9190\u91A4\u91B8\u9262\u929A\u92ED\u92F3\u932C\u96EB\u96F0\u976D\u97EE\u981A\u99C4\u9A28\u9AC4\u9B8E\u9C10\u9D0E\u9D5C\u9D8F\u9E78\u9EB9\u9EBA\u9EBF
NGram.KANJI_1_14=\u5F66\u7984\u7985
NGram.KANJI_1_16=\u5861\u7B25\u844E\u9419\u9D07
NGram.KANJI_1_18=\u5039\u514E\u51E7\u51EA\u5301\u5302\u5859\u58F7\u59AC\u5C2D\u5CA8\u5EFC\u6357\u64B9\u67CA\u6802\u6834\u68BC\u6900\u6919\u691B\u69D9\u6AE8\u6D9C\u6E8C\u6F09\u6F45\u701E\u7026\u7114\u72DB\u7577\u75E9\u783F\u7895\u7A50\u7AC3\u7B48\u7B86\u7BAA\u7C7E\u7C82\u7C8D\u7CCE\u7D2C\u7F6B\u7FEB\u8557\u85AE\u86CE\u877F\u8997\u8ACC\u8CB0\u8CCE\u8FE9\u9197\u920E\u9266\u927E\u92F2\u9306\u9453\u9784\u982C\u9834\u99C8\u9BF5\u9C2F\u9D2C
NGram.KANJI_1_22=\u6762\u6A17\u887F
NGram.KANJI_1_27=\u4E21\u4E57\u4ECF\u4F1D\u4FA1\u4FF3\u5024\u50CD\u5150\u5186\u51E6\u52B4\u52B9\u5358\u53CE\u55B6\u56E3\u56F2\u56F3\u570F\u5727\u5869\u5897\u58F2\u5909\u5B9F\u5BDB\u5BFE\u5C02\u5DFB\u5E2F\u5E81\u5E83\u5EC3\u5F3E\u5F93\u5FB3\u5FB4\u5FDC\u60AA\u6226\u6238\u6271\u62E0\u6319\u63B2\u6483\u64AE\u67A0\u67FB\u691C\u697D\u69D8\u6A29\u6B73\u6B74\u6BCE\u6C17\u6CA2\u6D5C\u6E08\u6E80\u702C\u7523\u767A\u770C\u7D4C\u7D75\u7D76\u7D99\u7D9A\u7DCF\u8535\u8846\u89A7\u89B3\u8A33\u8AAC\u8AAD\u8C4A\u8EE2\u8EFD\u8FBA\u8FBC\u9244\u9332\u95A2\u95D8\u96D1\u99C5\u9A13\u9ED2
NGram.KANJI_1_29=\u4F0E\u4FFA\u5036\u53E1\u54B2\u5506\u583A\u5C3B\u5CAC\u5CE0\u5CEF\u6803\u68B6\u6A0B\u6A8E\u73C2\u7551\u7826\u7881\u79B0\u7B39\u8429\u8599\u8FBB\u9162\u95C7\u9688\u96BC\u9AEA\u9DF2
NGram.KANJI_1_31=\u5553\u938C
NGram.KANJI_1_35=\u51B4\u564C\u57DC\u5B2C\u6822\u685D\u690B\u6973\u6C93\u7511\u7887\u7A17\u83D6\u847A\u8494\u8526\u854E\u85C1\u86F8\u88B4\u93A7\u9B92\u9C39\u9C48\u9C52
NGram.KANJI_2_0=\u4E2B\u4EC3\u4F09\u4F57\u4F6F\u4F70\u4FD1\u4FDA\u500C\u5043\u516E\u5189\u5241\u530D\u5310\u5412\u54AB\u54AF\u5514\u5556\u55B1\u561F\u573B\u586D\u587D\u58C5\u58D1\u5914\u5A62\u5A6A\u5AE6\u5B40\u5B5B\u5B70\u5BB8\u5CD2\u5D01\u5D34\u5E11\u5EA0\u5F0B\u5F2D\u5F87\u607F\u621B\u6221\u6289\u63A3\u6452\u646D\u64D8\u652B\u6600\u6631\u6641\u66F7\u6773\u67B8\u67DD\u67DE\u6829\u68FB\u69AD\u6A47\u6C10\u6C68\u6C74\u6C85\u6CD3\u6D31\u6D93\u6D94\u6DB8\u6DBF\u6DC5\u6E6E\u6EA7\u6EB4\u6EC2\u6F2A\u6F2F\u6FB9\u6FC2\u6FDB\u6FEE\u70AF\u70FD\u7166\u726F\u729B\u739F\u73DE\u740A\u746D\u749C\u749F\u74E0\u759D\u75A3\u75CD\u75DE\u7600\u7620\u7688\u7738\u7762\u776B\u777D\u77E3\u781D\u7837\u78A3\u7946\u7B60\u7F44\u7F54\u7F5F\u7FAF\u8026\u807F\u80C4\u80DB\u80ED\u81E7\u824B\u82B7\u82E3\u8392\u846D\u84D3\u8548\u85B9\u86DE\u873F\u8753\u8782\u87AB\u87B3\u87D1\u87E0\u87FE\u8821\u88D8\u88E8\u8913\u891A\u892B\u8983\u8C3F\u8C49\u8C82\u8D6D\u8DE4\u8E1D\u8E1E\u8E7C\u8FE5\u8FE8\u9005\u9035\u9050\u9082\u9083\u9095\u90E2\u911E\u91AE\u91B4\u93D6\u9621\u968D\u96B9\u96D2\u9711\u9713\u973E\u9AB0\u9AB7\u9AE6\u9B03\u9B23\u9EDC\u9EEF
NGram.KANJI_2_1=\u4E82\u4F48\u4F54\u50F9\u5167\u528D\u52DE\u532F\u537B\u53C3\u5433\u555F\u55AE\u56B4\u570D\u5716\u58D3\u58DE\u5920\u5967\u5A1B\u5BEB\u5BEC\u5C08\u5C0D\u5C46\u5C6C\u5CFD\u5E36\u5E6B\u5EC8\u5EF3\u5F48\u5F91\u5F9E\u5FB5\u6046\u60E1\u61F7\u6232\u6236\u64C7\u64CA\u64D4\u64DA\u64F4\u651D\u6578\u65B7\u6649\u6A13\u6A23\u6A6B\u6A94\u6AA2\u6B0A\u6B50\u6B61\u6B72\u6B77\u6B78\u6C92\u6EAB\u6EFF\u6FD5\u6FDF\u71DF\u722D\u72C0\u734E\u737B\u746A\u7522\u773E\u78BC\u7A69\u7C3D\u7CB5\u7D55\u7D72\u7DA0\u7DAB\u7DE3\u7E5E\u7E6A\u7E7C\u7E8C\u8072\u807D\u8085\u812B\u8166\u8173\u81D8\u8209\u820A\u8332\u838A\u840A\u85E5\u860B\u8655\u865B\u88DD\u89BA\u89BD\u89C0\u8AAA\u8B6F\u8B7D\u8B8A\u8B93\u8C50\u8CF4\u8E64\u8F15\u8F49\u8FA6\u8FAD\u9109\u9130\u91AB\u91CB\u92B7\u9304\u9322\u95CA\u96A8\u96AA\u96B1\u96B8\u96D6\u96D9\u96DC\u9748\u975C\u986F\u9918\u99DB\u9A57\u9B25\u9EA5\u9EC3\u9EDE\u9F52
NGram.KANJI_2_4=\u514C\u51AA\u5614\u56AE\u56C2\u582F\u58FA\u5B0C\u5D11\u5DD2\u5DD6\u5E40\u5E5F\u5EEC\u6137\u6417\u6488\u64F2\u652A\u6582\u6689\u689F\u68D7\u69D3\u6A97\u6AB8\u6ABB\u6AC3\u6ADA\u6B7F\u6BB2\u6EA5\u6EC4\u6EF2\u7009\u701D\u7028\u703E\u7165\u71BE\u721B\u7463\u7464\u7469\u7515\u7526\u75FA\u7621\u779E\u79B1\u7A1F\u7AC4\u7AC7\u7B8F\u7BE9\u7D2E\u7D68\u7D8F\u7DB8\u7DBA\u7E46\u7E79\u7F4C\u7F88\u8070\u8073\u8076\u81BE\u82BB\u83A2\u858A\u8591\u861A\u8778\u87EC\u8805\u880D\u893B\u8A1B\u8A25\u8A36\u8A85\u8AA6\u8B17\u8B28\u8CB6\u8CE4\u8D16\u8D1B\u8ECB\u9112\u9214\u9249\u93AC\u9594\u9598\u95BB\u95D5\u965E\u96B4\u97DC\u9821\u9824\u9921\u9952\u9A55\u9A5B\u9B1A\u9C13\u9D09\u9DAF\u9E1A\u9E75\u9F67
NGram.KANJI_2_9=\u4E9F\u4F6C\u4FDE\u4FFE\u5029\u5140\u51A2\u5345\u539D\u53FB\u54C7\u5599\u560E\u561B\u563B\u566C\u5676\u5729\u574D\u57E4\u595A\u598D\u5A1F\u5A25\u5A77\u5AB2\u5AD6\u5BF0\u5C2C\u5CEA\u5E37\u5F08\u6059\u606A\u6096\u609A\u62A8\u6555\u6556\u66E6\u675E\u68E3\u69BB\u6BCB\u6BD3\u6C1F\u6C26\u6C81\u6DC4\u6DDE\u6E32\u6E44\u6E4D\u6F33\u6F7C\u6FA7\u701A\u701B\u715C\u741B\u7428\u7480\u74A8\u7504\u752C\u768B\u76CE\u78CA\u78FA\u79BA\u7C27\u8046\u81FB\u8331\u8393\u83C1\u8403\u8438\u843C\u8446\u85B0\u87D2\u8862\u8DC6\u9074\u9131\u9672\u96EF\u9704\u9706\u977C\u9ABC\u9E92\u9ECF
NGram.KANJI_2_10=\u51BD\u5704\u7350\u73A5
NGram.KANJI_2_11=\u4E15\u4EA2\u4F5A\u50D6\u5349\u53DF\u5484\u5958\u5B34\u5B5A\u5C91\u5E1B\u5F77\u61CB\u61FF\u620C\u620D\u622E\u6248\u6538\u660A\u664F\u678B\u67E9\u69B7\u69C3\u6CB1\u6CD7\u6D5A\u6DAA\u6DC7\u7099\u71EE\u7325\u7425\u7455\u747E\u749E\u75B5\u7678\u7693\u76C2\u77B0\u77BF\u78CB\u7957\u795A\u797A\u7A79\u7B08\u7B75\u7BB4\u7F9A\u7FB2\u7FDF\u80E5\u81BA\u8340\u837C\u8398\u8559\u85A8\u86DF\u8734\u8882\u88F4\u8936\u900D\u907D\u9642\u96C9\u9AFB\u9E9D\u9EBE
NGram.KANJI_2_12=\u5F57\u7940
NGram.KANJI_2_13=\u5191\u7791\u792C\u7D46
NGram.KANJI_2_15=\u5713\u58FD\u5D17\u5D19\u5DBC\u5F4C\u6191\u64A5\u687F\u69AE\u6AFB\u6EEC\u6F3F\u6FE4\u6FF1\u6FFE\u700B\u74CA\u76E1\u76E7\u7926\u792B\u79AE\u7AA9\u7C43\u7C4C\u7C64\u7DBD\u81A0\u856D\u8594\u8606\u8A62\u8AF7\u8CC8\u8CE3\u8D99\u8F1B\u8F3B\u9059\u9127\u9264\u947D\u95A9\u97CB\u980C\u9838\u9846\u99AE\u9A19\u9B06\u9B91\u9F4A\u9F4B
NGram.KANJI_2_16=\u4E69\u4EC4\u4EDF\u4EF3\u4F0B\u4F5E\u5000\u5028\u50E5\u513B\u5157\u51DC\u52D7\u530F\u5379\u53F5\u5471\u5477\u5555\u555C\u557B\u5594\u55B2\u55C9\u560D\u5616\u562E\u5630\u5653\u5657\u566F\u56A8\u56B6\u5820\u5880\u58CE\u58D9\u5950\u5969\u596D\u599E\u59B3\u59CD\u59D2\u5A40\u5AA7\u5ABC\u5AD7\u5AD8\u5B0B\u5B24\u5B38\u5B53\u5C5C\u5D06\u5D47\u5D94\u5D9D\u5E57\u5EC4\u5F46\u5FAC\u60BD\u60D8\u6123\u615D\u615F\u6175\u618A\u61AB\u61E3\u623E\u6308\u636B\u645F\u6519\u6595\u6698\u66B8\u67D9\u6840\u695D\u696E\u6979\u69C1\u69E8\u6AEC\u6AFA\u6B5F\u6CAC\u6CE0\u6CEF\u6D0C\u6D36\u6DD2\u6DD9\u6DE6\u6DEC\u6E5F\u6FA0\u6FEC\u7156\u71C4\u71DC\u71EC\u71FC\u720D\u7230\u7292\u7296\u72A2\u72CE\u7357\u737A\u7380\u7386\u73A8\u73EE\u743F\u74A6\u74CF\u74D4\u74DA\u755A\u75A5\u75B3\u75C2\u75E0\u75F1\u75FF\u7601\u7609\u7646\u7658\u769A\u76B0\u774F\u775C\u778B\u77BD\u77C7\u7843\u787F\u78F4\u79C8\u7A88\u7A95\u7AFD\u7B1E\u7B67\u7B9D\u7BCC\u7C0D\u7C11\u7C37\u7C40\u7C6E\u7CB3\u7CBD\u7D09\u7D31\u7D40\u7D5B\u7D70\u7D91\u7D9E\u7DB0\u7DD9\u7DF9\u7E08\u7E11\u7E1D\u7E35\u7E52\u7FB6\u7FBF\u7FEE\u8012\u801C\u8028\u8052\u8123\u8188\u81C3\u81DA\u81FE\u8210\u82BE\u83A0\u83D4\u8407\u8435\u8477\u849E\u84C6\u84CA\u85F9\u867A\u86B5\u86B6\u86C4\u8706\u8707\u870A\u8768\u87BB\u8831\u8839\u8879\u8921\u8938\u8964\u89A6\u89AC\u8A10\u8A3E\u8AC2\u8ADB\u8AF3\u8B2B\u8B41\u8B4E\u8B5F\u8B6B\u8B92\u8C55\u8C62\u8C73\u8C8A\u8C8D\u8CB2\u8CB3\u8CD2\u8CE1\u8CFB\u8D0D\u8E34\u8E7A\u8E8A\u8ED4\u8EFE\u8F0A\u8F1C\u8F1E\u8F26\u8FAE\u9088\u90C3\u90FE\u9134\u9148\u91D9\u91E9\u9238\u9239\u923D\u924D\u925A\u9296\u92AC\u92BB\u9315\u9319\u931A\u9321\u9370\u9394\u93A2\u93D8\u93E4\u943A\u9477\u9582\u958E\u95A1\u95C8\u95CC\u95D4\u9658\u966C\u970F\u973D\u9744\u975B\u9766\u97A3\u97A6\u97C1\u97C6\u980A\u9837\u9853\u9870\u98AF\u98B3\u98BA\u98E9\u98ED\u9912\u991B\u991E\u993D\u993F\u99D1\u99DF\u9A01\u9A3E\u9A43\u9A4D\u9ACF\u9AE1\u9B22\u9B58\u9C25\u9C3E\u9C54\u9C56\u9D15\u9D23\u9D89\u9DC2\u9DD3\u9E82\u9E8B\u9EA9\u9EE0\u9EF7\u9F07\u9F2F\u9F34\u9F3E\u9F5F\u9F6C
NGram.KANJI_2_18=\u5155\u520E\u55DF\u56C0\u56C1\u5793\u5FD6\u5FF8\u6029\u60FA\u613E\u6147\u615A\u62C8\u6384\u6883\u6894\u68F9\u6AA3\u6AAE\u6AC2\u6E63\u7032\u70A4\u7146\u71FB\u7228\u72F7\u7370\u7441\u74BF\u75B8\u75E3\u7622\u76CD\u7768\u79E3\u7A60\u7B6E\u7BC1\u7C5F\u7D06\u7E2F\u7E39\u8146\u81CF\u8703\u8729\u8737\u87EF\u88D2\u8A22\u8AC4\u8AF6\u8E59\u8F33\u8F42\u9169\u91B1\u9278\u93C3\u93DD\u9460\u946A\u9785\u9AD1\u9B4D\u9B4E\u9C31\u9D12\u9ECC
NGram.KANJI_2_21=\u502A\u544E\u59AE\u59EC\u5D1B\u66A8\u6BD7\u6C76\u6E1D\u70EF\u742A\u7459\u7FE1\u82EF\u8343\u85C9\u8A79\u90DD
NGram.KANJI_2_22=\u4EDE\u4F7B\u504C\u50EE\u52E3\u52F0\u536E\u54A9\u54BB\u54BF\u54C2\u54E6\u550F\u556A\u55E8\u564E\u5664\u5671\u568F\u56DD\u572F\u57A0\u5809\u5924\u59A3\u59A4\u59E3\u5A13\u5A23\u5B51\u5B73\u5C50\u5C8C\u6035\u60C6\u6106\u6215\u62CE\u62FD\u64ED\u6549\u6554\u655D\u659B\u65CE\u65D6\u6615\u6624\u665E\u6677\u669D\u66E9\u6772\u677C\u696B\u6A84\u6AA0\u6BFD\u6C16\u6C86\u6C94\u6CD6\u6D2E\u6D39\u6F78\u6FB6\u705E\u70CA\u7168\u723B\u7256\u7284\u73B3\u740D\u742F\u7498\u74A9\u752D\u75F3\u7634\u768E\u76B4\u76E5\u77A0\u77DC\u781F\u782D\u7AA0\u7BFE\u7FF1\u80AB\u8174\u81EC\u8202\u8222\u8228\u82DC\u8306\u83FD\u8469\u84FF\u859C\u8617\u86B1\u8722\u8C89\u8D67\u8DCE\u8E49\u8E76\u8E87\u8FE2\u8FE4\u8FF8\u9016\u905B\u9174\u982B\u98E7\u9955\u9B32
NGram.KANJI_2_23=\u4F8F\u5055\u524C\u548E\u5583\u594E\u5CB7\u5ED6\u5F5D\u6021\u66B9\u66F0\u6C55\u6C7E\u6C82\u6E2D\u6EC7\u6ED5\u70B3\u71B9\u72C4\u73C0\u7426\u745C\u748B\u7696\u777F\u79A7\u79B9\u7F8C\u8153\u8339\u8386\u8725\u90B5\u9102\u962E\u9716\u97F6
NGram.KANJI_2_28=\u5733\u57D4\u838E\u8FEA
NGram.KANJI_2_29=\u50ED\u5F29\u62EE\u6A9C\u7BC6\u80F1\u8129\u8171\u822B\u8AEB
NGram.KANJI_2_30=\u4EB3\u4F15\u4FB7\u5006\u509A\u50A2\u5102\u5109\u5115\u5137\u5138\u513C\u524B\u524E\u5277\u528A\u52E6\u52FB\u5331\u5436\u5443\u54FD\u5538\u555E\u55C6\u55C7\u5679\u5690\u5695\u56C9\u56D1\u56EA\u588A\u58E2\u5AFB\u5B2A\u5B43\u5B7F\u5BE2\u5C37\u5D27\u5D84\u5D87\u5DD4\u5EC1\u5EDD\u5F12\u5FA0\u60F1\u616B\u61F5\u61F6\u61FE\u62DA\u6371\u6399\u63C0\u6451\u647B\u6493\u64BB\u64BF\u64C4\u64F1\u64F7\u650F\u652C\u665D\u6684\u6688\u66EC\u672E\u68E7\u69A6\u69ED\u69F3\u6A01\u6AAF\u6AE5\u6BA4\u6BAE\u6BAF\u6BC6\u6C08\u6C2C\u6C59\u6D87\u6EBC\u6ECC\u6EF7\u6F6F\u6F80\u6F86\u6FD8\u6FF0\u6FFA\u7006\u7018\u7030\u7051\u7192\u71C9\u71D9\u71F4\u71FE\u7274\u7377\u74A3\u750C\u7613\u7627\u7661\u7662\u7665\u766E\u7671\u7672\u76BA\u775E\u776A\u778C\u78E7\u7955\u7A08\u7AC5\u7B4D\u7C2B\u7C6C\u7CF0\u7D02\u7D1C\u7D73\u7DA2\u7DB5\u7DDE\u7E09\u7E0A\u7E37\u7E43\u7E61\u7E7D\u7E93\u7F3D\u7FF9\u81A9\u8271\u83F8\u84C0\u8514\u85BA\u86A9\u86FB\u879E\u8814\u8836\u889E\u8932\u896A\u896F\u8993\u89B2\u8A15\u8A16\u8A1D\u8A5B\u8A6C\u8A6D\u8A7C\u8AA1\u8AA3\u8AA5\u8B0A\u8B4F\u8B59\u8B96\u8C48\u8C54\u8CBD\u8CFA\u8D13\u8E89\u8E8B\u8EAA\u8EC0\u8EDB\u8EFC\u8F12\u8F1F\u8F3E\u8F45\u8FFA\u9015\u9183\u919E\u91A3\u91D7\u91F5\u9209\u9215\u923E\u9240\u9251\u9257\u927B\u9293\u92A8\u92C5\u92C7\u92F0\u9333\u935A\u9382\u938A\u9398\u93B3\u93D7\u93DF\u93E2\u93FD\u942B\u942E\u9433\u9463\u9470\u9472\u947E\u95D0\u96CB\u97C3\u97CC\u981C\u9839\u986B\u98B6\u98EA\u9909\u991A\u9935\u993E\u9951\u99A5\u99B1\u99D9\u99DD\u99F1\u9A2B\u9A62\u9A65\u9AAF\u9AD2\u9AEF\u9B0D\u9B28\u9B77\u9BFD\u9C49\u9C5F\u9C78\u9D3F\u9D72\u9DD7\u9E1B\u9EB4\u9EF4\u9F66\u9F94
NGram.KANJI_2_31=\u5DBD\u63C6\u6E3E\u7587\u8AF1\u8B5A\u9695
NGram.KANJI_2_32=\u53A5\u589F\u5CD9\u7109\u7F79\u8006\u8654\u8944\u968B\u96CD
NGram.KANJI_2_35=\u4F47\u4F91\u4FCE\u4FDF\u527D\u535E\u55DA\u56A5\u5879\u5A11\u5B7A\u5CAB\u5CF4\u5EBE\u5F7F\u5FA8\u601B\u606B\u60B8\u610D\u6134\u619A\u61FA\u6369\u6523\u65CC\u66C4\u6727\u6968\u6A05\u6A48\u6B59\u6BEC\u6D35\u6D38\u6E19\u701F\u7064\u711C\u716C\u71A8\u71E7\u7258\u743A\u746F\u75BD\u75D9\u75F2\u7669\u766C\u76DE\u7729\u77BC\u78EC\u792A\u7A37\u7A62\u7BE6\u7C2A\u7C50\u7D07\u7DD8\u7E5A\u7F8B\u7FD5\u7FF3\u8151\u81CD\u8317\u83F4\u85EA\u85FA\u8823\u895E\u89F4\u8A0C\u8A41\u8AA8\u8ACD\u8B10\u8CC1\u8D05\u8D73\u8E4A\u8E85\u8E91\u8EFB\u8F13\u9087\u914A\u91C9\u923F\u93B0\u9403\u95A8\u95AD\u9730\u9865\u9903\u9945\u9949\u99AD\u99E2\u9A6A\u9D26\u9E1E\u9EDD\u9F2C\u9F72
NGram.KANJI_2_36=\u4E9E\u4F86\u5011\u50B3\u5152\u5169\u5340\u5718\u5B78\u5BE6\u5BF6\u5C07\u5EE3\u61C9\u6230\u6703\u689D\u6A02\u6C23\u7063\u7368\u756B\u7576\u767C\u7A31\u7D93\u7E23\u7E3D\u81FA\u8207\u842C\u85DD\u865F\u8B49\u8B80\u8CFD\u908A\u9435\u95DC\u965D\u9AD4\u9EE8
NGram.KANJI_2_37=\u5480\u5580\u5C39\u67EF\u68B5\u6D85\u8521\u90B1
NGram.KANJI_2_38=\u4E1F\u4F96\u4FE0\u50F1\u5118\u522A\u5291\u52C1\u52DB\u52F3\u52F5\u52F8\u53B2\u55CE\u562F\u580A\u5862\u58AE\u58D8\u58DF\u58E9\u58EF\u5925\u593E\u599D\u5ABD\u5C62\u5EC2\u5EDA\u5EE2\u5F4E\u5F65\u6085\u6158\u61FC\u6200\u62CB\u633E\u6416\u6436\u6490\u64CB\u64E0\u64FA\u6514\u651C\u6524\u6558\u6583\u66B1\u66C6\u66C9\u66E0\u6A11\u6A1E\u6A38\u6A62\u6AB3\u6B16\u6B98\u6BBC\u6C2B\u6DDA\u6DE8\u6DEA\u6DFA\u6EEF\u6EFE\u6F32\u6F51\u6F5B\u700F\u71D2\u7210\u7246\u7260\u72A7\u72F9\u7375\u7378\u758A\u760B\u76DC\u76EA\u77DA\u77FD\u78DA\u7919\u797F\u79AA\u7A05\u7A4C\u7ACA\u7C72\u7D81\u7DDD\u7E31\u7E69\u7E6B\u7E73\u7E96\u7E9C\u81BD\u81C9\u81DF\u8259\u8277\u8396\u83A7\u8523\u8525\u860A\u863F\u8667\u87A2\u87F2\u881F\u883B\u89F8\u8B20\u8B74\u8B9A\u8C4E\u8C6C\u8C93\u8CEC\u8D0A\u8D0F\u8D95\u8E10\u8F4E\u8FAF\u8FF4\u905E\u9072\u9081\u908F\u91AC\u91C0\u91C1\u91D0\u921E\u9223\u9245\u929C\u92B3\u92C1\u9336\u934A\u93C8\u9444\u9452\u947C\u947F\u9592\u95B1\u95C6\u95D6\u95E1\u95E2\u96DE\u9742\u978F\u984F\u9871\u98B1\u98C4\u99ED\u9A37\u9A45\u9A5F\u9AEE\u9B27\u9BCA\u9C77\u9D51\u9D5D\u9E79\u9E7C\u9E7D\u9EB5\u9EBC\u9F61\u9F63\u9F90\u9F9C
NGram.KANJI_3_1=\u5283\u7562\u7DEC\u88E1\u8F2F
NGram.KANJI_3_2=\u5009\u502B\u5049\u5075\u507D\u5091\u5098\u50B5\u50B7\u50BE\u5100\u5104\u511F\u518A\u525B\u5289\u5442\u5805\u589C\u58C7\u5922\u596A\u5A66\u5B6B\u5BE7\u5BE9\u5DBA\u5E63\u5E7E\u5FB9\u6163\u616E\u6176\u61B2\u61B6\u61F8\u639B\u63DA\u63EE\u640D\u64B2\u64C1\u64EC\u6557\u6575\u6607\u66AB\u68C4\u6A39\u6C96\u6CC1\u6E1B\u6E6F\u6E9D\u6EC5\u6F01\u6F64\u6FC3\u7058\u707D\u7344\u7642\u76E4\u7832\u790E\u7B46\u7D05\u7D0B\u7D14\u7D19\u7D1B\u7D39\u7D61\u7DB1\u7DCA\u7DD2\u7DE0\u7DE9\u7DEF\u7DF4\u7E2E\u7E3E\u8105\u8108\u81E8\u8266\u84CB\u84EE\u85A9\u885D\u88DC\u8972\u8A02\u8A0E\u8A13\u8A17\u8A2A\u8A34\u8A3A\u8A3C\u8A69\u8A73\u8A95\u8AA0\u8AA4\u8AB2\u8AC7\u8ACB\u8B00\u8B1B\u8B1D\u8B5C\u8C9D\u8C9E\u8CA2\u8CA8\u8CA9\u8CAB\u8CAC\u8CB7\u8CBF\u8CC0\u8CDE\u8CE2\u8CFC\u8D08\u8DE1\u8E8D\u8ECC\u8EDF\u8EF8\u8F14\u8F1D\u8F2A\u8F44\u9055\u9069\u9077\u907C\u90F5\u91DD\u9285\u92FC\u9326\u932F\u9375\u9396\u93AE\u93E1\u9451\u9589\u95A3\u9663\u9670\u9673\u96BB\u9801\u9802\u9803\u9806\u9808\u9810\u983B\u984D\u9858\u9867\u98EF\u98F2\u98FE\u990A\u99D0\u9A0E\u9A5A\u9B5A\u9CE5\u9DB4\u9E97\u9F8D
NGram.KANJI_3_3=\u543E\u5BEE\u5F18\u6590\u725F\u83C5\u85E9\u9E93
NGram.KANJI_3_4=\u5016\u53AD\u5606\u5629\u58BE\u5F14\u6065\u6144\u646F\u647A\u67F5\u6953\u6C3E\u6F2C\u6F97\u6FB1\u7169\u71E6\u71ED\u74BD\u79BF\u7A1C\u7A4E\u7AAF\u7CDE\u7D17\u7D43\u7E55\u7FA8\u807E\u8139\u8490\u8569\u856A\u87FB\u8A23\u8AB9\u8AE6\u8AFA\u8B2C\u8CD1\u91D8\u92F8\u9318\u96DB\u99B4\u9BC9\u9C2D\u9CF6\u9D61\u9DFA
NGram.KANJI_3_5=\u4E26\u4F75\u4FC2\u500B\u5074\u5099\u512A\u5225\u5247\u5275\u5287\u52D5\u52D9\u52DD\u52E2\u5354\u54E1\u554F\u5712\u57F7\u5831\u5834\u5BAE\u5C0E\u5C64\u5CA1\u5CF6\u5E2B\u5E79\u5EAB\u5F35\u5F37\u5F8C\u5FA9\u611B\u614B\u63A1\u63DB\u6642\u66F8\u6771\u696D\u6975\u69CB\u6A19\u6A4B\u6A5F\u6BBA\u6C7A\u6E2C\u6E96\u6F22\u70BA\u7121\u71B1\u7372\u73FE\u74B0\u7570\u76E3\u78BA\u7A2E\u7A4D\u7AF6\u7BC0\u7BC4\u7BC9\u7C21\u7D00\u7D04\u7D0D\u7D1A\u7D30\u7D42\u7D44\u7D50\u7D66\u7D71\u7DAD\u7DDA\u7DE8\u7E54\u7F85\u7FA9\u7FD2\u8056\u805E\u8077\u8208\u83EF\u8449\u8853\u885B\u88FD\u8907\u898B\u898F\u8996\u89AA\u8A08\u8A18\u8A2D\u8A31\u8A55\u8A5E\u8A66\u8A71\u8A72\u8A8C\u8A8D\u8A9E\u8ABF\u8AD6\u8AF8\u8B58\u8B70\u8B77\u8CA0\u8CA1\u8CB4\u8CBB\u8CC7\u8CEA\u8ECA\u8ECD\u8F03\u8F09\u8F38\u8FB2\u9023\u9031\u9032\u904A\u904B\u904E\u9054\u9060\u9078\u907A\u9084\u9280\u9577\u9580\u958B\u9593\u9678\u967D\u968A\u968E\u969B\u96E2\u96E3\u96F2\u96FB\u97D3\u97FF\u9805\u9818\u982D\u984C\u985E\u98A8\u98DB\u9928\u99AC\u9BAE
NGram.KANJI_3_8=\u5F6B\u6C4E\u7B87\u8A70
NGram.KANJI_3_9=\u540B\u5B5C\u826E
NGram.KANJI_3_11=\u4F83\u4FF8\u51CB\u52BE\u53F1\u548B\u558B\u5CB1\u5D69\u5F3C\u620E\u621F\u64E2\u67DA\u6854\u69CC\u6A35\u6C8C\u6E1A\u6F15\u6FE0\u717D\u7252\u7AFA\u82D3\u83DF\u8431\u9041\u9149\u9798
NGram.KANJI_3_12=\u4ED5\u55E3\u572D\u57A3\u587E\u5983\u5A9B\u5C90\u5E61\u672D\u6960\u6F5F\u72D9\u72E9\u757F\u7949\u7950\u7E82\u7FCC\u82B8\u90B8\u91DC\u961C\u9B45
NGram.KANJI_3_13=\u55AB\u6249\u643E\u6841\u68B1\u725D\u7B8B\u7C95\u7E1E\u7F36\u8A03\u8A6B\u8E74\u95A4
NGram.KANJI_3_15=\u50AD\u50D1\u5132\u51F1\u55AC\u5617\u5687\u584A\u59EA\u5B30\u5BF5\u5C0B\u5C4D\u5EDF\u6182\u61A4\u64AB\u64FE\u66A2\u6897\u694A\u69CD\u6B3D\u6BC0\u6D29\u6F38\u7015\u7149\u71C8\u723A\u7336\u7345\u755D\u76C3\u78A9\u798D\u7AAE\u7DFB\u7E2B\u7F75\u7F77\u81E5\u834A\u852D\u85CD\u8755\u8A3B\u8A54\u8AE7\u8B02\u8B39\u8CAA\u8CE6\u8DA8\u8E5F\u8F5F\u905C\u912D\u919C\u92D2\u932B\u937E\u9418\u9583\u9812\u985B\u9905\u99B3\u99C1\u99D5\u9A30\u9CF3\u9D3B\u9D6C
NGram.KANJI_3_16=\u6D6C\u72FD\u77A5\u8956\u9C0D
NGram.KANJI_3_18=\u5919\u5F4A\u6063\u63AC\u649A\u6715\u6AD3\u71D0\u758B\u834F\u85F7\u88DF\u8F61\u93D1\u98F4\u9D60
NGram.KANJI_3_19=\u4F50\u7DB2\u962A
NGram.KANJI_3_22=\u5E96\u75D4\u91C6
NGram.KANJI_3_23=\u5E9A\u6C40\u821C\u839E\u8FED\u9EDB
NGram.KANJI_3_27=\u5F01\u66DC
NGram.KANJI_3_29=\u5023\u5208\u531D\u536F\u53E9\u54C9\u598A\u59BE\u5A20\u5D6F\u5DF3\u66C7\u66D6\u66F3\u6775\u6A3D\u6ADB\u6B86\u6C72\u6E25\u73EA\u7435\u760D\u7656\u7825\u78D0\u7A14\u7A6B\u7B20\u7BE0\u7CF8\u7DAC\u7DBB\u7DBE\u80E4\u80F4\u837B\u8466\u8568\u867B\u8A63\u91E7\u9320\u935B\u9591\u965B\u98E2\u990C\u9913\u9BAB
NGram.KANJI_3_30=\u60B6\u8AD2\u8CC2\u9237\u9328\u934D\u9397\u9830
NGram.KANJI_3_31=\u4FB6\u50D5\u51CD\u559A\u55AA\u5674\u5857\u585A\u5875\u58B3\u596E\u59E6\u5A41\u5D50\u5E25\u5E33\u5F59\u61C7\u61F2\u6368\u6383\u65AC\u68DF\u68F2\u6A3A\u6B04\u6DBC\u6DF5\u6E26\u6E4A\u6E67\u6F54\u6F70\u6FC1\u6FEB\u7159\u727D\u7652\u77EF\u78EF\u798E\u7A40\u7AAA\u7BE4\u7C60\u7CE7\u7CFE\u7D21\u7D33\u7D5E\u7D79\u7DB4\u7DBF\u7E1B\u7E8F\u7F70\u814E\u816B\u8178\u819A\u84BC\u85A6\u865C\u8766\u8A1F\u8A50\u8A60\u8A6E\u8A87\u8A98\u8AB0\u8ADC\u8AED\u8AEE\u8B0E\u8B19\u8CA7\u8CAF\u8CB8\u8CBC\u8CC3\u8CC4\u8CCA\u8CDC\u8CE0\u8CED\u8ED2\u8F29\u8F3F\u91E3\u920D\u9234\u925B\u9298\u9310\u934B\u958F\u95A5\u9727\u97FB\u9811\u984E\u98FC\u98FD\u99D2\u99FF\u9B31\u9BE8\u9C57\u9CE9\u9CF4\u9D28\u9DF9
NGram.KANJI_3_32=\u4E1E\u502D\u51A5\u5321\u58EC\u5A3C\u5BC5\u5CE8\u61A9\u620A\u65A1\u6714\u6853\u6893\u6C50\u6C5D\u7436\u745A\u745B\u773A\u7941\u7947\u8543\u865E\u8C5A\u914B\u99A8\u9AB8
NGram.KANJI_3_35=\u4E99\u5BA5\u5DFD\u608C\u60C7\u60DA\u6190\u61A7\u6753\u6777\u6787\u6B4E\u6F23\u6FE1\u6FEF\u7337\u7827\u786F\u7893\u7ABA\u7B94\u7BB8\u7C3E\u7D62\u7E6D\u80B1\u81BF\u81C6\u821B\u82E7\u83F0\u84D1\u86ED\u8888\u8B01\u8B04\u8F4D\u9291\u92E4\u932E\u9354\u936C\u939A\u9957\u9AED\u9BAA\u9BAD\u9BD6\u9BDB\u9C3B\u9D1B
NGram.KANJI_3_36=\u50C5\u53E2\u5EE0\u65BC\u70CF\u723E\u7D10\u7D9C\u806F\u8607\u862D\u8A0A\u8AFE\u8CD3\u9019\u9813\u9B6F
NGram.KANJI_3_37=\u4EA8\u4F3D\u5384\u5EFF\u60DF\u66DD\u6E5B\u8087\u82D1\u8FE6\u9640\u9E9F
NGram.KANJI_3_38=\u5147\u525D\u5678\u617E\u6372\u79A6\u8ABC\u92EA\u9438\u9817
NGram.KANJI_4_0=\u6D3C\u718F\u74EE\u8712
NGram.KANJI_4_9=\u4F84\u54C6\u5565\u68F1\u6D82\u83C7
NGram.KANJI_4_10=\u4FE9\u4FED\u51FF\u523D\u5300\u5364\u538C\u5450\u5455\u545C\u54D1\u54D7\u5578\u56A3\u58F6\u592F\u5CE6\u5D2D\u5E90\u6073\u607C\u60EB\u61D2\u62E2\u62E3\u631A\u6320\u6323\u6361\u63B7\u63B8\u63BA\u6405\u65A9\u65F7\u6619\u6655\u67A3\u67E0\u6805\u6808\u6866\u6868\u6869\u6A71\u6BE1\u6C79\u6CA5\u6CDE\u6DA4\u6DA7\u6DA9\u6E85\u70DB\u70E6\u70EB\u7115\u724D\u7410\u759F\u75AE\u75EA\u75F9\u762B\u763E\u76B1\u77EB\u783E\u79C3\u7A8D\u7A9C\u7B5D\u7BF1\u7EC5\u7ED2\u7EDE\u7EE3\u7EF7\u7EF8\u7EFD\u7F00\u7F0E\u7F15\u7F1A\u7F20\u7F24\u7F28\u7FA1\u7FD8\u8038\u803B\u804B\u80AE\u817B\u82C7\u8327\u835E\u8367\u83BA\u8424\u864F\u8681\u8682\u8715\u8717\u8721\u8747\u874E\u8845\u886C\u889C\u88E4\u89C5\u8BB6\u8BB9\u8BC0\u8BC5\u8BE1\u8BEB\u8BEC\u8BF5\u8C0E\u8C1A\u8D2E\u8D31\u8D43\u8D4E\u8D58\u8F67\u8F7F\u9489\u9499\u949D\u94A0\u94A5\u94AE\u94BE\u94D0\u94DB\u94F2\u9508\u950C\u951A\u9525\u952D\u952F\u9530\u953B\u9540\u9550\u9570\u9576\u95F0\u960E\u9668\u96CF\u97E7\u9885\u988A\u98A4\u9965\u9975\u997A\u997F\u9985\u998D\u998F\u9A6E\u9A6F\u9A74\u9A79\u9A7C\u9A82\u9A87\u9CA4\u9CC4\u9CCD\u9CD6\u9E20\u9E25\u9E35\u9E3D\u9E45\u9E49\u9E4A\u9E66
NGram.KANJI_4_16=\u576F\u579B\u6345\u78B4\u79EB\u79F8
NGram.KANJI_4_17=\u4E13\u4E1A\u4E1C\u4E24\u4E25\u4E2A\u4E3E\u4E49\u4E50\u4E66\u4E9A\u4EA7\u4EBF\u4ECE\u4EEC\u4EF7\u4F17\u4F20\u5170\u5173\u519B\u51B3\u51E4\u51FB\u5219\u521B\u522B\u529E\u52A1\u52A8\u52BF\u534F\u5355\u536B\u5386\u53BF\u53D1\u53D8\u542F\u5458\u54CD\u56E2\u56ED\u56F4\u56FE\u573A\u5904\u590D\u5934\u5B81\u5B9E\u5BF9\u5BFC\u5C14\u5C9B\u5E26\u5E7F\u5E94\u5F00\u5F20\u5F3A\u603B\u6218\u65E0\u65F6\u663E\u672F\u6743\u6784\u6807\u6C14\u6C49\u707E\u70ED\u73AF\u73B0\u7535\u76D1\u786E\u79CD\u79EF\u7B80\u7C7B\u7EA2\u7EA6\u7EA7\u7EAA\u7EBF\u7EC4\u7EC7\u7ED3\u7EDF\u7EE7\u7EED\u7EF4\u7F16\u7F57\u804C\u8054\u817E\u8282\u82CF\u83B7\u8425\u89C1\u89C2\u89C4\u89C6\u8BA1\u8BA4\u8BAE\u8BAF\u8BB0\u8BB8\u8BBA\u8BBE\u8BC1\u8BC4\u8BD1\u8BDD\u8BE5\u8BED\u8BF4\u8C03\u8D22\u8D23\u8D28\u8D39\u8D44\u8D5B\u8F66\u8F6C\u8F83\u8FBE\u8FC7\u8FD0\u8FD8\u8FD9\u8FDB\u8FDE\u9009\u94C1\u957F\u95E8\u95EE\u95F4\u95FB\u961F\u9633\u9645\u9646\u96BE\u9879\u9884\u9886\u9898\u98CE\u9A6C\u9F99
NGram.KANJI_4_18=\u51DB\u67B7
NGram.KANJI_4_22=\u4FA5\u545B\u5499\u5520\u5570\u56F1\u5A76\u5C96\u60AF\u60ED\u618B\u61A8\u62A0\u62A1\u62E7\u6363\u6390\u63B0\u6400\u6402\u6512\u6748\u70C1\u732C\u765E\u7663\u76CF\u7741\u781A\u7980\u79C6\u79FD\u7AA5\u7B0B\u7B8D\u7BA9\u7BAB\u7BD3\u7CAA\u7EAB\u7ECA\u7EE2\u7F2D\u7F30\u8110\u8113\u81CA\u835A\u8360\u84D6\u852B\u87E5\u8869\u8A8A\u8BA5\u8BF2\u8C05\u8C12\u8D30\u8D4A\u8D61\u8DF7\u8E6D\u8E8F\u8F95\u8F99\u8FAB\u94B3\u94C6\u94E3\u9504\u954A\u9563\u95FA\u9893\u9981\u9992\u9AA1\u9CAB\u9E2F\u9E33\u9EB8
NGram.KANJI_4_24=\u4E22\u4E8F\u4F1E\u4FA3\u5151\u517D\u51BB\u51D1\u5220\u529D\u52CB\u5367\u5389\u5395\u53E0\u53F9\u5413\u548F\u5524\u575E\u575F\u5784\u5792\u57A6\u57AB\u58F3\u5986\u5988\u5A04\u5A07\u5BA0\u5C18\u5C82\u5DE9\u5E10\u5E1C\u5F2F\u60E9\u6124\u629B\u6321\u6324\u635E\u63FD\u6401\u644A\u6491\u655B\u658B\u6635\u67AB\u67DC\u680B\u692D\u6984\u6A31\u6B7C\u6BD9\u6C22\u6CA6\u6CA7\u6CEA\u6CFB\u6CFC\u6D46\u6D47\u6D4A\u6D51\u6DA1\u6E0A\u6E83\u6EE4\u6EE5\u6F9C\u6FD2\u70C2\u7237\u727A\u730E\u7574\u75AF\u7792\u7816\u7845\u78B1\u7A77\u7A91\u7A9D\u7AD6\u7B3C\u7B5B\u7CAE\u7EA4\u7EB1\u7EBA\u7ECE\u7ED1\u7EF0\u7EF3\u7F14\u7F1D\u7F34\u7F62\u8042\u806A\u80A0\u80A4\u80BE\u80BF\u80C0\u810F\u8138\u8231\u8270\u829C\u82CD\u8350\u83B9\u841D\u8574\u8680\u8BB3\u8BBC\u8BBD\u8BC8\u8BF1\u8BFD\u8C0A\u8C0D\u8C1C\u8C24\u8C26\u8C2C\u8C2D\u8C34\u8D1E\u8D2C\u8D3C\u8D41\u8D42\u8D4C\u8D50\u8D5A\u8F69\u8F88\u8F90\u8FA9\u915D\u9171\u9493\u949E\u94A7\u94A9\u94BB\u94C3\u94C5\u94DD\u94F8\u9505\u9510\u9523\u9524\u95EF\u95F7\u95F9\u9600\u9610\u96F3\u97F5\u987D\u9882\u9888\u9896\u98D8\u9971\u9972\u9976\u997C\u9A84\u9A86\u9A8F\u9A97\u9A9A\u9AA4\u9CB8\u9CDE\u9E26\u9E43\u9E64\u9E70\u9F7F\u9F9F
NGram.KANJI_4_28=\u534E\u62A5\u7ECF\u7F51
NGram.KANJI_4_34=\u4E34\u4E3D\u4E4C\u4E54\u4E60\u4E61\u4E70\u4EB2\u4EC5\u4EEA\u4F18\u4F1F\u4F24\u4F26\u4FA7\u50A8\u513F\u5174\u517B\u518C\u519C\u51B5\u51CF\u5218\u521A\u5267\u52B3\u5356\u5382\u5385\u538B\u53A6\u5434\u5706\u5723\u5757\u575A\u575B\u575D\u5907\u591F\u593A\u5956\u5B59\u5BA1\u5BAB\u5BBD\u5BBE\u5BFB\u5C42\u5C81\u5E01\u5E08\u5E86\u5E93\u5F02\u5F39\u5F52\u5F55\u5F7B\u6000\u6001\u6076\u620F\u6237\u6267\u6269\u626C\u62A2\u62A4\u62DF\u62E5\u62E9\u6325\u635F\u6362\u6444\u6653\u6682\u6740\u6742\u6768\u6781\u6811\u6837\u6865\u68C0\u6B22\u6BC1\u6BD5\u6C47\u6C9F\u6CAA\u6CFD\u6D4B\u6DA8\u6E10\u6EE1\u6EE8\u706D\u7075\u70DF\u7231\u739B\u7597\u76D6\u76D8\u77FF\u7801\u7840\u79BB\u7A33\u7ADE\u7B14\u7B7E\u7CA4\u7D27\u7EB3\u7EBD\u7EC3\u7EC6\u7EC8\u7ECD\u7ED5\u7ED9\u7EDC\u7EDD\u7EE9\u7EFC\u7EFF\u7F13\u7F29\u8083\u80DC\u8111\u814A\u8230\u827A\u8363\u836F\u8428\u84DD\u867D\u8865\u88AD\u89C8\u8BA2\u8BA8\u8BA9\u8BAD\u8BB2\u8BBF\u8BC6\u8BCD\u8BD5\u8BEF\u8BF7\u8BF8\u8BFA\u8BFB\u8C08\u8D1D\u8D1F\u8D21\u8D25\u8D27\u8D2D\u8D2F\u8D35\u8D38\u8DC3\u8F6E\u8F6F\u8F7B\u8F7D\u8F86\u8F91\u8F93\u8F96\u8FB9\u8FBD\u8FC1\u8FDC\u8FDD\u9002\u9057\u90BB\u90D1\u91CA\u9488\u949F\u94A2\u94B1\u94F6\u9500\u9526\u9547\u9614\u9634\u9635\u9636\u9648\u9655\u9669\u9690\u97E9\u9875\u9876\u987A\u987B\u987E\u987F\u9891\u989D\u98DE\u9986\u9A7B\u9A8C\u9C81\u9C9C\u9F50
NGram.KANJI_4_39=\u4E1B\u4E1D\u4E27\u4EA9\u4ED1\u4ED3\u4F2A\u4FA6\u4FA8\u503A\u503E\u507F\u5188\u51AF\u51C0\u51C9\u51ED\u51EF\u5242\u5251\u52B2\u5362\u53A2\u5415\u5417\u5428\u55B7\u5760\u5899\u5939\u594B\u5987\u5A31\u5A74\u5BAA\u5C1D\u5C7F\u5C97\u5CAD\u5E05\u5E2E\u5E99\u5E9E\u5E9F\u5F03\u5FC6\u5FE7\u60AC\u60CA\u60EF\u626B\u6270\u629A\u62E6\u62E8\u6446\u6447\u654C\u67AA\u680F\u6863\u68A6\u6C64\u6D01\u6D53\u6D9D\u6DA6\u6E14\u6E17\u6EDA\u6EE9\u707F\u70BC\u70E7\u7275\u72B9\u72EE\u72F1\u743C\u7545\u76D0\u7855\u7978\u7B79\u7BEE\u7EA0\u7EAC\u7EAF\u7EB2\u7EB5\u7EB7\u7EB8\u7EB9\u7ED8\u7EEA\u7EF5\u7F05\u7F06\u7F18\u7F5A\u80C1\u80F6\u8109\u8206\u8273\u82F9\u8346\u8361\u83B2\u8427\u8651\u867E\u8854\u89C9\u8BC9\u8BCA\u8BD7\u8BDA\u8BDE\u8BE2\u8BE6\u8BFE\u8C01\u8C0B\u8C10\u8C13\u8C22\u8C23\u8C28\u8C31\u8D24\u8D26\u8D29\u8D2A\u8D2B\u8D34\u8D37\u8D3A\u8D3E\u8D3F\u8D4B\u8D4F\u8D54\u8D56\u8D5E\u8D60\u8D62\u8D75\u8D76\u8D8B\u8F68\u8F70\u8F74\u8F85\u8F89\u8FC8\u8FDF\u900A\u9012\u903B\u9093\u90AE\u917F\u9274\u94A6\u94DC\u94ED\u94FA\u94FE\u9501\u950B\u9519\u9521\u952E\u955C\u95EA\u95ED\u95F2\u95F8\u95FD\u9601\u9605\u9647\u96B6\u96FE\u9877\u9881\u9887\u9897\u989C\u98A0\u996D\u996E\u9970\u9A70\u9A71\u9A73\u9A76\u9A7E\u9A91\u9C7C\u9E1F\u9E21\u9E23\u9E2D\u9E3F\u9E4F\u9F84
NGram.KANJI_5_10=\u5239\u8EAF
NGram.KANJI_5_11=\u51C4\u8471
NGram.KANJI_5_12=\u6DC0\u7C98
NGram.KANJI_5_13=\u5631\u5815\u8695
NGram.KANJI_5_14=\u4E71\u4FA0\u5265\u52B1\u5374\u53A8\u53D9\u58EE\u5BDD\u5BFF\u5C3D\u5C4A\u5CE1\u5F25\u5F84\u604B\u60A6\u60E7\u60E8\u631F\u636E\u643A\u663C\u664B\u67A2\u6816\u697C\u6B8B\u6BB4\u6D45\u6E7F\u6EDE\u6F5C\u706F\u7089\u72ED\u732A\u732B\u76D7\u793C\u7977\u7A0E\u7A83\u80C6\u811A\u8131\u82A6\u830E\u848B\u865A\u866B\u86EE\u89E6\u8A89\u8DF5\u8E0A\u8E2A\u8F9E\u9065\u968F\u9759\u9EA6
NGram.KANJI_5_18=\u601C\u75D2
NGram.KANJI_5_26=\u4E07\u4E0E\u4E89\u4F1A\u4F53\u515A\u5185\u5199\u533A\u533B\u53C2\u53CC\u53F7\u58F0\u5965\u5B66\u5B9D\u5C06\u5C5E\u5F53\u62C5\u6570\u65AD\u65E7\u6761\u6765\u6A2A\u6B27\u6CA1\u6E29\u6E7E\u70B9\u72B6\u72EC\u732E\u753B\u79F0\u88C5\u9EC4
NGram.KANJI_5_29=\u693F\u82EB
NGram.KANJI_5_34=\u53F6\u6D9B\u83B1
NGram.KANJI_5_39=\u5C61\u788D
NGram.KANJI_6_0=\u4E10\u4E52\u4EC6\u4F88\u4FD0\u51F3\u533E\u53ED\u53EE\u5406\u541D\u5429\u5435\u5440\u5490\u5495\u54B1\u54C4\u54FC\u557C\u55D3\u5669\u56E4\u5777\u5992\u59E8\u5B7D\u5BDE\u5BE5\u5C79\u5C94\u5DCD\u5E18\u5E1A\u5E54\u5FF1\u604D\u6064\u60F6\u6127\u6177\u6233\u6252\u625B\u6273\u6296\u62C2\u62C7\u62F4\u638F\u6396\u63E3\u63EA\u6413\u6479\u64A9\u64C2\u659F\u667E\u6760\u6845\u6963\u6A90\u6B83\u6C13\u6C5E\u6D8E\u6D95\u6DCC\u6ED4\u6F13\u6F3E\u6FA1\u7076\u70D8\u710A\u71CE\u7239\u72E1\u73B7\u7599\u759A\u75A4\u75CA\u7629\u7682\u76C5\u76EF\u778E\u77AA\u787C\u7889\u788C\u78BE\u79E7\u7A96\u7A98\u7B77\u7C7D\u7CB1\u7D0A\u7D6E\u7F94\u7FCE\u8116\u814B\u814C\u819B\u828D\u82DF\u8301\u83E0\u85D5\u8611\u86A3\u8708\u8822\u8C4C\u8DB4\u8DEA\u8E42\u8E66\u8E72\u8EBA\u901B\u9157\u970E\u97ED
NGram.KANJI_6_3=\u62FC\u88D4\u9B4F
NGram.KANJI_6_9=\u4ED7\u4F63\u4FCF\u5018\u50BB\u50F5\u5154\u5201\u522E\u5254\u527F\u5306\u5462\u5492\u5496\u54A8\u54AA\u554A\u5561\u5564\u5566\u5885\u5938\u5AC2\u5AE9\u5CED\u5F64\u6084\u608D\u60A8\u60D5\u61C2\u61C8\u6254\u626F\u62AC\u6346\u634D\u640F\u6454\u6487\u6495\u64D2\u6746\u6789\u68B3\u68F5\u695E\u6986\u6995\u69A8\u6A44\u6AAC\u6B79\u6C28\u6C2E\u6CF5\u6DE4\u6E34\u6E3A\u6E89\u6F29\u70AB\u70AC\u7130\u715E\u7184\u71AC\u7238\u7281\u72E0\u74E3\u74F7\u7529\u7578\u761F\u7626\u76D4\u775B\u7779\u7784\u77BB\u780C\u780D\u7838\u7898\u78C5\u78F7\u7AED\u7B28\u7BE1\u7C07\u7CD5\u7CD9\u7CEF\u7F38\u800D\u8084\u809A\u8165\u816E\u832B\u8334\u840D\u8774\u886B\u888D\u88D9\u88F9\u8C41\u8D81\u8D9F\u8E22\u8E29\u8EB2\u8F9C\u9165\u918B\u9631\u964B\u964C\u9661\u9709\u9739\u9776\u9AD3\u9ED4
NGram.KANJI_6_10=\u4E53\u5582\u5600\u6342\u7B06
NGram.KANJI_6_11=\u5288\u543C\u5475\u5486\u54EE\u5598\u56BC\u5962\u5A36\u5A9A\u5B75\u5BA6\u5C38\u5C4E\u5F8A\u5F98\u627C\u62CC\u62D7\u63C9\u6930\u6954\u69D0\u6BEF\u6C90\u6CBD\u6CBE\u6F31\u6F88\u70D9\u7329\u75BC\u75F0\u7737\u77D7\u7B19\u7FB9\u803F\u80D6\u813E\u81C0\u8205\u8309\u83BD\u846B\u8517\u868C\u8759\u8815\u8859\u8B6C\u8E81\u8EAC\u90A2\u9698\u9B44
NGram.KANJI_6_12=\u722C\u7FD4
NGram.KANJI_6_16=\u5228\u5315\u542E\u54CE\u5509\u5527\u5543\u55B3\u55E1\u5636\u568E\u5FFF\u61E6\u6376\u642A\u6726\u74E4\u76F9\u7736\u7BD9\u8019\u80F0\u80F3\u812F\u818A\u8200\u8214\u8638\u869C\u86C0\u86C6\u86D4\u87C6\u88B1\u8902\u8C7A\u8E4B\u9119
NGram.KANJI_6_18=\u67D2\u6ED3\u87C0\u87CB\u8DDB\u901E\u9163
NGram.KANJI_6_20=\u4F5B\u52D2\u54C8\u62FF\u66FC\u6D59\u704C\u7586\u9ECE
NGram.KANJI_6_21=\u4E48\u4EFF\u4F19\u4FF1\u5021\u5077\u5195\u5212\u5269\u5401\u541E\u5427\u54EA\u5587\u558A\u55BB\u566A\u573E\u574E\u5783\u57AE\u584C\u58E4\u5960\u5976\u59CA\u5A1C\u5DE2\u5F99\u600E\u6015\u6263\u626D\u6293\u62C6\u62D6\u62EF\u62F1\u6316\u632A\u6380\u6389\u63D2\u641E\u64C5\u64CE\u65F1\u6664\u6735\u6770\u67EC\u6846\u684C\u68AD\u6B47\u6B49\u6B67\u6C1B\u6C27\u6C2F\u6C5B\u6C89\u6DF9\u6EAF\u70AE\u70E4\u731C\u7334\u73BB\u7470\u76FC\u788E\u789F\u78B0\u78B3\u7A0D\u7A3B\u7A57\u7CB9\u7F69\u8335\u8354\u84BF\u8DCC\u8DD1\u904F\u90A8\u9189\u9677\u9738\u978B
NGram.KANJI_6_22=\u5162\u53E8\u542D\u5501\u552C\u5639\u563F\u56B7\u6043\u60B4\u6194\u61CA\u634E\u63CD\u6414\u64AC\u6DAE\u6E43\u6F66\u7095\u7316\u733E\u7728\u7830\u78D5\u7ABF\u7FE9\u8018\u80EF\u8198\u8693\u86AA\u86AF\u874C\u8783\u879F\u8892\u8E6C
NGram.KANJI_6_23=\u4FD8\u4FEF\u501A\u5085\u5180\u526A\u5323\u54ED\u5634\u56CA\u58A9\u58F9\u5955\u5978\u59DA\u5A49\u5B55\u5BC7\u5BE8\u5D4C\u5E62\u6467\u64BC\u6500\u655E\u6572\u658C\u6670\u68CD\u68D5\u68E0\u6912\u6A0A\u6BB7\u6C9B\u6D3D\u6DC6\u6E23\u6F8E\u7011\u7092\u714C\u73AB\u7405\u7624\u76D2\u7960\u79C9\u7A20\u7BF7\u7F50\u804A\u8086\u81C2\u8292\u82DE\u852C\u857E\u859B\u8760\u8C6B\u8DBE\u8E48\u8F9F\u96A7
NGram.KANJI_6_25=\u4E8E\u5DF2\u5FB7\u7AD9
NGram.KANJI_6_28=\u4E58\u4ECD\u4EFD\u4F30\u4F60\u4F69\u503C\u5047\u51B0\u51F0\u5361\u5377\u53E6\u54E5\u552E\u5708\u5740\u5761\u57C3\u5821\u589E\u5979\u59C6\u5B69\u5B83\u5E15\u5E76\u5F17\u5F88\u6208\u622A\u624E\u627E\u62D4\u62DC\u63ED\u641C\u6536\u6548\u65C1\u665A\u6668\u67E5\u6B65\u6BCF\u6C61\u6CDB\u6D4E\u6D89\u6DB5\u6E38\u6EAA\u6FB3\u70B8\u745F\u7538\u7A97\u7F3A\u7F55\u805A\u8258\u827E\u82AC\u8303\u83F2\u8482\u85CF\u8DDF\u903E\u9080\u970D\u9760\u9ED1\u9ED8
NGram.KANJI_6_29=\u634F\u6518\u7B50\u809B
NGram.KANJI_6_30=\u54A7\u57C2\u5AB3\u60CB\u6886\u8378\u85D0\u8671
NGram.KANJI_6_32=\u5080\u5121\u51A4\u54AC\u55DC\u592D\u5DEB\u6292\u68D8\u69B4\u6A59\u6E24\u7FC5\u80DA\u8180\u86DB\u8700\u8DCB\u9761
NGram.KANJI_6_34=\u4E30\u51E0\u542C\u613F
NGram.KANJI_6_35=\u4E56\u547B\u55FD\u5C41\u606C\u6115\u6CAE\u7119\u795F\u7CDC\u86C9\u86F9\u8713\u873B\u8757\u8925\u892A\u96F9
NGram.KANJI_6_37=\u51B2\u5308\u5398\u54B8\u59DC\u5C4F\u5D14\u5F6D\u60E0\u6241\u6350\u699C\u6BEB\u6C6A\u6CC4\u6DEE\u6F58\u6F6D\u7199\u77EE\u7ADF\u8058\u820D\u8212\u8389\u8587\u884D\u8881\u8FA8\u8FF9\u96D5
NGram.KANJI_6_39=\u574F\u6251\u6302
NGram.KANJI_7_0=\u52FA\u5544\u60F0\u6994\u86A4\u86E4
NGram.KANJI_7_3=\u4E59\u4E7E\u4EAD\u4EF0\u4EF2\u4F0F\u4F10\u4FAF\u4FCA\u500D\u501F\u5076\u508D\u50E7\u5112\u5146\u5192\u51AC\u51DD\u51FD\u5200\u5237\u524A\u52A3\u52C3\u52C7\u52DF\u5351\u5352\u5353\u5378\u537F\u53E5\u5439\u54FA\u574A\u5782\u57CB\u5893\u58C1\u5915\u5937\u5949\u5951\u5974\u59B9\u5A18\u5A5A\u5ACC\u5B54\u5B5D\u5B64\u5B8F\u5BBF\u5BD2\u5C3A\u5C6F\u5CB3\u5D07\u5DE7\u5E84\u5E8A\u5F26\u5F69\u5F70\u5F90\u5FAA\u5FCD\u6012\u6016\u602A\u60A0\u60B2\u60BC\u6148\u6162\u6170\u6291\u6298\u62AB\u62BC\u62BD\u62D2\u62D3\u62D8\u62F3\u6311\u638C\u6398\u63E1\u642C\u6458\u64A4\u654F\u656C\u659C\u65E2\u65E8\u65EC\u6606\u6614\u6676\u6691\u6696\u66F9\u6749\u676F\u679A\u679D\u67CF\u67D4\u67F1\u67F3\u67F4\u6817\u6842\u6843\u6851\u68A8\u68CB\u68D2\u6B20\u6B32\u6BBF\u6C57\u6C88\u6CCA\u6D17\u6D1E\u6D69\u6D6E\u6D78\u6DE1\u6DFB\u6E58\u6EB6\u6F0F\u6F20\u7070\u708E\u70AD\u7126\u718A\u71C3\u7267\u72C2\u731B\u7384\u73A9\u73CD\u7434\u75AB\u75DB\u76C6\u76FE\u773C\u7891\u78C1\u795D\u7965\u79D2\u79DF\u79E6\u7A00\u7B11\u7B51\u7B54\u7C89\u7C92\u7CD6\u7D2B\u7F8A\u7FBD\u7FFC\u8010\u80A5\u80CE\u8150\u8179\u819C\u8247\u829D\u82B3\u82D7\u82E6\u8302\u8336\u8352\u83CA\u83CC\u83DC\u845B\u846C\u84B2\u84B8\u84C4\u8584\u864E\u86C7\u8861\u8863\u8870\u888B\u8896\u88D5\u8986\u8C46\u8DA3\u8E0F\u8F9B\u8FC5\u8FEB\u8FF7\u9003\u9006\u902E\u9042\u9063\u90ED\u963B\u9676\u96EA\u9756\u9B3C\u9B42\u9F3B
NGram.KANJI_7_6=\u4E01\u4E03\u4E45\u4E5D\u4E88\u4E92\u4EA1\u4ECB\u4EE4\u4F01\u4F0A\u4F2F\u4F3C\u4F4E\u4F4F\u4F55\u4F8B\u4F9D\u4FBF\u4FEE\u505C\u50CF\u516B\u516D\u5175\u5177\u5178\u5207\u520A\u5224\u526F\u529F\u52A9\u5343\u5348\u535A\u5370\u53BB\u53CB\u53F3\u5409\u542B\u544A\u547C\u5584\u5747\u5802\u590F\u592B\u5931\u5947\u597D\u5A01\u5A92\u5B63\u5B8C\u5B97\u5BA2\u5BA3\u5BA4\u5BB3\u5BB9\u5BC6\u5BCC\u5BDF\u5C04\u5C1A\u5C45\u5C4B\u5CB8\u5DE6\u5E0C\u5E1D\u5E2D\u5E55\u5E8F\u5E95\u5E97\u5EA7\u5EB7\u5EF6\u5F8B\u5FAE\u5FC5\u5FD7\u5FF5\u601D\u6025\u606F\u60F3\u611F\u623F\u6253\u6279\u627F\u6295\u6297\u62EC\u6388\u6392\u63F4\u6545\u6551\u6574\u6599\u65C5\u65E9\u6613\u6620\u6625\u666E\u666F\u66B4\u66F4\u670D\u671B\u6728\u672B\u6751\u677E\u67B6\u6838\u6839\u6848\u68EE\u690D\u6982\u6A21\u6B4C\u6B62\u6B66\u6BB5\u6BCD\u6C0F\u6C38\u6C42\u6CBF\u6CE2\u6CE8\u6D0B\u6D3E\u6D88\u6DF1\u6E05\u6E56\u706B\u7167\u7206\u7236\u7247\u7387\u7530\u7537\u7559\u7565\u7591\u75C5\u767B\u767D\u767E\u7687\u76DB\u76DF\u771F\u7763\u77ED\u7834\u79FB\u7A81\u7AE0\u7AEF\u7B56\u7B97\u7C4D\u7CBE\u7D20\u7D22\u7F72\u7FA4\u8001\u8003\u81F4\u822A\u826F\u82B1\u8349\u843D\u878D\u8857\u89D2\u8B66\u8C37\u8D70\u8D85\u8D8A\u8DB3\u8FF0\u8FFD\u9001\u901F\u90A3\u90A6\u914D\u91CE\u9632\u963F\u9644\u964D\u9664\u96C4\u96E8\u9752\u9769\u98DF
NGram.KANJI_7_7=\u4E09\u4E0A\u4E0B\u4E0D\u4E16\u4E3B\u4E8B\u4E8C\u4EE3\u4EE5\u4F4D\u4F5C\u4F7F\u5165\u5168\u516C\u5171\u51FA\u5206\u5229\u5236\u524D\u529B\u52A0\u5316\u5317\u5357\u539F\u53CA\u53F0\u5408\u540C\u540D\u548C\u5730\u57FA\u5916\u591A\u5929\u5B50\u5B9A\u5BB6\u5C0F\u5C71\u5DDE\u5DE5\u5E02\u5E73\u5EA6\u5EFA\u5F0F\u6027\u6210\u6240\u6307\u653F\u6587\u65B0\u65B9\u660E\u6700\u6709\u671F\u672C\u6B21\u6B63\u6C11\u6CBB\u6CD5\u6D77\u7269\u7279\u7406\u751F\u7528\u7531\u754C\u76EE\u76F8\u793E\u79D1\u7ACB\u7B2C\u7B49\u7CFB\u8005\u80FD\u81EA\u82F1\u884C\u8868\u897F\u8981\u901A\u9053\u90E8\u90FD\u91CD\u9AD8
NGram.KANJI_7_9=\u4E4D\u4F36\u5319\u6A61\u6DCB\u7194
NGram.KANJI_7_11=\u4E5E\u4F43\u5026\u50FB\u515C\u5243\u5420\u5446\u54B3\u54BD\u553E\u55A7\u5703\u5984\u5AC9\u5B09\u5C51\u5DFE\u5ED3\u5F1B\u6055\u618E\u62D9\u65A7\u6652\u6977\u6EBA\u707C\u75D8\u79E4\u7AFF\u7B4F\u7CA5\u808B\u8098\u80B4\u8235\u82DB\u849C\u8549\u868A\u86FE\u8718\u914C
NGram.KANJI_7_12=\u4E08\u4E38\u4F8D\u50DA\u5203\u5256\u52C9\u52D8\u52FE\u5320\u533F\u5375\u53D4\u540F\u54E8\u56DA\u5806\u5996\u5999\u59A5\u59A8\u59FF\u5AE1\u5BB0\u5BF8\u5C09\u5C3F\u5C48\u5C65\u5D29\u5E06\u5E4C\u5EB5\u5EB6\u5EB8\u5F13\u5FCC\u5FD8\u6052\u606D\u609F\u60D1\u614E\u6247\u62B1\u6349\u64E6\u6577\u65ED\u6674\u6734\u67C4\u6850\u690E\u6A58\u6B3A\u6B89\u6C41\u6CBC\u6CCC\u6CF3\u6D74\u6DAF\u6DF3\u6ECB\u6F02\u6F84\u71E5\u7261\u7272\u72AC\u72FC\u733F\u7409\u755C\u76F2\u7720\u77AC\u77E2\u7802\u786B\u78E8\u7901\u7948\u79E9\u7A1A\u7A74\u7AE3\u7B4B\u7B52\u7BB1\u7C3F\u8015\u8096\u809D\u80A2\u80A9\u80AA\u80BA\u80F8\u8102\u810A\u8154\u8155\u8170\u817A\u81A8\u81ED\u820C\u8236\u82BD\u8305\u83E9\u83F1\u840C\u85FB\u8650\u8702\u8A93\u8E44\u8FB0\u9038\u9091\u90AA\u916C\u9175\u9177\u9685\u96C0\u96C7\u96CC\u97AD
NGram.KANJI_7_13=\u63D6\u803D
NGram.KANJI_7_16=\u602F\u7566
NGram.KANJI_7_18=\u634C\u7C38
NGram.KANJI_7_19=\u4E18\u4E73\u4E95\u4EAB\u4EC1\u4ED8\u4ED9\u4F11\u4F34\u4F38\u4F59\u4FB5\u4FC3\u4FD7\u5012\u5019\u5065\u50AC\u5144\u5145\u514D\u517C\u51A0\u51B7\u5211\u5238\u523A\u523B\u5272\u52E4\u5360\u5371\u539A\u541B\u5426\u5438\u5473\u54F2\u5510\u552F\u5531\u559C\u5609\u56F0\u56FA\u591C\u5948\u594F\u59BB\u59D3\u5B85\u5B87\u5B88\u5B99\u5B9C\u5BC4\u5BFA\u5C0A\u5C3E\u5CA9\u5D0E\u5DE1\u5DE8\u5DEE\u5DF1\u5E45\u5E78\u5E7B\u5E7C\u5EAD\u5EF7\u5F1F\u5F31\u5F79\u5F7C\u5F85\u5F92\u5FA1\u5FE0\u6050\u60A3\u6212\u62DB\u632F\u6355\u63A2\u63AA\u63CF\u642D\u6469\u64CD\u653B\u6563\u660C\u662D\u667A\u6697\u66FF\u6750\u675F\u677F\u6790\u67D3\u682A\u6885\u68B0\u6B8A\u6B96\u6BDB\u6C60\u6CB9\u6CC9\u6D25\u6D66\u6DB2\u6DF7\u6E21\u6ED1\u6F2B\u6F6E\u6FC0\u7235\u725B\u72AF\u7389\u7532\u7533\u756A\u75BE\u75C7\u76AE\u76CA\u7740\u786C\u7956\u7968\u796D\u7981\u79C0\u79C1\u79CB\u79D8\u7A3F\u7AE5\u7AF9\u7E41\u7F6A\u7FFB\u8089\u80CC\u80DE\u81E3\u821E\u8239\u82E5\u8328\u8377\u85E4\u8840\u88C1\u88C2\u8C6A\u8D64\u8DDD\u8FCE\u8FD4\u9000\u9014\u907F\u90CA\u90CE\u90E1\u9152\u9178\u9686\u9694\u969C\u9707\u9732\u9AA8\u9B54\u9E7F\u9EBB
NGram.KANJI_7_20=\u4E39\u4E43\u4EAE\u4F73\u504F\u505A\u51C6\u51CC\u52AA\u5339\u5347\u53EB\u53EC\u5448\u5766\u57F9\u5854\u585E\u58A8\u5B8B\u5C01\u5CF0\u5E72\u5EC9\u5F80\u5F81\u5FBD\u5FEB\u6069\u6211\u624D\u628A\u62B5\u62CD\u6309\u63A7\u64AD\u6566\u6597\u65CB\u65D7\u6628\u6717\u6731\u674E\u675C\u683D\u6881\u6B3E\u6BD2\u6C7D\u6C99\u6CE5\u6CF0\u6D1B\u6D2A\u70C8\u719F\u724C\u7259\u73E0\u73ED\u745E\u74E6\u7518\u751A\u7686\u770B\u7B26\u8033\u80A1\u80E1\u821F\u83AB\u8499\u8D74\u8DE8\u900F\u9010\u9047\u904D\u906D\u9675\u96C5\u96F6\u96F7\u9700\u9F13
NGram.KANJI_7_21=\u5764\u59D0\u5A03\u6062\u6108\u68C9\u7164\u79BE\u7BAD\u903C
NGram.KANJI_7_23=\u4EA5\u50B2\u532A\u5366\u543B\u54E9\u5632\u59D1\u5BB5\u5DF7\u5F6A\u5F6C\u5FFD\u6070\u6168\u61BE\u63A0\u63A9\u6478\u65A4\u68A7\u6A1F\u6CAB\u70F9\u711A\u723D\u7262\u72F8\u751C\u754F\u75B9\u76C8\u7709\u7897\u7CCA\u7F9E\u8299\u82AD\u82B9\u82D4\u8304\u84C9\u84EC\u854A\u85AF\u86D9\u8FA3\u9187\u97A0
NGram.KANJI_7_25=\u4E14\u4E5F\u4F46\u514B\u5176\u5230\u5373\u53EA\u540E\u5982\u5C3C\u5DF4\u6216\u62C9\u65AF\u66FE\u6B64\u6D32\u6D6A\u7BC7\u800C
NGram.KANJI_7_28=\u4E4E\u4E9B\u4EA6\u4EC0\u4FC4\u5403\u5957\u5C24\u6089\u6258\u67D0\u758F\u7FF0\u8D6B
NGram.KANJI_7_29=\u4FAE\u5944\u5A29\u6101\u62ED\u6328\u637B\u6666\u6687\u66AE\u673D\u6756\u67FF\u6813\u68A2\u699B\u7078\u708A\u7396\u7422\u7525\u75E2\u76BF\u7766\u77B3\u7A3C\u7A92\u819D\u81FC\u8237\u8338\u8511\u88F3\u8FC2
NGram.KANJI_7_32=\u4E11\u4F3A\u4F51\u5197\u51B6\u51F9\u52FF\u541F\u5507\u5589\u5993\u5A7F\u5AC1\u5B9B\u5BC2\u5BE1\u5F04\u5F0A\u5F27\u6020\u6028\u6068\u6094\u6109\u611A\u614C\u621A\u62B9\u62D0\u62F7\u62FE\u632B\u633D\u6367\u660F\u6627\u6643\u66D9\u674F\u6795\u67AF\u67D1\u6876\u68DA\u68FA\u6905\u69FD\u6A80\u6B6A\u6CB8\u6CE3\u6DD1\u6DEB\u6E9C\u6EA2\u6EF4\u6F06\u714E\u716E\u722A\u7280\u74A7\u752B\u75B2\u75D5\u75F4\u77AD\u77E9\u785D\u79BD\u7A3D\u7A9F\u7B1B\u7B95\u7C9F\u7CDF\u80C3\u8106\u817F\u818F\u81B3\u828B\u82A5\u82AF\u840E\u851A\u853D\u8776\u87F9\u8877\u8910\u8912\u8C79\u8D66\u8FB1\u9017\u90C1\u916A\u9699\u96C1\u971C\u9774\u978D
NGram.KANJI_7_33=\u4E4B\u4E86\u4E94\u4EA4\u4EAC\u4ECA\u4ED6\u4EF6\u4EFB\u4F9B\u4FDD\u4FE1\u5143\u5148\u5149\u518D\u5217\u521D\u5305\u5341\u534A\u53C8\u53CD\u53D6\u53D7\u53E3\u53E4\u53EF\u53F2\u53F8\u5404\u5411\u5468\u547D\u54C1\u5546\u5668\u56DB\u56DE\u56E0\u571F\u578B\u57CE\u57DF\u5883\u58EB\u592A\u592E\u5973\u59CB\u59D4\u5B57\u5B58\u5B89\u5B98\u5C11\u5C31\u5C40\u5C55\u5DDD\u5E03\u5E38\u5E9C\u5F15\u5F62\u5F71\u5F97\u5FC3\u60C5\u610F\u624B\u6280\u6301\u63A5\u63A8\u63D0\u652F\u6539\u653E\u6559\u65BD\u65CF\u661F\u66F2\u671D\u672A\u6797\u679C\u6821\u683C\u6B7B\u6BD4\u6C34\u6C5F\u6CB3\u6D3B\u6D41\u6E2F\u6E90\u6F14\u7136\u7248\u738B\u7403\u76F4\u7701\u77E5\u77F3\u7814\u793A\u795E\u798F\u7A0B\u7A76\u7A7A\u7BA1\u7C73\u7F6E\u7F8E\u80B2\u81F3\u822C\u8272\u8457\u88AB\u89E3\u8A00\u8C61\u8D77\u8DEF\u8EAB\u8FD1\u9020\u91CC\u91CF\u91D1\u9650\u9662\u96C6\u975E\u9762\u97F3\u9996\u9999
NGram.KANJI_7_35=\u55C5\u57A2\u58D5\u59E5\u637A\u74E2\u7CE0\u895F
NGram.KANJI_7_37=\u4E19\u4E32\u4E4F\u4E91\u4EC7\u4ED4\u4F0D\u5141\u51E1\u51F6\u51F8\u52AB\u535C\u53C9\u53DB\u540A\u5410\u54C0\u559D\u5750\u5751\u576A\u57E0\u5824\u582A\u5830\u5835\u5851\u5858\u586B\u5954\u59FB\u5A46\u5B5F\u5BB4\u5BD3\u5C16\u5C60\u5CFB\u5D16\u5E16\u5E3D\u5E7D\u5E87\u5ECA\u5FD9\u60DC\u60F9\u6155\u6167\u6234\u626E\u6276\u6284\u633A\u6377\u6492\u649E\u64B0\u6562\u6591\u65A5\u65E6\u65FA\u6602\u670B\u676D\u68AF\u695A\u6B23\u6BC5\u6C70\u6C83\u6CE1\u6D8C\u6DD8\u6E20\u71D5\u72D0\u72D7\u73B2\u73CA\u7433\u7483\u74DC\u74F6\u7554\u764C\u7761\u77DB\u78A7\u7A46\u7A7F\u7A84\u7C97\u7D2F\u7FC1\u7FE0\u8000\u8017\u808C\u80AF\u8404\u8461\u8463\u8475\u8513\u85AA\u8679\u86CB\u871C\u87BA\u88F8\u8C8C\u8DF3\u8FC4\u901D\u9022\u906E\u9075\u9192\u91C7\u966A\u971E\u9910\u9B41\u9F0E\u9F20

View File

@ -1,271 +0,0 @@
using System;
using System.Text;
using MediaBrowser.Model.IO;
using MediaBrowser.Model.Serialization;
using MediaBrowser.Model.Text;
using Microsoft.Extensions.Logging;
using NLangDetect.Core;
using UniversalDetector;
namespace Emby.Server.Implementations.TextEncoding
{
public class TextEncoding : ITextEncoding
{
private readonly IFileSystem _fileSystem;
private readonly ILogger _logger;
private IJsonSerializer _json;
public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
{
_fileSystem = fileSystem;
_logger = logger;
_json = json;
}
public Encoding GetASCIIEncoding()
{
return Encoding.ASCII;
}
private static Encoding GetInitialEncoding(byte[] buffer, int count)
{
if (count >= 3)
{
if (buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
return Encoding.UTF8;
}
if (count >= 2)
{
if (buffer[0] == 0xfe && buffer[1] == 0xff)
return Encoding.Unicode;
}
if (count >= 4)
{
if (buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
return Encoding.UTF32;
}
if (count >= 3)
{
if (buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
return Encoding.UTF7;
}
var result = new TextEncodingDetect().DetectEncoding(buffer, count);
switch (result)
{
case TextEncodingDetect.CharacterEncoding.Ansi:
return Encoding.ASCII;
case TextEncodingDetect.CharacterEncoding.Ascii:
return Encoding.ASCII;
case TextEncodingDetect.CharacterEncoding.Utf16BeBom:
return Encoding.UTF32;
case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom:
return Encoding.UTF32;
case TextEncodingDetect.CharacterEncoding.Utf16LeBom:
return Encoding.UTF32;
case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom:
return Encoding.UTF32;
case TextEncodingDetect.CharacterEncoding.Utf8Bom:
return Encoding.UTF8;
case TextEncodingDetect.CharacterEncoding.Utf8Nobom:
return Encoding.UTF8;
default:
return null;
}
}
private bool _langDetectInitialized;
public string GetDetectedEncodingName(byte[] bytes, int count, string language, bool enableLanguageDetection)
{
var index = 0;
var encoding = GetInitialEncoding(bytes, count);
if (encoding != null && encoding.Equals(Encoding.UTF8))
{
return "utf-8";
}
if (string.IsNullOrWhiteSpace(language) && enableLanguageDetection)
{
if (!_langDetectInitialized)
{
_langDetectInitialized = true;
LanguageDetector.Initialize(_json);
}
language = DetectLanguage(bytes, index, count);
if (!string.IsNullOrWhiteSpace(language))
{
_logger.LogDebug("Text language detected as {0}", language);
}
}
var charset = DetectCharset(bytes, index, count, language);
if (!string.IsNullOrWhiteSpace(charset))
{
if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
{
return "utf-8";
}
if (!string.Equals(charset, "windows-1252", StringComparison.OrdinalIgnoreCase))
{
return charset;
}
}
if (!string.IsNullOrWhiteSpace(language))
{
return GetFileCharacterSetFromLanguage(language);
}
return null;
}
private string DetectLanguage(byte[] bytes, int index, int count)
{
try
{
return LanguageDetector.DetectLanguage(Encoding.UTF8.GetString(bytes, index, count));
}
catch (NLangDetectException ex)
{
_logger.LogDebug(ex, "LanguageDetector.DetectLanguage threw a NLangDetectException.");
}
try
{
return LanguageDetector.DetectLanguage(Encoding.ASCII.GetString(bytes, index, count));
}
catch (NLangDetectException ex)
{
_logger.LogDebug(ex, "LanguageDetector.DetectLanguage threw a NLangDetectException.");
}
try
{
return LanguageDetector.DetectLanguage(Encoding.Unicode.GetString(bytes, index, count));
}
catch (NLangDetectException ex)
{
_logger.LogDebug(ex, "LanguageDetector.DetectLanguage threw a NLangDetectException.");
}
return null;
}
public Encoding GetEncodingFromCharset(string charset)
{
if (string.IsNullOrWhiteSpace(charset))
{
throw new ArgumentNullException(nameof(charset));
}
_logger.LogDebug("Getting encoding object for character set: {0}", charset);
try
{
return Encoding.GetEncoding(charset);
}
catch (ArgumentException)
{
charset = charset.Replace("-", string.Empty);
_logger.LogDebug("Getting encoding object for character set: {0}", charset);
return Encoding.GetEncoding(charset);
}
}
public Encoding GetDetectedEncoding(byte[] bytes, int size, string language, bool enableLanguageDetection)
{
var charset = GetDetectedEncodingName(bytes, size, language, enableLanguageDetection);
return GetEncodingFromCharset(charset);
}
private static string GetFileCharacterSetFromLanguage(string language)
{
// https://developer.xamarin.com/api/type/System.Text.Encoding/
switch (language.ToLower())
{
case "tha":
return "windows-874";
case "hun":
return "windows-1252";
case "pol":
case "cze":
case "ces":
case "slo":
case "srp":
case "hrv":
case "rum":
case "ron":
case "rom":
case "rup":
return "windows-1250";
// albanian
case "alb":
case "sqi":
return "windows-1250";
// slovak
case "slk":
case "slv":
return "windows-1250";
case "ara":
return "windows-1256";
case "heb":
return "windows-1255";
case "grc":
return "windows-1253";
// greek
case "gre":
case "ell":
return "windows-1253";
case "crh":
case "ota":
case "tur":
return "windows-1254";
// bulgarian
case "bul":
case "bgr":
return "windows-1251";
case "rus":
return "windows-1251";
case "vie":
return "windows-1258";
case "kor":
return "cp949";
default:
return "windows-1252";
}
}
private static string DetectCharset(byte[] bytes, int index, int count, string language)
{
var detector = new CharsetDetector();
detector.Feed(bytes, index, count);
detector.DataEnd();
var charset = detector.Charset;
// This is often incorrectly indetected. If this happens, try to use other techniques instead
if (string.Equals("x-mac-cyrillic", charset, StringComparison.OrdinalIgnoreCase))
{
if (!string.IsNullOrWhiteSpace(language))
{
return null;
}
}
return charset;
}
}
}

View File

@ -1,406 +0,0 @@
namespace Emby.Server.Implementations.TextEncoding
{
// Copyright 2015-2016 Jonathan Bennett <jon@autoitscript.com>
//
// https://www.autoitscript.com
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/// <summary>
/// Credit: https://github.com/AutoIt/text-encoding-detect
/// </summary>
public class TextEncodingDetect
{
private readonly byte[] _utf16BeBom =
{
0xFE,
0xFF
};
private readonly byte[] _utf16LeBom =
{
0xFF,
0xFE
};
private readonly byte[] _utf8Bom =
{
0xEF,
0xBB,
0xBF
};
private bool _nullSuggestsBinary = true;
private double _utf16ExpectedNullPercent = 70;
private double _utf16UnexpectedNullPercent = 10;
public enum CharacterEncoding
{
None, // Unknown or binary
Ansi, // 0-255
Ascii, // 0-127
Utf8Bom, // UTF8 with BOM
Utf8Nobom, // UTF8 without BOM
Utf16LeBom, // UTF16 LE with BOM
Utf16LeNoBom, // UTF16 LE without BOM
Utf16BeBom, // UTF16-BE with BOM
Utf16BeNoBom // UTF16-BE without BOM
}
/// <summary>
/// Sets if the presence of nulls in a buffer indicate the buffer is binary data rather than text.
/// </summary>
public bool NullSuggestsBinary
{
set => _nullSuggestsBinary = value;
}
public double Utf16ExpectedNullPercent
{
set
{
if (value > 0 && value < 100)
{
_utf16ExpectedNullPercent = value;
}
}
}
public double Utf16UnexpectedNullPercent
{
set
{
if (value > 0 && value < 100)
{
_utf16UnexpectedNullPercent = value;
}
}
}
/// <summary>
/// Gets the BOM length for a given Encoding mode.
/// </summary>
/// <param name="encoding"></param>
/// <returns>The BOM length.</returns>
public static int GetBomLengthFromEncodingMode(CharacterEncoding encoding)
{
int length;
switch (encoding)
{
case CharacterEncoding.Utf16BeBom:
case CharacterEncoding.Utf16LeBom:
length = 2;
break;
case CharacterEncoding.Utf8Bom:
length = 3;
break;
default:
length = 0;
break;
}
return length;
}
/// <summary>
/// Checks for a BOM sequence in a byte buffer.
/// </summary>
/// <param name="buffer"></param>
/// <param name="size"></param>
/// <returns>Encoding type or Encoding.None if no BOM.</returns>
public CharacterEncoding CheckBom(byte[] buffer, int size)
{
// Check for BOM
if (size >= 2 && buffer[0] == _utf16LeBom[0] && buffer[1] == _utf16LeBom[1])
{
return CharacterEncoding.Utf16LeBom;
}
if (size >= 2 && buffer[0] == _utf16BeBom[0] && buffer[1] == _utf16BeBom[1])
{
return CharacterEncoding.Utf16BeBom;
}
if (size >= 3 && buffer[0] == _utf8Bom[0] && buffer[1] == _utf8Bom[1] && buffer[2] == _utf8Bom[2])
{
return CharacterEncoding.Utf8Bom;
}
return CharacterEncoding.None;
}
/// <summary>
/// Automatically detects the Encoding type of a given byte buffer.
/// </summary>
/// <param name="buffer">The byte buffer.</param>
/// <param name="size">The size of the byte buffer.</param>
/// <returns>The Encoding type or Encoding.None if unknown.</returns>
public CharacterEncoding DetectEncoding(byte[] buffer, int size)
{
// First check if we have a BOM and return that if so
CharacterEncoding encoding = CheckBom(buffer, size);
if (encoding != CharacterEncoding.None)
{
return encoding;
}
// Now check for valid UTF8
encoding = CheckUtf8(buffer, size);
if (encoding != CharacterEncoding.None)
{
return encoding;
}
// Now try UTF16
encoding = CheckUtf16NewlineChars(buffer, size);
if (encoding != CharacterEncoding.None)
{
return encoding;
}
encoding = CheckUtf16Ascii(buffer, size);
if (encoding != CharacterEncoding.None)
{
return encoding;
}
// ANSI or None (binary) then
if (!DoesContainNulls(buffer, size))
{
return CharacterEncoding.Ansi;
}
// Found a null, return based on the preference in null_suggests_binary_
return _nullSuggestsBinary ? CharacterEncoding.None : CharacterEncoding.Ansi;
}
/// <summary>
/// Checks if a buffer contains text that looks like utf16 by scanning for
/// newline chars that would be present even in non-english text.
/// </summary>
/// <param name="buffer">The byte buffer.</param>
/// <param name="size">The size of the byte buffer.</param>
/// <returns>Encoding.none, Encoding.Utf16LeNoBom or Encoding.Utf16BeNoBom.</returns>
private static CharacterEncoding CheckUtf16NewlineChars(byte[] buffer, int size)
{
if (size < 2)
{
return CharacterEncoding.None;
}
// Reduce size by 1 so we don't need to worry about bounds checking for pairs of bytes
size--;
var leControlChars = 0;
var beControlChars = 0;
uint pos = 0;
while (pos < size)
{
byte ch1 = buffer[pos++];
byte ch2 = buffer[pos++];
if (ch1 == 0)
{
if (ch2 == 0x0a || ch2 == 0x0d)
{
++beControlChars;
}
}
else if (ch2 == 0)
{
if (ch1 == 0x0a || ch1 == 0x0d)
{
++leControlChars;
}
}
// If we are getting both LE and BE control chars then this file is not utf16
if (leControlChars > 0 && beControlChars > 0)
{
return CharacterEncoding.None;
}
}
if (leControlChars > 0)
{
return CharacterEncoding.Utf16LeNoBom;
}
return beControlChars > 0 ? CharacterEncoding.Utf16BeNoBom : CharacterEncoding.None;
}
/// <summary>
/// Checks if a buffer contains any nulls. Used to check for binary vs text data.
/// </summary>
/// <param name="buffer">The byte buffer.</param>
/// <param name="size">The size of the byte buffer.</param>
private static bool DoesContainNulls(byte[] buffer, int size)
{
uint pos = 0;
while (pos < size)
{
if (buffer[pos++] == 0)
{
return true;
}
}
return false;
}
/// <summary>
/// Checks if a buffer contains text that looks like utf16. This is done based
/// on the use of nulls which in ASCII/script like text can be useful to identify.
/// </summary>
/// <param name="buffer">The byte buffer.</param>
/// <param name="size">The size of the byte buffer.</param>
/// <returns>Encoding.none, Encoding.Utf16LeNoBom or Encoding.Utf16BeNoBom.</returns>
private CharacterEncoding CheckUtf16Ascii(byte[] buffer, int size)
{
var numOddNulls = 0;
var numEvenNulls = 0;
// Get even nulls
uint pos = 0;
while (pos < size)
{
if (buffer[pos] == 0)
{
numEvenNulls++;
}
pos += 2;
}
// Get odd nulls
pos = 1;
while (pos < size)
{
if (buffer[pos] == 0)
{
numOddNulls++;
}
pos += 2;
}
double evenNullThreshold = numEvenNulls * 2.0 / size;
double oddNullThreshold = numOddNulls * 2.0 / size;
double expectedNullThreshold = _utf16ExpectedNullPercent / 100.0;
double unexpectedNullThreshold = _utf16UnexpectedNullPercent / 100.0;
// Lots of odd nulls, low number of even nulls
if (evenNullThreshold < unexpectedNullThreshold && oddNullThreshold > expectedNullThreshold)
{
return CharacterEncoding.Utf16LeNoBom;
}
// Lots of even nulls, low number of odd nulls
if (oddNullThreshold < unexpectedNullThreshold && evenNullThreshold > expectedNullThreshold)
{
return CharacterEncoding.Utf16BeNoBom;
}
// Don't know
return CharacterEncoding.None;
}
/// <summary>
/// Checks if a buffer contains valid utf8.
/// </summary>
/// <param name="buffer">The byte buffer.</param>
/// <param name="size">The size of the byte buffer.</param>
/// <returns>
/// Encoding type of Encoding.None (invalid UTF8), Encoding.Utf8NoBom (valid utf8 multibyte strings) or
/// Encoding.ASCII (data in 0.127 range).
/// </returns>
/// <returns>2</returns>
private CharacterEncoding CheckUtf8(byte[] buffer, int size)
{
// UTF8 Valid sequences
// 0xxxxxxx ASCII
// 110xxxxx 10xxxxxx 2-byte
// 1110xxxx 10xxxxxx 10xxxxxx 3-byte
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 4-byte
//
// Width in UTF8
// Decimal Width
// 0-127 1 byte
// 194-223 2 bytes
// 224-239 3 bytes
// 240-244 4 bytes
//
// Subsequent chars are in the range 128-191
var onlySawAsciiRange = true;
uint pos = 0;
while (pos < size)
{
byte ch = buffer[pos++];
if (ch == 0 && _nullSuggestsBinary)
{
return CharacterEncoding.None;
}
int moreChars;
if (ch <= 127)
{
// 1 byte
moreChars = 0;
}
else if (ch >= 194 && ch <= 223)
{
// 2 Byte
moreChars = 1;
}
else if (ch >= 224 && ch <= 239)
{
// 3 Byte
moreChars = 2;
}
else if (ch >= 240 && ch <= 244)
{
// 4 Byte
moreChars = 3;
}
else
{
return CharacterEncoding.None; // Not utf8
}
// Check secondary chars are in range if we are expecting any
while (moreChars > 0 && pos < size)
{
onlySawAsciiRange = false; // Seen non-ascii chars now
ch = buffer[pos++];
if (ch < 128 || ch > 191)
{
return CharacterEncoding.None; // Not utf8
}
--moreChars;
}
}
// If we get to here then only valid UTF-8 sequences have been processed
// If we only saw chars in the range 0-127 then we can't assume UTF8 (the caller will need to decide)
return onlySawAsciiRange ? CharacterEncoding.Ascii : CharacterEncoding.Utf8Nobom;
}
}
}

View File

@ -1,121 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
using System.IO;
namespace UniversalDetector
{
/// <summary>
/// Default implementation of charset detection interface.
/// The detector can be fed by a System.IO.Stream:
/// <example>
/// <code>
/// using (FileStream fs = File.OpenRead(filename)) {
/// CharsetDetector cdet = new CharsetDetector();
/// cdet.Feed(fs);
/// cdet.DataEnd();
/// Console.WriteLine("{0}, {1}", cdet.Charset, cdet.Confidence);
/// </code>
/// </example>
///
/// or by a byte a array:
///
/// <example>
/// <code>
/// byte[] buff = new byte[1024];
/// int read;
/// while ((read = stream.Read(buff, 0, buff.Length)) > 0 && !done)
/// Feed(buff, 0, read);
/// cdet.DataEnd();
/// Console.WriteLine("{0}, {1}", cdet.Charset, cdet.Confidence);
/// </code>
/// </example>
/// </summary>
public class CharsetDetector : Core.UniversalDetector, ICharsetDetector
{
private string charset;
private float confidence;
//public event DetectorFinished Finished;
public CharsetDetector() : base(FILTER_ALL)
{
}
public void Feed(Stream stream)
{
byte[] buff = new byte[1024];
int read;
while ((read = stream.Read(buff, 0, buff.Length)) > 0 && !done)
{
Feed(buff, 0, read);
}
}
public bool IsDone()
{
return done;
}
public override void Reset()
{
this.charset = null;
this.confidence = 0.0f;
base.Reset();
}
public string Charset => charset;
public float Confidence => confidence;
protected override void Report(string charset, float confidence)
{
this.charset = charset;
this.confidence = confidence;
// if (Finished != null) {
// Finished(charset, confidence);
// }
}
}
//public delegate void DetectorFinished(string charset, float confidence);
}

View File

@ -1,113 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public class Big5Prober : CharsetProber
{
//void GetDistribution(PRUint32 aCharLen, const char* aStr);
private CodingStateMachine codingSM;
private BIG5DistributionAnalyser distributionAnalyser;
private byte[] lastChar = new byte[2];
public Big5Prober()
{
this.codingSM = new CodingStateMachine(new BIG5SMModel());
this.distributionAnalyser = new BIG5DistributionAnalyser();
this.Reset();
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
int codingState = 0;
int max = offset + len;
for (int i = offset; i < max; i++)
{
codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR)
{
state = ProbingState.NotMe;
break;
}
if (codingState == SMModel.ITSME)
{
state = ProbingState.FoundIt;
break;
}
if (codingState == SMModel.START)
{
int charLen = codingSM.CurrentCharLen;
if (i == offset)
{
lastChar[1] = buf[offset];
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
}
else
{
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
}
}
}
lastChar[0] = buf[max - 1];
if (state == ProbingState.Detecting)
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
state = ProbingState.FoundIt;
return state;
}
public override void Reset()
{
codingSM.Reset();
state = ProbingState.Detecting;
distributionAnalyser.Reset();
}
public override string GetCharsetName()
{
return "Big-5";
}
public override float GetConfidence()
{
return distributionAnalyser.GetConfidence();
}
}
}

View File

@ -1,98 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Kohei TAKETA <k-tak@void.in> (Java port)
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public class BitPackage
{
public static int INDEX_SHIFT_4BITS = 3;
public static int INDEX_SHIFT_8BITS = 2;
public static int INDEX_SHIFT_16BITS = 1;
public static int SHIFT_MASK_4BITS = 7;
public static int SHIFT_MASK_8BITS = 3;
public static int SHIFT_MASK_16BITS = 1;
public static int BIT_SHIFT_4BITS = 2;
public static int BIT_SHIFT_8BITS = 3;
public static int BIT_SHIFT_16BITS = 4;
public static int UNIT_MASK_4BITS = 0x0000000F;
public static int UNIT_MASK_8BITS = 0x000000FF;
public static int UNIT_MASK_16BITS = 0x0000FFFF;
private int indexShift;
private int shiftMask;
private int bitShift;
private int unitMask;
private int[] data;
public BitPackage(int indexShift, int shiftMask,
int bitShift, int unitMask, int[] data)
{
this.indexShift = indexShift;
this.shiftMask = shiftMask;
this.bitShift = bitShift;
this.unitMask = unitMask;
this.data = data;
}
public static int Pack16bits(int a, int b)
{
return ((b << 16) | a);
}
public static int Pack8bits(int a, int b, int c, int d)
{
return Pack16bits((b << 8) | a, (d << 8) | c);
}
public static int Pack4bits(int a, int b, int c, int d,
int e, int f, int g, int h)
{
return Pack8bits((b << 4) | a, (d << 4) | c,
(f << 4) | e, (h << 4) | g);
}
public int Unpack(int i)
{
return (data[i >> indexShift] >>
((i & shiftMask) << bitShift)) & unitMask;
}
}
}

View File

@ -1,202 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
using System.IO;
namespace UniversalDetector.Core
{
public enum ProbingState
{
Detecting = 0, // no sure answer yet, but caller can ask for confidence
FoundIt = 1, // positive answer
NotMe = 2 // negative answer
};
public abstract class CharsetProber
{
protected const float SHORTCUT_THRESHOLD = 0.95F;
protected ProbingState state;
// ASCII codes
private const byte SPACE = 0x20;
private const byte CAPITAL_A = 0x41;
private const byte CAPITAL_Z = 0x5A;
private const byte SMALL_A = 0x61;
private const byte SMALL_Z = 0x7A;
private const byte LESS_THAN = 0x3C;
private const byte GREATER_THAN = 0x3E;
/// <summary>
/// Feed data to the prober
/// </summary>
/// <param name="buf">a buffer</param>
/// <param name="offset">offset into buffer</param>
/// <param name="len">number of bytes available into buffer</param>
/// <returns>
/// A <see cref="ProbingState"/>
/// </returns>
public abstract ProbingState HandleData(byte[] buf, int offset, int len);
/// <summary>
/// Reset prober state
/// </summary>
public abstract void Reset();
public abstract string GetCharsetName();
public abstract float GetConfidence();
public virtual ProbingState GetState()
{
return state;
}
public virtual void SetOption()
{
}
public virtual void DumpStatus()
{
}
//
// Helper functions used in the Latin1 and Group probers
//
/// <summary>
///
/// </summary>
/// <returns>filtered buffer</returns>
protected static byte[] FilterWithoutEnglishLetters(byte[] buf, int offset, int len)
{
byte[] result = null;
using (var ms = new MemoryStream(buf.Length))
{
bool meetMSB = false;
int max = offset + len;
int prev = offset;
int cur = offset;
while (cur < max)
{
byte b = buf[cur];
if ((b & 0x80) != 0)
{
meetMSB = true;
}
else if (b < CAPITAL_A || (b > CAPITAL_Z && b < SMALL_A)
|| b > SMALL_Z)
{
if (meetMSB && cur > prev)
{
ms.Write(buf, prev, cur - prev);
ms.WriteByte(SPACE);
meetMSB = false;
}
prev = cur + 1;
}
cur++;
}
if (meetMSB && cur > prev)
ms.Write(buf, prev, cur - prev);
ms.SetLength(ms.Position);
result = ms.ToArray();
}
return result;
}
/// <summary>
/// Do filtering to reduce load to probers (Remove ASCII symbols,
/// collapse spaces). This filter applies to all scripts which contain
/// both English characters and upper ASCII characters.
/// </summary>
/// <returns>a filtered copy of the input buffer</returns>
protected static byte[] FilterWithEnglishLetters(byte[] buf, int offset, int len)
{
byte[] result = null;
using (var ms = new MemoryStream(buf.Length))
{
bool inTag = false;
int max = offset + len;
int prev = offset;
int cur = offset;
while (cur < max)
{
byte b = buf[cur];
if (b == GREATER_THAN)
inTag = false;
else if (b == LESS_THAN)
inTag = true;
// it's ascii, but it's not a letter
if ((b & 0x80) == 0 && (b < CAPITAL_A || b > SMALL_Z
|| (b > CAPITAL_Z && b < SMALL_A)))
{
if (cur > prev && !inTag)
{
ms.Write(buf, prev, cur - prev);
ms.WriteByte(SPACE);
}
prev = cur + 1;
}
cur++;
}
// If the current segment contains more than just a symbol
// and it is not inside a tag then keep it.
if (!inTag && cur > prev)
ms.Write(buf, prev, cur - prev);
ms.SetLength(ms.Position);
result = ms.ToArray();
}
return result;
}
}
}

View File

@ -1,149 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public static class Charsets
{
public const string ASCII = "ASCII";
public const string UTF8 = "UTF-8";
public const string UTF16_LE = "UTF-16LE";
public const string UTF16_BE = "UTF-16BE";
public const string UTF32_BE = "UTF-32BE";
public const string UTF32_LE = "UTF-32LE";
/// <summary>
/// Unusual BOM (3412 order)
/// </summary>
public const string UCS4_3412 = "X-ISO-10646-UCS-4-3412";
/// <summary>
/// Unusual BOM (2413 order)
/// </summary>
public const string UCS4_2413 = "X-ISO-10646-UCS-4-2413";
/// <summary>
/// Cyrillic (based on bulgarian and russian data)
/// </summary>
public const string WIN1251 = "windows-1251";
/// <summary>
/// Latin-1, almost identical to ISO-8859-1
/// </summary>
public const string WIN1252 = "windows-1252";
/// <summary>
/// Greek
/// </summary>
public const string WIN1253 = "windows-1253";
/// <summary>
/// Logical hebrew (includes ISO-8859-8-I and most of x-mac-hebrew)
/// </summary>
public const string WIN1255 = "windows-1255";
/// <summary>
/// Traditional chinese
/// </summary>
public const string BIG5 = "Big-5";
public const string EUCKR = "EUC-KR";
public const string EUCJP = "EUC-JP";
public const string EUCTW = "EUC-TW";
/// <summary>
/// Note: gb2312 is a subset of gb18030
/// </summary>
public const string GB18030 = "gb18030";
public const string ISO2022_JP = "ISO-2022-JP";
public const string ISO2022_CN = "ISO-2022-CN";
public const string ISO2022_KR = "ISO-2022-KR";
/// <summary>
/// Simplified chinese
/// </summary>
public const string HZ_GB_2312 = "HZ-GB-2312";
public const string SHIFT_JIS = "Shift-JIS";
public const string MAC_CYRILLIC = "x-mac-cyrillic";
public const string KOI8R = "KOI8-R";
public const string IBM855 = "IBM855";
public const string IBM866 = "IBM866";
/// <summary>
/// East-Europe. Disabled because too similar to windows-1252
/// (latin-1). Should use tri-grams models to discriminate between
/// these two charsets.
/// </summary>
public const string ISO8859_2 = "ISO-8859-2";
/// <summary>
/// Cyrillic
/// </summary>
public const string ISO8859_5 = "ISO-8859-5";
/// <summary>
/// Greek
/// </summary>
public const string ISO_8859_7 = "ISO-8859-7";
/// <summary>
/// Visual Hebrew
/// </summary>
public const string ISO8859_8 = "ISO-8859-8";
/// <summary>
/// Thai. This recognizer is not enabled yet.
/// </summary>
public const string TIS620 = "TIS620";
}
}

View File

@ -1,85 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Kohei TAKETA <k-tak@void.in> (Java port)
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
/// <summary>
/// Parallel state machine for the Coding Scheme Method
/// </summary>
public class CodingStateMachine
{
private int currentState;
private SMModel model;
private int currentCharLen;
private int currentBytePos;
public CodingStateMachine(SMModel model)
{
this.currentState = SMModel.START;
this.model = model;
}
public int NextState(byte b)
{
// for each byte we get its class, if it is first byte,
// we also get byte length
int byteCls = model.GetClass(b);
if (currentState == SMModel.START)
{
currentBytePos = 0;
currentCharLen = model.charLenTable[byteCls];
}
// from byte's class and stateTable, we get its next state
currentState = model.stateTable.Unpack(
currentState * model.ClassFactor + byteCls);
currentBytePos++;
return currentState;
}
public void Reset()
{
currentState = SMModel.START;
}
public int CurrentCharLen => currentCharLen;
public string ModelName => model.Name;
}
}

View File

@ -1,117 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public class EUCJPProber : CharsetProber
{
private CodingStateMachine codingSM;
private EUCJPContextAnalyser contextAnalyser;
private EUCJPDistributionAnalyser distributionAnalyser;
private byte[] lastChar = new byte[2];
public EUCJPProber()
{
codingSM = new CodingStateMachine(new EUCJPSMModel());
distributionAnalyser = new EUCJPDistributionAnalyser();
contextAnalyser = new EUCJPContextAnalyser();
Reset();
}
public override string GetCharsetName()
{
return "EUC-JP";
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
int codingState;
int max = offset + len;
for (int i = offset; i < max; i++)
{
codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR)
{
state = ProbingState.NotMe;
break;
}
if (codingState == SMModel.ITSME)
{
state = ProbingState.FoundIt;
break;
}
if (codingState == SMModel.START)
{
int charLen = codingSM.CurrentCharLen;
if (i == offset)
{
lastChar[1] = buf[offset];
contextAnalyser.HandleOneChar(lastChar, 0, charLen);
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
}
else
{
contextAnalyser.HandleOneChar(buf, i - 1, charLen);
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
}
}
}
lastChar[0] = buf[max - 1];
if (state == ProbingState.Detecting)
if (contextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
state = ProbingState.FoundIt;
return state;
}
public override void Reset()
{
codingSM.Reset();
state = ProbingState.Detecting;
contextAnalyser.Reset();
distributionAnalyser.Reset();
}
public override float GetConfidence()
{
float contxtCf = contextAnalyser.GetConfidence();
float distribCf = distributionAnalyser.GetConfidence();
return (contxtCf > distribCf ? contxtCf : distribCf);
}
}
}

View File

@ -1,114 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public class EUCKRProber : CharsetProber
{
private CodingStateMachine codingSM;
private EUCKRDistributionAnalyser distributionAnalyser;
private byte[] lastChar = new byte[2];
public EUCKRProber()
{
codingSM = new CodingStateMachine(new EUCKRSMModel());
distributionAnalyser = new EUCKRDistributionAnalyser();
Reset();
}
public override string GetCharsetName()
{
return "EUC-KR";
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
int codingState;
int max = offset + len;
for (int i = offset; i < max; i++)
{
codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR)
{
state = ProbingState.NotMe;
break;
}
if (codingState == SMModel.ITSME)
{
state = ProbingState.FoundIt;
break;
}
if (codingState == SMModel.START)
{
int charLen = codingSM.CurrentCharLen;
if (i == offset)
{
lastChar[1] = buf[offset];
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
}
else
{
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
}
}
}
lastChar[0] = buf[max - 1];
if (state == ProbingState.Detecting)
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
state = ProbingState.FoundIt;
return state;
}
public override float GetConfidence()
{
return distributionAnalyser.GetConfidence();
}
public override void Reset()
{
codingSM.Reset();
state = ProbingState.Detecting;
distributionAnalyser.Reset();
//mContextAnalyser.Reset();
}
}
}

View File

@ -1,113 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public class EUCTWProber : CharsetProber
{
private CodingStateMachine codingSM;
private EUCTWDistributionAnalyser distributionAnalyser;
private byte[] lastChar = new byte[2];
public EUCTWProber()
{
this.codingSM = new CodingStateMachine(new EUCTWSMModel());
this.distributionAnalyser = new EUCTWDistributionAnalyser();
this.Reset();
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
int codingState;
int max = offset + len;
for (int i = 0; i < max; i++)
{
codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR)
{
state = ProbingState.NotMe;
break;
}
if (codingState == SMModel.ITSME)
{
state = ProbingState.FoundIt;
break;
}
if (codingState == SMModel.START)
{
int charLen = codingSM.CurrentCharLen;
if (i == offset)
{
lastChar[1] = buf[offset];
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
}
else
{
distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
}
}
}
lastChar[0] = buf[max - 1];
if (state == ProbingState.Detecting)
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
state = ProbingState.FoundIt;
return state;
}
public override string GetCharsetName()
{
return "x-euc-tw";
}
public override void Reset()
{
codingSM.Reset();
state = ProbingState.Detecting;
distributionAnalyser.Reset();
}
public override float GetConfidence()
{
return distributionAnalyser.GetConfidence();
}
}
}

View File

@ -1,113 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public class EscCharsetProber : CharsetProber
{
private const int CHARSETS_NUM = 4;
private string detectedCharset;
private CodingStateMachine[] codingSM;
int activeSM;
public EscCharsetProber()
{
codingSM = new CodingStateMachine[CHARSETS_NUM];
codingSM[0] = new CodingStateMachine(new HZSMModel());
codingSM[1] = new CodingStateMachine(new ISO2022CNSMModel());
codingSM[2] = new CodingStateMachine(new ISO2022JPSMModel());
codingSM[3] = new CodingStateMachine(new ISO2022KRSMModel());
Reset();
}
public override void Reset()
{
state = ProbingState.Detecting;
for (int i = 0; i < CHARSETS_NUM; i++)
codingSM[i].Reset();
activeSM = CHARSETS_NUM;
detectedCharset = null;
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
int max = offset + len;
for (int i = offset; i < max && state == ProbingState.Detecting; i++)
{
for (int j = activeSM - 1; j >= 0; j--)
{
// byte is feed to all active state machine
int codingState = codingSM[j].NextState(buf[i]);
if (codingState == SMModel.ERROR)
{
// got negative answer for this state machine, make it inactive
activeSM--;
if (activeSM == 0)
{
state = ProbingState.NotMe;
return state;
}
else if (j != activeSM)
{
CodingStateMachine t = codingSM[activeSM];
codingSM[activeSM] = codingSM[j];
codingSM[j] = t;
}
}
else if (codingState == SMModel.ITSME)
{
state = ProbingState.FoundIt;
detectedCharset = codingSM[j].ModelName;
return state;
}
}
}
return state;
}
public override string GetCharsetName()
{
return detectedCharset;
}
public override float GetConfidence()
{
return 0.99f;
}
}
}

View File

@ -1,304 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Kohei TAKETA <k-tak@void.in> (Java port)
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/// <summary>
/// Escaped charsets state machines
/// </summary>
namespace UniversalDetector.Core
{
public class HZSMModel : SMModel
{
private readonly static int[] HZ_cls = {
BitPackage.Pack4bits(1,0,0,0,0,0,0,0), // 00 - 07
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 28 - 2f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 40 - 47
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
BitPackage.Pack4bits(0,0,0,4,0,5,2,0), // 78 - 7f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 80 - 87
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 88 - 8f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 90 - 97
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 98 - 9f
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // a0 - a7
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // a8 - af
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b0 - b7
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b8 - bf
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // c0 - c7
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // c8 - cf
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // d0 - d7
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // d8 - df
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // e0 - e7
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // e8 - ef
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // f0 - f7
BitPackage.Pack4bits(1,1,1,1,1,1,1,1) // f8 - ff
};
private readonly static int[] HZ_st = {
BitPackage.Pack4bits(START, ERROR, 3, START, START, START, ERROR, ERROR),//00-07
BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ITSME, ITSME, ITSME, ITSME),//08-0f
BitPackage.Pack4bits(ITSME, ITSME, ERROR, ERROR, START, START, 4, ERROR),//10-17
BitPackage.Pack4bits( 5, ERROR, 6, ERROR, 5, 5, 4, ERROR),//18-1f
BitPackage.Pack4bits( 4, ERROR, 4, 4, 4, ERROR, 4, ERROR),//20-27
BitPackage.Pack4bits( 4, ITSME, START, START, START, START, START, START) //28-2f
};
private readonly static int[] HZCharLenTable = { 0, 0, 0, 0, 0, 0 };
public HZSMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, HZ_cls),
6,
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, HZ_st),
HZCharLenTable, "HZ-GB-2312")
{
}
}
public class ISO2022CNSMModel : SMModel
{
private readonly static int[] ISO2022CN_cls = {
BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27
BitPackage.Pack4bits(0,3,0,0,0,0,0,0), // 28 - 2f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
BitPackage.Pack4bits(0,0,0,4,0,0,0,0), // 40 - 47
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff
};
private readonly static int[] ISO2022CN_st = {
BitPackage.Pack4bits(START, 3,ERROR,START,START,START,START,START),//00-07
BitPackage.Pack4bits(START,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//08-0f
BitPackage.Pack4bits(ERROR,ERROR,ITSME,ITSME,ITSME,ITSME,ITSME,ITSME),//10-17
BitPackage.Pack4bits(ITSME,ITSME,ITSME,ERROR,ERROR,ERROR, 4,ERROR),//18-1f
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,ERROR,ERROR,ERROR,ERROR),//20-27
BitPackage.Pack4bits( 5, 6,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//28-2f
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,ERROR,ERROR,ERROR,ERROR),//30-37
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ERROR,START) //38-3f
};
private readonly static int[] ISO2022CNCharLenTable = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
public ISO2022CNSMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, ISO2022CN_cls),
9,
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, ISO2022CN_st),
ISO2022CNCharLenTable, "ISO-2022-CN")
{
}
}
public class ISO2022JPSMModel : SMModel
{
private readonly static int[] ISO2022JP_cls = {
BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07
BitPackage.Pack4bits(0,0,0,0,0,0,2,2), // 08 - 0f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
BitPackage.Pack4bits(0,0,0,0,7,0,0,0), // 20 - 27
BitPackage.Pack4bits(3,0,0,0,0,0,0,0), // 28 - 2f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
BitPackage.Pack4bits(6,0,4,0,8,0,0,0), // 40 - 47
BitPackage.Pack4bits(0,9,5,0,0,0,0,0), // 48 - 4f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff
};
private readonly static int[] ISO2022JP_st = {
BitPackage.Pack4bits(START, 3, ERROR,START,START,START,START,START),//00-07
BitPackage.Pack4bits(START, START, ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//08-0f
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//10-17
BitPackage.Pack4bits(ITSME, ITSME, ITSME,ITSME,ITSME,ITSME,ERROR,ERROR),//18-1f
BitPackage.Pack4bits(ERROR, 5, ERROR,ERROR,ERROR, 4,ERROR,ERROR),//20-27
BitPackage.Pack4bits(ERROR, ERROR, ERROR, 6,ITSME,ERROR,ITSME,ERROR),//28-2f
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ERROR,ERROR,ITSME,ITSME),//30-37
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ITSME,ERROR,ERROR,ERROR,ERROR),//38-3f
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ITSME,ERROR,START,START) //40-47
};
private readonly static int[] ISO2022JPCharLenTable = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
public ISO2022JPSMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, ISO2022JP_cls),
10,
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, ISO2022JP_st),
ISO2022JPCharLenTable, "ISO-2022-JP")
{
}
}
public class ISO2022KRSMModel : SMModel
{
private readonly static int[] ISO2022KR_cls = {
BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
BitPackage.Pack4bits(0,0,0,0,3,0,0,0), // 20 - 27
BitPackage.Pack4bits(0,4,0,0,0,0,0,0), // 28 - 2f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
BitPackage.Pack4bits(0,0,0,5,0,0,0,0), // 40 - 47
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff
};
private readonly static int[] ISO2022KR_st = {
BitPackage.Pack4bits(START, 3,ERROR,START,START,START,ERROR,ERROR),//00-07
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f
BitPackage.Pack4bits(ITSME,ITSME,ERROR,ERROR,ERROR, 4,ERROR,ERROR),//10-17
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR, 5,ERROR,ERROR,ERROR),//18-1f
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,START,START,START,START) //20-27
};
private readonly static int[] ISO2022KRCharLenTable = { 0, 0, 0, 0, 0, 0 };
public ISO2022KRSMModel() : base(
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, ISO2022KR_cls),
6,
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
BitPackage.SHIFT_MASK_4BITS,
BitPackage.BIT_SHIFT_4BITS,
BitPackage.UNIT_MASK_4BITS, ISO2022KR_st),
ISO2022KRCharLenTable, "ISO-2022-KR")
{
}
}
}

View File

@ -1,119 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
// We use gb18030 to replace gb2312, because 18030 is a superset.
public class GB18030Prober : CharsetProber
{
private CodingStateMachine codingSM;
private GB18030DistributionAnalyser analyser;
private byte[] lastChar;
public GB18030Prober()
{
lastChar = new byte[2];
codingSM = new CodingStateMachine(new GB18030SMModel());
analyser = new GB18030DistributionAnalyser();
Reset();
}
public override string GetCharsetName()
{
return "gb18030";
}
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
int codingState = SMModel.START;
int max = offset + len;
for (int i = offset; i < max; i++)
{
codingState = codingSM.NextState(buf[i]);
if (codingState == SMModel.ERROR)
{
state = ProbingState.NotMe;
break;
}
if (codingState == SMModel.ITSME)
{
state = ProbingState.FoundIt;
break;
}
if (codingState == SMModel.START)
{
int charLen = codingSM.CurrentCharLen;
if (i == offset)
{
lastChar[1] = buf[offset];
analyser.HandleOneChar(lastChar, 0, charLen);
}
else
{
analyser.HandleOneChar(buf, i - 1, charLen);
}
}
}
lastChar[0] = buf[max - 1];
if (state == ProbingState.Detecting)
{
if (analyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
state = ProbingState.FoundIt;
}
return state;
}
public override float GetConfidence()
{
return analyser.GetConfidence();
}
public override void Reset()
{
codingSM.Reset();
state = ProbingState.Detecting;
analyser.Reset();
}
}
}

View File

@ -1,328 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/**
* General ideas of the Hebrew charset recognition
*
* Four main charsets exist in Hebrew:
* "ISO-8859-8" - Visual Hebrew
* "windows-1255" - Logical Hebrew
* "ISO-8859-8-I" - Logical Hebrew
* "x-mac-hebrew" - ?? Logical Hebrew ??
*
* Both "ISO" charsets use a completely identical set of code points, whereas
* "windows-1255" and "x-mac-hebrew" are two different proper supersets of
* these code points. windows-1255 defines additional characters in the range
* 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
* diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6.
* x-mac-hebrew defines similar additional code points but with a different
* mapping.
*
* As far as an average Hebrew text with no diacritics is concerned, all four
* charsets are identical with respect to code points. Meaning that for the
* main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
* (including final letters).
*
* The dominant difference between these charsets is their directionality.
* "Visual" directionality means that the text is ordered as if the renderer is
* not aware of a BIDI rendering algorithm. The renderer sees the text and
* draws it from left to right. The text itself when ordered naturally is read
* backwards. A buffer of Visual Hebrew generally looks like so:
* "[last word of first line spelled backwards] [whole line ordered backwards
* and spelled backwards] [first word of first line spelled backwards]
* [end of line] [last word of second line] ... etc' "
* adding punctuation marks, numbers and English text to visual text is
* naturally also "visual" and from left to right.
*
* "Logical" directionality means the text is ordered "naturally" according to
* the order it is read. It is the responsibility of the renderer to display
* the text from right to left. A BIDI algorithm is used to place general
* punctuation marks, numbers and English text in the text.
*
* Texts in x-mac-hebrew are almost impossible to find on the Internet. From
* what little evidence I could find, it seems that its general directionality
* is Logical.
*
* To sum up all of the above, the Hebrew probing mechanism knows about two
* charsets:
* Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are
* backwards while line order is natural. For charset recognition purposes
* the line order is unimportant (In fact, for this implementation, even
* word order is unimportant).
* Logical Hebrew - "windows-1255" - normal, naturally ordered text.
*
* "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
* specifically identified.
* "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew
* that contain special punctuation marks or diacritics is displayed with
* some unconverted characters showing as question marks. This problem might
* be corrected using another model prober for x-mac-hebrew. Due to the fact
* that x-mac-hebrew texts are so rare, writing another model prober isn't
* worth the effort and performance hit.
*
* *** The Prober ***
*
* The prober is divided between two nsSBCharSetProbers and an nsHebrewProber,
* all of which are managed, created, fed data, inquired and deleted by the
* nsSBCSGroupProber. The two nsSBCharSetProbers identify that the text is in
* fact some kind of Hebrew, Logical or Visual. The final decision about which
* one is it is made by the nsHebrewProber by combining final-letter scores
* with the scores of the two nsSBCharSetProbers to produce a final answer.
*
* The nsSBCSGroupProber is responsible for stripping the original text of HTML
* tags, English characters, numbers, low-ASCII punctuation characters, spaces
* and new lines. It reduces any sequence of such characters to a single space.
* The buffer fed to each prober in the SBCS group prober is pure text in
* high-ASCII.
* The two nsSBCharSetProbers (model probers) share the same language model:
* Win1255Model.
* The first nsSBCharSetProber uses the model normally as any other
* nsSBCharSetProber does, to recognize windows-1255, upon which this model was
* built. The second nsSBCharSetProber is told to make the pair-of-letter
* lookup in the language model backwards. This in practice exactly simulates
* a visual Hebrew model using the windows-1255 logical Hebrew model.
*
* The nsHebrewProber is not using any language model. All it does is look for
* final-letter evidence suggesting the text is either logical Hebrew or visual
* Hebrew. Disjointed from the model probers, the results of the nsHebrewProber
* alone are meaningless. nsHebrewProber always returns 0.00 as confidence
* since it never identifies a charset by itself. Instead, the pointer to the
* nsHebrewProber is passed to the model probers as a helper "Name Prober".
* When the Group prober receives a positive identification from any prober,
* it asks for the name of the charset identified. If the prober queried is a
* Hebrew model prober, the model prober forwards the call to the
* nsHebrewProber to make the final decision. In the nsHebrewProber, the
* decision is made according to the final-letters scores maintained and Both
* model probers scores. The answer is returned in the form of the name of the
* charset identified, either "windows-1255" or "ISO-8859-8".
*
*/
namespace UniversalDetector.Core
{
/// <summary>
/// This prober doesn't actually recognize a language or a charset.
/// It is a helper prober for the use of the Hebrew model probers
/// </summary>
public class HebrewProber : CharsetProber
{
// windows-1255 / ISO-8859-8 code points of interest
private const byte FINAL_KAF = 0xEA;
private const byte NORMAL_KAF = 0xEB;
private const byte FINAL_MEM = 0xED;
private const byte NORMAL_MEM = 0xEE;
private const byte FINAL_NUN = 0xEF;
private const byte NORMAL_NUN = 0xF0;
private const byte FINAL_PE = 0xF3;
private const byte NORMAL_PE = 0xF4;
private const byte FINAL_TSADI = 0xF5;
private const byte NORMAL_TSADI = 0xF6;
// Minimum Visual vs Logical final letter score difference.
// If the difference is below this, don't rely solely on the final letter score distance.
private const int MIN_FINAL_CHAR_DISTANCE = 5;
// Minimum Visual vs Logical model score difference.
// If the difference is below this, don't rely at all on the model score distance.
private const float MIN_MODEL_DISTANCE = 0.01f;
protected const string VISUAL_HEBREW_NAME = "ISO-8859-8";
protected const string LOGICAL_HEBREW_NAME = "windows-1255";
// owned by the group prober.
protected CharsetProber logicalProber, visualProber;
protected int finalCharLogicalScore, finalCharVisualScore;
// The two last bytes seen in the previous buffer.
protected byte prev, beforePrev;
public HebrewProber()
{
Reset();
}
public void SetModelProbers(CharsetProber logical, CharsetProber visual)
{
logicalProber = logical;
visualProber = visual;
}
/**
* Final letter analysis for logical-visual decision.
* Look for evidence that the received buffer is either logical Hebrew or
* visual Hebrew.
* The following cases are checked:
* 1) A word longer than 1 letter, ending with a final letter. This is an
* indication that the text is laid out "naturally" since the final letter
* really appears at the end. +1 for logical score.
* 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
* Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
* the Non-Final form of that letter. Exceptions to this rule are mentioned
* above in isNonFinal(). This is an indication that the text is laid out
* backwards. +1 for visual score
* 3) A word longer than 1 letter, starting with a final letter. Final letters
* should not appear at the beginning of a word. This is an indication that
* the text is laid out backwards. +1 for visual score.
*
* The visual score and logical score are accumulated throughout the text and
* are finally checked against each other in GetCharSetName().
* No checking for final letters in the middle of words is done since that case
* is not an indication for either Logical or Visual text.
*
* The input buffer should not contain any white spaces that are not (' ')
* or any low-ascii punctuation marks.
*/
public override ProbingState HandleData(byte[] buf, int offset, int len)
{
// Both model probers say it's not them. No reason to continue.
if (GetState() == ProbingState.NotMe)
return ProbingState.NotMe;
int max = offset + len;
for (int i = offset; i < max; i++)
{
byte b = buf[i];
// a word just ended
if (b == 0x20)
{
// *(curPtr-2) was not a space so prev is not a 1 letter word
if (beforePrev != 0x20)
{
// case (1) [-2:not space][-1:final letter][cur:space]
if (IsFinal(prev))
finalCharLogicalScore++;
// case (2) [-2:not space][-1:Non-Final letter][cur:space]
else if (IsNonFinal(prev))
finalCharVisualScore++;
}
}
else
{
// case (3) [-2:space][-1:final letter][cur:not space]
if ((beforePrev == 0x20) && (IsFinal(prev)) && (b != ' '))
++finalCharVisualScore;
}
beforePrev = prev;
prev = b;
}
// Forever detecting, till the end or until both model probers
// return NotMe (handled above).
return ProbingState.Detecting;
}
// Make the decision: is it Logical or Visual?
public override string GetCharsetName()
{
// If the final letter score distance is dominant enough, rely on it.
int finalsub = finalCharLogicalScore - finalCharVisualScore;
if (finalsub >= MIN_FINAL_CHAR_DISTANCE)
return LOGICAL_HEBREW_NAME;
if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE))
return VISUAL_HEBREW_NAME;
// It's not dominant enough, try to rely on the model scores instead.
float modelsub = logicalProber.GetConfidence() - visualProber.GetConfidence();
if (modelsub > MIN_MODEL_DISTANCE)
return LOGICAL_HEBREW_NAME;
if (modelsub < -(MIN_MODEL_DISTANCE))
return VISUAL_HEBREW_NAME;
// Still no good, back to final letter distance, maybe it'll save the day.
if (finalsub < 0)
return VISUAL_HEBREW_NAME;
// (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
return LOGICAL_HEBREW_NAME;
}
public override void Reset()
{
finalCharLogicalScore = 0;
finalCharVisualScore = 0;
prev = 0x20;
beforePrev = 0x20;
}
public override ProbingState GetState()
{
// Remain active as long as any of the model probers are active.
if (logicalProber.GetState() == ProbingState.NotMe &&
visualProber.GetState() == ProbingState.NotMe)
return ProbingState.NotMe;
return ProbingState.Detecting;
}
public override void DumpStatus()
{
//Console.WriteLine(" HEB: {0} - {1} [Logical-Visual score]", finalCharLogicalScore, finalCharVisualScore);
}
public override float GetConfidence()
{
return 0.0f;
}
protected static bool IsFinal(byte b)
{
return (b == FINAL_KAF || b == FINAL_MEM || b == FINAL_NUN
|| b == FINAL_PE || b == FINAL_TSADI);
}
protected static bool IsNonFinal(byte b)
{
// The normal Tsadi is not a good Non-Final letter due to words like
// 'lechotet' (to chat) containing an apostrophe after the tsadi. This
// apostrophe is converted to a space in FilterWithoutEnglishLetters causing
// the Non-Final tsadi to appear at an end of a word even though this is not
// the case in the original text.
// The letters Pe and Kaf rarely display a related behavior of not being a
// good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
// example legally end with a Non-Final Pe or Kaf. However, the benefit of
// these letters as Non-Final letters outweighs the damage since these words
// are quite rare.
return (b == NORMAL_KAF || b == NORMAL_MEM || b == NORMAL_NUN
|| b == NORMAL_PE);
}
}
}

View File

@ -1,327 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public abstract class JapaneseContextAnalyser
{
protected const int CATEGORIES_NUM = 6;
protected const int ENOUGH_REL_THRESHOLD = 100;
protected const int MAX_REL_THRESHOLD = 1000;
protected const int MINIMUM_DATA_THRESHOLD = 4;
protected const float DONT_KNOW = -1.0f;
// hiragana frequency category table
// This is hiragana 2-char sequence table, the number in each cell represents its frequency category
protected static byte[,] jp2CharContext = {
{ 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,},
{ 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,},
{ 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,},
{ 0,4,0,5,0,5,0,4,0,4,5,4,4,3,5,3,5,1,5,3,4,3,4,4,3,4,3,3,4,3,5,4,4,3,5,5,3,5,5,5,3,5,5,3,4,5,5,3,1,3,2,0,3,4,0,4,2,0,4,2,1,5,3,2,3,5,0,4,0,2,0,5,4,4,5,4,5,0,4,0,0,4,4,},
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,},
{ 0,3,0,4,0,3,0,3,0,4,5,4,3,3,3,3,4,3,5,4,4,3,5,4,4,3,4,3,4,4,4,4,5,3,4,4,3,4,5,5,4,5,5,1,4,5,4,3,0,3,3,1,3,3,0,4,4,0,3,3,1,5,3,3,3,5,0,4,0,3,0,4,4,3,4,3,3,0,4,1,1,3,4,},
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,},
{ 0,4,0,3,0,3,0,4,0,3,4,4,3,2,2,1,2,1,3,1,3,3,3,3,3,4,3,1,3,3,5,3,3,0,4,3,0,5,4,3,3,5,4,4,3,4,4,5,0,1,2,0,1,2,0,2,2,0,1,0,0,5,2,2,1,4,0,3,0,1,0,4,4,3,5,4,3,0,2,1,0,4,3,},
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,},
{ 0,3,0,5,0,4,0,2,1,4,4,2,4,1,4,2,4,2,4,3,3,3,4,3,3,3,3,1,4,2,3,3,3,1,4,4,1,1,1,4,3,3,2,0,2,4,3,2,0,3,3,0,3,1,1,0,0,0,3,3,0,4,2,2,3,4,0,4,0,3,0,4,4,5,3,4,4,0,3,0,0,1,4,},
{ 1,4,0,4,0,4,0,4,0,3,5,4,4,3,4,3,5,4,3,3,4,3,5,4,4,4,4,3,4,2,4,3,3,1,5,4,3,2,4,5,4,5,5,4,4,5,4,4,0,3,2,2,3,3,0,4,3,1,3,2,1,4,3,3,4,5,0,3,0,2,0,4,5,5,4,5,4,0,4,0,0,5,4,},
{ 0,5,0,5,0,4,0,3,0,4,4,3,4,3,3,3,4,0,4,4,4,3,4,3,4,3,3,1,4,2,4,3,4,0,5,4,1,4,5,4,4,5,3,2,4,3,4,3,2,4,1,3,3,3,2,3,2,0,4,3,3,4,3,3,3,4,0,4,0,3,0,4,5,4,4,4,3,0,4,1,0,1,3,},
{ 0,3,1,4,0,3,0,2,0,3,4,4,3,1,4,2,3,3,4,3,4,3,4,3,4,4,3,2,3,1,5,4,4,1,4,4,3,5,4,4,3,5,5,4,3,4,4,3,1,2,3,1,2,2,0,3,2,0,3,1,0,5,3,3,3,4,3,3,3,3,4,4,4,4,5,4,2,0,3,3,2,4,3,},
{ 0,2,0,3,0,1,0,1,0,0,3,2,0,0,2,0,1,0,2,1,3,3,3,1,2,3,1,0,1,0,4,2,1,1,3,3,0,4,3,3,1,4,3,3,0,3,3,2,0,0,0,0,1,0,0,2,0,0,0,0,0,4,1,0,2,3,2,2,2,1,3,3,3,4,4,3,2,0,3,1,0,3,3,},
{ 0,4,0,4,0,3,0,3,0,4,4,4,3,3,3,3,3,3,4,3,4,2,4,3,4,3,3,2,4,3,4,5,4,1,4,5,3,5,4,5,3,5,4,0,3,5,5,3,1,3,3,2,2,3,0,3,4,1,3,3,2,4,3,3,3,4,0,4,0,3,0,4,5,4,4,5,3,0,4,1,0,3,4,},
{ 0,2,0,3,0,3,0,0,0,2,2,2,1,0,1,0,0,0,3,0,3,0,3,0,1,3,1,0,3,1,3,3,3,1,3,3,3,0,1,3,1,3,4,0,0,3,1,1,0,3,2,0,0,0,0,1,3,0,1,0,0,3,3,2,0,3,0,0,0,0,0,3,4,3,4,3,3,0,3,0,0,2,3,},
{ 2,3,0,3,0,2,0,1,0,3,3,4,3,1,3,1,1,1,3,1,4,3,4,3,3,3,0,0,3,1,5,4,3,1,4,3,2,5,5,4,4,4,4,3,3,4,4,4,0,2,1,1,3,2,0,1,2,0,0,1,0,4,1,3,3,3,0,3,0,1,0,4,4,4,5,5,3,0,2,0,0,4,4,},
{ 0,2,0,1,0,3,1,3,0,2,3,3,3,0,3,1,0,0,3,0,3,2,3,1,3,2,1,1,0,0,4,2,1,0,2,3,1,4,3,2,0,4,4,3,1,3,1,3,0,1,0,0,1,0,0,0,1,0,0,0,0,4,1,1,1,2,0,3,0,0,0,3,4,2,4,3,2,0,1,0,0,3,3,},
{ 0,1,0,4,0,5,0,4,0,2,4,4,2,3,3,2,3,3,5,3,3,3,4,3,4,2,3,0,4,3,3,3,4,1,4,3,2,1,5,5,3,4,5,1,3,5,4,2,0,3,3,0,1,3,0,4,2,0,1,3,1,4,3,3,3,3,0,3,0,1,0,3,4,4,4,5,5,0,3,0,1,4,5,},
{ 0,2,0,3,0,3,0,0,0,2,3,1,3,0,4,0,1,1,3,0,3,4,3,2,3,1,0,3,3,2,3,1,3,0,2,3,0,2,1,4,1,2,2,0,0,3,3,0,0,2,0,0,0,1,0,0,0,0,2,2,0,3,2,1,3,3,0,2,0,2,0,0,3,3,1,2,4,0,3,0,2,2,3,},
{ 2,4,0,5,0,4,0,4,0,2,4,4,4,3,4,3,3,3,1,2,4,3,4,3,4,4,5,0,3,3,3,3,2,0,4,3,1,4,3,4,1,4,4,3,3,4,4,3,1,2,3,0,4,2,0,4,1,0,3,3,0,4,3,3,3,4,0,4,0,2,0,3,5,3,4,5,2,0,3,0,0,4,5,},
{ 0,3,0,4,0,1,0,1,0,1,3,2,2,1,3,0,3,0,2,0,2,0,3,0,2,0,0,0,1,0,1,1,0,0,3,1,0,0,0,4,0,3,1,0,2,1,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,2,2,3,1,0,3,0,0,0,1,4,4,4,3,0,0,4,0,0,1,4,},
{ 1,4,1,5,0,3,0,3,0,4,5,4,4,3,5,3,3,4,4,3,4,1,3,3,3,3,2,1,4,1,5,4,3,1,4,4,3,5,4,4,3,5,4,3,3,4,4,4,0,3,3,1,2,3,0,3,1,0,3,3,0,5,4,4,4,4,4,4,3,3,5,4,4,3,3,5,4,0,3,2,0,4,4,},
{ 0,2,0,3,0,1,0,0,0,1,3,3,3,2,4,1,3,0,3,1,3,0,2,2,1,1,0,0,2,0,4,3,1,0,4,3,0,4,4,4,1,4,3,1,1,3,3,1,0,2,0,0,1,3,0,0,0,0,2,0,0,4,3,2,4,3,5,4,3,3,3,4,3,3,4,3,3,0,2,1,0,3,3,},
{ 0,2,0,4,0,3,0,2,0,2,5,5,3,4,4,4,4,1,4,3,3,0,4,3,4,3,1,3,3,2,4,3,0,3,4,3,0,3,4,4,2,4,4,0,4,5,3,3,2,2,1,1,1,2,0,1,5,0,3,3,2,4,3,3,3,4,0,3,0,2,0,4,4,3,5,5,0,0,3,0,2,3,3,},
{ 0,3,0,4,0,3,0,1,0,3,4,3,3,1,3,3,3,0,3,1,3,0,4,3,3,1,1,0,3,0,3,3,0,0,4,4,0,1,5,4,3,3,5,0,3,3,4,3,0,2,0,1,1,1,0,1,3,0,1,2,1,3,3,2,3,3,0,3,0,1,0,1,3,3,4,4,1,0,1,2,2,1,3,},
{ 0,1,0,4,0,4,0,3,0,1,3,3,3,2,3,1,1,0,3,0,3,3,4,3,2,4,2,0,1,0,4,3,2,0,4,3,0,5,3,3,2,4,4,4,3,3,3,4,0,1,3,0,0,1,0,0,1,0,0,0,0,4,2,3,3,3,0,3,0,0,0,4,4,4,5,3,2,0,3,3,0,3,5,},
{ 0,2,0,3,0,0,0,3,0,1,3,0,2,0,0,0,1,0,3,1,1,3,3,0,0,3,0,0,3,0,2,3,1,0,3,1,0,3,3,2,0,4,2,2,0,2,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,2,1,2,0,1,0,1,0,0,0,1,3,1,2,0,0,0,1,0,0,1,4,},
{ 0,3,0,3,0,5,0,1,0,2,4,3,1,3,3,2,1,1,5,2,1,0,5,1,2,0,0,0,3,3,2,2,3,2,4,3,0,0,3,3,1,3,3,0,2,5,3,4,0,3,3,0,1,2,0,2,2,0,3,2,0,2,2,3,3,3,0,2,0,1,0,3,4,4,2,5,4,0,3,0,0,3,5,},
{ 0,3,0,3,0,3,0,1,0,3,3,3,3,0,3,0,2,0,2,1,1,0,2,0,1,0,0,0,2,1,0,0,1,0,3,2,0,0,3,3,1,2,3,1,0,3,3,0,0,1,0,0,0,0,0,2,0,0,0,0,0,2,3,1,2,3,0,3,0,1,0,3,2,1,0,4,3,0,1,1,0,3,3,},
{ 0,4,0,5,0,3,0,3,0,4,5,5,4,3,5,3,4,3,5,3,3,2,5,3,4,4,4,3,4,3,4,5,5,3,4,4,3,4,4,5,4,4,4,3,4,5,5,4,2,3,4,2,3,4,0,3,3,1,4,3,2,4,3,3,5,5,0,3,0,3,0,5,5,5,5,4,4,0,4,0,1,4,4,},
{ 0,4,0,4,0,3,0,3,0,3,5,4,4,2,3,2,5,1,3,2,5,1,4,2,3,2,3,3,4,3,3,3,3,2,5,4,1,3,3,5,3,4,4,0,4,4,3,1,1,3,1,0,2,3,0,2,3,0,3,0,0,4,3,1,3,4,0,3,0,2,0,4,4,4,3,4,5,0,4,0,0,3,4,},
{ 0,3,0,3,0,3,1,2,0,3,4,4,3,3,3,0,2,2,4,3,3,1,3,3,3,1,1,0,3,1,4,3,2,3,4,4,2,4,4,4,3,4,4,3,2,4,4,3,1,3,3,1,3,3,0,4,1,0,2,2,1,4,3,2,3,3,5,4,3,3,5,4,4,3,3,0,4,0,3,2,2,4,4,},
{ 0,2,0,1,0,0,0,0,0,1,2,1,3,0,0,0,0,0,2,0,1,2,1,0,0,1,0,0,0,0,3,0,0,1,0,1,1,3,1,0,0,0,1,1,0,1,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,1,2,2,0,3,4,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1,},
{ 0,1,0,0,0,1,0,0,0,0,4,0,4,1,4,0,3,0,4,0,3,0,4,0,3,0,3,0,4,1,5,1,4,0,0,3,0,5,0,5,2,0,1,0,0,0,2,1,4,0,1,3,0,0,3,0,0,3,1,1,4,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,},
{ 1,4,0,5,0,3,0,2,0,3,5,4,4,3,4,3,5,3,4,3,3,0,4,3,3,3,3,3,3,2,4,4,3,1,3,4,4,5,4,4,3,4,4,1,3,5,4,3,3,3,1,2,2,3,3,1,3,1,3,3,3,5,3,3,4,5,0,3,0,3,0,3,4,3,4,4,3,0,3,0,2,4,3,},
{ 0,1,0,4,0,0,0,0,0,1,4,0,4,1,4,2,4,0,3,0,1,0,1,0,0,0,0,0,2,0,3,1,1,1,0,3,0,0,0,1,2,1,0,0,1,1,1,1,0,1,0,0,0,1,0,0,3,0,0,0,0,3,2,0,2,2,0,1,0,0,0,2,3,2,3,3,0,0,0,0,2,1,0,},
{ 0,5,1,5,0,3,0,3,0,5,4,4,5,1,5,3,3,0,4,3,4,3,5,3,4,3,3,2,4,3,4,3,3,0,3,3,1,4,4,3,4,4,4,3,4,5,5,3,2,3,1,1,3,3,1,3,1,1,3,3,2,4,5,3,3,5,0,4,0,3,0,4,4,3,5,3,3,0,3,4,0,4,3,},
{ 0,5,0,5,0,3,0,2,0,4,4,3,5,2,4,3,3,3,4,4,4,3,5,3,5,3,3,1,4,0,4,3,3,0,3,3,0,4,4,4,4,5,4,3,3,5,5,3,2,3,1,2,3,2,0,1,0,0,3,2,2,4,4,3,1,5,0,4,0,3,0,4,3,1,3,2,1,0,3,3,0,3,3,},
{ 0,4,0,5,0,5,0,4,0,4,5,5,5,3,4,3,3,2,5,4,4,3,5,3,5,3,4,0,4,3,4,4,3,2,4,4,3,4,5,4,4,5,5,0,3,5,5,4,1,3,3,2,3,3,1,3,1,0,4,3,1,4,4,3,4,5,0,4,0,2,0,4,3,4,4,3,3,0,4,0,0,5,5,},
{ 0,4,0,4,0,5,0,1,1,3,3,4,4,3,4,1,3,0,5,1,3,0,3,1,3,1,1,0,3,0,3,3,4,0,4,3,0,4,4,4,3,4,4,0,3,5,4,1,0,3,0,0,2,3,0,3,1,0,3,1,0,3,2,1,3,5,0,3,0,1,0,3,2,3,3,4,4,0,2,2,0,4,4,},
{ 2,4,0,5,0,4,0,3,0,4,5,5,4,3,5,3,5,3,5,3,5,2,5,3,4,3,3,4,3,4,5,3,2,1,5,4,3,2,3,4,5,3,4,1,2,5,4,3,0,3,3,0,3,2,0,2,3,0,4,1,0,3,4,3,3,5,0,3,0,1,0,4,5,5,5,4,3,0,4,2,0,3,5,},
{ 0,5,0,4,0,4,0,2,0,5,4,3,4,3,4,3,3,3,4,3,4,2,5,3,5,3,4,1,4,3,4,4,4,0,3,5,0,4,4,4,4,5,3,1,3,4,5,3,3,3,3,3,3,3,0,2,2,0,3,3,2,4,3,3,3,5,3,4,1,3,3,5,3,2,0,0,0,0,4,3,1,3,3,},
{ 0,1,0,3,0,3,0,1,0,1,3,3,3,2,3,3,3,0,3,0,0,0,3,1,3,0,0,0,2,2,2,3,0,0,3,2,0,1,2,4,1,3,3,0,0,3,3,3,0,1,0,0,2,1,0,0,3,0,3,1,0,3,0,0,1,3,0,2,0,1,0,3,3,1,3,3,0,0,1,1,0,3,3,},
{ 0,2,0,3,0,2,1,4,0,2,2,3,1,1,3,1,1,0,2,0,3,1,2,3,1,3,0,0,1,0,4,3,2,3,3,3,1,4,2,3,3,3,3,1,0,3,1,4,0,1,1,0,1,2,0,1,1,0,1,1,0,3,1,3,2,2,0,1,0,0,0,2,3,3,3,1,0,0,0,0,0,2,3,},
{ 0,5,0,4,0,5,0,2,0,4,5,5,3,3,4,3,3,1,5,4,4,2,4,4,4,3,4,2,4,3,5,5,4,3,3,4,3,3,5,5,4,5,5,1,3,4,5,3,1,4,3,1,3,3,0,3,3,1,4,3,1,4,5,3,3,5,0,4,0,3,0,5,3,3,1,4,3,0,4,0,1,5,3,},
{ 0,5,0,5,0,4,0,2,0,4,4,3,4,3,3,3,3,3,5,4,4,4,4,4,4,5,3,3,5,2,4,4,4,3,4,4,3,3,4,4,5,5,3,3,4,3,4,3,3,4,3,3,3,3,1,2,2,1,4,3,3,5,4,4,3,4,0,4,0,3,0,4,4,4,4,4,1,0,4,2,0,2,4,},
{ 0,4,0,4,0,3,0,1,0,3,5,2,3,0,3,0,2,1,4,2,3,3,4,1,4,3,3,2,4,1,3,3,3,0,3,3,0,0,3,3,3,5,3,3,3,3,3,2,0,2,0,0,2,0,0,2,0,0,1,0,0,3,1,2,2,3,0,3,0,2,0,4,4,3,3,4,1,0,3,0,0,2,4,},
{ 0,0,0,4,0,0,0,0,0,0,1,0,1,0,2,0,0,0,0,0,1,0,2,0,1,0,0,0,0,0,3,1,3,0,3,2,0,0,0,1,0,3,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,4,0,2,0,0,0,0,0,0,2,},
{ 0,2,1,3,0,2,0,2,0,3,3,3,3,1,3,1,3,3,3,3,3,3,4,2,2,1,2,1,4,0,4,3,1,3,3,3,2,4,3,5,4,3,3,3,3,3,3,3,0,1,3,0,2,0,0,1,0,0,1,0,0,4,2,0,2,3,0,3,3,0,3,3,4,2,3,1,4,0,1,2,0,2,3,},
{ 0,3,0,3,0,1,0,3,0,2,3,3,3,0,3,1,2,0,3,3,2,3,3,2,3,2,3,1,3,0,4,3,2,0,3,3,1,4,3,3,2,3,4,3,1,3,3,1,1,0,1,1,0,1,0,1,0,1,0,0,0,4,1,1,0,3,0,3,1,0,2,3,3,3,3,3,1,0,0,2,0,3,3,},
{ 0,0,0,0,0,0,0,0,0,0,3,0,2,0,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,3,0,3,0,3,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,2,0,2,3,0,0,0,0,0,0,0,0,3,},
{ 0,2,0,3,1,3,0,3,0,2,3,3,3,1,3,1,3,1,3,1,3,3,3,1,3,0,2,3,1,1,4,3,3,2,3,3,1,2,2,4,1,3,3,0,1,4,2,3,0,1,3,0,3,0,0,1,3,0,2,0,0,3,3,2,1,3,0,3,0,2,0,3,4,4,4,3,1,0,3,0,0,3,3,},
{ 0,2,0,1,0,2,0,0,0,1,3,2,2,1,3,0,1,1,3,0,3,2,3,1,2,0,2,0,1,1,3,3,3,0,3,3,1,1,2,3,2,3,3,1,2,3,2,0,0,1,0,0,0,0,0,0,3,0,1,0,0,2,1,2,1,3,0,3,0,0,0,3,4,4,4,3,2,0,2,0,0,2,4,},
{ 0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,2,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,3,1,0,0,0,0,0,0,0,3,},
{ 0,3,0,3,0,2,0,3,0,3,3,3,2,3,2,2,2,0,3,1,3,3,3,2,3,3,0,0,3,0,3,2,2,0,2,3,1,4,3,4,3,3,2,3,1,5,4,4,0,3,1,2,1,3,0,3,1,1,2,0,2,3,1,3,1,3,0,3,0,1,0,3,3,4,4,2,1,0,2,1,0,2,4,},
{ 0,1,0,3,0,1,0,2,0,1,4,2,5,1,4,0,2,0,2,1,3,1,4,0,2,1,0,0,2,1,4,1,1,0,3,3,0,5,1,3,2,3,3,1,0,3,2,3,0,1,0,0,0,0,0,0,1,0,0,0,0,4,0,1,0,3,0,2,0,1,0,3,3,3,4,3,3,0,0,0,0,2,3,},
{ 0,0,0,1,0,0,0,0,0,0,2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,1,0,0,0,0,0,3,},
{ 0,1,0,3,0,4,0,3,0,2,4,3,1,0,3,2,2,1,3,1,2,2,3,1,1,1,2,1,3,0,1,2,0,1,3,2,1,3,0,5,5,1,0,0,1,3,2,1,0,3,0,0,1,0,0,0,0,0,3,4,0,1,1,1,3,2,0,2,0,1,0,2,3,3,1,2,3,0,1,0,1,0,4,},
{ 0,0,0,1,0,3,0,3,0,2,2,1,0,0,4,0,3,0,3,1,3,0,3,0,3,0,1,0,3,0,3,1,3,0,3,3,0,0,1,2,1,1,1,0,1,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,2,2,1,2,0,0,2,0,0,0,0,2,3,3,3,3,0,0,0,0,1,4,},
{ 0,0,0,3,0,3,0,0,0,0,3,1,1,0,3,0,1,0,2,0,1,0,0,0,0,0,0,0,1,0,3,0,2,0,2,3,0,0,2,2,3,1,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0,2,3,},
{ 2,4,0,5,0,5,0,4,0,3,4,3,3,3,4,3,3,3,4,3,4,4,5,4,5,5,5,2,3,0,5,5,4,1,5,4,3,1,5,4,3,4,4,3,3,4,3,3,0,3,2,0,2,3,0,3,0,0,3,3,0,5,3,2,3,3,0,3,0,3,0,3,4,5,4,5,3,0,4,3,0,3,4,},
{ 0,3,0,3,0,3,0,3,0,3,3,4,3,2,3,2,3,0,4,3,3,3,3,3,3,3,3,0,3,2,4,3,3,1,3,4,3,4,4,4,3,4,4,3,2,4,4,1,0,2,0,0,1,1,0,2,0,0,3,1,0,5,3,2,1,3,0,3,0,1,2,4,3,2,4,3,3,0,3,2,0,4,4,},
{ 0,3,0,3,0,1,0,0,0,1,4,3,3,2,3,1,3,1,4,2,3,2,4,2,3,4,3,0,2,2,3,3,3,0,3,3,3,0,3,4,1,3,3,0,3,4,3,3,0,1,1,0,1,0,0,0,4,0,3,0,0,3,1,2,1,3,0,4,0,1,0,4,3,3,4,3,3,0,2,0,0,3,3,},
{ 0,3,0,4,0,1,0,3,0,3,4,3,3,0,3,3,3,1,3,1,3,3,4,3,3,3,0,0,3,1,5,3,3,1,3,3,2,5,4,3,3,4,5,3,2,5,3,4,0,1,0,0,0,0,0,2,0,0,1,1,0,4,2,2,1,3,0,3,0,2,0,4,4,3,5,3,2,0,1,1,0,3,4,},
{ 0,5,0,4,0,5,0,2,0,4,4,3,3,2,3,3,3,1,4,3,4,1,5,3,4,3,4,0,4,2,4,3,4,1,5,4,0,4,4,4,4,5,4,1,3,5,4,2,1,4,1,1,3,2,0,3,1,0,3,2,1,4,3,3,3,4,0,4,0,3,0,4,4,4,3,3,3,0,4,2,0,3,4,},
{ 1,4,0,4,0,3,0,1,0,3,3,3,1,1,3,3,2,2,3,3,1,0,3,2,2,1,2,0,3,1,2,1,2,0,3,2,0,2,2,3,3,4,3,0,3,3,1,2,0,1,1,3,1,2,0,0,3,0,1,1,0,3,2,2,3,3,0,3,0,0,0,2,3,3,4,3,3,0,1,0,0,1,4,},
{ 0,4,0,4,0,4,0,0,0,3,4,4,3,1,4,2,3,2,3,3,3,1,4,3,4,0,3,0,4,2,3,3,2,2,5,4,2,1,3,4,3,4,3,1,3,3,4,2,0,2,1,0,3,3,0,0,2,0,3,1,0,4,4,3,4,3,0,4,0,1,0,2,4,4,4,4,4,0,3,2,0,3,3,},
{ 0,0,0,1,0,4,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,3,2,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2,},
{ 0,2,0,3,0,4,0,4,0,1,3,3,3,0,4,0,2,1,2,1,1,1,2,0,3,1,1,0,1,0,3,1,0,0,3,3,2,0,1,1,0,0,0,0,0,1,0,2,0,2,2,0,3,1,0,0,1,0,1,1,0,1,2,0,3,0,0,0,0,1,0,0,3,3,4,3,1,0,1,0,3,0,2,},
{ 0,0,0,3,0,5,0,0,0,0,1,0,2,0,3,1,0,1,3,0,0,0,2,0,0,0,1,0,0,0,1,1,0,0,4,0,0,0,2,3,0,1,4,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,3,0,0,0,0,0,3,},
{ 0,2,0,5,0,5,0,1,0,2,4,3,3,2,5,1,3,2,3,3,3,0,4,1,2,0,3,0,4,0,2,2,1,1,5,3,0,0,1,4,2,3,2,0,3,3,3,2,0,2,4,1,1,2,0,1,1,0,3,1,0,1,3,1,2,3,0,2,0,0,0,1,3,5,4,4,4,0,3,0,0,1,3,},
{ 0,4,0,5,0,4,0,4,0,4,5,4,3,3,4,3,3,3,4,3,4,4,5,3,4,5,4,2,4,2,3,4,3,1,4,4,1,3,5,4,4,5,5,4,4,5,5,5,2,3,3,1,4,3,1,3,3,0,3,3,1,4,3,4,4,4,0,3,0,4,0,3,3,4,4,5,0,0,4,3,0,4,5,},
{ 0,4,0,4,0,3,0,3,0,3,4,4,4,3,3,2,4,3,4,3,4,3,5,3,4,3,2,1,4,2,4,4,3,1,3,4,2,4,5,5,3,4,5,4,1,5,4,3,0,3,2,2,3,2,1,3,1,0,3,3,3,5,3,3,3,5,4,4,2,3,3,4,3,3,3,2,1,0,3,2,1,4,3,},
{ 0,4,0,5,0,4,0,3,0,3,5,5,3,2,4,3,4,0,5,4,4,1,4,4,4,3,3,3,4,3,5,5,2,3,3,4,1,2,5,5,3,5,5,2,3,5,5,4,0,3,2,0,3,3,1,1,5,1,4,1,0,4,3,2,3,5,0,4,0,3,0,5,4,3,4,3,0,0,4,1,0,4,4,},
{ 1,3,0,4,0,2,0,2,0,2,5,5,3,3,3,3,3,0,4,2,3,4,4,4,3,4,0,0,3,4,5,4,3,3,3,3,2,5,5,4,5,5,5,4,3,5,5,5,1,3,1,0,1,0,0,3,2,0,4,2,0,5,2,3,2,4,1,3,0,3,0,4,5,4,5,4,3,0,4,2,0,5,4,},
{ 0,3,0,4,0,5,0,3,0,3,4,4,3,2,3,2,3,3,3,3,3,2,4,3,3,2,2,0,3,3,3,3,3,1,3,3,3,0,4,4,3,4,4,1,1,4,4,2,0,3,1,0,1,1,0,4,1,0,2,3,1,3,3,1,3,4,0,3,0,1,0,3,1,3,0,0,1,0,2,0,0,4,4,},
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,},
{ 0,3,0,3,0,2,0,3,0,1,5,4,3,3,3,1,4,2,1,2,3,4,4,2,4,4,5,0,3,1,4,3,4,0,4,3,3,3,2,3,2,5,3,4,3,2,2,3,0,0,3,0,2,1,0,1,2,0,0,0,0,2,1,1,3,1,0,2,0,4,0,3,4,4,4,5,2,0,2,0,0,1,3,},
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,1,1,0,0,0,4,2,1,1,0,1,0,3,2,0,0,3,1,1,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,2,0,0,0,1,4,0,4,2,1,0,0,0,0,0,1,},
{ 0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,3,1,0,0,0,2,0,2,1,0,0,1,2,1,0,1,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,3,1,0,0,0,0,0,1,0,0,2,1,0,0,0,0,0,0,0,0,2,},
{ 0,4,0,4,0,4,0,3,0,4,4,3,4,2,4,3,2,0,4,4,4,3,5,3,5,3,3,2,4,2,4,3,4,3,1,4,0,2,3,4,4,4,3,3,3,4,4,4,3,4,1,3,4,3,2,1,2,1,3,3,3,4,4,3,3,5,0,4,0,3,0,4,3,3,3,2,1,0,3,0,0,3,3,},
{ 0,4,0,3,0,3,0,3,0,3,5,5,3,3,3,3,4,3,4,3,3,3,4,4,4,3,3,3,3,4,3,5,3,3,1,3,2,4,5,5,5,5,4,3,4,5,5,3,2,2,3,3,3,3,2,3,3,1,2,3,2,4,3,3,3,4,0,4,0,2,0,4,3,2,2,1,2,0,3,0,0,4,1,},
};
// category counters, each integer counts sequence in its category
int[] relSample = new int[CATEGORIES_NUM];
// total sequence received
int totalRel;
// The order of previous char
int lastCharOrder;
// if last byte in current buffer is not the last byte of a character,
// we need to know how many byte to skip in next buffer.
int needToSkipCharNum;
// If this flag is set to true, detection is done and conclusion has
// been made
bool done;
public JapaneseContextAnalyser()
{
Reset();
}
public float GetConfidence()
{
// This is just one way to calculate confidence. It works well for me.
if (totalRel > MINIMUM_DATA_THRESHOLD)
return ((float)(totalRel - relSample[0])) / totalRel;
else
return DONT_KNOW;
}
public void HandleData(byte[] buf, int offset, int len)
{
int charLen = 0;
int max = offset + len;
if (done)
return;
// The buffer we got is byte oriented, and a character may span
// more than one buffer. In case the last one or two byte in last
// buffer is not complete, we record how many byte needed to
// complete that character and skip these bytes here. We can choose
// to record those bytes as well and analyse the character once it
// is complete, but since a character will not make much difference,
// skipping it will simplify our logic and improve performance.
for (int i = needToSkipCharNum + offset; i < max;)
{
int order = GetOrder(buf, i, out charLen);
i += charLen;
if (i > max)
{
needToSkipCharNum = i - max;
lastCharOrder = -1;
}
else
{
if (order != -1 && lastCharOrder != -1)
{
totalRel++;
if (totalRel > MAX_REL_THRESHOLD)
{
done = true;
break;
}
relSample[jp2CharContext[lastCharOrder, order]]++;
}
lastCharOrder = order;
}
}
}
public void HandleOneChar(byte[] buf, int offset, int charLen)
{
if (totalRel > MAX_REL_THRESHOLD)
done = true;
if (done)
return;
// Only 2-bytes characters are of our interest
int order = (charLen == 2) ? GetOrder(buf, offset) : -1;
if (order != -1 && lastCharOrder != -1)
{
totalRel++;
// count this sequence to its category counter
relSample[jp2CharContext[lastCharOrder, order]]++;
}
lastCharOrder = order;
}
public void Reset()
{
totalRel = 0;
for (int i = 0; i < CATEGORIES_NUM; i++)
{
relSample[i] = 0;
needToSkipCharNum = 0;
lastCharOrder = -1;
done = false;
}
}
protected abstract int GetOrder(byte[] buf, int offset, out int charLen);
protected abstract int GetOrder(byte[] buf, int offset);
public bool GotEnoughData()
{
return totalRel > ENOUGH_REL_THRESHOLD;
}
}
public class SJISContextAnalyser : JapaneseContextAnalyser
{
private const byte HIRAGANA_FIRST_BYTE = 0x82;
protected override int GetOrder(byte[] buf, int offset, out int charLen)
{
//find out current char's byte length
if (buf[offset] >= 0x81 && buf[offset] <= 0x9F
|| buf[offset] >= 0xe0 && buf[offset] <= 0xFC)
charLen = 2;
else
charLen = 1;
// return its order if it is hiragana
if (buf[offset] == HIRAGANA_FIRST_BYTE)
{
byte low = buf[offset + 1];
if (low >= 0x9F && low <= 0xF1)
return low - 0x9F;
}
return -1;
}
protected override int GetOrder(byte[] buf, int offset)
{
// We are only interested in Hiragana
if (buf[offset] == HIRAGANA_FIRST_BYTE)
{
byte low = buf[offset + 1];
if (low >= 0x9F && low <= 0xF1)
return low - 0x9F;
}
return -1;
}
}
public class EUCJPContextAnalyser : JapaneseContextAnalyser
{
private const byte HIRAGANA_FIRST_BYTE = 0xA4;
protected override int GetOrder(byte[] buf, int offset, out int charLen)
{
byte high = buf[offset];
//find out current char's byte length
if (high == 0x8E || high >= 0xA1 && high <= 0xFE)
charLen = 2;
else if (high == 0xBF)
charLen = 3;
else
charLen = 1;
// return its order if it is hiragana
if (high == HIRAGANA_FIRST_BYTE)
{
byte low = buf[offset + 1];
if (low >= 0xA1 && low <= 0xF3)
return low - 0xA1;
}
return -1;
}
protected override int GetOrder(byte[] buf, int offset)
{
// We are only interested in Hiragana
if (buf[offset] == HIRAGANA_FIRST_BYTE)
{
byte low = buf[offset + 1];
if (low >= 0xA1 && low <= 0xF3)
return low - 0xA1;
}
return -1;
}
}
}

View File

@ -1,246 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public abstract class BulgarianModel : SequenceModel
{
//Model Table:
//total sequences: 100%
//first 512 sequences: 96.9392%
//first 1024 sequences:3.0618%
//rest sequences: 0.2992%
//negative sequences: 0.0020%
private static byte[] BULGARIAN_LANG_MODEL = {
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1,
0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,3,3,0,3,1,0,
0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
3,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,2,3,3,3,3,3,3,3,3,0,3,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,2,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,1,3,2,3,3,3,3,3,3,3,3,0,3,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,1,3,3,3,3,2,2,2,1,1,2,0,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,2,3,2,2,3,3,1,1,2,3,3,2,3,3,3,3,2,1,2,0,2,0,3,0,0,
0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,1,3,3,3,3,3,2,3,2,3,3,3,3,3,2,3,3,1,3,0,3,0,2,0,0,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,3,1,3,3,2,3,3,3,1,3,3,2,3,2,2,2,0,0,2,0,2,0,2,0,0,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,3,3,1,2,2,3,2,1,1,2,0,2,0,0,0,0,
1,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,2,3,3,1,2,3,2,2,2,3,3,3,3,3,2,2,3,1,2,0,2,1,2,0,0,
0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,1,3,3,3,3,3,2,3,3,3,2,3,3,2,3,2,2,2,3,1,2,0,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,3,3,3,3,1,1,1,2,2,1,3,1,3,2,2,3,0,0,1,0,1,0,1,0,0,
0,0,0,1,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,2,2,3,2,2,3,1,2,1,1,1,2,3,1,3,1,2,2,0,1,1,1,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,1,3,2,2,3,3,1,2,3,1,1,3,3,3,3,1,2,2,1,1,1,0,2,0,2,0,1,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,2,2,3,3,3,2,2,1,1,2,0,2,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,0,1,2,1,3,3,2,3,3,3,3,3,2,3,2,1,0,3,1,2,1,2,1,2,3,2,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,2,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,1,3,3,2,3,3,2,2,2,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,3,3,3,3,0,3,3,3,3,3,2,1,1,2,1,3,3,0,3,1,1,1,1,3,2,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,2,2,2,3,3,3,3,3,3,3,3,3,3,3,1,1,3,1,3,3,2,3,2,2,2,3,0,2,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,2,3,3,2,2,3,2,1,1,1,1,1,3,1,3,1,1,0,0,0,1,0,0,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,2,3,2,0,3,2,0,3,0,2,0,0,2,1,3,1,0,0,1,0,0,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,2,1,1,1,1,2,1,1,2,1,1,1,2,2,1,2,1,1,1,0,1,1,0,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,2,1,3,1,1,2,1,3,2,1,1,0,1,2,3,2,1,1,1,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,3,3,3,3,2,2,1,0,1,0,0,1,0,0,0,2,1,0,3,0,0,1,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,2,3,2,3,3,1,3,2,1,1,1,2,1,1,2,1,3,0,1,0,0,0,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,1,2,2,3,3,2,3,2,2,2,3,1,2,2,1,1,2,1,1,2,2,0,1,1,0,1,0,2,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,2,1,3,1,0,2,2,1,3,2,1,0,0,2,0,2,0,1,0,0,0,0,0,0,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
3,3,3,3,3,3,1,2,0,2,3,1,2,3,2,0,1,3,1,2,1,1,1,0,0,1,0,0,2,2,2,3,
2,2,2,2,1,2,1,1,2,2,1,1,2,0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1,1,0,1,
3,3,3,3,3,2,1,2,2,1,2,0,2,0,1,0,1,2,1,2,1,1,0,0,0,1,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
3,3,2,3,3,1,1,3,1,0,3,2,1,0,0,0,1,2,0,2,0,1,0,0,0,1,0,1,2,1,2,2,
1,1,1,1,1,1,1,2,2,2,1,1,1,1,1,1,1,0,1,2,1,1,1,0,0,0,0,0,1,1,0,0,
3,1,0,1,0,2,3,2,2,2,3,2,2,2,2,2,1,0,2,1,2,1,1,1,0,1,2,1,2,2,2,1,
1,1,2,2,2,2,1,2,1,1,0,1,2,1,2,2,2,1,1,1,0,1,1,1,1,2,0,1,0,0,0,0,
2,3,2,3,3,0,0,2,1,0,2,1,0,0,0,0,2,3,0,2,0,0,0,0,0,1,0,0,2,0,1,2,
2,1,2,1,2,2,1,1,1,2,1,1,1,0,1,2,2,1,1,1,1,1,0,1,1,1,0,0,1,2,0,0,
3,3,2,2,3,0,2,3,1,1,2,0,0,0,1,0,0,2,0,2,0,0,0,1,0,1,0,1,2,0,2,2,
1,1,1,1,2,1,0,1,2,2,2,1,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,1,1,0,0,
2,3,2,3,3,0,0,3,0,1,1,0,1,0,0,0,2,2,1,2,0,0,0,0,0,0,0,0,2,0,1,2,
2,2,1,1,1,1,1,2,2,2,1,0,2,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,
3,3,3,3,2,2,2,2,2,0,2,1,1,1,1,2,1,2,1,1,0,2,0,1,0,1,0,0,2,0,1,2,
1,1,1,1,1,1,1,2,2,1,1,0,2,0,1,0,2,0,0,1,1,1,0,0,2,0,0,0,1,1,0,0,
2,3,3,3,3,1,0,0,0,0,0,0,0,0,0,0,2,0,0,1,1,0,0,0,0,0,0,1,2,0,1,2,
2,2,2,1,1,2,1,1,2,2,2,1,2,0,1,1,1,1,1,1,0,1,1,1,1,0,0,1,1,1,0,0,
2,3,3,3,3,0,2,2,0,2,1,0,0,0,1,1,1,2,0,2,0,0,0,3,0,0,0,0,2,0,2,2,
1,1,1,2,1,2,1,1,2,2,2,1,2,0,1,1,1,0,1,1,1,1,0,2,1,0,0,0,1,1,0,0,
2,3,3,3,3,0,2,1,0,0,2,0,0,0,0,0,1,2,0,2,0,0,0,0,0,0,0,0,2,0,1,2,
1,1,1,2,1,1,1,1,2,2,2,0,1,0,1,1,1,0,0,1,1,1,0,0,1,0,0,0,0,1,0,0,
3,3,2,2,3,0,1,0,1,0,0,0,0,0,0,0,1,1,0,3,0,0,0,0,0,0,0,0,1,0,2,2,
1,1,1,1,1,2,1,1,2,2,1,2,2,1,0,1,1,1,1,1,0,1,0,0,1,0,0,0,1,1,0,0,
3,1,0,1,0,2,2,2,2,3,2,1,1,1,2,3,0,0,1,0,2,1,1,0,1,1,1,1,2,1,1,1,
1,2,2,1,2,1,2,2,1,1,0,1,2,1,2,2,1,1,1,0,0,1,1,1,2,1,0,1,0,0,0,0,
2,1,0,1,0,3,1,2,2,2,2,1,2,2,1,1,1,0,2,1,2,2,1,1,2,1,1,0,2,1,1,1,
1,2,2,2,2,2,2,2,1,2,0,1,1,0,2,1,1,1,1,1,0,0,1,1,1,1,0,1,0,0,0,0,
2,1,1,1,1,2,2,2,2,1,2,2,2,1,2,2,1,1,2,1,2,3,2,2,1,1,1,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,3,2,0,1,2,0,1,2,1,1,0,1,0,1,2,1,2,0,0,0,1,1,0,0,0,1,0,0,2,
1,1,0,0,1,1,0,1,1,1,1,0,2,0,1,1,1,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,
2,0,0,0,0,1,2,2,2,2,2,2,2,1,2,1,1,1,1,1,1,1,0,1,1,1,1,1,2,1,1,1,
1,2,2,2,2,1,1,2,1,2,1,1,1,0,2,1,2,1,1,1,0,2,1,1,1,1,0,1,0,0,0,0,
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,
1,1,0,1,0,1,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,3,2,0,0,0,0,1,0,0,0,0,0,0,1,1,0,2,0,0,0,0,0,0,0,0,1,0,1,2,
1,1,1,1,1,1,0,0,2,2,2,2,2,0,1,1,0,1,1,1,1,1,0,0,1,0,0,0,1,1,0,1,
2,3,1,2,1,0,1,1,0,2,2,2,0,0,1,0,0,1,1,1,1,0,0,0,0,0,0,0,1,0,1,2,
1,1,1,1,2,1,1,1,1,1,1,1,1,0,1,1,0,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0,
2,2,2,2,2,0,0,2,0,0,2,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,2,0,2,2,
1,1,1,1,1,0,0,1,2,1,1,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
1,2,2,2,2,0,0,2,0,1,1,0,0,0,1,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,1,1,
0,0,0,1,1,1,1,1,1,1,1,1,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
1,2,2,3,2,0,0,1,0,0,1,0,0,0,0,0,0,1,0,2,0,0,0,1,0,0,0,0,0,0,0,2,
1,1,0,0,1,0,0,0,1,1,0,0,1,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
2,1,2,2,2,1,2,1,2,2,1,1,2,1,1,1,0,1,1,1,1,2,0,1,0,1,1,1,1,0,1,1,
1,1,2,1,1,1,1,1,1,0,0,1,2,1,1,1,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,
1,0,0,1,3,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,2,1,0,0,1,0,2,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,2,0,0,1,
0,2,0,1,0,0,1,1,2,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
1,2,2,2,2,0,1,1,0,2,1,0,1,1,1,0,0,1,0,2,0,1,0,0,0,0,0,0,0,0,0,1,
0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,2,2,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
0,1,0,1,1,1,0,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
2,0,1,0,0,1,2,1,1,1,1,1,1,2,2,1,0,0,1,0,1,0,0,0,0,1,1,1,1,0,0,0,
1,1,2,1,1,1,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,1,2,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
0,1,1,0,1,1,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,
1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,2,0,0,2,0,1,0,0,1,0,0,1,
1,1,0,0,1,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,
1,1,1,1,1,1,1,2,0,0,0,0,0,0,2,1,0,1,1,0,0,1,1,1,0,1,0,0,0,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,1,1,1,1,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
};
public BulgarianModel(byte[] charToOrderMap, string name)
: base(charToOrderMap, BULGARIAN_LANG_MODEL, 0.969392f, false, name)
{
}
}
public class Latin5BulgarianModel : BulgarianModel
{
//255: Control characters that usually does not exist in any text
//254: Carriage/Return
//253: symbol (punctuation) that does not belong to word
//252: 0 - 9
// Character Mapping Table:
// this table is modified base on win1251BulgarianCharToOrderMap, so
// only number <64 is sure valid
private static byte[] LATIN5_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, //40
110,186,108, 91, 74,119, 84, 96,111,187,115,253,253,253,253,253, //50
253, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, //60
116,195, 85, 93, 97,113,196,197,198,199,200,253,253,253,253,253, //70
194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209, //80
210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225, //90
81,226,227,228,229,230,105,231,232,233,234,235,236, 45,237,238, //a0
31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, //b0
39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,239, 67,240, 60, 56, //c0
1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, //d0
7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,241, 42, 16, //e0
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, //f0
};
public Latin5BulgarianModel() : base(LATIN5_CHAR_TO_ORDER_MAP, "ISO-8859-5")
{
}
}
public class Win1251BulgarianModel : BulgarianModel
{
private static byte[] WIN1251__CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, //40
110,186,108, 91, 74,119, 84, 96,111,187,115,253,253,253,253,253, //50
253, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, //60
116,195, 85, 93, 97,113,196,197,198,199,200,253,253,253,253,253, //70
206,207,208,209,210,211,212,213,120,214,215,216,217,218,219,220, //80
221, 78, 64, 83,121, 98,117,105,222,223,224,225,226,227,228,229, //90
88,230,231,232,233,122, 89,106,234,235,236,237,238, 45,239,240, //a0
73, 80,118,114,241,242,243,244,245, 62, 58,246,247,248,249,250, //b0
31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, //c0
39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,251, 67,252, 60, 56, //d0
1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, //e0
7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,253, 42, 16, //f0
};
public Win1251BulgarianModel() : base(WIN1251__CHAR_TO_ORDER_MAP, "windows-1251")
{
}
}
}

View File

@ -1,345 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public abstract class CyrillicModel : SequenceModel
{
// Model Table:
// total sequences: 100%
// first 512 sequences: 97.6601%
// first 1024 sequences: 2.3389%
// rest sequences: 0.1237%
// negative sequences: 0.0009%
protected readonly static byte[] RUSSIAN_LANG_MODEL = {
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,2,3,2,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,2,2,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,2,3,3,1,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,2,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,3,3,3,3,3,3,3,3,3,3,2,1,
0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,3,3,2,1,
0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,2,2,2,3,1,3,3,1,3,3,3,3,2,2,3,0,2,2,2,3,3,2,1,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,3,3,3,3,3,2,2,3,2,3,3,3,2,1,2,2,0,1,2,2,2,2,2,2,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,0,2,2,3,3,2,1,2,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,3,3,1,2,3,2,2,3,2,3,3,3,3,2,2,3,0,3,2,2,3,1,1,1,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,2,3,3,3,3,2,2,2,0,3,3,3,2,2,2,2,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,2,3,2,2,0,1,3,2,1,2,2,1,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,2,1,1,3,0,1,1,1,1,2,1,1,0,2,2,2,1,2,0,1,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,3,3,2,2,2,2,1,3,2,3,2,3,2,1,2,2,0,1,1,2,1,2,1,2,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,2,2,2,2,0,2,2,2,2,3,1,1,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
3,2,3,2,2,3,3,3,3,3,3,3,3,3,1,3,2,0,0,3,3,3,3,2,3,3,3,3,2,3,2,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,3,3,3,3,3,2,2,3,3,0,2,1,0,3,2,3,2,3,0,0,1,2,0,0,1,0,1,2,1,1,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,3,0,2,3,3,3,3,2,3,3,3,3,1,2,2,0,0,2,3,2,2,2,3,2,3,2,2,3,0,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,2,3,0,2,3,2,3,0,1,2,3,3,2,0,2,3,0,0,2,3,2,2,0,1,3,1,3,2,2,1,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,3,0,2,3,3,3,3,3,3,3,3,2,1,3,2,0,0,2,2,3,3,3,2,3,3,0,2,2,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,2,3,3,2,2,2,3,3,0,0,1,1,1,1,1,2,0,0,1,1,1,1,0,1,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,2,3,3,3,3,3,3,3,0,3,2,3,3,2,3,2,0,2,1,0,1,1,0,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,3,3,3,2,2,2,2,3,1,3,2,3,1,1,2,1,0,2,2,2,2,1,3,1,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
2,2,3,3,3,3,3,1,2,2,1,3,1,0,3,0,0,3,0,0,0,1,1,0,1,2,1,0,0,0,0,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,2,2,1,1,3,3,3,2,2,1,2,2,3,1,1,2,0,0,2,2,1,3,0,0,2,1,1,2,1,1,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,2,3,3,3,3,1,2,2,2,1,2,1,3,3,1,1,2,1,2,1,2,2,0,2,0,0,1,1,0,1,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,3,3,3,3,3,2,1,3,2,2,3,2,0,3,2,0,3,0,1,0,1,1,0,0,1,1,1,1,0,1,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,2,3,3,3,2,2,2,3,3,1,2,1,2,1,0,1,0,1,1,0,1,0,0,2,1,1,1,0,1,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
3,1,1,2,1,2,3,3,2,2,1,2,2,3,0,2,1,0,0,2,2,3,2,1,2,2,2,2,2,3,1,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,1,1,0,1,1,2,2,1,1,3,0,0,1,3,1,1,1,0,0,0,1,0,1,1,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,1,3,3,3,2,0,0,0,2,1,0,1,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,1,0,0,2,3,2,2,2,1,2,2,2,1,2,1,0,0,1,1,1,0,2,0,1,1,1,0,0,1,1,
1,0,0,0,0,0,1,2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
2,3,3,3,3,0,0,0,0,1,0,0,0,0,3,0,1,2,1,0,0,0,0,0,0,0,1,1,0,0,1,1,
1,0,1,0,1,2,0,0,1,1,2,1,0,1,1,1,1,0,1,1,1,1,0,1,0,0,1,0,0,1,1,0,
2,2,3,2,2,2,3,1,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,0,1,0,1,1,1,0,2,1,
1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,1,1,1,0,1,1,0,
3,3,3,2,2,2,2,3,2,2,1,1,2,2,2,2,1,1,3,1,2,1,2,0,0,1,1,0,1,0,2,1,
1,1,1,1,1,2,1,0,1,1,1,1,0,1,0,0,1,1,0,0,1,0,1,0,0,1,0,0,0,1,1,0,
2,0,0,1,0,3,2,2,2,2,1,2,1,2,1,2,0,0,0,2,1,2,2,1,1,2,2,0,1,1,0,2,
1,1,1,1,1,0,1,1,1,2,1,1,1,2,1,0,1,2,1,1,1,1,0,1,1,1,0,0,1,0,0,1,
1,3,2,2,2,1,1,1,2,3,0,0,0,0,2,0,2,2,1,0,0,0,0,0,0,1,0,0,0,0,1,1,
1,0,1,1,0,1,0,1,1,0,1,1,0,2,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,
2,3,2,3,2,1,2,2,2,2,1,0,0,0,2,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,2,1,
1,1,2,1,0,2,0,0,1,0,1,0,0,1,0,0,1,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,
3,0,0,1,0,2,2,2,3,2,2,2,2,2,2,2,0,0,0,2,1,2,1,1,1,2,2,0,0,0,1,2,
1,1,1,1,1,0,1,2,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0,0,1,
2,3,2,3,3,2,0,1,1,1,0,0,1,0,2,0,1,1,3,1,0,0,0,0,0,0,0,1,0,0,2,1,
1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1,1,1,0,0,1,1,0,1,0,0,0,0,0,0,1,0,
2,3,3,3,3,1,2,2,2,2,0,1,1,0,2,1,1,1,2,1,0,1,1,0,0,1,0,1,0,0,2,0,
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,3,3,3,2,0,0,1,1,2,2,1,0,0,2,0,1,1,3,0,0,1,0,0,0,0,0,1,0,1,2,1,
1,1,2,0,1,1,1,0,1,0,1,1,0,1,0,1,1,1,1,0,1,0,0,0,0,0,0,1,0,1,1,0,
1,3,2,3,2,1,0,0,2,2,2,0,1,0,2,0,1,1,1,0,1,0,0,0,3,0,1,1,0,0,2,1,
1,1,1,0,1,1,0,0,0,0,1,1,0,1,0,0,2,1,1,0,1,0,0,0,1,0,1,0,0,1,1,0,
3,1,2,1,1,2,2,2,2,2,2,1,2,2,1,1,0,0,0,2,2,2,0,0,0,1,2,1,0,1,0,1,
2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,2,1,1,1,0,1,0,1,1,0,1,1,1,0,0,1,
3,0,0,0,0,2,0,1,1,1,1,1,1,1,0,1,0,0,0,1,1,1,0,1,0,1,1,0,0,1,0,1,
1,1,0,0,1,0,0,0,1,0,1,1,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,
1,3,3,2,2,0,0,0,2,2,0,0,0,1,2,0,1,1,2,0,0,0,0,0,0,0,0,1,0,0,2,1,
0,1,1,0,0,1,1,0,0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,
2,3,2,3,2,0,0,0,0,1,1,0,0,0,2,0,2,0,2,0,0,0,0,0,1,0,0,1,0,0,1,1,
1,1,2,0,1,2,1,0,1,1,2,1,1,1,1,1,2,1,1,0,1,0,0,1,1,1,1,1,0,1,1,0,
1,3,2,2,2,1,0,0,2,2,1,0,1,2,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,
0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,1,0,2,3,1,2,2,2,2,2,2,1,1,0,0,0,1,0,1,0,2,1,1,1,0,0,0,0,1,
1,1,0,1,1,0,1,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
2,0,2,0,0,1,0,3,2,1,2,1,2,2,0,1,0,0,0,2,1,0,0,2,1,1,1,1,0,2,0,2,
2,1,1,1,1,1,1,1,1,1,1,1,1,2,1,0,1,1,1,1,0,0,0,1,1,1,1,0,1,0,0,1,
1,2,2,2,2,1,0,0,1,0,0,0,0,0,2,0,1,1,1,1,0,0,0,0,1,0,1,2,0,0,2,0,
1,0,1,1,1,2,1,0,1,0,1,1,0,0,1,0,1,1,1,0,1,0,0,0,1,0,0,1,0,1,1,0,
2,1,2,2,2,0,3,0,1,1,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
0,0,0,1,1,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,
1,2,2,3,2,2,0,0,1,1,2,0,1,2,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,
0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,
2,2,1,1,2,1,2,2,2,2,2,1,2,2,0,1,0,0,0,1,2,2,2,1,2,1,1,1,1,1,2,1,
1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,1,1,1,0,0,0,0,1,1,1,0,1,1,0,0,1,
1,2,2,2,2,0,1,0,2,2,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,
0,0,1,0,0,1,0,0,0,0,1,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,2,2,2,2,0,0,0,2,2,2,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,
0,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,2,2,2,2,0,0,0,0,1,0,0,1,1,2,0,0,0,0,1,0,1,0,0,1,0,0,2,0,0,0,1,
0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
1,2,2,2,1,1,2,0,2,1,1,1,1,0,2,2,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,
0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
1,0,2,1,2,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,
0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,
1,0,0,0,0,2,0,1,2,1,0,1,1,1,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,1,
0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,
2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
1,1,1,0,1,0,1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
1,1,0,1,1,0,1,0,1,0,0,0,0,1,1,0,1,1,0,0,0,0,0,1,0,1,1,0,1,0,0,0,
0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
};
public CyrillicModel(byte[] charToOrderMap, string name)
: base(charToOrderMap, RUSSIAN_LANG_MODEL, 0.976601f, false, name)
{
}
}
public class Koi8rModel : CyrillicModel
{
private readonly static byte[] KOI8R_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, //80
207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, //90
223,224,225, 68,226,227,228,229,230,231,232,233,234,235,236,237, //a0
238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253, //b0
27, 3, 21, 28, 13, 2, 39, 19, 26, 4, 23, 11, 8, 12, 5, 1, //c0
15, 16, 9, 7, 6, 14, 24, 10, 17, 18, 20, 25, 30, 29, 22, 54, //d0
59, 37, 44, 58, 41, 48, 53, 46, 55, 42, 60, 36, 49, 38, 31, 34, //e0
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, //f0
};
public Koi8rModel() : base(KOI8R_CHAR_TO_ORDER_MAP, "KOI8-R")
{
}
}
public class Win1251Model : CyrillicModel
{
private readonly static byte[] WIN1251_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,
207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,
223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,
239,240,241,242,243,244,245,246, 68,247,248,249,250,251,252,253,
37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35,
45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43,
3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15,
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
};
public Win1251Model() : base(WIN1251_CHAR_TO_ORDER_MAP, "windows-1251")
{
}
}
public class Latin5Model : CyrillicModel
{
private readonly static byte[] LATIN5_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,
207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,
223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,
37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35,
45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43,
3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15,
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
};
public Latin5Model() : base(LATIN5_CHAR_TO_ORDER_MAP, "ISO-8859-5")
{
}
}
public class MacCyrillicModel : CyrillicModel
{
private readonly static byte[] MACCYRILLIC_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35,
45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43,
191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,
207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,
223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,
239,240,241,242,243,244,245,246,247,248,249,250,251,252, 68, 16,
3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15,
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
};
public MacCyrillicModel() : base(MACCYRILLIC_CHAR_TO_ORDER_MAP,
"x-mac-cyrillic")
{
}
}
public class Ibm855Model : CyrillicModel
{
private readonly static byte[] IBM855_BYTE_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
191,192,193,194, 68,195,196,197,198,199,200,201,202,203,204,205,
206,207,208,209,210,211,212,213,214,215,216,217, 27, 59, 54, 70,
3, 37, 21, 44, 28, 58, 13, 41, 2, 48, 39, 53, 19, 46,218,219,
220,221,222,223,224, 26, 55, 4, 42,225,226,227,228, 23, 60,229,
230,231,232,233,234,235, 11, 36,236,237,238,239,240,241,242,243,
8, 49, 12, 38, 5, 31, 1, 34, 15,244,245,246,247, 35, 16,248,
43, 9, 45, 7, 32, 6, 40, 14, 52, 24, 56, 10, 33, 17, 61,249,
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
};
public Ibm855Model() : base(IBM855_BYTE_TO_ORDER_MAP, "IBM855")
{
}
}
public class Ibm866Model : CyrillicModel
{
private readonly static byte[] IBM866_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35,
45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43,
3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15,
191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,
207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,
223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
};
public Ibm866Model() : base(IBM866_CHAR_TO_ORDER_MAP, "IBM866")
{
}
}
}

View File

@ -1,244 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public abstract class GreekModel : SequenceModel
{
// Model Table:
// total sequences: 100%
// first 512 sequences: 98.2851%
// first 1024 sequences:1.7001%
// rest sequences: 0.0359%
// negative sequences: 0.0148%
private readonly static byte[] GREEK_LANG_MODEL = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0,
3,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,0,3,3,0,3,2,3,3,0,3,2,3,3,3,0,0,3,0,3,0,3,3,2,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,
0,2,3,2,2,3,3,3,3,3,3,3,3,0,3,3,3,3,0,2,3,3,0,3,3,3,3,2,3,3,3,0,
2,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,2,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,2,1,3,3,3,3,2,3,3,2,3,3,2,0,
0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,0,3,3,3,3,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,2,3,3,0,
2,0,1,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,2,3,0,0,0,0,3,3,0,3,1,3,3,3,0,3,3,0,3,3,3,3,0,0,0,0,
2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,0,3,0,3,3,3,3,3,0,3,2,2,2,3,0,2,3,3,3,3,3,2,3,3,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,3,2,2,2,3,3,3,3,0,3,1,3,3,3,3,2,3,3,3,3,3,3,3,2,2,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,2,0,3,0,0,0,3,3,2,3,3,3,3,3,0,0,3,2,3,0,2,3,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,0,3,3,3,3,0,0,3,3,0,2,3,0,3,0,3,3,3,0,0,3,0,3,0,2,2,3,3,0,0,
0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,2,0,3,2,3,3,3,3,0,3,3,3,3,3,0,3,3,2,3,2,3,3,2,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,2,3,2,3,3,3,3,3,3,0,2,3,2,3,2,2,2,3,2,3,3,2,3,0,2,2,2,3,0,
2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,0,0,0,3,3,3,2,3,3,0,0,3,0,3,0,0,0,3,2,0,3,0,3,0,0,2,0,2,0,
0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,0,3,3,3,3,3,3,0,3,3,0,3,0,0,0,3,3,0,3,3,3,0,0,1,2,3,0,
3,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,2,0,0,3,2,2,3,3,0,3,3,3,3,3,2,1,3,0,3,2,3,3,2,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,3,0,2,3,3,3,3,3,3,0,0,3,0,3,0,0,0,3,3,0,3,2,3,0,0,3,3,3,0,
3,0,0,0,2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,0,3,3,3,3,3,3,0,0,3,0,3,0,0,0,3,2,0,3,2,3,0,0,3,2,3,0,
2,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,1,2,2,3,3,3,3,3,3,0,2,3,0,3,0,0,0,3,3,0,3,0,2,0,0,2,3,1,0,
2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,0,3,3,3,3,0,3,0,3,3,2,3,0,3,3,3,3,3,3,0,3,3,3,0,2,3,0,0,3,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,0,3,3,3,0,0,3,0,0,0,3,3,0,3,0,2,3,3,0,0,3,0,3,0,3,3,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,0,0,0,3,3,3,3,3,3,0,0,3,0,2,0,0,0,3,3,0,3,0,3,0,0,2,0,2,0,
0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,3,0,3,0,2,0,3,2,0,3,2,3,2,3,0,0,3,2,3,2,3,3,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,0,0,2,3,3,3,3,3,0,0,0,3,0,2,1,0,0,3,2,2,2,0,3,0,0,2,2,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,0,3,3,3,2,0,3,0,3,0,3,3,0,2,1,2,3,3,0,0,3,0,3,0,3,3,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,2,3,3,3,0,3,3,3,3,3,3,0,2,3,0,3,0,0,0,2,1,0,2,2,3,0,0,2,2,2,0,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,3,0,0,2,3,3,3,2,3,0,0,1,3,0,2,0,0,0,0,3,0,1,0,2,0,0,1,1,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,3,1,0,3,0,0,0,3,2,0,3,2,3,3,3,0,0,3,0,3,2,2,2,1,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,0,3,3,3,0,0,3,0,0,0,0,2,0,2,3,3,2,2,2,2,3,0,2,0,2,2,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,3,3,3,2,0,0,0,0,0,0,2,3,0,2,0,2,3,2,0,0,3,0,3,0,3,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,3,2,3,3,2,2,3,0,2,0,3,0,0,0,2,0,0,0,0,1,2,0,2,0,2,0,
0,2,0,2,0,2,2,0,0,1,0,2,2,2,0,2,2,2,0,2,2,2,0,0,2,0,0,1,0,0,0,0,
0,2,0,3,3,2,0,0,0,0,0,0,1,3,0,2,0,2,2,2,0,0,2,0,3,0,0,2,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,0,2,3,2,0,2,2,0,2,0,2,2,0,2,0,2,2,2,0,0,0,0,0,0,2,3,0,0,0,2,
0,1,2,0,0,0,0,2,2,0,0,0,2,1,0,2,2,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,
0,0,2,1,0,2,3,2,2,3,2,3,2,0,0,3,3,3,0,0,3,2,0,0,0,1,1,0,2,0,2,2,
0,2,0,2,0,2,2,0,0,2,0,2,2,2,0,2,2,2,2,0,0,2,0,0,0,2,0,1,0,0,0,0,
0,3,0,3,3,2,2,0,3,0,0,0,2,2,0,2,2,2,1,2,0,0,1,2,2,0,0,3,0,0,0,2,
0,1,2,0,0,0,1,2,0,0,0,0,0,0,0,2,2,0,1,0,0,2,0,0,0,2,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,2,3,3,2,2,0,0,0,2,0,2,3,3,0,2,0,0,0,0,0,0,2,2,2,0,2,2,0,2,0,2,
0,2,2,0,0,2,2,2,2,1,0,0,2,2,0,2,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,
0,2,0,3,2,3,0,0,0,3,0,0,2,2,0,2,0,2,2,2,0,0,2,0,0,0,0,0,0,0,0,2,
0,0,2,2,0,0,2,2,2,0,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,2,0,0,3,2,0,2,2,2,2,2,0,0,0,2,0,0,0,0,2,0,1,0,0,2,0,1,0,0,0,
0,2,2,2,0,2,2,0,1,2,0,2,2,2,0,2,2,2,2,1,2,2,0,0,2,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,2,0,2,0,2,2,0,0,0,0,1,2,1,0,0,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,3,2,3,0,0,2,0,0,0,2,2,0,2,0,0,0,1,0,0,2,0,2,0,2,2,0,0,0,0,
0,0,2,0,0,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,
0,2,2,3,2,2,0,0,0,0,0,0,1,3,0,2,0,2,2,0,0,0,1,0,2,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,2,0,2,0,3,2,0,2,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
0,0,2,0,0,0,0,1,1,0,0,2,1,2,0,2,2,0,1,0,0,1,0,0,0,2,0,0,0,0,0,0,
0,3,0,2,2,2,0,0,2,0,0,0,2,0,0,0,2,3,0,2,0,0,0,0,0,0,2,2,0,0,0,2,
0,1,2,0,0,0,1,2,2,1,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,2,1,2,0,2,2,0,2,0,0,2,0,0,0,0,1,2,1,0,2,1,0,0,0,0,0,0,0,0,0,0,
0,0,2,0,0,0,3,1,2,2,0,2,0,0,0,0,2,0,0,0,2,0,0,3,0,0,0,0,2,2,2,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,2,1,0,2,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,2,
0,2,2,0,0,2,2,2,2,2,0,1,2,0,0,0,2,2,0,1,0,2,0,0,2,2,0,0,0,0,0,0,
0,0,0,0,1,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0,0,0,0,0,2,0,2,0,0,0,0,2,
0,1,2,0,0,0,0,2,2,1,0,1,0,1,0,2,2,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,
0,2,0,1,2,0,0,0,0,0,0,0,0,0,0,2,0,0,2,2,0,0,0,0,1,0,0,0,0,0,0,2,
0,2,2,0,0,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,
0,2,2,2,2,0,0,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,1,
0,0,2,0,0,0,0,1,2,0,0,0,0,0,0,2,2,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,
0,2,0,2,2,2,0,0,2,0,0,0,0,0,0,0,2,2,2,0,0,0,2,0,0,0,0,0,0,0,0,2,
0,0,1,0,0,0,0,2,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,3,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,2,
0,0,2,0,0,0,0,2,2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,2,0,2,2,1,0,0,0,0,0,0,2,0,0,2,0,2,2,2,0,0,0,0,0,0,2,0,0,0,0,2,
0,0,2,0,0,2,0,2,2,0,0,0,0,2,0,2,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,
0,0,3,0,0,0,2,2,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,
0,2,2,2,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,
0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,2,0,0,0,2,0,0,0,0,0,1,0,0,0,0,2,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,2,0,0,0,
0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,2,0,2,0,0,0,
0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
public GreekModel(byte[] charToOrderMap, string name)
: base(charToOrderMap, GREEK_LANG_MODEL, 0.982851f, false, name)
{
}
}
public class Latin7Model : GreekModel
{
/****************************************************************
255: Control characters that usually does not exist in any text
254: Carriage/Return
253: symbol (punctuation) that does not belong to word
252: 0 - 9
*****************************************************************/
//Character Mapping Table:
private readonly static byte[] LATIN7_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, //40
79,118,105, 83, 67,114,119, 95, 99,109,188,253,253,253,253,253, //50
253, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, //60
78,115, 65, 66, 58, 76,106,103, 87,107,112,253,253,253,253,253, //70
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //80
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //90
+253,233, 90,253,253,253,253,253,253,253,253,253,253, 74,253,253, //a0
253,253,253,253,247,248, 61, 36, 46, 71, 73,253, 54,253,108,123, //b0
110, 31, 51, 43, 41, 34, 91, 40, 52, 47, 44, 53, 38, 49, 59, 39, //c0
35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, //d0
124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, //e0
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, //f0
};
public Latin7Model() : base(LATIN7_CHAR_TO_ORDER_MAP, "ISO-8859-7")
{
}
}
public class Win1253Model : GreekModel
{
private readonly static byte[] WIN1253__CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, //40
79,118,105, 83, 67,114,119, 95, 99,109,188,253,253,253,253,253, //50
253, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, //60
78,115, 65, 66, 58, 76,106,103, 87,107,112,253,253,253,253,253, //70
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //80
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //90
+253,233, 61,253,253,253,253,253,253,253,253,253,253, 74,253,253, //a0
253,253,253,253,247,253,253, 36, 46, 71, 73,253, 54,253,108,123, //b0
110, 31, 51, 43, 41, 34, 91, 40, 52, 47, 44, 53, 38, 49, 59, 39, //c0
35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, //d0
124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, //e0
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, //f0
};
public Win1253Model() : base(WIN1253__CHAR_TO_ORDER_MAP, "windows-1253")
{
}
}
}

View File

@ -1,220 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public abstract class HebrewModel : SequenceModel
{
//Model Table:
//total sequences: 100%
//first 512 sequences: 98.4004%
//first 1024 sequences: 1.5981%
//rest sequences: 0.087%
//negative sequences: 0.0015%
private readonly static byte[] HEBREW_LANG_MODEL = {
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,
1,2,1,2,1,2,0,0,2,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,
1,2,1,3,1,1,0,0,2,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,0,1,2,2,1,3,
1,2,1,1,2,2,0,0,2,2,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,2,2,3,2,
1,2,1,2,2,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,2,2,3,2,2,2,1,2,2,2,2,
1,2,1,1,2,2,0,1,2,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,2,2,2,2,
0,2,0,2,2,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,2,2,
0,2,1,2,2,2,0,0,2,1,0,0,0,0,1,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,2,1,2,3,2,2,2,
1,2,1,2,2,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,
3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,1,0,2,0,2,
0,2,1,2,2,2,0,0,1,2,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,2,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,2,3,2,1,2,1,1,1,
0,1,1,1,1,1,3,0,1,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,0,1,0,0,1,0,0,0,0,
0,0,1,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,
0,2,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,2,3,3,3,2,1,2,3,3,2,3,3,3,3,2,3,2,1,2,0,2,1,2,
0,2,0,2,2,2,0,0,1,2,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,
3,3,3,3,3,3,3,3,3,2,3,3,3,1,2,2,3,3,2,3,2,3,2,2,3,1,2,2,0,2,2,2,
0,2,1,2,2,2,0,0,1,2,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,2,2,3,3,3,3,1,3,2,2,2,
0,2,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,3,2,2,2,1,2,2,0,2,2,2,2,
0,2,0,2,2,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,1,3,2,3,3,2,3,3,2,2,1,2,2,2,2,2,2,
0,2,1,2,1,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,2,3,2,3,3,2,3,3,3,3,2,3,2,3,3,3,3,3,2,2,2,2,2,2,2,1,
0,2,0,1,2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,2,1,2,3,3,3,3,3,3,3,2,3,2,3,2,1,2,3,0,2,1,2,2,
0,2,1,1,2,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,
3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,1,3,1,2,2,2,1,2,3,3,1,2,1,2,2,2,2,
0,1,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,0,2,3,3,3,1,3,3,3,1,2,2,2,2,1,1,2,2,2,2,2,2,
0,2,0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,2,3,3,3,2,2,3,3,3,2,1,2,3,2,3,2,2,2,2,1,2,1,1,1,2,2,
0,2,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,1,0,0,0,0,0,
1,0,1,0,0,0,0,0,2,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,2,3,3,2,3,1,2,2,2,2,3,2,3,1,1,2,2,1,2,2,1,1,0,2,2,2,2,
0,1,0,1,2,2,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
3,0,0,1,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,2,0,
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,1,0,1,0,1,1,0,1,1,0,0,0,1,1,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
3,2,2,1,2,2,2,2,2,2,2,1,2,2,1,2,2,1,1,1,1,1,1,1,1,2,1,1,0,3,3,3,
0,3,0,2,2,2,2,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
2,2,2,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,1,2,2,2,1,1,1,2,0,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,0,2,2,0,0,0,0,0,0,
0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,3,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,1,0,2,1,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
0,3,1,1,2,2,2,2,2,1,2,2,2,1,1,2,2,2,2,2,2,2,1,2,2,1,0,1,1,1,1,0,
0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,2,1,1,1,1,2,1,1,2,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,
0,0,2,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,1,0,0,
2,1,1,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,1,2,1,2,1,1,1,1,0,0,0,0,
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,2,1,2,2,2,2,2,2,2,2,2,2,1,2,1,2,1,1,2,1,1,1,2,1,2,1,2,0,1,0,1,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,3,1,2,2,2,1,2,2,2,2,2,2,2,2,1,2,1,1,1,1,1,1,2,1,2,1,1,0,1,0,1,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,1,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,
0,2,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,1,1,1,1,1,1,1,0,1,1,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,2,0,1,1,1,0,1,0,0,0,1,1,0,1,1,0,0,0,0,0,1,1,0,0,
0,1,1,1,2,1,2,2,2,0,2,0,2,0,1,1,2,1,1,1,1,2,1,0,1,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,1,0,0,0,0,0,1,0,1,2,2,0,1,0,0,1,1,2,2,1,2,0,2,0,0,0,1,2,0,1,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,2,0,2,1,2,0,2,0,0,1,1,1,1,1,1,0,1,0,0,0,1,0,0,1,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,1,0,0,0,0,0,1,0,2,1,1,0,1,0,0,1,1,1,2,2,0,0,1,0,0,0,1,0,0,1,
1,1,2,1,0,1,1,1,0,1,0,1,1,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,2,2,1,
0,2,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,1,0,0,1,0,1,1,1,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,2,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,1,1,0,1,1,0,1,0,0,0,1,1,0,1,
2,0,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,1,1,1,0,1,0,0,1,1,2,1,1,2,0,1,0,0,0,1,1,0,1,
1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,1,1,2,0,1,0,0,0,0,2,1,1,2,0,2,0,0,0,1,1,0,1,
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,2,1,1,0,1,0,0,2,2,1,2,1,1,0,1,0,0,0,1,1,0,1,
2,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,2,2,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,1,0,1,
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,2,2,0,0,0,0,2,1,1,1,0,2,1,1,0,0,0,2,1,0,1,
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,1,1,2,0,1,0,0,1,1,0,2,1,1,0,1,0,0,0,1,1,0,1,
2,2,1,1,1,0,1,1,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,2,1,1,0,1,0,0,1,1,0,1,2,1,0,2,0,0,0,1,1,0,1,
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,
0,1,0,0,2,0,2,1,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,1,1,2,0,1,0,0,1,1,1,0,1,0,0,1,0,0,0,1,0,0,1,
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,2,1,1,1,1,1,0,1,0,0,0,0,1,0,1,
0,1,1,1,2,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
};
public HebrewModel(byte[] charToOrderMap, string name)
: base(charToOrderMap, HEBREW_LANG_MODEL, 0.984004f, false, name)
{
}
}
public class Win1255Model : HebrewModel
{
/*
255: Control characters that usually does not exist in any text
254: Carriage/Return
253: symbol (punctuation) that does not belong to word
252: 0 - 9
*/
//Windows-1255 language model
//Character Mapping Table:
private readonly static byte[] WIN1255_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253, 69, 91, 79, 80, 92, 89, 97, 90, 68,111,112, 82, 73, 95, 85, //40
78,121, 86, 71, 67,102,107, 84,114,103,115,253,253,253,253,253, //50
253, 50, 74, 60, 61, 42, 76, 70, 64, 53,105, 93, 56, 65, 54, 49, //60
66,110, 51, 43, 44, 63, 81, 77, 98, 75,108,253,253,253,253,253, //70
124,202,203,204,205, 40, 58,206,207,208,209,210,211,212,213,214,
215, 83, 52, 47, 46, 72, 32, 94,216,113,217,109,218,219,220,221,
34,116,222,118,100,223,224,117,119,104,125,225,226, 87, 99,227,
106,122,123,228, 55,229,230,101,231,232,120,233, 48, 39, 57,234,
30, 59, 41, 88, 33, 37, 36, 31, 29, 35,235, 62, 28,236,126,237,
238, 38, 45,239,240,241,242,243,127,244,245,246,247,248,249,250,
9, 8, 20, 16, 3, 2, 24, 14, 22, 1, 25, 15, 4, 11, 6, 23,
12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,251,252,128, 96,253,
};
public Win1255Model() : base(WIN1255_CHAR_TO_ORDER_MAP, "windows-1255")
{
}
}
}

View File

@ -1,238 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
namespace UniversalDetector.Core
{
public abstract class HungarianModel : SequenceModel
{
//Model Table:
//total sequences: 100%
//first 512 sequences: 94.7368%
//first 1024 sequences:5.2623%
//rest sequences: 0.8894%
//negative sequences: 0.0009%
private readonly static byte[] HUNGARIAN_LANG_MODEL = {
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,
3,2,1,3,3,3,3,3,2,3,3,3,3,3,1,1,2,3,3,3,3,3,3,3,1,1,3,2,0,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,1,1,2,3,3,3,1,3,3,3,3,3,1,3,3,2,2,0,3,2,3,
0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,3,3,3,2,3,3,2,3,3,3,3,3,2,3,3,2,2,3,2,3,2,0,3,2,2,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,
3,3,3,3,3,3,2,3,3,3,3,3,2,3,3,3,1,2,3,2,2,3,1,2,3,3,2,2,0,3,3,3,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,0,2,3,2,
0,0,0,1,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,3,3,3,3,1,1,1,3,3,2,1,3,2,2,3,2,1,3,2,2,1,0,3,3,1,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,2,2,3,3,3,3,3,1,2,3,3,3,3,1,2,1,3,3,3,3,2,2,3,1,1,3,2,0,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,2,1,3,3,3,3,3,2,2,1,3,3,3,0,1,1,2,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,3,3,3,2,0,3,2,3,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,
3,3,3,3,3,3,2,3,3,3,2,3,2,3,3,3,1,3,2,2,2,3,1,1,3,3,1,1,0,3,3,2,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,3,3,3,3,3,3,2,3,3,3,2,3,2,3,3,3,2,3,3,3,3,3,1,2,3,2,2,0,2,2,2,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,3,3,2,2,2,3,1,3,3,2,2,1,3,3,3,1,1,3,1,2,3,2,3,2,2,2,1,0,2,2,2,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,
3,1,1,3,3,3,3,3,1,2,3,3,3,3,1,2,1,3,3,3,2,2,3,2,1,0,3,2,0,1,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,1,3,3,3,3,3,1,2,3,3,3,3,1,1,0,3,3,3,3,0,2,3,0,0,2,1,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,3,3,3,3,2,2,3,3,2,2,2,2,3,3,0,1,2,3,2,3,2,2,3,2,1,2,0,2,2,2,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,
3,3,3,3,3,3,1,2,3,3,3,2,1,2,3,3,2,2,2,3,2,3,3,1,3,3,1,1,0,2,3,2,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,3,3,1,2,2,2,2,3,3,3,1,1,1,3,3,1,1,3,1,1,3,2,1,2,3,1,1,0,2,2,2,
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,3,3,2,1,2,1,1,3,3,1,1,1,1,3,3,1,1,2,2,1,2,1,1,2,2,1,1,0,2,2,1,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,3,3,1,1,2,1,1,3,3,1,0,1,1,3,3,2,0,1,1,2,3,1,0,2,2,1,0,0,1,3,2,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,2,1,3,3,3,3,3,1,2,3,2,3,3,2,1,1,3,2,3,2,1,2,2,0,1,2,1,0,0,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
3,3,3,3,2,2,2,2,3,1,2,2,1,1,3,3,0,3,2,1,2,3,2,1,3,3,1,1,0,2,1,3,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,3,3,2,2,2,3,2,3,3,3,2,1,1,3,3,1,1,1,2,2,3,2,3,2,2,2,1,0,2,2,1,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
1,0,0,3,3,3,3,3,0,0,3,3,2,3,0,0,0,2,3,3,1,0,1,2,0,0,1,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,2,3,3,3,3,3,1,2,3,3,2,2,1,1,0,3,3,2,2,1,2,2,1,0,2,2,0,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,3,2,2,1,3,1,2,3,3,2,2,1,1,2,2,1,1,1,1,3,2,1,1,1,1,2,1,0,1,2,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
2,3,3,1,1,1,1,1,3,3,3,0,1,1,3,3,1,1,1,1,1,2,2,0,3,1,1,2,0,2,1,1,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
3,1,0,1,2,1,2,2,0,1,2,3,1,2,0,0,0,2,1,1,1,1,1,2,0,0,1,1,0,0,0,0,
1,2,1,2,2,2,1,2,1,2,0,2,0,2,2,1,1,2,1,1,2,1,1,1,0,1,0,0,0,1,1,0,
1,1,1,2,3,2,3,3,0,1,2,2,3,1,0,1,0,2,1,2,2,0,1,1,0,0,1,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,3,3,2,2,1,0,0,3,2,3,2,0,0,0,1,1,3,0,0,1,1,0,0,2,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,1,2,2,3,3,1,0,1,3,2,3,1,1,1,0,1,1,1,1,1,3,1,0,0,2,2,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3,1,1,1,2,2,2,1,0,1,2,3,3,2,0,0,0,2,1,1,1,2,1,1,1,0,1,1,1,0,0,0,
1,2,2,2,2,2,1,1,1,2,0,2,1,1,1,1,1,2,1,1,1,1,1,1,0,1,1,1,0,0,1,1,
3,2,2,1,0,0,1,1,2,2,0,3,0,1,2,1,1,0,0,1,1,1,0,1,1,1,1,0,2,1,1,1,
2,2,1,1,1,2,1,2,1,1,1,1,1,1,1,2,1,1,1,2,3,1,1,1,1,1,1,1,1,1,0,1,
2,3,3,0,1,0,0,0,3,3,1,0,0,1,2,2,1,0,0,0,0,2,0,0,1,1,1,0,2,1,1,1,
2,1,1,1,1,1,1,2,1,1,0,1,1,0,1,1,1,0,1,2,1,1,0,1,1,1,1,1,1,1,0,1,
2,3,3,0,1,0,0,0,2,2,0,0,0,0,1,2,2,0,0,0,0,1,0,0,1,1,0,0,2,0,1,0,
2,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,2,0,1,1,1,1,1,0,1,
3,2,2,0,1,0,1,0,2,3,2,0,0,1,2,2,1,0,0,1,1,1,0,0,2,1,0,1,2,2,1,1,
2,1,1,1,1,1,1,2,1,1,1,1,1,1,0,2,1,0,1,1,0,1,1,1,0,1,1,2,1,1,0,1,
2,2,2,0,0,1,0,0,2,2,1,1,0,0,2,1,1,0,0,0,1,2,0,0,2,1,0,0,2,1,1,1,
2,1,1,1,1,2,1,2,1,1,1,2,2,1,1,2,1,1,1,2,1,1,1,1,1,1,1,1,1,1,0,1,
1,2,3,0,0,0,1,0,3,2,1,0,0,1,2,1,1,0,0,0,0,2,1,0,1,1,0,0,2,1,2,1,
1,1,0,0,0,1,0,1,1,1,1,1,2,0,0,1,0,0,0,2,0,0,1,1,1,1,1,1,1,1,0,1,
3,0,0,2,1,2,2,1,0,0,2,1,2,2,0,0,0,2,1,1,1,0,1,1,0,0,1,1,2,0,0,0,
1,2,1,2,2,1,1,2,1,2,0,1,1,1,1,1,1,1,1,1,2,1,1,0,0,1,1,1,1,0,0,1,
1,3,2,0,0,0,1,0,2,2,2,0,0,0,2,2,1,0,0,0,0,3,1,1,1,1,0,0,2,1,1,1,
2,1,0,1,1,1,0,1,1,1,1,1,1,1,0,2,1,0,0,1,0,1,1,0,1,1,1,1,1,1,0,1,
2,3,2,0,0,0,1,0,2,2,0,0,0,0,2,1,1,0,0,0,0,2,1,0,1,1,0,0,2,1,1,0,
2,1,1,1,1,2,1,2,1,2,0,1,1,1,0,2,1,1,1,2,1,1,1,1,0,1,1,1,1,1,0,1,
3,1,1,2,2,2,3,2,1,1,2,2,1,1,0,1,0,2,2,1,1,1,1,1,0,0,1,1,0,1,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,0,0,0,0,0,2,2,0,0,0,0,2,2,1,0,0,0,1,1,0,0,1,2,0,0,2,1,1,1,
2,2,1,1,1,2,1,2,1,1,0,1,1,1,1,2,1,1,1,2,1,1,1,1,0,1,2,1,1,1,0,1,
1,0,0,1,2,3,2,1,0,0,2,0,1,1,0,0,0,1,1,1,1,0,1,1,0,0,1,0,0,0,0,0,
1,2,1,2,1,2,1,1,1,2,0,2,1,1,1,0,1,2,0,0,1,1,1,0,0,0,0,0,0,0,0,0,
2,3,2,0,0,0,0,0,1,1,2,1,0,0,1,1,1,0,0,0,0,2,0,0,1,1,0,0,2,1,1,1,
2,1,1,1,1,1,1,2,1,0,1,1,1,1,0,2,1,1,1,1,1,1,0,1,0,1,1,1,1,1,0,1,
1,2,2,0,1,1,1,0,2,2,2,0,0,0,3,2,1,0,0,0,1,1,0,0,1,1,0,1,1,1,0,0,
1,1,0,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,0,0,1,1,1,0,1,0,1,
2,1,0,2,1,1,2,2,1,1,2,1,1,1,0,0,0,1,1,0,1,1,1,1,0,0,1,1,1,0,0,0,
1,2,2,2,2,2,1,1,1,2,0,2,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,1,0,
1,2,3,0,0,0,1,0,2,2,0,0,0,0,2,2,0,0,0,0,0,1,0,0,1,0,0,0,2,0,1,0,
2,1,1,1,1,1,0,2,0,0,0,1,2,1,1,1,1,0,1,2,0,1,0,1,0,1,1,1,0,1,0,1,
2,2,2,0,0,0,1,0,2,1,2,0,0,0,1,1,2,0,0,0,0,1,0,0,1,1,0,0,2,1,0,1,
2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,0,1,1,1,1,1,0,1,
1,2,2,0,0,0,1,0,2,2,2,0,0,0,1,1,0,0,0,0,0,1,1,0,2,0,0,1,1,1,0,1,
1,0,1,1,1,1,1,1,0,1,1,1,1,0,0,1,0,0,1,1,0,1,0,1,1,1,1,1,0,0,0,1,
1,0,0,1,0,1,2,1,0,0,1,1,1,2,0,0,0,1,1,0,1,0,1,1,0,0,1,0,0,0,0,0,
0,2,1,2,1,1,1,1,1,2,0,2,0,1,1,0,1,2,1,0,1,1,1,0,0,0,0,0,0,1,0,0,
2,1,1,0,1,2,0,0,1,1,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,2,1,0,1,
2,2,1,1,1,1,1,2,1,1,0,1,1,1,1,2,1,1,1,2,1,1,0,1,0,1,1,1,1,1,0,1,
1,2,2,0,0,0,0,0,1,1,0,0,0,0,2,1,0,0,0,0,0,2,0,0,2,2,0,0,2,0,0,1,
2,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,0,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,
1,1,2,0,0,3,1,0,2,1,1,1,0,0,1,1,1,0,0,0,1,1,0,0,0,1,0,0,1,0,1,0,
1,2,1,0,1,1,1,2,1,1,0,1,1,1,1,1,0,0,0,1,1,1,1,1,0,1,0,0,0,1,0,0,
2,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,2,0,0,0,
2,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,2,1,1,0,0,1,1,1,1,1,0,1,
2,1,1,1,2,1,1,1,0,1,1,2,1,0,0,0,0,1,1,1,1,0,1,0,0,0,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,0,1,1,1,1,1,0,0,1,1,2,1,0,0,0,1,1,0,0,0,1,1,0,0,1,0,1,0,0,0,
1,2,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,1,0,0,
2,0,0,0,1,1,1,1,0,0,1,1,0,0,0,0,0,1,1,1,2,0,0,1,0,0,1,0,1,0,0,0,
0,1,1,1,1,1,1,1,1,2,0,1,1,1,1,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,
1,0,0,1,1,1,1,1,0,0,2,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,
0,1,1,1,1,1,1,0,1,1,0,1,0,1,1,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,
1,0,0,1,1,1,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
0,1,1,1,1,1,0,0,1,1,0,1,0,1,0,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,
0,0,0,1,0,0,0,0,0,0,1,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,1,1,0,1,0,0,1,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,
2,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,0,1,0,0,1,0,1,0,1,1,1,0,0,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,0,0,1,1,1,1,0,0,0,1,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
};
public HungarianModel(byte[] charToOrderMap, string name)
: base(charToOrderMap, HUNGARIAN_LANG_MODEL, 0.947368f,
false, name)
{
}
}
public class Latin2HungarianModel : HungarianModel
{
private readonly static byte[] LATIN2_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253, 28, 40, 54, 45, 32, 50, 49, 38, 39, 53, 36, 41, 34, 35, 47,
46, 71, 43, 33, 37, 57, 48, 64, 68, 55, 52,253,253,253,253,253,
253, 2, 18, 26, 17, 1, 27, 12, 20, 9, 22, 7, 6, 13, 4, 8,
23, 67, 10, 5, 3, 21, 19, 65, 62, 16, 11,253,253,253,253,253,
159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,
175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,
191,192,193,194,195,196,197, 75,198,199,200,201,202,203,204,205,
79,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,
221, 51, 81,222, 78,223,224,225,226, 44,227,228,229, 61,230,231,
232,233,234, 58,235, 66, 59,236,237,238, 60, 69, 63,239,240,241,
82, 14, 74,242, 70, 80,243, 72,244, 15, 83, 77, 84, 30, 76, 85,
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
};
public Latin2HungarianModel() : base(LATIN2_CHAR_TO_ORDER_MAP, "ISO-8859-2")
{
}
}
public class Win1250HungarianModel : HungarianModel
{
private readonly static byte[] WIN1250_CHAR_TO_ORDER_MAP = {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
253, 28, 40, 54, 45, 32, 50, 49, 38, 39, 53, 36, 41, 34, 35, 47,
46, 72, 43, 33, 37, 57, 48, 64, 68, 55, 52,253,253,253,253,253,
253, 2, 18, 26, 17, 1, 27, 12, 20, 9, 22, 7, 6, 13, 4, 8,
23, 67, 10, 5, 3, 21, 19, 65, 62, 16, 11,253,253,253,253,253,
161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,
177,178,179,180, 78,181, 69,182,183,184,185,186,187,188,189,190,
191,192,193,194,195,196,197, 76,198,199,200,201,202,203,204,205,
81,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,
221, 51, 83,222, 80,223,224,225,226, 44,227,228,229, 61,230,231,
232,233,234, 58,235, 66, 59,236,237,238, 60, 70, 63,239,240,241,
84, 14, 75,242, 71, 82,243, 73,244, 15, 85, 79, 86, 30, 77, 87,
245,246,247, 25, 74, 42, 24,248,249,250, 31, 56, 29,251,252,253,
};
public Win1250HungarianModel() : base(WIN1250_CHAR_TO_ORDER_MAP, "windows-1250")
{
}
}
}

Some files were not shown because too many files have changed in this diff Show More