Remove MediaBrowser.Text since it violates licenses and is overall hacky
This commit is contained in:
parent
3a5e3ade01
commit
b35dcbb9f0
|
@ -22,7 +22,6 @@ using System.Collections.Generic;
|
|||
using System.IO;
|
||||
using System.Linq;
|
||||
using MediaBrowser.Model.IO;
|
||||
using MediaBrowser.Model.Text;
|
||||
|
||||
namespace BDInfo
|
||||
{
|
||||
|
@ -72,8 +71,7 @@ namespace BDInfo
|
|||
|
||||
public event OnPlaylistFileScanError PlaylistFileScanError;
|
||||
|
||||
public BDROM(
|
||||
string path, IFileSystem fileSystem, ITextEncoding textEncoding)
|
||||
public BDROM(string path, IFileSystem fileSystem)
|
||||
{
|
||||
if (string.IsNullOrEmpty(path))
|
||||
{
|
||||
|
@ -167,7 +165,7 @@ namespace BDInfo
|
|||
foreach (var file in files)
|
||||
{
|
||||
PlaylistFiles.Add(
|
||||
file.Name.ToUpper(), new TSPlaylistFile(this, file, _fileSystem, textEncoding));
|
||||
file.Name.ToUpper(), new TSPlaylistFile(this, file, _fileSystem));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -187,7 +185,7 @@ namespace BDInfo
|
|||
foreach (var file in files)
|
||||
{
|
||||
StreamClipFiles.Add(
|
||||
file.Name.ToUpper(), new TSStreamClipFile(file, _fileSystem, textEncoding));
|
||||
file.Name.ToUpper(), new TSStreamClipFile(file, _fileSystem));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -22,14 +22,12 @@ using System;
|
|||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using MediaBrowser.Model.IO;
|
||||
using MediaBrowser.Model.Text;
|
||||
|
||||
namespace BDInfo
|
||||
{
|
||||
public class TSPlaylistFile
|
||||
{
|
||||
private readonly IFileSystem _fileSystem;
|
||||
private readonly ITextEncoding _textEncoding;
|
||||
private FileSystemMetadata FileInfo = null;
|
||||
public string FileType = null;
|
||||
public bool IsInitialized = false;
|
||||
|
@ -64,26 +62,22 @@ namespace BDInfo
|
|||
public List<TSGraphicsStream> GraphicsStreams =
|
||||
new List<TSGraphicsStream>();
|
||||
|
||||
public TSPlaylistFile(
|
||||
BDROM bdrom,
|
||||
FileSystemMetadata fileInfo, IFileSystem fileSystem, ITextEncoding textEncoding)
|
||||
public TSPlaylistFile(BDROM bdrom,
|
||||
FileSystemMetadata fileInfo, IFileSystem fileSystem)
|
||||
{
|
||||
BDROM = bdrom;
|
||||
FileInfo = fileInfo;
|
||||
_fileSystem = fileSystem;
|
||||
_textEncoding = textEncoding;
|
||||
Name = fileInfo.Name.ToUpper();
|
||||
}
|
||||
|
||||
public TSPlaylistFile(
|
||||
BDROM bdrom,
|
||||
public TSPlaylistFile(BDROM bdrom,
|
||||
string name,
|
||||
List<TSStreamClip> clips, IFileSystem fileSystem, ITextEncoding textEncoding)
|
||||
List<TSStreamClip> clips, IFileSystem fileSystem)
|
||||
{
|
||||
BDROM = bdrom;
|
||||
Name = name;
|
||||
_fileSystem = fileSystem;
|
||||
_textEncoding = textEncoding;
|
||||
IsCustom = true;
|
||||
foreach (var clip in clips)
|
||||
{
|
||||
|
@ -1245,8 +1239,7 @@ namespace BDInfo
|
|||
int count,
|
||||
ref int pos)
|
||||
{
|
||||
string val =
|
||||
_textEncoding.GetASCIIEncoding().GetString(data, pos, count);
|
||||
string val = Encoding.ASCII.GetString(data, pos, count);
|
||||
|
||||
pos += count;
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
//============================================================================
|
||||
//============================================================================
|
||||
// BDInfo - Blu-ray Video and Audio Analysis Tool
|
||||
// Copyright © 2010 Cinema Squid
|
||||
//
|
||||
|
@ -21,15 +21,14 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Text;
|
||||
using MediaBrowser.Model.IO;
|
||||
using MediaBrowser.Model.Text;
|
||||
|
||||
namespace BDInfo
|
||||
{
|
||||
public class TSStreamClipFile
|
||||
{
|
||||
private readonly IFileSystem _fileSystem;
|
||||
private readonly ITextEncoding _textEncoding;
|
||||
public FileSystemMetadata FileInfo = null;
|
||||
public string FileType = null;
|
||||
public bool IsValid = false;
|
||||
|
@ -38,12 +37,10 @@ namespace BDInfo
|
|||
public Dictionary<ushort, TSStream> Streams =
|
||||
new Dictionary<ushort, TSStream>();
|
||||
|
||||
public TSStreamClipFile(
|
||||
FileSystemMetadata fileInfo, IFileSystem fileSystem, ITextEncoding textEncoding)
|
||||
public TSStreamClipFile(FileSystemMetadata fileInfo, IFileSystem fileSystem)
|
||||
{
|
||||
FileInfo = fileInfo;
|
||||
_fileSystem = fileSystem;
|
||||
_textEncoding = textEncoding;
|
||||
Name = fileInfo.Name.ToUpper();
|
||||
}
|
||||
|
||||
|
@ -69,7 +66,7 @@ namespace BDInfo
|
|||
byte[] fileType = new byte[8];
|
||||
Array.Copy(data, 0, fileType, 0, fileType.Length);
|
||||
|
||||
FileType = _textEncoding.GetASCIIEncoding().GetString(fileType, 0, fileType.Length);
|
||||
FileType = Encoding.ASCII.GetString(fileType, 0, fileType.Length);
|
||||
if (FileType != "HDMV0100" &&
|
||||
FileType != "HDMV0200")
|
||||
{
|
||||
|
@ -165,8 +162,7 @@ namespace BDInfo
|
|||
byte[] languageBytes = new byte[3];
|
||||
Array.Copy(clipData, streamOffset + 3,
|
||||
languageBytes, 0, languageBytes.Length);
|
||||
string languageCode =
|
||||
_textEncoding.GetASCIIEncoding().GetString(languageBytes, 0, languageBytes.Length);
|
||||
string languageCode = Encoding.ASCII.GetString(languageBytes, 0, languageBytes.Length);
|
||||
|
||||
var channelLayout = (TSChannelLayout)
|
||||
(clipData[streamOffset + 2] >> 4);
|
||||
|
@ -196,8 +192,7 @@ namespace BDInfo
|
|||
byte[] languageBytes = new byte[3];
|
||||
Array.Copy(clipData, streamOffset + 2,
|
||||
languageBytes, 0, languageBytes.Length);
|
||||
string languageCode =
|
||||
_textEncoding.GetASCIIEncoding().GetString(languageBytes, 0, languageBytes.Length);
|
||||
string languageCode = Encoding.ASCII.GetString(languageBytes, 0, languageBytes.Length);
|
||||
|
||||
stream = new TSGraphicsStream();
|
||||
stream.LanguageCode = languageCode;
|
||||
|
@ -216,8 +211,7 @@ namespace BDInfo
|
|||
byte[] languageBytes = new byte[3];
|
||||
Array.Copy(clipData, streamOffset + 3,
|
||||
languageBytes, 0, languageBytes.Length);
|
||||
string languageCode =
|
||||
_textEncoding.GetASCIIEncoding().GetString(languageBytes, 0, languageBytes.Length);
|
||||
string languageCode = Encoding.ASCII.GetString(languageBytes, 0, languageBytes.Length);
|
||||
#if DEBUG
|
||||
Debug.WriteLine(string.Format(
|
||||
"\t{0} {1} {2}",
|
||||
|
|
|
@ -99,7 +99,6 @@ using MediaBrowser.Model.Serialization;
|
|||
using MediaBrowser.Model.Services;
|
||||
using MediaBrowser.Model.System;
|
||||
using MediaBrowser.Model.Tasks;
|
||||
using MediaBrowser.Model.Text;
|
||||
using MediaBrowser.Model.Threading;
|
||||
using MediaBrowser.Model.Updates;
|
||||
using MediaBrowser.Model.Xml;
|
||||
|
@ -113,6 +112,7 @@ using ServiceStack;
|
|||
using ServiceStack.Text.Jsv;
|
||||
using StringExtensions = MediaBrowser.Controller.Extensions.StringExtensions;
|
||||
using X509Certificate = System.Security.Cryptography.X509Certificates.X509Certificate;
|
||||
using UtfUnknown;
|
||||
|
||||
namespace Emby.Server.Implementations
|
||||
{
|
||||
|
@ -309,7 +309,6 @@ namespace Emby.Server.Implementations
|
|||
|
||||
private IEncodingManager EncodingManager { get; set; }
|
||||
private IChannelManager ChannelManager { get; set; }
|
||||
protected ITextEncoding TextEncoding { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the user data repository.
|
||||
|
@ -826,9 +825,7 @@ namespace Emby.Server.Implementations
|
|||
StringExtensions.LocalizationManager = LocalizationManager;
|
||||
RegisterSingleInstance(LocalizationManager);
|
||||
|
||||
TextEncoding = new TextEncoding.TextEncoding(FileSystemManager, LoggerFactory.CreateLogger("TextEncoding"), JsonSerializer);
|
||||
RegisterSingleInstance(TextEncoding);
|
||||
BlurayExaminer = new BdInfoExaminer(FileSystemManager, TextEncoding);
|
||||
BlurayExaminer = new BdInfoExaminer(FileSystemManager);
|
||||
RegisterSingleInstance(BlurayExaminer);
|
||||
|
||||
RegisterSingleInstance<IXmlReaderSettingsFactory>(new XmlReaderSettingsFactory());
|
||||
|
@ -873,7 +870,6 @@ namespace Emby.Server.Implementations
|
|||
ServerConfigurationManager,
|
||||
"web/index.html",
|
||||
NetworkManager,
|
||||
TextEncoding,
|
||||
JsonSerializer,
|
||||
XmlSerializer,
|
||||
GetParseFn);
|
||||
|
@ -950,7 +946,7 @@ namespace Emby.Server.Implementations
|
|||
AuthService = new AuthService(UserManager, authContext, ServerConfigurationManager, SessionManager, NetworkManager);
|
||||
RegisterSingleInstance(AuthService);
|
||||
|
||||
SubtitleEncoder = new MediaBrowser.MediaEncoding.Subtitles.SubtitleEncoder(LibraryManager, LoggerFactory.CreateLogger("SubtitleEncoder"), ApplicationPaths, FileSystemManager, MediaEncoder, JsonSerializer, HttpClient, MediaSourceManager, ProcessFactory, TextEncoding);
|
||||
SubtitleEncoder = new MediaBrowser.MediaEncoding.Subtitles.SubtitleEncoder(LibraryManager, LoggerFactory.CreateLogger("SubtitleEncoder"), ApplicationPaths, FileSystemManager, MediaEncoder, JsonSerializer, HttpClient, MediaSourceManager, ProcessFactory);
|
||||
RegisterSingleInstance(SubtitleEncoder);
|
||||
|
||||
RegisterSingleInstance(CreateResourceFileManager());
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
<PackageReference Include="SimpleInjector" Version="4.4.2" />
|
||||
<PackageReference Include="SQLitePCL.pretty.core" Version="1.1.8" />
|
||||
<PackageReference Include="SQLitePCLRaw.core" Version="1.1.11" />
|
||||
<PackageReference Include="UTF.Unknown" Version="1.0.0-beta1" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
@ -43,7 +44,6 @@
|
|||
<EmbeddedResource Include="Localization\countries.json" />
|
||||
<EmbeddedResource Include="Localization\Core\*.json" />
|
||||
<EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\*" />
|
||||
<EmbeddedResource Include="TextEncoding\NLangDetect\Utils\messages.properties" />
|
||||
<EmbeddedResource Include="Localization\Ratings\*.txt" />
|
||||
</ItemGroup>
|
||||
|
||||
|
|
|
@ -19,7 +19,6 @@ using MediaBrowser.Model.Events;
|
|||
using MediaBrowser.Model.Extensions;
|
||||
using MediaBrowser.Model.Serialization;
|
||||
using MediaBrowser.Model.Services;
|
||||
using MediaBrowser.Model.Text;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace Emby.Server.Implementations.HttpServer
|
||||
|
@ -37,11 +36,7 @@ namespace Emby.Server.Implementations.HttpServer
|
|||
|
||||
private readonly IServerConfigurationManager _config;
|
||||
private readonly INetworkManager _networkManager;
|
||||
|
||||
private readonly IServerApplicationHost _appHost;
|
||||
|
||||
private readonly ITextEncoding _textEncoding;
|
||||
|
||||
private readonly IJsonSerializer _jsonSerializer;
|
||||
private readonly IXmlSerializer _xmlSerializer;
|
||||
private readonly Func<Type, Func<string, object>> _funcParseFn;
|
||||
|
@ -60,7 +55,6 @@ namespace Emby.Server.Implementations.HttpServer
|
|||
IServerConfigurationManager config,
|
||||
string defaultRedirectPath,
|
||||
INetworkManager networkManager,
|
||||
ITextEncoding textEncoding,
|
||||
IJsonSerializer jsonSerializer,
|
||||
IXmlSerializer xmlSerializer,
|
||||
Func<Type, Func<string, object>> funcParseFn)
|
||||
|
@ -70,7 +64,6 @@ namespace Emby.Server.Implementations.HttpServer
|
|||
_config = config;
|
||||
DefaultRedirectPath = defaultRedirectPath;
|
||||
_networkManager = networkManager;
|
||||
_textEncoding = textEncoding;
|
||||
_jsonSerializer = jsonSerializer;
|
||||
_xmlSerializer = xmlSerializer;
|
||||
_funcParseFn = funcParseFn;
|
||||
|
@ -147,7 +140,7 @@ namespace Emby.Server.Implementations.HttpServer
|
|||
return;
|
||||
}
|
||||
|
||||
var connection = new WebSocketConnection(e.WebSocket, e.Endpoint, _jsonSerializer, _logger, _textEncoding)
|
||||
var connection = new WebSocketConnection(e.WebSocket, e.Endpoint, _jsonSerializer, _logger)
|
||||
{
|
||||
OnReceive = ProcessWebSocketMessageReceived,
|
||||
Url = e.Url,
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
using System;
|
||||
using System;
|
||||
using System.Net.WebSockets;
|
||||
using System.Text;
|
||||
using System.Threading;
|
||||
|
@ -8,8 +8,8 @@ using MediaBrowser.Controller.Net;
|
|||
using MediaBrowser.Model.Net;
|
||||
using MediaBrowser.Model.Serialization;
|
||||
using MediaBrowser.Model.Services;
|
||||
using MediaBrowser.Model.Text;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using UtfUnknown;
|
||||
|
||||
namespace Emby.Server.Implementations.HttpServer
|
||||
{
|
||||
|
@ -68,7 +68,6 @@ namespace Emby.Server.Implementations.HttpServer
|
|||
/// </summary>
|
||||
/// <value>The query string.</value>
|
||||
public QueryParamCollection QueryString { get; set; }
|
||||
private readonly ITextEncoding _textEncoding;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new instance of the <see cref="WebSocketConnection" /> class.
|
||||
|
@ -78,7 +77,7 @@ namespace Emby.Server.Implementations.HttpServer
|
|||
/// <param name="jsonSerializer">The json serializer.</param>
|
||||
/// <param name="logger">The logger.</param>
|
||||
/// <exception cref="ArgumentNullException">socket</exception>
|
||||
public WebSocketConnection(IWebSocket socket, string remoteEndPoint, IJsonSerializer jsonSerializer, ILogger logger, ITextEncoding textEncoding)
|
||||
public WebSocketConnection(IWebSocket socket, string remoteEndPoint, IJsonSerializer jsonSerializer, ILogger logger)
|
||||
{
|
||||
if (socket == null)
|
||||
{
|
||||
|
@ -110,7 +109,6 @@ namespace Emby.Server.Implementations.HttpServer
|
|||
|
||||
RemoteEndPoint = remoteEndPoint;
|
||||
_logger = logger;
|
||||
_textEncoding = textEncoding;
|
||||
|
||||
socket.Closed += socket_Closed;
|
||||
}
|
||||
|
@ -132,8 +130,7 @@ namespace Emby.Server.Implementations.HttpServer
|
|||
{
|
||||
return;
|
||||
}
|
||||
|
||||
var charset = _textEncoding.GetDetectedEncodingName(bytes, bytes.Length, null, false);
|
||||
var charset = CharsetDetector.DetectFromBytes(bytes).Detected?.EncodingName;
|
||||
|
||||
if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
|
@ -141,7 +138,7 @@ namespace Emby.Server.Implementations.HttpServer
|
|||
}
|
||||
else
|
||||
{
|
||||
OnReceiveInternal(_textEncoding.GetASCIIEncoding().GetString(bytes, 0, bytes.Length));
|
||||
OnReceiveInternal(Encoding.ASCII.GetString(bytes, 0, bytes.Length));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -161,7 +158,7 @@ namespace Emby.Server.Implementations.HttpServer
|
|||
|
||||
var bytes = memory.Slice(0, length).ToArray();
|
||||
|
||||
var charset = _textEncoding.GetDetectedEncodingName(bytes, bytes.Length, null, false);
|
||||
var charset = CharsetDetector.DetectFromBytes(bytes).Detected?.EncodingName;
|
||||
|
||||
if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
|
@ -169,7 +166,7 @@ namespace Emby.Server.Implementations.HttpServer
|
|||
}
|
||||
else
|
||||
{
|
||||
OnReceiveInternal(_textEncoding.GetASCIIEncoding().GetString(bytes, 0, bytes.Length));
|
||||
OnReceiveInternal(Encoding.ASCII.GetString(bytes, 0, bytes.Length));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,371 +0,0 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
using NLangDetect.Core.Extensions;
|
||||
using NLangDetect.Core.Utils;
|
||||
|
||||
namespace NLangDetect.Core
|
||||
{
|
||||
public class Detector
|
||||
{
|
||||
private const double _AlphaDefault = 0.5;
|
||||
private const double _AlphaWidth = 0.05;
|
||||
|
||||
private const int _IterationLimit = 1000;
|
||||
private const double _ProbThreshold = 0.1;
|
||||
private const double _ConvThreshold = 0.99999;
|
||||
private const int _BaseFreq = 10000;
|
||||
|
||||
private static readonly Regex _UrlRegex = new Regex("https?://[-_.?&~;+=/#0-9A-Za-z]+", RegexOptions.Compiled);
|
||||
private static readonly Regex _MailRegex = new Regex("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+", RegexOptions.Compiled);
|
||||
|
||||
private readonly Dictionary<string, ProbVector> _wordLangProbMap;
|
||||
private readonly List<string> _langlist;
|
||||
|
||||
private StringBuilder _text;
|
||||
private double[] _langprob;
|
||||
|
||||
private double _alpha = _AlphaDefault;
|
||||
private const int _trialsCount = 7;
|
||||
private int _maxTextLength = 10000;
|
||||
private double[] _priorMap;
|
||||
private int? _seed;
|
||||
|
||||
#region Constructor(s)
|
||||
|
||||
/// <summary>
/// Creates a detector backed by the data currently held by <paramref name="factory"/>.
/// The word/language probability table and the language list are shared by
/// reference with the factory; the text buffer starts empty.
/// </summary>
/// <param name="factory">Factory supplying the probability table, language list and optional seed.</param>
public Detector(DetectorFactory factory)
{
    _text = new StringBuilder();

    _seed = factory.Seed;
    _langlist = factory.Langlist;
    _wordLangProbMap = factory.WordLangProbMap;
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Public methods
|
||||
|
||||
/// <summary>
/// Replaces the base alpha value. Each detection trial perturbs this value
/// with Gaussian noise before using it to weight probability updates.
/// </summary>
/// <param name="alpha">New base alpha.</param>
public void SetAlpha(double alpha) => _alpha = alpha;
|
||||
|
||||
/// <summary>
/// Installs per-language prior probabilities, normalised so that they sum to 1.
/// Languages of this detector that are missing from <paramref name="priorMap"/>
/// receive a prior of zero.
/// </summary>
/// <param name="priorMap">Language name to (non-negative) prior weight.</param>
/// <exception cref="NLangDetectException">
/// A weight is negative, or no weight is greater than zero. In both cases the
/// previously installed priors are left unchanged.
/// </exception>
public void SetPriorMap(Dictionary<string, double> priorMap)
{
    // Work on a local array: assigning the field only after validation succeeds
    // keeps the detector usable if this call throws half-way through.
    var priors = new double[_langlist.Count];
    double total = 0;

    for (int i = 0; i < priors.Length; i++)
    {
        double p;

        // Single lookup instead of ContainsKey followed by the indexer.
        if (!priorMap.TryGetValue(_langlist[i], out p))
        {
            continue;
        }

        if (p < 0)
        {
            throw new NLangDetectException("Prior probability must be non-negative.", ErrorCode.InitParamError);
        }

        priors[i] = p;
        total += p;
    }

    if (total <= 0)
    {
        throw new NLangDetectException("More one of prior probability must be non-zero.", ErrorCode.InitParamError);
    }

    for (int i = 0; i < priors.Length; i++)
    {
        priors[i] /= total;
    }

    _priorMap = priors;
}
|
||||
|
||||
/// <summary>
/// Sets the length limit used by the Append overloads: at most this many
/// characters of each appended string are examined, and stream reading stops
/// once the buffered text reaches this length.
/// </summary>
/// <param name="max_text_length">New limit, in characters.</param>
public void SetMaxTextLength(int max_text_length) => _maxTextLength = max_text_length;
|
||||
|
||||
// TODO IMM HI: TextReader?
|
||||
/// <summary>
/// Reads text from <paramref name="streamReader"/> in chunks and appends it
/// until the buffered text reaches the maximum text length or the stream ends.
/// </summary>
/// <param name="streamReader">Source of the text to analyse.</param>
public void Append(StreamReader streamReader)
{
    // The chunk must hold at least one char: with a zero-length buffer
    // Read() returns 0 forever and the loop below would never terminate
    // (this happened when the maximum text length was 1).
    var buf = new char[Math.Max(1, _maxTextLength / 2)];

    while (_text.Length < _maxTextLength && !streamReader.EndOfStream)
    {
        int length = streamReader.Read(buf, 0, buf.Length);

        if (length <= 0)
        {
            // Nothing was read; stop instead of spinning.
            break;
        }

        Append(new string(buf, 0, length));
    }
}
|
||||
|
||||
/// <summary>
/// Appends <paramref name="text"/> to the buffer after replacing URLs and
/// mail addresses with a single space, normalising every character through
/// <c>NGram.Normalize</c> and collapsing runs of spaces into one.
/// Only the first <c>_maxTextLength</c> characters of the cleaned string are examined.
/// </summary>
/// <param name="text">Raw text to add.</param>
public void Append(string text)
{
    string cleaned = _MailRegex.Replace(_UrlRegex.Replace(text, " "), " ");

    // The "previous character" is tracked per call, not across calls.
    char previous = '\0';

    for (int pos = 0; pos < cleaned.Length && pos < _maxTextLength; pos++)
    {
        char current = NGram.Normalize(cleaned[pos]);
        bool repeatedSpace = current == ' ' && previous == ' ';

        if (!repeatedSpace)
        {
            _text.Append(current);
        }

        previous = current;
    }
}
|
||||
|
||||
/// <summary>
/// Drops the characters in the range 'A'..'z' from the buffered text when they
/// are clearly outnumbered by non-Latin characters (twice the Latin count is
/// still below the non-Latin count). Otherwise the text is left untouched.
/// </summary>
/// <remarks>
/// The 'A'..'z' range is a plain code-point range, so it also covers the six
/// punctuation characters that sit between 'Z' and 'a'.
/// </remarks>
private void CleanText()
{
    string snapshot = _text.ToString();
    int latin = 0;
    int nonLatin = 0;

    foreach (char ch in snapshot)
    {
        if (ch >= 'A' && ch <= 'z')
        {
            latin++;
            continue;
        }

        // Characters from U+0300 upwards count as non-Latin, except the
        // Latin Extended Additional block.
        if (ch >= '\u0300' && ch.GetUnicodeBlock() != UnicodeBlock.LatinExtendedAdditional)
        {
            nonLatin++;
        }
    }

    if (latin * 2 >= nonLatin)
    {
        return;
    }

    var filtered = new StringBuilder();

    foreach (char ch in snapshot)
    {
        if (ch < 'A' || ch > 'z')
        {
            filtered.Append(ch);
        }
    }

    _text = filtered;
}
|
||||
|
||||
/// <summary>
/// Returns the name of the most probable language for the appended text.
/// </summary>
/// <returns>
/// The top-ranked language name, or <c>null</c> when no language passes the
/// probability threshold.
/// </returns>
public string Detect()
{
    List<Language> ranked = GetProbabilities();

    if (ranked.Count > 0)
    {
        return ranked[0].Name;
    }

    return null;
}
|
||||
|
||||
/// <summary>
/// Returns the candidate languages ordered by descending probability.
/// The detection itself runs only on the first call; later calls reuse the
/// stored probabilities and merely rebuild the sorted list.
/// </summary>
/// <returns>Languages whose probability exceeds the threshold, best first.</returns>
public List<Language> GetProbabilities()
{
    bool notYetDetected = _langprob == null;

    if (notYetDetected)
    {
        DetectBlock();
    }

    return SortProbability(_langprob);
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Private helper methods
|
||||
|
||||
/// <summary>
/// Scales <paramref name="probs"/> in place so that its entries are divided
/// by their sum, and reports the largest scaled entry.
/// </summary>
/// <param name="probs">Values to normalise; modified in place.</param>
/// <returns>The largest normalised value, or 0 when none is greater than 0.</returns>
private static double NormalizeProb(double[] probs)
{
    double total = 0;
    total += probs.Sum();

    double largest = 0;

    for (int idx = 0; idx < probs.Length; idx++)
    {
        double scaled = probs[idx] / total;

        probs[idx] = scaled;

        if (scaled > largest)
        {
            largest = scaled;
        }
    }

    return largest;
}
|
||||
|
||||
/// <summary>
/// Returns <paramref name="word"/> with every character at or above U+0080
/// replaced by a <c>\u</c> escape built from its hexadecimal code; characters
/// below U+0080 are copied unchanged.
/// </summary>
/// <param name="word">Text to encode.</param>
/// <returns>The escaped text.</returns>
private static string UnicodeEncode(string word)
{
    var resultSb = new StringBuilder();

    foreach (char ch in word)
    {
        if (ch < '\u0080')
        {
            resultSb.Append(ch);
            continue;
        }

        // 0x10000 + ch lies in 0x10080..0x1FFFF here, so the hex string always
        // has exactly five digits. The former "pad to four digits" loop could
        // never run and has been removed.
        string st = string.Format("{0:x}", 0x10000 + ch);

        // NOTE(review): SubSequence(1, 5) presumably follows Java's
        // [start, end) convention and drops the leading '1' - confirm
        // against the extension method.
        resultSb
            .Append("\\u")
            .Append(st.SubSequence(1, 5));
    }

    return resultSb.ToString();
}
|
||||
|
||||
/// <summary>
/// Runs the detection: cleans the text, extracts the known n-grams and
/// averages the language probabilities obtained from <c>_trialsCount</c>
/// randomised trials into <c>_langprob</c>.
/// </summary>
/// <exception cref="NLangDetectException">The text contains no known n-gram.</exception>
private void DetectBlock()
{
    CleanText();

    List<string> features = ExtractNGrams();

    if (features.Count == 0)
    {
        throw new NLangDetectException("no features in text", ErrorCode.CantDetectError);
    }

    _langprob = new double[_langlist.Count];

    // A fixed seed makes the trials reproducible.
    Random rng = _seed.HasValue ? new Random(_seed.Value) : new Random();

    for (int trial = 0; trial < _trialsCount; trial++)
    {
        double[] trialProb = InitProbability();

        // TODO IMM HI: verify it works
        double trialAlpha = _alpha + rng.NextGaussian() * _AlphaWidth;

        int step = 0;

        while (true)
        {
            string feature = features[rng.Next(features.Count)];

            UpdateLangProb(trialProb, feature, trialAlpha);

            // Every fifth step: normalise, then stop on convergence or
            // once the iteration limit has been reached.
            if (step % 5 == 0
                && (NormalizeProb(trialProb) > _ConvThreshold || step >= _IterationLimit))
            {
                break;
            }

            step++;
        }

        for (int lang = 0; lang < _langprob.Length; lang++)
        {
            _langprob[lang] += trialProb[lang] / _trialsCount;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Builds the starting probability vector for one trial: a copy of the
/// configured priors when present, otherwise a uniform distribution over
/// all languages.
/// </summary>
/// <returns>A new array with one entry per language.</returns>
private double[] InitProbability()
{
    int languageCount = _langlist.Count;
    var initial = new double[languageCount];

    if (_priorMap == null)
    {
        double uniform = 1.0 / languageCount;

        for (int idx = 0; idx < initial.Length; idx++)
        {
            initial[idx] = uniform;
        }

        return initial;
    }

    for (int idx = 0; idx < initial.Length; idx++)
    {
        initial[idx] = _priorMap[idx];
    }

    return initial;
}
|
||||
|
||||
/// <summary>
/// Feeds the buffered text character by character into an <c>NGram</c> and
/// collects every n-gram (sizes 1 to <c>NGram.GramsCount</c>) that is present
/// in the word/language probability table.
/// </summary>
/// <returns>The known n-grams in order of appearance, duplicates included.</returns>
private List<string> ExtractNGrams()
{
    var known = new List<string>();
    var window = new NGram();

    foreach (char ch in _text.ToString())
    {
        window.AddChar(ch);

        for (int size = 1; size <= NGram.GramsCount; size++)
        {
            string gram = window.Get(size);

            if (gram == null)
            {
                continue;
            }

            if (!_wordLangProbMap.ContainsKey(gram))
            {
                continue;
            }

            known.Add(gram);
        }
    }

    return known;
}
|
||||
|
||||
/// <summary>
/// Multiplies each language probability in <paramref name="prob"/> by the
/// probability of <paramref name="word"/> for that language plus a small
/// weight derived from <paramref name="alpha"/>. Unknown or null words leave
/// <paramref name="prob"/> unchanged.
/// </summary>
/// <param name="prob">Per-language probabilities; updated in place.</param>
/// <param name="word">N-gram to apply.</param>
/// <param name="alpha">Alpha for this trial; divided by <c>_BaseFreq</c> to form the weight.</param>
private void UpdateLangProb(double[] prob, string word, double alpha)
{
    ProbVector langProbMap;

    // TryGetValue rejects a null key, so the null test must come first.
    // One lookup replaces the former ContainsKey + indexer pair; this method
    // runs once per iteration of every trial.
    if (word == null || !_wordLangProbMap.TryGetValue(word, out langProbMap))
    {
        return;
    }

    double weight = alpha / _BaseFreq;

    for (int i = 0; i < prob.Length; i++)
    {
        prob[i] *= weight + langProbMap[i];
    }
}
|
||||
|
||||
/// <summary>
/// Builds the list of languages whose probability is above
/// <c>_ProbThreshold</c>, ordered from most to least probable. Languages with
/// equal probability keep their original relative order.
/// </summary>
/// <param name="prob">Probability per language, indexed like the language list.</param>
/// <returns>The ranked languages; empty when none passes the threshold.</returns>
private List<Language> SortProbability(double[] prob)
{
    var ranked = new List<Language>();

    for (int lang = 0; lang < prob.Length; lang++)
    {
        double score = prob[lang];

        if (!(score > _ProbThreshold))
        {
            continue;
        }

        // Insert before the first entry that is strictly less probable,
        // or at the end when there is none.
        int slot = ranked.FindIndex(entry => entry.Probability < score);

        if (slot < 0)
        {
            slot = ranked.Count;
        }

        ranked.Insert(slot, new Language(_langlist[lang], score));
    }

    return ranked;
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
}
|
|
@ -1,125 +0,0 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using MediaBrowser.Model.Serialization;
|
||||
using NLangDetect.Core.Utils;
|
||||
|
||||
namespace NLangDetect.Core
|
||||
{
|
||||
public class DetectorFactory
|
||||
{
|
||||
public Dictionary<string, ProbVector> WordLangProbMap;
|
||||
public List<string> Langlist;
|
||||
|
||||
private static readonly DetectorFactory _instance = new DetectorFactory();
|
||||
|
||||
#region Constructor(s)
|
||||
|
||||
/// <summary>
/// Creates the factory with an empty language list and an empty
/// word/language probability table. Private: the only instance is the
/// static <c>_instance</c>.
/// </summary>
private DetectorFactory()
{
    Langlist = new List<string>();
    WordLangProbMap = new Dictionary<string, ProbVector>();
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Public methods
|
||||
|
||||
/// <summary>
/// Loads every embedded resource of this assembly whose name contains
/// "NLangDetect.Profiles", deserialises it as a <c>LangProfile</c> and
/// registers it with a running index.
/// </summary>
/// <param name="json">Serializer used to read the profile resources.</param>
public static void LoadProfiles(IJsonSerializer json)
{
    var assembly = typeof(DetectorFactory).Assembly;
    var profileIndex = 0;

    foreach (var resourceName in assembly.GetManifestResourceNames())
    {
        if (resourceName.IndexOf("NLangDetect.Profiles", StringComparison.Ordinal) == -1)
        {
            continue;
        }

        using (var stream = assembly.GetManifestResourceStream(resourceName))
        {
            var langProfile = (LangProfile)json.DeserializeFromStream(stream, typeof(LangProfile));

            AddProfile(langProfile, profileIndex);
        }

        profileIndex++;
    }
}
|
||||
|
||||
/// <summary>
/// Creates a detector that uses the loaded profiles and the default alpha.
/// </summary>
/// <returns>A new detector.</returns>
/// <exception cref="NLangDetectException">No profile has been loaded.</exception>
public static Detector Create() => CreateDetector();
|
||||
|
||||
/// <summary>
/// Creates a detector that uses the loaded profiles and the given alpha.
/// </summary>
/// <param name="alpha">Base alpha passed to <c>Detector.SetAlpha</c>.</param>
/// <returns>A new detector.</returns>
/// <exception cref="NLangDetectException">No profile has been loaded.</exception>
public static Detector Create(double alpha)
{
    Detector created = CreateDetector();
    created.SetAlpha(alpha);
    return created;
}
|
||||
|
||||
/// <summary>
/// Sets the random seed handed to detectors created afterwards;
/// <c>null</c> removes the seed.
/// </summary>
/// <param name="seed">Seed value, or <c>null</c> for none.</param>
public static void SetSeed(int? seed) => _instance.Seed = seed;
|
||||
|
||||
#endregion
|
||||
|
||||
#region Internal methods
|
||||
|
||||
/// <summary>
/// Registers a language profile: adds its name to the language list and
/// stores, for every word of the profile, the word's relative frequency at
/// position <paramref name="index"/> of that word's probability vector.
/// </summary>
/// <param name="profile">Profile to register.</param>
/// <param name="index">Position of this language inside each probability vector.</param>
/// <exception cref="NLangDetectException">A profile with the same name is already registered.</exception>
internal static void AddProfile(LangProfile profile, int index)
{
    var lang = profile.name;

    if (_instance.Langlist.Contains(lang))
    {
        throw new NLangDetectException("duplicate the same language profile", ErrorCode.DuplicateLangError);
    }

    _instance.Langlist.Add(lang);

    foreach (string word in profile.freq.Keys)
    {
        // Fetch or create the vector with a single lookup instead of
        // ContainsKey + Add + indexer.
        ProbVector vector;

        if (!_instance.WordLangProbMap.TryGetValue(word, out vector))
        {
            vector = new ProbVector();
            _instance.WordLangProbMap.Add(word, vector);
        }

        // n_words is indexed by word length - 1. Without this guard an empty
        // or over-long key aborted loading with an index error after the
        // language had already been added to the list.
        // NOTE(review): assumes n_words holds NGram.GramsCount entries - confirm
        // against LangProfile.
        int length = word.Length;

        if (length < 1 || length > NGram.GramsCount)
        {
            continue;
        }

        double prob = (double)profile.freq[word] / profile.n_words[length - 1];

        vector[index] = prob;
    }
}
|
||||
|
||||
/// <summary>
/// Removes all registered languages and all word probabilities from the
/// shared factory instance.
/// </summary>
internal static void Clear()
{
    _instance.WordLangProbMap.Clear();
    _instance.Langlist.Clear();
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Private helper methods
|
||||
|
||||
/// <summary>
/// Creates a detector bound to the shared factory instance.
/// </summary>
/// <returns>A new detector.</returns>
/// <exception cref="NLangDetectException">The language list is empty, i.e. no profile has been loaded.</exception>
private static Detector CreateDetector()
{
    if (_instance.Langlist.Count != 0)
    {
        return new Detector(_instance);
    }

    throw new NLangDetectException("need to load profiles", ErrorCode.NeedLoadProfileError);
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Properties
|
||||
|
||||
public int? Seed { get; private set; }
|
||||
|
||||
#endregion
|
||||
}
|
||||
}
|
|
@ -1,15 +0,0 @@
|
|||
namespace NLangDetect.Core
|
||||
{
|
||||
/// <summary>
/// Error categories carried by <c>NLangDetectException</c>.
/// The numeric values are written out explicitly and match the implicit
/// ordering the enum had before.
/// </summary>
public enum ErrorCode
{
    NoTextError = 0,
    FormatError = 1,
    FileLoadError = 2,

    // Raised when a language profile with an already registered name is added.
    DuplicateLangError = 3,

    // Raised when a detector is requested before any profile has been loaded.
    NeedLoadProfileError = 4,

    // Raised when the text yields no known n-gram.
    CantDetectError = 5,

    CantOpenTrainData = 6,
    TrainDataFormatError = 7,

    // Raised for invalid prior probabilities.
    InitParamError = 8,
}
|
||||
}
|
|
@ -1,374 +0,0 @@
|
|||
using System;
|
||||
|
||||
namespace NLangDetect.Core.Extensions
|
||||
{
|
||||
public static class CharExtensions
|
||||
{
|
||||
private const int MIN_CODE_POINT = 0x000000;
|
||||
private const int MAX_CODE_POINT = 0x10ffff;
|
||||
|
||||
private static readonly int[] _unicodeBlockStarts =
|
||||
{
|
||||
#region Unicode block starts
|
||||
|
||||
0x0000, // Basic Latin
|
||||
0x0080, // Latin-1 Supplement
|
||||
0x0100, // Latin Extended-A
|
||||
0x0180, // Latin Extended-B
|
||||
0x0250, // IPA Extensions
|
||||
0x02B0, // Spacing Modifier Letters
|
||||
0x0300, // Combining Diacritical Marks
|
||||
0x0370, // Greek and Coptic
|
||||
0x0400, // Cyrillic
|
||||
0x0500, // Cyrillic Supplementary
|
||||
0x0530, // Armenian
|
||||
0x0590, // Hebrew
|
||||
0x0600, // Arabic
|
||||
0x0700, // Syriac
|
||||
0x0750, // unassigned
|
||||
0x0780, // Thaana
|
||||
0x07C0, // unassigned
|
||||
0x0900, // Devanagari
|
||||
0x0980, // Bengali
|
||||
0x0A00, // Gurmukhi
|
||||
0x0A80, // Gujarati
|
||||
0x0B00, // Oriya
|
||||
0x0B80, // Tamil
|
||||
0x0C00, // Telugu
|
||||
0x0C80, // Kannada
|
||||
0x0D00, // Malayalam
|
||||
0x0D80, // Sinhala
|
||||
0x0E00, // Thai
|
||||
0x0E80, // Lao
|
||||
0x0F00, // Tibetan
|
||||
0x1000, // Myanmar
|
||||
0x10A0, // Georgian
|
||||
0x1100, // Hangul Jamo
|
||||
0x1200, // Ethiopic
|
||||
0x1380, // unassigned
|
||||
0x13A0, // Cherokee
|
||||
0x1400, // Unified Canadian Aboriginal Syllabics
|
||||
0x1680, // Ogham
|
||||
0x16A0, // Runic
|
||||
0x1700, // Tagalog
|
||||
0x1720, // Hanunoo
|
||||
0x1740, // Buhid
|
||||
0x1760, // Tagbanwa
|
||||
0x1780, // Khmer
|
||||
0x1800, // Mongolian
|
||||
0x18B0, // unassigned
|
||||
0x1900, // Limbu
|
||||
0x1950, // Tai Le
|
||||
0x1980, // unassigned
|
||||
0x19E0, // Khmer Symbols
|
||||
0x1A00, // unassigned
|
||||
0x1D00, // Phonetic Extensions
|
||||
0x1D80, // unassigned
|
||||
0x1E00, // Latin Extended Additional
|
||||
0x1F00, // Greek Extended
|
||||
0x2000, // General Punctuation
|
||||
0x2070, // Superscripts and Subscripts
|
||||
0x20A0, // Currency Symbols
|
||||
0x20D0, // Combining Diacritical Marks for Symbols
|
||||
0x2100, // Letterlike Symbols
|
||||
0x2150, // Number Forms
|
||||
0x2190, // Arrows
|
||||
0x2200, // Mathematical Operators
|
||||
0x2300, // Miscellaneous Technical
|
||||
0x2400, // Control Pictures
|
||||
0x2440, // Optical Character Recognition
|
||||
0x2460, // Enclosed Alphanumerics
|
||||
0x2500, // Box Drawing
|
||||
0x2580, // Block Elements
|
||||
0x25A0, // Geometric Shapes
|
||||
0x2600, // Miscellaneous Symbols
|
||||
0x2700, // Dingbats
|
||||
0x27C0, // Miscellaneous Mathematical Symbols-A
|
||||
0x27F0, // Supplemental Arrows-A
|
||||
0x2800, // Braille Patterns
|
||||
0x2900, // Supplemental Arrows-B
|
||||
0x2980, // Miscellaneous Mathematical Symbols-B
|
||||
0x2A00, // Supplemental Mathematical Operators
|
||||
0x2B00, // Miscellaneous Symbols and Arrows
|
||||
0x2C00, // unassigned
|
||||
0x2E80, // CJK Radicals Supplement
|
||||
0x2F00, // Kangxi Radicals
|
||||
0x2FE0, // unassigned
|
||||
0x2FF0, // Ideographic Description Characters
|
||||
0x3000, // CJK Symbols and Punctuation
|
||||
0x3040, // Hiragana
|
||||
0x30A0, // Katakana
|
||||
0x3100, // Bopomofo
|
||||
0x3130, // Hangul Compatibility Jamo
|
||||
0x3190, // Kanbun
|
||||
0x31A0, // Bopomofo Extended
|
||||
0x31C0, // unassigned
|
||||
0x31F0, // Katakana Phonetic Extensions
|
||||
0x3200, // Enclosed CJK Letters and Months
|
||||
0x3300, // CJK Compatibility
|
||||
0x3400, // CJK Unified Ideographs Extension A
|
||||
0x4DC0, // Yijing Hexagram Symbols
|
||||
0x4E00, // CJK Unified Ideographs
|
||||
0xA000, // Yi Syllables
|
||||
0xA490, // Yi Radicals
|
||||
0xA4D0, // unassigned
|
||||
0xAC00, // Hangul Syllables
|
||||
0xD7B0, // unassigned
|
||||
0xD800, // High Surrogates
|
||||
0xDB80, // High Private Use Surrogates
|
||||
0xDC00, // Low Surrogates
|
||||
0xE000, // Private Use
|
||||
0xF900, // CJK Compatibility Ideographs
|
||||
0xFB00, // Alphabetic Presentation Forms
|
||||
0xFB50, // Arabic Presentation Forms-A
|
||||
0xFE00, // Variation Selectors
|
||||
0xFE10, // unassigned
|
||||
0xFE20, // Combining Half Marks
|
||||
0xFE30, // CJK Compatibility Forms
|
||||
0xFE50, // Small Form Variants
|
||||
0xFE70, // Arabic Presentation Forms-B
|
||||
0xFF00, // Halfwidth and Fullwidth Forms
|
||||
0xFFF0, // Specials
|
||||
0x10000, // Linear B Syllabary
|
||||
0x10080, // Linear B Ideograms
|
||||
0x10100, // Aegean Numbers
|
||||
0x10140, // unassigned
|
||||
0x10300, // Old Italic
|
||||
0x10330, // Gothic
|
||||
0x10350, // unassigned
|
||||
0x10380, // Ugaritic
|
||||
0x103A0, // unassigned
|
||||
0x10400, // Deseret
|
||||
0x10450, // Shavian
|
||||
0x10480, // Osmanya
|
||||
0x104B0, // unassigned
|
||||
0x10800, // Cypriot Syllabary
|
||||
0x10840, // unassigned
|
||||
0x1D000, // Byzantine Musical Symbols
|
||||
0x1D100, // Musical Symbols
|
||||
0x1D200, // unassigned
|
||||
0x1D300, // Tai Xuan Jing Symbols
|
||||
0x1D360, // unassigned
|
||||
0x1D400, // Mathematical Alphanumeric Symbols
|
||||
0x1D800, // unassigned
|
||||
0x20000, // CJK Unified Ideographs Extension B
|
||||
0x2A6E0, // unassigned
|
||||
0x2F800, // CJK Compatibility Ideographs Supplement
|
||||
0x2FA20, // unassigned
|
||||
0xE0000, // Tags
|
||||
0xE0080, // unassigned
|
||||
0xE0100, // Variation Selectors Supplement
|
||||
0xE01F0, // unassigned
|
||||
0xF0000, // Supplementary Private Use Area-A
|
||||
0x100000, // Supplementary Private Use Area-B
|
||||
|
||||
#endregion
|
||||
};
|
||||
|
||||
// Lookup table read by GetUnicodeBlock: the binary search over _unicodeBlockStarts yields an
// index, and the entry stored here at that same index is returned to the caller.
// A null entry is a slot without a UnicodeBlock value (the visible part of the starts table
// labels the corresponding ranges "unassigned").
// The order of the entries is significant and must stay aligned with _unicodeBlockStarts.
private static readonly UnicodeBlock?[] _unicodeBlocks = new UnicodeBlock?[]
{
    UnicodeBlock.BasicLatin, UnicodeBlock.Latin1Supplement, UnicodeBlock.LatinExtendedA, UnicodeBlock.LatinExtendedB,
    UnicodeBlock.IpaExtensions, UnicodeBlock.SpacingModifierLetters, UnicodeBlock.CombiningDiacriticalMarks,
    UnicodeBlock.Greek, UnicodeBlock.Cyrillic, UnicodeBlock.CyrillicSupplementary, UnicodeBlock.Armenian, UnicodeBlock.Hebrew, UnicodeBlock.Arabic,
    UnicodeBlock.Syriac, null, UnicodeBlock.Thaana, null,
    UnicodeBlock.Devanagari, UnicodeBlock.Bengali, UnicodeBlock.Gurmukhi, UnicodeBlock.Gujarati, UnicodeBlock.Oriya, UnicodeBlock.Tamil, UnicodeBlock.Telugu, UnicodeBlock.Kannada, UnicodeBlock.Malayalam, UnicodeBlock.Sinhala,
    UnicodeBlock.Thai, UnicodeBlock.Lao, UnicodeBlock.Tibetan, UnicodeBlock.Myanmar, UnicodeBlock.Georgian, UnicodeBlock.HangulJamo,
    UnicodeBlock.Ethiopic, null, UnicodeBlock.Cherokee, UnicodeBlock.UnifiedCanadianAboriginalSyllabics,
    UnicodeBlock.Ogham, UnicodeBlock.Runic, UnicodeBlock.Tagalog, UnicodeBlock.Hanunoo, UnicodeBlock.Buhid, UnicodeBlock.Tagbanwa,
    UnicodeBlock.Khmer, UnicodeBlock.Mongolian, null, UnicodeBlock.Limbu, UnicodeBlock.TaiLe, null, UnicodeBlock.KhmerSymbols, null,
    UnicodeBlock.PhoneticExtensions, null, UnicodeBlock.LatinExtendedAdditional, UnicodeBlock.GreekExtended,
    UnicodeBlock.GeneralPunctuation, UnicodeBlock.SuperscriptsAndSubscripts, UnicodeBlock.CurrencySymbols, UnicodeBlock.CombiningMarksForSymbols,
    UnicodeBlock.LetterlikeSymbols, UnicodeBlock.NumberForms, UnicodeBlock.Arrows, UnicodeBlock.MathematicalOperators,
    UnicodeBlock.MiscellaneousTechnical, UnicodeBlock.ControlPictures, UnicodeBlock.OpticalCharacterRecognition, UnicodeBlock.EnclosedAlphanumerics,
    UnicodeBlock.BoxDrawing, UnicodeBlock.BlockElements, UnicodeBlock.GeometricShapes, UnicodeBlock.MiscellaneousSymbols, UnicodeBlock.Dingbats,
    UnicodeBlock.MiscellaneousMathematicalSymbolsA, UnicodeBlock.SupplementalArrowsA, UnicodeBlock.BraillePatterns, UnicodeBlock.SupplementalArrowsB,
    UnicodeBlock.MiscellaneousMathematicalSymbolsB, UnicodeBlock.SupplementalMathematicalOperators, UnicodeBlock.MiscellaneousSymbolsAndArrows, null,
    UnicodeBlock.CjkRadicalsSupplement, UnicodeBlock.KangxiRadicals, null, UnicodeBlock.IdeographicDescriptionCharacters,
    UnicodeBlock.CjkSymbolsAndPunctuation, UnicodeBlock.Hiragana, UnicodeBlock.Katakana, UnicodeBlock.Bopomofo,
    UnicodeBlock.HangulCompatibilityJamo, UnicodeBlock.Kanbun, UnicodeBlock.BopomofoExtended, null,
    UnicodeBlock.KatakanaPhoneticExtensions, UnicodeBlock.EnclosedCjkLettersAndMonths, UnicodeBlock.CjkCompatibility,
    UnicodeBlock.CjkUnifiedIdeographsExtensionA, UnicodeBlock.YijingHexagramSymbols, UnicodeBlock.CjkUnifiedIdeographs,
    UnicodeBlock.YiSyllables, UnicodeBlock.YiRadicals, null, UnicodeBlock.HangulSyllables, null,
    UnicodeBlock.HighSurrogates, UnicodeBlock.HighPrivateUseSurrogates, UnicodeBlock.LowSurrogates, UnicodeBlock.PrivateUseArea,
    UnicodeBlock.CjkCompatibilityIdeographs, UnicodeBlock.AlphabeticPresentationForms, UnicodeBlock.ArabicPresentationFormsA,
    UnicodeBlock.VariationSelectors, null, UnicodeBlock.CombiningHalfMarks, UnicodeBlock.CjkCompatibilityForms,
    UnicodeBlock.SmallFormVariants, UnicodeBlock.ArabicPresentationFormsB, UnicodeBlock.HalfwidthAndFullwidthForms, UnicodeBlock.Specials,
    UnicodeBlock.LinearBSyllabary, UnicodeBlock.LinearBIdeograms, UnicodeBlock.AegeanNumbers, null,
    UnicodeBlock.OldItalic, UnicodeBlock.Gothic, null, UnicodeBlock.Ugaritic, null,
    UnicodeBlock.Deseret, UnicodeBlock.Shavian, UnicodeBlock.Osmanya, null, UnicodeBlock.CypriotSyllabary, null,
    UnicodeBlock.ByzantineMusicalSymbols, UnicodeBlock.MusicalSymbols, null, UnicodeBlock.TaiXuanJingSymbols, null,
    UnicodeBlock.MathematicalAlphanumericSymbols, null,
    UnicodeBlock.CjkUnifiedIdeographsExtensionB, null, UnicodeBlock.CjkCompatibilityIdeographsSupplement, null,
    UnicodeBlock.Tags, null, UnicodeBlock.VariationSelectorsSupplement, null,
    UnicodeBlock.SupplementaryPrivateUseAreaA, UnicodeBlock.SupplementaryPrivateUseAreaB,
};
|
||||
|
||||
#region Public methods
|
||||
|
||||
/// <summary>
/// Looks up the <see cref="UnicodeBlock"/> table entry for the given character.
/// </summary>
/// <param name="ch">The character (UTF-16 code unit) to classify.</param>
/// <returns>
/// The entry of the block table selected by a binary search over the block start values;
/// <c>null</c> when that table entry is empty.
/// </returns>
/// <exception cref="ArgumentException">The character's numeric value fails the code point range check.</exception>
/// <remarks>
/// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
/// </remarks>
public static UnicodeBlock? GetUnicodeBlock(this char ch)
{
    int codePoint = ch;

    if (!IsValidCodePoint(codePoint))
    {
        throw new ArgumentException("Argument is not a valid code point.", nameof(ch));
    }

    // Binary search: narrow [low, high) until a single index remains. Assuming the start
    // table is sorted ascending, 'low' ends on the last start value that is <= codePoint.
    int low = 0;
    int high = _unicodeBlockStarts.Length;

    while (high - low > 1)
    {
        int middle = (low + high) / 2;

        if (codePoint < _unicodeBlockStarts[middle])
        {
            high = middle;
        }
        else
        {
            low = middle;
        }
    }

    return _unicodeBlocks[low];
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Private helper methods
|
||||
|
||||
/// <summary>
/// Tells whether <paramref name="codePoint"/> lies inside the inclusive range
/// [MIN_CODE_POINT, MAX_CODE_POINT].
/// </summary>
/// <param name="codePoint">The numeric value to check.</param>
/// <returns><c>true</c> when the value is within the range; otherwise <c>false</c>.</returns>
private static bool IsValidCodePoint(int codePoint)
{
    if (codePoint < MIN_CODE_POINT)
    {
        return false;
    }

    return codePoint <= MAX_CODE_POINT;
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
}
|
|
@ -1,51 +0,0 @@
|
|||
using System;
|
||||
|
||||
namespace NLangDetect.Core.Extensions
|
||||
{
|
||||
public static class RandomExtensions
{
    private const double _Epsilon = 2.22044604925031E-15;

    /// <summary>
    /// Holds the spare Gaussian value produced by the polar method for one <see cref="Random"/> instance.
    /// </summary>
    private sealed class GaussianState
    {
        public double NextNextGaussian;
        public bool HasNextNextGaussian;
    }

    // The polar method yields two values per round; the second one is cached for the next call.
    // The cache is kept per Random instance (as in java.util.Random) so that a value derived from
    // one generator is never handed out for another, which keeps seeded generators reproducible.
    // The weak table does not keep the Random instances alive.
    private static readonly System.Runtime.CompilerServices.ConditionalWeakTable<Random, GaussianState> _states =
        new System.Runtime.CompilerServices.ConditionalWeakTable<Random, GaussianState>();

    /// <summary>
    /// Returns the next pseudorandom, Gaussian ("normally") distributed double value with mean 0.0 and standard deviation 1.0 from this random number generator's sequence.
    /// The general contract of nextGaussian is that one double value, chosen from (approximately) the usual normal distribution with mean 0.0 and standard deviation 1.0, is pseudorandomly generated and returned.
    /// </summary>
    /// <param name="random">The generator that supplies the underlying uniform values.</param>
    /// <returns>A normally distributed value derived from <paramref name="random"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="random"/> is <c>null</c>.</exception>
    /// <remarks>
    /// Taken from: http://download.oracle.com/javase/6/docs/api/java/util/Random.html (nextGaussian())
    /// </remarks>
    public static double NextGaussian(this Random random)
    {
        if (random == null)
        {
            throw new ArgumentNullException(nameof(random));
        }

        GaussianState state = _states.GetValue(random, _ => new GaussianState());

        lock (state)
        {
            if (state.HasNextNextGaussian)
            {
                state.HasNextNextGaussian = false;

                return state.NextNextGaussian;
            }

            double v1, v2, s;

            // Rejection step of the polar method: draw points in the square until one falls
            // strictly inside the unit circle and is not (numerically) the origin.
            do
            {
                v1 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
                v2 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
                s = v1 * v1 + v2 * v2;
            }
            while (s >= 1.0 || Math.Abs(s - 0.0) < _Epsilon);

            double multiplier = Math.Sqrt(-2.0 * Math.Log(s) / s);

            state.NextNextGaussian = v2 * multiplier;
            state.HasNextNextGaussian = true;

            return v1 * multiplier;
        }
    }
}
|
||||
}
|
|
@ -1,25 +0,0 @@
|
|||
using System;
|
||||
|
||||
namespace NLangDetect.Core.Extensions
|
||||
{
|
||||
public static class StringExtensions
{
    /// <summary>
    /// Returns a new character sequence that is a subsequence of this sequence. The subsequence starts with the character at the specified index and ends with the character at index end - 1. The length of the returned sequence is end - start, so if start == end then an empty sequence is returned.
    /// </summary>
    /// <param name="s">the string to take the subsequence from</param>
    /// <param name="start">the start index, inclusive</param>
    /// <param name="end">the end index, exclusive</param>
    /// <returns>the specified subsequence</returns>
    /// <exception cref="ArgumentNullException">if <paramref name="s"/> is null</exception>
    /// <exception cref="ArgumentOutOfRangeException">if start or end are negative, if end is greater than the string's length, or if start is greater than end</exception>
    public static string SubSequence(this string s, int start, int end)
    {
        if (start < 0) throw new ArgumentOutOfRangeException(nameof(start), "Argument must not be negative.");
        if (end < 0) throw new ArgumentOutOfRangeException(nameof(end), "Argument must not be negative.");

        // Checked after the sign checks so that inputs which already produced an
        // ArgumentOutOfRangeException keep doing so; only the former NullReferenceException changes.
        if (s == null) throw new ArgumentNullException(nameof(s));

        if (end > s.Length) throw new ArgumentOutOfRangeException(nameof(end), "Argument must not be greater than the input string's length.");
        if (start > end) throw new ArgumentOutOfRangeException(nameof(start), "Argument must not be greater than the 'end' argument.");

        return s.Substring(start, end - start);
    }
}
|
||||
}
|
|
@ -1,131 +0,0 @@
|
|||
namespace NLangDetect.Core.Extensions
|
||||
{
|
||||
/// <summary>
/// Names of the Unicode blocks known to the block lookup. Members carry implicit
/// consecutive values starting at 0, so their declaration order must not change.
/// </summary>
public enum UnicodeBlock
{
    // Latin, IPA and modifier blocks
    BasicLatin, Latin1Supplement, LatinExtendedA, LatinExtendedB,
    IpaExtensions, SpacingModifierLetters, CombiningDiacriticalMarks,

    // European and Middle Eastern scripts
    Greek, Cyrillic, CyrillicSupplementary, Armenian, Hebrew, Arabic, Syriac, Thaana,

    // South and South-East Asian scripts
    Devanagari, Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam, Sinhala,
    Thai, Lao, Tibetan, Myanmar,

    // Further scripts
    Georgian, HangulJamo, Ethiopic, Cherokee, UnifiedCanadianAboriginalSyllabics,
    Ogham, Runic, Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian, Limbu, TaiLe, KhmerSymbols,
    PhoneticExtensions, LatinExtendedAdditional, GreekExtended,

    // Punctuation and symbols
    GeneralPunctuation, SuperscriptsAndSubscripts, CurrencySymbols, CombiningMarksForSymbols,
    LetterlikeSymbols, NumberForms, Arrows, MathematicalOperators, MiscellaneousTechnical,
    ControlPictures, OpticalCharacterRecognition, EnclosedAlphanumerics,
    BoxDrawing, BlockElements, GeometricShapes, MiscellaneousSymbols, Dingbats,
    MiscellaneousMathematicalSymbolsA, SupplementalArrowsA, BraillePatterns, SupplementalArrowsB,
    MiscellaneousMathematicalSymbolsB, SupplementalMathematicalOperators, MiscellaneousSymbolsAndArrows,

    // CJK, kana and related blocks
    CjkRadicalsSupplement, KangxiRadicals, IdeographicDescriptionCharacters, CjkSymbolsAndPunctuation,
    Hiragana, Katakana, Bopomofo, HangulCompatibilityJamo, Kanbun, BopomofoExtended,
    KatakanaPhoneticExtensions, EnclosedCjkLettersAndMonths, CjkCompatibility,
    CjkUnifiedIdeographsExtensionA, YijingHexagramSymbols, CjkUnifiedIdeographs,
    YiSyllables, YiRadicals, HangulSyllables,

    // Surrogates and private use
    HighSurrogates, HighPrivateUseSurrogates, LowSurrogates, PrivateUseArea,

    // Compatibility and presentation forms
    CjkCompatibilityIdeographs, AlphabeticPresentationForms, ArabicPresentationFormsA,
    VariationSelectors, CombiningHalfMarks, CjkCompatibilityForms, SmallFormVariants,
    ArabicPresentationFormsB, HalfwidthAndFullwidthForms, Specials,

    // Supplementary planes
    LinearBSyllabary, LinearBIdeograms, AegeanNumbers, OldItalic, Gothic, Ugaritic,
    Deseret, Shavian, Osmanya, CypriotSyllabary,
    ByzantineMusicalSymbols, MusicalSymbols, TaiXuanJingSymbols, MathematicalAlphanumericSymbols,
    CjkUnifiedIdeographsExtensionB, CjkCompatibilityIdeographsSupplement,
    Tags, VariationSelectorsSupplement, SupplementaryPrivateUseAreaA, SupplementaryPrivateUseAreaB,
}
|
||||
}
|
|
@ -1,67 +0,0 @@
|
|||
using System;
|
||||
using System.IO;
|
||||
using System.IO.Compression;
|
||||
using System.Xml;
|
||||
using NLangDetect.Core.Utils;
|
||||
|
||||
namespace NLangDetect.Core
|
||||
{
|
||||
// TODO IMM HI: xml reader not tested
|
||||
public static class GenProfile
{
    #region Public methods

    /// <summary>
    /// Builds a <see cref="LangProfile"/> for <paramref name="lang"/> by streaming an XML file
    /// through a <c>TagExtractor</c> created for the "abstract" tag.
    /// </summary>
    /// <param name="lang">Language name handed to the new profile.</param>
    /// <param name="file">Path of the XML file; a ".gz" extension (any casing) means the file is read through a GZip decompression stream.</param>
    /// <returns>The profile that was passed to the extractor on every end-element node.</returns>
    /// <remarks>Writes the language name and the extractor's <c>Count</c> to the console when done.</remarks>
    public static LangProfile load(string lang, string file)
    {
        var profile = new LangProfile(lang);
        var tagextractor = new TagExtractor("abstract", 100);

        // 'using' disposes the streams on every exit path; disposing the GZip wrapper also
        // disposes the file stream, and the second dispose of the file stream is harmless.
        using (Stream fileStream = File.OpenRead(file))
        using (Stream inputStream = IsGzipFile(file)
            ? new GZipStream(fileStream, CompressionMode.Decompress)
            : fileStream)
        using (var xmlReader = XmlReader.Create(inputStream))
        {
            while (xmlReader.Read())
            {
                switch (xmlReader.NodeType)
                {
                    case XmlNodeType.Element:
                        tagextractor.SetTag(xmlReader.Name);
                        break;

                    case XmlNodeType.Text:
                        tagextractor.Add(xmlReader.Value);
                        break;

                    case XmlNodeType.EndElement:
                        tagextractor.CloseTag(profile);
                        break;
                }
            }
        }

        Console.WriteLine(lang + ": " + tagextractor.Count);

        return profile;
    }

    #endregion

    // Ordinal, case-insensitive extension check; avoids the culture-dependent ToUpper() comparison.
    private static bool IsGzipFile(string file)
    {
        return string.Equals(Path.GetExtension(file), ".gz", StringComparison.OrdinalIgnoreCase);
    }
}
|
||||
}
|
|
@ -1,22 +0,0 @@
|
|||
using System;
|
||||
|
||||
namespace NLangDetect.Core
|
||||
{
|
||||
/// <summary>
/// Exception type used by the library for internal failures.
/// </summary>
[Serializable]
public class InternalException : Exception
{
    /// <summary>
    /// Creates the exception with a message and no inner exception.
    /// </summary>
    /// <param name="message">Text describing the failure.</param>
    public InternalException(string message)
        : base(message)
    {
    }

    /// <summary>
    /// Creates the exception with a message and the exception that caused it.
    /// </summary>
    /// <param name="message">Text describing the failure.</param>
    /// <param name="innerException">The underlying cause; may be <c>null</c>.</param>
    public InternalException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}
|
||||
}
|
|
@ -1,45 +0,0 @@
|
|||
using System.Globalization;
|
||||
|
||||
namespace NLangDetect.Core
|
||||
{
|
||||
// TODO IMM HI: name??
|
||||
/// <summary>
/// Pairs a language name with a probability value.
/// </summary>
public class Language
{
    /// <summary>
    /// Creates a pair from the given name and probability.
    /// </summary>
    /// <param name="name">Language name; may be <c>null</c>.</param>
    /// <param name="probability">Probability associated with the name.</param>
    public Language(string name, double probability)
    {
        Name = name;
        Probability = probability;
    }

    /// <summary>Language name.</summary>
    public string Name { get; set; }

    /// <summary>Probability associated with <see cref="Name"/>.</summary>
    public double Probability { get; set; }

    /// <summary>
    /// Formats the pair as "name:probability" with six decimal places in the invariant
    /// number format; an empty string when <see cref="Name"/> is <c>null</c>.
    /// </summary>
    public override string ToString()
    {
        if (Name != null)
        {
            string formattedProbability =
                Probability.ToString("0.000000", CultureInfo.InvariantCulture.NumberFormat);

            return Name + ":" + formattedProbability;
        }

        return "";
    }
}
|
||||
}
|
|
@ -1,37 +0,0 @@
|
|||
using System;
|
||||
using MediaBrowser.Model.Serialization;
|
||||
|
||||
namespace NLangDetect.Core
|
||||
{
|
||||
// TODO IMM HI: change to non-static class
|
||||
// TODO IMM HI: hide other, unnecessary classes via internal?
|
||||
/// <summary>
/// Static entry point that forwards to <c>DetectorFactory</c>.
/// </summary>
public static class LanguageDetector
{
    private const double _DefaultAlpha = 0.5;

    /// <summary>
    /// Loads the language profiles via <c>DetectorFactory.LoadProfiles</c>.
    /// </summary>
    /// <param name="json">Serializer passed on to the factory.</param>
    public static void Initialize(IJsonSerializer json)
    {
        DetectorFactory.LoadProfiles(json);
    }

    /// <summary>
    /// Calls <c>DetectorFactory.Clear</c>.
    /// </summary>
    public static void Release()
    {
        DetectorFactory.Clear();
    }

    /// <summary>
    /// Runs a detector created with the default alpha over <paramref name="plainText"/>.
    /// </summary>
    /// <param name="plainText">Text to analyse; must not be null or empty.</param>
    /// <returns>The result of the detector's <c>Detect</c> call.</returns>
    /// <exception cref="ArgumentException"><paramref name="plainText"/> is null or empty.</exception>
    public static string DetectLanguage(string plainText)
    {
        if (plainText == null || plainText.Length == 0)
        {
            throw new ArgumentException("Argument can't be null nor empty.", nameof(plainText));
        }

        var languageDetector = DetectorFactory.Create(_DefaultAlpha);
        languageDetector.Append(plainText);

        return languageDetector.Detect();
    }
}
|
||||
}
|
|
@ -1,23 +0,0 @@
|
|||
using System;
|
||||
|
||||
namespace NLangDetect.Core
|
||||
{
|
||||
/// <summary>
/// Exception that carries an <see cref="ErrorCode"/> alongside its message.
/// </summary>
public class NLangDetectException : Exception
{
    /// <summary>
    /// Creates the exception.
    /// </summary>
    /// <param name="message">Text describing the failure.</param>
    /// <param name="errorCode">Code stored in <see cref="ErrorCode"/>.</param>
    public NLangDetectException(string message, ErrorCode errorCode)
        : base(message)
    {
        this.ErrorCode = errorCode;
    }

    /// <summary>The code supplied when the exception was created.</summary>
    public ErrorCode ErrorCode { get; private set; }
}
|
||||
}
|
|
@ -1,33 +0,0 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace NLangDetect.Core
|
||||
{
|
||||
/// <summary>
/// Sparse vector of doubles indexed by int; keys that were never set read as 0.0.
/// </summary>
public class ProbVector
{
    private readonly Dictionary<int, double> _dict = new Dictionary<int, double>();

    /// <summary>
    /// Gets the value stored for <paramref name="key"/> (0.0 when absent), or stores a value.
    /// Assigning a value whose magnitude is below <see cref="double.Epsilon"/> removes the
    /// entry instead of storing it.
    /// </summary>
    /// <param name="key">Index of the component.</param>
    public double this[int key]
    {
        get
        {
            double stored;

            if (_dict.TryGetValue(key, out stored))
            {
                return stored;
            }

            return 0.0;
        }

        set
        {
            bool isZero = Math.Abs(value) < double.Epsilon;

            if (!isZero)
            {
                _dict[key] = value;
                return;
            }

            // Remove is a no-op for a missing key, so no ContainsKey probe is needed.
            _dict.Remove(key);
        }
    }
}
|
||||
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -1,118 +0,0 @@
|
|||
using System.Collections.Generic;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace NLangDetect.Core.Utils
|
||||
{
|
||||
/// <summary>
/// N-gram frequency profile of one language: per-gram counts in <see cref="freq"/> and
/// per-length totals in <see cref="n_words"/> (index = gram length - 1).
/// </summary>
public class LangProfile
{
    private const int MinimumFreq = 2;
    private const int LessFreqRatio = 100000;

    // Compiled once for the type instead of on every OmitLessFreq call.
    private static readonly Regex _singleRomanLetterRegex = new Regex("^[A-Za-z]$", RegexOptions.Compiled);
    private static readonly Regex _containsRomanLetterRegex = new Regex(".*[A-Za-z].*", RegexOptions.Compiled);

    /// <summary>Language name; while it is null, <see cref="Add"/> and <see cref="OmitLessFreq"/> do nothing.</summary>
    public string name { get; set; }

    /// <summary>Occurrence count per gram.</summary>
    public Dictionary<string, int> freq { get; set; }

    /// <summary>Total count of grams per gram length (index 0 = length 1).</summary>
    public int[] n_words { get; set; }

    #region Constructor(s)

    /// <summary>
    /// Creates an unnamed profile with empty counters.
    /// </summary>
    public LangProfile()
    {
        freq = new Dictionary<string, int>();
        n_words = new int[NGram.GramsCount];
    }

    /// <summary>
    /// Creates an empty profile for the given language name.
    /// </summary>
    /// <param name="name">Language name.</param>
    public LangProfile(string name)
        : this()
    {
        this.name = name;
    }

    #endregion

    #region Public methods

    /// <summary>
    /// Counts one occurrence of <paramref name="gram"/>. Ignored when the profile has no name,
    /// the gram is null, or its length is outside 1..<c>NGram.GramsCount</c>.
    /// </summary>
    /// <param name="gram">The n-gram to count.</param>
    public void Add(string gram)
    {
        if (name == null || gram == null) return; // Illegal

        int len = gram.Length;
        if (len < 1 || len > NGram.GramsCount) return; // Illegal

        n_words[len - 1]++;

        // A missing key leaves 'current' at 0, so one lookup covers both the insert and the increment.
        freq.TryGetValue(gram, out int current);
        freq[gram] = current + 1;
    }

    /// <summary>
    /// Removes grams whose count does not exceed a threshold (1/<c>LessFreqRatio</c> of the
    /// single-character total, but at least <c>MinimumFreq</c>). If the surviving single
    /// Roman letters then account for less than a third of the remaining single-character
    /// total, every gram that contains a Roman letter is removed as well.
    /// Does nothing when the profile has no name.
    /// </summary>
    public void OmitLessFreq()
    {
        if (name == null) return; // Illegal

        int threshold = n_words[0] / LessFreqRatio;
        if (threshold < MinimumFreq) threshold = MinimumFreq;

        int roman = 0;
        var keysToRemove = new List<string>();

        // Removal is deferred because the dictionary must not change while it is enumerated.
        foreach (KeyValuePair<string, int> entry in freq)
        {
            if (entry.Value <= threshold)
            {
                n_words[entry.Key.Length - 1] -= entry.Value;
                keysToRemove.Add(entry.Key);
            }
            else if (_singleRomanLetterRegex.IsMatch(entry.Key))
            {
                roman += entry.Value;
            }
        }

        foreach (string keyToRemove in keysToRemove)
        {
            freq.Remove(keyToRemove);
        }

        // roman check
        if (roman < n_words[0] / 3)
        {
            keysToRemove.Clear();

            foreach (KeyValuePair<string, int> entry in freq)
            {
                if (_containsRomanLetterRegex.IsMatch(entry.Key))
                {
                    n_words[entry.Key.Length - 1] -= entry.Value;
                    keysToRemove.Add(entry.Key);
                }
            }

            foreach (string keyToRemove in keysToRemove)
            {
                freq.Remove(keyToRemove);
            }
        }
    }

    #endregion
}
|
||||
}
|
|
@ -1,88 +0,0 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Globalization;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace NLangDetect.Core.Utils
|
||||
{
|
||||
/// <summary>
/// Key/value strings loaded once from the embedded resource whose name contains
/// "messages.properties".
/// </summary>
public static class Messages
{
    // Matches a \uXXXX escape; only hexadecimal digits are accepted so that int.Parse below
    // cannot fail on letters such as 'g'..'z' (the previous pattern allowed them).
    private static readonly Regex _unicodeEscapeRegex =
        new Regex(@"\\u(?<Value>[0-9a-fA-F]{4})", RegexOptions.Compiled);

    private static readonly Dictionary<string, string> _messages;

    static Messages()
    {
        _messages = LoadMessages();
    }

    /// <summary>
    /// Returns the message stored for <paramref name="key"/>, or the key wrapped in
    /// exclamation marks ("!key!") when no such message exists.
    /// </summary>
    /// <param name="key">Message key.</param>
    public static string getString(string key)
    {
        return
            _messages.TryGetValue(key, out var value)
                ? value
                : string.Format("!{0}!", key);
    }

    /// <summary>
    /// Reads the embedded properties resource line by line into a dictionary. Empty lines are
    /// skipped; every other line must consist of exactly one key and one value separated by '='.
    /// </summary>
    /// <exception cref="InternalException">The resource is missing or a line is malformed.</exception>
    private static Dictionary<string, string> LoadMessages()
    {
        var assembly = typeof(Messages).Assembly;

        var manifestName = assembly.GetManifestResourceNames().FirstOrDefault(i => i.IndexOf("messages.properties", StringComparison.Ordinal) != -1);

        // Without this guard a null name reaches GetManifestResourceStream, which throws
        // ArgumentNullException before the intended InternalException can be raised.
        if (manifestName == null)
        {
            throw new InternalException("Couldn't find an embedded resource whose name contains 'messages.properties'.");
        }

        var messagesStream = assembly.GetManifestResourceStream(manifestName);

        if (messagesStream == null)
        {
            throw new InternalException(string.Format("Couldn't get embedded resource named '{0}'.", manifestName));
        }

        using (messagesStream)
        using (var sr = new StreamReader(messagesStream))
        {
            var messages = new Dictionary<string, string>();

            string line;

            while ((line = sr.ReadLine()) != null)
            {
                if (line.Length == 0)
                {
                    continue;
                }

                string[] keyValue = line.Split('=');

                if (keyValue.Length != 2)
                {
                    throw new InternalException(string.Format("Invalid format of the 'Messages.properties' resource. Offending line: '{0}'.", line.Trim()));
                }

                string key = keyValue[0];
                string value = UnescapeUnicodeString(keyValue[1]);

                messages.Add(key, value);
            }

            return messages;
        }
    }

    /// <summary>
    /// Replaces every \uXXXX escape (four hexadecimal digits) in <paramref name="s"/> with the
    /// character it denotes; returns null for null input.
    /// </summary>
    /// <remarks>
    /// Taken from: http://stackoverflow.com/questions/1615559/converting-unicode-strings-to-escaped-ascii-string/1615860#1615860
    /// </remarks>
    private static string UnescapeUnicodeString(string s)
    {
        if (s == null)
        {
            return null;
        }

        return
            _unicodeEscapeRegex.Replace(
                s,
                match => ((char)int.Parse(match.Groups["Value"].Value, NumberStyles.HexNumber, CultureInfo.InvariantCulture)).ToString());
    }
}
|
||||
}
|
|
@ -1,330 +0,0 @@
|
|||
// TODO IMM HI: check which classes can be made internal?
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Text;
|
||||
using NLangDetect.Core.Extensions;
|
||||
|
||||
namespace NLangDetect.Core.Utils
|
||||
{
|
||||
public class NGram
|
||||
{
|
||||
public const int GramsCount = 3;
|
||||
|
||||
private static readonly string Latin1Excluded = Messages.getString("NGram.LATIN1_EXCLUDE");
|
||||
|
||||
// One string per CJK class, fetched from Messages in the listed key order. The static
// constructor maps every character of a string to that string's first character.
private static readonly string[] CjkClass = System.Array.ConvertAll(
    new[]
    {
        "NGram.KANJI_1_0", "NGram.KANJI_1_2", "NGram.KANJI_1_4", "NGram.KANJI_1_8", "NGram.KANJI_1_11",
        "NGram.KANJI_1_12", "NGram.KANJI_1_13", "NGram.KANJI_1_14", "NGram.KANJI_1_16", "NGram.KANJI_1_18",
        "NGram.KANJI_1_22", "NGram.KANJI_1_27", "NGram.KANJI_1_29", "NGram.KANJI_1_31", "NGram.KANJI_1_35",

        "NGram.KANJI_2_0", "NGram.KANJI_2_1", "NGram.KANJI_2_4", "NGram.KANJI_2_9", "NGram.KANJI_2_10",
        "NGram.KANJI_2_11", "NGram.KANJI_2_12", "NGram.KANJI_2_13", "NGram.KANJI_2_15", "NGram.KANJI_2_16",
        "NGram.KANJI_2_18", "NGram.KANJI_2_21", "NGram.KANJI_2_22", "NGram.KANJI_2_23", "NGram.KANJI_2_28",
        "NGram.KANJI_2_29", "NGram.KANJI_2_30", "NGram.KANJI_2_31", "NGram.KANJI_2_32", "NGram.KANJI_2_35",
        "NGram.KANJI_2_36", "NGram.KANJI_2_37", "NGram.KANJI_2_38",

        "NGram.KANJI_3_1", "NGram.KANJI_3_2", "NGram.KANJI_3_3", "NGram.KANJI_3_4", "NGram.KANJI_3_5",
        "NGram.KANJI_3_8", "NGram.KANJI_3_9", "NGram.KANJI_3_11", "NGram.KANJI_3_12", "NGram.KANJI_3_13",
        "NGram.KANJI_3_15", "NGram.KANJI_3_16", "NGram.KANJI_3_18", "NGram.KANJI_3_19", "NGram.KANJI_3_22",
        "NGram.KANJI_3_23", "NGram.KANJI_3_27", "NGram.KANJI_3_29", "NGram.KANJI_3_30", "NGram.KANJI_3_31",
        "NGram.KANJI_3_32", "NGram.KANJI_3_35", "NGram.KANJI_3_36", "NGram.KANJI_3_37", "NGram.KANJI_3_38",

        "NGram.KANJI_4_0", "NGram.KANJI_4_9", "NGram.KANJI_4_10", "NGram.KANJI_4_16", "NGram.KANJI_4_17",
        "NGram.KANJI_4_18", "NGram.KANJI_4_22", "NGram.KANJI_4_24", "NGram.KANJI_4_28", "NGram.KANJI_4_34",
        "NGram.KANJI_4_39",

        "NGram.KANJI_5_10", "NGram.KANJI_5_11", "NGram.KANJI_5_12", "NGram.KANJI_5_13", "NGram.KANJI_5_14",
        "NGram.KANJI_5_18", "NGram.KANJI_5_26", "NGram.KANJI_5_29", "NGram.KANJI_5_34", "NGram.KANJI_5_39",

        "NGram.KANJI_6_0", "NGram.KANJI_6_3", "NGram.KANJI_6_9", "NGram.KANJI_6_10", "NGram.KANJI_6_11",
        "NGram.KANJI_6_12", "NGram.KANJI_6_16", "NGram.KANJI_6_18", "NGram.KANJI_6_20", "NGram.KANJI_6_21",
        "NGram.KANJI_6_22", "NGram.KANJI_6_23", "NGram.KANJI_6_25", "NGram.KANJI_6_28", "NGram.KANJI_6_29",
        "NGram.KANJI_6_30", "NGram.KANJI_6_32", "NGram.KANJI_6_34", "NGram.KANJI_6_35", "NGram.KANJI_6_37",
        "NGram.KANJI_6_39",

        "NGram.KANJI_7_0", "NGram.KANJI_7_3", "NGram.KANJI_7_6", "NGram.KANJI_7_7", "NGram.KANJI_7_9",
        "NGram.KANJI_7_11", "NGram.KANJI_7_12", "NGram.KANJI_7_13", "NGram.KANJI_7_16", "NGram.KANJI_7_18",
        "NGram.KANJI_7_19", "NGram.KANJI_7_20", "NGram.KANJI_7_21", "NGram.KANJI_7_23", "NGram.KANJI_7_25",
        "NGram.KANJI_7_28", "NGram.KANJI_7_29", "NGram.KANJI_7_32", "NGram.KANJI_7_33", "NGram.KANJI_7_35",
        "NGram.KANJI_7_37",
    },
    Messages.getString);
|
||||
|
||||
// Lookup from a CJK character to the representative character of its class;
// filled once by the static constructor from the strings in CjkClass.
private static readonly Dictionary<char, char> _cjkMap;
|
||||
|
||||
// Window of the most recently added (normalized) characters; starts as a single space.
private StringBuilder _grams;
|
||||
// Set when two consecutive upper-case characters were added; while set, Get returns null.
private bool _capitalword;
|
||||
|
||||
#region Constructor(s)
|
||||
|
||||
/// <summary>
/// Builds the CJK normalization table: every character of each string in
/// <c>CjkClass</c> is mapped to the first character of that string.
/// </summary>
static NGram()
{
    _cjkMap = new Dictionary<char, char>();

    foreach (string cjkList in CjkClass)
    {
        // An empty (or missing) class has no representative; indexing [0] would throw.
        if (string.IsNullOrEmpty(cjkList))
        {
            continue;
        }

        char representative = cjkList[0];

        foreach (char member in cjkList)
        {
            // Indexer assignment instead of Add: a character that occurs more than once
            // must not throw, because an exception raised here surfaces as a
            // TypeInitializationException and leaves the whole type unusable.
            _cjkMap[member] = representative;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Creates an empty n-gram window: the buffer holds a single space and the
/// capital-word flag is cleared.
/// </summary>
public NGram()
{
    _capitalword = false;
    _grams = new StringBuilder(" ");
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Public methods
|
||||
|
||||
/// <summary>
/// Collapses a character to the representative used when building n-grams.
/// </summary>
/// <param name="ch">The character to normalize.</param>
/// <returns>
/// A space for characters that are dropped, a fixed representative for scripts
/// that are collapsed, or <paramref name="ch"/> itself when no rule applies
/// (including when its Unicode block cannot be determined).
/// </returns>
public static char Normalize(char ch)
{
    UnicodeBlock? unicodeBlock = ch.GetUnicodeBlock();

    if (!unicodeBlock.HasValue)
    {
        return ch;
    }

    switch (unicodeBlock.Value)
    {
        case UnicodeBlock.BasicLatin:
        {
            // Only A-Z and a-z are kept; every other Basic Latin character becomes a space.
            bool isAsciiLetter = (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
            return isAsciiLetter ? ch : ' ';
        }

        case UnicodeBlock.Latin1Supplement:
            return Latin1Excluded.IndexOf(ch) >= 0 ? ' ' : ch;

        case UnicodeBlock.GeneralPunctuation:
            return ' ';

        case UnicodeBlock.Arabic:
            // U+06CC (Farsi yeh) is folded into U+064A (Arabic yeh).
            return ch == '\u06cc' ? '\u064a' : ch;

        case UnicodeBlock.LatinExtendedAdditional:
            // Everything from U+1EA0 upwards is collapsed onto U+1EC3.
            return ch >= '\u1ea0' ? '\u1ec3' : ch;

        case UnicodeBlock.Hiragana:
            return '\u3042';

        case UnicodeBlock.Katakana:
            return '\u30a2';

        case UnicodeBlock.Bopomofo:
        case UnicodeBlock.BopomofoExtended:
            return '\u3105';

        case UnicodeBlock.CjkUnifiedIdeographs:
        {
            // Single lookup instead of ContainsKey followed by the indexer.
            char representative;
            return _cjkMap.TryGetValue(ch, out representative) ? representative : ch;
        }

        case UnicodeBlock.HangulSyllables:
            return '\uac00';

        default:
            return ch;
    }
}
|
||||
|
||||
/// <summary>
/// Normalizes <paramref name="ch"/> and appends it to the n-gram window.
/// </summary>
/// <param name="ch">The raw character to add.</param>
public void AddChar(char ch)
{
    char normalized = Normalize(ch);
    char previous = _grams[_grams.Length - 1];

    if (previous == ' ')
    {
        // The window ended on a space: start over with a fresh one.
        _grams = new StringBuilder(" ");
        _capitalword = false;

        if (normalized == ' ')
        {
            // Consecutive spaces are not recorded.
            return;
        }
    }
    else if (_grams.Length >= GramsCount)
    {
        // Window is full: drop the oldest character before appending.
        _grams.Remove(0, 1);
    }

    _grams.Append(normalized);

    if (!char.IsUpper(normalized))
    {
        _capitalword = false;
    }
    else if (char.IsUpper(previous))
    {
        // Two upper-case characters in a row mark a capitalized word.
        _capitalword = true;
    }
}
|
||||
|
||||
/// <summary>
/// Returns the n-gram made of the last <paramref name="n"/> characters in the window.
/// </summary>
/// <param name="n">Requested n-gram size; only 1 to 3 is accepted.</param>
/// <returns>
/// The n-gram, or <c>null</c> when the capital-word flag is set, <paramref name="n"/>
/// is out of range, fewer than <paramref name="n"/> characters are buffered, or the
/// requested unigram is a space.
/// </returns>
public string Get(int n)
{
    int len = _grams.Length;

    if (_capitalword || n < 1 || n > 3 || len < n)
    {
        return null;
    }

    if (n != 1)
    {
        // TODO IMM HI: is ToString() here effective?
        // NOTE(review): SubSequence presumably takes (start, end) indices - confirm against its definition.
        return _grams.ToString().SubSequence(len - n, len);
    }

    char last = _grams[len - 1];

    return last == ' ' ? null : last.ToString();
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
}
|
|
@ -1,76 +0,0 @@
|
|||
using System.Text;
|
||||
|
||||
namespace NLangDetect.Core.Utils
|
||||
{
|
||||
/// <summary>
/// Accumulates the text seen while the current tag equals a target tag and, when
/// the tag is closed, feeds the n-grams of that text into a language profile.
/// </summary>
public class TagExtractor
{
    // TODO IMM HI: do these really need to be internal?

    // Tag name whose text is collected.
    internal string Target;

    // Collected text must be longer than this for CloseTag to add it to a profile.
    internal int Threshold;

    // Text collected for the current tag.
    internal StringBuilder StringBuilder;

    // Tag currently being processed; null after Clear.
    internal string Tag;

    #region Constructor(s)

    /// <summary>
    /// Creates an extractor for the given target tag and length threshold.
    /// </summary>
    /// <param name="tag">Tag name to collect text for.</param>
    /// <param name="threshold">Length the collected text must exceed to be counted.</param>
    public TagExtractor(string tag, int threshold)
    {
        Target = tag;
        Threshold = threshold;
        Count = 0;
        Clear();
    }

    #endregion

    #region Public methods

    /// <summary>
    /// Discards the collected text and forgets the current tag.
    /// </summary>
    public void Clear()
    {
        Tag = null;
        StringBuilder = new StringBuilder();
    }

    /// <summary>
    /// Records which tag is currently being processed.
    /// </summary>
    /// <param name="tag">Name of the current tag.</param>
    public void SetTag(string tag)
    {
        Tag = tag;
    }

    /// <summary>
    /// Appends <paramref name="line"/> to the collected text when the current tag
    /// equals the target tag; null lines are ignored.
    /// </summary>
    /// <param name="line">Text to append.</param>
    public void Add(string line)
    {
        if (line == null || Tag != Target)
        {
            return;
        }

        StringBuilder.Append(line);
    }

    /// <summary>
    /// Ends the current tag. When the tag is the target, a profile is supplied and
    /// the collected text is longer than the threshold, every n-gram (sizes 1 to
    /// <c>NGram.GramsCount</c>) is added to the profile and <see cref="Count"/> is
    /// incremented. The extractor is always cleared afterwards.
    /// </summary>
    /// <param name="profile">Profile receiving the n-grams; may be null.</param>
    public void CloseTag(LangProfile profile)
    {
        bool shouldProfile = profile != null
            && Tag == Target
            && StringBuilder.Length > Threshold;

        if (shouldProfile)
        {
            var gram = new NGram();
            int position = 0;

            while (position < StringBuilder.Length)
            {
                gram.AddChar(StringBuilder[position]);

                for (int size = 1; size <= NGram.GramsCount; ++size)
                {
                    profile.Add(gram.Get(size));
                }

                ++position;
            }

            Count++;
        }

        Clear();
    }

    #endregion

    #region Properties

    /// <summary>
    /// Number of times CloseTag added collected text to a profile.
    /// </summary>
    public int Count { get; private set; }

    #endregion
}
|
||||
}
|
|
@ -1,128 +0,0 @@
|
|||
NGram.CJK_KANJI_EXCLUDE=\u0020\uFF08\uFF09
|
||||
NGram.LATIN1_EXCLUDE=\u00A0\u00AB\u00B0\u00BB
|
||||
NGram.KANJI_1_0=\u4F7C\u6934
|
||||
NGram.KANJI_1_2=\u88CF\u95B2
|
||||
NGram.KANJI_1_4=\u7027\u7DCB
|
||||
NGram.KANJI_1_8=\u4E80\u4E9C\u4EEE\u5263\u5264\u5270\u52C5\u52E7\u52F2\u53B3\u5449\u58CA\u58CC\u5968\u59C9\u59EB\u5D8B\u5DE3\u5E30\u6075\u622F\u623B\u6255\u629C\u629E\u62DD\u62E1\u633F\u635C\u63FA\u6442\u6589\u658E\u6669\u66A6\u66FD\u6804\u685C\u6B69\u6B6F\u6BBB\u6C37\u6C5A\u6D44\u6E09\u6E0B\u6E13\u6EDD\u713C\u72A0\u731F\u7363\u7A32\u7A42\u7A93\u7ADC\u7C8B\u7C9B\u7DD1\u7E01\u7E04\u7E26\u7E4A\u7E4B\u7E70\u8074\u8107\u8133\u81D3\u820E\u8217\u8358\u83D3\u85AC\u8987\u899A\u8B21\u8B72\u8B83\u8CDB\u9045\u90F7\u91C8\u9271\u9283\u92AD\u9665\u967A\u96A0\u96A3\u96B7\u970A\u983C\u9854\u9855\u99C6\u9A12\u9ED9\u9F62
|
||||
NGram.KANJI_1_11=\u67D8\u831C
|
||||
NGram.KANJI_1_12=\u5742\u57FC\u5800
|
||||
NGram.KANJI_1_13=\u4E3C\u4E98\u4FE3\u4FF5\u5072\u51A8\u53A9\u5451\u546A\u5504\u5516\u55A9\u55B0\u5618\u5642\u565B\u567A\u56A2\u57F4\u5840\u5841\u58F1\u59F6\u5A2F\u5B22\u5B8D\u5DCC\u5EFB\u5F10\u60A9\u60E3\u61D0\u62F6\u63B4\u63BB\u63C3\u6681\u685F\u6955\u6962\u696F\u698A\u698E\u69FB\u6A2B\u6A7F\u6B53\u6BD8\u6D99\u6E07\u7460\u7473\u7560\u7573\u758E\u7690\u7815\u783A\u7962\u7A4F\u7A63\u7AEA\u7BED\u7CA7\u7D18\u7D3A\u7E4D\u8061\u8218\u8276\u82C5\u8597\u85AB\u86CD\u874B\u88FE\u8ACF\u8B90\u8D0B\u8FBF\u9013\u9061\u914E\u9154\u918D\u9190\u91A4\u91B8\u9262\u929A\u92ED\u92F3\u932C\u96EB\u96F0\u976D\u97EE\u981A\u99C4\u9A28\u9AC4\u9B8E\u9C10\u9D0E\u9D5C\u9D8F\u9E78\u9EB9\u9EBA\u9EBF
|
||||
NGram.KANJI_1_14=\u5F66\u7984\u7985
|
||||
NGram.KANJI_1_16=\u5861\u7B25\u844E\u9419\u9D07
|
||||
NGram.KANJI_1_18=\u5039\u514E\u51E7\u51EA\u5301\u5302\u5859\u58F7\u59AC\u5C2D\u5CA8\u5EFC\u6357\u64B9\u67CA\u6802\u6834\u68BC\u6900\u6919\u691B\u69D9\u6AE8\u6D9C\u6E8C\u6F09\u6F45\u701E\u7026\u7114\u72DB\u7577\u75E9\u783F\u7895\u7A50\u7AC3\u7B48\u7B86\u7BAA\u7C7E\u7C82\u7C8D\u7CCE\u7D2C\u7F6B\u7FEB\u8557\u85AE\u86CE\u877F\u8997\u8ACC\u8CB0\u8CCE\u8FE9\u9197\u920E\u9266\u927E\u92F2\u9306\u9453\u9784\u982C\u9834\u99C8\u9BF5\u9C2F\u9D2C
|
||||
NGram.KANJI_1_22=\u6762\u6A17\u887F
|
||||
NGram.KANJI_1_27=\u4E21\u4E57\u4ECF\u4F1D\u4FA1\u4FF3\u5024\u50CD\u5150\u5186\u51E6\u52B4\u52B9\u5358\u53CE\u55B6\u56E3\u56F2\u56F3\u570F\u5727\u5869\u5897\u58F2\u5909\u5B9F\u5BDB\u5BFE\u5C02\u5DFB\u5E2F\u5E81\u5E83\u5EC3\u5F3E\u5F93\u5FB3\u5FB4\u5FDC\u60AA\u6226\u6238\u6271\u62E0\u6319\u63B2\u6483\u64AE\u67A0\u67FB\u691C\u697D\u69D8\u6A29\u6B73\u6B74\u6BCE\u6C17\u6CA2\u6D5C\u6E08\u6E80\u702C\u7523\u767A\u770C\u7D4C\u7D75\u7D76\u7D99\u7D9A\u7DCF\u8535\u8846\u89A7\u89B3\u8A33\u8AAC\u8AAD\u8C4A\u8EE2\u8EFD\u8FBA\u8FBC\u9244\u9332\u95A2\u95D8\u96D1\u99C5\u9A13\u9ED2
|
||||
NGram.KANJI_1_29=\u4F0E\u4FFA\u5036\u53E1\u54B2\u5506\u583A\u5C3B\u5CAC\u5CE0\u5CEF\u6803\u68B6\u6A0B\u6A8E\u73C2\u7551\u7826\u7881\u79B0\u7B39\u8429\u8599\u8FBB\u9162\u95C7\u9688\u96BC\u9AEA\u9DF2
|
||||
NGram.KANJI_1_31=\u5553\u938C
|
||||
NGram.KANJI_1_35=\u51B4\u564C\u57DC\u5B2C\u6822\u685D\u690B\u6973\u6C93\u7511\u7887\u7A17\u83D6\u847A\u8494\u8526\u854E\u85C1\u86F8\u88B4\u93A7\u9B92\u9C39\u9C48\u9C52
|
||||
NGram.KANJI_2_0=\u4E2B\u4EC3\u4F09\u4F57\u4F6F\u4F70\u4FD1\u4FDA\u500C\u5043\u516E\u5189\u5241\u530D\u5310\u5412\u54AB\u54AF\u5514\u5556\u55B1\u561F\u573B\u586D\u587D\u58C5\u58D1\u5914\u5A62\u5A6A\u5AE6\u5B40\u5B5B\u5B70\u5BB8\u5CD2\u5D01\u5D34\u5E11\u5EA0\u5F0B\u5F2D\u5F87\u607F\u621B\u6221\u6289\u63A3\u6452\u646D\u64D8\u652B\u6600\u6631\u6641\u66F7\u6773\u67B8\u67DD\u67DE\u6829\u68FB\u69AD\u6A47\u6C10\u6C68\u6C74\u6C85\u6CD3\u6D31\u6D93\u6D94\u6DB8\u6DBF\u6DC5\u6E6E\u6EA7\u6EB4\u6EC2\u6F2A\u6F2F\u6FB9\u6FC2\u6FDB\u6FEE\u70AF\u70FD\u7166\u726F\u729B\u739F\u73DE\u740A\u746D\u749C\u749F\u74E0\u759D\u75A3\u75CD\u75DE\u7600\u7620\u7688\u7738\u7762\u776B\u777D\u77E3\u781D\u7837\u78A3\u7946\u7B60\u7F44\u7F54\u7F5F\u7FAF\u8026\u807F\u80C4\u80DB\u80ED\u81E7\u824B\u82B7\u82E3\u8392\u846D\u84D3\u8548\u85B9\u86DE\u873F\u8753\u8782\u87AB\u87B3\u87D1\u87E0\u87FE\u8821\u88D8\u88E8\u8913\u891A\u892B\u8983\u8C3F\u8C49\u8C82\u8D6D\u8DE4\u8E1D\u8E1E\u8E7C\u8FE5\u8FE8\u9005\u9035\u9050\u9082\u9083\u9095\u90E2\u911E\u91AE\u91B4\u93D6\u9621\u968D\u96B9\u96D2\u9711\u9713\u973E\u9AB0\u9AB7\u9AE6\u9B03\u9B23\u9EDC\u9EEF
|
||||
NGram.KANJI_2_1=\u4E82\u4F48\u4F54\u50F9\u5167\u528D\u52DE\u532F\u537B\u53C3\u5433\u555F\u55AE\u56B4\u570D\u5716\u58D3\u58DE\u5920\u5967\u5A1B\u5BEB\u5BEC\u5C08\u5C0D\u5C46\u5C6C\u5CFD\u5E36\u5E6B\u5EC8\u5EF3\u5F48\u5F91\u5F9E\u5FB5\u6046\u60E1\u61F7\u6232\u6236\u64C7\u64CA\u64D4\u64DA\u64F4\u651D\u6578\u65B7\u6649\u6A13\u6A23\u6A6B\u6A94\u6AA2\u6B0A\u6B50\u6B61\u6B72\u6B77\u6B78\u6C92\u6EAB\u6EFF\u6FD5\u6FDF\u71DF\u722D\u72C0\u734E\u737B\u746A\u7522\u773E\u78BC\u7A69\u7C3D\u7CB5\u7D55\u7D72\u7DA0\u7DAB\u7DE3\u7E5E\u7E6A\u7E7C\u7E8C\u8072\u807D\u8085\u812B\u8166\u8173\u81D8\u8209\u820A\u8332\u838A\u840A\u85E5\u860B\u8655\u865B\u88DD\u89BA\u89BD\u89C0\u8AAA\u8B6F\u8B7D\u8B8A\u8B93\u8C50\u8CF4\u8E64\u8F15\u8F49\u8FA6\u8FAD\u9109\u9130\u91AB\u91CB\u92B7\u9304\u9322\u95CA\u96A8\u96AA\u96B1\u96B8\u96D6\u96D9\u96DC\u9748\u975C\u986F\u9918\u99DB\u9A57\u9B25\u9EA5\u9EC3\u9EDE\u9F52
|
||||
NGram.KANJI_2_4=\u514C\u51AA\u5614\u56AE\u56C2\u582F\u58FA\u5B0C\u5D11\u5DD2\u5DD6\u5E40\u5E5F\u5EEC\u6137\u6417\u6488\u64F2\u652A\u6582\u6689\u689F\u68D7\u69D3\u6A97\u6AB8\u6ABB\u6AC3\u6ADA\u6B7F\u6BB2\u6EA5\u6EC4\u6EF2\u7009\u701D\u7028\u703E\u7165\u71BE\u721B\u7463\u7464\u7469\u7515\u7526\u75FA\u7621\u779E\u79B1\u7A1F\u7AC4\u7AC7\u7B8F\u7BE9\u7D2E\u7D68\u7D8F\u7DB8\u7DBA\u7E46\u7E79\u7F4C\u7F88\u8070\u8073\u8076\u81BE\u82BB\u83A2\u858A\u8591\u861A\u8778\u87EC\u8805\u880D\u893B\u8A1B\u8A25\u8A36\u8A85\u8AA6\u8B17\u8B28\u8CB6\u8CE4\u8D16\u8D1B\u8ECB\u9112\u9214\u9249\u93AC\u9594\u9598\u95BB\u95D5\u965E\u96B4\u97DC\u9821\u9824\u9921\u9952\u9A55\u9A5B\u9B1A\u9C13\u9D09\u9DAF\u9E1A\u9E75\u9F67
|
||||
NGram.KANJI_2_9=\u4E9F\u4F6C\u4FDE\u4FFE\u5029\u5140\u51A2\u5345\u539D\u53FB\u54C7\u5599\u560E\u561B\u563B\u566C\u5676\u5729\u574D\u57E4\u595A\u598D\u5A1F\u5A25\u5A77\u5AB2\u5AD6\u5BF0\u5C2C\u5CEA\u5E37\u5F08\u6059\u606A\u6096\u609A\u62A8\u6555\u6556\u66E6\u675E\u68E3\u69BB\u6BCB\u6BD3\u6C1F\u6C26\u6C81\u6DC4\u6DDE\u6E32\u6E44\u6E4D\u6F33\u6F7C\u6FA7\u701A\u701B\u715C\u741B\u7428\u7480\u74A8\u7504\u752C\u768B\u76CE\u78CA\u78FA\u79BA\u7C27\u8046\u81FB\u8331\u8393\u83C1\u8403\u8438\u843C\u8446\u85B0\u87D2\u8862\u8DC6\u9074\u9131\u9672\u96EF\u9704\u9706\u977C\u9ABC\u9E92\u9ECF
|
||||
NGram.KANJI_2_10=\u51BD\u5704\u7350\u73A5
|
||||
NGram.KANJI_2_11=\u4E15\u4EA2\u4F5A\u50D6\u5349\u53DF\u5484\u5958\u5B34\u5B5A\u5C91\u5E1B\u5F77\u61CB\u61FF\u620C\u620D\u622E\u6248\u6538\u660A\u664F\u678B\u67E9\u69B7\u69C3\u6CB1\u6CD7\u6D5A\u6DAA\u6DC7\u7099\u71EE\u7325\u7425\u7455\u747E\u749E\u75B5\u7678\u7693\u76C2\u77B0\u77BF\u78CB\u7957\u795A\u797A\u7A79\u7B08\u7B75\u7BB4\u7F9A\u7FB2\u7FDF\u80E5\u81BA\u8340\u837C\u8398\u8559\u85A8\u86DF\u8734\u8882\u88F4\u8936\u900D\u907D\u9642\u96C9\u9AFB\u9E9D\u9EBE
|
||||
NGram.KANJI_2_12=\u5F57\u7940
|
||||
NGram.KANJI_2_13=\u5191\u7791\u792C\u7D46
|
||||
NGram.KANJI_2_15=\u5713\u58FD\u5D17\u5D19\u5DBC\u5F4C\u6191\u64A5\u687F\u69AE\u6AFB\u6EEC\u6F3F\u6FE4\u6FF1\u6FFE\u700B\u74CA\u76E1\u76E7\u7926\u792B\u79AE\u7AA9\u7C43\u7C4C\u7C64\u7DBD\u81A0\u856D\u8594\u8606\u8A62\u8AF7\u8CC8\u8CE3\u8D99\u8F1B\u8F3B\u9059\u9127\u9264\u947D\u95A9\u97CB\u980C\u9838\u9846\u99AE\u9A19\u9B06\u9B91\u9F4A\u9F4B
|
||||
NGram.KANJI_2_16=\u4E69\u4EC4\u4EDF\u4EF3\u4F0B\u4F5E\u5000\u5028\u50E5\u513B\u5157\u51DC\u52D7\u530F\u5379\u53F5\u5471\u5477\u5555\u555C\u557B\u5594\u55B2\u55C9\u560D\u5616\u562E\u5630\u5653\u5657\u566F\u56A8\u56B6\u5820\u5880\u58CE\u58D9\u5950\u5969\u596D\u599E\u59B3\u59CD\u59D2\u5A40\u5AA7\u5ABC\u5AD7\u5AD8\u5B0B\u5B24\u5B38\u5B53\u5C5C\u5D06\u5D47\u5D94\u5D9D\u5E57\u5EC4\u5F46\u5FAC\u60BD\u60D8\u6123\u615D\u615F\u6175\u618A\u61AB\u61E3\u623E\u6308\u636B\u645F\u6519\u6595\u6698\u66B8\u67D9\u6840\u695D\u696E\u6979\u69C1\u69E8\u6AEC\u6AFA\u6B5F\u6CAC\u6CE0\u6CEF\u6D0C\u6D36\u6DD2\u6DD9\u6DE6\u6DEC\u6E5F\u6FA0\u6FEC\u7156\u71C4\u71DC\u71EC\u71FC\u720D\u7230\u7292\u7296\u72A2\u72CE\u7357\u737A\u7380\u7386\u73A8\u73EE\u743F\u74A6\u74CF\u74D4\u74DA\u755A\u75A5\u75B3\u75C2\u75E0\u75F1\u75FF\u7601\u7609\u7646\u7658\u769A\u76B0\u774F\u775C\u778B\u77BD\u77C7\u7843\u787F\u78F4\u79C8\u7A88\u7A95\u7AFD\u7B1E\u7B67\u7B9D\u7BCC\u7C0D\u7C11\u7C37\u7C40\u7C6E\u7CB3\u7CBD\u7D09\u7D31\u7D40\u7D5B\u7D70\u7D91\u7D9E\u7DB0\u7DD9\u7DF9\u7E08\u7E11\u7E1D\u7E35\u7E52\u7FB6\u7FBF\u7FEE\u8012\u801C\u8028\u8052\u8123\u8188\u81C3\u81DA\u81FE\u8210\u82BE\u83A0\u83D4\u8407\u8435\u8477\u849E\u84C6\u84CA\u85F9\u867A\u86B5\u86B6\u86C4\u8706\u8707\u870A\u8768\u87BB\u8831\u8839\u8879\u8921\u8938\u8964\u89A6\u89AC\u8A10\u8A3E\u8AC2\u8ADB\u8AF3\u8B2B\u8B41\u8B4E\u8B5F\u8B6B\u8B92\u8C55\u8C62\u8C73\u8C8A\u8C8D\u8CB2\u8CB3\u8CD2\u8CE1\u8CFB\u8D0D\u8E34\u8E7A\u8E8A\u8ED4\u8EFE\u8F0A\u8F1C\u8F1E\u8F26\u8FAE\u9088\u90C3\u90FE\u9134\u9148\u91D9\u91E9\u9238\u9239\u923D\u924D\u925A\u9296\u92AC\u92BB\u9315\u9319\u931A\u9321\u9370\u9394\u93A2\u93D8\u93E4\u943A\u9477\u9582\u958E\u95A1\u95C8\u95CC\u95D4\u9658\u966C\u970F\u973D\u9744\u975B\u9766\u97A3\u97A6\u97C1\u97C6\u980A\u9837\u9853\u9870\u98AF\u98B3\u98BA\u98E9\u98ED\u9912\u991B\u991E\u993D\u993F\u99D1\u99DF\u9A01\u9A3E\u9A43\u9A4D\u9ACF\u9AE1\u9B22\u9B58\u9C25\u9C3E\u9C54\u9C56\u9D15\u9D23\u9D89\u9DC2\u9DD3\u9E82\u9E8B\u9EA9\u9EE0\u9EF7\u9F07\u9F2F\u9F34\u9
F3E\u9F5F\u9F6C
|
||||
NGram.KANJI_2_18=\u5155\u520E\u55DF\u56C0\u56C1\u5793\u5FD6\u5FF8\u6029\u60FA\u613E\u6147\u615A\u62C8\u6384\u6883\u6894\u68F9\u6AA3\u6AAE\u6AC2\u6E63\u7032\u70A4\u7146\u71FB\u7228\u72F7\u7370\u7441\u74BF\u75B8\u75E3\u7622\u76CD\u7768\u79E3\u7A60\u7B6E\u7BC1\u7C5F\u7D06\u7E2F\u7E39\u8146\u81CF\u8703\u8729\u8737\u87EF\u88D2\u8A22\u8AC4\u8AF6\u8E59\u8F33\u8F42\u9169\u91B1\u9278\u93C3\u93DD\u9460\u946A\u9785\u9AD1\u9B4D\u9B4E\u9C31\u9D12\u9ECC
|
||||
NGram.KANJI_2_21=\u502A\u544E\u59AE\u59EC\u5D1B\u66A8\u6BD7\u6C76\u6E1D\u70EF\u742A\u7459\u7FE1\u82EF\u8343\u85C9\u8A79\u90DD
|
||||
NGram.KANJI_2_22=\u4EDE\u4F7B\u504C\u50EE\u52E3\u52F0\u536E\u54A9\u54BB\u54BF\u54C2\u54E6\u550F\u556A\u55E8\u564E\u5664\u5671\u568F\u56DD\u572F\u57A0\u5809\u5924\u59A3\u59A4\u59E3\u5A13\u5A23\u5B51\u5B73\u5C50\u5C8C\u6035\u60C6\u6106\u6215\u62CE\u62FD\u64ED\u6549\u6554\u655D\u659B\u65CE\u65D6\u6615\u6624\u665E\u6677\u669D\u66E9\u6772\u677C\u696B\u6A84\u6AA0\u6BFD\u6C16\u6C86\u6C94\u6CD6\u6D2E\u6D39\u6F78\u6FB6\u705E\u70CA\u7168\u723B\u7256\u7284\u73B3\u740D\u742F\u7498\u74A9\u752D\u75F3\u7634\u768E\u76B4\u76E5\u77A0\u77DC\u781F\u782D\u7AA0\u7BFE\u7FF1\u80AB\u8174\u81EC\u8202\u8222\u8228\u82DC\u8306\u83FD\u8469\u84FF\u859C\u8617\u86B1\u8722\u8C89\u8D67\u8DCE\u8E49\u8E76\u8E87\u8FE2\u8FE4\u8FF8\u9016\u905B\u9174\u982B\u98E7\u9955\u9B32
|
||||
NGram.KANJI_2_23=\u4F8F\u5055\u524C\u548E\u5583\u594E\u5CB7\u5ED6\u5F5D\u6021\u66B9\u66F0\u6C55\u6C7E\u6C82\u6E2D\u6EC7\u6ED5\u70B3\u71B9\u72C4\u73C0\u7426\u745C\u748B\u7696\u777F\u79A7\u79B9\u7F8C\u8153\u8339\u8386\u8725\u90B5\u9102\u962E\u9716\u97F6
|
||||
NGram.KANJI_2_28=\u5733\u57D4\u838E\u8FEA
|
||||
NGram.KANJI_2_29=\u50ED\u5F29\u62EE\u6A9C\u7BC6\u80F1\u8129\u8171\u822B\u8AEB
|
||||
NGram.KANJI_2_30=\u4EB3\u4F15\u4FB7\u5006\u509A\u50A2\u5102\u5109\u5115\u5137\u5138\u513C\u524B\u524E\u5277\u528A\u52E6\u52FB\u5331\u5436\u5443\u54FD\u5538\u555E\u55C6\u55C7\u5679\u5690\u5695\u56C9\u56D1\u56EA\u588A\u58E2\u5AFB\u5B2A\u5B43\u5B7F\u5BE2\u5C37\u5D27\u5D84\u5D87\u5DD4\u5EC1\u5EDD\u5F12\u5FA0\u60F1\u616B\u61F5\u61F6\u61FE\u62DA\u6371\u6399\u63C0\u6451\u647B\u6493\u64BB\u64BF\u64C4\u64F1\u64F7\u650F\u652C\u665D\u6684\u6688\u66EC\u672E\u68E7\u69A6\u69ED\u69F3\u6A01\u6AAF\u6AE5\u6BA4\u6BAE\u6BAF\u6BC6\u6C08\u6C2C\u6C59\u6D87\u6EBC\u6ECC\u6EF7\u6F6F\u6F80\u6F86\u6FD8\u6FF0\u6FFA\u7006\u7018\u7030\u7051\u7192\u71C9\u71D9\u71F4\u71FE\u7274\u7377\u74A3\u750C\u7613\u7627\u7661\u7662\u7665\u766E\u7671\u7672\u76BA\u775E\u776A\u778C\u78E7\u7955\u7A08\u7AC5\u7B4D\u7C2B\u7C6C\u7CF0\u7D02\u7D1C\u7D73\u7DA2\u7DB5\u7DDE\u7E09\u7E0A\u7E37\u7E43\u7E61\u7E7D\u7E93\u7F3D\u7FF9\u81A9\u8271\u83F8\u84C0\u8514\u85BA\u86A9\u86FB\u879E\u8814\u8836\u889E\u8932\u896A\u896F\u8993\u89B2\u8A15\u8A16\u8A1D\u8A5B\u8A6C\u8A6D\u8A7C\u8AA1\u8AA3\u8AA5\u8B0A\u8B4F\u8B59\u8B96\u8C48\u8C54\u8CBD\u8CFA\u8D13\u8E89\u8E8B\u8EAA\u8EC0\u8EDB\u8EFC\u8F12\u8F1F\u8F3E\u8F45\u8FFA\u9015\u9183\u919E\u91A3\u91D7\u91F5\u9209\u9215\u923E\u9240\u9251\u9257\u927B\u9293\u92A8\u92C5\u92C7\u92F0\u9333\u935A\u9382\u938A\u9398\u93B3\u93D7\u93DF\u93E2\u93FD\u942B\u942E\u9433\u9463\u9470\u9472\u947E\u95D0\u96CB\u97C3\u97CC\u981C\u9839\u986B\u98B6\u98EA\u9909\u991A\u9935\u993E\u9951\u99A5\u99B1\u99D9\u99DD\u99F1\u9A2B\u9A62\u9A65\u9AAF\u9AD2\u9AEF\u9B0D\u9B28\u9B77\u9BFD\u9C49\u9C5F\u9C78\u9D3F\u9D72\u9DD7\u9E1B\u9EB4\u9EF4\u9F66\u9F94
|
||||
NGram.KANJI_2_31=\u5DBD\u63C6\u6E3E\u7587\u8AF1\u8B5A\u9695
|
||||
NGram.KANJI_2_32=\u53A5\u589F\u5CD9\u7109\u7F79\u8006\u8654\u8944\u968B\u96CD
|
||||
NGram.KANJI_2_35=\u4F47\u4F91\u4FCE\u4FDF\u527D\u535E\u55DA\u56A5\u5879\u5A11\u5B7A\u5CAB\u5CF4\u5EBE\u5F7F\u5FA8\u601B\u606B\u60B8\u610D\u6134\u619A\u61FA\u6369\u6523\u65CC\u66C4\u6727\u6968\u6A05\u6A48\u6B59\u6BEC\u6D35\u6D38\u6E19\u701F\u7064\u711C\u716C\u71A8\u71E7\u7258\u743A\u746F\u75BD\u75D9\u75F2\u7669\u766C\u76DE\u7729\u77BC\u78EC\u792A\u7A37\u7A62\u7BE6\u7C2A\u7C50\u7D07\u7DD8\u7E5A\u7F8B\u7FD5\u7FF3\u8151\u81CD\u8317\u83F4\u85EA\u85FA\u8823\u895E\u89F4\u8A0C\u8A41\u8AA8\u8ACD\u8B10\u8CC1\u8D05\u8D73\u8E4A\u8E85\u8E91\u8EFB\u8F13\u9087\u914A\u91C9\u923F\u93B0\u9403\u95A8\u95AD\u9730\u9865\u9903\u9945\u9949\u99AD\u99E2\u9A6A\u9D26\u9E1E\u9EDD\u9F2C\u9F72
|
||||
NGram.KANJI_2_36=\u4E9E\u4F86\u5011\u50B3\u5152\u5169\u5340\u5718\u5B78\u5BE6\u5BF6\u5C07\u5EE3\u61C9\u6230\u6703\u689D\u6A02\u6C23\u7063\u7368\u756B\u7576\u767C\u7A31\u7D93\u7E23\u7E3D\u81FA\u8207\u842C\u85DD\u865F\u8B49\u8B80\u8CFD\u908A\u9435\u95DC\u965D\u9AD4\u9EE8
|
||||
NGram.KANJI_2_37=\u5480\u5580\u5C39\u67EF\u68B5\u6D85\u8521\u90B1
|
||||
NGram.KANJI_2_38=\u4E1F\u4F96\u4FE0\u50F1\u5118\u522A\u5291\u52C1\u52DB\u52F3\u52F5\u52F8\u53B2\u55CE\u562F\u580A\u5862\u58AE\u58D8\u58DF\u58E9\u58EF\u5925\u593E\u599D\u5ABD\u5C62\u5EC2\u5EDA\u5EE2\u5F4E\u5F65\u6085\u6158\u61FC\u6200\u62CB\u633E\u6416\u6436\u6490\u64CB\u64E0\u64FA\u6514\u651C\u6524\u6558\u6583\u66B1\u66C6\u66C9\u66E0\u6A11\u6A1E\u6A38\u6A62\u6AB3\u6B16\u6B98\u6BBC\u6C2B\u6DDA\u6DE8\u6DEA\u6DFA\u6EEF\u6EFE\u6F32\u6F51\u6F5B\u700F\u71D2\u7210\u7246\u7260\u72A7\u72F9\u7375\u7378\u758A\u760B\u76DC\u76EA\u77DA\u77FD\u78DA\u7919\u797F\u79AA\u7A05\u7A4C\u7ACA\u7C72\u7D81\u7DDD\u7E31\u7E69\u7E6B\u7E73\u7E96\u7E9C\u81BD\u81C9\u81DF\u8259\u8277\u8396\u83A7\u8523\u8525\u860A\u863F\u8667\u87A2\u87F2\u881F\u883B\u89F8\u8B20\u8B74\u8B9A\u8C4E\u8C6C\u8C93\u8CEC\u8D0A\u8D0F\u8D95\u8E10\u8F4E\u8FAF\u8FF4\u905E\u9072\u9081\u908F\u91AC\u91C0\u91C1\u91D0\u921E\u9223\u9245\u929C\u92B3\u92C1\u9336\u934A\u93C8\u9444\u9452\u947C\u947F\u9592\u95B1\u95C6\u95D6\u95E1\u95E2\u96DE\u9742\u978F\u984F\u9871\u98B1\u98C4\u99ED\u9A37\u9A45\u9A5F\u9AEE\u9B27\u9BCA\u9C77\u9D51\u9D5D\u9E79\u9E7C\u9E7D\u9EB5\u9EBC\u9F61\u9F63\u9F90\u9F9C
|
||||
NGram.KANJI_3_1=\u5283\u7562\u7DEC\u88E1\u8F2F
|
||||
NGram.KANJI_3_2=\u5009\u502B\u5049\u5075\u507D\u5091\u5098\u50B5\u50B7\u50BE\u5100\u5104\u511F\u518A\u525B\u5289\u5442\u5805\u589C\u58C7\u5922\u596A\u5A66\u5B6B\u5BE7\u5BE9\u5DBA\u5E63\u5E7E\u5FB9\u6163\u616E\u6176\u61B2\u61B6\u61F8\u639B\u63DA\u63EE\u640D\u64B2\u64C1\u64EC\u6557\u6575\u6607\u66AB\u68C4\u6A39\u6C96\u6CC1\u6E1B\u6E6F\u6E9D\u6EC5\u6F01\u6F64\u6FC3\u7058\u707D\u7344\u7642\u76E4\u7832\u790E\u7B46\u7D05\u7D0B\u7D14\u7D19\u7D1B\u7D39\u7D61\u7DB1\u7DCA\u7DD2\u7DE0\u7DE9\u7DEF\u7DF4\u7E2E\u7E3E\u8105\u8108\u81E8\u8266\u84CB\u84EE\u85A9\u885D\u88DC\u8972\u8A02\u8A0E\u8A13\u8A17\u8A2A\u8A34\u8A3A\u8A3C\u8A69\u8A73\u8A95\u8AA0\u8AA4\u8AB2\u8AC7\u8ACB\u8B00\u8B1B\u8B1D\u8B5C\u8C9D\u8C9E\u8CA2\u8CA8\u8CA9\u8CAB\u8CAC\u8CB7\u8CBF\u8CC0\u8CDE\u8CE2\u8CFC\u8D08\u8DE1\u8E8D\u8ECC\u8EDF\u8EF8\u8F14\u8F1D\u8F2A\u8F44\u9055\u9069\u9077\u907C\u90F5\u91DD\u9285\u92FC\u9326\u932F\u9375\u9396\u93AE\u93E1\u9451\u9589\u95A3\u9663\u9670\u9673\u96BB\u9801\u9802\u9803\u9806\u9808\u9810\u983B\u984D\u9858\u9867\u98EF\u98F2\u98FE\u990A\u99D0\u9A0E\u9A5A\u9B5A\u9CE5\u9DB4\u9E97\u9F8D
|
||||
NGram.KANJI_3_3=\u543E\u5BEE\u5F18\u6590\u725F\u83C5\u85E9\u9E93
|
||||
NGram.KANJI_3_4=\u5016\u53AD\u5606\u5629\u58BE\u5F14\u6065\u6144\u646F\u647A\u67F5\u6953\u6C3E\u6F2C\u6F97\u6FB1\u7169\u71E6\u71ED\u74BD\u79BF\u7A1C\u7A4E\u7AAF\u7CDE\u7D17\u7D43\u7E55\u7FA8\u807E\u8139\u8490\u8569\u856A\u87FB\u8A23\u8AB9\u8AE6\u8AFA\u8B2C\u8CD1\u91D8\u92F8\u9318\u96DB\u99B4\u9BC9\u9C2D\u9CF6\u9D61\u9DFA
|
||||
NGram.KANJI_3_5=\u4E26\u4F75\u4FC2\u500B\u5074\u5099\u512A\u5225\u5247\u5275\u5287\u52D5\u52D9\u52DD\u52E2\u5354\u54E1\u554F\u5712\u57F7\u5831\u5834\u5BAE\u5C0E\u5C64\u5CA1\u5CF6\u5E2B\u5E79\u5EAB\u5F35\u5F37\u5F8C\u5FA9\u611B\u614B\u63A1\u63DB\u6642\u66F8\u6771\u696D\u6975\u69CB\u6A19\u6A4B\u6A5F\u6BBA\u6C7A\u6E2C\u6E96\u6F22\u70BA\u7121\u71B1\u7372\u73FE\u74B0\u7570\u76E3\u78BA\u7A2E\u7A4D\u7AF6\u7BC0\u7BC4\u7BC9\u7C21\u7D00\u7D04\u7D0D\u7D1A\u7D30\u7D42\u7D44\u7D50\u7D66\u7D71\u7DAD\u7DDA\u7DE8\u7E54\u7F85\u7FA9\u7FD2\u8056\u805E\u8077\u8208\u83EF\u8449\u8853\u885B\u88FD\u8907\u898B\u898F\u8996\u89AA\u8A08\u8A18\u8A2D\u8A31\u8A55\u8A5E\u8A66\u8A71\u8A72\u8A8C\u8A8D\u8A9E\u8ABF\u8AD6\u8AF8\u8B58\u8B70\u8B77\u8CA0\u8CA1\u8CB4\u8CBB\u8CC7\u8CEA\u8ECA\u8ECD\u8F03\u8F09\u8F38\u8FB2\u9023\u9031\u9032\u904A\u904B\u904E\u9054\u9060\u9078\u907A\u9084\u9280\u9577\u9580\u958B\u9593\u9678\u967D\u968A\u968E\u969B\u96E2\u96E3\u96F2\u96FB\u97D3\u97FF\u9805\u9818\u982D\u984C\u985E\u98A8\u98DB\u9928\u99AC\u9BAE
|
||||
NGram.KANJI_3_8=\u5F6B\u6C4E\u7B87\u8A70
|
||||
NGram.KANJI_3_9=\u540B\u5B5C\u826E
|
||||
NGram.KANJI_3_11=\u4F83\u4FF8\u51CB\u52BE\u53F1\u548B\u558B\u5CB1\u5D69\u5F3C\u620E\u621F\u64E2\u67DA\u6854\u69CC\u6A35\u6C8C\u6E1A\u6F15\u6FE0\u717D\u7252\u7AFA\u82D3\u83DF\u8431\u9041\u9149\u9798
|
||||
NGram.KANJI_3_12=\u4ED5\u55E3\u572D\u57A3\u587E\u5983\u5A9B\u5C90\u5E61\u672D\u6960\u6F5F\u72D9\u72E9\u757F\u7949\u7950\u7E82\u7FCC\u82B8\u90B8\u91DC\u961C\u9B45
|
||||
NGram.KANJI_3_13=\u55AB\u6249\u643E\u6841\u68B1\u725D\u7B8B\u7C95\u7E1E\u7F36\u8A03\u8A6B\u8E74\u95A4
|
||||
NGram.KANJI_3_15=\u50AD\u50D1\u5132\u51F1\u55AC\u5617\u5687\u584A\u59EA\u5B30\u5BF5\u5C0B\u5C4D\u5EDF\u6182\u61A4\u64AB\u64FE\u66A2\u6897\u694A\u69CD\u6B3D\u6BC0\u6D29\u6F38\u7015\u7149\u71C8\u723A\u7336\u7345\u755D\u76C3\u78A9\u798D\u7AAE\u7DFB\u7E2B\u7F75\u7F77\u81E5\u834A\u852D\u85CD\u8755\u8A3B\u8A54\u8AE7\u8B02\u8B39\u8CAA\u8CE6\u8DA8\u8E5F\u8F5F\u905C\u912D\u919C\u92D2\u932B\u937E\u9418\u9583\u9812\u985B\u9905\u99B3\u99C1\u99D5\u9A30\u9CF3\u9D3B\u9D6C
|
||||
NGram.KANJI_3_16=\u6D6C\u72FD\u77A5\u8956\u9C0D
|
||||
NGram.KANJI_3_18=\u5919\u5F4A\u6063\u63AC\u649A\u6715\u6AD3\u71D0\u758B\u834F\u85F7\u88DF\u8F61\u93D1\u98F4\u9D60
|
||||
NGram.KANJI_3_19=\u4F50\u7DB2\u962A
|
||||
NGram.KANJI_3_22=\u5E96\u75D4\u91C6
|
||||
NGram.KANJI_3_23=\u5E9A\u6C40\u821C\u839E\u8FED\u9EDB
|
||||
NGram.KANJI_3_27=\u5F01\u66DC
|
||||
NGram.KANJI_3_29=\u5023\u5208\u531D\u536F\u53E9\u54C9\u598A\u59BE\u5A20\u5D6F\u5DF3\u66C7\u66D6\u66F3\u6775\u6A3D\u6ADB\u6B86\u6C72\u6E25\u73EA\u7435\u760D\u7656\u7825\u78D0\u7A14\u7A6B\u7B20\u7BE0\u7CF8\u7DAC\u7DBB\u7DBE\u80E4\u80F4\u837B\u8466\u8568\u867B\u8A63\u91E7\u9320\u935B\u9591\u965B\u98E2\u990C\u9913\u9BAB
|
||||
NGram.KANJI_3_30=\u60B6\u8AD2\u8CC2\u9237\u9328\u934D\u9397\u9830
|
||||
NGram.KANJI_3_31=\u4FB6\u50D5\u51CD\u559A\u55AA\u5674\u5857\u585A\u5875\u58B3\u596E\u59E6\u5A41\u5D50\u5E25\u5E33\u5F59\u61C7\u61F2\u6368\u6383\u65AC\u68DF\u68F2\u6A3A\u6B04\u6DBC\u6DF5\u6E26\u6E4A\u6E67\u6F54\u6F70\u6FC1\u6FEB\u7159\u727D\u7652\u77EF\u78EF\u798E\u7A40\u7AAA\u7BE4\u7C60\u7CE7\u7CFE\u7D21\u7D33\u7D5E\u7D79\u7DB4\u7DBF\u7E1B\u7E8F\u7F70\u814E\u816B\u8178\u819A\u84BC\u85A6\u865C\u8766\u8A1F\u8A50\u8A60\u8A6E\u8A87\u8A98\u8AB0\u8ADC\u8AED\u8AEE\u8B0E\u8B19\u8CA7\u8CAF\u8CB8\u8CBC\u8CC3\u8CC4\u8CCA\u8CDC\u8CE0\u8CED\u8ED2\u8F29\u8F3F\u91E3\u920D\u9234\u925B\u9298\u9310\u934B\u958F\u95A5\u9727\u97FB\u9811\u984E\u98FC\u98FD\u99D2\u99FF\u9B31\u9BE8\u9C57\u9CE9\u9CF4\u9D28\u9DF9
|
||||
NGram.KANJI_3_32=\u4E1E\u502D\u51A5\u5321\u58EC\u5A3C\u5BC5\u5CE8\u61A9\u620A\u65A1\u6714\u6853\u6893\u6C50\u6C5D\u7436\u745A\u745B\u773A\u7941\u7947\u8543\u865E\u8C5A\u914B\u99A8\u9AB8
|
||||
NGram.KANJI_3_35=\u4E99\u5BA5\u5DFD\u608C\u60C7\u60DA\u6190\u61A7\u6753\u6777\u6787\u6B4E\u6F23\u6FE1\u6FEF\u7337\u7827\u786F\u7893\u7ABA\u7B94\u7BB8\u7C3E\u7D62\u7E6D\u80B1\u81BF\u81C6\u821B\u82E7\u83F0\u84D1\u86ED\u8888\u8B01\u8B04\u8F4D\u9291\u92E4\u932E\u9354\u936C\u939A\u9957\u9AED\u9BAA\u9BAD\u9BD6\u9BDB\u9C3B\u9D1B
|
||||
NGram.KANJI_3_36=\u50C5\u53E2\u5EE0\u65BC\u70CF\u723E\u7D10\u7D9C\u806F\u8607\u862D\u8A0A\u8AFE\u8CD3\u9019\u9813\u9B6F
|
||||
NGram.KANJI_3_37=\u4EA8\u4F3D\u5384\u5EFF\u60DF\u66DD\u6E5B\u8087\u82D1\u8FE6\u9640\u9E9F
|
||||
NGram.KANJI_3_38=\u5147\u525D\u5678\u617E\u6372\u79A6\u8ABC\u92EA\u9438\u9817
|
||||
NGram.KANJI_4_0=\u6D3C\u718F\u74EE\u8712
|
||||
NGram.KANJI_4_9=\u4F84\u54C6\u5565\u68F1\u6D82\u83C7
|
||||
NGram.KANJI_4_10=\u4FE9\u4FED\u51FF\u523D\u5300\u5364\u538C\u5450\u5455\u545C\u54D1\u54D7\u5578\u56A3\u58F6\u592F\u5CE6\u5D2D\u5E90\u6073\u607C\u60EB\u61D2\u62E2\u62E3\u631A\u6320\u6323\u6361\u63B7\u63B8\u63BA\u6405\u65A9\u65F7\u6619\u6655\u67A3\u67E0\u6805\u6808\u6866\u6868\u6869\u6A71\u6BE1\u6C79\u6CA5\u6CDE\u6DA4\u6DA7\u6DA9\u6E85\u70DB\u70E6\u70EB\u7115\u724D\u7410\u759F\u75AE\u75EA\u75F9\u762B\u763E\u76B1\u77EB\u783E\u79C3\u7A8D\u7A9C\u7B5D\u7BF1\u7EC5\u7ED2\u7EDE\u7EE3\u7EF7\u7EF8\u7EFD\u7F00\u7F0E\u7F15\u7F1A\u7F20\u7F24\u7F28\u7FA1\u7FD8\u8038\u803B\u804B\u80AE\u817B\u82C7\u8327\u835E\u8367\u83BA\u8424\u864F\u8681\u8682\u8715\u8717\u8721\u8747\u874E\u8845\u886C\u889C\u88E4\u89C5\u8BB6\u8BB9\u8BC0\u8BC5\u8BE1\u8BEB\u8BEC\u8BF5\u8C0E\u8C1A\u8D2E\u8D31\u8D43\u8D4E\u8D58\u8F67\u8F7F\u9489\u9499\u949D\u94A0\u94A5\u94AE\u94BE\u94D0\u94DB\u94F2\u9508\u950C\u951A\u9525\u952D\u952F\u9530\u953B\u9540\u9550\u9570\u9576\u95F0\u960E\u9668\u96CF\u97E7\u9885\u988A\u98A4\u9965\u9975\u997A\u997F\u9985\u998D\u998F\u9A6E\u9A6F\u9A74\u9A79\u9A7C\u9A82\u9A87\u9CA4\u9CC4\u9CCD\u9CD6\u9E20\u9E25\u9E35\u9E3D\u9E45\u9E49\u9E4A\u9E66
|
||||
NGram.KANJI_4_16=\u576F\u579B\u6345\u78B4\u79EB\u79F8
|
||||
NGram.KANJI_4_17=\u4E13\u4E1A\u4E1C\u4E24\u4E25\u4E2A\u4E3E\u4E49\u4E50\u4E66\u4E9A\u4EA7\u4EBF\u4ECE\u4EEC\u4EF7\u4F17\u4F20\u5170\u5173\u519B\u51B3\u51E4\u51FB\u5219\u521B\u522B\u529E\u52A1\u52A8\u52BF\u534F\u5355\u536B\u5386\u53BF\u53D1\u53D8\u542F\u5458\u54CD\u56E2\u56ED\u56F4\u56FE\u573A\u5904\u590D\u5934\u5B81\u5B9E\u5BF9\u5BFC\u5C14\u5C9B\u5E26\u5E7F\u5E94\u5F00\u5F20\u5F3A\u603B\u6218\u65E0\u65F6\u663E\u672F\u6743\u6784\u6807\u6C14\u6C49\u707E\u70ED\u73AF\u73B0\u7535\u76D1\u786E\u79CD\u79EF\u7B80\u7C7B\u7EA2\u7EA6\u7EA7\u7EAA\u7EBF\u7EC4\u7EC7\u7ED3\u7EDF\u7EE7\u7EED\u7EF4\u7F16\u7F57\u804C\u8054\u817E\u8282\u82CF\u83B7\u8425\u89C1\u89C2\u89C4\u89C6\u8BA1\u8BA4\u8BAE\u8BAF\u8BB0\u8BB8\u8BBA\u8BBE\u8BC1\u8BC4\u8BD1\u8BDD\u8BE5\u8BED\u8BF4\u8C03\u8D22\u8D23\u8D28\u8D39\u8D44\u8D5B\u8F66\u8F6C\u8F83\u8FBE\u8FC7\u8FD0\u8FD8\u8FD9\u8FDB\u8FDE\u9009\u94C1\u957F\u95E8\u95EE\u95F4\u95FB\u961F\u9633\u9645\u9646\u96BE\u9879\u9884\u9886\u9898\u98CE\u9A6C\u9F99
|
||||
NGram.KANJI_4_18=\u51DB\u67B7
|
||||
NGram.KANJI_4_22=\u4FA5\u545B\u5499\u5520\u5570\u56F1\u5A76\u5C96\u60AF\u60ED\u618B\u61A8\u62A0\u62A1\u62E7\u6363\u6390\u63B0\u6400\u6402\u6512\u6748\u70C1\u732C\u765E\u7663\u76CF\u7741\u781A\u7980\u79C6\u79FD\u7AA5\u7B0B\u7B8D\u7BA9\u7BAB\u7BD3\u7CAA\u7EAB\u7ECA\u7EE2\u7F2D\u7F30\u8110\u8113\u81CA\u835A\u8360\u84D6\u852B\u87E5\u8869\u8A8A\u8BA5\u8BF2\u8C05\u8C12\u8D30\u8D4A\u8D61\u8DF7\u8E6D\u8E8F\u8F95\u8F99\u8FAB\u94B3\u94C6\u94E3\u9504\u954A\u9563\u95FA\u9893\u9981\u9992\u9AA1\u9CAB\u9E2F\u9E33\u9EB8
|
||||
NGram.KANJI_4_24=\u4E22\u4E8F\u4F1E\u4FA3\u5151\u517D\u51BB\u51D1\u5220\u529D\u52CB\u5367\u5389\u5395\u53E0\u53F9\u5413\u548F\u5524\u575E\u575F\u5784\u5792\u57A6\u57AB\u58F3\u5986\u5988\u5A04\u5A07\u5BA0\u5C18\u5C82\u5DE9\u5E10\u5E1C\u5F2F\u60E9\u6124\u629B\u6321\u6324\u635E\u63FD\u6401\u644A\u6491\u655B\u658B\u6635\u67AB\u67DC\u680B\u692D\u6984\u6A31\u6B7C\u6BD9\u6C22\u6CA6\u6CA7\u6CEA\u6CFB\u6CFC\u6D46\u6D47\u6D4A\u6D51\u6DA1\u6E0A\u6E83\u6EE4\u6EE5\u6F9C\u6FD2\u70C2\u7237\u727A\u730E\u7574\u75AF\u7792\u7816\u7845\u78B1\u7A77\u7A91\u7A9D\u7AD6\u7B3C\u7B5B\u7CAE\u7EA4\u7EB1\u7EBA\u7ECE\u7ED1\u7EF0\u7EF3\u7F14\u7F1D\u7F34\u7F62\u8042\u806A\u80A0\u80A4\u80BE\u80BF\u80C0\u810F\u8138\u8231\u8270\u829C\u82CD\u8350\u83B9\u841D\u8574\u8680\u8BB3\u8BBC\u8BBD\u8BC8\u8BF1\u8BFD\u8C0A\u8C0D\u8C1C\u8C24\u8C26\u8C2C\u8C2D\u8C34\u8D1E\u8D2C\u8D3C\u8D41\u8D42\u8D4C\u8D50\u8D5A\u8F69\u8F88\u8F90\u8FA9\u915D\u9171\u9493\u949E\u94A7\u94A9\u94BB\u94C3\u94C5\u94DD\u94F8\u9505\u9510\u9523\u9524\u95EF\u95F7\u95F9\u9600\u9610\u96F3\u97F5\u987D\u9882\u9888\u9896\u98D8\u9971\u9972\u9976\u997C\u9A84\u9A86\u9A8F\u9A97\u9A9A\u9AA4\u9CB8\u9CDE\u9E26\u9E43\u9E64\u9E70\u9F7F\u9F9F
|
||||
NGram.KANJI_4_28=\u534E\u62A5\u7ECF\u7F51
|
||||
NGram.KANJI_4_34=\u4E34\u4E3D\u4E4C\u4E54\u4E60\u4E61\u4E70\u4EB2\u4EC5\u4EEA\u4F18\u4F1F\u4F24\u4F26\u4FA7\u50A8\u513F\u5174\u517B\u518C\u519C\u51B5\u51CF\u5218\u521A\u5267\u52B3\u5356\u5382\u5385\u538B\u53A6\u5434\u5706\u5723\u5757\u575A\u575B\u575D\u5907\u591F\u593A\u5956\u5B59\u5BA1\u5BAB\u5BBD\u5BBE\u5BFB\u5C42\u5C81\u5E01\u5E08\u5E86\u5E93\u5F02\u5F39\u5F52\u5F55\u5F7B\u6000\u6001\u6076\u620F\u6237\u6267\u6269\u626C\u62A2\u62A4\u62DF\u62E5\u62E9\u6325\u635F\u6362\u6444\u6653\u6682\u6740\u6742\u6768\u6781\u6811\u6837\u6865\u68C0\u6B22\u6BC1\u6BD5\u6C47\u6C9F\u6CAA\u6CFD\u6D4B\u6DA8\u6E10\u6EE1\u6EE8\u706D\u7075\u70DF\u7231\u739B\u7597\u76D6\u76D8\u77FF\u7801\u7840\u79BB\u7A33\u7ADE\u7B14\u7B7E\u7CA4\u7D27\u7EB3\u7EBD\u7EC3\u7EC6\u7EC8\u7ECD\u7ED5\u7ED9\u7EDC\u7EDD\u7EE9\u7EFC\u7EFF\u7F13\u7F29\u8083\u80DC\u8111\u814A\u8230\u827A\u8363\u836F\u8428\u84DD\u867D\u8865\u88AD\u89C8\u8BA2\u8BA8\u8BA9\u8BAD\u8BB2\u8BBF\u8BC6\u8BCD\u8BD5\u8BEF\u8BF7\u8BF8\u8BFA\u8BFB\u8C08\u8D1D\u8D1F\u8D21\u8D25\u8D27\u8D2D\u8D2F\u8D35\u8D38\u8DC3\u8F6E\u8F6F\u8F7B\u8F7D\u8F86\u8F91\u8F93\u8F96\u8FB9\u8FBD\u8FC1\u8FDC\u8FDD\u9002\u9057\u90BB\u90D1\u91CA\u9488\u949F\u94A2\u94B1\u94F6\u9500\u9526\u9547\u9614\u9634\u9635\u9636\u9648\u9655\u9669\u9690\u97E9\u9875\u9876\u987A\u987B\u987E\u987F\u9891\u989D\u98DE\u9986\u9A7B\u9A8C\u9C81\u9C9C\u9F50
|
||||
NGram.KANJI_4_39=\u4E1B\u4E1D\u4E27\u4EA9\u4ED1\u4ED3\u4F2A\u4FA6\u4FA8\u503A\u503E\u507F\u5188\u51AF\u51C0\u51C9\u51ED\u51EF\u5242\u5251\u52B2\u5362\u53A2\u5415\u5417\u5428\u55B7\u5760\u5899\u5939\u594B\u5987\u5A31\u5A74\u5BAA\u5C1D\u5C7F\u5C97\u5CAD\u5E05\u5E2E\u5E99\u5E9E\u5E9F\u5F03\u5FC6\u5FE7\u60AC\u60CA\u60EF\u626B\u6270\u629A\u62E6\u62E8\u6446\u6447\u654C\u67AA\u680F\u6863\u68A6\u6C64\u6D01\u6D53\u6D9D\u6DA6\u6E14\u6E17\u6EDA\u6EE9\u707F\u70BC\u70E7\u7275\u72B9\u72EE\u72F1\u743C\u7545\u76D0\u7855\u7978\u7B79\u7BEE\u7EA0\u7EAC\u7EAF\u7EB2\u7EB5\u7EB7\u7EB8\u7EB9\u7ED8\u7EEA\u7EF5\u7F05\u7F06\u7F18\u7F5A\u80C1\u80F6\u8109\u8206\u8273\u82F9\u8346\u8361\u83B2\u8427\u8651\u867E\u8854\u89C9\u8BC9\u8BCA\u8BD7\u8BDA\u8BDE\u8BE2\u8BE6\u8BFE\u8C01\u8C0B\u8C10\u8C13\u8C22\u8C23\u8C28\u8C31\u8D24\u8D26\u8D29\u8D2A\u8D2B\u8D34\u8D37\u8D3A\u8D3E\u8D3F\u8D4B\u8D4F\u8D54\u8D56\u8D5E\u8D60\u8D62\u8D75\u8D76\u8D8B\u8F68\u8F70\u8F74\u8F85\u8F89\u8FC8\u8FDF\u900A\u9012\u903B\u9093\u90AE\u917F\u9274\u94A6\u94DC\u94ED\u94FA\u94FE\u9501\u950B\u9519\u9521\u952E\u955C\u95EA\u95ED\u95F2\u95F8\u95FD\u9601\u9605\u9647\u96B6\u96FE\u9877\u9881\u9887\u9897\u989C\u98A0\u996D\u996E\u9970\u9A70\u9A71\u9A73\u9A76\u9A7E\u9A91\u9C7C\u9E1F\u9E21\u9E23\u9E2D\u9E3F\u9E4F\u9F84
|
||||
NGram.KANJI_5_10=\u5239\u8EAF
|
||||
NGram.KANJI_5_11=\u51C4\u8471
|
||||
NGram.KANJI_5_12=\u6DC0\u7C98
|
||||
NGram.KANJI_5_13=\u5631\u5815\u8695
|
||||
NGram.KANJI_5_14=\u4E71\u4FA0\u5265\u52B1\u5374\u53A8\u53D9\u58EE\u5BDD\u5BFF\u5C3D\u5C4A\u5CE1\u5F25\u5F84\u604B\u60A6\u60E7\u60E8\u631F\u636E\u643A\u663C\u664B\u67A2\u6816\u697C\u6B8B\u6BB4\u6D45\u6E7F\u6EDE\u6F5C\u706F\u7089\u72ED\u732A\u732B\u76D7\u793C\u7977\u7A0E\u7A83\u80C6\u811A\u8131\u82A6\u830E\u848B\u865A\u866B\u86EE\u89E6\u8A89\u8DF5\u8E0A\u8E2A\u8F9E\u9065\u968F\u9759\u9EA6
|
||||
NGram.KANJI_5_18=\u601C\u75D2
|
||||
NGram.KANJI_5_26=\u4E07\u4E0E\u4E89\u4F1A\u4F53\u515A\u5185\u5199\u533A\u533B\u53C2\u53CC\u53F7\u58F0\u5965\u5B66\u5B9D\u5C06\u5C5E\u5F53\u62C5\u6570\u65AD\u65E7\u6761\u6765\u6A2A\u6B27\u6CA1\u6E29\u6E7E\u70B9\u72B6\u72EC\u732E\u753B\u79F0\u88C5\u9EC4
|
||||
NGram.KANJI_5_29=\u693F\u82EB
|
||||
NGram.KANJI_5_34=\u53F6\u6D9B\u83B1
|
||||
NGram.KANJI_5_39=\u5C61\u788D
|
||||
NGram.KANJI_6_0=\u4E10\u4E52\u4EC6\u4F88\u4FD0\u51F3\u533E\u53ED\u53EE\u5406\u541D\u5429\u5435\u5440\u5490\u5495\u54B1\u54C4\u54FC\u557C\u55D3\u5669\u56E4\u5777\u5992\u59E8\u5B7D\u5BDE\u5BE5\u5C79\u5C94\u5DCD\u5E18\u5E1A\u5E54\u5FF1\u604D\u6064\u60F6\u6127\u6177\u6233\u6252\u625B\u6273\u6296\u62C2\u62C7\u62F4\u638F\u6396\u63E3\u63EA\u6413\u6479\u64A9\u64C2\u659F\u667E\u6760\u6845\u6963\u6A90\u6B83\u6C13\u6C5E\u6D8E\u6D95\u6DCC\u6ED4\u6F13\u6F3E\u6FA1\u7076\u70D8\u710A\u71CE\u7239\u72E1\u73B7\u7599\u759A\u75A4\u75CA\u7629\u7682\u76C5\u76EF\u778E\u77AA\u787C\u7889\u788C\u78BE\u79E7\u7A96\u7A98\u7B77\u7C7D\u7CB1\u7D0A\u7D6E\u7F94\u7FCE\u8116\u814B\u814C\u819B\u828D\u82DF\u8301\u83E0\u85D5\u8611\u86A3\u8708\u8822\u8C4C\u8DB4\u8DEA\u8E42\u8E66\u8E72\u8EBA\u901B\u9157\u970E\u97ED
|
||||
NGram.KANJI_6_3=\u62FC\u88D4\u9B4F
|
||||
NGram.KANJI_6_9=\u4ED7\u4F63\u4FCF\u5018\u50BB\u50F5\u5154\u5201\u522E\u5254\u527F\u5306\u5462\u5492\u5496\u54A8\u54AA\u554A\u5561\u5564\u5566\u5885\u5938\u5AC2\u5AE9\u5CED\u5F64\u6084\u608D\u60A8\u60D5\u61C2\u61C8\u6254\u626F\u62AC\u6346\u634D\u640F\u6454\u6487\u6495\u64D2\u6746\u6789\u68B3\u68F5\u695E\u6986\u6995\u69A8\u6A44\u6AAC\u6B79\u6C28\u6C2E\u6CF5\u6DE4\u6E34\u6E3A\u6E89\u6F29\u70AB\u70AC\u7130\u715E\u7184\u71AC\u7238\u7281\u72E0\u74E3\u74F7\u7529\u7578\u761F\u7626\u76D4\u775B\u7779\u7784\u77BB\u780C\u780D\u7838\u7898\u78C5\u78F7\u7AED\u7B28\u7BE1\u7C07\u7CD5\u7CD9\u7CEF\u7F38\u800D\u8084\u809A\u8165\u816E\u832B\u8334\u840D\u8774\u886B\u888D\u88D9\u88F9\u8C41\u8D81\u8D9F\u8E22\u8E29\u8EB2\u8F9C\u9165\u918B\u9631\u964B\u964C\u9661\u9709\u9739\u9776\u9AD3\u9ED4
|
||||
NGram.KANJI_6_10=\u4E53\u5582\u5600\u6342\u7B06
|
||||
NGram.KANJI_6_11=\u5288\u543C\u5475\u5486\u54EE\u5598\u56BC\u5962\u5A36\u5A9A\u5B75\u5BA6\u5C38\u5C4E\u5F8A\u5F98\u627C\u62CC\u62D7\u63C9\u6930\u6954\u69D0\u6BEF\u6C90\u6CBD\u6CBE\u6F31\u6F88\u70D9\u7329\u75BC\u75F0\u7737\u77D7\u7B19\u7FB9\u803F\u80D6\u813E\u81C0\u8205\u8309\u83BD\u846B\u8517\u868C\u8759\u8815\u8859\u8B6C\u8E81\u8EAC\u90A2\u9698\u9B44
|
||||
NGram.KANJI_6_12=\u722C\u7FD4
|
||||
NGram.KANJI_6_16=\u5228\u5315\u542E\u54CE\u5509\u5527\u5543\u55B3\u55E1\u5636\u568E\u5FFF\u61E6\u6376\u642A\u6726\u74E4\u76F9\u7736\u7BD9\u8019\u80F0\u80F3\u812F\u818A\u8200\u8214\u8638\u869C\u86C0\u86C6\u86D4\u87C6\u88B1\u8902\u8C7A\u8E4B\u9119
|
||||
NGram.KANJI_6_18=\u67D2\u6ED3\u87C0\u87CB\u8DDB\u901E\u9163
|
||||
NGram.KANJI_6_20=\u4F5B\u52D2\u54C8\u62FF\u66FC\u6D59\u704C\u7586\u9ECE
|
||||
NGram.KANJI_6_21=\u4E48\u4EFF\u4F19\u4FF1\u5021\u5077\u5195\u5212\u5269\u5401\u541E\u5427\u54EA\u5587\u558A\u55BB\u566A\u573E\u574E\u5783\u57AE\u584C\u58E4\u5960\u5976\u59CA\u5A1C\u5DE2\u5F99\u600E\u6015\u6263\u626D\u6293\u62C6\u62D6\u62EF\u62F1\u6316\u632A\u6380\u6389\u63D2\u641E\u64C5\u64CE\u65F1\u6664\u6735\u6770\u67EC\u6846\u684C\u68AD\u6B47\u6B49\u6B67\u6C1B\u6C27\u6C2F\u6C5B\u6C89\u6DF9\u6EAF\u70AE\u70E4\u731C\u7334\u73BB\u7470\u76FC\u788E\u789F\u78B0\u78B3\u7A0D\u7A3B\u7A57\u7CB9\u7F69\u8335\u8354\u84BF\u8DCC\u8DD1\u904F\u90A8\u9189\u9677\u9738\u978B
|
||||
NGram.KANJI_6_22=\u5162\u53E8\u542D\u5501\u552C\u5639\u563F\u56B7\u6043\u60B4\u6194\u61CA\u634E\u63CD\u6414\u64AC\u6DAE\u6E43\u6F66\u7095\u7316\u733E\u7728\u7830\u78D5\u7ABF\u7FE9\u8018\u80EF\u8198\u8693\u86AA\u86AF\u874C\u8783\u879F\u8892\u8E6C
|
||||
NGram.KANJI_6_23=\u4FD8\u4FEF\u501A\u5085\u5180\u526A\u5323\u54ED\u5634\u56CA\u58A9\u58F9\u5955\u5978\u59DA\u5A49\u5B55\u5BC7\u5BE8\u5D4C\u5E62\u6467\u64BC\u6500\u655E\u6572\u658C\u6670\u68CD\u68D5\u68E0\u6912\u6A0A\u6BB7\u6C9B\u6D3D\u6DC6\u6E23\u6F8E\u7011\u7092\u714C\u73AB\u7405\u7624\u76D2\u7960\u79C9\u7A20\u7BF7\u7F50\u804A\u8086\u81C2\u8292\u82DE\u852C\u857E\u859B\u8760\u8C6B\u8DBE\u8E48\u8F9F\u96A7
|
||||
NGram.KANJI_6_25=\u4E8E\u5DF2\u5FB7\u7AD9
|
||||
NGram.KANJI_6_28=\u4E58\u4ECD\u4EFD\u4F30\u4F60\u4F69\u503C\u5047\u51B0\u51F0\u5361\u5377\u53E6\u54E5\u552E\u5708\u5740\u5761\u57C3\u5821\u589E\u5979\u59C6\u5B69\u5B83\u5E15\u5E76\u5F17\u5F88\u6208\u622A\u624E\u627E\u62D4\u62DC\u63ED\u641C\u6536\u6548\u65C1\u665A\u6668\u67E5\u6B65\u6BCF\u6C61\u6CDB\u6D4E\u6D89\u6DB5\u6E38\u6EAA\u6FB3\u70B8\u745F\u7538\u7A97\u7F3A\u7F55\u805A\u8258\u827E\u82AC\u8303\u83F2\u8482\u85CF\u8DDF\u903E\u9080\u970D\u9760\u9ED1\u9ED8
|
||||
NGram.KANJI_6_29=\u634F\u6518\u7B50\u809B
|
||||
NGram.KANJI_6_30=\u54A7\u57C2\u5AB3\u60CB\u6886\u8378\u85D0\u8671
|
||||
NGram.KANJI_6_32=\u5080\u5121\u51A4\u54AC\u55DC\u592D\u5DEB\u6292\u68D8\u69B4\u6A59\u6E24\u7FC5\u80DA\u8180\u86DB\u8700\u8DCB\u9761
|
||||
NGram.KANJI_6_34=\u4E30\u51E0\u542C\u613F
|
||||
NGram.KANJI_6_35=\u4E56\u547B\u55FD\u5C41\u606C\u6115\u6CAE\u7119\u795F\u7CDC\u86C9\u86F9\u8713\u873B\u8757\u8925\u892A\u96F9
|
||||
NGram.KANJI_6_37=\u51B2\u5308\u5398\u54B8\u59DC\u5C4F\u5D14\u5F6D\u60E0\u6241\u6350\u699C\u6BEB\u6C6A\u6CC4\u6DEE\u6F58\u6F6D\u7199\u77EE\u7ADF\u8058\u820D\u8212\u8389\u8587\u884D\u8881\u8FA8\u8FF9\u96D5
|
||||
NGram.KANJI_6_39=\u574F\u6251\u6302
|
||||
NGram.KANJI_7_0=\u52FA\u5544\u60F0\u6994\u86A4\u86E4
|
||||
NGram.KANJI_7_3=\u4E59\u4E7E\u4EAD\u4EF0\u4EF2\u4F0F\u4F10\u4FAF\u4FCA\u500D\u501F\u5076\u508D\u50E7\u5112\u5146\u5192\u51AC\u51DD\u51FD\u5200\u5237\u524A\u52A3\u52C3\u52C7\u52DF\u5351\u5352\u5353\u5378\u537F\u53E5\u5439\u54FA\u574A\u5782\u57CB\u5893\u58C1\u5915\u5937\u5949\u5951\u5974\u59B9\u5A18\u5A5A\u5ACC\u5B54\u5B5D\u5B64\u5B8F\u5BBF\u5BD2\u5C3A\u5C6F\u5CB3\u5D07\u5DE7\u5E84\u5E8A\u5F26\u5F69\u5F70\u5F90\u5FAA\u5FCD\u6012\u6016\u602A\u60A0\u60B2\u60BC\u6148\u6162\u6170\u6291\u6298\u62AB\u62BC\u62BD\u62D2\u62D3\u62D8\u62F3\u6311\u638C\u6398\u63E1\u642C\u6458\u64A4\u654F\u656C\u659C\u65E2\u65E8\u65EC\u6606\u6614\u6676\u6691\u6696\u66F9\u6749\u676F\u679A\u679D\u67CF\u67D4\u67F1\u67F3\u67F4\u6817\u6842\u6843\u6851\u68A8\u68CB\u68D2\u6B20\u6B32\u6BBF\u6C57\u6C88\u6CCA\u6D17\u6D1E\u6D69\u6D6E\u6D78\u6DE1\u6DFB\u6E58\u6EB6\u6F0F\u6F20\u7070\u708E\u70AD\u7126\u718A\u71C3\u7267\u72C2\u731B\u7384\u73A9\u73CD\u7434\u75AB\u75DB\u76C6\u76FE\u773C\u7891\u78C1\u795D\u7965\u79D2\u79DF\u79E6\u7A00\u7B11\u7B51\u7B54\u7C89\u7C92\u7CD6\u7D2B\u7F8A\u7FBD\u7FFC\u8010\u80A5\u80CE\u8150\u8179\u819C\u8247\u829D\u82B3\u82D7\u82E6\u8302\u8336\u8352\u83CA\u83CC\u83DC\u845B\u846C\u84B2\u84B8\u84C4\u8584\u864E\u86C7\u8861\u8863\u8870\u888B\u8896\u88D5\u8986\u8C46\u8DA3\u8E0F\u8F9B\u8FC5\u8FEB\u8FF7\u9003\u9006\u902E\u9042\u9063\u90ED\u963B\u9676\u96EA\u9756\u9B3C\u9B42\u9F3B
|
||||
NGram.KANJI_7_6=\u4E01\u4E03\u4E45\u4E5D\u4E88\u4E92\u4EA1\u4ECB\u4EE4\u4F01\u4F0A\u4F2F\u4F3C\u4F4E\u4F4F\u4F55\u4F8B\u4F9D\u4FBF\u4FEE\u505C\u50CF\u516B\u516D\u5175\u5177\u5178\u5207\u520A\u5224\u526F\u529F\u52A9\u5343\u5348\u535A\u5370\u53BB\u53CB\u53F3\u5409\u542B\u544A\u547C\u5584\u5747\u5802\u590F\u592B\u5931\u5947\u597D\u5A01\u5A92\u5B63\u5B8C\u5B97\u5BA2\u5BA3\u5BA4\u5BB3\u5BB9\u5BC6\u5BCC\u5BDF\u5C04\u5C1A\u5C45\u5C4B\u5CB8\u5DE6\u5E0C\u5E1D\u5E2D\u5E55\u5E8F\u5E95\u5E97\u5EA7\u5EB7\u5EF6\u5F8B\u5FAE\u5FC5\u5FD7\u5FF5\u601D\u6025\u606F\u60F3\u611F\u623F\u6253\u6279\u627F\u6295\u6297\u62EC\u6388\u6392\u63F4\u6545\u6551\u6574\u6599\u65C5\u65E9\u6613\u6620\u6625\u666E\u666F\u66B4\u66F4\u670D\u671B\u6728\u672B\u6751\u677E\u67B6\u6838\u6839\u6848\u68EE\u690D\u6982\u6A21\u6B4C\u6B62\u6B66\u6BB5\u6BCD\u6C0F\u6C38\u6C42\u6CBF\u6CE2\u6CE8\u6D0B\u6D3E\u6D88\u6DF1\u6E05\u6E56\u706B\u7167\u7206\u7236\u7247\u7387\u7530\u7537\u7559\u7565\u7591\u75C5\u767B\u767D\u767E\u7687\u76DB\u76DF\u771F\u7763\u77ED\u7834\u79FB\u7A81\u7AE0\u7AEF\u7B56\u7B97\u7C4D\u7CBE\u7D20\u7D22\u7F72\u7FA4\u8001\u8003\u81F4\u822A\u826F\u82B1\u8349\u843D\u878D\u8857\u89D2\u8B66\u8C37\u8D70\u8D85\u8D8A\u8DB3\u8FF0\u8FFD\u9001\u901F\u90A3\u90A6\u914D\u91CE\u9632\u963F\u9644\u964D\u9664\u96C4\u96E8\u9752\u9769\u98DF
|
||||
NGram.KANJI_7_7=\u4E09\u4E0A\u4E0B\u4E0D\u4E16\u4E3B\u4E8B\u4E8C\u4EE3\u4EE5\u4F4D\u4F5C\u4F7F\u5165\u5168\u516C\u5171\u51FA\u5206\u5229\u5236\u524D\u529B\u52A0\u5316\u5317\u5357\u539F\u53CA\u53F0\u5408\u540C\u540D\u548C\u5730\u57FA\u5916\u591A\u5929\u5B50\u5B9A\u5BB6\u5C0F\u5C71\u5DDE\u5DE5\u5E02\u5E73\u5EA6\u5EFA\u5F0F\u6027\u6210\u6240\u6307\u653F\u6587\u65B0\u65B9\u660E\u6700\u6709\u671F\u672C\u6B21\u6B63\u6C11\u6CBB\u6CD5\u6D77\u7269\u7279\u7406\u751F\u7528\u7531\u754C\u76EE\u76F8\u793E\u79D1\u7ACB\u7B2C\u7B49\u7CFB\u8005\u80FD\u81EA\u82F1\u884C\u8868\u897F\u8981\u901A\u9053\u90E8\u90FD\u91CD\u9AD8
|
||||
NGram.KANJI_7_9=\u4E4D\u4F36\u5319\u6A61\u6DCB\u7194
|
||||
NGram.KANJI_7_11=\u4E5E\u4F43\u5026\u50FB\u515C\u5243\u5420\u5446\u54B3\u54BD\u553E\u55A7\u5703\u5984\u5AC9\u5B09\u5C51\u5DFE\u5ED3\u5F1B\u6055\u618E\u62D9\u65A7\u6652\u6977\u6EBA\u707C\u75D8\u79E4\u7AFF\u7B4F\u7CA5\u808B\u8098\u80B4\u8235\u82DB\u849C\u8549\u868A\u86FE\u8718\u914C
|
||||
NGram.KANJI_7_12=\u4E08\u4E38\u4F8D\u50DA\u5203\u5256\u52C9\u52D8\u52FE\u5320\u533F\u5375\u53D4\u540F\u54E8\u56DA\u5806\u5996\u5999\u59A5\u59A8\u59FF\u5AE1\u5BB0\u5BF8\u5C09\u5C3F\u5C48\u5C65\u5D29\u5E06\u5E4C\u5EB5\u5EB6\u5EB8\u5F13\u5FCC\u5FD8\u6052\u606D\u609F\u60D1\u614E\u6247\u62B1\u6349\u64E6\u6577\u65ED\u6674\u6734\u67C4\u6850\u690E\u6A58\u6B3A\u6B89\u6C41\u6CBC\u6CCC\u6CF3\u6D74\u6DAF\u6DF3\u6ECB\u6F02\u6F84\u71E5\u7261\u7272\u72AC\u72FC\u733F\u7409\u755C\u76F2\u7720\u77AC\u77E2\u7802\u786B\u78E8\u7901\u7948\u79E9\u7A1A\u7A74\u7AE3\u7B4B\u7B52\u7BB1\u7C3F\u8015\u8096\u809D\u80A2\u80A9\u80AA\u80BA\u80F8\u8102\u810A\u8154\u8155\u8170\u817A\u81A8\u81ED\u820C\u8236\u82BD\u8305\u83E9\u83F1\u840C\u85FB\u8650\u8702\u8A93\u8E44\u8FB0\u9038\u9091\u90AA\u916C\u9175\u9177\u9685\u96C0\u96C7\u96CC\u97AD
|
||||
NGram.KANJI_7_13=\u63D6\u803D
|
||||
NGram.KANJI_7_16=\u602F\u7566
|
||||
NGram.KANJI_7_18=\u634C\u7C38
|
||||
NGram.KANJI_7_19=\u4E18\u4E73\u4E95\u4EAB\u4EC1\u4ED8\u4ED9\u4F11\u4F34\u4F38\u4F59\u4FB5\u4FC3\u4FD7\u5012\u5019\u5065\u50AC\u5144\u5145\u514D\u517C\u51A0\u51B7\u5211\u5238\u523A\u523B\u5272\u52E4\u5360\u5371\u539A\u541B\u5426\u5438\u5473\u54F2\u5510\u552F\u5531\u559C\u5609\u56F0\u56FA\u591C\u5948\u594F\u59BB\u59D3\u5B85\u5B87\u5B88\u5B99\u5B9C\u5BC4\u5BFA\u5C0A\u5C3E\u5CA9\u5D0E\u5DE1\u5DE8\u5DEE\u5DF1\u5E45\u5E78\u5E7B\u5E7C\u5EAD\u5EF7\u5F1F\u5F31\u5F79\u5F7C\u5F85\u5F92\u5FA1\u5FE0\u6050\u60A3\u6212\u62DB\u632F\u6355\u63A2\u63AA\u63CF\u642D\u6469\u64CD\u653B\u6563\u660C\u662D\u667A\u6697\u66FF\u6750\u675F\u677F\u6790\u67D3\u682A\u6885\u68B0\u6B8A\u6B96\u6BDB\u6C60\u6CB9\u6CC9\u6D25\u6D66\u6DB2\u6DF7\u6E21\u6ED1\u6F2B\u6F6E\u6FC0\u7235\u725B\u72AF\u7389\u7532\u7533\u756A\u75BE\u75C7\u76AE\u76CA\u7740\u786C\u7956\u7968\u796D\u7981\u79C0\u79C1\u79CB\u79D8\u7A3F\u7AE5\u7AF9\u7E41\u7F6A\u7FFB\u8089\u80CC\u80DE\u81E3\u821E\u8239\u82E5\u8328\u8377\u85E4\u8840\u88C1\u88C2\u8C6A\u8D64\u8DDD\u8FCE\u8FD4\u9000\u9014\u907F\u90CA\u90CE\u90E1\u9152\u9178\u9686\u9694\u969C\u9707\u9732\u9AA8\u9B54\u9E7F\u9EBB
|
||||
NGram.KANJI_7_20=\u4E39\u4E43\u4EAE\u4F73\u504F\u505A\u51C6\u51CC\u52AA\u5339\u5347\u53EB\u53EC\u5448\u5766\u57F9\u5854\u585E\u58A8\u5B8B\u5C01\u5CF0\u5E72\u5EC9\u5F80\u5F81\u5FBD\u5FEB\u6069\u6211\u624D\u628A\u62B5\u62CD\u6309\u63A7\u64AD\u6566\u6597\u65CB\u65D7\u6628\u6717\u6731\u674E\u675C\u683D\u6881\u6B3E\u6BD2\u6C7D\u6C99\u6CE5\u6CF0\u6D1B\u6D2A\u70C8\u719F\u724C\u7259\u73E0\u73ED\u745E\u74E6\u7518\u751A\u7686\u770B\u7B26\u8033\u80A1\u80E1\u821F\u83AB\u8499\u8D74\u8DE8\u900F\u9010\u9047\u904D\u906D\u9675\u96C5\u96F6\u96F7\u9700\u9F13
|
||||
NGram.KANJI_7_21=\u5764\u59D0\u5A03\u6062\u6108\u68C9\u7164\u79BE\u7BAD\u903C
|
||||
NGram.KANJI_7_23=\u4EA5\u50B2\u532A\u5366\u543B\u54E9\u5632\u59D1\u5BB5\u5DF7\u5F6A\u5F6C\u5FFD\u6070\u6168\u61BE\u63A0\u63A9\u6478\u65A4\u68A7\u6A1F\u6CAB\u70F9\u711A\u723D\u7262\u72F8\u751C\u754F\u75B9\u76C8\u7709\u7897\u7CCA\u7F9E\u8299\u82AD\u82B9\u82D4\u8304\u84C9\u84EC\u854A\u85AF\u86D9\u8FA3\u9187\u97A0
|
||||
NGram.KANJI_7_25=\u4E14\u4E5F\u4F46\u514B\u5176\u5230\u5373\u53EA\u540E\u5982\u5C3C\u5DF4\u6216\u62C9\u65AF\u66FE\u6B64\u6D32\u6D6A\u7BC7\u800C
|
||||
NGram.KANJI_7_28=\u4E4E\u4E9B\u4EA6\u4EC0\u4FC4\u5403\u5957\u5C24\u6089\u6258\u67D0\u758F\u7FF0\u8D6B
|
||||
NGram.KANJI_7_29=\u4FAE\u5944\u5A29\u6101\u62ED\u6328\u637B\u6666\u6687\u66AE\u673D\u6756\u67FF\u6813\u68A2\u699B\u7078\u708A\u7396\u7422\u7525\u75E2\u76BF\u7766\u77B3\u7A3C\u7A92\u819D\u81FC\u8237\u8338\u8511\u88F3\u8FC2
|
||||
NGram.KANJI_7_32=\u4E11\u4F3A\u4F51\u5197\u51B6\u51F9\u52FF\u541F\u5507\u5589\u5993\u5A7F\u5AC1\u5B9B\u5BC2\u5BE1\u5F04\u5F0A\u5F27\u6020\u6028\u6068\u6094\u6109\u611A\u614C\u621A\u62B9\u62D0\u62F7\u62FE\u632B\u633D\u6367\u660F\u6627\u6643\u66D9\u674F\u6795\u67AF\u67D1\u6876\u68DA\u68FA\u6905\u69FD\u6A80\u6B6A\u6CB8\u6CE3\u6DD1\u6DEB\u6E9C\u6EA2\u6EF4\u6F06\u714E\u716E\u722A\u7280\u74A7\u752B\u75B2\u75D5\u75F4\u77AD\u77E9\u785D\u79BD\u7A3D\u7A9F\u7B1B\u7B95\u7C9F\u7CDF\u80C3\u8106\u817F\u818F\u81B3\u828B\u82A5\u82AF\u840E\u851A\u853D\u8776\u87F9\u8877\u8910\u8912\u8C79\u8D66\u8FB1\u9017\u90C1\u916A\u9699\u96C1\u971C\u9774\u978D
|
||||
NGram.KANJI_7_33=\u4E4B\u4E86\u4E94\u4EA4\u4EAC\u4ECA\u4ED6\u4EF6\u4EFB\u4F9B\u4FDD\u4FE1\u5143\u5148\u5149\u518D\u5217\u521D\u5305\u5341\u534A\u53C8\u53CD\u53D6\u53D7\u53E3\u53E4\u53EF\u53F2\u53F8\u5404\u5411\u5468\u547D\u54C1\u5546\u5668\u56DB\u56DE\u56E0\u571F\u578B\u57CE\u57DF\u5883\u58EB\u592A\u592E\u5973\u59CB\u59D4\u5B57\u5B58\u5B89\u5B98\u5C11\u5C31\u5C40\u5C55\u5DDD\u5E03\u5E38\u5E9C\u5F15\u5F62\u5F71\u5F97\u5FC3\u60C5\u610F\u624B\u6280\u6301\u63A5\u63A8\u63D0\u652F\u6539\u653E\u6559\u65BD\u65CF\u661F\u66F2\u671D\u672A\u6797\u679C\u6821\u683C\u6B7B\u6BD4\u6C34\u6C5F\u6CB3\u6D3B\u6D41\u6E2F\u6E90\u6F14\u7136\u7248\u738B\u7403\u76F4\u7701\u77E5\u77F3\u7814\u793A\u795E\u798F\u7A0B\u7A76\u7A7A\u7BA1\u7C73\u7F6E\u7F8E\u80B2\u81F3\u822C\u8272\u8457\u88AB\u89E3\u8A00\u8C61\u8D77\u8DEF\u8EAB\u8FD1\u9020\u91CC\u91CF\u91D1\u9650\u9662\u96C6\u975E\u9762\u97F3\u9996\u9999
|
||||
NGram.KANJI_7_35=\u55C5\u57A2\u58D5\u59E5\u637A\u74E2\u7CE0\u895F
|
||||
NGram.KANJI_7_37=\u4E19\u4E32\u4E4F\u4E91\u4EC7\u4ED4\u4F0D\u5141\u51E1\u51F6\u51F8\u52AB\u535C\u53C9\u53DB\u540A\u5410\u54C0\u559D\u5750\u5751\u576A\u57E0\u5824\u582A\u5830\u5835\u5851\u5858\u586B\u5954\u59FB\u5A46\u5B5F\u5BB4\u5BD3\u5C16\u5C60\u5CFB\u5D16\u5E16\u5E3D\u5E7D\u5E87\u5ECA\u5FD9\u60DC\u60F9\u6155\u6167\u6234\u626E\u6276\u6284\u633A\u6377\u6492\u649E\u64B0\u6562\u6591\u65A5\u65E6\u65FA\u6602\u670B\u676D\u68AF\u695A\u6B23\u6BC5\u6C70\u6C83\u6CE1\u6D8C\u6DD8\u6E20\u71D5\u72D0\u72D7\u73B2\u73CA\u7433\u7483\u74DC\u74F6\u7554\u764C\u7761\u77DB\u78A7\u7A46\u7A7F\u7A84\u7C97\u7D2F\u7FC1\u7FE0\u8000\u8017\u808C\u80AF\u8404\u8461\u8463\u8475\u8513\u85AA\u8679\u86CB\u871C\u87BA\u88F8\u8C8C\u8DF3\u8FC4\u901D\u9022\u906E\u9075\u9192\u91C7\u966A\u971E\u9910\u9B41\u9F0E\u9F20
|
|
@ -1,271 +0,0 @@
|
|||
using System;
|
||||
using System.Text;
|
||||
using MediaBrowser.Model.IO;
|
||||
using MediaBrowser.Model.Serialization;
|
||||
using MediaBrowser.Model.Text;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using NLangDetect.Core;
|
||||
using UniversalDetector;
|
||||
|
||||
namespace Emby.Server.Implementations.TextEncoding
|
||||
{
|
||||
public class TextEncoding : ITextEncoding
|
||||
{
|
||||
private readonly IFileSystem _fileSystem;
|
||||
private readonly ILogger _logger;
|
||||
private IJsonSerializer _json;
|
||||
|
||||
/// <summary>
/// Initializes a new instance of the <see cref="TextEncoding"/> class.
/// </summary>
/// <param name="fileSystem">File system abstraction; stored on the instance.</param>
/// <param name="logger">Logger that receives the debug output of the detection steps.</param>
/// <param name="json">JSON serializer handed to the language detector when it is first initialized.</param>
public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
{
    this._json = json;
    this._logger = logger;
    this._fileSystem = fileSystem;
}
|
||||
|
||||
/// <summary>
/// Gets the ASCII encoding.
/// </summary>
/// <returns><see cref="Encoding.ASCII"/>.</returns>
public Encoding GetASCIIEncoding() => Encoding.ASCII;
|
||||
|
||||
/// <summary>
/// Makes a first guess at the encoding of a buffer, looking at a leading byte order mark
/// first and falling back to the heuristics of <see cref="TextEncodingDetect"/>.
/// </summary>
/// <param name="buffer">The bytes to inspect.</param>
/// <param name="count">The number of leading bytes of <paramref name="buffer"/> to consider.</param>
/// <returns>The guessed encoding, or <c>null</c> when the detector does not recognise the data.</returns>
private static Encoding GetInitialEncoding(byte[] buffer, int count)
{
    // UTF-8 BOM: EF BB BF
    if (count >= 3 && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
    {
        return Encoding.UTF8;
    }

    // UTF-16 big-endian BOM: FE FF. Encoding.Unicode is little-endian, so it must not be used here.
    if (count >= 2 && buffer[0] == 0xfe && buffer[1] == 0xff)
    {
        return Encoding.BigEndianUnicode;
    }

    // UTF-32 big-endian BOM: 00 00 FE FF. Encoding.UTF32 is little-endian, so build the BE variant.
    if (count >= 4 && buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
    {
        return new UTF32Encoding(true, true);
    }

    // UTF-7 signature: 2B 2F 76
    if (count >= 3 && buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
    {
        return Encoding.UTF7;
    }

    // No recognised BOM above: let the heuristic detector decide.
    switch (new TextEncodingDetect().DetectEncoding(buffer, count))
    {
        case TextEncodingDetect.CharacterEncoding.Ansi:
        case TextEncodingDetect.CharacterEncoding.Ascii:
            return Encoding.ASCII;

        // UTF-16 results map to the UTF-16 encodings of matching byte order (not to UTF-32).
        case TextEncodingDetect.CharacterEncoding.Utf16BeBom:
        case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom:
            return Encoding.BigEndianUnicode;

        case TextEncodingDetect.CharacterEncoding.Utf16LeBom:
        case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom:
            return Encoding.Unicode;

        case TextEncodingDetect.CharacterEncoding.Utf8Bom:
        case TextEncodingDetect.CharacterEncoding.Utf8Nobom:
            return Encoding.UTF8;

        default:
            return null;
    }
}
|
||||
|
||||
private bool _langDetectInitialized;
|
||||
/// <summary>
/// Works out the name of the character set a byte buffer is most likely encoded in.
/// </summary>
/// <param name="bytes">The bytes to analyse.</param>
/// <param name="count">The number of bytes of <paramref name="bytes"/> to analyse, starting at offset 0.</param>
/// <param name="language">Language code of the text, if known; may be null or blank.</param>
/// <param name="enableLanguageDetection">
/// Whether the language may be detected from the bytes when <paramref name="language"/> is blank.
/// </param>
/// <returns>A charset name, or <c>null</c> when neither a charset nor a language could be determined.</returns>
public string GetDetectedEncodingName(byte[] bytes, int count, string language, bool enableLanguageDetection)
{
    const int Start = 0;

    // A UTF-8 first guess settles the matter immediately.
    var initialGuess = GetInitialEncoding(bytes, count);
    if (initialGuess != null && initialGuess.Equals(Encoding.UTF8))
    {
        return "utf-8";
    }

    if (enableLanguageDetection && string.IsNullOrWhiteSpace(language))
    {
        // The language detector is initialized on first use only.
        if (!_langDetectInitialized)
        {
            _langDetectInitialized = true;
            LanguageDetector.Initialize(_json);
        }

        language = DetectLanguage(bytes, Start, count);

        if (!string.IsNullOrWhiteSpace(language))
        {
            _logger.LogDebug("Text language detected as {0}", language);
        }
    }

    var detectedCharset = DetectCharset(bytes, Start, count, language);
    bool hasCharset = !string.IsNullOrWhiteSpace(detectedCharset);

    if (hasCharset && string.Equals(detectedCharset, "utf-8", StringComparison.OrdinalIgnoreCase))
    {
        return "utf-8";
    }

    // A windows-1252 result is not returned directly; the language mapping below gets to decide instead.
    if (hasCharset && !string.Equals(detectedCharset, "windows-1252", StringComparison.OrdinalIgnoreCase))
    {
        return detectedCharset;
    }

    return string.IsNullOrWhiteSpace(language)
        ? null
        : GetFileCharacterSetFromLanguage(language);
}
|
||||
|
||||
/// <summary>
/// Tries to detect the language of the text in a byte range, decoding the bytes as UTF-8,
/// then ASCII, then UTF-16 (little-endian) until the language detector accepts one of them.
/// </summary>
/// <param name="bytes">The buffer holding the text.</param>
/// <param name="index">Offset of the first byte to decode.</param>
/// <param name="count">Number of bytes to decode.</param>
/// <returns>
/// The detector's result for the first decoding that does not raise
/// <see cref="NLangDetectException"/>, or <c>null</c> when all three attempts raise it.
/// </returns>
private string DetectLanguage(byte[] bytes, int index, int count)
{
    Encoding[] candidates = { Encoding.UTF8, Encoding.ASCII, Encoding.Unicode };

    foreach (var candidate in candidates)
    {
        try
        {
            return LanguageDetector.DetectLanguage(candidate.GetString(bytes, index, count));
        }
        catch (NLangDetectException ex)
        {
            // Only detector failures are swallowed; move on to the next decoding.
            _logger.LogDebug(ex, "LanguageDetector.DetectLanguage threw a NLangDetectException.");
        }
    }

    return null;
}
|
||||
|
||||
/// <summary>
/// Resolves a charset name to an <see cref="Encoding"/>, retrying with the hyphens
/// stripped from the name when the first lookup is rejected.
/// </summary>
/// <param name="charset">The charset name, e.g. <c>windows-1252</c>.</param>
/// <returns>The encoding registered under the given (or hyphen-less) name.</returns>
/// <exception cref="ArgumentNullException"><paramref name="charset"/> is null, empty or whitespace.</exception>
/// <exception cref="ArgumentException">Neither spelling of the name is a known encoding.</exception>
public Encoding GetEncodingFromCharset(string charset)
{
    if (string.IsNullOrWhiteSpace(charset))
    {
        throw new ArgumentNullException(nameof(charset));
    }

    _logger.LogDebug("Getting encoding object for character set: {0}", charset);

    Encoding resolved;
    try
    {
        resolved = Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        // Second attempt with the hyphen-less spelling; a failure here propagates to the caller.
        var withoutHyphens = charset.Replace("-", string.Empty);
        _logger.LogDebug("Getting encoding object for character set: {0}", withoutHyphens);

        resolved = Encoding.GetEncoding(withoutHyphens);
    }

    return resolved;
}
|
||||
|
||||
/// <summary>
/// Detects the charset of a byte buffer and returns the matching <see cref="Encoding"/>.
/// </summary>
/// <param name="bytes">The bytes to analyse.</param>
/// <param name="size">The number of bytes of <paramref name="bytes"/> to analyse.</param>
/// <param name="language">Language code of the text, if known; may be null or blank.</param>
/// <param name="enableLanguageDetection">Whether the language may be detected from the bytes.</param>
/// <returns>The encoding for the detected charset name.</returns>
/// <exception cref="ArgumentNullException">No charset name could be detected.</exception>
public Encoding GetDetectedEncoding(byte[] bytes, int size, string language, bool enableLanguageDetection)
    => GetEncodingFromCharset(GetDetectedEncodingName(bytes, size, language, enableLanguageDetection));
|
||||
|
||||
/// <summary>
/// Maps a three-letter language code to the name of the legacy code page assumed for
/// text in that language.
/// </summary>
/// <param name="language">The language code; matched case-insensitively. Must not be null.</param>
/// <returns>The charset name for the language, or <c>windows-1252</c> for any code not listed.</returns>
private static string GetFileCharacterSetFromLanguage(string language)
{
    // https://developer.xamarin.com/api/type/System.Text.Encoding/

    // Lower-case with the invariant culture: a culture-sensitive ToLower() turns "I" into a
    // dotless "ı" under Turkish cultures, so codes such as "VIE" would miss their case label.
    switch (language.ToLowerInvariant())
    {
        case "tha":
            return "windows-874";

        // NOTE(review): Hungarian is usually associated with windows-1250; the existing
        // mapping is kept as-is here - confirm whether windows-1252 is intentional.
        case "hun":
            return "windows-1252";

        case "pol":
        case "cze":
        case "ces":
        case "slo":
        case "srp":
        case "hrv":
        case "rum":
        case "ron":
        case "rom":
        case "rup":
        // Albanian
        case "alb":
        case "sqi":
        // Slovak ("slk") and Slovenian ("slv")
        case "slk":
        case "slv":
            return "windows-1250";

        case "ara":
            return "windows-1256";

        case "heb":
            return "windows-1255";

        // Greek (ancient and modern)
        case "grc":
        case "gre":
        case "ell":
            return "windows-1253";

        case "crh":
        case "ota":
        case "tur":
            return "windows-1254";

        // Bulgarian and Russian
        case "bul":
        case "bgr":
        case "rus":
            return "windows-1251";

        case "vie":
            return "windows-1258";

        case "kor":
            return "cp949";

        default:
            return "windows-1252";
    }
}
|
||||
|
||||
/// <summary>
/// Runs the charset detector over a byte range and returns the charset it reports.
/// </summary>
/// <param name="bytes">The buffer holding the text.</param>
/// <param name="index">Offset of the first byte to feed to the detector.</param>
/// <param name="count">Number of bytes to feed to the detector.</param>
/// <param name="language">Language code of the text, if known; may be null or blank.</param>
/// <returns>
/// The detector's charset, or <c>null</c> when it reports <c>x-mac-cyrillic</c> while a
/// language is known.
/// </returns>
private static string DetectCharset(byte[] bytes, int index, int count, string language)
{
    var charsetDetector = new CharsetDetector();
    charsetDetector.Feed(bytes, index, count);
    charsetDetector.DataEnd();

    var detected = charsetDetector.Charset;

    // x-mac-cyrillic is often detected incorrectly. When a language is available, report
    // nothing so the caller can use other techniques instead.
    bool isMacCyrillic = string.Equals("x-mac-cyrillic", detected, StringComparison.OrdinalIgnoreCase);
    bool languageKnown = !string.IsNullOrWhiteSpace(language);

    return isMacCyrillic && languageKnown ? null : detected;
}
|
||||
}
|
||||
}
|
|
@ -1,406 +0,0 @@
|
|||
namespace Emby.Server.Implementations.TextEncoding
|
||||
{
|
||||
// Copyright 2015-2016 Jonathan Bennett <jon@autoitscript.com>
|
||||
//
|
||||
// https://www.autoitscript.com
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
/// <summary>
|
||||
/// Credit: https://github.com/AutoIt/text-encoding-detect
|
||||
/// </summary>
|
||||
public class TextEncodingDetect
|
||||
{
|
||||
// Byte order mark of UTF-16 big-endian text.
private readonly byte[] _utf16BeBom = new byte[] { 0xFE, 0xFF };

// Byte order mark of UTF-16 little-endian text.
private readonly byte[] _utf16LeBom = new byte[] { 0xFF, 0xFE };

// Byte order mark of UTF-8 text.
private readonly byte[] _utf8Bom = new byte[] { 0xEF, 0xBB, 0xBF };

// When true, a null byte in a buffer is taken as a sign of binary data.
private bool _nullSuggestsBinary = true;

// Percentages (0-100) used by the null-distribution UTF-16 check.
private double _utf16ExpectedNullPercent = 70;
private double _utf16UnexpectedNullPercent = 10;
|
||||
|
||||
/// <summary>
/// The encodings the detector can report.
/// </summary>
public enum CharacterEncoding
{
    /// <summary>Unknown or binary.</summary>
    None,

    /// <summary>Bytes in the range 0-255.</summary>
    Ansi,

    /// <summary>Bytes in the range 0-127.</summary>
    Ascii,

    /// <summary>UTF-8 with BOM.</summary>
    Utf8Bom,

    /// <summary>UTF-8 without BOM.</summary>
    Utf8Nobom,

    /// <summary>UTF-16 little-endian with BOM.</summary>
    Utf16LeBom,

    /// <summary>UTF-16 little-endian without BOM.</summary>
    Utf16LeNoBom,

    /// <summary>UTF-16 big-endian with BOM.</summary>
    Utf16BeBom,

    /// <summary>UTF-16 big-endian without BOM.</summary>
    Utf16BeNoBom
}
|
||||
|
||||
/// <summary>
/// Sets a value indicating whether a null byte found in a buffer is taken to mean
/// that the buffer holds binary data rather than text.
/// </summary>
public bool NullSuggestsBinary
{
    set
    {
        _nullSuggestsBinary = value;
    }
}
|
||||
|
||||
/// <summary>
/// Sets the percentage of null bytes that must be exceeded on one byte parity for a buffer
/// to be considered UTF-16 by the null-distribution check. Only values strictly between
/// 0 and 100 are accepted; any other value leaves the current setting unchanged.
/// </summary>
public double Utf16ExpectedNullPercent
{
    set
    {
        // Written as a positive range test so that NaN is rejected as well.
        _utf16ExpectedNullPercent = (value > 0 && value < 100)
            ? value
            : _utf16ExpectedNullPercent;
    }
}
|
||||
|
||||
/// <summary>
/// Sets the percentage of null bytes that the opposite byte parity must stay below for a
/// buffer to be considered UTF-16 by the null-distribution check. Only values strictly
/// between 0 and 100 are accepted; any other value leaves the current setting unchanged.
/// </summary>
public double Utf16UnexpectedNullPercent
{
    set
    {
        // Written as a positive range test so that NaN is rejected as well.
        _utf16UnexpectedNullPercent = (value > 0 && value < 100)
            ? value
            : _utf16UnexpectedNullPercent;
    }
}
|
||||
|
||||
/// <summary>
/// Gets the length in bytes of the byte order mark that belongs to an encoding mode.
/// </summary>
/// <param name="encoding">The encoding mode to look up.</param>
/// <returns>2 for the UTF-16 modes with BOM, 3 for UTF-8 with BOM, and 0 for every other mode.</returns>
public static int GetBomLengthFromEncodingMode(CharacterEncoding encoding)
{
    if (encoding == CharacterEncoding.Utf8Bom)
    {
        return 3;
    }

    bool isUtf16WithBom = encoding == CharacterEncoding.Utf16BeBom
        || encoding == CharacterEncoding.Utf16LeBom;

    return isUtf16WithBom ? 2 : 0;
}
|
||||
|
||||
/// <summary>
/// Looks for a UTF-16 (little- or big-endian) or UTF-8 byte order mark at the start of a buffer.
/// </summary>
/// <param name="buffer">The byte buffer.</param>
/// <param name="size">The number of valid bytes in <paramref name="buffer"/>.</param>
/// <returns>The BOM-carrying encoding that was found, or <see cref="CharacterEncoding.None"/> if there is no BOM.</returns>
public CharacterEncoding CheckBom(byte[] buffer, int size)
{
    // True when the buffer holds at least bom.Length bytes and starts with them.
    // Bytes are compared left to right and the comparison stops at the first mismatch.
    bool StartsWith(byte[] bom)
    {
        if (size < bom.Length)
        {
            return false;
        }

        for (int i = 0; i < bom.Length; i++)
        {
            if (buffer[i] != bom[i])
            {
                return false;
            }
        }

        return true;
    }

    if (StartsWith(_utf16LeBom))
    {
        return CharacterEncoding.Utf16LeBom;
    }

    if (StartsWith(_utf16BeBom))
    {
        return CharacterEncoding.Utf16BeBom;
    }

    return StartsWith(_utf8Bom) ? CharacterEncoding.Utf8Bom : CharacterEncoding.None;
}
|
||||
|
||||
/// <summary>
/// Detects the encoding of a byte buffer by running the individual checks in a fixed order:
/// BOM, UTF-8 validity, UTF-16 newline characters, UTF-16 null distribution.
/// </summary>
/// <param name="buffer">The byte buffer.</param>
/// <param name="size">The number of valid bytes in <paramref name="buffer"/>.</param>
/// <returns>
/// The result of the first check that recognises the data. When none does, the result is
/// <see cref="CharacterEncoding.Ansi"/>, unless the buffer contains a null byte and nulls are
/// configured to suggest binary data, in which case it is <see cref="CharacterEncoding.None"/>.
/// </returns>
public CharacterEncoding DetectEncoding(byte[] buffer, int size)
{
    // Each later check only runs while the earlier ones came back empty-handed.
    CharacterEncoding detected = CheckBom(buffer, size);

    if (detected == CharacterEncoding.None)
    {
        detected = CheckUtf8(buffer, size);
    }

    if (detected == CharacterEncoding.None)
    {
        detected = CheckUtf16NewlineChars(buffer, size);
    }

    if (detected == CharacterEncoding.None)
    {
        detected = CheckUtf16Ascii(buffer, size);
    }

    if (detected != CharacterEncoding.None)
    {
        return detected;
    }

    // ANSI or None (binary): a null byte counts as binary only when so configured.
    bool hasNulls = DoesContainNulls(buffer, size);

    return hasNulls && _nullSuggestsBinary
        ? CharacterEncoding.None
        : CharacterEncoding.Ansi;
}
|
||||
|
||||
/// <summary>
/// Checks whether a buffer looks like UTF-16 by scanning its byte pairs for CR/LF
/// characters, which are expected to occur even in non-English text.
/// </summary>
/// <param name="buffer">The byte buffer.</param>
/// <param name="size">The number of valid bytes in <paramref name="buffer"/>.</param>
/// <returns>
/// <see cref="CharacterEncoding.Utf16LeNoBom"/> or <see cref="CharacterEncoding.Utf16BeNoBom"/>
/// when newlines of exactly one byte order were found; otherwise <see cref="CharacterEncoding.None"/>.
/// </returns>
private static CharacterEncoding CheckUtf16NewlineChars(byte[] buffer, int size)
{
    if (size < 2)
    {
        return CharacterEncoding.None;
    }

    int littleEndianNewlines = 0;
    int bigEndianNewlines = 0;

    // "i + 1 < size" keeps both bytes of every pair in range; a trailing odd byte is ignored.
    for (int i = 0; i + 1 < size; i += 2)
    {
        byte first = buffer[i];
        byte second = buffer[i + 1];

        if (first == 0)
        {
            // 00 0A / 00 0D: newline with the high byte first.
            if (second == 0x0a || second == 0x0d)
            {
                bigEndianNewlines++;
            }
        }
        else if (second == 0 && (first == 0x0a || first == 0x0d))
        {
            // 0A 00 / 0D 00: newline with the low byte first.
            littleEndianNewlines++;
        }

        // Newlines of both byte orders in one buffer: this is not UTF-16.
        if (littleEndianNewlines > 0 && bigEndianNewlines > 0)
        {
            return CharacterEncoding.None;
        }
    }

    if (littleEndianNewlines > 0)
    {
        return CharacterEncoding.Utf16LeNoBom;
    }

    if (bigEndianNewlines > 0)
    {
        return CharacterEncoding.Utf16BeNoBom;
    }

    return CharacterEncoding.None;
}
|
||||
|
||||
/// <summary>
/// Checks whether any of the first <paramref name="size"/> bytes of a buffer is a null byte.
/// Used to tell binary data from text.
/// </summary>
/// <param name="buffer">The byte buffer.</param>
/// <param name="size">The number of leading bytes to inspect.</param>
/// <returns><c>true</c> if a null byte was found; otherwise <c>false</c>.</returns>
private static bool DoesContainNulls(byte[] buffer, int size)
{
    for (int i = 0; i < size; i++)
    {
        if (buffer[i] == 0)
        {
            return true;
        }
    }

    return false;
}
|
||||
|
||||
/// <summary>
/// Checks whether a buffer looks like UTF-16 text based on how its null bytes are spread over
/// even and odd positions, which is a useful signal for ASCII/script-like text.
/// </summary>
/// <param name="buffer">The byte buffer.</param>
/// <param name="size">The number of valid bytes in <paramref name="buffer"/>.</param>
/// <returns>
/// <see cref="CharacterEncoding.Utf16LeNoBom"/>, <see cref="CharacterEncoding.Utf16BeNoBom"/>,
/// or <see cref="CharacterEncoding.None"/> when the distribution is inconclusive.
/// </returns>
private CharacterEncoding CheckUtf16Ascii(byte[] buffer, int size)
{
    int nullsAtEven = 0;
    int nullsAtOdd = 0;

    // Count the null bytes, split by the parity of their position.
    for (int i = 0; i < size; i++)
    {
        if (buffer[i] != 0)
        {
            continue;
        }

        if ((i & 1) == 0)
        {
            nullsAtEven++;
        }
        else
        {
            nullsAtOdd++;
        }
    }

    // Each parity covers half of the positions, hence the factor of two.
    double evenRatio = nullsAtEven * 2.0 / size;
    double oddRatio = nullsAtOdd * 2.0 / size;
    double expectedRatio = _utf16ExpectedNullPercent / 100.0;
    double unexpectedRatio = _utf16UnexpectedNullPercent / 100.0;

    // Lots of odd nulls, low number of even nulls.
    if (evenRatio < unexpectedRatio && oddRatio > expectedRatio)
    {
        return CharacterEncoding.Utf16LeNoBom;
    }

    // Lots of even nulls, low number of odd nulls.
    if (oddRatio < unexpectedRatio && evenRatio > expectedRatio)
    {
        return CharacterEncoding.Utf16BeNoBom;
    }

    // Don't know.
    return CharacterEncoding.None;
}
|
||||
|
||||
/// <summary>
/// Checks if a buffer contains valid utf8.
/// </summary>
/// <param name="buffer">The byte buffer.</param>
/// <param name="size">The number of bytes of <paramref name="buffer"/> to examine.</param>
/// <returns>
/// <see cref="CharacterEncoding.None"/> (invalid UTF-8, or a zero byte while nulls are treated as binary),
/// <see cref="CharacterEncoding.Utf8Nobom"/> (at least one multi-byte lead byte was seen and every
/// sequence examined was well formed) or <see cref="CharacterEncoding.Ascii"/> (all data in the 0-127 range).
/// </returns>
private CharacterEncoding CheckUtf8(byte[] buffer, int size)
{
    // UTF8 Valid sequences
    // 0xxxxxxx  ASCII
    // 110xxxxx 10xxxxxx  2-byte
    // 1110xxxx 10xxxxxx 10xxxxxx  3-byte
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  4-byte
    //
    // Width in UTF8
    // Decimal      Width
    // 0-127        1 byte
    // 194-223      2 bytes
    // 224-239      3 bytes
    // 240-244      4 bytes
    //
    // Subsequent chars are in the range 128-191
    var onlySawAsciiRange = true;
    uint pos = 0;

    while (pos < size)
    {
        byte ch = buffer[pos++];

        if (ch == 0 && _nullSuggestsBinary)
        {
            return CharacterEncoding.None;
        }

        int moreChars;
        if (ch <= 127)
        {
            // 1 byte
            moreChars = 0;
        }
        else if (ch >= 194 && ch <= 223)
        {
            // 2 Byte
            moreChars = 1;
        }
        else if (ch >= 224 && ch <= 239)
        {
            // 3 Byte
            moreChars = 2;
        }
        else if (ch >= 240 && ch <= 244)
        {
            // 4 Byte
            moreChars = 3;
        }
        else
        {
            return CharacterEncoding.None; // Not utf8
        }

        if (moreChars > 0)
        {
            // A lead byte is itself outside 0-127. Record that here rather than in the
            // continuation loop, so a sequence cut off by the end of the buffer
            // is not mistaken for pure ASCII.
            onlySawAsciiRange = false;
        }

        // Check secondary chars are in range if we are expecting any.
        // A sequence truncated by the end of the buffer is tolerated.
        while (moreChars > 0 && pos < size)
        {
            ch = buffer[pos++];
            if (ch < 128 || ch > 191)
            {
                return CharacterEncoding.None; // Not utf8
            }

            --moreChars;
        }
    }

    // If we get to here then only valid UTF-8 sequences have been processed

    // If we only saw chars in the range 0-127 then we can't assume UTF8 (the caller will need to decide)
    return onlySawAsciiRange ? CharacterEncoding.Ascii : CharacterEncoding.Utf8Nobom;
}
|
||||
}
|
||||
}
|
|
@ -1,121 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System.IO;
|
||||
|
||||
namespace UniversalDetector
|
||||
{
|
||||
/// <summary>
/// Default implementation of the charset detection interface.
/// The detector can be fed from a <see cref="System.IO.Stream"/>:
/// <example>
/// <code>
/// using (FileStream fs = File.OpenRead(filename))
/// {
///     CharsetDetector cdet = new CharsetDetector();
///     cdet.Feed(fs);
///     cdet.DataEnd();
///     Console.WriteLine("{0}, {1}", cdet.Charset, cdet.Confidence);
/// }
/// </code>
/// </example>
///
/// or from a byte array:
///
/// <example>
/// <code>
/// byte[] buff = new byte[1024];
/// int read;
/// while ((read = stream.Read(buff, 0, buff.Length)) &gt; 0 &amp;&amp; !cdet.IsDone())
/// {
///     cdet.Feed(buff, 0, read);
/// }
/// cdet.DataEnd();
/// Console.WriteLine("{0}, {1}", cdet.Charset, cdet.Confidence);
/// </code>
/// </example>
/// </summary>
public class CharsetDetector : Core.UniversalDetector, ICharsetDetector
{
    // Last values handed to Report; cleared again by Reset.
    private string charset;
    private float confidence;

    /// <summary>
    /// Creates a detector, passing <c>FILTER_ALL</c> to the base detector.
    /// </summary>
    public CharsetDetector()
        : base(FILTER_ALL)
    {
    }

    /// <summary>
    /// The charset name most recently reported, or <c>null</c> if none has been reported since the last reset.
    /// </summary>
    public string Charset
    {
        get { return charset; }
    }

    /// <summary>
    /// The confidence most recently reported, or <c>0</c> if none has been reported since the last reset.
    /// </summary>
    public float Confidence
    {
        get { return confidence; }
    }

    /// <summary>
    /// Reads <paramref name="stream"/> in 1024-byte chunks and feeds each chunk to the detector.
    /// Stops at the end of the stream or once the detector is done.
    /// </summary>
    /// <param name="stream">The stream to read from.</param>
    public void Feed(Stream stream)
    {
        var chunk = new byte[1024];

        while (true)
        {
            int count = stream.Read(chunk, 0, chunk.Length);

            // The read happens before the done check, exactly one chunk at a time.
            if (count <= 0 || done)
            {
                break;
            }

            Feed(chunk, 0, count);
        }
    }

    /// <summary>
    /// Returns the detector's <c>done</c> flag.
    /// </summary>
    public bool IsDone()
    {
        return done;
    }

    /// <summary>
    /// Clears the stored charset and confidence, then resets the base detector.
    /// </summary>
    public override void Reset()
    {
        charset = null;
        confidence = 0.0f;
        base.Reset();
    }

    /// <summary>
    /// Stores the reported result so it can be read through <see cref="Charset"/> and <see cref="Confidence"/>.
    /// </summary>
    /// <param name="charset">The reported charset name.</param>
    /// <param name="confidence">The reported confidence.</param>
    protected override void Report(string charset, float confidence)
    {
        this.charset = charset;
        this.confidence = confidence;
    }
}
|
||||
|
||||
//public delegate void DetectorFinished(string charset, float confidence);
|
||||
|
||||
}
|
||||
|
|
@ -1,113 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// Prober that runs incoming bytes through the Big5 coding state machine and a
/// Big5 character distribution analyser.
/// </summary>
public class Big5Prober : CharsetProber
{
    private readonly CodingStateMachine codingSM;
    private readonly BIG5DistributionAnalyser distributionAnalyser;

    // lastChar[0] carries the final byte of the previous chunk, so a character that
    // straddles two HandleData calls can still be handed to the analyser in one piece.
    private readonly byte[] lastChar = new byte[2];

    public Big5Prober()
    {
        this.codingSM = new CodingStateMachine(new BIG5SMModel());
        this.distributionAnalyser = new BIG5DistributionAnalyser();
        this.Reset();
    }

    /// <summary>
    /// Feeds one chunk of data to the prober.
    /// </summary>
    /// <param name="buf">A buffer.</param>
    /// <param name="offset">Offset of the first byte to examine.</param>
    /// <param name="len">Number of bytes to examine; an empty chunk leaves the carried-over byte untouched.</param>
    /// <returns>The prober state after the chunk has been processed.</returns>
    public override ProbingState HandleData(byte[] buf, int offset, int len)
    {
        int max = offset + len;

        for (int i = offset; i < max; i++)
        {
            int codingState = codingSM.NextState(buf[i]);
            if (codingState == SMModel.ERROR)
            {
                state = ProbingState.NotMe;
                break;
            }

            if (codingState == SMModel.ITSME)
            {
                state = ProbingState.FoundIt;
                break;
            }

            if (codingState == SMModel.START)
            {
                int charLen = codingSM.CurrentCharLen;
                if (i == offset)
                {
                    // The character started in the previous chunk: pair the saved byte with this one.
                    lastChar[1] = buf[offset];
                    distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
                }
                else
                {
                    distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
                }
            }
        }

        // Only remember a trailing byte when the chunk has one. With len == 0 the
        // index max - 1 is either out of range or points at a byte outside the chunk.
        if (len > 0)
        {
            lastChar[0] = buf[max - 1];
        }

        if (state == ProbingState.Detecting
            && distributionAnalyser.GotEnoughData()
            && GetConfidence() > SHORTCUT_THRESHOLD)
        {
            state = ProbingState.FoundIt;
        }

        return state;
    }

    /// <summary>
    /// Resets the state machine, the probing state and the distribution analyser.
    /// </summary>
    public override void Reset()
    {
        codingSM.Reset();
        state = ProbingState.Detecting;
        distributionAnalyser.Reset();
    }

    public override string GetCharsetName()
    {
        return "Big-5";
    }

    /// <summary>
    /// Returns the confidence computed by the distribution analyser.
    /// </summary>
    public override float GetConfidence()
    {
        return distributionAnalyser.GetConfidence();
    }
}
|
||||
}
|
|
@ -1,98 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Kohei TAKETA <k-tak@void.in> (Java port)
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// Read-only view over an <c>int[]</c> in which each 32-bit element stores several
/// equally sized units (4, 8 or 16 bits wide), lowest unit in the lowest bits.
/// </summary>
public class BitPackage
{
    // Right-shift that turns a unit index into an array index.
    public static int INDEX_SHIFT_4BITS = 3;
    public static int INDEX_SHIFT_8BITS = 2;
    public static int INDEX_SHIFT_16BITS = 1;

    // Mask that extracts a unit's position inside its array element.
    public static int SHIFT_MASK_4BITS = 7;
    public static int SHIFT_MASK_8BITS = 3;
    public static int SHIFT_MASK_16BITS = 1;

    // Left-shift that turns a position inside an element into a bit offset.
    public static int BIT_SHIFT_4BITS = 2;
    public static int BIT_SHIFT_8BITS = 3;
    public static int BIT_SHIFT_16BITS = 4;

    // Mask that isolates a single unit once it has been shifted down.
    public static int UNIT_MASK_4BITS = 0x0000000F;
    public static int UNIT_MASK_8BITS = 0x000000FF;
    public static int UNIT_MASK_16BITS = 0x0000FFFF;

    private readonly int indexShift;
    private readonly int shiftMask;
    private readonly int bitShift;
    private readonly int unitMask;
    private readonly int[] data;

    /// <summary>
    /// Wraps packed data together with the shift and mask values that describe its unit width.
    /// </summary>
    /// <param name="indexShift">Right-shift applied to a unit index to get the array index.</param>
    /// <param name="shiftMask">Mask applied to a unit index to get its position inside the element.</param>
    /// <param name="bitShift">Left-shift applied to that position to get a bit offset.</param>
    /// <param name="unitMask">Mask that isolates one unit.</param>
    /// <param name="data">The packed elements.</param>
    public BitPackage(int indexShift, int shiftMask,
        int bitShift, int unitMask, int[] data)
    {
        this.data = data;
        this.unitMask = unitMask;
        this.bitShift = bitShift;
        this.shiftMask = shiftMask;
        this.indexShift = indexShift;
    }

    /// <summary>
    /// Packs two 16-bit units into one int, <paramref name="a"/> in the low half.
    /// </summary>
    public static int Pack16bits(int a, int b) => a | (b << 16);

    /// <summary>
    /// Packs four 8-bit units into one int, <paramref name="a"/> in the lowest byte.
    /// </summary>
    public static int Pack8bits(int a, int b, int c, int d)
    {
        int low = a | (b << 8);
        int high = c | (d << 8);
        return Pack16bits(low, high);
    }

    /// <summary>
    /// Packs eight 4-bit units into one int, <paramref name="a"/> in the lowest nibble.
    /// </summary>
    public static int Pack4bits(int a, int b, int c, int d,
        int e, int f, int g, int h)
    {
        int first = a | (b << 4);
        int second = c | (d << 4);
        int third = e | (f << 4);
        int fourth = g | (h << 4);
        return Pack8bits(first, second, third, fourth);
    }

    /// <summary>
    /// Returns the unit stored at unit index <paramref name="i"/>.
    /// </summary>
    public int Unpack(int i)
    {
        int element = data[i >> indexShift];
        int bitOffset = (i & shiftMask) << bitShift;
        return (element >> bitOffset) & unitMask;
    }
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -1,202 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
using System.IO;
|
||||
|
||||
namespace UniversalDetector.Core
{
    /// <summary>
    /// Answer a prober can give about the data it has seen so far.
    /// </summary>
    public enum ProbingState
    {
        /// <summary>No sure answer yet, but the caller can ask for confidence.</summary>
        Detecting = 0,

        /// <summary>Positive answer.</summary>
        FoundIt = 1,

        /// <summary>Negative answer.</summary>
        NotMe = 2
    }

    /// <summary>
    /// Base class for charset probers. Holds the current <see cref="ProbingState"/> and
    /// the buffer-filtering helpers used by the Latin1 and Group probers.
    /// </summary>
    public abstract class CharsetProber
    {
        protected const float SHORTCUT_THRESHOLD = 0.95F;

        protected ProbingState state;

        // ASCII codes
        private const byte SPACE = 0x20;
        private const byte CAPITAL_A = 0x41;
        private const byte CAPITAL_Z = 0x5A;
        private const byte SMALL_A = 0x61;
        private const byte SMALL_Z = 0x7A;
        private const byte LESS_THAN = 0x3C;
        private const byte GREATER_THAN = 0x3E;

        /// <summary>
        /// Feed data to the prober.
        /// </summary>
        /// <param name="buf">A buffer.</param>
        /// <param name="offset">Offset into the buffer.</param>
        /// <param name="len">Number of bytes available in the buffer.</param>
        /// <returns>A <see cref="ProbingState"/>.</returns>
        public abstract ProbingState HandleData(byte[] buf, int offset, int len);

        /// <summary>
        /// Reset prober state.
        /// </summary>
        public abstract void Reset();

        public abstract string GetCharsetName();

        public abstract float GetConfidence();

        /// <summary>
        /// Returns the current value of <see cref="state"/>.
        /// </summary>
        public virtual ProbingState GetState()
        {
            return state;
        }

        /// <summary>
        /// Does nothing in the base class.
        /// </summary>
        public virtual void SetOption()
        {
        }

        /// <summary>
        /// Does nothing in the base class.
        /// </summary>
        public virtual void DumpStatus()
        {
        }

        /// <summary>
        /// Copies only the segments that contain at least one byte with the high bit set.
        /// Segments are delimited by ASCII bytes that are not English letters; each kept
        /// segment that ended on a delimiter is followed by a single space.
        /// </summary>
        /// <param name="buf">Source buffer.</param>
        /// <param name="offset">Offset of the first byte to examine.</param>
        /// <param name="len">Number of bytes to examine.</param>
        /// <returns>filtered buffer</returns>
        protected static byte[] FilterWithoutEnglishLetters(byte[] buf, int offset, int len)
        {
            using (var output = new MemoryStream(buf.Length))
            {
                int end = offset + len;
                int segmentStart = offset;
                bool sawHighByte = false;

                for (int cur = offset; cur < end; cur++)
                {
                    byte current = buf[cur];

                    if ((current & 0x80) != 0)
                    {
                        sawHighByte = true;
                    }
                    else if (!IsEnglishLetter(current))
                    {
                        // Delimiter reached: keep the segment only if it held a high byte.
                        if (sawHighByte && cur > segmentStart)
                        {
                            output.Write(buf, segmentStart, cur - segmentStart);
                            output.WriteByte(SPACE);
                            sawHighByte = false;
                        }

                        segmentStart = cur + 1;
                    }
                }

                // Trailing segment that was not closed by a delimiter.
                if (sawHighByte && end > segmentStart)
                {
                    output.Write(buf, segmentStart, end - segmentStart);
                }

                return output.ToArray();
            }
        }

        /// <summary>
        /// Do filtering to reduce load to probers (remove ASCII symbols,
        /// collapse spaces). This filter applies to all scripts which contain
        /// both English characters and upper ASCII characters.
        /// </summary>
        /// <param name="buf">Source buffer.</param>
        /// <param name="offset">Offset of the first byte to examine.</param>
        /// <param name="len">Number of bytes to examine.</param>
        /// <returns>a filtered copy of the input buffer</returns>
        protected static byte[] FilterWithEnglishLetters(byte[] buf, int offset, int len)
        {
            using (var output = new MemoryStream(buf.Length))
            {
                int end = offset + len;
                int segmentStart = offset;
                bool insideTag = false;

                for (int cur = offset; cur < end; cur++)
                {
                    byte current = buf[cur];

                    switch (current)
                    {
                        case GREATER_THAN:
                            insideTag = false;
                            break;
                        case LESS_THAN:
                            insideTag = true;
                            break;
                    }

                    // It's ASCII, but it's not a letter.
                    bool isAsciiSymbol = (current & 0x80) == 0 && !IsEnglishLetter(current);
                    if (!isAsciiSymbol)
                    {
                        continue;
                    }

                    if (cur > segmentStart && !insideTag)
                    {
                        output.Write(buf, segmentStart, cur - segmentStart);
                        output.WriteByte(SPACE);
                    }

                    segmentStart = cur + 1;
                }

                // If the current segment contains more than just a symbol
                // and it is not inside a tag then keep it.
                if (!insideTag && end > segmentStart)
                {
                    output.Write(buf, segmentStart, end - segmentStart);
                }

                return output.ToArray();
            }
        }

        // True for the byte values of 'A'-'Z' and 'a'-'z'.
        private static bool IsEnglishLetter(byte value)
        {
            return (value >= CAPITAL_A && value <= CAPITAL_Z)
                || (value >= SMALL_A && value <= SMALL_Z);
        }
    }
}
|
||||
|
|
@ -1,149 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is mozilla.org code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// Charset name strings used by the detector.
/// </summary>
public static class Charsets
{
    // ---- ASCII and Unicode encodings ----

    public const string ASCII = "ASCII";

    public const string UTF8 = "UTF-8";

    public const string UTF16_LE = "UTF-16LE";

    public const string UTF16_BE = "UTF-16BE";

    public const string UTF32_BE = "UTF-32BE";

    public const string UTF32_LE = "UTF-32LE";

    /// <summary>
    /// Unusual BOM (3412 order)
    /// </summary>
    public const string UCS4_3412 = "X-ISO-10646-UCS-4-3412";

    /// <summary>
    /// Unusual BOM (2413 order)
    /// </summary>
    public const string UCS4_2413 = "X-ISO-10646-UCS-4-2413";

    // ---- Windows code pages ----

    /// <summary>
    /// Cyrillic (based on Bulgarian and Russian data)
    /// </summary>
    public const string WIN1251 = "windows-1251";

    /// <summary>
    /// Latin-1, almost identical to ISO-8859-1
    /// </summary>
    public const string WIN1252 = "windows-1252";

    /// <summary>
    /// Greek
    /// </summary>
    public const string WIN1253 = "windows-1253";

    /// <summary>
    /// Logical Hebrew (includes ISO-8859-8-I and most of x-mac-hebrew)
    /// </summary>
    public const string WIN1255 = "windows-1255";

    // ---- Multi-byte encodings ----

    /// <summary>
    /// Traditional Chinese
    /// </summary>
    public const string BIG5 = "Big-5";

    public const string EUCKR = "EUC-KR";

    public const string EUCJP = "EUC-JP";

    public const string EUCTW = "EUC-TW";

    /// <summary>
    /// Note: gb2312 is a subset of gb18030
    /// </summary>
    public const string GB18030 = "gb18030";

    public const string ISO2022_JP = "ISO-2022-JP";

    public const string ISO2022_CN = "ISO-2022-CN";

    public const string ISO2022_KR = "ISO-2022-KR";

    /// <summary>
    /// Simplified Chinese
    /// </summary>
    public const string HZ_GB_2312 = "HZ-GB-2312";

    public const string SHIFT_JIS = "Shift-JIS";

    // ---- Other single-byte encodings ----

    public const string MAC_CYRILLIC = "x-mac-cyrillic";

    public const string KOI8R = "KOI8-R";

    public const string IBM855 = "IBM855";

    public const string IBM866 = "IBM866";

    /// <summary>
    /// East-Europe. Disabled because too similar to windows-1252
    /// (latin-1). Should use tri-gram models to discriminate between
    /// these two charsets.
    /// </summary>
    public const string ISO8859_2 = "ISO-8859-2";

    /// <summary>
    /// Cyrillic
    /// </summary>
    public const string ISO8859_5 = "ISO-8859-5";

    /// <summary>
    /// Greek
    /// </summary>
    public const string ISO_8859_7 = "ISO-8859-7";

    /// <summary>
    /// Visual Hebrew
    /// </summary>
    public const string ISO8859_8 = "ISO-8859-8";

    /// <summary>
    /// Thai. This recognizer is not enabled yet.
    /// </summary>
    public const string TIS620 = "TIS620";
}
|
||||
}
|
|
@ -1,85 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is mozilla.org code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Kohei TAKETA <k-tak@void.in> (Java port)
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// Parallel state machine for the Coding Scheme Method. Steps through the
/// state table of an <see cref="SMModel"/> one byte at a time.
/// </summary>
public class CodingStateMachine
{
    private readonly SMModel model;
    private int currentState;
    private int currentCharLen;
    private int currentBytePos;

    /// <summary>
    /// Creates a machine over <paramref name="model"/>, positioned at the start state.
    /// </summary>
    /// <param name="model">The model supplying byte classes, character lengths and the state table.</param>
    public CodingStateMachine(SMModel model)
    {
        this.model = model;
        this.currentState = SMModel.START;
    }

    /// <summary>
    /// Length recorded for the current character; set when a byte is consumed in the start state.
    /// </summary>
    public int CurrentCharLen
    {
        get { return currentCharLen; }
    }

    /// <summary>
    /// Name of the underlying model.
    /// </summary>
    public string ModelName
    {
        get { return model.Name; }
    }

    /// <summary>
    /// Consumes one byte and advances the machine.
    /// </summary>
    /// <param name="b">The next input byte.</param>
    /// <returns>The state the machine is in after consuming <paramref name="b"/>.</returns>
    public int NextState(byte b)
    {
        // Every byte is first mapped to its class.
        int byteClass = model.GetClass(b);

        // A byte seen in the start state begins a character, so it also fixes the character length.
        bool startsCharacter = currentState == SMModel.START;
        if (startsCharacter)
        {
            currentBytePos = 0;
            currentCharLen = model.charLenTable[byteClass];
        }

        // The byte class and the current state select the next state from the state table.
        currentState = model.stateTable.Unpack(currentState * model.ClassFactor + byteClass);
        currentBytePos++;

        return currentState;
    }

    /// <summary>
    /// Returns the machine to the start state.
    /// </summary>
    public void Reset()
    {
        currentState = SMModel.START;
    }
}
|
||||
}
|
|
@ -1,117 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
{
    /// <summary>
    /// Charset prober that reports "EUC-JP". Input bytes are run through a
    /// <c>CodingStateMachine</c> built on <c>EUCJPSMModel</c>; each time the machine
    /// returns to its start state the bytes are handed to a context analyser and a
    /// distribution analyser, whose confidences drive the final verdict.
    /// </summary>
    public class EUCJPProber : CharsetProber
    {
        private readonly CodingStateMachine codingSM;
        private readonly EUCJPContextAnalyser contextAnalyser;
        private readonly EUCJPDistributionAnalyser distributionAnalyser;

        // lastChar[0] keeps the final byte of the previous chunk so that a character
        // split across two HandleData calls can still be passed to the analysers.
        private readonly byte[] lastChar = new byte[2];

        /// <summary>
        /// Creates the state machine and both analysers, then resets the prober.
        /// </summary>
        public EUCJPProber()
        {
            codingSM = new CodingStateMachine(new EUCJPSMModel());
            distributionAnalyser = new EUCJPDistributionAnalyser();
            contextAnalyser = new EUCJPContextAnalyser();
            Reset();
        }

        /// <summary>
        /// Returns the name of the charset this prober detects.
        /// </summary>
        /// <returns>Always "EUC-JP".</returns>
        public override string GetCharsetName()
        {
            return "EUC-JP";
        }

        /// <summary>
        /// Feeds one chunk of input to the prober.
        /// </summary>
        /// <param name="buf">Buffer holding the input bytes.</param>
        /// <param name="offset">Index of the first byte of the chunk in <paramref name="buf"/>.</param>
        /// <param name="len">Number of bytes in the chunk; a chunk with no bytes is accepted.</param>
        /// <returns>The probing state after the chunk has been processed.</returns>
        public override ProbingState HandleData(byte[] buf, int offset, int len)
        {
            int max = offset + len;

            for (int i = offset; i < max; i++)
            {
                int codingState = codingSM.NextState(buf[i]);

                if (codingState == SMModel.ERROR)
                {
                    state = ProbingState.NotMe;
                    break;
                }

                if (codingState == SMModel.ITSME)
                {
                    state = ProbingState.FoundIt;
                    break;
                }

                if (codingState == SMModel.START)
                {
                    int charLen = codingSM.CurrentCharLen;

                    if (i == offset)
                    {
                        // No earlier byte of this chunk exists, so pair the byte saved
                        // from the previous call (lastChar[0]) with the current one.
                        lastChar[1] = buf[offset];
                        contextAnalyser.HandleOneChar(lastChar, 0, charLen);
                        distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
                    }
                    else
                    {
                        // Read the character in place, starting one byte before i.
                        contextAnalyser.HandleOneChar(buf, i - 1, charLen);
                        distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
                    }
                }
            }

            // Only remember a trailing byte when the chunk actually has one; with an
            // empty chunk buf[max - 1] lies outside the window (index -1 at offset 0).
            if (len > 0)
            {
                lastChar[0] = buf[max - 1];
            }

            // Shortcut: stop probing early once the context analyser has seen enough
            // data and the confidence exceeds SHORTCUT_THRESHOLD.
            if (state == ProbingState.Detecting
                && contextAnalyser.GotEnoughData()
                && GetConfidence() > SHORTCUT_THRESHOLD)
            {
                state = ProbingState.FoundIt;
            }

            return state;
        }

        /// <summary>
        /// Puts the state machine, the probing state and both analysers back to their
        /// initial condition.
        /// </summary>
        public override void Reset()
        {
            codingSM.Reset();
            state = ProbingState.Detecting;
            contextAnalyser.Reset();
            distributionAnalyser.Reset();
        }

        /// <summary>
        /// Returns the larger of the context analyser's and the distribution
        /// analyser's confidence.
        /// </summary>
        public override float GetConfidence()
        {
            float contxtCf = contextAnalyser.GetConfidence();
            float distribCf = distributionAnalyser.GetConfidence();
            return contxtCf > distribCf ? contxtCf : distribCf;
        }
    }
}
|
|
@ -1,114 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
{
    /// <summary>
    /// Charset prober that reports "EUC-KR". Input bytes are run through a
    /// <c>CodingStateMachine</c> built on <c>EUCKRSMModel</c>; each time the machine
    /// returns to its start state the bytes are handed to a distribution analyser,
    /// whose confidence is the prober's confidence.
    /// </summary>
    public class EUCKRProber : CharsetProber
    {
        private readonly CodingStateMachine codingSM;
        private readonly EUCKRDistributionAnalyser distributionAnalyser;

        // lastChar[0] keeps the final byte of the previous chunk so that a character
        // split across two HandleData calls can still be passed to the analyser.
        private readonly byte[] lastChar = new byte[2];

        /// <summary>
        /// Creates the state machine and the analyser, then resets the prober.
        /// </summary>
        public EUCKRProber()
        {
            codingSM = new CodingStateMachine(new EUCKRSMModel());
            distributionAnalyser = new EUCKRDistributionAnalyser();
            Reset();
        }

        /// <summary>
        /// Returns the name of the charset this prober detects.
        /// </summary>
        /// <returns>Always "EUC-KR".</returns>
        public override string GetCharsetName()
        {
            return "EUC-KR";
        }

        /// <summary>
        /// Feeds one chunk of input to the prober.
        /// </summary>
        /// <param name="buf">Buffer holding the input bytes.</param>
        /// <param name="offset">Index of the first byte of the chunk in <paramref name="buf"/>.</param>
        /// <param name="len">Number of bytes in the chunk; a chunk with no bytes is accepted.</param>
        /// <returns>The probing state after the chunk has been processed.</returns>
        public override ProbingState HandleData(byte[] buf, int offset, int len)
        {
            int max = offset + len;

            for (int i = offset; i < max; i++)
            {
                int codingState = codingSM.NextState(buf[i]);

                if (codingState == SMModel.ERROR)
                {
                    state = ProbingState.NotMe;
                    break;
                }

                if (codingState == SMModel.ITSME)
                {
                    state = ProbingState.FoundIt;
                    break;
                }

                if (codingState == SMModel.START)
                {
                    int charLen = codingSM.CurrentCharLen;

                    if (i == offset)
                    {
                        // No earlier byte of this chunk exists, so pair the byte saved
                        // from the previous call (lastChar[0]) with the current one.
                        lastChar[1] = buf[offset];
                        distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
                    }
                    else
                    {
                        // Read the character in place, starting one byte before i.
                        distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
                    }
                }
            }

            // Only remember a trailing byte when the chunk actually has one; with an
            // empty chunk buf[max - 1] lies outside the window (index -1 at offset 0).
            if (len > 0)
            {
                lastChar[0] = buf[max - 1];
            }

            // Shortcut: stop probing early once the analyser has seen enough data and
            // the confidence exceeds SHORTCUT_THRESHOLD.
            if (state == ProbingState.Detecting
                && distributionAnalyser.GotEnoughData()
                && GetConfidence() > SHORTCUT_THRESHOLD)
            {
                state = ProbingState.FoundIt;
            }

            return state;
        }

        /// <summary>
        /// Returns the distribution analyser's confidence.
        /// </summary>
        public override float GetConfidence()
        {
            return distributionAnalyser.GetConfidence();
        }

        /// <summary>
        /// Puts the state machine, the probing state and the analyser back to their
        /// initial condition.
        /// </summary>
        public override void Reset()
        {
            codingSM.Reset();
            state = ProbingState.Detecting;
            distributionAnalyser.Reset();
        }
    }
}
|
|
@ -1,113 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
{
    /// <summary>
    /// Charset prober that reports "x-euc-tw". Input bytes are run through a
    /// <c>CodingStateMachine</c> built on <c>EUCTWSMModel</c>; each time the machine
    /// returns to its start state the bytes are handed to a distribution analyser,
    /// whose confidence is the prober's confidence.
    /// </summary>
    public class EUCTWProber : CharsetProber
    {
        private readonly CodingStateMachine codingSM;
        private readonly EUCTWDistributionAnalyser distributionAnalyser;

        // lastChar[0] keeps the final byte of the previous chunk so that a character
        // split across two HandleData calls can still be passed to the analyser.
        private readonly byte[] lastChar = new byte[2];

        /// <summary>
        /// Creates the state machine and the analyser, then resets the prober.
        /// </summary>
        public EUCTWProber()
        {
            codingSM = new CodingStateMachine(new EUCTWSMModel());
            distributionAnalyser = new EUCTWDistributionAnalyser();
            Reset();
        }

        /// <summary>
        /// Feeds one chunk of input to the prober.
        /// </summary>
        /// <param name="buf">Buffer holding the input bytes.</param>
        /// <param name="offset">Index of the first byte of the chunk in <paramref name="buf"/>.</param>
        /// <param name="len">Number of bytes in the chunk; a chunk with no bytes is accepted.</param>
        /// <returns>The probing state after the chunk has been processed.</returns>
        public override ProbingState HandleData(byte[] buf, int offset, int len)
        {
            int max = offset + len;

            // Start at offset, not 0: bytes in front of the window do not belong to
            // this chunk and must not be pushed through the state machine.
            for (int i = offset; i < max; i++)
            {
                int codingState = codingSM.NextState(buf[i]);

                if (codingState == SMModel.ERROR)
                {
                    state = ProbingState.NotMe;
                    break;
                }

                if (codingState == SMModel.ITSME)
                {
                    state = ProbingState.FoundIt;
                    break;
                }

                if (codingState == SMModel.START)
                {
                    int charLen = codingSM.CurrentCharLen;

                    if (i == offset)
                    {
                        // No earlier byte of this chunk exists, so pair the byte saved
                        // from the previous call (lastChar[0]) with the current one.
                        lastChar[1] = buf[offset];
                        distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
                    }
                    else
                    {
                        // Read the character in place, starting one byte before i.
                        distributionAnalyser.HandleOneChar(buf, i - 1, charLen);
                    }
                }
            }

            // Only remember a trailing byte when the chunk actually has one; with an
            // empty chunk buf[max - 1] lies outside the window (index -1 at offset 0).
            if (len > 0)
            {
                lastChar[0] = buf[max - 1];
            }

            // Shortcut: stop probing early once the analyser has seen enough data and
            // the confidence exceeds SHORTCUT_THRESHOLD.
            if (state == ProbingState.Detecting
                && distributionAnalyser.GotEnoughData()
                && GetConfidence() > SHORTCUT_THRESHOLD)
            {
                state = ProbingState.FoundIt;
            }

            return state;
        }

        /// <summary>
        /// Returns the name of the charset this prober detects.
        /// </summary>
        /// <returns>Always "x-euc-tw".</returns>
        public override string GetCharsetName()
        {
            return "x-euc-tw";
        }

        /// <summary>
        /// Puts the state machine, the probing state and the analyser back to their
        /// initial condition.
        /// </summary>
        public override void Reset()
        {
            codingSM.Reset();
            state = ProbingState.Detecting;
            distributionAnalyser.Reset();
        }

        /// <summary>
        /// Returns the distribution analyser's confidence.
        /// </summary>
        public override float GetConfidence()
        {
            return distributionAnalyser.GetConfidence();
        }
    }
}
|
|
@ -1,113 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
{
    /// <summary>
    /// Prober for the escape-sequence based charsets. It drives four coding state
    /// machines (HZ, ISO-2022-CN, ISO-2022-JP, ISO-2022-KR) over the same input and
    /// reports the model name of the first machine that answers ITSME.
    /// </summary>
    public class EscCharsetProber : CharsetProber
    {
        private const int MachineCount = 4;

        // Machines at indexes [0, liveMachines) are still candidates; machines that
        // have reported ERROR are swapped behind that boundary.
        private readonly CodingStateMachine[] machines;
        private int liveMachines;

        // Name of the matched model, or null while nothing has matched.
        private string matchedCharset;

        /// <summary>
        /// Builds one state machine per supported model and resets the prober.
        /// </summary>
        public EscCharsetProber()
        {
            machines = new CodingStateMachine[MachineCount]
            {
                new CodingStateMachine(new HZSMModel()),
                new CodingStateMachine(new ISO2022CNSMModel()),
                new CodingStateMachine(new ISO2022JPSMModel()),
                new CodingStateMachine(new ISO2022KRSMModel())
            };
            Reset();
        }

        /// <summary>
        /// Returns to the detecting state: every machine is reset and considered
        /// live again, and any previously matched charset name is cleared.
        /// </summary>
        public override void Reset()
        {
            state = ProbingState.Detecting;

            foreach (CodingStateMachine machine in machines)
            {
                machine.Reset();
            }

            liveMachines = MachineCount;
            matchedCharset = null;
        }

        /// <summary>
        /// Feeds one chunk of input to every live state machine, byte by byte.
        /// </summary>
        /// <param name="buf">Buffer holding the input bytes.</param>
        /// <param name="offset">Index of the first byte of the chunk in <paramref name="buf"/>.</param>
        /// <param name="len">Number of bytes in the chunk.</param>
        /// <returns>
        /// FoundIt as soon as one machine answers ITSME, NotMe once every machine has
        /// answered ERROR, otherwise the unchanged current state.
        /// </returns>
        public override ProbingState HandleData(byte[] buf, int offset, int len)
        {
            int end = offset + len;

            for (int pos = offset; pos < end && state == ProbingState.Detecting; pos++)
            {
                // Walk the live machines from the back so that a machine swapped into
                // the current slot has already consumed this byte.
                for (int slot = liveMachines - 1; slot >= 0; slot--)
                {
                    int verdict = machines[slot].NextState(buf[pos]);

                    if (verdict == SMModel.ERROR)
                    {
                        // This machine is out: shrink the live range.
                        liveMachines--;

                        if (liveMachines == 0)
                        {
                            state = ProbingState.NotMe;
                            return state;
                        }

                        if (slot != liveMachines)
                        {
                            SwapMachines(liveMachines, slot);
                        }

                        continue;
                    }

                    if (verdict == SMModel.ITSME)
                    {
                        state = ProbingState.FoundIt;
                        matchedCharset = machines[slot].ModelName;
                        return state;
                    }
                }
            }

            return state;
        }

        /// <summary>
        /// Returns the model name of the machine that matched, or null if no machine
        /// has matched since the last reset.
        /// </summary>
        public override string GetCharsetName()
        {
            return matchedCharset;
        }

        /// <summary>
        /// Returns the fixed confidence value of this prober.
        /// </summary>
        /// <returns>Always 0.99.</returns>
        public override float GetConfidence()
        {
            return 0.99f;
        }

        // Exchanges the machines stored at the two given indexes.
        private void SwapMachines(int first, int second)
        {
            CodingStateMachine held = machines[first];
            machines[first] = machines[second];
            machines[second] = held;
        }
    }
}
|
|
@ -1,304 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is mozilla.org code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Kohei TAKETA <k-tak@void.in> (Java port)
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
// Escaped charsets state machines.
// (Plain comment on purpose: XML documentation comments cannot be attached to a namespace.)
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// State machine model registered under the name "HZ-GB-2312".
/// </summary>
public class HZSMModel : SMModel
{
    // Byte-class table: eight 4-bit entries per packed int; the trailing comment on
    // each row is the range of byte values (hex) that row covers.
    private static readonly int[] ByteClasses =
    {
        BitPackage.Pack4bits(1, 0, 0, 0, 0, 0, 0, 0), // 00 - 07
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 08 - 0f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 10 - 17
        BitPackage.Pack4bits(0, 0, 0, 1, 0, 0, 0, 0), // 18 - 1f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 20 - 27
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 28 - 2f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 30 - 37
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 38 - 3f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 40 - 47
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 48 - 4f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 50 - 57
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 58 - 5f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 60 - 67
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 68 - 6f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 70 - 77
        BitPackage.Pack4bits(0, 0, 0, 4, 0, 5, 2, 0), // 78 - 7f
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 80 - 87
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 88 - 8f
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 90 - 97
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // 98 - 9f
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // a0 - a7
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // a8 - af
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // b0 - b7
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // b8 - bf
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // c0 - c7
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // c8 - cf
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // d0 - d7
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // d8 - df
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // e0 - e7
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // e8 - ef
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1), // f0 - f7
        BitPackage.Pack4bits(1, 1, 1, 1, 1, 1, 1, 1)  // f8 - ff
    };

    // State table, packed the same way; the trailing comment on each row is the
    // range of entry indexes (hex) that row covers.
    private static readonly int[] Transitions =
    {
        BitPackage.Pack4bits(START, ERROR,     3, START, START, START, ERROR, ERROR), // 00-07
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ITSME, ITSME, ITSME, ITSME), // 08-0f
        BitPackage.Pack4bits(ITSME, ITSME, ERROR, ERROR, START, START,     4, ERROR), // 10-17
        BitPackage.Pack4bits(    5, ERROR,     6, ERROR,     5,     5,     4, ERROR), // 18-1f
        BitPackage.Pack4bits(    4, ERROR,     4,     4,     4, ERROR,     4, ERROR), // 20-27
        BitPackage.Pack4bits(    4, ITSME, START, START, START, START, START, START)  // 28-2f
    };

    // Six entries, all zero.
    private static readonly int[] CharLengths = new int[6];

    /// <summary>
    /// Passes the packed class table, the packed state table, the char-length table
    /// and the model name to the base model.
    /// </summary>
    public HZSMModel()
        : base(ToBitPackage(ByteClasses), 6, ToBitPackage(Transitions), CharLengths, "HZ-GB-2312")
    {
    }

    // Wraps a table of packed ints in a BitPackage configured for 4-bit units.
    private static BitPackage ToBitPackage(int[] packed)
    {
        return new BitPackage(
            BitPackage.INDEX_SHIFT_4BITS,
            BitPackage.SHIFT_MASK_4BITS,
            BitPackage.BIT_SHIFT_4BITS,
            BitPackage.UNIT_MASK_4BITS,
            packed);
    }
}
|
||||
|
||||
/// <summary>
/// State machine model registered under the name "ISO-2022-CN".
/// </summary>
public class ISO2022CNSMModel : SMModel
{
    // Byte-class table: eight 4-bit entries per packed int; the trailing comment on
    // each row is the range of byte values (hex) that row covers.
    private static readonly int[] ByteClasses =
    {
        BitPackage.Pack4bits(2, 0, 0, 0, 0, 0, 0, 0), // 00 - 07
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 08 - 0f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 10 - 17
        BitPackage.Pack4bits(0, 0, 0, 1, 0, 0, 0, 0), // 18 - 1f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 20 - 27
        BitPackage.Pack4bits(0, 3, 0, 0, 0, 0, 0, 0), // 28 - 2f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 30 - 37
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 38 - 3f
        BitPackage.Pack4bits(0, 0, 0, 4, 0, 0, 0, 0), // 40 - 47
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 48 - 4f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 50 - 57
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 58 - 5f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 60 - 67
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 68 - 6f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 70 - 77
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 78 - 7f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 80 - 87
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 88 - 8f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 90 - 97
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 98 - 9f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a0 - a7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a8 - af
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c8 - cf
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e0 - e7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e8 - ef
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // f0 - f7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2)  // f8 - ff
    };

    // State table, packed the same way; the trailing comment on each row is the
    // range of entry indexes (hex) that row covers.
    private static readonly int[] Transitions =
    {
        BitPackage.Pack4bits(START,     3, ERROR, START, START, START, START, START), // 00-07
        BitPackage.Pack4bits(START, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR), // 08-0f
        BitPackage.Pack4bits(ERROR, ERROR, ITSME, ITSME, ITSME, ITSME, ITSME, ITSME), // 10-17
        BitPackage.Pack4bits(ITSME, ITSME, ITSME, ERROR, ERROR, ERROR,     4, ERROR), // 18-1f
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ITSME, ERROR, ERROR, ERROR, ERROR), // 20-27
        BitPackage.Pack4bits(    5,     6, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR), // 28-2f
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ITSME, ERROR, ERROR, ERROR, ERROR), // 30-37
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ERROR, ITSME, ERROR, START)  // 38-3f
    };

    // Nine entries, all zero.
    private static readonly int[] CharLengths = new int[9];

    /// <summary>
    /// Passes the packed class table, the packed state table, the char-length table
    /// and the model name to the base model.
    /// </summary>
    public ISO2022CNSMModel()
        : base(ToBitPackage(ByteClasses), 9, ToBitPackage(Transitions), CharLengths, "ISO-2022-CN")
    {
    }

    // Wraps a table of packed ints in a BitPackage configured for 4-bit units.
    private static BitPackage ToBitPackage(int[] packed)
    {
        return new BitPackage(
            BitPackage.INDEX_SHIFT_4BITS,
            BitPackage.SHIFT_MASK_4BITS,
            BitPackage.BIT_SHIFT_4BITS,
            BitPackage.UNIT_MASK_4BITS,
            packed);
    }
}
|
||||
|
||||
/// <summary>
/// State machine model registered under the name "ISO-2022-JP".
/// </summary>
public class ISO2022JPSMModel : SMModel
{
    // Byte-class table: eight 4-bit entries per packed int; the trailing comment on
    // each row is the range of byte values (hex) that row covers.
    private static readonly int[] ByteClasses =
    {
        BitPackage.Pack4bits(2, 0, 0, 0, 0, 0, 0, 0), // 00 - 07
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 2, 2), // 08 - 0f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 10 - 17
        BitPackage.Pack4bits(0, 0, 0, 1, 0, 0, 0, 0), // 18 - 1f
        BitPackage.Pack4bits(0, 0, 0, 0, 7, 0, 0, 0), // 20 - 27
        BitPackage.Pack4bits(3, 0, 0, 0, 0, 0, 0, 0), // 28 - 2f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 30 - 37
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 38 - 3f
        BitPackage.Pack4bits(6, 0, 4, 0, 8, 0, 0, 0), // 40 - 47
        BitPackage.Pack4bits(0, 9, 5, 0, 0, 0, 0, 0), // 48 - 4f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 50 - 57
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 58 - 5f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 60 - 67
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 68 - 6f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 70 - 77
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 78 - 7f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 80 - 87
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 88 - 8f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 90 - 97
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 98 - 9f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a0 - a7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a8 - af
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c8 - cf
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e0 - e7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e8 - ef
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // f0 - f7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2)  // f8 - ff
    };

    // State table, packed the same way; the trailing comment on each row is the
    // range of entry indexes (hex) that row covers.
    private static readonly int[] Transitions =
    {
        BitPackage.Pack4bits(START,     3, ERROR, START, START, START, START, START), // 00-07
        BitPackage.Pack4bits(START, START, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR), // 08-0f
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ITSME, ITSME, ITSME, ITSME), // 10-17
        BitPackage.Pack4bits(ITSME, ITSME, ITSME, ITSME, ITSME, ITSME, ERROR, ERROR), // 18-1f
        BitPackage.Pack4bits(ERROR,     5, ERROR, ERROR, ERROR,     4, ERROR, ERROR), // 20-27
        BitPackage.Pack4bits(ERROR, ERROR, ERROR,     6, ITSME, ERROR, ITSME, ERROR), // 28-2f
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ERROR, ERROR, ITSME, ITSME), // 30-37
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ITSME, ERROR, ERROR, ERROR, ERROR), // 38-3f
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ITSME, ERROR, START, START)  // 40-47
    };

    // Ten entries, all zero.
    private static readonly int[] CharLengths = new int[10];

    /// <summary>
    /// Passes the packed class table, the packed state table, the char-length table
    /// and the model name to the base model.
    /// </summary>
    public ISO2022JPSMModel()
        : base(ToBitPackage(ByteClasses), 10, ToBitPackage(Transitions), CharLengths, "ISO-2022-JP")
    {
    }

    // Wraps a table of packed ints in a BitPackage configured for 4-bit units.
    private static BitPackage ToBitPackage(int[] packed)
    {
        return new BitPackage(
            BitPackage.INDEX_SHIFT_4BITS,
            BitPackage.SHIFT_MASK_4BITS,
            BitPackage.BIT_SHIFT_4BITS,
            BitPackage.UNIT_MASK_4BITS,
            packed);
    }
}
|
||||
|
||||
/// <summary>
/// State machine model registered under the name "ISO-2022-KR".
/// </summary>
public class ISO2022KRSMModel : SMModel
{
    // Byte-class table: eight 4-bit entries per packed int; the trailing comment on
    // each row is the range of byte values (hex) that row covers.
    private static readonly int[] ByteClasses =
    {
        BitPackage.Pack4bits(2, 0, 0, 0, 0, 0, 0, 0), // 00 - 07
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 08 - 0f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 10 - 17
        BitPackage.Pack4bits(0, 0, 0, 1, 0, 0, 0, 0), // 18 - 1f
        BitPackage.Pack4bits(0, 0, 0, 0, 3, 0, 0, 0), // 20 - 27
        BitPackage.Pack4bits(0, 4, 0, 0, 0, 0, 0, 0), // 28 - 2f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 30 - 37
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 38 - 3f
        BitPackage.Pack4bits(0, 0, 0, 5, 0, 0, 0, 0), // 40 - 47
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 48 - 4f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 50 - 57
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 58 - 5f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 60 - 67
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 68 - 6f
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 70 - 77
        BitPackage.Pack4bits(0, 0, 0, 0, 0, 0, 0, 0), // 78 - 7f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 80 - 87
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 88 - 8f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 90 - 97
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // 98 - 9f
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a0 - a7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // a8 - af
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b0 - b7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // b8 - bf
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c0 - c7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // c8 - cf
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d0 - d7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // d8 - df
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e0 - e7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // e8 - ef
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2), // f0 - f7
        BitPackage.Pack4bits(2, 2, 2, 2, 2, 2, 2, 2)  // f8 - ff
    };

    // State table, packed the same way; the trailing comment on each row is the
    // range of entry indexes (hex) that row covers.
    private static readonly int[] Transitions =
    {
        BitPackage.Pack4bits(START,     3, ERROR, START, START, START, ERROR, ERROR), // 00-07
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ITSME, ITSME, ITSME, ITSME), // 08-0f
        BitPackage.Pack4bits(ITSME, ITSME, ERROR, ERROR, ERROR,     4, ERROR, ERROR), // 10-17
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR,     5, ERROR, ERROR, ERROR), // 18-1f
        BitPackage.Pack4bits(ERROR, ERROR, ERROR, ITSME, START, START, START, START)  // 20-27
    };

    // Six entries, all zero.
    private static readonly int[] CharLengths = new int[6];

    /// <summary>
    /// Passes the packed class table, the packed state table, the char-length table
    /// and the model name to the base model.
    /// </summary>
    public ISO2022KRSMModel()
        : base(ToBitPackage(ByteClasses), 6, ToBitPackage(Transitions), CharLengths, "ISO-2022-KR")
    {
    }

    // Wraps a table of packed ints in a BitPackage configured for 4-bit units.
    private static BitPackage ToBitPackage(int[] packed)
    {
        return new BitPackage(
            BitPackage.INDEX_SHIFT_4BITS,
            BitPackage.SHIFT_MASK_4BITS,
            BitPackage.BIT_SHIFT_4BITS,
            BitPackage.UNIT_MASK_4BITS,
            packed);
    }
}
|
||||
}
|
|
@ -1,119 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
{
    // We use gb18030 to replace gb2312, because 18030 is a superset.

    /// <summary>
    /// Charset prober that reports "gb18030". Input bytes are run through a
    /// <c>CodingStateMachine</c> built on <c>GB18030SMModel</c>; each time the machine
    /// returns to its start state the bytes are handed to a distribution analyser,
    /// whose confidence is the prober's confidence.
    /// </summary>
    public class GB18030Prober : CharsetProber
    {
        private readonly CodingStateMachine codingSM;
        private readonly GB18030DistributionAnalyser analyser;

        // lastChar[0] keeps the final byte of the previous chunk so that a character
        // split across two HandleData calls can still be passed to the analyser.
        private readonly byte[] lastChar;

        /// <summary>
        /// Creates the carry-over buffer, the state machine and the analyser, then
        /// resets the prober.
        /// </summary>
        public GB18030Prober()
        {
            lastChar = new byte[2];
            codingSM = new CodingStateMachine(new GB18030SMModel());
            analyser = new GB18030DistributionAnalyser();
            Reset();
        }

        /// <summary>
        /// Returns the name of the charset this prober detects.
        /// </summary>
        /// <returns>Always "gb18030".</returns>
        public override string GetCharsetName()
        {
            return "gb18030";
        }

        /// <summary>
        /// Feeds one chunk of input to the prober.
        /// </summary>
        /// <param name="buf">Buffer holding the input bytes.</param>
        /// <param name="offset">Index of the first byte of the chunk in <paramref name="buf"/>.</param>
        /// <param name="len">Number of bytes in the chunk; a chunk with no bytes is accepted.</param>
        /// <returns>The probing state after the chunk has been processed.</returns>
        public override ProbingState HandleData(byte[] buf, int offset, int len)
        {
            int max = offset + len;

            for (int i = offset; i < max; i++)
            {
                int codingState = codingSM.NextState(buf[i]);

                if (codingState == SMModel.ERROR)
                {
                    state = ProbingState.NotMe;
                    break;
                }

                if (codingState == SMModel.ITSME)
                {
                    state = ProbingState.FoundIt;
                    break;
                }

                if (codingState == SMModel.START)
                {
                    int charLen = codingSM.CurrentCharLen;

                    if (i == offset)
                    {
                        // No earlier byte of this chunk exists, so pair the byte saved
                        // from the previous call (lastChar[0]) with the current one.
                        lastChar[1] = buf[offset];
                        analyser.HandleOneChar(lastChar, 0, charLen);
                    }
                    else
                    {
                        // Read the character in place, starting one byte before i.
                        analyser.HandleOneChar(buf, i - 1, charLen);
                    }
                }
            }

            // Only remember a trailing byte when the chunk actually has one; with an
            // empty chunk buf[max - 1] lies outside the window (index -1 at offset 0).
            if (len > 0)
            {
                lastChar[0] = buf[max - 1];
            }

            // Shortcut: stop probing early once the analyser has seen enough data and
            // the confidence exceeds SHORTCUT_THRESHOLD.
            if (state == ProbingState.Detecting
                && analyser.GotEnoughData()
                && GetConfidence() > SHORTCUT_THRESHOLD)
            {
                state = ProbingState.FoundIt;
            }

            return state;
        }

        /// <summary>
        /// Returns the distribution analyser's confidence.
        /// </summary>
        public override float GetConfidence()
        {
            return analyser.GetConfidence();
        }

        /// <summary>
        /// Puts the state machine, the probing state and the analyser back to their
        /// initial condition.
        /// </summary>
        public override void Reset()
        {
            codingSM.Reset();
            state = ProbingState.Detecting;
            analyser.Reset();
        }
    }
}
|
|
@ -1,328 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
|
||||
/**
|
||||
* General ideas of the Hebrew charset recognition
|
||||
*
|
||||
* Four main charsets exist in Hebrew:
|
||||
* "ISO-8859-8" - Visual Hebrew
|
||||
* "windows-1255" - Logical Hebrew
|
||||
* "ISO-8859-8-I" - Logical Hebrew
|
||||
* "x-mac-hebrew" - ?? Logical Hebrew ??
|
||||
*
|
||||
* Both "ISO" charsets use a completely identical set of code points, whereas
|
||||
* "windows-1255" and "x-mac-hebrew" are two different proper supersets of
|
||||
* these code points. windows-1255 defines additional characters in the range
|
||||
* 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
|
||||
* diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6.
|
||||
* x-mac-hebrew defines similar additional code points but with a different
|
||||
* mapping.
|
||||
*
|
||||
* As far as an average Hebrew text with no diacritics is concerned, all four
|
||||
* charsets are identical with respect to code points. Meaning that for the
|
||||
* main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
|
||||
* (including final letters).
|
||||
*
|
||||
* The dominant difference between these charsets is their directionality.
|
||||
* "Visual" directionality means that the text is ordered as if the renderer is
|
||||
* not aware of a BIDI rendering algorithm. The renderer sees the text and
|
||||
* draws it from left to right. The text itself when ordered naturally is read
|
||||
* backwards. A buffer of Visual Hebrew generally looks like so:
|
||||
* "[last word of first line spelled backwards] [whole line ordered backwards
|
||||
* and spelled backwards] [first word of first line spelled backwards]
|
||||
* [end of line] [last word of second line] ... etc' "
|
||||
* adding punctuation marks, numbers and English text to visual text is
|
||||
* naturally also "visual" and from left to right.
|
||||
*
|
||||
* "Logical" directionality means the text is ordered "naturally" according to
|
||||
* the order it is read. It is the responsibility of the renderer to display
|
||||
* the text from right to left. A BIDI algorithm is used to place general
|
||||
* punctuation marks, numbers and English text in the text.
|
||||
*
|
||||
* Texts in x-mac-hebrew are almost impossible to find on the Internet. From
|
||||
* what little evidence I could find, it seems that its general directionality
|
||||
* is Logical.
|
||||
*
|
||||
* To sum up all of the above, the Hebrew probing mechanism knows about two
|
||||
* charsets:
|
||||
* Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are
|
||||
* backwards while line order is natural. For charset recognition purposes
|
||||
* the line order is unimportant (In fact, for this implementation, even
|
||||
* word order is unimportant).
|
||||
* Logical Hebrew - "windows-1255" - normal, naturally ordered text.
|
||||
*
|
||||
* "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
|
||||
* specifically identified.
|
||||
* "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew
|
||||
*    that contains special punctuation marks or diacritics is displayed with
|
||||
* some unconverted characters showing as question marks. This problem might
|
||||
* be corrected using another model prober for x-mac-hebrew. Due to the fact
|
||||
* that x-mac-hebrew texts are so rare, writing another model prober isn't
|
||||
* worth the effort and performance hit.
|
||||
*
|
||||
* *** The Prober ***
|
||||
*
|
||||
* The prober is divided between two nsSBCharSetProbers and an nsHebrewProber,
|
||||
* all of which are managed, created, fed data, inquired and deleted by the
|
||||
* nsSBCSGroupProber. The two nsSBCharSetProbers identify that the text is in
|
||||
* fact some kind of Hebrew, Logical or Visual. The final decision about which
|
||||
* one is it is made by the nsHebrewProber by combining final-letter scores
|
||||
* with the scores of the two nsSBCharSetProbers to produce a final answer.
|
||||
*
|
||||
* The nsSBCSGroupProber is responsible for stripping the original text of HTML
|
||||
* tags, English characters, numbers, low-ASCII punctuation characters, spaces
|
||||
* and new lines. It reduces any sequence of such characters to a single space.
|
||||
* The buffer fed to each prober in the SBCS group prober is pure text in
|
||||
* high-ASCII.
|
||||
* The two nsSBCharSetProbers (model probers) share the same language model:
|
||||
* Win1255Model.
|
||||
* The first nsSBCharSetProber uses the model normally as any other
|
||||
* nsSBCharSetProber does, to recognize windows-1255, upon which this model was
|
||||
* built. The second nsSBCharSetProber is told to make the pair-of-letter
|
||||
* lookup in the language model backwards. This in practice exactly simulates
|
||||
* a visual Hebrew model using the windows-1255 logical Hebrew model.
|
||||
*
|
||||
* The nsHebrewProber is not using any language model. All it does is look for
|
||||
* final-letter evidence suggesting the text is either logical Hebrew or visual
|
||||
* Hebrew. Disjointed from the model probers, the results of the nsHebrewProber
|
||||
* alone are meaningless. nsHebrewProber always returns 0.00 as confidence
|
||||
* since it never identifies a charset by itself. Instead, the pointer to the
|
||||
* nsHebrewProber is passed to the model probers as a helper "Name Prober".
|
||||
* When the Group prober receives a positive identification from any prober,
|
||||
* it asks for the name of the charset identified. If the prober queried is a
|
||||
* Hebrew model prober, the model prober forwards the call to the
|
||||
* nsHebrewProber to make the final decision. In the nsHebrewProber, the
|
||||
* decision is made according to the final-letter scores maintained and both
|
||||
* model probers' scores. The answer is returned in the form of the name of the
|
||||
* charset identified, either "windows-1255" or "ISO-8859-8".
|
||||
*
|
||||
*/
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
|
||||
/// <summary>
|
||||
/// This prober doesn't actually recognize a language or a charset.
|
||||
/// It is a helper prober for the use of the Hebrew model probers
|
||||
/// </summary>
|
||||
public class HebrewProber : CharsetProber
|
||||
{
|
||||
// windows-1255 / ISO-8859-8 code points of interest
|
||||
private const byte FINAL_KAF = 0xEA;
|
||||
private const byte NORMAL_KAF = 0xEB;
|
||||
private const byte FINAL_MEM = 0xED;
|
||||
private const byte NORMAL_MEM = 0xEE;
|
||||
private const byte FINAL_NUN = 0xEF;
|
||||
private const byte NORMAL_NUN = 0xF0;
|
||||
private const byte FINAL_PE = 0xF3;
|
||||
private const byte NORMAL_PE = 0xF4;
|
||||
private const byte FINAL_TSADI = 0xF5;
|
||||
private const byte NORMAL_TSADI = 0xF6;
|
||||
|
||||
// Minimum Visual vs Logical final letter score difference.
|
||||
// If the difference is below this, don't rely solely on the final letter score distance.
|
||||
private const int MIN_FINAL_CHAR_DISTANCE = 5;
|
||||
|
||||
// Minimum Visual vs Logical model score difference.
|
||||
// If the difference is below this, don't rely at all on the model score distance.
|
||||
private const float MIN_MODEL_DISTANCE = 0.01f;
|
||||
|
||||
protected const string VISUAL_HEBREW_NAME = "ISO-8859-8";
|
||||
protected const string LOGICAL_HEBREW_NAME = "windows-1255";
|
||||
|
||||
// owned by the group prober.
|
||||
protected CharsetProber logicalProber, visualProber;
|
||||
protected int finalCharLogicalScore, finalCharVisualScore;
|
||||
|
||||
// The two last bytes seen in the previous buffer.
|
||||
protected byte prev, beforePrev;
|
||||
|
||||
public HebrewProber()
|
||||
{
|
||||
Reset();
|
||||
}
|
||||
|
||||
public void SetModelProbers(CharsetProber logical, CharsetProber visual)
|
||||
{
|
||||
logicalProber = logical;
|
||||
visualProber = visual;
|
||||
}
|
||||
|
||||
/**
|
||||
* Final letter analysis for logical-visual decision.
|
||||
* Look for evidence that the received buffer is either logical Hebrew or
|
||||
* visual Hebrew.
|
||||
* The following cases are checked:
|
||||
* 1) A word longer than 1 letter, ending with a final letter. This is an
|
||||
* indication that the text is laid out "naturally" since the final letter
|
||||
* really appears at the end. +1 for logical score.
|
||||
* 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
|
||||
* Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
|
||||
* the Non-Final form of that letter. Exceptions to this rule are mentioned
|
||||
* above in isNonFinal(). This is an indication that the text is laid out
|
||||
* backwards. +1 for visual score
|
||||
* 3) A word longer than 1 letter, starting with a final letter. Final letters
|
||||
* should not appear at the beginning of a word. This is an indication that
|
||||
* the text is laid out backwards. +1 for visual score.
|
||||
*
|
||||
* The visual score and logical score are accumulated throughout the text and
|
||||
* are finally checked against each other in GetCharSetName().
|
||||
* No checking for final letters in the middle of words is done since that case
|
||||
* is not an indication for either Logical or Visual text.
|
||||
*
|
||||
* The input buffer should not contain any white spaces that are not (' ')
|
||||
* or any low-ascii punctuation marks.
|
||||
*/
|
||||
public override ProbingState HandleData(byte[] buf, int offset, int len)
|
||||
{
|
||||
// Both model probers say it's not them. No reason to continue.
|
||||
if (GetState() == ProbingState.NotMe)
|
||||
return ProbingState.NotMe;
|
||||
|
||||
int max = offset + len;
|
||||
|
||||
for (int i = offset; i < max; i++)
|
||||
{
|
||||
|
||||
byte b = buf[i];
|
||||
|
||||
// a word just ended
|
||||
if (b == 0x20)
|
||||
{
|
||||
// *(curPtr-2) was not a space so prev is not a 1 letter word
|
||||
if (beforePrev != 0x20)
|
||||
{
|
||||
// case (1) [-2:not space][-1:final letter][cur:space]
|
||||
if (IsFinal(prev))
|
||||
finalCharLogicalScore++;
|
||||
// case (2) [-2:not space][-1:Non-Final letter][cur:space]
|
||||
else if (IsNonFinal(prev))
|
||||
finalCharVisualScore++;
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
// case (3) [-2:space][-1:final letter][cur:not space]
|
||||
if ((beforePrev == 0x20) && (IsFinal(prev)) && (b != ' '))
|
||||
++finalCharVisualScore;
|
||||
}
|
||||
beforePrev = prev;
|
||||
prev = b;
|
||||
}
|
||||
|
||||
// Forever detecting, till the end or until both model probers
|
||||
// return NotMe (handled above).
|
||||
return ProbingState.Detecting;
|
||||
}
|
||||
|
||||
// Make the decision: is it Logical or Visual?
|
||||
public override string GetCharsetName()
|
||||
{
|
||||
// If the final letter score distance is dominant enough, rely on it.
|
||||
int finalsub = finalCharLogicalScore - finalCharVisualScore;
|
||||
if (finalsub >= MIN_FINAL_CHAR_DISTANCE)
|
||||
return LOGICAL_HEBREW_NAME;
|
||||
if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE))
|
||||
return VISUAL_HEBREW_NAME;
|
||||
|
||||
// It's not dominant enough, try to rely on the model scores instead.
|
||||
float modelsub = logicalProber.GetConfidence() - visualProber.GetConfidence();
|
||||
if (modelsub > MIN_MODEL_DISTANCE)
|
||||
return LOGICAL_HEBREW_NAME;
|
||||
if (modelsub < -(MIN_MODEL_DISTANCE))
|
||||
return VISUAL_HEBREW_NAME;
|
||||
|
||||
// Still no good, back to final letter distance, maybe it'll save the day.
|
||||
if (finalsub < 0)
|
||||
return VISUAL_HEBREW_NAME;
|
||||
|
||||
// (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
|
||||
return LOGICAL_HEBREW_NAME;
|
||||
}
|
||||
|
||||
public override void Reset()
|
||||
{
|
||||
finalCharLogicalScore = 0;
|
||||
finalCharVisualScore = 0;
|
||||
prev = 0x20;
|
||||
beforePrev = 0x20;
|
||||
}
|
||||
|
||||
public override ProbingState GetState()
|
||||
{
|
||||
// Remain active as long as any of the model probers are active.
|
||||
if (logicalProber.GetState() == ProbingState.NotMe &&
|
||||
visualProber.GetState() == ProbingState.NotMe)
|
||||
return ProbingState.NotMe;
|
||||
return ProbingState.Detecting;
|
||||
}
|
||||
|
||||
public override void DumpStatus()
|
||||
{
|
||||
//Console.WriteLine(" HEB: {0} - {1} [Logical-Visual score]", finalCharLogicalScore, finalCharVisualScore);
|
||||
}
|
||||
|
||||
public override float GetConfidence()
|
||||
{
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
protected static bool IsFinal(byte b)
|
||||
{
|
||||
return (b == FINAL_KAF || b == FINAL_MEM || b == FINAL_NUN
|
||||
|| b == FINAL_PE || b == FINAL_TSADI);
|
||||
}
|
||||
|
||||
protected static bool IsNonFinal(byte b)
|
||||
{
|
||||
// The normal Tsadi is not a good Non-Final letter due to words like
|
||||
// 'lechotet' (to chat) containing an apostrophe after the tsadi. This
|
||||
// apostrophe is converted to a space in FilterWithoutEnglishLetters causing
|
||||
// the Non-Final tsadi to appear at an end of a word even though this is not
|
||||
// the case in the original text.
|
||||
// The letters Pe and Kaf rarely display a related behavior of not being a
|
||||
// good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
|
||||
// example legally end with a Non-Final Pe or Kaf. However, the benefit of
|
||||
// these letters as Non-Final letters outweighs the damage since these words
|
||||
// are quite rare.
|
||||
return (b == NORMAL_KAF || b == NORMAL_MEM || b == NORMAL_NUN
|
||||
|| b == NORMAL_PE);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,327 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
public abstract class JapaneseContextAnalyser
|
||||
{
|
||||
protected const int CATEGORIES_NUM = 6;
|
||||
protected const int ENOUGH_REL_THRESHOLD = 100;
|
||||
protected const int MAX_REL_THRESHOLD = 1000;
|
||||
protected const int MINIMUM_DATA_THRESHOLD = 4;
|
||||
protected const float DONT_KNOW = -1.0f;
|
||||
|
||||
// hiragana frequency category table
|
||||
// This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
||||
protected static byte[,] jp2CharContext = {
|
||||
{ 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,},
|
||||
{ 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,},
|
||||
{ 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,},
|
||||
{ 0,4,0,5,0,5,0,4,0,4,5,4,4,3,5,3,5,1,5,3,4,3,4,4,3,4,3,3,4,3,5,4,4,3,5,5,3,5,5,5,3,5,5,3,4,5,5,3,1,3,2,0,3,4,0,4,2,0,4,2,1,5,3,2,3,5,0,4,0,2,0,5,4,4,5,4,5,0,4,0,0,4,4,},
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,},
|
||||
{ 0,3,0,4,0,3,0,3,0,4,5,4,3,3,3,3,4,3,5,4,4,3,5,4,4,3,4,3,4,4,4,4,5,3,4,4,3,4,5,5,4,5,5,1,4,5,4,3,0,3,3,1,3,3,0,4,4,0,3,3,1,5,3,3,3,5,0,4,0,3,0,4,4,3,4,3,3,0,4,1,1,3,4,},
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,},
|
||||
{ 0,4,0,3,0,3,0,4,0,3,4,4,3,2,2,1,2,1,3,1,3,3,3,3,3,4,3,1,3,3,5,3,3,0,4,3,0,5,4,3,3,5,4,4,3,4,4,5,0,1,2,0,1,2,0,2,2,0,1,0,0,5,2,2,1,4,0,3,0,1,0,4,4,3,5,4,3,0,2,1,0,4,3,},
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,},
|
||||
{ 0,3,0,5,0,4,0,2,1,4,4,2,4,1,4,2,4,2,4,3,3,3,4,3,3,3,3,1,4,2,3,3,3,1,4,4,1,1,1,4,3,3,2,0,2,4,3,2,0,3,3,0,3,1,1,0,0,0,3,3,0,4,2,2,3,4,0,4,0,3,0,4,4,5,3,4,4,0,3,0,0,1,4,},
|
||||
{ 1,4,0,4,0,4,0,4,0,3,5,4,4,3,4,3,5,4,3,3,4,3,5,4,4,4,4,3,4,2,4,3,3,1,5,4,3,2,4,5,4,5,5,4,4,5,4,4,0,3,2,2,3,3,0,4,3,1,3,2,1,4,3,3,4,5,0,3,0,2,0,4,5,5,4,5,4,0,4,0,0,5,4,},
|
||||
{ 0,5,0,5,0,4,0,3,0,4,4,3,4,3,3,3,4,0,4,4,4,3,4,3,4,3,3,1,4,2,4,3,4,0,5,4,1,4,5,4,4,5,3,2,4,3,4,3,2,4,1,3,3,3,2,3,2,0,4,3,3,4,3,3,3,4,0,4,0,3,0,4,5,4,4,4,3,0,4,1,0,1,3,},
|
||||
{ 0,3,1,4,0,3,0,2,0,3,4,4,3,1,4,2,3,3,4,3,4,3,4,3,4,4,3,2,3,1,5,4,4,1,4,4,3,5,4,4,3,5,5,4,3,4,4,3,1,2,3,1,2,2,0,3,2,0,3,1,0,5,3,3,3,4,3,3,3,3,4,4,4,4,5,4,2,0,3,3,2,4,3,},
|
||||
{ 0,2,0,3,0,1,0,1,0,0,3,2,0,0,2,0,1,0,2,1,3,3,3,1,2,3,1,0,1,0,4,2,1,1,3,3,0,4,3,3,1,4,3,3,0,3,3,2,0,0,0,0,1,0,0,2,0,0,0,0,0,4,1,0,2,3,2,2,2,1,3,3,3,4,4,3,2,0,3,1,0,3,3,},
|
||||
{ 0,4,0,4,0,3,0,3,0,4,4,4,3,3,3,3,3,3,4,3,4,2,4,3,4,3,3,2,4,3,4,5,4,1,4,5,3,5,4,5,3,5,4,0,3,5,5,3,1,3,3,2,2,3,0,3,4,1,3,3,2,4,3,3,3,4,0,4,0,3,0,4,5,4,4,5,3,0,4,1,0,3,4,},
|
||||
{ 0,2,0,3,0,3,0,0,0,2,2,2,1,0,1,0,0,0,3,0,3,0,3,0,1,3,1,0,3,1,3,3,3,1,3,3,3,0,1,3,1,3,4,0,0,3,1,1,0,3,2,0,0,0,0,1,3,0,1,0,0,3,3,2,0,3,0,0,0,0,0,3,4,3,4,3,3,0,3,0,0,2,3,},
|
||||
{ 2,3,0,3,0,2,0,1,0,3,3,4,3,1,3,1,1,1,3,1,4,3,4,3,3,3,0,0,3,1,5,4,3,1,4,3,2,5,5,4,4,4,4,3,3,4,4,4,0,2,1,1,3,2,0,1,2,0,0,1,0,4,1,3,3,3,0,3,0,1,0,4,4,4,5,5,3,0,2,0,0,4,4,},
|
||||
{ 0,2,0,1,0,3,1,3,0,2,3,3,3,0,3,1,0,0,3,0,3,2,3,1,3,2,1,1,0,0,4,2,1,0,2,3,1,4,3,2,0,4,4,3,1,3,1,3,0,1,0,0,1,0,0,0,1,0,0,0,0,4,1,1,1,2,0,3,0,0,0,3,4,2,4,3,2,0,1,0,0,3,3,},
|
||||
{ 0,1,0,4,0,5,0,4,0,2,4,4,2,3,3,2,3,3,5,3,3,3,4,3,4,2,3,0,4,3,3,3,4,1,4,3,2,1,5,5,3,4,5,1,3,5,4,2,0,3,3,0,1,3,0,4,2,0,1,3,1,4,3,3,3,3,0,3,0,1,0,3,4,4,4,5,5,0,3,0,1,4,5,},
|
||||
{ 0,2,0,3,0,3,0,0,0,2,3,1,3,0,4,0,1,1,3,0,3,4,3,2,3,1,0,3,3,2,3,1,3,0,2,3,0,2,1,4,1,2,2,0,0,3,3,0,0,2,0,0,0,1,0,0,0,0,2,2,0,3,2,1,3,3,0,2,0,2,0,0,3,3,1,2,4,0,3,0,2,2,3,},
|
||||
{ 2,4,0,5,0,4,0,4,0,2,4,4,4,3,4,3,3,3,1,2,4,3,4,3,4,4,5,0,3,3,3,3,2,0,4,3,1,4,3,4,1,4,4,3,3,4,4,3,1,2,3,0,4,2,0,4,1,0,3,3,0,4,3,3,3,4,0,4,0,2,0,3,5,3,4,5,2,0,3,0,0,4,5,},
|
||||
{ 0,3,0,4,0,1,0,1,0,1,3,2,2,1,3,0,3,0,2,0,2,0,3,0,2,0,0,0,1,0,1,1,0,0,3,1,0,0,0,4,0,3,1,0,2,1,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,2,2,3,1,0,3,0,0,0,1,4,4,4,3,0,0,4,0,0,1,4,},
|
||||
{ 1,4,1,5,0,3,0,3,0,4,5,4,4,3,5,3,3,4,4,3,4,1,3,3,3,3,2,1,4,1,5,4,3,1,4,4,3,5,4,4,3,5,4,3,3,4,4,4,0,3,3,1,2,3,0,3,1,0,3,3,0,5,4,4,4,4,4,4,3,3,5,4,4,3,3,5,4,0,3,2,0,4,4,},
|
||||
{ 0,2,0,3,0,1,0,0,0,1,3,3,3,2,4,1,3,0,3,1,3,0,2,2,1,1,0,0,2,0,4,3,1,0,4,3,0,4,4,4,1,4,3,1,1,3,3,1,0,2,0,0,1,3,0,0,0,0,2,0,0,4,3,2,4,3,5,4,3,3,3,4,3,3,4,3,3,0,2,1,0,3,3,},
|
||||
{ 0,2,0,4,0,3,0,2,0,2,5,5,3,4,4,4,4,1,4,3,3,0,4,3,4,3,1,3,3,2,4,3,0,3,4,3,0,3,4,4,2,4,4,0,4,5,3,3,2,2,1,1,1,2,0,1,5,0,3,3,2,4,3,3,3,4,0,3,0,2,0,4,4,3,5,5,0,0,3,0,2,3,3,},
|
||||
{ 0,3,0,4,0,3,0,1,0,3,4,3,3,1,3,3,3,0,3,1,3,0,4,3,3,1,1,0,3,0,3,3,0,0,4,4,0,1,5,4,3,3,5,0,3,3,4,3,0,2,0,1,1,1,0,1,3,0,1,2,1,3,3,2,3,3,0,3,0,1,0,1,3,3,4,4,1,0,1,2,2,1,3,},
|
||||
{ 0,1,0,4,0,4,0,3,0,1,3,3,3,2,3,1,1,0,3,0,3,3,4,3,2,4,2,0,1,0,4,3,2,0,4,3,0,5,3,3,2,4,4,4,3,3,3,4,0,1,3,0,0,1,0,0,1,0,0,0,0,4,2,3,3,3,0,3,0,0,0,4,4,4,5,3,2,0,3,3,0,3,5,},
|
||||
{ 0,2,0,3,0,0,0,3,0,1,3,0,2,0,0,0,1,0,3,1,1,3,3,0,0,3,0,0,3,0,2,3,1,0,3,1,0,3,3,2,0,4,2,2,0,2,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,2,1,2,0,1,0,1,0,0,0,1,3,1,2,0,0,0,1,0,0,1,4,},
|
||||
{ 0,3,0,3,0,5,0,1,0,2,4,3,1,3,3,2,1,1,5,2,1,0,5,1,2,0,0,0,3,3,2,2,3,2,4,3,0,0,3,3,1,3,3,0,2,5,3,4,0,3,3,0,1,2,0,2,2,0,3,2,0,2,2,3,3,3,0,2,0,1,0,3,4,4,2,5,4,0,3,0,0,3,5,},
|
||||
{ 0,3,0,3,0,3,0,1,0,3,3,3,3,0,3,0,2,0,2,1,1,0,2,0,1,0,0,0,2,1,0,0,1,0,3,2,0,0,3,3,1,2,3,1,0,3,3,0,0,1,0,0,0,0,0,2,0,0,0,0,0,2,3,1,2,3,0,3,0,1,0,3,2,1,0,4,3,0,1,1,0,3,3,},
|
||||
{ 0,4,0,5,0,3,0,3,0,4,5,5,4,3,5,3,4,3,5,3,3,2,5,3,4,4,4,3,4,3,4,5,5,3,4,4,3,4,4,5,4,4,4,3,4,5,5,4,2,3,4,2,3,4,0,3,3,1,4,3,2,4,3,3,5,5,0,3,0,3,0,5,5,5,5,4,4,0,4,0,1,4,4,},
|
||||
{ 0,4,0,4,0,3,0,3,0,3,5,4,4,2,3,2,5,1,3,2,5,1,4,2,3,2,3,3,4,3,3,3,3,2,5,4,1,3,3,5,3,4,4,0,4,4,3,1,1,3,1,0,2,3,0,2,3,0,3,0,0,4,3,1,3,4,0,3,0,2,0,4,4,4,3,4,5,0,4,0,0,3,4,},
|
||||
{ 0,3,0,3,0,3,1,2,0,3,4,4,3,3,3,0,2,2,4,3,3,1,3,3,3,1,1,0,3,1,4,3,2,3,4,4,2,4,4,4,3,4,4,3,2,4,4,3,1,3,3,1,3,3,0,4,1,0,2,2,1,4,3,2,3,3,5,4,3,3,5,4,4,3,3,0,4,0,3,2,2,4,4,},
|
||||
{ 0,2,0,1,0,0,0,0,0,1,2,1,3,0,0,0,0,0,2,0,1,2,1,0,0,1,0,0,0,0,3,0,0,1,0,1,1,3,1,0,0,0,1,1,0,1,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,1,2,2,0,3,4,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1,},
|
||||
{ 0,1,0,0,0,1,0,0,0,0,4,0,4,1,4,0,3,0,4,0,3,0,4,0,3,0,3,0,4,1,5,1,4,0,0,3,0,5,0,5,2,0,1,0,0,0,2,1,4,0,1,3,0,0,3,0,0,3,1,1,4,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,},
|
||||
{ 1,4,0,5,0,3,0,2,0,3,5,4,4,3,4,3,5,3,4,3,3,0,4,3,3,3,3,3,3,2,4,4,3,1,3,4,4,5,4,4,3,4,4,1,3,5,4,3,3,3,1,2,2,3,3,1,3,1,3,3,3,5,3,3,4,5,0,3,0,3,0,3,4,3,4,4,3,0,3,0,2,4,3,},
|
||||
{ 0,1,0,4,0,0,0,0,0,1,4,0,4,1,4,2,4,0,3,0,1,0,1,0,0,0,0,0,2,0,3,1,1,1,0,3,0,0,0,1,2,1,0,0,1,1,1,1,0,1,0,0,0,1,0,0,3,0,0,0,0,3,2,0,2,2,0,1,0,0,0,2,3,2,3,3,0,0,0,0,2,1,0,},
|
||||
{ 0,5,1,5,0,3,0,3,0,5,4,4,5,1,5,3,3,0,4,3,4,3,5,3,4,3,3,2,4,3,4,3,3,0,3,3,1,4,4,3,4,4,4,3,4,5,5,3,2,3,1,1,3,3,1,3,1,1,3,3,2,4,5,3,3,5,0,4,0,3,0,4,4,3,5,3,3,0,3,4,0,4,3,},
|
||||
{ 0,5,0,5,0,3,0,2,0,4,4,3,5,2,4,3,3,3,4,4,4,3,5,3,5,3,3,1,4,0,4,3,3,0,3,3,0,4,4,4,4,5,4,3,3,5,5,3,2,3,1,2,3,2,0,1,0,0,3,2,2,4,4,3,1,5,0,4,0,3,0,4,3,1,3,2,1,0,3,3,0,3,3,},
|
||||
{ 0,4,0,5,0,5,0,4,0,4,5,5,5,3,4,3,3,2,5,4,4,3,5,3,5,3,4,0,4,3,4,4,3,2,4,4,3,4,5,4,4,5,5,0,3,5,5,4,1,3,3,2,3,3,1,3,1,0,4,3,1,4,4,3,4,5,0,4,0,2,0,4,3,4,4,3,3,0,4,0,0,5,5,},
|
||||
{ 0,4,0,4,0,5,0,1,1,3,3,4,4,3,4,1,3,0,5,1,3,0,3,1,3,1,1,0,3,0,3,3,4,0,4,3,0,4,4,4,3,4,4,0,3,5,4,1,0,3,0,0,2,3,0,3,1,0,3,1,0,3,2,1,3,5,0,3,0,1,0,3,2,3,3,4,4,0,2,2,0,4,4,},
|
||||
{ 2,4,0,5,0,4,0,3,0,4,5,5,4,3,5,3,5,3,5,3,5,2,5,3,4,3,3,4,3,4,5,3,2,1,5,4,3,2,3,4,5,3,4,1,2,5,4,3,0,3,3,0,3,2,0,2,3,0,4,1,0,3,4,3,3,5,0,3,0,1,0,4,5,5,5,4,3,0,4,2,0,3,5,},
|
||||
{ 0,5,0,4,0,4,0,2,0,5,4,3,4,3,4,3,3,3,4,3,4,2,5,3,5,3,4,1,4,3,4,4,4,0,3,5,0,4,4,4,4,5,3,1,3,4,5,3,3,3,3,3,3,3,0,2,2,0,3,3,2,4,3,3,3,5,3,4,1,3,3,5,3,2,0,0,0,0,4,3,1,3,3,},
|
||||
{ 0,1,0,3,0,3,0,1,0,1,3,3,3,2,3,3,3,0,3,0,0,0,3,1,3,0,0,0,2,2,2,3,0,0,3,2,0,1,2,4,1,3,3,0,0,3,3,3,0,1,0,0,2,1,0,0,3,0,3,1,0,3,0,0,1,3,0,2,0,1,0,3,3,1,3,3,0,0,1,1,0,3,3,},
|
||||
{ 0,2,0,3,0,2,1,4,0,2,2,3,1,1,3,1,1,0,2,0,3,1,2,3,1,3,0,0,1,0,4,3,2,3,3,3,1,4,2,3,3,3,3,1,0,3,1,4,0,1,1,0,1,2,0,1,1,0,1,1,0,3,1,3,2,2,0,1,0,0,0,2,3,3,3,1,0,0,0,0,0,2,3,},
|
||||
{ 0,5,0,4,0,5,0,2,0,4,5,5,3,3,4,3,3,1,5,4,4,2,4,4,4,3,4,2,4,3,5,5,4,3,3,4,3,3,5,5,4,5,5,1,3,4,5,3,1,4,3,1,3,3,0,3,3,1,4,3,1,4,5,3,3,5,0,4,0,3,0,5,3,3,1,4,3,0,4,0,1,5,3,},
|
||||
{ 0,5,0,5,0,4,0,2,0,4,4,3,4,3,3,3,3,3,5,4,4,4,4,4,4,5,3,3,5,2,4,4,4,3,4,4,3,3,4,4,5,5,3,3,4,3,4,3,3,4,3,3,3,3,1,2,2,1,4,3,3,5,4,4,3,4,0,4,0,3,0,4,4,4,4,4,1,0,4,2,0,2,4,},
|
||||
{ 0,4,0,4,0,3,0,1,0,3,5,2,3,0,3,0,2,1,4,2,3,3,4,1,4,3,3,2,4,1,3,3,3,0,3,3,0,0,3,3,3,5,3,3,3,3,3,2,0,2,0,0,2,0,0,2,0,0,1,0,0,3,1,2,2,3,0,3,0,2,0,4,4,3,3,4,1,0,3,0,0,2,4,},
|
||||
{ 0,0,0,4,0,0,0,0,0,0,1,0,1,0,2,0,0,0,0,0,1,0,2,0,1,0,0,0,0,0,3,1,3,0,3,2,0,0,0,1,0,3,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,4,0,2,0,0,0,0,0,0,2,},
|
||||
{ 0,2,1,3,0,2,0,2,0,3,3,3,3,1,3,1,3,3,3,3,3,3,4,2,2,1,2,1,4,0,4,3,1,3,3,3,2,4,3,5,4,3,3,3,3,3,3,3,0,1,3,0,2,0,0,1,0,0,1,0,0,4,2,0,2,3,0,3,3,0,3,3,4,2,3,1,4,0,1,2,0,2,3,},
|
||||
{ 0,3,0,3,0,1,0,3,0,2,3,3,3,0,3,1,2,0,3,3,2,3,3,2,3,2,3,1,3,0,4,3,2,0,3,3,1,4,3,3,2,3,4,3,1,3,3,1,1,0,1,1,0,1,0,1,0,1,0,0,0,4,1,1,0,3,0,3,1,0,2,3,3,3,3,3,1,0,0,2,0,3,3,},
|
||||
{ 0,0,0,0,0,0,0,0,0,0,3,0,2,0,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,3,0,3,0,3,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,2,0,2,3,0,0,0,0,0,0,0,0,3,},
|
||||
{ 0,2,0,3,1,3,0,3,0,2,3,3,3,1,3,1,3,1,3,1,3,3,3,1,3,0,2,3,1,1,4,3,3,2,3,3,1,2,2,4,1,3,3,0,1,4,2,3,0,1,3,0,3,0,0,1,3,0,2,0,0,3,3,2,1,3,0,3,0,2,0,3,4,4,4,3,1,0,3,0,0,3,3,},
|
||||
{ 0,2,0,1,0,2,0,0,0,1,3,2,2,1,3,0,1,1,3,0,3,2,3,1,2,0,2,0,1,1,3,3,3,0,3,3,1,1,2,3,2,3,3,1,2,3,2,0,0,1,0,0,0,0,0,0,3,0,1,0,0,2,1,2,1,3,0,3,0,0,0,3,4,4,4,3,2,0,2,0,0,2,4,},
|
||||
{ 0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,2,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,3,1,0,0,0,0,0,0,0,3,},
|
||||
{ 0,3,0,3,0,2,0,3,0,3,3,3,2,3,2,2,2,0,3,1,3,3,3,2,3,3,0,0,3,0,3,2,2,0,2,3,1,4,3,4,3,3,2,3,1,5,4,4,0,3,1,2,1,3,0,3,1,1,2,0,2,3,1,3,1,3,0,3,0,1,0,3,3,4,4,2,1,0,2,1,0,2,4,},
|
||||
{ 0,1,0,3,0,1,0,2,0,1,4,2,5,1,4,0,2,0,2,1,3,1,4,0,2,1,0,0,2,1,4,1,1,0,3,3,0,5,1,3,2,3,3,1,0,3,2,3,0,1,0,0,0,0,0,0,1,0,0,0,0,4,0,1,0,3,0,2,0,1,0,3,3,3,4,3,3,0,0,0,0,2,3,},
|
||||
{ 0,0,0,1,0,0,0,0,0,0,2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,1,0,0,0,0,0,3,},
|
||||
{ 0,1,0,3,0,4,0,3,0,2,4,3,1,0,3,2,2,1,3,1,2,2,3,1,1,1,2,1,3,0,1,2,0,1,3,2,1,3,0,5,5,1,0,0,1,3,2,1,0,3,0,0,1,0,0,0,0,0,3,4,0,1,1,1,3,2,0,2,0,1,0,2,3,3,1,2,3,0,1,0,1,0,4,},
|
||||
{ 0,0,0,1,0,3,0,3,0,2,2,1,0,0,4,0,3,0,3,1,3,0,3,0,3,0,1,0,3,0,3,1,3,0,3,3,0,0,1,2,1,1,1,0,1,2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,2,2,1,2,0,0,2,0,0,0,0,2,3,3,3,3,0,0,0,0,1,4,},
|
||||
{ 0,0,0,3,0,3,0,0,0,0,3,1,1,0,3,0,1,0,2,0,1,0,0,0,0,0,0,0,1,0,3,0,2,0,2,3,0,0,2,2,3,1,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0,2,3,},
|
||||
{ 2,4,0,5,0,5,0,4,0,3,4,3,3,3,4,3,3,3,4,3,4,4,5,4,5,5,5,2,3,0,5,5,4,1,5,4,3,1,5,4,3,4,4,3,3,4,3,3,0,3,2,0,2,3,0,3,0,0,3,3,0,5,3,2,3,3,0,3,0,3,0,3,4,5,4,5,3,0,4,3,0,3,4,},
|
||||
{ 0,3,0,3,0,3,0,3,0,3,3,4,3,2,3,2,3,0,4,3,3,3,3,3,3,3,3,0,3,2,4,3,3,1,3,4,3,4,4,4,3,4,4,3,2,4,4,1,0,2,0,0,1,1,0,2,0,0,3,1,0,5,3,2,1,3,0,3,0,1,2,4,3,2,4,3,3,0,3,2,0,4,4,},
|
||||
{ 0,3,0,3,0,1,0,0,0,1,4,3,3,2,3,1,3,1,4,2,3,2,4,2,3,4,3,0,2,2,3,3,3,0,3,3,3,0,3,4,1,3,3,0,3,4,3,3,0,1,1,0,1,0,0,0,4,0,3,0,0,3,1,2,1,3,0,4,0,1,0,4,3,3,4,3,3,0,2,0,0,3,3,},
|
||||
{ 0,3,0,4,0,1,0,3,0,3,4,3,3,0,3,3,3,1,3,1,3,3,4,3,3,3,0,0,3,1,5,3,3,1,3,3,2,5,4,3,3,4,5,3,2,5,3,4,0,1,0,0,0,0,0,2,0,0,1,1,0,4,2,2,1,3,0,3,0,2,0,4,4,3,5,3,2,0,1,1,0,3,4,},
|
||||
{ 0,5,0,4,0,5,0,2,0,4,4,3,3,2,3,3,3,1,4,3,4,1,5,3,4,3,4,0,4,2,4,3,4,1,5,4,0,4,4,4,4,5,4,1,3,5,4,2,1,4,1,1,3,2,0,3,1,0,3,2,1,4,3,3,3,4,0,4,0,3,0,4,4,4,3,3,3,0,4,2,0,3,4,},
|
||||
{ 1,4,0,4,0,3,0,1,0,3,3,3,1,1,3,3,2,2,3,3,1,0,3,2,2,1,2,0,3,1,2,1,2,0,3,2,0,2,2,3,3,4,3,0,3,3,1,2,0,1,1,3,1,2,0,0,3,0,1,1,0,3,2,2,3,3,0,3,0,0,0,2,3,3,4,3,3,0,1,0,0,1,4,},
|
||||
{ 0,4,0,4,0,4,0,0,0,3,4,4,3,1,4,2,3,2,3,3,3,1,4,3,4,0,3,0,4,2,3,3,2,2,5,4,2,1,3,4,3,4,3,1,3,3,4,2,0,2,1,0,3,3,0,0,2,0,3,1,0,4,4,3,4,3,0,4,0,1,0,2,4,4,4,4,4,0,3,2,0,3,3,},
|
||||
{ 0,0,0,1,0,4,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,3,2,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2,},
|
||||
{ 0,2,0,3,0,4,0,4,0,1,3,3,3,0,4,0,2,1,2,1,1,1,2,0,3,1,1,0,1,0,3,1,0,0,3,3,2,0,1,1,0,0,0,0,0,1,0,2,0,2,2,0,3,1,0,0,1,0,1,1,0,1,2,0,3,0,0,0,0,1,0,0,3,3,4,3,1,0,1,0,3,0,2,},
|
||||
{ 0,0,0,3,0,5,0,0,0,0,1,0,2,0,3,1,0,1,3,0,0,0,2,0,0,0,1,0,0,0,1,1,0,0,4,0,0,0,2,3,0,1,4,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,3,0,0,0,0,0,3,},
|
||||
{ 0,2,0,5,0,5,0,1,0,2,4,3,3,2,5,1,3,2,3,3,3,0,4,1,2,0,3,0,4,0,2,2,1,1,5,3,0,0,1,4,2,3,2,0,3,3,3,2,0,2,4,1,1,2,0,1,1,0,3,1,0,1,3,1,2,3,0,2,0,0,0,1,3,5,4,4,4,0,3,0,0,1,3,},
|
||||
{ 0,4,0,5,0,4,0,4,0,4,5,4,3,3,4,3,3,3,4,3,4,4,5,3,4,5,4,2,4,2,3,4,3,1,4,4,1,3,5,4,4,5,5,4,4,5,5,5,2,3,3,1,4,3,1,3,3,0,3,3,1,4,3,4,4,4,0,3,0,4,0,3,3,4,4,5,0,0,4,3,0,4,5,},
|
||||
{ 0,4,0,4,0,3,0,3,0,3,4,4,4,3,3,2,4,3,4,3,4,3,5,3,4,3,2,1,4,2,4,4,3,1,3,4,2,4,5,5,3,4,5,4,1,5,4,3,0,3,2,2,3,2,1,3,1,0,3,3,3,5,3,3,3,5,4,4,2,3,3,4,3,3,3,2,1,0,3,2,1,4,3,},
|
||||
{ 0,4,0,5,0,4,0,3,0,3,5,5,3,2,4,3,4,0,5,4,4,1,4,4,4,3,3,3,4,3,5,5,2,3,3,4,1,2,5,5,3,5,5,2,3,5,5,4,0,3,2,0,3,3,1,1,5,1,4,1,0,4,3,2,3,5,0,4,0,3,0,5,4,3,4,3,0,0,4,1,0,4,4,},
|
||||
{ 1,3,0,4,0,2,0,2,0,2,5,5,3,3,3,3,3,0,4,2,3,4,4,4,3,4,0,0,3,4,5,4,3,3,3,3,2,5,5,4,5,5,5,4,3,5,5,5,1,3,1,0,1,0,0,3,2,0,4,2,0,5,2,3,2,4,1,3,0,3,0,4,5,4,5,4,3,0,4,2,0,5,4,},
|
||||
{ 0,3,0,4,0,5,0,3,0,3,4,4,3,2,3,2,3,3,3,3,3,2,4,3,3,2,2,0,3,3,3,3,3,1,3,3,3,0,4,4,3,4,4,1,1,4,4,2,0,3,1,0,1,1,0,4,1,0,2,3,1,3,3,1,3,4,0,3,0,1,0,3,1,3,0,0,1,0,2,0,0,4,4,},
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,},
|
||||
{ 0,3,0,3,0,2,0,3,0,1,5,4,3,3,3,1,4,2,1,2,3,4,4,2,4,4,5,0,3,1,4,3,4,0,4,3,3,3,2,3,2,5,3,4,3,2,2,3,0,0,3,0,2,1,0,1,2,0,0,0,0,2,1,1,3,1,0,2,0,4,0,3,4,4,4,5,2,0,2,0,0,1,3,},
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,1,1,0,0,0,4,2,1,1,0,1,0,3,2,0,0,3,1,1,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1,0,0,0,2,0,0,0,1,4,0,4,2,1,0,0,0,0,0,1,},
|
||||
{ 0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,3,1,0,0,0,2,0,2,1,0,0,1,2,1,0,1,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,3,1,0,0,0,0,0,1,0,0,2,1,0,0,0,0,0,0,0,0,2,},
|
||||
{ 0,4,0,4,0,4,0,3,0,4,4,3,4,2,4,3,2,0,4,4,4,3,5,3,5,3,3,2,4,2,4,3,4,3,1,4,0,2,3,4,4,4,3,3,3,4,4,4,3,4,1,3,4,3,2,1,2,1,3,3,3,4,4,3,3,5,0,4,0,3,0,4,3,3,3,2,1,0,3,0,0,3,3,},
|
||||
{ 0,4,0,3,0,3,0,3,0,3,5,5,3,3,3,3,4,3,4,3,3,3,4,4,4,3,3,3,3,4,3,5,3,3,1,3,2,4,5,5,5,5,4,3,4,5,5,3,2,2,3,3,3,3,2,3,3,1,2,3,2,4,3,3,3,4,0,4,0,2,0,4,3,2,2,1,2,0,3,0,0,4,1,},
|
||||
};
|
||||
|
||||
// category counters, each integer counts sequence in its category
|
||||
int[] relSample = new int[CATEGORIES_NUM];
|
||||
|
||||
// total sequence received
|
||||
int totalRel;
|
||||
|
||||
// The order of previous char
|
||||
int lastCharOrder;
|
||||
|
||||
// if last byte in current buffer is not the last byte of a character,
|
||||
// we need to know how many byte to skip in next buffer.
|
||||
int needToSkipCharNum;
|
||||
|
||||
// If this flag is set to true, detection is done and conclusion has
|
||||
// been made
|
||||
bool done;
|
||||
|
||||
/// <summary>
/// Creates the analyser with every counter and flag cleared (see <see cref="Reset"/>).
/// </summary>
/// <remarks>
/// The constructor is <c>protected</c>: this type declares abstract
/// <c>GetOrder</c> members, so it can only ever be constructed through a
/// derived class.
/// </remarks>
protected JapaneseContextAnalyser()
{
    Reset();
}
|
||||
|
||||
/// <summary>
/// Computes a confidence value from the sequences counted so far.
/// </summary>
/// <returns>
/// The share of counted sequences that were not placed in category 0, or
/// <c>DONT_KNOW</c> while <c>totalRel</c> has not exceeded
/// <c>MINIMUM_DATA_THRESHOLD</c>.
/// </returns>
public float GetConfidence()
{
    // Too few sequences have been seen to say anything useful.
    if (!(totalRel > MINIMUM_DATA_THRESHOLD))
    {
        return DONT_KNOW;
    }

    // Everything outside category 0 counts in favour of this analyser.
    int outsideCategoryZero = totalRel - relSample[0];
    return ((float)outsideCategoryZero) / totalRel;
}
|
||||
|
||||
/// <summary>
/// Feeds a slice of a byte buffer to the analyser and updates the
/// per-category sequence counters.
/// </summary>
/// <param name="buf">Buffer holding the encoded text.</param>
/// <param name="offset">Index of the first byte of the slice.</param>
/// <param name="len">Number of bytes in the slice.</param>
/// <remarks>
/// The buffer is byte oriented, so a character may straddle two calls.
/// When a character is cut off at the end of a slice, the number of bytes
/// still missing is stored in <c>needToSkipCharNum</c> and those bytes are
/// skipped at the start of the next call. The truncated character itself is
/// ignored: one character makes little difference to the statistics and
/// skipping keeps the logic simple.
/// </remarks>
public void HandleData(byte[] buf, int offset, int len)
{
    if (done)
    {
        return;
    }

    int max = offset + len;

    // Consume the pending skip exactly once. Previously the skip count was
    // never cleared, so every later buffer kept skipping its leading bytes.
    int i = offset + needToSkipCharNum;
    if (i > max)
    {
        // This slice is shorter than the remainder of the truncated
        // character; carry the rest of the skip over to the next call.
        needToSkipCharNum = i - max;
        return;
    }
    needToSkipCharNum = 0;

    while (i < max)
    {
        int charLen;
        int order = GetOrder(buf, i, out charLen);
        i += charLen;

        if (i > max)
        {
            // Character runs past the slice: remember how many bytes to
            // skip next time and break the sequence.
            needToSkipCharNum = i - max;
            lastCharOrder = -1;
        }
        else
        {
            if (order != -1 && lastCharOrder != -1)
            {
                totalRel++;
                if (totalRel > MAX_REL_THRESHOLD)
                {
                    // Enough data seen; stop analysing for good.
                    done = true;
                    break;
                }

                // Count this sequence in its category.
                relSample[jp2CharContext[lastCharOrder, order]]++;
            }

            lastCharOrder = order;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Feeds a single, already delimited character to the analyser.
/// </summary>
/// <param name="buf">Buffer holding the character.</param>
/// <param name="offset">Index of the character's first byte.</param>
/// <param name="charLen">Length of the character in bytes.</param>
public void HandleOneChar(byte[] buf, int offset, int charLen)
{
    // Latch the "done" flag once the sequence limit has been passed.
    done |= totalRel > MAX_REL_THRESHOLD;
    if (done)
    {
        return;
    }

    // Only two-byte characters are looked up; anything else breaks the sequence.
    int order = -1;
    if (charLen == 2)
    {
        order = GetOrder(buf, offset);
    }

    bool bothKnown = lastCharOrder != -1 && order != -1;
    if (bothKnown)
    {
        totalRel += 1;

        // Count this sequence in its category.
        relSample[jp2CharContext[lastCharOrder, order]] += 1;
    }

    lastCharOrder = order;
}
|
||||
|
||||
/// <summary>
/// Returns the analyser to its initial state: all category counters and the
/// total are zeroed, no bytes are pending to be skipped, there is no
/// previous character, and the "done" flag is cleared.
/// </summary>
public void Reset()
{
    // Scalar state is reset once, outside the loop, so it is cleared
    // regardless of how many categories exist.
    totalRel = 0;
    needToSkipCharNum = 0;
    lastCharOrder = -1;
    done = false;

    for (int i = 0; i < CATEGORIES_NUM; i++)
    {
        relSample[i] = 0;
    }
}
|
||||
|
||||
/// <summary>
/// Looks up the order of the character starting at <paramref name="offset"/>
/// and reports the character's byte length.
/// </summary>
/// <param name="buf">Buffer holding the encoded text.</param>
/// <param name="offset">Index of the character's first byte.</param>
/// <param name="charLen">Receives the number of bytes the caller should advance by.</param>
/// <returns>
/// An order used as an index into <c>jp2CharContext</c>, or -1 when the
/// character should not be counted.
/// </returns>
protected abstract int GetOrder(byte[] buf, int offset, out int charLen);
|
||||
|
||||
/// <summary>
/// Looks up the order of the two-byte character starting at
/// <paramref name="offset"/>.
/// </summary>
/// <param name="buf">Buffer holding the encoded text.</param>
/// <param name="offset">Index of the character's first byte.</param>
/// <returns>
/// An order used as an index into <c>jp2CharContext</c>, or -1 when the
/// character should not be counted.
/// </returns>
protected abstract int GetOrder(byte[] buf, int offset);
|
||||
|
||||
/// <summary>
/// Tells whether more sequences than <c>ENOUGH_REL_THRESHOLD</c> have been counted.
/// </summary>
/// <returns><c>true</c> once <c>totalRel</c> exceeds <c>ENOUGH_REL_THRESHOLD</c>.</returns>
public bool GotEnoughData()
{
    bool enoughSequences = totalRel > ENOUGH_REL_THRESHOLD;
    return enoughSequences;
}
|
||||
|
||||
}
|
||||
|
||||
/// <summary>
/// Context analyser for Shift_JIS encoded text. Only hiragana characters
/// (lead byte 0x82, trail byte 0x9F-0xF1) are given an order.
/// </summary>
public class SJISContextAnalyser : JapaneseContextAnalyser
{
    private const byte HIRAGANA_FIRST_BYTE = 0x82;

    /// <summary>
    /// Determines the byte length of the character at <paramref name="offset"/>
    /// and returns its hiragana order.
    /// </summary>
    /// <param name="buf">Buffer holding the encoded text.</param>
    /// <param name="offset">Index of the character's first byte.</param>
    /// <param name="charLen">Receives 2 for a double-byte lead byte, otherwise 1.</param>
    /// <returns>The hiragana order (0-82), or -1 if the character is not hiragana.</returns>
    protected override int GetOrder(byte[] buf, int offset, out int charLen)
    {
        // Find out the current character's byte length.
        byte lead = buf[offset];
        bool isDoubleByteLead = (lead >= 0x81 && lead <= 0x9F)
            || (lead >= 0xE0 && lead <= 0xFC);
        charLen = isDoubleByteLead ? 2 : 1;

        return HiraganaOrder(buf, offset);
    }

    /// <summary>
    /// Returns the hiragana order of the two-byte character at <paramref name="offset"/>.
    /// </summary>
    /// <param name="buf">Buffer holding the encoded text.</param>
    /// <param name="offset">Index of the character's first byte.</param>
    /// <returns>The hiragana order (0-82), or -1 if the character is not hiragana.</returns>
    protected override int GetOrder(byte[] buf, int offset)
    {
        return HiraganaOrder(buf, offset);
    }

    // Shared lookup: we are only interested in hiragana.
    private static int HiraganaOrder(byte[] buf, int offset)
    {
        // The trail byte may be missing when the lead byte is the very last
        // byte of the array; treat that as "not hiragana" instead of throwing.
        if (buf[offset] == HIRAGANA_FIRST_BYTE && offset + 1 < buf.Length)
        {
            byte low = buf[offset + 1];
            if (low >= 0x9F && low <= 0xF1)
            {
                return low - 0x9F;
            }
        }

        return -1;
    }
}
|
||||
|
||||
/// <summary>
/// Context analyser for EUC-JP encoded text. Only hiragana characters
/// (lead byte 0xA4, trail byte 0xA1-0xF3) are given an order.
/// </summary>
public class EUCJPContextAnalyser : JapaneseContextAnalyser
{
    private const byte HIRAGANA_FIRST_BYTE = 0xA4;

    // Single-shift bytes that introduce the supplementary EUC-JP code sets.
    private const byte SINGLE_SHIFT_2 = 0x8E;
    private const byte SINGLE_SHIFT_3 = 0x8F;

    /// <summary>
    /// Determines the byte length of the character at <paramref name="offset"/>
    /// and returns its hiragana order.
    /// </summary>
    /// <param name="buf">Buffer holding the encoded text.</param>
    /// <param name="offset">Index of the character's first byte.</param>
    /// <param name="charLen">
    /// Receives 3 for a character introduced by 0x8F, 2 for one introduced by
    /// 0x8E or a byte in 0xA1-0xFE, otherwise 1.
    /// </param>
    /// <returns>The hiragana order (0-82), or -1 if the character is not hiragana.</returns>
    protected override int GetOrder(byte[] buf, int offset, out int charLen)
    {
        byte high = buf[offset];

        // Find out the current character's byte length. The three-byte lead
        // byte is 0x8F; the previous test against 0xBF could never match
        // because 0xBF already falls inside the 0xA1-0xFE two-byte range.
        if (high == SINGLE_SHIFT_3)
        {
            charLen = 3;
        }
        else if (high == SINGLE_SHIFT_2 || (high >= 0xA1 && high <= 0xFE))
        {
            charLen = 2;
        }
        else
        {
            charLen = 1;
        }

        return HiraganaOrder(buf, offset);
    }

    /// <summary>
    /// Returns the hiragana order of the two-byte character at <paramref name="offset"/>.
    /// </summary>
    /// <param name="buf">Buffer holding the encoded text.</param>
    /// <param name="offset">Index of the character's first byte.</param>
    /// <returns>The hiragana order (0-82), or -1 if the character is not hiragana.</returns>
    protected override int GetOrder(byte[] buf, int offset)
    {
        return HiraganaOrder(buf, offset);
    }

    // Shared lookup: we are only interested in hiragana.
    private static int HiraganaOrder(byte[] buf, int offset)
    {
        // The trail byte may be missing when the lead byte is the very last
        // byte of the array; treat that as "not hiragana" instead of throwing.
        if (buf[offset] == HIRAGANA_FIRST_BYTE && offset + 1 < buf.Length)
        {
            byte low = buf[offset + 1];
            if (low >= 0xA1 && low <= 0xF3)
            {
                return low - 0xA1;
            }
        }

        return -1;
    }
}
|
||||
}
|
||||
|
|
@ -1,246 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
public abstract class BulgarianModel : SequenceModel
|
||||
{
|
||||
//Model Table:
|
||||
//total sequences: 100%
|
||||
//first 512 sequences: 96.9392%
|
||||
//first 1024 sequences:3.0618%
|
||||
//rest sequences: 0.2992%
|
||||
//negative sequences: 0.0020%
|
||||
private static byte[] BULGARIAN_LANG_MODEL = {
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
|
||||
3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,3,3,0,3,1,0,
|
||||
0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,2,3,3,3,3,3,3,3,3,0,3,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,2,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,1,3,2,3,3,3,3,3,3,3,3,0,3,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,1,3,3,3,3,2,2,2,1,1,2,0,1,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,2,3,2,2,3,3,1,1,2,3,3,2,3,3,3,3,2,1,2,0,2,0,3,0,0,
|
||||
0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,1,3,3,3,3,3,2,3,2,3,3,3,3,3,2,3,3,1,3,0,3,0,2,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,3,1,3,3,2,3,3,3,1,3,3,2,3,2,2,2,0,0,2,0,2,0,2,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,3,3,1,2,2,3,2,1,1,2,0,2,0,0,0,0,
|
||||
1,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,2,3,3,1,2,3,2,2,2,3,3,3,3,3,2,2,3,1,2,0,2,1,2,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,1,3,3,3,3,3,2,3,3,3,2,3,3,2,3,2,2,2,3,1,2,0,1,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,3,3,3,3,1,1,1,2,2,1,3,1,3,2,2,3,0,0,1,0,1,0,1,0,0,
|
||||
0,0,0,1,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,2,2,3,2,2,3,1,2,1,1,1,2,3,1,3,1,2,2,0,1,1,1,1,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,1,3,2,2,3,3,1,2,3,1,1,3,3,3,3,1,2,2,1,1,1,0,2,0,2,0,1,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,2,2,3,3,3,2,2,1,1,2,0,2,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,0,1,2,1,3,3,2,3,3,3,3,3,2,3,2,1,0,3,1,2,1,2,1,2,3,2,1,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,1,1,2,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,1,3,3,2,3,3,2,2,2,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,3,3,3,3,0,3,3,3,3,3,2,1,1,2,1,3,3,0,3,1,1,1,1,3,2,0,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,2,2,2,3,3,3,3,3,3,3,3,3,3,3,1,1,3,1,3,3,2,3,2,2,2,3,0,2,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,2,3,3,2,2,3,2,1,1,1,1,1,3,1,3,1,1,0,0,0,1,0,0,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,2,3,2,0,3,2,0,3,0,2,0,0,2,1,3,1,0,0,1,0,0,0,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,2,1,1,1,1,2,1,1,2,1,1,1,2,2,1,2,1,1,1,0,1,1,0,1,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,2,1,3,1,1,2,1,3,2,1,1,0,1,2,3,2,1,1,1,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,3,3,3,3,2,2,1,0,1,0,0,1,0,0,0,2,1,0,3,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,2,3,2,3,3,1,3,2,1,1,1,2,1,1,2,1,3,0,1,0,0,0,1,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,1,1,2,2,3,3,2,3,2,2,2,3,1,2,2,1,1,2,1,1,2,2,0,1,1,0,1,0,2,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,2,1,3,1,0,2,2,1,3,2,1,0,0,2,0,2,0,1,0,0,0,0,0,0,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,3,3,3,3,1,2,0,2,3,1,2,3,2,0,1,3,1,2,1,1,1,0,0,1,0,0,2,2,2,3,
|
||||
2,2,2,2,1,2,1,1,2,2,1,1,2,0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1,1,0,1,
|
||||
3,3,3,3,3,2,1,2,2,1,2,0,2,0,1,0,1,2,1,2,1,1,0,0,0,1,0,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
|
||||
3,3,2,3,3,1,1,3,1,0,3,2,1,0,0,0,1,2,0,2,0,1,0,0,0,1,0,1,2,1,2,2,
|
||||
1,1,1,1,1,1,1,2,2,2,1,1,1,1,1,1,1,0,1,2,1,1,1,0,0,0,0,0,1,1,0,0,
|
||||
3,1,0,1,0,2,3,2,2,2,3,2,2,2,2,2,1,0,2,1,2,1,1,1,0,1,2,1,2,2,2,1,
|
||||
1,1,2,2,2,2,1,2,1,1,0,1,2,1,2,2,2,1,1,1,0,1,1,1,1,2,0,1,0,0,0,0,
|
||||
2,3,2,3,3,0,0,2,1,0,2,1,0,0,0,0,2,3,0,2,0,0,0,0,0,1,0,0,2,0,1,2,
|
||||
2,1,2,1,2,2,1,1,1,2,1,1,1,0,1,2,2,1,1,1,1,1,0,1,1,1,0,0,1,2,0,0,
|
||||
3,3,2,2,3,0,2,3,1,1,2,0,0,0,1,0,0,2,0,2,0,0,0,1,0,1,0,1,2,0,2,2,
|
||||
1,1,1,1,2,1,0,1,2,2,2,1,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,1,1,0,0,
|
||||
2,3,2,3,3,0,0,3,0,1,1,0,1,0,0,0,2,2,1,2,0,0,0,0,0,0,0,0,2,0,1,2,
|
||||
2,2,1,1,1,1,1,2,2,2,1,0,2,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,
|
||||
3,3,3,3,2,2,2,2,2,0,2,1,1,1,1,2,1,2,1,1,0,2,0,1,0,1,0,0,2,0,1,2,
|
||||
1,1,1,1,1,1,1,2,2,1,1,0,2,0,1,0,2,0,0,1,1,1,0,0,2,0,0,0,1,1,0,0,
|
||||
2,3,3,3,3,1,0,0,0,0,0,0,0,0,0,0,2,0,0,1,1,0,0,0,0,0,0,1,2,0,1,2,
|
||||
2,2,2,1,1,2,1,1,2,2,2,1,2,0,1,1,1,1,1,1,0,1,1,1,1,0,0,1,1,1,0,0,
|
||||
2,3,3,3,3,0,2,2,0,2,1,0,0,0,1,1,1,2,0,2,0,0,0,3,0,0,0,0,2,0,2,2,
|
||||
1,1,1,2,1,2,1,1,2,2,2,1,2,0,1,1,1,0,1,1,1,1,0,2,1,0,0,0,1,1,0,0,
|
||||
2,3,3,3,3,0,2,1,0,0,2,0,0,0,0,0,1,2,0,2,0,0,0,0,0,0,0,0,2,0,1,2,
|
||||
1,1,1,2,1,1,1,1,2,2,2,0,1,0,1,1,1,0,0,1,1,1,0,0,1,0,0,0,0,1,0,0,
|
||||
3,3,2,2,3,0,1,0,1,0,0,0,0,0,0,0,1,1,0,3,0,0,0,0,0,0,0,0,1,0,2,2,
|
||||
1,1,1,1,1,2,1,1,2,2,1,2,2,1,0,1,1,1,1,1,0,1,0,0,1,0,0,0,1,1,0,0,
|
||||
3,1,0,1,0,2,2,2,2,3,2,1,1,1,2,3,0,0,1,0,2,1,1,0,1,1,1,1,2,1,1,1,
|
||||
1,2,2,1,2,1,2,2,1,1,0,1,2,1,2,2,1,1,1,0,0,1,1,1,2,1,0,1,0,0,0,0,
|
||||
2,1,0,1,0,3,1,2,2,2,2,1,2,2,1,1,1,0,2,1,2,2,1,1,2,1,1,0,2,1,1,1,
|
||||
1,2,2,2,2,2,2,2,1,2,0,1,1,0,2,1,1,1,1,1,0,0,1,1,1,1,0,1,0,0,0,0,
|
||||
2,1,1,1,1,2,2,2,2,1,2,2,2,1,2,2,1,1,2,1,2,3,2,2,1,1,1,1,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,2,2,3,2,0,1,2,0,1,2,1,1,0,1,0,1,2,1,2,0,0,0,1,1,0,0,0,1,0,0,2,
|
||||
1,1,0,0,1,1,0,1,1,1,1,0,2,0,1,1,1,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,
|
||||
2,0,0,0,0,1,2,2,2,2,2,2,2,1,2,1,1,1,1,1,1,1,0,1,1,1,1,1,2,1,1,1,
|
||||
1,2,2,2,2,1,1,2,1,2,1,1,1,0,2,1,2,1,1,1,0,2,1,1,1,1,0,1,0,0,0,0,
|
||||
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,
|
||||
1,1,0,1,0,1,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,2,2,3,2,0,0,0,0,1,0,0,0,0,0,0,1,1,0,2,0,0,0,0,0,0,0,0,1,0,1,2,
|
||||
1,1,1,1,1,1,0,0,2,2,2,2,2,0,1,1,0,1,1,1,1,1,0,0,1,0,0,0,1,1,0,1,
|
||||
2,3,1,2,1,0,1,1,0,2,2,2,0,0,1,0,0,1,1,1,1,0,0,0,0,0,0,0,1,0,1,2,
|
||||
1,1,1,1,2,1,1,1,1,1,1,1,1,0,1,1,0,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0,
|
||||
2,2,2,2,2,0,0,2,0,0,2,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,2,0,2,2,
|
||||
1,1,1,1,1,0,0,1,2,1,1,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,2,2,2,2,0,0,2,0,1,1,0,0,0,1,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,1,1,
|
||||
0,0,0,1,1,1,1,1,1,1,1,1,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,2,2,3,2,0,0,1,0,0,1,0,0,0,0,0,0,1,0,2,0,0,0,1,0,0,0,0,0,0,0,2,
|
||||
1,1,0,0,1,0,0,0,1,1,0,0,1,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,1,2,2,2,1,2,1,2,2,1,1,2,1,1,1,0,1,1,1,1,2,0,1,0,1,1,1,1,0,1,1,
|
||||
1,1,2,1,1,1,1,1,1,0,0,1,2,1,1,1,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,
|
||||
1,0,0,1,3,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,2,2,2,1,0,0,1,0,2,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,2,0,0,1,
|
||||
0,2,0,1,0,0,1,1,2,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,2,2,2,2,0,1,1,0,2,1,0,1,1,1,0,0,1,0,2,0,1,0,0,0,0,0,0,0,0,0,1,
|
||||
0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,2,2,2,2,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
0,1,0,1,1,1,0,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,0,1,0,0,1,2,1,1,1,1,1,1,2,2,1,0,0,1,0,1,0,0,0,0,1,1,1,1,0,0,0,
|
||||
1,1,2,1,1,1,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,2,1,2,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
|
||||
0,1,1,0,1,1,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,2,0,0,2,0,1,0,0,1,0,0,1,
|
||||
1,1,0,0,1,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,
|
||||
1,1,1,1,1,1,1,2,0,0,0,0,0,0,2,1,0,1,1,0,0,1,1,1,0,1,0,0,0,0,0,0,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,1,1,1,1,1,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
|
||||
};
|
||||
|
||||
/// <summary>
/// Initializes a Bulgarian model that pairs the supplied character mapping
/// with the shared <c>BULGARIAN_LANG_MODEL</c> table.
/// </summary>
/// <param name="charToOrderMap">Byte-to-order mapping for the concrete charset.</param>
/// <param name="name">Charset name passed on to the base model.</param>
/// <remarks>
/// The constructor is <c>protected</c> because this class is abstract and
/// can only be constructed through a derived class. The 0.969392f argument
/// matches the "first 512 sequences" ratio noted above the model table; the
/// meaning of the <c>false</c> argument is defined by <c>SequenceModel</c>.
/// </remarks>
protected BulgarianModel(byte[] charToOrderMap, string name)
    : base(charToOrderMap, BULGARIAN_LANG_MODEL, 0.969392f, false, name)
{
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Bulgarian sequence model for text encoded as ISO-8859-5.
/// </summary>
public class Latin5BulgarianModel : BulgarianModel
{
    // Special values used in the character mapping table:
    //   255: control characters that usually do not exist in any text
    //   254: carriage return / line feed
    //   253: symbols (punctuation) that do not belong to a word
    //   252: digits 0 - 9
    //
    // Character Mapping Table:
    // This table is modified based on the windows-1251 Bulgarian
    // char-to-order map, so only numbers < 64 are sure to be valid.
    // The array reference is readonly so the shared table cannot be swapped out.
    private static readonly byte[] LATIN5_CHAR_TO_ORDER_MAP = {
        255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
        253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
        252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
        253, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, //40
        110,186,108, 91, 74,119, 84, 96,111,187,115,253,253,253,253,253, //50
        253, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, //60
        116,195, 85, 93, 97,113,196,197,198,199,200,253,253,253,253,253, //70
        194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209, //80
        210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225, //90
         81,226,227,228,229,230,105,231,232,233,234,235,236, 45,237,238, //a0
         31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, //b0
         39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,239, 67,240, 60, 56, //c0
          1, 18,  9, 20, 11,  3, 23, 15,  2, 26, 12, 10, 14,  6,  4, 13, //d0
          7,  8,  5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,241, 42, 16, //e0
         62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, //f0
    };

    /// <summary>
    /// Creates the model with the ISO-8859-5 character mapping.
    /// </summary>
    public Latin5BulgarianModel() : base(LATIN5_CHAR_TO_ORDER_MAP, "ISO-8859-5")
    {
    }
}
|
||||
|
||||
/// <summary>
/// Bulgarian sequence model for text encoded as windows-1251.
/// </summary>
public class Win1251BulgarianModel : BulgarianModel
{
    // Maps each byte value (row = high nibble, column = low nibble) to its order.
    // The array reference is readonly so the shared table cannot be swapped out.
    private static readonly byte[] WIN1251__CHAR_TO_ORDER_MAP = {
        255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
        255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
        253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
        252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
        253, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, //40
        110,186,108, 91, 74,119, 84, 96,111,187,115,253,253,253,253,253, //50
        253, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, //60
        116,195, 85, 93, 97,113,196,197,198,199,200,253,253,253,253,253, //70
        206,207,208,209,210,211,212,213,120,214,215,216,217,218,219,220, //80
        221, 78, 64, 83,121, 98,117,105,222,223,224,225,226,227,228,229, //90
         88,230,231,232,233,122, 89,106,234,235,236,237,238, 45,239,240, //a0
         73, 80,118,114,241,242,243,244,245, 62, 58,246,247,248,249,250, //b0
         31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, //c0
         39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,251, 67,252, 60, 56, //d0
          1, 18,  9, 20, 11,  3, 23, 15,  2, 26, 12, 10, 14,  6,  4, 13, //e0
          7,  8,  5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,253, 42, 16, //f0
    };

    /// <summary>
    /// Creates the model with the windows-1251 character mapping.
    /// </summary>
    public Win1251BulgarianModel() : base(WIN1251__CHAR_TO_ORDER_MAP, "windows-1251")
    {
    }
}
|
||||
|
||||
}
|
|
@ -1,345 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
public abstract class CyrillicModel : SequenceModel
|
||||
{
|
||||
// Model Table:
|
||||
// total sequences: 100%
|
||||
// first 512 sequences: 97.6601%
|
||||
// first 1024 sequences: 2.3389%
|
||||
// rest sequences: 0.1237%
|
||||
// negative sequences: 0.0009%
|
||||
protected readonly static byte[] RUSSIAN_LANG_MODEL = {
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
|
||||
3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,2,3,2,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,2,2,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,2,3,3,1,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,2,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,3,3,3,3,3,3,3,3,3,3,2,1,
|
||||
0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,3,3,2,1,
|
||||
0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,2,2,2,3,1,3,3,1,3,3,3,3,2,2,3,0,2,2,2,3,3,2,1,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,2,3,3,3,3,3,2,2,3,2,3,3,3,2,1,2,2,0,1,2,2,2,2,2,2,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,0,2,2,3,3,2,1,2,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,2,3,3,1,2,3,2,2,3,2,3,3,3,3,2,2,3,0,3,2,2,3,1,1,1,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,2,3,3,3,3,2,2,2,0,3,3,3,2,2,2,2,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,2,3,2,2,0,1,3,2,1,2,2,1,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,2,1,1,3,0,1,1,1,1,2,1,1,0,2,2,2,1,2,0,1,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,2,3,3,2,2,2,2,1,3,2,3,2,3,2,1,2,2,0,1,1,2,1,2,1,2,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,2,2,2,2,0,2,2,2,2,3,1,1,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
3,2,3,2,2,3,3,3,3,3,3,3,3,3,1,3,2,0,0,3,3,3,3,2,3,3,3,3,2,3,2,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,3,3,3,3,3,2,2,3,3,0,2,1,0,3,2,3,2,3,0,0,1,2,0,0,1,0,1,2,1,1,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,0,3,0,2,3,3,3,3,2,3,3,3,3,1,2,2,0,0,2,3,2,2,2,3,2,3,2,2,3,0,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,2,3,0,2,3,2,3,0,1,2,3,3,2,0,2,3,0,0,2,3,2,2,0,1,3,1,3,2,2,1,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,1,3,0,2,3,3,3,3,3,3,3,3,2,1,3,2,0,0,2,2,3,3,3,2,3,3,0,2,2,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,2,2,3,3,2,2,2,3,3,0,0,1,1,1,1,1,2,0,0,1,1,1,1,0,1,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,2,2,3,3,3,3,3,3,3,0,3,2,3,3,2,3,2,0,2,1,0,1,1,0,1,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,2,3,3,3,2,2,2,2,3,1,3,2,3,1,1,2,1,0,2,2,2,2,1,3,1,0,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
2,2,3,3,3,3,3,1,2,2,1,3,1,0,3,0,0,3,0,0,0,1,1,0,1,2,1,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,2,2,1,1,3,3,3,2,2,1,2,2,3,1,1,2,0,0,2,2,1,3,0,0,2,1,1,2,1,1,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,2,3,3,3,3,1,2,2,2,1,2,1,3,3,1,1,2,1,2,1,2,2,0,2,0,0,1,1,0,1,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,3,3,3,3,3,2,1,3,2,2,3,2,0,3,2,0,3,0,1,0,1,1,0,0,1,1,1,1,0,1,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,2,3,3,3,2,2,2,3,3,1,2,1,2,1,0,1,0,1,1,0,1,0,0,2,1,1,1,0,1,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
3,1,1,2,1,2,3,3,2,2,1,2,2,3,0,2,1,0,0,2,2,3,2,1,2,2,2,2,2,3,1,0,
|
||||
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,1,1,0,1,1,2,2,1,1,3,0,0,1,3,1,1,1,0,0,0,1,0,1,1,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,1,3,3,3,2,0,0,0,2,1,0,1,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,0,1,0,0,2,3,2,2,2,1,2,2,2,1,2,1,0,0,1,1,1,0,2,0,1,1,1,0,0,1,1,
|
||||
1,0,0,0,0,0,1,2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
|
||||
2,3,3,3,3,0,0,0,0,1,0,0,0,0,3,0,1,2,1,0,0,0,0,0,0,0,1,1,0,0,1,1,
|
||||
1,0,1,0,1,2,0,0,1,1,2,1,0,1,1,1,1,0,1,1,1,1,0,1,0,0,1,0,0,1,1,0,
|
||||
2,2,3,2,2,2,3,1,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,0,1,0,1,1,1,0,2,1,
|
||||
1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,1,1,1,0,1,1,0,
|
||||
3,3,3,2,2,2,2,3,2,2,1,1,2,2,2,2,1,1,3,1,2,1,2,0,0,1,1,0,1,0,2,1,
|
||||
1,1,1,1,1,2,1,0,1,1,1,1,0,1,0,0,1,1,0,0,1,0,1,0,0,1,0,0,0,1,1,0,
|
||||
2,0,0,1,0,3,2,2,2,2,1,2,1,2,1,2,0,0,0,2,1,2,2,1,1,2,2,0,1,1,0,2,
|
||||
1,1,1,1,1,0,1,1,1,2,1,1,1,2,1,0,1,2,1,1,1,1,0,1,1,1,0,0,1,0,0,1,
|
||||
1,3,2,2,2,1,1,1,2,3,0,0,0,0,2,0,2,2,1,0,0,0,0,0,0,1,0,0,0,0,1,1,
|
||||
1,0,1,1,0,1,0,1,1,0,1,1,0,2,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,
|
||||
2,3,2,3,2,1,2,2,2,2,1,0,0,0,2,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,2,1,
|
||||
1,1,2,1,0,2,0,0,1,0,1,0,0,1,0,0,1,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,
|
||||
3,0,0,1,0,2,2,2,3,2,2,2,2,2,2,2,0,0,0,2,1,2,1,1,1,2,2,0,0,0,1,2,
|
||||
1,1,1,1,1,0,1,2,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0,0,1,
|
||||
2,3,2,3,3,2,0,1,1,1,0,0,1,0,2,0,1,1,3,1,0,0,0,0,0,0,0,1,0,0,2,1,
|
||||
1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1,1,1,0,0,1,1,0,1,0,0,0,0,0,0,1,0,
|
||||
2,3,3,3,3,1,2,2,2,2,0,1,1,0,2,1,1,1,2,1,0,1,1,0,0,1,0,1,0,0,2,0,
|
||||
0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,3,3,3,2,0,0,1,1,2,2,1,0,0,2,0,1,1,3,0,0,1,0,0,0,0,0,1,0,1,2,1,
|
||||
1,1,2,0,1,1,1,0,1,0,1,1,0,1,0,1,1,1,1,0,1,0,0,0,0,0,0,1,0,1,1,0,
|
||||
1,3,2,3,2,1,0,0,2,2,2,0,1,0,2,0,1,1,1,0,1,0,0,0,3,0,1,1,0,0,2,1,
|
||||
1,1,1,0,1,1,0,0,0,0,1,1,0,1,0,0,2,1,1,0,1,0,0,0,1,0,1,0,0,1,1,0,
|
||||
3,1,2,1,1,2,2,2,2,2,2,1,2,2,1,1,0,0,0,2,2,2,0,0,0,1,2,1,0,1,0,1,
|
||||
2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,2,1,1,1,0,1,0,1,1,0,1,1,1,0,0,1,
|
||||
3,0,0,0,0,2,0,1,1,1,1,1,1,1,0,1,0,0,0,1,1,1,0,1,0,1,1,0,0,1,0,1,
|
||||
1,1,0,0,1,0,0,0,1,0,1,1,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,
|
||||
1,3,3,2,2,0,0,0,2,2,0,0,0,1,2,0,1,1,2,0,0,0,0,0,0,0,0,1,0,0,2,1,
|
||||
0,1,1,0,0,1,1,0,0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,
|
||||
2,3,2,3,2,0,0,0,0,1,1,0,0,0,2,0,2,0,2,0,0,0,0,0,1,0,0,1,0,0,1,1,
|
||||
1,1,2,0,1,2,1,0,1,1,2,1,1,1,1,1,2,1,1,0,1,0,0,1,1,1,1,1,0,1,1,0,
|
||||
1,3,2,2,2,1,0,0,2,2,1,0,1,2,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,
|
||||
0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,1,0,2,3,1,2,2,2,2,2,2,1,1,0,0,0,1,0,1,0,2,1,1,1,0,0,0,0,1,
|
||||
1,1,0,1,1,0,1,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
|
||||
2,0,2,0,0,1,0,3,2,1,2,1,2,2,0,1,0,0,0,2,1,0,0,2,1,1,1,1,0,2,0,2,
|
||||
2,1,1,1,1,1,1,1,1,1,1,1,1,2,1,0,1,1,1,1,0,0,0,1,1,1,1,0,1,0,0,1,
|
||||
1,2,2,2,2,1,0,0,1,0,0,0,0,0,2,0,1,1,1,1,0,0,0,0,1,0,1,2,0,0,2,0,
|
||||
1,0,1,1,1,2,1,0,1,0,1,1,0,0,1,0,1,1,1,0,1,0,0,0,1,0,0,1,0,1,1,0,
|
||||
2,1,2,2,2,0,3,0,1,1,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
0,0,0,1,1,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,
|
||||
1,2,2,3,2,2,0,0,1,1,2,0,1,2,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,
|
||||
0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,
|
||||
2,2,1,1,2,1,2,2,2,2,2,1,2,2,0,1,0,0,0,1,2,2,2,1,2,1,1,1,1,1,2,1,
|
||||
1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,1,1,1,0,0,0,0,1,1,1,0,1,1,0,0,1,
|
||||
1,2,2,2,2,0,1,0,2,2,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,
|
||||
0,0,1,0,0,1,0,0,0,0,1,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,2,2,2,2,0,0,0,2,2,2,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,
|
||||
0,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,2,2,2,2,0,0,0,0,1,0,0,1,1,2,0,0,0,0,1,0,1,0,0,1,0,0,2,0,0,0,1,
|
||||
0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,2,2,2,1,1,2,0,2,1,1,1,1,0,2,2,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,
|
||||
0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,2,1,2,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,
|
||||
0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,
|
||||
1,0,0,0,0,2,0,1,2,1,0,1,1,1,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,1,
|
||||
0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
1,1,1,0,1,0,1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,
|
||||
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
1,1,0,1,1,0,1,0,1,0,0,0,0,1,1,0,1,1,0,0,0,0,0,1,0,1,1,0,1,0,0,0,
|
||||
0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
|
||||
};
|
||||
|
||||
public CyrillicModel(byte[] charToOrderMap, string name)
|
||||
: base(charToOrderMap, RUSSIAN_LANG_MODEL, 0.976601f, false, name)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
public class Koi8rModel : CyrillicModel
|
||||
{
|
||||
private readonly static byte[] KOI8R_CHAR_TO_ORDER_MAP = {
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
|
||||
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
|
||||
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
|
||||
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
|
||||
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
|
||||
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
|
||||
191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, //80
|
||||
207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, //90
|
||||
223,224,225, 68,226,227,228,229,230,231,232,233,234,235,236,237, //a0
|
||||
238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253, //b0
|
||||
27, 3, 21, 28, 13, 2, 39, 19, 26, 4, 23, 11, 8, 12, 5, 1, //c0
|
||||
15, 16, 9, 7, 6, 14, 24, 10, 17, 18, 20, 25, 30, 29, 22, 54, //d0
|
||||
59, 37, 44, 58, 41, 48, 53, 46, 55, 42, 60, 36, 49, 38, 31, 34, //e0
|
||||
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, //f0
|
||||
};
|
||||
|
||||
public Koi8rModel() : base(KOI8R_CHAR_TO_ORDER_MAP, "KOI8-R")
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
public class Win1251Model : CyrillicModel
|
||||
{
|
||||
private readonly static byte[] WIN1251_CHAR_TO_ORDER_MAP = {
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
|
||||
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
|
||||
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
|
||||
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
|
||||
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
|
||||
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
|
||||
191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,
|
||||
207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,
|
||||
223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,
|
||||
239,240,241,242,243,244,245,246, 68,247,248,249,250,251,252,253,
|
||||
37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35,
|
||||
45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43,
|
||||
3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15,
|
||||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
|
||||
};
|
||||
|
||||
public Win1251Model() : base(WIN1251_CHAR_TO_ORDER_MAP, "windows-1251")
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
public class Latin5Model : CyrillicModel
|
||||
{
|
||||
private readonly static byte[] LATIN5_CHAR_TO_ORDER_MAP = {
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
|
||||
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
|
||||
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
|
||||
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
|
||||
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
|
||||
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
|
||||
191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,
|
||||
207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,
|
||||
223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,
|
||||
37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35,
|
||||
45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43,
|
||||
3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15,
|
||||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
|
||||
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
|
||||
};
|
||||
|
||||
public Latin5Model() : base(LATIN5_CHAR_TO_ORDER_MAP, "ISO-8859-5")
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
public class MacCyrillicModel : CyrillicModel
|
||||
{
|
||||
private readonly static byte[] MACCYRILLIC_CHAR_TO_ORDER_MAP = {
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
|
||||
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
|
||||
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
|
||||
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
|
||||
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
|
||||
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
|
||||
37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35,
|
||||
45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43,
|
||||
191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,
|
||||
207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,
|
||||
223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,
|
||||
239,240,241,242,243,244,245,246,247,248,249,250,251,252, 68, 16,
|
||||
3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15,
|
||||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
|
||||
};
|
||||
|
||||
public MacCyrillicModel() : base(MACCYRILLIC_CHAR_TO_ORDER_MAP,
|
||||
"x-mac-cyrillic")
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
public class Ibm855Model : CyrillicModel
|
||||
{
|
||||
private readonly static byte[] IBM855_BYTE_TO_ORDER_MAP = {
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
|
||||
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
|
||||
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
|
||||
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
|
||||
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
|
||||
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
|
||||
191,192,193,194, 68,195,196,197,198,199,200,201,202,203,204,205,
|
||||
206,207,208,209,210,211,212,213,214,215,216,217, 27, 59, 54, 70,
|
||||
3, 37, 21, 44, 28, 58, 13, 41, 2, 48, 39, 53, 19, 46,218,219,
|
||||
220,221,222,223,224, 26, 55, 4, 42,225,226,227,228, 23, 60,229,
|
||||
230,231,232,233,234,235, 11, 36,236,237,238,239,240,241,242,243,
|
||||
8, 49, 12, 38, 5, 31, 1, 34, 15,244,245,246,247, 35, 16,248,
|
||||
43, 9, 45, 7, 32, 6, 40, 14, 52, 24, 56, 10, 33, 17, 61,249,
|
||||
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
|
||||
};
|
||||
|
||||
public Ibm855Model() : base(IBM855_BYTE_TO_ORDER_MAP, "IBM855")
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
public class Ibm866Model : CyrillicModel
|
||||
{
|
||||
private readonly static byte[] IBM866_CHAR_TO_ORDER_MAP = {
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
|
||||
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
|
||||
253,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40
|
||||
155,156,157,158,159,160,161,162,163,164,165,253,253,253,253,253, //50
|
||||
253, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60
|
||||
67,179, 78, 73,180,181, 79,182,183,184,185,253,253,253,253,253, //70
|
||||
37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35,
|
||||
45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43,
|
||||
3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15,
|
||||
191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,
|
||||
207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,
|
||||
223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,
|
||||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
|
||||
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
|
||||
};
|
||||
|
||||
public Ibm866Model() : base(IBM866_CHAR_TO_ORDER_MAP, "IBM866")
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
|
@ -1,244 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
public abstract class GreekModel : SequenceModel
|
||||
{
|
||||
// Model Table:
|
||||
// total sequences: 100%
|
||||
// first 512 sequences: 98.2851%
|
||||
// first 1024 sequences:1.7001%
|
||||
// rest sequences: 0.0359%
|
||||
// negative sequences: 0.0148%
|
||||
private readonly static byte[] GREEK_LANG_MODEL = {
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0,
|
||||
3,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,0,3,3,0,3,2,3,3,0,3,2,3,3,3,0,0,3,0,3,0,3,3,2,0,0,0,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,
|
||||
0,2,3,2,2,3,3,3,3,3,3,3,3,0,3,3,3,3,0,2,3,3,0,3,3,3,3,2,3,3,3,0,
|
||||
2,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,2,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,2,1,3,3,3,3,2,3,3,2,3,3,2,0,
|
||||
0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,0,3,3,3,3,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,2,3,3,0,
|
||||
2,0,1,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,2,3,0,0,0,0,3,3,0,3,1,3,3,3,0,3,3,0,3,3,3,3,0,0,0,0,
|
||||
2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,0,3,0,3,3,3,3,3,0,3,2,2,2,3,0,2,3,3,3,3,3,2,3,3,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,3,2,2,2,3,3,3,3,0,3,1,3,3,3,3,2,3,3,3,3,3,3,3,2,2,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,2,0,3,0,0,0,3,3,2,3,3,3,3,3,0,0,3,2,3,0,2,3,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,0,3,3,3,3,0,0,3,3,0,2,3,0,3,0,3,3,3,0,0,3,0,3,0,2,2,3,3,0,0,
|
||||
0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,2,0,3,2,3,3,3,3,0,3,3,3,3,3,0,3,3,2,3,2,3,3,2,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,2,3,2,3,3,3,3,3,3,0,2,3,2,3,2,2,2,3,2,3,3,2,3,0,2,2,2,3,0,
|
||||
2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,3,0,0,0,3,3,3,2,3,3,0,0,3,0,3,0,0,0,3,2,0,3,0,3,0,0,2,0,2,0,
|
||||
0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,0,3,3,3,3,3,3,0,3,3,0,3,0,0,0,3,3,0,3,3,3,0,0,1,2,3,0,
|
||||
3,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,2,0,0,3,2,2,3,3,0,3,3,3,3,3,2,1,3,0,3,2,3,3,2,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,3,3,0,2,3,3,3,3,3,3,0,0,3,0,3,0,0,0,3,3,0,3,2,3,0,0,3,3,3,0,
|
||||
3,0,0,0,2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,0,3,3,3,3,3,3,0,0,3,0,3,0,0,0,3,2,0,3,2,3,0,0,3,2,3,0,
|
||||
2,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,3,1,2,2,3,3,3,3,3,3,0,2,3,0,3,0,0,0,3,3,0,3,0,2,0,0,2,3,1,0,
|
||||
2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,0,3,3,3,3,0,3,0,3,3,2,3,0,3,3,3,3,3,3,0,3,3,3,0,2,3,0,0,3,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,0,3,3,3,0,0,3,0,0,0,3,3,0,3,0,2,3,3,0,0,3,0,3,0,3,3,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,3,0,0,0,3,3,3,3,3,3,0,0,3,0,2,0,0,0,3,3,0,3,0,3,0,0,2,0,2,0,
|
||||
0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,3,0,3,0,2,0,3,2,0,3,2,3,2,3,0,0,3,2,3,2,3,3,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,3,0,0,2,3,3,3,3,3,0,0,0,3,0,2,1,0,0,3,2,2,2,0,3,0,0,2,2,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,0,3,3,3,2,0,3,0,3,0,3,3,0,2,1,2,3,3,0,0,3,0,3,0,3,3,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,2,3,3,3,0,3,3,3,3,3,3,0,2,3,0,3,0,0,0,2,1,0,2,2,3,0,0,2,2,2,0,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,3,0,0,2,3,3,3,2,3,0,0,1,3,0,2,0,0,0,0,3,0,1,0,2,0,0,1,1,1,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,3,1,0,3,0,0,0,3,2,0,3,2,3,3,3,0,0,3,0,3,2,2,2,1,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,0,3,3,3,0,0,3,0,0,0,0,2,0,2,3,3,2,2,2,2,3,0,2,0,2,2,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,3,3,3,2,0,0,0,0,0,0,2,3,0,2,0,2,3,2,0,0,3,0,3,0,3,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,3,2,3,3,2,2,3,0,2,0,3,0,0,0,2,0,0,0,0,1,2,0,2,0,2,0,
|
||||
0,2,0,2,0,2,2,0,0,1,0,2,2,2,0,2,2,2,0,2,2,2,0,0,2,0,0,1,0,0,0,0,
|
||||
0,2,0,3,3,2,0,0,0,0,0,0,1,3,0,2,0,2,2,2,0,0,2,0,3,0,0,2,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,0,2,3,2,0,2,2,0,2,0,2,2,0,2,0,2,2,2,0,0,0,0,0,0,2,3,0,0,0,2,
|
||||
0,1,2,0,0,0,0,2,2,0,0,0,2,1,0,2,2,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,
|
||||
0,0,2,1,0,2,3,2,2,3,2,3,2,0,0,3,3,3,0,0,3,2,0,0,0,1,1,0,2,0,2,2,
|
||||
0,2,0,2,0,2,2,0,0,2,0,2,2,2,0,2,2,2,2,0,0,2,0,0,0,2,0,1,0,0,0,0,
|
||||
0,3,0,3,3,2,2,0,3,0,0,0,2,2,0,2,2,2,1,2,0,0,1,2,2,0,0,3,0,0,0,2,
|
||||
0,1,2,0,0,0,1,2,0,0,0,0,0,0,0,2,2,0,1,0,0,2,0,0,0,2,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,2,3,3,2,2,0,0,0,2,0,2,3,3,0,2,0,0,0,0,0,0,2,2,2,0,2,2,0,2,0,2,
|
||||
0,2,2,0,0,2,2,2,2,1,0,0,2,2,0,2,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,
|
||||
0,2,0,3,2,3,0,0,0,3,0,0,2,2,0,2,0,2,2,2,0,0,2,0,0,0,0,0,0,0,0,2,
|
||||
0,0,2,2,0,0,2,2,2,0,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,2,0,0,3,2,0,2,2,2,2,2,0,0,0,2,0,0,0,0,2,0,1,0,0,2,0,1,0,0,0,
|
||||
0,2,2,2,0,2,2,0,1,2,0,2,2,2,0,2,2,2,2,1,2,2,0,0,2,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
0,2,0,2,0,2,2,0,0,0,0,1,2,1,0,0,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,3,2,3,0,0,2,0,0,0,2,2,0,2,0,0,0,1,0,0,2,0,2,0,2,2,0,0,0,0,
|
||||
0,0,2,0,0,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,
|
||||
0,2,2,3,2,2,0,0,0,0,0,0,1,3,0,2,0,2,2,0,0,0,1,0,2,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,2,0,2,0,3,2,0,2,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
0,0,2,0,0,0,0,1,1,0,0,2,1,2,0,2,2,0,1,0,0,1,0,0,0,2,0,0,0,0,0,0,
|
||||
0,3,0,2,2,2,0,0,2,0,0,0,2,0,0,0,2,3,0,2,0,0,0,0,0,0,2,2,0,0,0,2,
|
||||
0,1,2,0,0,0,1,2,2,1,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,2,1,2,0,2,2,0,2,0,0,2,0,0,0,0,1,2,1,0,2,1,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,2,0,0,0,3,1,2,2,0,2,0,0,0,0,2,0,0,0,2,0,0,3,0,0,0,0,2,2,2,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,2,1,0,2,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,2,
|
||||
0,2,2,0,0,2,2,2,2,2,0,1,2,0,0,0,2,2,0,1,0,2,0,0,2,2,0,0,0,0,0,0,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0,0,0,0,0,2,0,2,0,0,0,0,2,
|
||||
0,1,2,0,0,0,0,2,2,1,0,1,0,1,0,2,2,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,
|
||||
0,2,0,1,2,0,0,0,0,0,0,0,0,0,0,2,0,0,2,2,0,0,0,0,1,0,0,0,0,0,0,2,
|
||||
0,2,2,0,0,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,
|
||||
0,2,2,2,2,0,0,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,1,
|
||||
0,0,2,0,0,0,0,1,2,0,0,0,0,0,0,2,2,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,
|
||||
0,2,0,2,2,2,0,0,2,0,0,0,0,0,0,0,2,2,2,0,0,0,2,0,0,0,0,0,0,0,0,2,
|
||||
0,0,1,0,0,0,0,2,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,2,
|
||||
0,0,2,0,0,0,0,2,2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,2,0,2,2,1,0,0,0,0,0,0,2,0,0,2,0,2,2,2,0,0,0,0,0,0,2,0,0,0,0,2,
|
||||
0,0,2,0,0,2,0,2,2,0,0,0,0,2,0,2,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,
|
||||
0,0,3,0,0,0,2,2,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,
|
||||
0,2,2,2,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,
|
||||
0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
0,2,0,0,0,2,0,0,0,0,0,1,0,0,0,0,2,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,2,0,0,0,
|
||||
0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,2,0,2,0,0,0,
|
||||
0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
public GreekModel(byte[] charToOrderMap, string name)
|
||||
: base(charToOrderMap, GREEK_LANG_MODEL, 0.982851f, false, name)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
public class Latin7Model : GreekModel
|
||||
{
|
||||
/****************************************************************
|
||||
255: Control characters that usually does not exist in any text
|
||||
254: Carriage/Return
|
||||
253: symbol (punctuation) that does not belong to word
|
||||
252: 0 - 9
|
||||
*****************************************************************/
|
||||
//Character Mapping Table:
|
||||
private readonly static byte[] LATIN7_CHAR_TO_ORDER_MAP = {
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
|
||||
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
|
||||
253, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, //40
|
||||
79,118,105, 83, 67,114,119, 95, 99,109,188,253,253,253,253,253, //50
|
||||
253, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, //60
|
||||
78,115, 65, 66, 58, 76,106,103, 87,107,112,253,253,253,253,253, //70
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //80
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //90
|
||||
+253,233, 90,253,253,253,253,253,253,253,253,253,253, 74,253,253, //a0
|
||||
253,253,253,253,247,248, 61, 36, 46, 71, 73,253, 54,253,108,123, //b0
|
||||
110, 31, 51, 43, 41, 34, 91, 40, 52, 47, 44, 53, 38, 49, 59, 39, //c0
|
||||
35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, //d0
|
||||
124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, //e0
|
||||
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, //f0
|
||||
};
|
||||
|
||||
public Latin7Model() : base(LATIN7_CHAR_TO_ORDER_MAP, "ISO-8859-7")
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
public class Win1253Model : GreekModel
|
||||
{
|
||||
private readonly static byte[] WIN1253__CHAR_TO_ORDER_MAP = {
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
|
||||
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
|
||||
253, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, //40
|
||||
79,118,105, 83, 67,114,119, 95, 99,109,188,253,253,253,253,253, //50
|
||||
253, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, //60
|
||||
78,115, 65, 66, 58, 76,106,103, 87,107,112,253,253,253,253,253, //70
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //80
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //90
|
||||
+253,233, 61,253,253,253,253,253,253,253,253,253,253, 74,253,253, //a0
|
||||
253,253,253,253,247,253,253, 36, 46, 71, 73,253, 54,253,108,123, //b0
|
||||
110, 31, 51, 43, 41, 34, 91, 40, 52, 47, 44, 53, 38, 49, 59, 39, //c0
|
||||
35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, //d0
|
||||
124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, //e0
|
||||
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, //f0
|
||||
};
|
||||
|
||||
public Win1253Model() : base(WIN1253__CHAR_TO_ORDER_MAP, "windows-1253")
|
||||
{
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,220 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
public abstract class HebrewModel : SequenceModel
|
||||
{
|
||||
//Model Table:
|
||||
//total sequences: 100%
|
||||
//first 512 sequences: 98.4004%
|
||||
//first 1024 sequences: 1.5981%
|
||||
//rest sequences: 0.087%
|
||||
//negative sequences: 0.0015%
|
||||
private readonly static byte[] HEBREW_LANG_MODEL = {
|
||||
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
|
||||
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,
|
||||
1,2,1,2,1,2,0,0,2,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,
|
||||
1,2,1,3,1,1,0,0,2,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,0,1,2,2,1,3,
|
||||
1,2,1,1,2,2,0,0,2,2,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,2,2,3,2,
|
||||
1,2,1,2,2,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,2,2,3,2,2,2,1,2,2,2,2,
|
||||
1,2,1,1,2,2,0,1,2,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,2,2,2,2,
|
||||
0,2,0,2,2,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,2,2,
|
||||
0,2,1,2,2,2,0,0,2,1,0,0,0,0,1,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,2,1,2,3,2,2,2,
|
||||
1,2,1,2,2,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,
|
||||
3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,1,0,2,0,2,
|
||||
0,2,1,2,2,2,0,0,1,2,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,2,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,2,3,2,1,2,1,1,1,
|
||||
0,1,1,1,1,1,3,0,1,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,0,1,0,0,1,0,0,0,0,
|
||||
0,0,1,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,
|
||||
0,2,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,2,3,3,3,2,1,2,3,3,2,3,3,3,3,2,3,2,1,2,0,2,1,2,
|
||||
0,2,0,2,2,2,0,0,1,2,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,2,3,3,3,1,2,2,3,3,2,3,2,3,2,2,3,1,2,2,0,2,2,2,
|
||||
0,2,1,2,2,2,0,0,1,2,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,2,2,3,3,3,3,1,3,2,2,2,
|
||||
0,2,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,3,2,2,2,1,2,2,0,2,2,2,2,
|
||||
0,2,0,2,2,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,1,3,2,3,3,2,3,3,2,2,1,2,2,2,2,2,2,
|
||||
0,2,1,2,1,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,2,3,2,3,3,2,3,3,3,3,2,3,2,3,3,3,3,3,2,2,2,2,2,2,2,1,
|
||||
0,2,0,1,2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,2,1,2,3,3,3,3,3,3,3,2,3,2,3,2,1,2,3,0,2,1,2,2,
|
||||
0,2,1,1,2,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,
|
||||
3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,1,3,1,2,2,2,1,2,3,3,1,2,1,2,2,2,2,
|
||||
0,1,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,0,2,3,3,3,1,3,3,3,1,2,2,2,2,1,1,2,2,2,2,2,2,
|
||||
0,2,0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,2,3,3,3,2,2,3,3,3,2,1,2,3,2,3,2,2,2,2,1,2,1,1,1,2,2,
|
||||
0,2,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,1,0,0,0,0,0,
|
||||
1,0,1,0,0,0,0,0,2,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,2,3,3,2,3,1,2,2,2,2,3,2,3,1,1,2,2,1,2,2,1,1,0,2,2,2,2,
|
||||
0,1,0,1,2,2,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,
|
||||
3,0,0,1,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,2,0,
|
||||
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,0,1,0,1,0,1,1,0,1,1,0,0,0,1,1,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
3,2,2,1,2,2,2,2,2,2,2,1,2,2,1,2,2,1,1,1,1,1,1,1,1,2,1,1,0,3,3,3,
|
||||
0,3,0,2,2,2,2,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
2,2,2,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,1,2,2,2,1,1,1,2,0,1,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,0,2,2,0,0,0,0,0,0,
|
||||
0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,3,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,1,0,2,1,0,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
0,3,1,1,2,2,2,2,2,1,2,2,2,1,1,2,2,2,2,2,2,2,1,2,2,1,0,1,1,1,1,0,
|
||||
0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,2,1,1,1,1,2,1,1,2,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,
|
||||
0,0,2,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,1,0,0,
|
||||
2,1,1,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,1,2,1,2,1,1,1,1,0,0,0,0,
|
||||
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,2,1,2,2,2,2,2,2,2,2,2,2,1,2,1,2,1,1,2,1,1,1,2,1,2,1,2,0,1,0,1,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,3,1,2,2,2,1,2,2,2,2,2,2,2,2,1,2,1,1,1,1,1,1,2,1,2,1,1,0,1,0,1,
|
||||
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,1,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,
|
||||
0,2,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,1,1,1,1,1,1,1,0,1,1,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,2,0,1,1,1,0,1,0,0,0,1,1,0,1,1,0,0,0,0,0,1,1,0,0,
|
||||
0,1,1,1,2,1,2,2,2,0,2,0,2,0,1,1,2,1,1,1,1,2,1,0,1,1,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,1,0,0,0,0,0,1,0,1,2,2,0,1,0,0,1,1,2,2,1,2,0,2,0,0,0,1,2,0,1,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,2,0,2,1,2,0,2,0,0,1,1,1,1,1,1,0,1,0,0,0,1,0,0,1,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,1,0,0,0,0,0,1,0,2,1,1,0,1,0,0,1,1,1,2,2,0,0,1,0,0,0,1,0,0,1,
|
||||
1,1,2,1,0,1,1,1,0,1,0,1,1,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,2,2,1,
|
||||
0,2,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,1,0,0,1,0,1,1,1,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,1,1,1,1,1,1,1,1,2,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,1,1,0,1,1,0,1,0,0,0,1,1,0,1,
|
||||
2,0,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,1,1,1,0,1,0,0,1,1,2,1,1,2,0,1,0,0,0,1,1,0,1,
|
||||
1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,1,1,2,0,1,0,0,0,0,2,1,1,2,0,2,0,0,0,1,1,0,1,
|
||||
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,2,1,1,0,1,0,0,2,2,1,2,1,1,0,1,0,0,0,1,1,0,1,
|
||||
2,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,2,2,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,1,0,1,
|
||||
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,2,2,0,0,0,0,2,1,1,1,0,2,1,1,0,0,0,2,1,0,1,
|
||||
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,1,1,2,0,1,0,0,1,1,0,2,1,1,0,1,0,0,0,1,1,0,1,
|
||||
2,2,1,1,1,0,1,1,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,2,1,1,0,1,0,0,1,1,0,1,2,1,0,2,0,0,0,1,1,0,1,
|
||||
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,
|
||||
0,1,0,0,2,0,2,1,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,1,0,1,1,2,0,1,0,0,1,1,1,0,1,0,0,1,0,0,0,1,0,0,1,
|
||||
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,2,1,1,1,1,1,0,1,0,0,0,0,1,0,1,
|
||||
0,1,1,1,2,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
|
||||
};
|
||||
|
||||
public HebrewModel(byte[] charToOrderMap, string name)
|
||||
: base(charToOrderMap, HEBREW_LANG_MODEL, 0.984004f, false, name)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
public class Win1255Model : HebrewModel
|
||||
{
|
||||
/*
|
||||
255: Control characters that usually does not exist in any text
|
||||
254: Carriage/Return
|
||||
253: symbol (punctuation) that does not belong to word
|
||||
252: 0 - 9
|
||||
*/
|
||||
//Windows-1255 language model
|
||||
//Character Mapping Table:
|
||||
private readonly static byte[] WIN1255_CHAR_TO_ORDER_MAP = {
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
|
||||
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
|
||||
253, 69, 91, 79, 80, 92, 89, 97, 90, 68,111,112, 82, 73, 95, 85, //40
|
||||
78,121, 86, 71, 67,102,107, 84,114,103,115,253,253,253,253,253, //50
|
||||
253, 50, 74, 60, 61, 42, 76, 70, 64, 53,105, 93, 56, 65, 54, 49, //60
|
||||
66,110, 51, 43, 44, 63, 81, 77, 98, 75,108,253,253,253,253,253, //70
|
||||
124,202,203,204,205, 40, 58,206,207,208,209,210,211,212,213,214,
|
||||
215, 83, 52, 47, 46, 72, 32, 94,216,113,217,109,218,219,220,221,
|
||||
34,116,222,118,100,223,224,117,119,104,125,225,226, 87, 99,227,
|
||||
106,122,123,228, 55,229,230,101,231,232,120,233, 48, 39, 57,234,
|
||||
30, 59, 41, 88, 33, 37, 36, 31, 29, 35,235, 62, 28,236,126,237,
|
||||
238, 38, 45,239,240,241,242,243,127,244,245,246,247,248,249,250,
|
||||
9, 8, 20, 16, 3, 2, 24, 14, 22, 1, 25, 15, 4, 11, 6, 23,
|
||||
12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,251,252,128, 96,253,
|
||||
};
|
||||
|
||||
public Win1255Model() : base(WIN1255_CHAR_TO_ORDER_MAP, "windows-1255")
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -1,238 +0,0 @@
|
|||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Mozilla Public License Version
|
||||
* 1.1 (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/MPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is Mozilla Universal charset detector code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 2001
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the MPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
public abstract class HungarianModel : SequenceModel
|
||||
{
|
||||
//Model Table:
|
||||
//total sequences: 100%
|
||||
//first 512 sequences: 94.7368%
|
||||
//first 1024 sequences:5.2623%
|
||||
//rest sequences: 0.8894%
|
||||
//negative sequences: 0.0009%
|
||||
private readonly static byte[] HUNGARIAN_LANG_MODEL = {
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
|
||||
3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,
|
||||
3,2,1,3,3,3,3,3,2,3,3,3,3,3,1,1,2,3,3,3,3,3,3,3,1,1,3,2,0,1,1,1,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,1,1,2,3,3,3,1,3,3,3,3,3,1,3,3,2,2,0,3,2,3,
|
||||
0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,2,3,3,3,2,3,3,2,3,3,3,3,3,2,3,3,2,2,3,2,3,2,0,3,2,2,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,2,3,3,3,3,3,2,3,3,3,1,2,3,2,2,3,1,2,3,3,2,2,0,3,3,3,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,0,2,3,2,
|
||||
0,0,0,1,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,1,1,1,3,3,2,1,3,2,2,3,2,1,3,2,2,1,0,3,3,1,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,2,2,3,3,3,3,3,1,2,3,3,3,3,1,2,1,3,3,3,3,2,2,3,1,1,3,2,0,1,1,1,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,2,1,3,3,3,3,3,2,2,1,3,3,3,0,1,1,2,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,3,3,3,2,0,3,2,3,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,
|
||||
3,3,3,3,3,3,2,3,3,3,2,3,2,3,3,3,1,3,2,2,2,3,1,1,3,3,1,1,0,3,3,2,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,3,2,3,3,3,2,3,2,3,3,3,2,3,3,3,3,3,1,2,3,2,2,0,2,2,2,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,3,3,2,2,2,3,1,3,3,2,2,1,3,3,3,1,1,3,1,2,3,2,3,2,2,2,1,0,2,2,2,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,
|
||||
3,1,1,3,3,3,3,3,1,2,3,3,3,3,1,2,1,3,3,3,2,2,3,2,1,0,3,2,0,1,1,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,1,1,3,3,3,3,3,1,2,3,3,3,3,1,1,0,3,3,3,3,0,2,3,0,0,2,1,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,2,2,3,3,2,2,2,2,3,3,0,1,2,3,2,3,2,2,3,2,1,2,0,2,2,2,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,
|
||||
3,3,3,3,3,3,1,2,3,3,3,2,1,2,3,3,2,2,2,3,2,3,3,1,3,3,1,1,0,2,3,2,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,3,3,1,2,2,2,2,3,3,3,1,1,1,3,3,1,1,3,1,1,3,2,1,2,3,1,1,0,2,2,2,
|
||||
0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,3,3,2,1,2,1,1,3,3,1,1,1,1,3,3,1,1,2,2,1,2,1,1,2,2,1,1,0,2,2,1,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,3,3,1,1,2,1,1,3,3,1,0,1,1,3,3,2,0,1,1,2,3,1,0,2,2,1,0,0,1,3,2,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,2,1,3,3,3,3,3,1,2,3,2,3,3,2,1,1,3,2,3,2,1,2,2,0,1,2,1,0,0,1,1,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
|
||||
3,3,3,3,2,2,2,2,3,1,2,2,1,1,3,3,0,3,2,1,2,3,2,1,3,3,1,1,0,2,1,3,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,3,3,2,2,2,3,2,3,3,3,2,1,1,3,3,1,1,1,2,2,3,2,3,2,2,2,1,0,2,2,1,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
1,0,0,3,3,3,3,3,0,0,3,3,2,3,0,0,0,2,3,3,1,0,1,2,0,0,1,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,1,2,3,3,3,3,3,1,2,3,3,2,2,1,1,0,3,3,2,2,1,2,2,1,0,2,2,0,1,1,1,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,3,2,2,1,3,1,2,3,3,2,2,1,1,2,2,1,1,1,1,3,2,1,1,1,1,2,1,0,1,2,1,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
|
||||
2,3,3,1,1,1,1,1,3,3,3,0,1,1,3,3,1,1,1,1,1,2,2,0,3,1,1,2,0,2,1,1,
|
||||
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
|
||||
3,1,0,1,2,1,2,2,0,1,2,3,1,2,0,0,0,2,1,1,1,1,1,2,0,0,1,1,0,0,0,0,
|
||||
1,2,1,2,2,2,1,2,1,2,0,2,0,2,2,1,1,2,1,1,2,1,1,1,0,1,0,0,0,1,1,0,
|
||||
1,1,1,2,3,2,3,3,0,1,2,2,3,1,0,1,0,2,1,2,2,0,1,1,0,0,1,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,3,3,2,2,1,0,0,3,2,3,2,0,0,0,1,1,3,0,0,1,1,0,0,2,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,1,1,2,2,3,3,1,0,1,3,2,3,1,1,1,0,1,1,1,1,1,3,1,0,0,2,2,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
3,1,1,1,2,2,2,1,0,1,2,3,3,2,0,0,0,2,1,1,1,2,1,1,1,0,1,1,1,0,0,0,
|
||||
1,2,2,2,2,2,1,1,1,2,0,2,1,1,1,1,1,2,1,1,1,1,1,1,0,1,1,1,0,0,1,1,
|
||||
3,2,2,1,0,0,1,1,2,2,0,3,0,1,2,1,1,0,0,1,1,1,0,1,1,1,1,0,2,1,1,1,
|
||||
2,2,1,1,1,2,1,2,1,1,1,1,1,1,1,2,1,1,1,2,3,1,1,1,1,1,1,1,1,1,0,1,
|
||||
2,3,3,0,1,0,0,0,3,3,1,0,0,1,2,2,1,0,0,0,0,2,0,0,1,1,1,0,2,1,1,1,
|
||||
2,1,1,1,1,1,1,2,1,1,0,1,1,0,1,1,1,0,1,2,1,1,0,1,1,1,1,1,1,1,0,1,
|
||||
2,3,3,0,1,0,0,0,2,2,0,0,0,0,1,2,2,0,0,0,0,1,0,0,1,1,0,0,2,0,1,0,
|
||||
2,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,2,0,1,1,1,1,1,0,1,
|
||||
3,2,2,0,1,0,1,0,2,3,2,0,0,1,2,2,1,0,0,1,1,1,0,0,2,1,0,1,2,2,1,1,
|
||||
2,1,1,1,1,1,1,2,1,1,1,1,1,1,0,2,1,0,1,1,0,1,1,1,0,1,1,2,1,1,0,1,
|
||||
2,2,2,0,0,1,0,0,2,2,1,1,0,0,2,1,1,0,0,0,1,2,0,0,2,1,0,0,2,1,1,1,
|
||||
2,1,1,1,1,2,1,2,1,1,1,2,2,1,1,2,1,1,1,2,1,1,1,1,1,1,1,1,1,1,0,1,
|
||||
1,2,3,0,0,0,1,0,3,2,1,0,0,1,2,1,1,0,0,0,0,2,1,0,1,1,0,0,2,1,2,1,
|
||||
1,1,0,0,0,1,0,1,1,1,1,1,2,0,0,1,0,0,0,2,0,0,1,1,1,1,1,1,1,1,0,1,
|
||||
3,0,0,2,1,2,2,1,0,0,2,1,2,2,0,0,0,2,1,1,1,0,1,1,0,0,1,1,2,0,0,0,
|
||||
1,2,1,2,2,1,1,2,1,2,0,1,1,1,1,1,1,1,1,1,2,1,1,0,0,1,1,1,1,0,0,1,
|
||||
1,3,2,0,0,0,1,0,2,2,2,0,0,0,2,2,1,0,0,0,0,3,1,1,1,1,0,0,2,1,1,1,
|
||||
2,1,0,1,1,1,0,1,1,1,1,1,1,1,0,2,1,0,0,1,0,1,1,0,1,1,1,1,1,1,0,1,
|
||||
2,3,2,0,0,0,1,0,2,2,0,0,0,0,2,1,1,0,0,0,0,2,1,0,1,1,0,0,2,1,1,0,
|
||||
2,1,1,1,1,2,1,2,1,2,0,1,1,1,0,2,1,1,1,2,1,1,1,1,0,1,1,1,1,1,0,1,
|
||||
3,1,1,2,2,2,3,2,1,1,2,2,1,1,0,1,0,2,2,1,1,1,1,1,0,0,1,1,0,1,1,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
2,2,2,0,0,0,0,0,2,2,0,0,0,0,2,2,1,0,0,0,1,1,0,0,1,2,0,0,2,1,1,1,
|
||||
2,2,1,1,1,2,1,2,1,1,0,1,1,1,1,2,1,1,1,2,1,1,1,1,0,1,2,1,1,1,0,1,
|
||||
1,0,0,1,2,3,2,1,0,0,2,0,1,1,0,0,0,1,1,1,1,0,1,1,0,0,1,0,0,0,0,0,
|
||||
1,2,1,2,1,2,1,1,1,2,0,2,1,1,1,0,1,2,0,0,1,1,1,0,0,0,0,0,0,0,0,0,
|
||||
2,3,2,0,0,0,0,0,1,1,2,1,0,0,1,1,1,0,0,0,0,2,0,0,1,1,0,0,2,1,1,1,
|
||||
2,1,1,1,1,1,1,2,1,0,1,1,1,1,0,2,1,1,1,1,1,1,0,1,0,1,1,1,1,1,0,1,
|
||||
1,2,2,0,1,1,1,0,2,2,2,0,0,0,3,2,1,0,0,0,1,1,0,0,1,1,0,1,1,1,0,0,
|
||||
1,1,0,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,0,0,1,1,1,0,1,0,1,
|
||||
2,1,0,2,1,1,2,2,1,1,2,1,1,1,0,0,0,1,1,0,1,1,1,1,0,0,1,1,1,0,0,0,
|
||||
1,2,2,2,2,2,1,1,1,2,0,2,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,1,0,
|
||||
1,2,3,0,0,0,1,0,2,2,0,0,0,0,2,2,0,0,0,0,0,1,0,0,1,0,0,0,2,0,1,0,
|
||||
2,1,1,1,1,1,0,2,0,0,0,1,2,1,1,1,1,0,1,2,0,1,0,1,0,1,1,1,0,1,0,1,
|
||||
2,2,2,0,0,0,1,0,2,1,2,0,0,0,1,1,2,0,0,0,0,1,0,0,1,1,0,0,2,1,0,1,
|
||||
2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,0,1,1,1,1,1,0,1,
|
||||
1,2,2,0,0,0,1,0,2,2,2,0,0,0,1,1,0,0,0,0,0,1,1,0,2,0,0,1,1,1,0,1,
|
||||
1,0,1,1,1,1,1,1,0,1,1,1,1,0,0,1,0,0,1,1,0,1,0,1,1,1,1,1,0,0,0,1,
|
||||
1,0,0,1,0,1,2,1,0,0,1,1,1,2,0,0,0,1,1,0,1,0,1,1,0,0,1,0,0,0,0,0,
|
||||
0,2,1,2,1,1,1,1,1,2,0,2,0,1,1,0,1,2,1,0,1,1,1,0,0,0,0,0,0,1,0,0,
|
||||
2,1,1,0,1,2,0,0,1,1,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,2,1,0,1,
|
||||
2,2,1,1,1,1,1,2,1,1,0,1,1,1,1,2,1,1,1,2,1,1,0,1,0,1,1,1,1,1,0,1,
|
||||
1,2,2,0,0,0,0,0,1,1,0,0,0,0,2,1,0,0,0,0,0,2,0,0,2,2,0,0,2,0,0,1,
|
||||
2,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,0,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,
|
||||
1,1,2,0,0,3,1,0,2,1,1,1,0,0,1,1,1,0,0,0,1,1,0,0,0,1,0,0,1,0,1,0,
|
||||
1,2,1,0,1,1,1,2,1,1,0,1,1,1,1,1,0,0,0,1,1,1,1,1,0,1,0,0,0,1,0,0,
|
||||
2,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,2,0,0,0,
|
||||
2,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,2,1,1,0,0,1,1,1,1,1,0,1,
|
||||
2,1,1,1,2,1,1,1,0,1,1,2,1,0,0,0,0,1,1,1,1,0,1,0,0,0,0,1,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,1,0,1,1,1,1,1,0,0,1,1,2,1,0,0,0,1,1,0,0,0,1,1,0,0,1,0,1,0,0,0,
|
||||
1,2,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,1,0,0,
|
||||
2,0,0,0,1,1,1,1,0,0,1,1,0,0,0,0,0,1,1,1,2,0,0,1,0,0,1,0,1,0,0,0,
|
||||
0,1,1,1,1,1,1,1,1,2,0,1,1,1,1,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,1,1,1,1,1,0,0,2,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,
|
||||
0,1,1,1,1,1,1,0,1,1,0,1,0,1,1,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,1,1,1,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
|
||||
0,1,1,1,1,1,0,0,1,1,0,1,0,1,0,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,1,0,0,0,0,0,0,1,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,1,1,1,0,1,0,0,1,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,
|
||||
2,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,0,1,0,0,1,0,1,0,1,1,1,0,0,1,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,1,1,1,1,0,0,0,1,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
public HungarianModel(byte[] charToOrderMap, string name)
|
||||
: base(charToOrderMap, HUNGARIAN_LANG_MODEL, 0.947368f,
|
||||
false, name)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
public class Latin2HungarianModel : HungarianModel
|
||||
{
|
||||
private readonly static byte[] LATIN2_CHAR_TO_ORDER_MAP = {
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
|
||||
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
|
||||
253, 28, 40, 54, 45, 32, 50, 49, 38, 39, 53, 36, 41, 34, 35, 47,
|
||||
46, 71, 43, 33, 37, 57, 48, 64, 68, 55, 52,253,253,253,253,253,
|
||||
253, 2, 18, 26, 17, 1, 27, 12, 20, 9, 22, 7, 6, 13, 4, 8,
|
||||
23, 67, 10, 5, 3, 21, 19, 65, 62, 16, 11,253,253,253,253,253,
|
||||
159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,
|
||||
175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,
|
||||
191,192,193,194,195,196,197, 75,198,199,200,201,202,203,204,205,
|
||||
79,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,
|
||||
221, 51, 81,222, 78,223,224,225,226, 44,227,228,229, 61,230,231,
|
||||
232,233,234, 58,235, 66, 59,236,237,238, 60, 69, 63,239,240,241,
|
||||
82, 14, 74,242, 70, 80,243, 72,244, 15, 83, 77, 84, 30, 76, 85,
|
||||
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
|
||||
};
|
||||
|
||||
public Latin2HungarianModel() : base(LATIN2_CHAR_TO_ORDER_MAP, "ISO-8859-2")
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
public class Win1250HungarianModel : HungarianModel
|
||||
{
|
||||
private readonly static byte[] WIN1250_CHAR_TO_ORDER_MAP = {
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
+253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, //20
|
||||
252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, //30
|
||||
253, 28, 40, 54, 45, 32, 50, 49, 38, 39, 53, 36, 41, 34, 35, 47,
|
||||
46, 72, 43, 33, 37, 57, 48, 64, 68, 55, 52,253,253,253,253,253,
|
||||
253, 2, 18, 26, 17, 1, 27, 12, 20, 9, 22, 7, 6, 13, 4, 8,
|
||||
23, 67, 10, 5, 3, 21, 19, 65, 62, 16, 11,253,253,253,253,253,
|
||||
161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,
|
||||
177,178,179,180, 78,181, 69,182,183,184,185,186,187,188,189,190,
|
||||
191,192,193,194,195,196,197, 76,198,199,200,201,202,203,204,205,
|
||||
81,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,
|
||||
221, 51, 83,222, 80,223,224,225,226, 44,227,228,229, 61,230,231,
|
||||
232,233,234, 58,235, 66, 59,236,237,238, 60, 70, 63,239,240,241,
|
||||
84, 14, 75,242, 71, 82,243, 73,244, 15, 85, 79, 86, 30, 77, 87,
|
||||
245,246,247, 25, 74, 42, 24,248,249,250, 31, 56, 29,251,252,253,
|
||||
};
|
||||
|
||||
public Win1250HungarianModel() : base(WIN1250_CHAR_TO_ORDER_MAP, "windows-1250")
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user