add subtitle language detection
This commit is contained in:
parent
c9d7eb9b04
commit
0e7cbb0465
|
@ -317,6 +317,23 @@
|
||||||
<Compile Include="ScheduledTasks\WeeklyTrigger.cs" />
|
<Compile Include="ScheduledTasks\WeeklyTrigger.cs" />
|
||||||
<Compile Include="Serialization\JsonSerializer.cs" />
|
<Compile Include="Serialization\JsonSerializer.cs" />
|
||||||
<Compile Include="Serialization\XmlSerializer.cs" />
|
<Compile Include="Serialization\XmlSerializer.cs" />
|
||||||
|
<Compile Include="TextEncoding\NLangDetect\Detector.cs" />
|
||||||
|
<Compile Include="TextEncoding\NLangDetect\DetectorFactory.cs" />
|
||||||
|
<Compile Include="TextEncoding\NLangDetect\ErrorCode.cs" />
|
||||||
|
<Compile Include="TextEncoding\NLangDetect\Extensions\CharExtensions.cs" />
|
||||||
|
<Compile Include="TextEncoding\NLangDetect\Extensions\RandomExtensions.cs" />
|
||||||
|
<Compile Include="TextEncoding\NLangDetect\Extensions\StringExtensions.cs" />
|
||||||
|
<Compile Include="TextEncoding\NLangDetect\Extensions\UnicodeBlock.cs" />
|
||||||
|
<Compile Include="TextEncoding\NLangDetect\GenProfile.cs" />
|
||||||
|
<Compile Include="TextEncoding\NLangDetect\InternalException.cs" />
|
||||||
|
<Compile Include="TextEncoding\NLangDetect\Language.cs" />
|
||||||
|
<Compile Include="TextEncoding\NLangDetect\LanguageDetector.cs" />
|
||||||
|
<Compile Include="TextEncoding\NLangDetect\NLangDetectException.cs" />
|
||||||
|
<Compile Include="TextEncoding\NLangDetect\ProbVector.cs" />
|
||||||
|
<Compile Include="TextEncoding\NLangDetect\Utils\LangProfile.cs" />
|
||||||
|
<Compile Include="TextEncoding\NLangDetect\Utils\Messages.cs" />
|
||||||
|
<Compile Include="TextEncoding\NLangDetect\Utils\NGram.cs" />
|
||||||
|
<Compile Include="TextEncoding\NLangDetect\Utils\TagExtractor.cs" />
|
||||||
<Compile Include="TextEncoding\TextEncoding.cs" />
|
<Compile Include="TextEncoding\TextEncoding.cs" />
|
||||||
<Compile Include="TextEncoding\TextEncodingDetect.cs" />
|
<Compile Include="TextEncoding\TextEncodingDetect.cs" />
|
||||||
<Compile Include="TextEncoding\UniversalDetector\CharsetDetector.cs" />
|
<Compile Include="TextEncoding\UniversalDetector\CharsetDetector.cs" />
|
||||||
|
@ -368,7 +385,62 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<None Include="packages.config" />
|
<None Include="packages.config" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\afr" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\ara" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\bul" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\ben" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\ces" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\dan" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\deu" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\ell" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\eng" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\spa" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\est" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\fas" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\fin" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\fra" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\guj" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\heb" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\hin" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\hrv" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\hun" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\ind" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\ita" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\jpn" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\kan" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\kor" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\lit" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\lav" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\mkd" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\mal" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\mar" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\nep" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\nld" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\nor" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\pan" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\pol" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\por" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\ron" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\rus" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\slk" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\slv" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\som" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\sqi" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\swe" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\swa" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\tam" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\tel" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\tha" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\tgl" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\tur" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\ukr" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\urd" />
|
||||||
|
<None Include="TextEncoding\NLangDetect\Profiles\vie" />
|
||||||
|
<EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\zh-cn" />
|
||||||
|
<EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\zh-tw" />
|
||||||
|
<EmbeddedResource Include="TextEncoding\NLangDetect\Utils\messages.properties" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
<ItemGroup />
|
||||||
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
|
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
|
||||||
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
|
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
|
||||||
Other similar extension points exist, see Microsoft.Common.targets.
|
Other similar extension points exist, see Microsoft.Common.targets.
|
||||||
|
|
|
@ -207,24 +207,6 @@ namespace SharpCifs.Util.Sharpen
|
||||||
return (int)tzone.GetUtcOffset(MillisToDateTimeOffset(date, 0).DateTime).TotalMilliseconds;
|
return (int)tzone.GetUtcOffset(MillisToDateTimeOffset(date, 0).DateTime).TotalMilliseconds;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static InputStream GetResourceAsStream(this Type type, string name)
|
|
||||||
{
|
|
||||||
//Type.`Assembly` property deleted
|
|
||||||
//string str2 = type.Assembly.GetName().Name + ".resources";
|
|
||||||
string str2 = type.GetTypeInfo().Assembly.GetName().Name + ".resources";
|
|
||||||
string[] textArray1 = { str2, ".", type.Namespace, ".", name };
|
|
||||||
string str = string.Concat(textArray1);
|
|
||||||
|
|
||||||
//Type.`Assembly` property deleted
|
|
||||||
//Stream manifestResourceStream = type.Assembly.GetManifestResourceStream(str);
|
|
||||||
Stream manifestResourceStream = type.GetTypeInfo().Assembly.GetManifestResourceStream(str);
|
|
||||||
if (manifestResourceStream == null)
|
|
||||||
{
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
return InputStream.Wrap(manifestResourceStream);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static long GetTime(this DateTime dateTime)
|
public static long GetTime(this DateTime dateTime)
|
||||||
{
|
{
|
||||||
return new DateTimeOffset(DateTime.SpecifyKind(dateTime, DateTimeKind.Utc), TimeSpan.Zero).ToMillisecondsSinceEpoch();
|
return new DateTimeOffset(DateTime.SpecifyKind(dateTime, DateTimeKind.Utc), TimeSpan.Zero).ToMillisecondsSinceEpoch();
|
||||||
|
|
371
Emby.Common.Implementations/TextEncoding/NLangDetect/Detector.cs
Normal file
371
Emby.Common.Implementations/TextEncoding/NLangDetect/Detector.cs
Normal file
|
@ -0,0 +1,371 @@
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.IO;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Text;
|
||||||
|
using System.Text.RegularExpressions;
|
||||||
|
using NLangDetect.Core.Extensions;
|
||||||
|
using NLangDetect.Core.Utils;
|
||||||
|
|
||||||
|
namespace NLangDetect.Core
|
||||||
|
{
|
||||||
|
public class Detector
|
||||||
|
{
|
||||||
|
private const double _AlphaDefault = 0.5;
|
||||||
|
private const double _AlphaWidth = 0.05;
|
||||||
|
|
||||||
|
private const int _IterationLimit = 1000;
|
||||||
|
private const double _ProbThreshold = 0.1;
|
||||||
|
private const double _ConvThreshold = 0.99999;
|
||||||
|
private const int _BaseFreq = 10000;
|
||||||
|
|
||||||
|
private static readonly Regex _UrlRegex = new Regex("https?://[-_.?&~;+=/#0-9A-Za-z]+", RegexOptions.Compiled);
|
||||||
|
private static readonly Regex _MailRegex = new Regex("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+", RegexOptions.Compiled);
|
||||||
|
|
||||||
|
private readonly Dictionary<string, ProbVector> _wordLangProbMap;
|
||||||
|
private readonly List<string> _langlist;
|
||||||
|
|
||||||
|
private StringBuilder _text;
|
||||||
|
private double[] _langprob;
|
||||||
|
|
||||||
|
private double _alpha = _AlphaDefault;
|
||||||
|
private const int _trialsCount = 7;
|
||||||
|
private int _maxTextLength = 10000;
|
||||||
|
private double[] _priorMap;
|
||||||
|
private int? _seed;
|
||||||
|
|
||||||
|
#region Constructor(s)
|
||||||
|
|
||||||
|
/// <summary>
/// Creates a detector backed by the language data held by <paramref name="factory"/>.
/// </summary>
/// <param name="factory">
/// Factory supplying the n-gram probability map, the language list and the optional random seed.
/// The map and list are stored by reference, not copied.
/// </param>
public Detector(DetectorFactory factory)
{
    // Text is accumulated later through the Append overloads.
    this._text = new StringBuilder();

    this._seed = factory.Seed;
    this._langlist = factory.Langlist;
    this._wordLangProbMap = factory.WordLangProbMap;
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
|
||||||
|
#region Public methods
|
||||||
|
|
||||||
|
/// <summary>
/// Sets the base alpha value. During detection each trial perturbs it with Gaussian noise
/// and divides it by the base frequency to obtain the additive weight applied to every
/// language probability.
/// </summary>
/// <param name="alpha">New base alpha value.</param>
public void SetAlpha(double alpha)
{
    this._alpha = alpha;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Sets the prior probability of each known language. Languages missing from
/// <paramref name="priorMap"/> get a prior of zero; the supplied values are normalized so
/// that they sum to one.
/// </summary>
/// <param name="priorMap">Language name to (non-negative) prior probability.</param>
/// <exception cref="NLangDetectException">
/// Thrown with <see cref="ErrorCode.InitParamError"/> when a prior is negative or when the
/// priors of the known languages do not sum to a positive value.
/// </exception>
public void SetPriorMap(Dictionary<string, double> priorMap)
{
    // Build the new priors in a local array so that a validation failure leaves the
    // previously configured priors (or the "no priors" state) untouched.
    var normalized = new double[_langlist.Count];
    double total = 0;

    for (int i = 0; i < normalized.Length; i++)
    {
        double p;

        if (!priorMap.TryGetValue(_langlist[i], out p))
        {
            continue;
        }

        if (p < 0)
        {
            throw new NLangDetectException("Prior probability must be non-negative.", ErrorCode.InitParamError);
        }

        normalized[i] = p;
        total += p;
    }

    if (total <= 0)
    {
        throw new NLangDetectException("More one of prior probability must be non-zero.", ErrorCode.InitParamError);
    }

    for (int i = 0; i < normalized.Length; i++)
    {
        normalized[i] /= total;
    }

    // Publish only after every check has passed.
    _priorMap = normalized;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Sets the limit used by the Append overloads when deciding how much input to consume.
/// </summary>
/// <param name="max_text_length">New maximum text length, in characters.</param>
public void SetMaxTextLength(int max_text_length)
{
    this._maxTextLength = max_text_length;
}
|
||||||
|
|
||||||
|
// TODO IMM HI: TextReader?
|
||||||
|
/// <summary>
/// Reads text from <paramref name="streamReader"/> in chunks and appends it to the internal
/// buffer until the buffer holds at least the configured maximum text length or the stream
/// is exhausted.
/// </summary>
/// <param name="streamReader">Reader positioned at the text to analyze.</param>
public void Append(StreamReader streamReader)
{
    // Always allocate at least one slot: a zero-length buffer makes Read return 0 forever,
    // which previously spun this loop endlessly for a maximum text length of 1.
    var buffer = new char[Math.Max(1, _maxTextLength / 2)];

    while (_text.Length < _maxTextLength && !streamReader.EndOfStream)
    {
        int read = streamReader.Read(buffer, 0, buffer.Length);

        if (read <= 0)
        {
            // Nothing more can be read; stop instead of looping without progress.
            break;
        }

        Append(new string(buffer, 0, read));
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Appends <paramref name="text"/> to the internal buffer after replacing URLs and e-mail
/// addresses with a single space, normalizing each character through
/// <c>NGram.Normalize</c> and collapsing consecutive spaces into one.
/// Only the first <c>_maxTextLength</c> characters of the cleaned text are considered per call.
/// </summary>
/// <param name="text">Raw text to add.</param>
public void Append(string text)
{
    string cleaned = _MailRegex.Replace(_UrlRegex.Replace(text, " "), " ");
    int limit = Math.Min(cleaned.Length, _maxTextLength);

    char previous = '\0';

    for (int pos = 0; pos < limit; pos++)
    {
        char current = NGram.Normalize(cleaned[pos]);
        bool repeatedSpace = current == ' ' && previous == ' ';

        if (!repeatedSpace)
        {
            _text.Append(current);
        }

        previous = current;
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Drops the characters in the range 'A'..'z' from the buffered text when they are clearly
/// outnumbered: the buffer is rewritten only if twice the count of such characters is still
/// smaller than the count of characters at or above U+0300 that are not in the
/// Latin Extended Additional block.
/// </summary>
private void CleanText()
{
    int latinTotal = 0;
    int nonLatinTotal = 0;

    for (int pos = 0; pos < _text.Length; pos++)
    {
        char current = _text[pos];

        if (current >= 'A' && current <= 'z')
        {
            latinTotal++;
        }
        else if (current >= '\u0300' && current.GetUnicodeBlock() != UnicodeBlock.LatinExtendedAdditional)
        {
            nonLatinTotal++;
        }
    }

    if (latinTotal * 2 >= nonLatinTotal)
    {
        // Enough Latin-range content: keep the text as it is.
        return;
    }

    var filtered = new StringBuilder();

    for (int pos = 0; pos < _text.Length; pos++)
    {
        char current = _text[pos];
        bool inLatinRange = current >= 'A' && current <= 'z';

        if (!inLatinRange)
        {
            filtered.Append(current);
        }
    }

    _text = filtered;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Detects the language of the text appended so far.
/// </summary>
/// <returns>
/// The name of the highest-ranked language, or <c>null</c> when no language passes the
/// probability threshold.
/// </returns>
public string Detect()
{
    List<Language> ranked = GetProbabilities();

    if (ranked.Count == 0)
    {
        return null;
    }

    return ranked[0].Name;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Returns the candidate languages ordered from most to least probable. The detection
/// pass runs on the first call only; later calls reuse the computed probabilities.
/// </summary>
/// <returns>Languages whose probability exceeds the threshold, best first.</returns>
public List<Language> GetProbabilities()
{
    bool alreadyDetected = _langprob != null;

    if (!alreadyDetected)
    {
        DetectBlock();
    }

    return SortProbability(_langprob);
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
|
||||||
|
#region Private helper methods
|
||||||
|
|
||||||
|
/// <summary>
/// Scales <paramref name="probs"/> in place so that its entries sum to one.
/// </summary>
/// <param name="probs">Probabilities to normalize; modified in place.</param>
/// <returns>
/// The largest normalized entry, or 0 when the entries sum to zero (in which case the
/// array is left unchanged).
/// </returns>
private static double NormalizeProb(double[] probs)
{
    double sum = 0;

    for (int i = 0; i < probs.Length; i++)
    {
        sum += probs[i];
    }

    if (sum == 0.0)
    {
        // Dividing by a zero sum would turn every entry into NaN and poison all
        // later updates; report "no dominant language" instead.
        return 0;
    }

    double max = 0;

    for (int i = 0; i < probs.Length; i++)
    {
        double p = probs[i] / sum;

        if (max < p)
        {
            max = p;
        }

        probs[i] = p;
    }

    return max;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Escapes every character at or above U+0080 as a backslash, the letter 'u' and four
/// lowercase hexadecimal digits; characters below U+0080 are copied unchanged.
/// </summary>
/// <param name="word">Text to encode.</param>
/// <returns>The encoded text.</returns>
private static string UnicodeEncode(string word)
{
    var resultSb = new StringBuilder();

    foreach (char ch in word)
    {
        if (ch >= '\u0080')
        {
            // "x4" always yields exactly four lowercase hex digits for a UTF-16 code unit,
            // so no manual padding or substring step is needed.
            resultSb
                .Append("\\u")
                .Append(((int)ch).ToString("x4"));
        }
        else
        {
            resultSb.Append(ch);
        }
    }

    return resultSb.ToString();
}
|
||||||
|
|
||||||
|
/// <summary>
/// Runs the detection pass: cleans the buffered text, extracts its known n-grams and
/// averages the language probabilities obtained over <c>_trialsCount</c> randomized trials
/// into <c>_langprob</c>.
/// </summary>
/// <exception cref="NLangDetectException">
/// Thrown with <see cref="ErrorCode.CantDetectError"/> when the text contains no known n-gram.
/// </exception>
private void DetectBlock()
{
    CleanText();

    List<string> features = ExtractNGrams();

    if (features.Count == 0)
    {
        throw new NLangDetectException("no features in text", ErrorCode.CantDetectError);
    }

    _langprob = new double[_langlist.Count];

    // A configured seed makes the trials reproducible.
    Random rng = _seed.HasValue ? new Random(_seed.Value) : new Random();

    for (int trial = 0; trial < _trialsCount; trial++)
    {
        double[] trialProb = InitProbability();

        // TODO IMM HI: verify it works
        double trialAlpha = _alpha + rng.NextGaussian() * _AlphaWidth;

        int step = 0;

        while (true)
        {
            // Update with one randomly chosen n-gram per step.
            string feature = features[rng.Next(features.Count)];

            UpdateLangProb(trialProb, feature, trialAlpha);

            // Normalize and test for termination on every fifth step only.
            if (step % 5 == 0)
            {
                bool converged = NormalizeProb(trialProb) > _ConvThreshold;

                if (converged || step >= _IterationLimit)
                {
                    break;
                }
            }

            step++;
        }

        for (int lang = 0; lang < _langprob.Length; lang++)
        {
            _langprob[lang] += trialProb[lang] / _trialsCount;
        }
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Builds the starting probability vector for one trial: a copy of the configured priors
/// when present, otherwise a uniform distribution over the known languages.
/// </summary>
/// <returns>A new array with one entry per known language.</returns>
private double[] InitProbability()
{
    int languageCount = _langlist.Count;
    var start = new double[languageCount];
    double uniform = 1.0 / languageCount;

    for (int i = 0; i < start.Length; i++)
    {
        start[i] = _priorMap != null ? _priorMap[i] : uniform;
    }

    return start;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Walks the buffered text one character at a time and collects every n-gram
/// (sizes 1 to <c>NGram.GramsCount</c>) that has an entry in the word/language probability map.
/// </summary>
/// <returns>The known n-grams in the order they were encountered; duplicates are kept.</returns>
private List<string> ExtractNGrams()
{
    var found = new List<string>();
    var window = new NGram();

    for (int pos = 0; pos < _text.Length; pos++)
    {
        window.AddChar(_text[pos]);

        for (int size = 1; size <= NGram.GramsCount; size++)
        {
            string gram = window.Get(size);

            if (gram == null || !_wordLangProbMap.ContainsKey(gram))
            {
                continue;
            }

            found.Add(gram);
        }
    }

    return found;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Multiplies each language probability by the n-gram's per-language probability plus a
/// small additive weight (<paramref name="alpha"/> divided by the base frequency).
/// Does nothing when <paramref name="word"/> is null or unknown.
/// </summary>
/// <param name="prob">Per-language probabilities; updated in place.</param>
/// <param name="word">N-gram to apply.</param>
/// <param name="alpha">Alpha value for this trial.</param>
private void UpdateLangProb(double[] prob, string word, double alpha)
{
    ProbVector langProbMap;

    // Single lookup instead of ContainsKey followed by the indexer.
    if (word == null || !_wordLangProbMap.TryGetValue(word, out langProbMap))
    {
        return;
    }

    double weight = alpha / _BaseFreq;

    for (int i = 0; i < prob.Length; i++)
    {
        prob[i] *= weight + langProbMap[i];
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Builds the list of languages whose probability exceeds <c>_ProbThreshold</c>, ordered
/// from highest to lowest probability. Entries with equal probability keep the order of
/// the language list.
/// </summary>
/// <param name="prob">Per-language probabilities, indexed like the language list.</param>
/// <returns>The qualifying languages, best first.</returns>
private List<Language> SortProbability(double[] prob)
{
    var ranked = new List<Language>();

    for (int lang = 0; lang < prob.Length; lang++)
    {
        double p = prob[lang];

        if (!(p > _ProbThreshold))
        {
            continue;
        }

        // Skip past every entry that is not strictly less probable than p.
        int slot = 0;

        while (slot < ranked.Count && !(ranked[slot].Probability < p))
        {
            slot++;
        }

        ranked.Insert(slot, new Language(_langlist[lang], p));
    }

    return ranked;
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,127 @@
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.IO;
|
||||||
|
using System.IO.Compression;
|
||||||
|
using NLangDetect.Core.Utils;
|
||||||
|
using MediaBrowser.Model.Serialization;
|
||||||
|
using System.Linq;
|
||||||
|
|
||||||
|
namespace NLangDetect.Core
|
||||||
|
{
|
||||||
|
public class DetectorFactory
|
||||||
|
{
|
||||||
|
public Dictionary<string, ProbVector> WordLangProbMap;
|
||||||
|
public List<string> Langlist;
|
||||||
|
|
||||||
|
private static readonly DetectorFactory _instance = new DetectorFactory();
|
||||||
|
|
||||||
|
#region Constructor(s)
|
||||||
|
|
||||||
|
/// <summary>
/// Creates the factory with an empty language list and an empty n-gram probability map.
/// Private: the only instance is the static <c>_instance</c>.
/// </summary>
private DetectorFactory()
{
    this.Langlist = new List<string>();
    this.WordLangProbMap = new Dictionary<string, ProbVector>();
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
|
||||||
|
#region Public methods
|
||||||
|
|
||||||
|
/// <summary>
/// Loads every language profile embedded in this assembly (manifest resources whose name
/// contains "NLangDetect.Profiles") and registers it with the factory. The position of a
/// resource in the filtered list is used as the profile's language index.
/// </summary>
/// <param name="json">Serializer used to deserialize each profile stream.</param>
public static void LoadProfiles(IJsonSerializer json)
{
    var assembly = typeof(DetectorFactory).Assembly;

    List<string> resourceNames = assembly
        .GetManifestResourceNames()
        .Where(resourceName => resourceName.Contains("NLangDetect.Profiles"))
        .ToList();

    for (int index = 0; index < resourceNames.Count; index++)
    {
        using (Stream stream = assembly.GetManifestResourceStream(resourceNames[index]))
        {
            object deserialized = json.DeserializeFromStream(stream, typeof(LangProfile));

            AddProfile((LangProfile)deserialized, index);
        }
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Creates a detector that uses the loaded profiles and the default alpha value.
/// </summary>
/// <returns>A new detector.</returns>
public static Detector Create()
{
    Detector detector = CreateDetector();

    return detector;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Creates a detector that uses the loaded profiles and the given alpha value.
/// </summary>
/// <param name="alpha">Alpha value passed to <c>Detector.SetAlpha</c>.</param>
/// <returns>A new detector.</returns>
public static Detector Create(double alpha)
{
    var created = CreateDetector();
    created.SetAlpha(alpha);
    return created;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Sets the random seed handed to detectors created afterwards; <c>null</c> clears it.
/// </summary>
/// <param name="seed">Seed value, or <c>null</c> for none.</param>
public static void SetSeed(int? seed)
{
    DetectorFactory factory = _instance;
    factory.Seed = seed;
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
|
||||||
|
#region Internal methods
|
||||||
|
|
||||||
|
/// <summary>
/// Registers a language profile: adds its name to the language list and stores, for every
/// n-gram in the profile, the n-gram's relative frequency at position
/// <paramref name="index"/> of that n-gram's probability vector.
/// </summary>
/// <param name="profile">Profile to register.</param>
/// <param name="index">Language index under which the profile's probabilities are stored.</param>
/// <exception cref="NLangDetectException">
/// Thrown with <see cref="ErrorCode.DuplicateLangError"/> when a profile with the same name
/// has already been registered.
/// </exception>
internal static void AddProfile(LangProfile profile, int index)
{
    var lang = profile.name;

    if (_instance.Langlist.Contains(lang))
    {
        throw new NLangDetectException("duplicate the same language profile", ErrorCode.DuplicateLangError);
    }

    _instance.Langlist.Add(lang);

    foreach (string word in profile.freq.Keys)
    {
        // One lookup per word instead of ContainsKey + Add + indexer.
        ProbVector vector;

        if (!_instance.WordLangProbMap.TryGetValue(word, out vector))
        {
            vector = new ProbVector();
            _instance.WordLangProbMap.Add(word, vector);
        }

        // n_words is indexed by n-gram length - 1. Skip keys whose length has no matching
        // total rather than failing with an IndexOutOfRangeException on a malformed profile.
        int length = word.Length;

        if (length < 1 || length > profile.n_words.Length)
        {
            continue;
        }

        double prob = (double)profile.freq[word] / profile.n_words[length - 1];

        vector[index] = prob;
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Removes every registered language and every stored n-gram probability vector.
/// </summary>
internal static void Clear()
{
    DetectorFactory factory = _instance;

    factory.Langlist.Clear();
    factory.WordLangProbMap.Clear();
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
|
||||||
|
#region Private helper methods
|
||||||
|
|
||||||
|
/// <summary>
/// Creates a detector bound to the shared factory instance.
/// </summary>
/// <returns>A new detector.</returns>
/// <exception cref="NLangDetectException">
/// Thrown with <see cref="ErrorCode.NeedLoadProfileError"/> when no profile has been loaded.
/// </exception>
private static Detector CreateDetector()
{
    bool profilesLoaded = _instance.Langlist.Count != 0;

    if (profilesLoaded)
    {
        return new Detector(_instance);
    }

    throw new NLangDetectException("need to load profiles", ErrorCode.NeedLoadProfileError);
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
|
||||||
|
#region Properties
|
||||||
|
|
||||||
|
public int? Seed { get; private set; }
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,15 @@
|
||||||
|
namespace NLangDetect.Core
|
||||||
|
{
|
||||||
|
/// <summary>
/// Failure categories carried by <c>NLangDetectException</c>.
/// </summary>
public enum ErrorCode
{
    NoTextError,
    FormatError,
    FileLoadError,

    /// <summary>A language profile with the same name was registered twice.</summary>
    DuplicateLangError,

    /// <summary>A detector was requested before any profile was loaded.</summary>
    NeedLoadProfileError,

    /// <summary>The analyzed text contained no known n-gram.</summary>
    CantDetectError,

    CantOpenTrainData,
    TrainDataFormatError,

    /// <summary>An initialization parameter (such as a prior probability) was invalid.</summary>
    InitParamError
}
|
||||||
|
}
|
|
@ -0,0 +1,374 @@
|
||||||
|
using System;
|
||||||
|
|
||||||
|
namespace NLangDetect.Core.Extensions
|
||||||
|
{
|
||||||
|
public static class CharExtensions
|
||||||
|
{
|
||||||
|
private const int MIN_CODE_POINT = 0x000000;
|
||||||
|
private const int MAX_CODE_POINT = 0x10ffff;
|
||||||
|
|
||||||
|
private static readonly int[] _unicodeBlockStarts =
|
||||||
|
{
|
||||||
|
#region Unicode block starts
|
||||||
|
|
||||||
|
0x0000, // Basic Latin
|
||||||
|
0x0080, // Latin-1 Supplement
|
||||||
|
0x0100, // Latin Extended-A
|
||||||
|
0x0180, // Latin Extended-B
|
||||||
|
0x0250, // IPA Extensions
|
||||||
|
0x02B0, // Spacing Modifier Letters
|
||||||
|
0x0300, // Combining Diacritical Marks
|
||||||
|
0x0370, // Greek and Coptic
|
||||||
|
0x0400, // Cyrillic
|
||||||
|
0x0500, // Cyrillic Supplementary
|
||||||
|
0x0530, // Armenian
|
||||||
|
0x0590, // Hebrew
|
||||||
|
0x0600, // Arabic
|
||||||
|
0x0700, // Syriac
|
||||||
|
0x0750, // unassigned
|
||||||
|
0x0780, // Thaana
|
||||||
|
0x07C0, // unassigned
|
||||||
|
0x0900, // Devanagari
|
||||||
|
0x0980, // Bengali
|
||||||
|
0x0A00, // Gurmukhi
|
||||||
|
0x0A80, // Gujarati
|
||||||
|
0x0B00, // Oriya
|
||||||
|
0x0B80, // Tamil
|
||||||
|
0x0C00, // Telugu
|
||||||
|
0x0C80, // Kannada
|
||||||
|
0x0D00, // Malayalam
|
||||||
|
0x0D80, // Sinhala
|
||||||
|
0x0E00, // Thai
|
||||||
|
0x0E80, // Lao
|
||||||
|
0x0F00, // Tibetan
|
||||||
|
0x1000, // Myanmar
|
||||||
|
0x10A0, // Georgian
|
||||||
|
0x1100, // Hangul Jamo
|
||||||
|
0x1200, // Ethiopic
|
||||||
|
0x1380, // unassigned
|
||||||
|
0x13A0, // Cherokee
|
||||||
|
0x1400, // Unified Canadian Aboriginal Syllabics
|
||||||
|
0x1680, // Ogham
|
||||||
|
0x16A0, // Runic
|
||||||
|
0x1700, // Tagalog
|
||||||
|
0x1720, // Hanunoo
|
||||||
|
0x1740, // Buhid
|
||||||
|
0x1760, // Tagbanwa
|
||||||
|
0x1780, // Khmer
|
||||||
|
0x1800, // Mongolian
|
||||||
|
0x18B0, // unassigned
|
||||||
|
0x1900, // Limbu
|
||||||
|
0x1950, // Tai Le
|
||||||
|
0x1980, // unassigned
|
||||||
|
0x19E0, // Khmer Symbols
|
||||||
|
0x1A00, // unassigned
|
||||||
|
0x1D00, // Phonetic Extensions
|
||||||
|
0x1D80, // unassigned
|
||||||
|
0x1E00, // Latin Extended Additional
|
||||||
|
0x1F00, // Greek Extended
|
||||||
|
0x2000, // General Punctuation
|
||||||
|
0x2070, // Superscripts and Subscripts
|
||||||
|
0x20A0, // Currency Symbols
|
||||||
|
0x20D0, // Combining Diacritical Marks for Symbols
|
||||||
|
0x2100, // Letterlike Symbols
|
||||||
|
0x2150, // Number Forms
|
||||||
|
0x2190, // Arrows
|
||||||
|
0x2200, // Mathematical Operators
|
||||||
|
0x2300, // Miscellaneous Technical
|
||||||
|
0x2400, // Control Pictures
|
||||||
|
0x2440, // Optical Character Recognition
|
||||||
|
0x2460, // Enclosed Alphanumerics
|
||||||
|
0x2500, // Box Drawing
|
||||||
|
0x2580, // Block Elements
|
||||||
|
0x25A0, // Geometric Shapes
|
||||||
|
0x2600, // Miscellaneous Symbols
|
||||||
|
0x2700, // Dingbats
|
||||||
|
0x27C0, // Miscellaneous Mathematical Symbols-A
|
||||||
|
0x27F0, // Supplemental Arrows-A
|
||||||
|
0x2800, // Braille Patterns
|
||||||
|
0x2900, // Supplemental Arrows-B
|
||||||
|
0x2980, // Miscellaneous Mathematical Symbols-B
|
||||||
|
0x2A00, // Supplemental Mathematical Operators
|
||||||
|
0x2B00, // Miscellaneous Symbols and Arrows
|
||||||
|
0x2C00, // unassigned
|
||||||
|
0x2E80, // CJK Radicals Supplement
|
||||||
|
0x2F00, // Kangxi Radicals
|
||||||
|
0x2FE0, // unassigned
|
||||||
|
0x2FF0, // Ideographic Description Characters
|
||||||
|
0x3000, // CJK Symbols and Punctuation
|
||||||
|
0x3040, // Hiragana
|
||||||
|
0x30A0, // Katakana
|
||||||
|
0x3100, // Bopomofo
|
||||||
|
0x3130, // Hangul Compatibility Jamo
|
||||||
|
0x3190, // Kanbun
|
||||||
|
0x31A0, // Bopomofo Extended
|
||||||
|
0x31C0, // unassigned
|
||||||
|
0x31F0, // Katakana Phonetic Extensions
|
||||||
|
0x3200, // Enclosed CJK Letters and Months
|
||||||
|
0x3300, // CJK Compatibility
|
||||||
|
0x3400, // CJK Unified Ideographs Extension A
|
||||||
|
0x4DC0, // Yijing Hexagram Symbols
|
||||||
|
0x4E00, // CJK Unified Ideographs
|
||||||
|
0xA000, // Yi Syllables
|
||||||
|
0xA490, // Yi Radicals
|
||||||
|
0xA4D0, // unassigned
|
||||||
|
0xAC00, // Hangul Syllables
|
||||||
|
0xD7B0, // unassigned
|
||||||
|
0xD800, // High Surrogates
|
||||||
|
0xDB80, // High Private Use Surrogates
|
||||||
|
0xDC00, // Low Surrogates
|
||||||
|
0xE000, // Private Use
|
||||||
|
0xF900, // CJK Compatibility Ideographs
|
||||||
|
0xFB00, // Alphabetic Presentation Forms
|
||||||
|
0xFB50, // Arabic Presentation Forms-A
|
||||||
|
0xFE00, // Variation Selectors
|
||||||
|
0xFE10, // unassigned
|
||||||
|
0xFE20, // Combining Half Marks
|
||||||
|
0xFE30, // CJK Compatibility Forms
|
||||||
|
0xFE50, // Small Form Variants
|
||||||
|
0xFE70, // Arabic Presentation Forms-B
|
||||||
|
0xFF00, // Halfwidth and Fullwidth Forms
|
||||||
|
0xFFF0, // Specials
|
||||||
|
0x10000, // Linear B Syllabary
|
||||||
|
0x10080, // Linear B Ideograms
|
||||||
|
0x10100, // Aegean Numbers
|
||||||
|
0x10140, // unassigned
|
||||||
|
0x10300, // Old Italic
|
||||||
|
0x10330, // Gothic
|
||||||
|
0x10350, // unassigned
|
||||||
|
0x10380, // Ugaritic
|
||||||
|
0x103A0, // unassigned
|
||||||
|
0x10400, // Deseret
|
||||||
|
0x10450, // Shavian
|
||||||
|
0x10480, // Osmanya
|
||||||
|
0x104B0, // unassigned
|
||||||
|
0x10800, // Cypriot Syllabary
|
||||||
|
0x10840, // unassigned
|
||||||
|
0x1D000, // Byzantine Musical Symbols
|
||||||
|
0x1D100, // Musical Symbols
|
||||||
|
0x1D200, // unassigned
|
||||||
|
0x1D300, // Tai Xuan Jing Symbols
|
||||||
|
0x1D360, // unassigned
|
||||||
|
0x1D400, // Mathematical Alphanumeric Symbols
|
||||||
|
0x1D800, // unassigned
|
||||||
|
0x20000, // CJK Unified Ideographs Extension B
|
||||||
|
0x2A6E0, // unassigned
|
||||||
|
0x2F800, // CJK Compatibility Ideographs Supplement
|
||||||
|
0x2FA20, // unassigned
|
||||||
|
0xE0000, // Tags
|
||||||
|
0xE0080, // unassigned
|
||||||
|
0xE0100, // Variation Selectors Supplement
|
||||||
|
0xE01F0, // unassigned
|
||||||
|
0xF0000, // Supplementary Private Use Area-A
|
||||||
|
0x100000, // Supplementary Private Use Area-B
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
};
|
||||||
|
|
||||||
|
private static readonly UnicodeBlock?[] _unicodeBlocks =
|
||||||
|
{
|
||||||
|
#region Unicode blocks
|
||||||
|
UnicodeBlock.BasicLatin,
|
||||||
|
UnicodeBlock.Latin1Supplement,
|
||||||
|
UnicodeBlock.LatinExtendedA,
|
||||||
|
UnicodeBlock.LatinExtendedB,
|
||||||
|
UnicodeBlock.IpaExtensions,
|
||||||
|
UnicodeBlock.SpacingModifierLetters,
|
||||||
|
UnicodeBlock.CombiningDiacriticalMarks,
|
||||||
|
UnicodeBlock.Greek,
|
||||||
|
UnicodeBlock.Cyrillic,
|
||||||
|
UnicodeBlock.CyrillicSupplementary,
|
||||||
|
UnicodeBlock.Armenian,
|
||||||
|
UnicodeBlock.Hebrew,
|
||||||
|
UnicodeBlock.Arabic,
|
||||||
|
UnicodeBlock.Syriac,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.Thaana,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.Devanagari,
|
||||||
|
UnicodeBlock.Bengali,
|
||||||
|
UnicodeBlock.Gurmukhi,
|
||||||
|
UnicodeBlock.Gujarati,
|
||||||
|
UnicodeBlock.Oriya,
|
||||||
|
UnicodeBlock.Tamil,
|
||||||
|
UnicodeBlock.Telugu,
|
||||||
|
UnicodeBlock.Kannada,
|
||||||
|
UnicodeBlock.Malayalam,
|
||||||
|
UnicodeBlock.Sinhala,
|
||||||
|
UnicodeBlock.Thai,
|
||||||
|
UnicodeBlock.Lao,
|
||||||
|
UnicodeBlock.Tibetan,
|
||||||
|
UnicodeBlock.Myanmar,
|
||||||
|
UnicodeBlock.Georgian,
|
||||||
|
UnicodeBlock.HangulJamo,
|
||||||
|
UnicodeBlock.Ethiopic,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.Cherokee,
|
||||||
|
UnicodeBlock.UnifiedCanadianAboriginalSyllabics,
|
||||||
|
UnicodeBlock.Ogham,
|
||||||
|
UnicodeBlock.Runic,
|
||||||
|
UnicodeBlock.Tagalog,
|
||||||
|
UnicodeBlock.Hanunoo,
|
||||||
|
UnicodeBlock.Buhid,
|
||||||
|
UnicodeBlock.Tagbanwa,
|
||||||
|
UnicodeBlock.Khmer,
|
||||||
|
UnicodeBlock.Mongolian,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.Limbu,
|
||||||
|
UnicodeBlock.TaiLe,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.KhmerSymbols,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.PhoneticExtensions,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.LatinExtendedAdditional,
|
||||||
|
UnicodeBlock.GreekExtended,
|
||||||
|
UnicodeBlock.GeneralPunctuation,
|
||||||
|
UnicodeBlock.SuperscriptsAndSubscripts,
|
||||||
|
UnicodeBlock.CurrencySymbols,
|
||||||
|
UnicodeBlock.CombiningMarksForSymbols,
|
||||||
|
UnicodeBlock.LetterlikeSymbols,
|
||||||
|
UnicodeBlock.NumberForms,
|
||||||
|
UnicodeBlock.Arrows,
|
||||||
|
UnicodeBlock.MathematicalOperators,
|
||||||
|
UnicodeBlock.MiscellaneousTechnical,
|
||||||
|
UnicodeBlock.ControlPictures,
|
||||||
|
UnicodeBlock.OpticalCharacterRecognition,
|
||||||
|
UnicodeBlock.EnclosedAlphanumerics,
|
||||||
|
UnicodeBlock.BoxDrawing,
|
||||||
|
UnicodeBlock.BlockElements,
|
||||||
|
UnicodeBlock.GeometricShapes,
|
||||||
|
UnicodeBlock.MiscellaneousSymbols,
|
||||||
|
UnicodeBlock.Dingbats,
|
||||||
|
UnicodeBlock.MiscellaneousMathematicalSymbolsA,
|
||||||
|
UnicodeBlock.SupplementalArrowsA,
|
||||||
|
UnicodeBlock.BraillePatterns,
|
||||||
|
UnicodeBlock.SupplementalArrowsB,
|
||||||
|
UnicodeBlock.MiscellaneousMathematicalSymbolsB,
|
||||||
|
UnicodeBlock.SupplementalMathematicalOperators,
|
||||||
|
UnicodeBlock.MiscellaneousSymbolsAndArrows,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.CjkRadicalsSupplement,
|
||||||
|
UnicodeBlock.KangxiRadicals,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.IdeographicDescriptionCharacters,
|
||||||
|
UnicodeBlock.CjkSymbolsAndPunctuation,
|
||||||
|
UnicodeBlock.Hiragana,
|
||||||
|
UnicodeBlock.Katakana,
|
||||||
|
UnicodeBlock.Bopomofo,
|
||||||
|
UnicodeBlock.HangulCompatibilityJamo,
|
||||||
|
UnicodeBlock.Kanbun,
|
||||||
|
UnicodeBlock.BopomofoExtended,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.KatakanaPhoneticExtensions,
|
||||||
|
UnicodeBlock.EnclosedCjkLettersAndMonths,
|
||||||
|
UnicodeBlock.CjkCompatibility,
|
||||||
|
UnicodeBlock.CjkUnifiedIdeographsExtensionA,
|
||||||
|
UnicodeBlock.YijingHexagramSymbols,
|
||||||
|
UnicodeBlock.CjkUnifiedIdeographs,
|
||||||
|
UnicodeBlock.YiSyllables,
|
||||||
|
UnicodeBlock.YiRadicals,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.HangulSyllables,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.HighSurrogates,
|
||||||
|
UnicodeBlock.HighPrivateUseSurrogates,
|
||||||
|
UnicodeBlock.LowSurrogates,
|
||||||
|
UnicodeBlock.PrivateUseArea,
|
||||||
|
UnicodeBlock.CjkCompatibilityIdeographs,
|
||||||
|
UnicodeBlock.AlphabeticPresentationForms,
|
||||||
|
UnicodeBlock.ArabicPresentationFormsA,
|
||||||
|
UnicodeBlock.VariationSelectors,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.CombiningHalfMarks,
|
||||||
|
UnicodeBlock.CjkCompatibilityForms,
|
||||||
|
UnicodeBlock.SmallFormVariants,
|
||||||
|
UnicodeBlock.ArabicPresentationFormsB,
|
||||||
|
UnicodeBlock.HalfwidthAndFullwidthForms,
|
||||||
|
UnicodeBlock.Specials,
|
||||||
|
UnicodeBlock.LinearBSyllabary,
|
||||||
|
UnicodeBlock.LinearBIdeograms,
|
||||||
|
UnicodeBlock.AegeanNumbers,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.OldItalic,
|
||||||
|
UnicodeBlock.Gothic,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.Ugaritic,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.Deseret,
|
||||||
|
UnicodeBlock.Shavian,
|
||||||
|
UnicodeBlock.Osmanya,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.CypriotSyllabary,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.ByzantineMusicalSymbols,
|
||||||
|
UnicodeBlock.MusicalSymbols,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.TaiXuanJingSymbols,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.MathematicalAlphanumericSymbols,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.CjkUnifiedIdeographsExtensionB,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.CjkCompatibilityIdeographsSupplement,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.Tags,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.VariationSelectorsSupplement,
|
||||||
|
null,
|
||||||
|
UnicodeBlock.SupplementaryPrivateUseAreaA,
|
||||||
|
UnicodeBlock.SupplementaryPrivateUseAreaB,
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
};
|
||||||
|
|
||||||
|
#region Public methods
|
||||||
|
|
||||||
|
/// <summary>
/// Finds the block-table entry for a character by binary-searching the table of block start values.
/// </summary>
/// <remarks>
/// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
/// </remarks>
/// <param name="ch">The character to classify.</param>
/// <returns>The entry of <c>_unicodeBlocks</c> selected by the search; the return type is nullable, so the result may be <c>null</c>.</returns>
/// <exception cref="ArgumentException">The value does not pass <c>IsValidCodePoint</c>.</exception>
public static UnicodeBlock? GetUnicodeBlock(this char ch)
{
    int codePoint = ch;

    if (!IsValidCodePoint(codePoint))
    {
        throw new ArgumentException("Argument is not a valid code point.", "ch");
    }

    int low = 0;
    int high = _unicodeBlockStarts.Length;
    int probe = high / 2;

    // Keeps high > probe >= low and codePoint >= _unicodeBlockStarts[low];
    // the window [low, high) is halved until a single slot remains.
    while (high - low > 1)
    {
        if (codePoint < _unicodeBlockStarts[probe])
        {
            high = probe;
        }
        else
        {
            low = probe;
        }

        probe = (high + low) / 2;
    }

    return _unicodeBlocks[probe];
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
|
||||||
|
#region Private helper methods
|
||||||
|
|
||||||
|
/// <summary>
/// Tells whether a value lies in the inclusive range <c>MIN_CODE_POINT</c>..<c>MAX_CODE_POINT</c>.
/// </summary>
/// <param name="codePoint">The value to check.</param>
/// <returns><c>true</c> when the value is inside the range, otherwise <c>false</c>.</returns>
private static bool IsValidCodePoint(int codePoint)
{
    if (codePoint < MIN_CODE_POINT)
    {
        return false;
    }

    return codePoint <= MAX_CODE_POINT;
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,51 @@
|
||||||
|
using System;
|
||||||
|
|
||||||
|
namespace NLangDetect.Core.Extensions
|
||||||
|
{
|
||||||
|
/// <summary>
/// Extension methods for <see cref="Random"/>.
/// </summary>
public static class RandomExtensions
{
    private const double _Epsilon = 2.22044604925031E-15;

    // One cache slot per Random instance. The polar method yields two values per
    // round; the second one must be handed back only to the generator that
    // produced it, otherwise a freshly seeded Random would return a value that
    // came from some other generator. Entries are held weakly, so they do not
    // keep the generator alive.
    private static readonly System.Runtime.CompilerServices.ConditionalWeakTable<Random, GaussianState> _states =
        new System.Runtime.CompilerServices.ConditionalWeakTable<Random, GaussianState>();

    /// <summary>
    /// Returns the next pseudorandom, Gaussian ("normally") distributed double value with mean 0.0 and standard deviation 1.0 from this random number generator's sequence.
    /// The general contract of nextGaussian is that one double value, chosen from (approximately) the usual normal distribution with mean 0.0 and standard deviation 1.0, is pseudorandomly generated and returned.
    /// </summary>
    /// <remarks>
    /// Taken from: http://download.oracle.com/javase/6/docs/api/java/util/Random.html (nextGaussian())
    /// </remarks>
    /// <param name="random">The generator to draw uniform values from.</param>
    /// <returns>A normally distributed value.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="random"/> is <c>null</c>.</exception>
    public static double NextGaussian(this Random random)
    {
        if (random == null)
        {
            throw new ArgumentNullException("random");
        }

        GaussianState state = _states.GetOrCreateValue(random);

        // The lock covers both the cache slot and the draws from 'random',
        // so concurrent calls on the same generator are serialized.
        lock (state)
        {
            if (state.HasSpare)
            {
                state.HasSpare = false;

                return state.Spare;
            }

            double v1, v2, s;

            do
            {
                v1 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
                v2 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
                s = v1 * v1 + v2 * v2;
            }
            while (s >= 1.0 || Math.Abs(s - 0.0) < _Epsilon); // reject points outside the unit circle or (almost) at the origin

            double multiplier = Math.Sqrt(-2.0 * Math.Log(s) / s);

            // Keep the second value of the pair for the next call on this generator.
            state.Spare = v2 * multiplier;
            state.HasSpare = true;

            return v1 * multiplier;
        }
    }

    /// <summary>
    /// Spare value left over from the previous polar-method round of one generator.
    /// </summary>
    private sealed class GaussianState
    {
        public double Spare;
        public bool HasSpare;
    }
}
|
||||||
|
}
|
|
@ -0,0 +1,25 @@
|
||||||
|
using System;
|
||||||
|
|
||||||
|
namespace NLangDetect.Core.Extensions
|
||||||
|
{
|
||||||
|
/// <summary>
/// Extension methods for <see cref="string"/>.
/// </summary>
public static class StringExtensions
{
    /// <summary>
    /// Returns a new character sequence that is a subsequence of this sequence. The subsequence starts with the character at the specified index and ends with the character at index end - 1. The length of the returned sequence is end - start, so if start == end then an empty sequence is returned.
    /// </summary>
    /// <param name="s">the string to take the subsequence from</param>
    /// <param name="start">the start index, inclusive</param>
    /// <param name="end">the end index, exclusive</param>
    /// <returns>the specified subsequence</returns>
    /// <exception cref="ArgumentNullException">if <paramref name="s"/> is null</exception>
    /// <exception cref="ArgumentOutOfRangeException">if start or end are negative, if end is greater than the string's length, or if start is greater than end</exception>
    public static string SubSequence(this string s, int start, int end)
    {
        // An extension method can be invoked on a null reference; report that as an argument error.
        if (s == null) throw new ArgumentNullException("s");
        if (start < 0) throw new ArgumentOutOfRangeException("start", "Argument must not be negative.");
        if (end < 0) throw new ArgumentOutOfRangeException("end", "Argument must not be negative.");
        if (end > s.Length) throw new ArgumentOutOfRangeException("end", "Argument must not be greater than the input string's length.");
        if (start > end) throw new ArgumentOutOfRangeException("start", "Argument must not be greater than the 'end' argument.");

        // Substring takes a length, SubSequence takes an exclusive end index.
        return s.Substring(start, end - start);
    }
}
|
||||||
|
}
|
|
@ -0,0 +1,131 @@
|
||||||
|
namespace NLangDetect.Core.Extensions
|
||||||
|
{
|
||||||
|
/// <summary>
/// Names of Unicode blocks. Members carry implicit ordinals (0, 1, 2, ...) in
/// declaration order, so the order below must not be changed.
/// </summary>
public enum UnicodeBlock
{
    // Five members per row; the trailing comment gives the ordinal of the first member in the row.
    BasicLatin, Latin1Supplement, LatinExtendedA, LatinExtendedB, IpaExtensions, // 0
    SpacingModifierLetters, CombiningDiacriticalMarks, Greek, Cyrillic, CyrillicSupplementary, // 5
    Armenian, Hebrew, Arabic, Syriac, Thaana, // 10
    Devanagari, Bengali, Gurmukhi, Gujarati, Oriya, // 15
    Tamil, Telugu, Kannada, Malayalam, Sinhala, // 20
    Thai, Lao, Tibetan, Myanmar, Georgian, // 25
    HangulJamo, Ethiopic, Cherokee, UnifiedCanadianAboriginalSyllabics, Ogham, // 30
    Runic, Tagalog, Hanunoo, Buhid, Tagbanwa, // 35
    Khmer, Mongolian, Limbu, TaiLe, KhmerSymbols, // 40
    PhoneticExtensions, LatinExtendedAdditional, GreekExtended, GeneralPunctuation, SuperscriptsAndSubscripts, // 45
    CurrencySymbols, CombiningMarksForSymbols, LetterlikeSymbols, NumberForms, Arrows, // 50
    MathematicalOperators, MiscellaneousTechnical, ControlPictures, OpticalCharacterRecognition, EnclosedAlphanumerics, // 55
    BoxDrawing, BlockElements, GeometricShapes, MiscellaneousSymbols, Dingbats, // 60
    MiscellaneousMathematicalSymbolsA, SupplementalArrowsA, BraillePatterns, SupplementalArrowsB, MiscellaneousMathematicalSymbolsB, // 65
    SupplementalMathematicalOperators, MiscellaneousSymbolsAndArrows, CjkRadicalsSupplement, KangxiRadicals, IdeographicDescriptionCharacters, // 70
    CjkSymbolsAndPunctuation, Hiragana, Katakana, Bopomofo, HangulCompatibilityJamo, // 75
    Kanbun, BopomofoExtended, KatakanaPhoneticExtensions, EnclosedCjkLettersAndMonths, CjkCompatibility, // 80
    CjkUnifiedIdeographsExtensionA, YijingHexagramSymbols, CjkUnifiedIdeographs, YiSyllables, YiRadicals, // 85
    HangulSyllables, HighSurrogates, HighPrivateUseSurrogates, LowSurrogates, PrivateUseArea, // 90
    CjkCompatibilityIdeographs, AlphabeticPresentationForms, ArabicPresentationFormsA, VariationSelectors, CombiningHalfMarks, // 95
    CjkCompatibilityForms, SmallFormVariants, ArabicPresentationFormsB, HalfwidthAndFullwidthForms, Specials, // 100
    LinearBSyllabary, LinearBIdeograms, AegeanNumbers, OldItalic, Gothic, // 105
    Ugaritic, Deseret, Shavian, Osmanya, CypriotSyllabary, // 110
    ByzantineMusicalSymbols, MusicalSymbols, TaiXuanJingSymbols, MathematicalAlphanumericSymbols, CjkUnifiedIdeographsExtensionB, // 115
    CjkCompatibilityIdeographsSupplement, Tags, VariationSelectorsSupplement, SupplementaryPrivateUseAreaA, SupplementaryPrivateUseAreaB, // 120
}
|
||||||
|
}
|
|
@ -0,0 +1,67 @@
|
||||||
|
using System;
|
||||||
|
using System.IO.Compression;
|
||||||
|
using System.Xml;
|
||||||
|
using NLangDetect.Core.Utils;
|
||||||
|
using System.IO;
|
||||||
|
|
||||||
|
namespace NLangDetect.Core
|
||||||
|
{
|
||||||
|
// TODO IMM HI: xml reader not tested
|
||||||
|
/// <summary>
/// Builds a <see cref="LangProfile"/> from an XML file, optionally gzip-compressed.
/// </summary>
public static class GenProfile
{
    #region Public methods

    /// <summary>
    /// Streams the XML in <paramref name="file"/> through a <see cref="TagExtractor"/> created for the
    /// "abstract" tag and lets it populate a new profile named <paramref name="lang"/>.
    /// </summary>
    /// <param name="lang">Name given to the created profile.</param>
    /// <param name="file">Path of the XML file; a ".gz" extension (any casing) turns on gzip decompression.</param>
    /// <returns>The profile passed to the extractor on every end-element event.</returns>
    public static LangProfile load(string lang, string file)
    {
        LangProfile profile = new LangProfile(lang);
        // NOTE(review): the meaning of the second argument (100) is defined by TagExtractor, which is not visible here.
        TagExtractor tagextractor = new TagExtractor("abstract", 100);

        using (Stream fileStream = File.OpenRead(file))
        {
            string extension = Path.GetExtension(file) ?? "";

            // File extensions are identifiers, not linguistic text: compare ordinally instead of via culture-sensitive ToUpper().
            bool isGzip = string.Equals(extension, ".gz", StringComparison.OrdinalIgnoreCase);

            using (Stream inputStream = isGzip ? new GZipStream(fileStream, CompressionMode.Decompress) : fileStream)
            using (XmlReader xmlReader = XmlReader.Create(inputStream))
            {
                while (xmlReader.Read())
                {
                    switch (xmlReader.NodeType)
                    {
                        case XmlNodeType.Element:
                            tagextractor.SetTag(xmlReader.Name);
                            break;

                        case XmlNodeType.Text:
                            tagextractor.Add(xmlReader.Value);
                            break;

                        case XmlNodeType.EndElement:
                            tagextractor.CloseTag(profile);
                            break;
                    }
                }
            }
        }

        Console.WriteLine(lang + ": " + tagextractor.Count);

        return profile;
    }

    #endregion
}
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
using System;
|
||||||
|
|
||||||
|
namespace NLangDetect.Core
|
||||||
|
{
|
||||||
|
/// <summary>
/// Exception type with a message and an optional inner exception.
/// </summary>
[Serializable]
public class InternalException : Exception
{
    #region Constructor(s)

    /// <summary>
    /// Creates the exception with a message only; the inner exception is left <c>null</c>.
    /// </summary>
    /// <param name="message">Text describing the failure.</param>
    public InternalException(string message)
        : base(message, null)
    {
    }

    /// <summary>
    /// Creates the exception with a message and the exception that caused it.
    /// </summary>
    /// <param name="message">Text describing the failure.</param>
    /// <param name="innerException">The underlying cause, or <c>null</c>.</param>
    public InternalException(string message, Exception innerException)
        : base(message, innerException)
    {
    }

    #endregion
}
|
||||||
|
}
|
|
@ -0,0 +1,45 @@
|
||||||
|
using System.Globalization;
|
||||||
|
|
||||||
|
namespace NLangDetect.Core
|
||||||
|
{
|
||||||
|
// TODO IMM HI: name??
|
||||||
|
/// <summary>
/// A language name together with a probability value.
/// </summary>
public class Language
{
    #region Properties

    /// <summary>Language name; may be <c>null</c>.</summary>
    public string Name { get; set; }

    /// <summary>Probability value stored for <see cref="Name"/>.</summary>
    public double Probability { get; set; }

    #endregion

    #region Constructor(s)

    /// <summary>
    /// Stores both values unchanged.
    /// </summary>
    public Language(string name, double probability)
    {
        this.Name = name;
        this.Probability = probability;
    }

    #endregion

    #region Object overrides

    /// <summary>
    /// Formats the instance as "name:probability" with six decimal places, using
    /// invariant-culture number formatting; yields an empty string when the name is <c>null</c>.
    /// </summary>
    public override string ToString()
    {
        string label = this.Name;

        if (label != null)
        {
            string formattedProbability =
                this.Probability.ToString("0.000000", CultureInfo.InvariantCulture.NumberFormat);

            return label + ":" + formattedProbability;
        }

        return "";
    }

    #endregion
}
|
||||||
|
}
|
|
@ -0,0 +1,37 @@
|
||||||
|
using System;
|
||||||
|
using MediaBrowser.Model.Serialization;
|
||||||
|
|
||||||
|
namespace NLangDetect.Core
|
||||||
|
{
|
||||||
|
// TODO IMM HI: change to non-static class
|
||||||
|
// TODO IMM HI: hide other, unnecessary classes via internal?
|
||||||
|
/// <summary>
/// Static entry point that wraps <c>DetectorFactory</c> and <c>Detector</c>.
/// </summary>
public static class LanguageDetector
{
    private const double _DefaultAlpha = 0.5;

    #region Public methods

    /// <summary>
    /// Forwards to <c>DetectorFactory.LoadProfiles</c>.
    /// </summary>
    /// <param name="json">Serializer handed to the factory.</param>
    public static void Initialize(IJsonSerializer json)
    {
        DetectorFactory.LoadProfiles(json);
    }

    /// <summary>
    /// Forwards to <c>DetectorFactory.Clear</c>.
    /// </summary>
    public static void Release()
    {
        DetectorFactory.Clear();
    }

    /// <summary>
    /// Creates a detector with the default alpha (0.5), appends the text and returns the detector's result.
    /// </summary>
    /// <param name="plainText">Text to analyse; must be neither <c>null</c> nor empty.</param>
    /// <returns>Whatever <c>Detector.Detect</c> returns for the appended text.</returns>
    /// <exception cref="ArgumentException"><paramref name="plainText"/> is <c>null</c> or empty.</exception>
    public static string DetectLanguage(string plainText)
    {
        bool hasText = plainText != null && plainText.Length != 0;

        if (!hasText)
        {
            throw new ArgumentException("Argument can't be null nor empty.", "plainText");
        }

        Detector worker = DetectorFactory.Create(_DefaultAlpha);
        worker.Append(plainText);

        return worker.Detect();
    }

    #endregion
}
|
||||||
|
}
|
|
@ -0,0 +1,23 @@
|
||||||
|
using System;
|
||||||
|
|
||||||
|
namespace NLangDetect.Core
|
||||||
|
{
|
||||||
|
/// <summary>
/// Exception that carries an error code in addition to its message.
/// </summary>
public class NLangDetectException : Exception
{
    #region Properties

    /// <summary>Error code supplied at construction time; not settable from outside the class.</summary>
    public ErrorCode ErrorCode { get; private set; }

    #endregion

    #region Constructor(s)

    /// <summary>
    /// Creates the exception from a message and an error code.
    /// </summary>
    /// <param name="message">Text passed on to <see cref="Exception"/>.</param>
    /// <param name="errorCode">Code exposed through the <c>ErrorCode</c> property.</param>
    public NLangDetectException(string message, ErrorCode errorCode)
        : base(message)
    {
        this.ErrorCode = errorCode;
    }

    #endregion
}
|
||||||
|
}
|
|
@ -0,0 +1,35 @@
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
|
||||||
|
namespace NLangDetect.Core
|
||||||
|
{
|
||||||
|
/// <summary>
/// Sparse vector of doubles indexed by int: only entries that are not zero are stored.
/// </summary>
public class ProbVector
{
    private readonly Dictionary<int, double> _dict = new Dictionary<int, double>();

    /// <summary>
    /// Reads or writes one component. Reading a key that was never stored yields 0.0;
    /// writing a value whose magnitude is below <see cref="double.Epsilon"/> drops the stored entry.
    /// </summary>
    public double this[int key]
    {
        get
        {
            double stored;

            if (_dict.TryGetValue(key, out stored))
            {
                return stored;
            }

            return 0.0;
        }

        set
        {
            bool isZero = Math.Abs(value) < double.Epsilon;

            if (isZero)
            {
                // Remove simply reports false for a missing key, so no prior lookup is required.
                _dict.Remove(key);
            }
            else
            {
                _dict[key] = value;
            }
        }
    }
}
|
||||||
|
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,118 @@
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Text.RegularExpressions;
|
||||||
|
|
||||||
|
namespace NLangDetect.Core.Utils
|
||||||
|
{
|
||||||
|
/// <summary>
/// N-gram frequency table for one language: per-gram counts in <see cref="freq"/> and,
/// per gram length, the total number of counted grams in <see cref="n_words"/>.
/// </summary>
public class LangProfile
{
    private const int MinimumFreq = 2;
    private const int LessFreqRatio = 100000;

    // Built once: constructing a RegexOptions.Compiled regex is expensive, and
    // Regex instances are safe to share for matching.
    private static readonly Regex SingleRomanLetterRegex = new Regex("^[A-Za-z]$", RegexOptions.Compiled);
    private static readonly Regex AnyRomanLetterRegex = new Regex(".*[A-Za-z].*", RegexOptions.Compiled);

    /// <summary>Profile name; <see cref="Add"/> and <see cref="OmitLessFreq"/> do nothing while it is <c>null</c>.</summary>
    public string name { get; set; }

    /// <summary>Occurrence count per gram.</summary>
    public Dictionary<string, int> freq { get; set; }

    /// <summary>Total count per gram length; index 0 holds grams of length 1.</summary>
    public int[] n_words { get; set; }

    #region Constructor(s)

    /// <summary>
    /// Creates an unnamed profile with empty tables.
    /// </summary>
    public LangProfile()
    {
        freq = new Dictionary<string, int>();
        n_words = new int[NGram.GramsCount];
    }

    /// <summary>
    /// Creates a named profile with empty tables.
    /// </summary>
    /// <param name="name">Profile name.</param>
    public LangProfile(string name)
        : this()
    {
        this.name = name;
    }

    #endregion

    #region Public methods

    /// <summary>
    /// Counts one occurrence of <paramref name="gram"/>. Ignored when the profile has no name,
    /// the gram is <c>null</c>, or its length is outside 1..<c>NGram.GramsCount</c>.
    /// </summary>
    /// <param name="gram">The n-gram to count.</param>
    public void Add(string gram)
    {
        if (name == null || gram == null) return; // Illegal
        int len = gram.Length;
        if (len < 1 || len > NGram.GramsCount) return; // Illegal

        n_words[len - 1]++;

        // 'seen' stays 0 when the gram is new, so a single lookup covers both cases.
        int seen;
        freq.TryGetValue(gram, out seen);
        freq[gram] = seen + 1;
    }

    /// <summary>
    /// Prunes the table in two passes. First, every gram whose count is at or below
    /// max(<c>MinimumFreq</c>, n_words[0] / <c>LessFreqRatio</c>) is removed. Second, if the
    /// surviving single-letter grams A-Z/a-z add up to less than a third of n_words[0],
    /// every gram containing such a letter is removed too. Removed counts are subtracted
    /// from <see cref="n_words"/>. Does nothing while the profile has no name.
    /// </summary>
    public void OmitLessFreq()
    {
        if (name == null) return; // Illegal

        int threshold = n_words[0] / LessFreqRatio;
        if (threshold < MinimumFreq) threshold = MinimumFreq;

        int roman = 0;
        List<string> keysToRemove = new List<string>();

        // Collect first, remove afterwards: the dictionary must not change while it is enumerated.
        foreach (KeyValuePair<string, int> entry in freq)
        {
            if (entry.Value <= threshold)
            {
                n_words[entry.Key.Length - 1] -= entry.Value;
                keysToRemove.Add(entry.Key);
            }
            else if (SingleRomanLetterRegex.IsMatch(entry.Key))
            {
                roman += entry.Value;
            }
        }

        foreach (string keyToRemove in keysToRemove)
        {
            freq.Remove(keyToRemove);
        }

        // roman check (uses n_words[0] as already reduced by the first pass)
        if (roman < n_words[0] / 3)
        {
            keysToRemove = new List<string>();

            foreach (KeyValuePair<string, int> entry in freq)
            {
                if (AnyRomanLetterRegex.IsMatch(entry.Key))
                {
                    n_words[entry.Key.Length - 1] -= entry.Value;
                    keysToRemove.Add(entry.Key);
                }
            }

            foreach (string keyToRemove in keysToRemove)
            {
                freq.Remove(keyToRemove);
            }
        }
    }

    #endregion
}
|
||||||
|
}
|
|
@ -0,0 +1,91 @@
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Globalization;
|
||||||
|
using System.IO;
|
||||||
|
using System.Reflection;
|
||||||
|
using System.Text.RegularExpressions;
|
||||||
|
using System.Linq;
|
||||||
|
using System;
|
||||||
|
|
||||||
|
namespace NLangDetect.Core.Utils
|
||||||
|
{
|
||||||
|
/// <summary>
/// Key/value lookup backed by the embedded resource whose name contains "messages.properties".
/// The resource is read once, in the static constructor.
/// </summary>
public static class Messages
{
    private static readonly Dictionary<string, string> _messages;

    static Messages()
    {
        _messages = LoadMessages();
    }

    /// <summary>
    /// Returns the value stored for <paramref name="key"/>, or the key wrapped in exclamation
    /// marks ("!key!") when no such entry exists.
    /// </summary>
    /// <param name="key">Entry name as written in the resource.</param>
    public static string getString(string key)
    {
        string value;

        return
          _messages.TryGetValue(key, out value)
            ? value
            : string.Format("!{0}!", key);
    }

    /// <summary>
    /// Reads all "key=value" lines of the embedded resource into a dictionary.
    /// Empty lines are skipped; \uXXXX escapes in values are decoded.
    /// </summary>
    /// <exception cref="InternalException">The resource is missing or a line has no '=' separator.</exception>
    private static Dictionary<string, string> LoadMessages()
    {
        var assembly = typeof(Messages).Assembly;
        var manifestName = assembly.GetManifestResourceNames().FirstOrDefault(i => i.IndexOf("messages.properties", StringComparison.Ordinal) != -1);

        // GetManifestResourceStream rejects a null name with ArgumentNullException,
        // so report the missing resource explicitly before calling it.
        if (manifestName == null)
        {
            throw new InternalException("Couldn't find an embedded resource whose name contains 'messages.properties'.");
        }

        Stream messagesStream = assembly.GetManifestResourceStream(manifestName);

        if (messagesStream == null)
        {
            throw new InternalException(string.Format("Couldn't get embedded resource named '{0}'.", manifestName));
        }

        using (messagesStream)
        using (var sr = new StreamReader(messagesStream))
        {
            var messages = new Dictionary<string, string>();
            string line;

            while ((line = sr.ReadLine()) != null)
            {
                if (line.Length == 0)
                {
                    continue;
                }

                // Split on the first '=' only, so values may themselves contain '='.
                string[] keyValue = line.Split(new[] { '=' }, 2);

                if (keyValue.Length != 2)
                {
                    throw new InternalException(string.Format("Invalid format of the 'Messages.properties' resource. Offending line: '{0}'.", line.Trim()));
                }

                string key = keyValue[0];
                string value = UnescapeUnicodeString(keyValue[1]);

                messages.Add(key, value);
            }

            return messages;
        }
    }

    /// <summary>
    /// Replaces every \uXXXX escape (four hexadecimal digits) with the character it denotes.
    /// Sequences that are not four hex digits are left untouched.
    /// </summary>
    /// <remarks>
    /// Taken from: http://stackoverflow.com/questions/1615559/converting-unicode-strings-to-escaped-ascii-string/1615860#1615860
    /// </remarks>
    /// <param name="s">Text that may contain escapes; <c>null</c> is returned unchanged.</param>
    private static string UnescapeUnicodeString(string s)
    {
        if (s == null)
        {
            return null;
        }

        return
          Regex.Replace(
            s,
            @"\\u(?<Value>[0-9a-fA-F]{4})",
            match => ((char)int.Parse(match.Groups["Value"].Value, NumberStyles.HexNumber, CultureInfo.InvariantCulture)).ToString());
    }
}
|
||||||
|
}
|
|
@ -0,0 +1,330 @@
|
||||||
|
// TODO IMM HI: check which classes can be made internal?
|
||||||
|
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Text;
|
||||||
|
using NLangDetect.Core.Extensions;
|
||||||
|
|
||||||
|
namespace NLangDetect.Core.Utils
|
||||||
|
{
|
||||||
|
public class NGram
|
||||||
|
{
|
||||||
|
public const int GramsCount = 3;
|
||||||
|
|
||||||
|
private static readonly string Latin1Excluded = Messages.getString("NGram.LATIN1_EXCLUDE");
|
||||||
|
|
||||||
|
private static readonly string[] CjkClass =
|
||||||
|
{
|
||||||
|
#region CJK classes
|
||||||
|
|
||||||
|
Messages.getString("NGram.KANJI_1_0"),
|
||||||
|
Messages.getString("NGram.KANJI_1_2"),
|
||||||
|
Messages.getString("NGram.KANJI_1_4"),
|
||||||
|
Messages.getString("NGram.KANJI_1_8"),
|
||||||
|
Messages.getString("NGram.KANJI_1_11"),
|
||||||
|
Messages.getString("NGram.KANJI_1_12"),
|
||||||
|
Messages.getString("NGram.KANJI_1_13"),
|
||||||
|
Messages.getString("NGram.KANJI_1_14"),
|
||||||
|
Messages.getString("NGram.KANJI_1_16"),
|
||||||
|
Messages.getString("NGram.KANJI_1_18"),
|
||||||
|
Messages.getString("NGram.KANJI_1_22"),
|
||||||
|
Messages.getString("NGram.KANJI_1_27"),
|
||||||
|
Messages.getString("NGram.KANJI_1_29"),
|
||||||
|
Messages.getString("NGram.KANJI_1_31"),
|
||||||
|
Messages.getString("NGram.KANJI_1_35"),
|
||||||
|
Messages.getString("NGram.KANJI_2_0"),
|
||||||
|
Messages.getString("NGram.KANJI_2_1"),
|
||||||
|
Messages.getString("NGram.KANJI_2_4"),
|
||||||
|
Messages.getString("NGram.KANJI_2_9"),
|
||||||
|
Messages.getString("NGram.KANJI_2_10"),
|
||||||
|
Messages.getString("NGram.KANJI_2_11"),
|
||||||
|
Messages.getString("NGram.KANJI_2_12"),
|
||||||
|
Messages.getString("NGram.KANJI_2_13"),
|
||||||
|
Messages.getString("NGram.KANJI_2_15"),
|
||||||
|
Messages.getString("NGram.KANJI_2_16"),
|
||||||
|
Messages.getString("NGram.KANJI_2_18"),
|
||||||
|
Messages.getString("NGram.KANJI_2_21"),
|
||||||
|
Messages.getString("NGram.KANJI_2_22"),
|
||||||
|
Messages.getString("NGram.KANJI_2_23"),
|
||||||
|
Messages.getString("NGram.KANJI_2_28"),
|
||||||
|
Messages.getString("NGram.KANJI_2_29"),
|
||||||
|
Messages.getString("NGram.KANJI_2_30"),
|
||||||
|
Messages.getString("NGram.KANJI_2_31"),
|
||||||
|
Messages.getString("NGram.KANJI_2_32"),
|
||||||
|
Messages.getString("NGram.KANJI_2_35"),
|
||||||
|
Messages.getString("NGram.KANJI_2_36"),
|
||||||
|
Messages.getString("NGram.KANJI_2_37"),
|
||||||
|
Messages.getString("NGram.KANJI_2_38"),
|
||||||
|
Messages.getString("NGram.KANJI_3_1"),
|
||||||
|
Messages.getString("NGram.KANJI_3_2"),
|
||||||
|
Messages.getString("NGram.KANJI_3_3"),
|
||||||
|
Messages.getString("NGram.KANJI_3_4"),
|
||||||
|
Messages.getString("NGram.KANJI_3_5"),
|
||||||
|
Messages.getString("NGram.KANJI_3_8"),
|
||||||
|
Messages.getString("NGram.KANJI_3_9"),
|
||||||
|
Messages.getString("NGram.KANJI_3_11"),
|
||||||
|
Messages.getString("NGram.KANJI_3_12"),
|
||||||
|
Messages.getString("NGram.KANJI_3_13"),
|
||||||
|
Messages.getString("NGram.KANJI_3_15"),
|
||||||
|
Messages.getString("NGram.KANJI_3_16"),
|
||||||
|
Messages.getString("NGram.KANJI_3_18"),
|
||||||
|
Messages.getString("NGram.KANJI_3_19"),
|
||||||
|
Messages.getString("NGram.KANJI_3_22"),
|
||||||
|
Messages.getString("NGram.KANJI_3_23"),
|
||||||
|
Messages.getString("NGram.KANJI_3_27"),
|
||||||
|
Messages.getString("NGram.KANJI_3_29"),
|
||||||
|
Messages.getString("NGram.KANJI_3_30"),
|
||||||
|
Messages.getString("NGram.KANJI_3_31"),
|
||||||
|
Messages.getString("NGram.KANJI_3_32"),
|
||||||
|
Messages.getString("NGram.KANJI_3_35"),
|
||||||
|
Messages.getString("NGram.KANJI_3_36"),
|
||||||
|
Messages.getString("NGram.KANJI_3_37"),
|
||||||
|
Messages.getString("NGram.KANJI_3_38"),
|
||||||
|
Messages.getString("NGram.KANJI_4_0"),
|
||||||
|
Messages.getString("NGram.KANJI_4_9"),
|
||||||
|
Messages.getString("NGram.KANJI_4_10"),
|
||||||
|
Messages.getString("NGram.KANJI_4_16"),
|
||||||
|
Messages.getString("NGram.KANJI_4_17"),
|
||||||
|
Messages.getString("NGram.KANJI_4_18"),
|
||||||
|
Messages.getString("NGram.KANJI_4_22"),
|
||||||
|
Messages.getString("NGram.KANJI_4_24"),
|
||||||
|
Messages.getString("NGram.KANJI_4_28"),
|
||||||
|
Messages.getString("NGram.KANJI_4_34"),
|
||||||
|
Messages.getString("NGram.KANJI_4_39"),
|
||||||
|
Messages.getString("NGram.KANJI_5_10"),
|
||||||
|
Messages.getString("NGram.KANJI_5_11"),
|
||||||
|
Messages.getString("NGram.KANJI_5_12"),
|
||||||
|
Messages.getString("NGram.KANJI_5_13"),
|
||||||
|
Messages.getString("NGram.KANJI_5_14"),
|
||||||
|
Messages.getString("NGram.KANJI_5_18"),
|
||||||
|
Messages.getString("NGram.KANJI_5_26"),
|
||||||
|
Messages.getString("NGram.KANJI_5_29"),
|
||||||
|
Messages.getString("NGram.KANJI_5_34"),
|
||||||
|
Messages.getString("NGram.KANJI_5_39"),
|
||||||
|
Messages.getString("NGram.KANJI_6_0"),
|
||||||
|
Messages.getString("NGram.KANJI_6_3"),
|
||||||
|
Messages.getString("NGram.KANJI_6_9"),
|
||||||
|
Messages.getString("NGram.KANJI_6_10"),
|
||||||
|
Messages.getString("NGram.KANJI_6_11"),
|
||||||
|
Messages.getString("NGram.KANJI_6_12"),
|
||||||
|
Messages.getString("NGram.KANJI_6_16"),
|
||||||
|
Messages.getString("NGram.KANJI_6_18"),
|
||||||
|
Messages.getString("NGram.KANJI_6_20"),
|
||||||
|
Messages.getString("NGram.KANJI_6_21"),
|
||||||
|
Messages.getString("NGram.KANJI_6_22"),
|
||||||
|
Messages.getString("NGram.KANJI_6_23"),
|
||||||
|
Messages.getString("NGram.KANJI_6_25"),
|
||||||
|
Messages.getString("NGram.KANJI_6_28"),
|
||||||
|
Messages.getString("NGram.KANJI_6_29"),
|
||||||
|
Messages.getString("NGram.KANJI_6_30"),
|
||||||
|
Messages.getString("NGram.KANJI_6_32"),
|
||||||
|
Messages.getString("NGram.KANJI_6_34"),
|
||||||
|
Messages.getString("NGram.KANJI_6_35"),
|
||||||
|
Messages.getString("NGram.KANJI_6_37"),
|
||||||
|
Messages.getString("NGram.KANJI_6_39"),
|
||||||
|
Messages.getString("NGram.KANJI_7_0"),
|
||||||
|
Messages.getString("NGram.KANJI_7_3"),
|
||||||
|
Messages.getString("NGram.KANJI_7_6"),
|
||||||
|
Messages.getString("NGram.KANJI_7_7"),
|
||||||
|
Messages.getString("NGram.KANJI_7_9"),
|
||||||
|
Messages.getString("NGram.KANJI_7_11"),
|
||||||
|
Messages.getString("NGram.KANJI_7_12"),
|
||||||
|
Messages.getString("NGram.KANJI_7_13"),
|
||||||
|
Messages.getString("NGram.KANJI_7_16"),
|
||||||
|
Messages.getString("NGram.KANJI_7_18"),
|
||||||
|
Messages.getString("NGram.KANJI_7_19"),
|
||||||
|
Messages.getString("NGram.KANJI_7_20"),
|
||||||
|
Messages.getString("NGram.KANJI_7_21"),
|
||||||
|
Messages.getString("NGram.KANJI_7_23"),
|
||||||
|
Messages.getString("NGram.KANJI_7_25"),
|
||||||
|
Messages.getString("NGram.KANJI_7_28"),
|
||||||
|
Messages.getString("NGram.KANJI_7_29"),
|
||||||
|
Messages.getString("NGram.KANJI_7_32"),
|
||||||
|
Messages.getString("NGram.KANJI_7_33"),
|
||||||
|
Messages.getString("NGram.KANJI_7_35"),
|
||||||
|
Messages.getString("NGram.KANJI_7_37"),
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
};
|
||||||
|
|
||||||
|
private static readonly Dictionary<char, char> _cjkMap;
|
||||||
|
|
||||||
|
private StringBuilder _grams;
|
||||||
|
private bool _capitalword;
|
||||||
|
|
||||||
|
#region Constructor(s)
|
||||||
|
|
||||||
|
/// <summary>
/// Fills <c>_cjkMap</c>: every character of each string in <c>CjkClass</c> is mapped to
/// that string's first character.
/// </summary>
static NGram()
{
    _cjkMap = new Dictionary<char, char>();

    foreach (string cjk_list in CjkClass)
    {
        // An empty entry has no first character to act as representative.
        if (string.IsNullOrEmpty(cjk_list))
        {
            continue;
        }

        char representative = cjk_list[0];

        foreach (char member in cjk_list)
        {
            // Indexer assignment instead of Add: a character that occurs twice
            // (e.g. in the "!key!" fallback returned for a missing message) must
            // not throw and take the whole type initializer down with it.
            _cjkMap[member] = representative;
        }
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Creates an n-gram window that contains only the leading blank and has the
/// capital-word flag cleared.
/// </summary>
public NGram()
{
    _capitalword = false;
    _grams = new StringBuilder(" ");
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
|
||||||
|
#region Public methods
|
||||||
|
|
||||||
|
/// <summary>
/// Folds a character into the reduced alphabet used when building n-grams.
/// </summary>
/// <param name="ch">The character to normalize.</param>
/// <returns>
/// A blank for characters that are discarded, a fixed representative character
/// for scripts or classes that are collapsed, and <paramref name="ch"/> itself in
/// every other case (including when its Unicode block is unknown).
/// </returns>
public static char Normalize(char ch)
{
    UnicodeBlock? unicodeBlock = ch.GetUnicodeBlock();

    if (!unicodeBlock.HasValue)
    {
        return ch;
    }

    switch (unicodeBlock.Value)
    {
        case UnicodeBlock.BasicLatin:
            {
                // Everything outside A-Z / a-z is replaced by a blank.
                if (ch < 'A' || (ch < 'a' && ch > 'Z') || ch > 'z')
                {
                    return ' ';
                }

                break;
            }

        case UnicodeBlock.Latin1Supplement:
            {
                // Characters listed in Latin1Excluded are replaced by a blank.
                if (Latin1Excluded.IndexOf(ch) >= 0)
                {
                    return ' ';
                }

                break;
            }

        case UnicodeBlock.GeneralPunctuation:
            {
                return ' ';
            }

        case UnicodeBlock.Arabic:
            {
                // U+06CC (Farsi yeh) is unified with U+064A (Arabic yeh).
                if (ch == '\u06cc')
                {
                    return '\u064a';
                }

                break;
            }

        case UnicodeBlock.LatinExtendedAdditional:
            {
                // Code points from U+1EA0 upwards collapse to the single
                // representative U+1EC3.
                if (ch >= '\u1ea0')
                {
                    return '\u1ec3';
                }

                break;
            }

        case UnicodeBlock.Hiragana:
            {
                return '\u3042';
            }

        case UnicodeBlock.Katakana:
            {
                return '\u30a2';
            }

        case UnicodeBlock.Bopomofo:
        case UnicodeBlock.BopomofoExtended:
            {
                return '\u3105';
            }

        case UnicodeBlock.CjkUnifiedIdeographs:
            {
                // Single lookup instead of ContainsKey followed by the indexer.
                char representative;

                if (_cjkMap.TryGetValue(ch, out representative))
                {
                    return representative;
                }

                break;
            }

        case UnicodeBlock.HangulSyllables:
            {
                return '\uac00';
            }
    }

    return ch;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Pushes one character into the sliding n-gram window.
/// </summary>
/// <param name="ch">Raw character; it is normalized before it is stored.</param>
public void AddChar(char ch)
{
    char normalized = Normalize(ch);
    char previous = _grams[_grams.Length - 1];

    if (previous == ' ')
    {
        // The window ended on a blank: start over with a fresh window.
        _grams = new StringBuilder(" ");
        _capitalword = false;

        if (normalized == ' ')
        {
            return;
        }
    }
    else if (_grams.Length >= GramsCount)
    {
        // Window is full: drop the oldest character to make room.
        _grams.Remove(0, 1);
    }

    _grams.Append(normalized);

    // Two upper-case characters in a row set the capital-word flag; any
    // character that is not upper-case clears it.
    if (!char.IsUpper(normalized))
    {
        _capitalword = false;
    }
    else if (char.IsUpper(previous))
    {
        _capitalword = true;
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Returns the n-gram formed by the last <paramref name="n"/> characters of the
/// current window.
/// </summary>
/// <param name="n">Requested n-gram length; only 1, 2 and 3 are accepted.</param>
/// <returns>
/// The n-gram, or <c>null</c> when the capital-word flag is set, when
/// <paramref name="n"/> is out of range or larger than the window, or when a
/// 1-gram would consist of a blank only.
/// </returns>
public string Get(int n)
{
    if (_capitalword)
    {
        return null;
    }

    int len = _grams.Length;

    if (n < 1 || n > 3 || len < n)
    {
        return null;
    }

    if (n == 1)
    {
        char ch = _grams[len - 1];

        return ch == ' ' ? null : ch.ToString();
    }

    // Copy only the trailing n characters instead of materialising the whole
    // buffer and slicing it afterwards.
    // NOTE(review): assumes the previous SubSequence(start, end) helper treated
    // 'end' as exclusive (Java-style) - confirm against StringExtensions.
    return _grams.ToString(len - n, n);
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,76 @@
|
||||||
|
using System.Text;
|
||||||
|
|
||||||
|
namespace NLangDetect.Core.Utils
|
||||||
|
{
|
||||||
|
/// <summary>
/// Collects the text that belongs to one target tag and, when the tag is
/// closed, feeds the n-grams of that text into a language profile.
/// </summary>
public class TagExtractor
{
    // TODO IMM HI: do they really need to be internal?

    /// <summary>Name of the tag whose content is collected.</summary>
    internal string Target;

    /// <summary>Collected text must be longer than this to be added to a profile.</summary>
    internal int Threshold;

    /// <summary>Buffer holding the text collected for the current tag.</summary>
    internal StringBuilder StringBuilder;

    /// <summary>Name of the tag currently being read, or <c>null</c> when none is set.</summary>
    internal string Tag;

    #region Constructor(s)

    /// <summary>
    /// Creates an extractor for the given tag.
    /// </summary>
    /// <param name="tag">Name of the tag to collect text from.</param>
    /// <param name="threshold">Length the collected text has to exceed to be counted.</param>
    public TagExtractor(string tag, int threshold)
    {
        Target = tag;
        Threshold = threshold;
        Count = 0;
        Clear();
    }

    #endregion

    #region Public methods

    /// <summary>
    /// Discards the collected text and forgets the current tag.
    /// </summary>
    public void Clear()
    {
        StringBuilder = new StringBuilder();
        Tag = null;
    }

    /// <summary>
    /// Records the name of the tag that is currently being read.
    /// </summary>
    /// <param name="tag">Name of the current tag.</param>
    public void SetTag(string tag)
    {
        Tag = tag;
    }

    /// <summary>
    /// Appends text to the buffer, but only while the current tag is the target tag.
    /// </summary>
    /// <param name="line">Text to append; <c>null</c> is ignored.</param>
    public void Add(string line)
    {
        if (Tag == Target && line != null)
        {
            StringBuilder.Append(line);
        }
    }

    /// <summary>
    /// Finishes the current tag. When it is the target tag and the collected text
    /// is longer than <see cref="Threshold"/>, every 1..<c>NGram.GramsCount</c>-gram
    /// of the text is added to <paramref name="profile"/> and <see cref="Count"/> is
    /// incremented. The buffer and current tag are cleared in every case.
    /// </summary>
    /// <param name="profile">Profile receiving the n-grams; nothing is added when <c>null</c>.</param>
    public void CloseTag(LangProfile profile)
    {
        if (profile != null && Tag == Target && StringBuilder.Length > Threshold)
        {
            // Take one string snapshot of the buffer: StringBuilder's indexer gets
            // slower as the builder grows in chunks, which makes a per-character
            // indexed loop needlessly expensive.
            string text = StringBuilder.ToString();
            var gram = new NGram();

            foreach (char ch in text)
            {
                gram.AddChar(ch);

                for (int n = 1; n <= NGram.GramsCount; n++)
                {
                    profile.Add(gram.Get(n));
                }
            }

            Count++;
        }

        Clear();
    }

    #endregion

    #region Properties

    /// <summary>
    /// Number of closed tags whose text passed the threshold and was added to a profile.
    /// </summary>
    public int Count { get; private set; }

    #endregion
}
|
||||||
|
}
|
|
@ -0,0 +1,128 @@
|
||||||
|
NGram.CJK_KANJI_EXCLUDE=\u0020\uFF08\uFF09
|
||||||
|
NGram.LATIN1_EXCLUDE=\u00A0\u00AB\u00B0\u00BB
|
||||||
|
NGram.KANJI_1_0=\u4F7C\u6934
|
||||||
|
NGram.KANJI_1_2=\u88CF\u95B2
|
||||||
|
NGram.KANJI_1_4=\u7027\u7DCB
|
||||||
|
NGram.KANJI_1_8=\u4E80\u4E9C\u4EEE\u5263\u5264\u5270\u52C5\u52E7\u52F2\u53B3\u5449\u58CA\u58CC\u5968\u59C9\u59EB\u5D8B\u5DE3\u5E30\u6075\u622F\u623B\u6255\u629C\u629E\u62DD\u62E1\u633F\u635C\u63FA\u6442\u6589\u658E\u6669\u66A6\u66FD\u6804\u685C\u6B69\u6B6F\u6BBB\u6C37\u6C5A\u6D44\u6E09\u6E0B\u6E13\u6EDD\u713C\u72A0\u731F\u7363\u7A32\u7A42\u7A93\u7ADC\u7C8B\u7C9B\u7DD1\u7E01\u7E04\u7E26\u7E4A\u7E4B\u7E70\u8074\u8107\u8133\u81D3\u820E\u8217\u8358\u83D3\u85AC\u8987\u899A\u8B21\u8B72\u8B83\u8CDB\u9045\u90F7\u91C8\u9271\u9283\u92AD\u9665\u967A\u96A0\u96A3\u96B7\u970A\u983C\u9854\u9855\u99C6\u9A12\u9ED9\u9F62
|
||||||
|
NGram.KANJI_1_11=\u67D8\u831C
|
||||||
|
NGram.KANJI_1_12=\u5742\u57FC\u5800
|
||||||
|
NGram.KANJI_1_13=\u4E3C\u4E98\u4FE3\u4FF5\u5072\u51A8\u53A9\u5451\u546A\u5504\u5516\u55A9\u55B0\u5618\u5642\u565B\u567A\u56A2\u57F4\u5840\u5841\u58F1\u59F6\u5A2F\u5B22\u5B8D\u5DCC\u5EFB\u5F10\u60A9\u60E3\u61D0\u62F6\u63B4\u63BB\u63C3\u6681\u685F\u6955\u6962\u696F\u698A\u698E\u69FB\u6A2B\u6A7F\u6B53\u6BD8\u6D99\u6E07\u7460\u7473\u7560\u7573\u758E\u7690\u7815\u783A\u7962\u7A4F\u7A63\u7AEA\u7BED\u7CA7\u7D18\u7D3A\u7E4D\u8061\u8218\u8276\u82C5\u8597\u85AB\u86CD\u874B\u88FE\u8ACF\u8B90\u8D0B\u8FBF\u9013\u9061\u914E\u9154\u918D\u9190\u91A4\u91B8\u9262\u929A\u92ED\u92F3\u932C\u96EB\u96F0\u976D\u97EE\u981A\u99C4\u9A28\u9AC4\u9B8E\u9C10\u9D0E\u9D5C\u9D8F\u9E78\u9EB9\u9EBA\u9EBF
|
||||||
|
NGram.KANJI_1_14=\u5F66\u7984\u7985
|
||||||
|
NGram.KANJI_1_16=\u5861\u7B25\u844E\u9419\u9D07
|
||||||
|
NGram.KANJI_1_18=\u5039\u514E\u51E7\u51EA\u5301\u5302\u5859\u58F7\u59AC\u5C2D\u5CA8\u5EFC\u6357\u64B9\u67CA\u6802\u6834\u68BC\u6900\u6919\u691B\u69D9\u6AE8\u6D9C\u6E8C\u6F09\u6F45\u701E\u7026\u7114\u72DB\u7577\u75E9\u783F\u7895\u7A50\u7AC3\u7B48\u7B86\u7BAA\u7C7E\u7C82\u7C8D\u7CCE\u7D2C\u7F6B\u7FEB\u8557\u85AE\u86CE\u877F\u8997\u8ACC\u8CB0\u8CCE\u8FE9\u9197\u920E\u9266\u927E\u92F2\u9306\u9453\u9784\u982C\u9834\u99C8\u9BF5\u9C2F\u9D2C
|
||||||
|
NGram.KANJI_1_22=\u6762\u6A17\u887F
|
||||||
|
NGram.KANJI_1_27=\u4E21\u4E57\u4ECF\u4F1D\u4FA1\u4FF3\u5024\u50CD\u5150\u5186\u51E6\u52B4\u52B9\u5358\u53CE\u55B6\u56E3\u56F2\u56F3\u570F\u5727\u5869\u5897\u58F2\u5909\u5B9F\u5BDB\u5BFE\u5C02\u5DFB\u5E2F\u5E81\u5E83\u5EC3\u5F3E\u5F93\u5FB3\u5FB4\u5FDC\u60AA\u6226\u6238\u6271\u62E0\u6319\u63B2\u6483\u64AE\u67A0\u67FB\u691C\u697D\u69D8\u6A29\u6B73\u6B74\u6BCE\u6C17\u6CA2\u6D5C\u6E08\u6E80\u702C\u7523\u767A\u770C\u7D4C\u7D75\u7D76\u7D99\u7D9A\u7DCF\u8535\u8846\u89A7\u89B3\u8A33\u8AAC\u8AAD\u8C4A\u8EE2\u8EFD\u8FBA\u8FBC\u9244\u9332\u95A2\u95D8\u96D1\u99C5\u9A13\u9ED2
|
||||||
|
NGram.KANJI_1_29=\u4F0E\u4FFA\u5036\u53E1\u54B2\u5506\u583A\u5C3B\u5CAC\u5CE0\u5CEF\u6803\u68B6\u6A0B\u6A8E\u73C2\u7551\u7826\u7881\u79B0\u7B39\u8429\u8599\u8FBB\u9162\u95C7\u9688\u96BC\u9AEA\u9DF2
|
||||||
|
NGram.KANJI_1_31=\u5553\u938C
|
||||||
|
NGram.KANJI_1_35=\u51B4\u564C\u57DC\u5B2C\u6822\u685D\u690B\u6973\u6C93\u7511\u7887\u7A17\u83D6\u847A\u8494\u8526\u854E\u85C1\u86F8\u88B4\u93A7\u9B92\u9C39\u9C48\u9C52
|
||||||
|
NGram.KANJI_2_0=\u4E2B\u4EC3\u4F09\u4F57\u4F6F\u4F70\u4FD1\u4FDA\u500C\u5043\u516E\u5189\u5241\u530D\u5310\u5412\u54AB\u54AF\u5514\u5556\u55B1\u561F\u573B\u586D\u587D\u58C5\u58D1\u5914\u5A62\u5A6A\u5AE6\u5B40\u5B5B\u5B70\u5BB8\u5CD2\u5D01\u5D34\u5E11\u5EA0\u5F0B\u5F2D\u5F87\u607F\u621B\u6221\u6289\u63A3\u6452\u646D\u64D8\u652B\u6600\u6631\u6641\u66F7\u6773\u67B8\u67DD\u67DE\u6829\u68FB\u69AD\u6A47\u6C10\u6C68\u6C74\u6C85\u6CD3\u6D31\u6D93\u6D94\u6DB8\u6DBF\u6DC5\u6E6E\u6EA7\u6EB4\u6EC2\u6F2A\u6F2F\u6FB9\u6FC2\u6FDB\u6FEE\u70AF\u70FD\u7166\u726F\u729B\u739F\u73DE\u740A\u746D\u749C\u749F\u74E0\u759D\u75A3\u75CD\u75DE\u7600\u7620\u7688\u7738\u7762\u776B\u777D\u77E3\u781D\u7837\u78A3\u7946\u7B60\u7F44\u7F54\u7F5F\u7FAF\u8026\u807F\u80C4\u80DB\u80ED\u81E7\u824B\u82B7\u82E3\u8392\u846D\u84D3\u8548\u85B9\u86DE\u873F\u8753\u8782\u87AB\u87B3\u87D1\u87E0\u87FE\u8821\u88D8\u88E8\u8913\u891A\u892B\u8983\u8C3F\u8C49\u8C82\u8D6D\u8DE4\u8E1D\u8E1E\u8E7C\u8FE5\u8FE8\u9005\u9035\u9050\u9082\u9083\u9095\u90E2\u911E\u91AE\u91B4\u93D6\u9621\u968D\u96B9\u96D2\u9711\u9713\u973E\u9AB0\u9AB7\u9AE6\u9B03\u9B23\u9EDC\u9EEF
|
||||||
|
NGram.KANJI_2_1=\u4E82\u4F48\u4F54\u50F9\u5167\u528D\u52DE\u532F\u537B\u53C3\u5433\u555F\u55AE\u56B4\u570D\u5716\u58D3\u58DE\u5920\u5967\u5A1B\u5BEB\u5BEC\u5C08\u5C0D\u5C46\u5C6C\u5CFD\u5E36\u5E6B\u5EC8\u5EF3\u5F48\u5F91\u5F9E\u5FB5\u6046\u60E1\u61F7\u6232\u6236\u64C7\u64CA\u64D4\u64DA\u64F4\u651D\u6578\u65B7\u6649\u6A13\u6A23\u6A6B\u6A94\u6AA2\u6B0A\u6B50\u6B61\u6B72\u6B77\u6B78\u6C92\u6EAB\u6EFF\u6FD5\u6FDF\u71DF\u722D\u72C0\u734E\u737B\u746A\u7522\u773E\u78BC\u7A69\u7C3D\u7CB5\u7D55\u7D72\u7DA0\u7DAB\u7DE3\u7E5E\u7E6A\u7E7C\u7E8C\u8072\u807D\u8085\u812B\u8166\u8173\u81D8\u8209\u820A\u8332\u838A\u840A\u85E5\u860B\u8655\u865B\u88DD\u89BA\u89BD\u89C0\u8AAA\u8B6F\u8B7D\u8B8A\u8B93\u8C50\u8CF4\u8E64\u8F15\u8F49\u8FA6\u8FAD\u9109\u9130\u91AB\u91CB\u92B7\u9304\u9322\u95CA\u96A8\u96AA\u96B1\u96B8\u96D6\u96D9\u96DC\u9748\u975C\u986F\u9918\u99DB\u9A57\u9B25\u9EA5\u9EC3\u9EDE\u9F52
|
||||||
|
NGram.KANJI_2_4=\u514C\u51AA\u5614\u56AE\u56C2\u582F\u58FA\u5B0C\u5D11\u5DD2\u5DD6\u5E40\u5E5F\u5EEC\u6137\u6417\u6488\u64F2\u652A\u6582\u6689\u689F\u68D7\u69D3\u6A97\u6AB8\u6ABB\u6AC3\u6ADA\u6B7F\u6BB2\u6EA5\u6EC4\u6EF2\u7009\u701D\u7028\u703E\u7165\u71BE\u721B\u7463\u7464\u7469\u7515\u7526\u75FA\u7621\u779E\u79B1\u7A1F\u7AC4\u7AC7\u7B8F\u7BE9\u7D2E\u7D68\u7D8F\u7DB8\u7DBA\u7E46\u7E79\u7F4C\u7F88\u8070\u8073\u8076\u81BE\u82BB\u83A2\u858A\u8591\u861A\u8778\u87EC\u8805\u880D\u893B\u8A1B\u8A25\u8A36\u8A85\u8AA6\u8B17\u8B28\u8CB6\u8CE4\u8D16\u8D1B\u8ECB\u9112\u9214\u9249\u93AC\u9594\u9598\u95BB\u95D5\u965E\u96B4\u97DC\u9821\u9824\u9921\u9952\u9A55\u9A5B\u9B1A\u9C13\u9D09\u9DAF\u9E1A\u9E75\u9F67
|
||||||
|
NGram.KANJI_2_9=\u4E9F\u4F6C\u4FDE\u4FFE\u5029\u5140\u51A2\u5345\u539D\u53FB\u54C7\u5599\u560E\u561B\u563B\u566C\u5676\u5729\u574D\u57E4\u595A\u598D\u5A1F\u5A25\u5A77\u5AB2\u5AD6\u5BF0\u5C2C\u5CEA\u5E37\u5F08\u6059\u606A\u6096\u609A\u62A8\u6555\u6556\u66E6\u675E\u68E3\u69BB\u6BCB\u6BD3\u6C1F\u6C26\u6C81\u6DC4\u6DDE\u6E32\u6E44\u6E4D\u6F33\u6F7C\u6FA7\u701A\u701B\u715C\u741B\u7428\u7480\u74A8\u7504\u752C\u768B\u76CE\u78CA\u78FA\u79BA\u7C27\u8046\u81FB\u8331\u8393\u83C1\u8403\u8438\u843C\u8446\u85B0\u87D2\u8862\u8DC6\u9074\u9131\u9672\u96EF\u9704\u9706\u977C\u9ABC\u9E92\u9ECF
|
||||||
|
NGram.KANJI_2_10=\u51BD\u5704\u7350\u73A5
|
||||||
|
NGram.KANJI_2_11=\u4E15\u4EA2\u4F5A\u50D6\u5349\u53DF\u5484\u5958\u5B34\u5B5A\u5C91\u5E1B\u5F77\u61CB\u61FF\u620C\u620D\u622E\u6248\u6538\u660A\u664F\u678B\u67E9\u69B7\u69C3\u6CB1\u6CD7\u6D5A\u6DAA\u6DC7\u7099\u71EE\u7325\u7425\u7455\u747E\u749E\u75B5\u7678\u7693\u76C2\u77B0\u77BF\u78CB\u7957\u795A\u797A\u7A79\u7B08\u7B75\u7BB4\u7F9A\u7FB2\u7FDF\u80E5\u81BA\u8340\u837C\u8398\u8559\u85A8\u86DF\u8734\u8882\u88F4\u8936\u900D\u907D\u9642\u96C9\u9AFB\u9E9D\u9EBE
|
||||||
|
NGram.KANJI_2_12=\u5F57\u7940
|
||||||
|
NGram.KANJI_2_13=\u5191\u7791\u792C\u7D46
|
||||||
|
NGram.KANJI_2_15=\u5713\u58FD\u5D17\u5D19\u5DBC\u5F4C\u6191\u64A5\u687F\u69AE\u6AFB\u6EEC\u6F3F\u6FE4\u6FF1\u6FFE\u700B\u74CA\u76E1\u76E7\u7926\u792B\u79AE\u7AA9\u7C43\u7C4C\u7C64\u7DBD\u81A0\u856D\u8594\u8606\u8A62\u8AF7\u8CC8\u8CE3\u8D99\u8F1B\u8F3B\u9059\u9127\u9264\u947D\u95A9\u97CB\u980C\u9838\u9846\u99AE\u9A19\u9B06\u9B91\u9F4A\u9F4B
|
||||||
|
NGram.KANJI_2_16=\u4E69\u4EC4\u4EDF\u4EF3\u4F0B\u4F5E\u5000\u5028\u50E5\u513B\u5157\u51DC\u52D7\u530F\u5379\u53F5\u5471\u5477\u5555\u555C\u557B\u5594\u55B2\u55C9\u560D\u5616\u562E\u5630\u5653\u5657\u566F\u56A8\u56B6\u5820\u5880\u58CE\u58D9\u5950\u5969\u596D\u599E\u59B3\u59CD\u59D2\u5A40\u5AA7\u5ABC\u5AD7\u5AD8\u5B0B\u5B24\u5B38\u5B53\u5C5C\u5D06\u5D47\u5D94\u5D9D\u5E57\u5EC4\u5F46\u5FAC\u60BD\u60D8\u6123\u615D\u615F\u6175\u618A\u61AB\u61E3\u623E\u6308\u636B\u645F\u6519\u6595\u6698\u66B8\u67D9\u6840\u695D\u696E\u6979\u69C1\u69E8\u6AEC\u6AFA\u6B5F\u6CAC\u6CE0\u6CEF\u6D0C\u6D36\u6DD2\u6DD9\u6DE6\u6DEC\u6E5F\u6FA0\u6FEC\u7156\u71C4\u71DC\u71EC\u71FC\u720D\u7230\u7292\u7296\u72A2\u72CE\u7357\u737A\u7380\u7386\u73A8\u73EE\u743F\u74A6\u74CF\u74D4\u74DA\u755A\u75A5\u75B3\u75C2\u75E0\u75F1\u75FF\u7601\u7609\u7646\u7658\u769A\u76B0\u774F\u775C\u778B\u77BD\u77C7\u7843\u787F\u78F4\u79C8\u7A88\u7A95\u7AFD\u7B1E\u7B67\u7B9D\u7BCC\u7C0D\u7C11\u7C37\u7C40\u7C6E\u7CB3\u7CBD\u7D09\u7D31\u7D40\u7D5B\u7D70\u7D91\u7D9E\u7DB0\u7DD9\u7DF9\u7E08\u7E11\u7E1D\u7E35\u7E52\u7FB6\u7FBF\u7FEE\u8012\u801C\u8028\u8052\u8123\u8188\u81C3\u81DA\u81FE\u8210\u82BE\u83A0\u83D4\u8407\u8435\u8477\u849E\u84C6\u84CA\u85F9\u867A\u86B5\u86B6\u86C4\u8706\u8707\u870A\u8768\u87BB\u8831\u8839\u8879\u8921\u8938\u8964\u89A6\u89AC\u8A10\u8A3E\u8AC2\u8ADB\u8AF3\u8B2B\u8B41\u8B4E\u8B5F\u8B6B\u8B92\u8C55\u8C62\u8C73\u8C8A\u8C8D\u8CB2\u8CB3\u8CD2\u8CE1\u8CFB\u8D0D\u8E34\u8E7A\u8E8A\u8ED4\u8EFE\u8F0A\u8F1C\u8F1E\u8F26\u8FAE\u9088\u90C3\u90FE\u9134\u9148\u91D9\u91E9\u9238\u9239\u923D\u924D\u925A\u9296\u92AC\u92BB\u9315\u9319\u931A\u9321\u9370\u9394\u93A2\u93D8\u93E4\u943A\u9477\u9582\u958E\u95A1\u95C8\u95CC\u95D4\u9658\u966C\u970F\u973D\u9744\u975B\u9766\u97A3\u97A6\u97C1\u97C6\u980A\u9837\u9853\u9870\u98AF\u98B3\u98BA\u98E9\u98ED\u9912\u991B\u991E\u993D\u993F\u99D1\u99DF\u9A01\u9A3E\u9A43\u9A4D\u9ACF\u9AE1\u9B22\u9B58\u9C25\u9C3E\u9C54\u9C56\u9D15\u9D23\u9D89\u9DC2\u9DD3\u9E82\u9E8B\u9EA9\u9EE0\u9EF7\u9F07\u9F2F\u9F34\u9
F3E\u9F5F\u9F6C
|
||||||
|
NGram.KANJI_2_18=\u5155\u520E\u55DF\u56C0\u56C1\u5793\u5FD6\u5FF8\u6029\u60FA\u613E\u6147\u615A\u62C8\u6384\u6883\u6894\u68F9\u6AA3\u6AAE\u6AC2\u6E63\u7032\u70A4\u7146\u71FB\u7228\u72F7\u7370\u7441\u74BF\u75B8\u75E3\u7622\u76CD\u7768\u79E3\u7A60\u7B6E\u7BC1\u7C5F\u7D06\u7E2F\u7E39\u8146\u81CF\u8703\u8729\u8737\u87EF\u88D2\u8A22\u8AC4\u8AF6\u8E59\u8F33\u8F42\u9169\u91B1\u9278\u93C3\u93DD\u9460\u946A\u9785\u9AD1\u9B4D\u9B4E\u9C31\u9D12\u9ECC
|
||||||
|
NGram.KANJI_2_21=\u502A\u544E\u59AE\u59EC\u5D1B\u66A8\u6BD7\u6C76\u6E1D\u70EF\u742A\u7459\u7FE1\u82EF\u8343\u85C9\u8A79\u90DD
|
||||||
|
NGram.KANJI_2_22=\u4EDE\u4F7B\u504C\u50EE\u52E3\u52F0\u536E\u54A9\u54BB\u54BF\u54C2\u54E6\u550F\u556A\u55E8\u564E\u5664\u5671\u568F\u56DD\u572F\u57A0\u5809\u5924\u59A3\u59A4\u59E3\u5A13\u5A23\u5B51\u5B73\u5C50\u5C8C\u6035\u60C6\u6106\u6215\u62CE\u62FD\u64ED\u6549\u6554\u655D\u659B\u65CE\u65D6\u6615\u6624\u665E\u6677\u669D\u66E9\u6772\u677C\u696B\u6A84\u6AA0\u6BFD\u6C16\u6C86\u6C94\u6CD6\u6D2E\u6D39\u6F78\u6FB6\u705E\u70CA\u7168\u723B\u7256\u7284\u73B3\u740D\u742F\u7498\u74A9\u752D\u75F3\u7634\u768E\u76B4\u76E5\u77A0\u77DC\u781F\u782D\u7AA0\u7BFE\u7FF1\u80AB\u8174\u81EC\u8202\u8222\u8228\u82DC\u8306\u83FD\u8469\u84FF\u859C\u8617\u86B1\u8722\u8C89\u8D67\u8DCE\u8E49\u8E76\u8E87\u8FE2\u8FE4\u8FF8\u9016\u905B\u9174\u982B\u98E7\u9955\u9B32
|
||||||
|
NGram.KANJI_2_23=\u4F8F\u5055\u524C\u548E\u5583\u594E\u5CB7\u5ED6\u5F5D\u6021\u66B9\u66F0\u6C55\u6C7E\u6C82\u6E2D\u6EC7\u6ED5\u70B3\u71B9\u72C4\u73C0\u7426\u745C\u748B\u7696\u777F\u79A7\u79B9\u7F8C\u8153\u8339\u8386\u8725\u90B5\u9102\u962E\u9716\u97F6
|
||||||
|
NGram.KANJI_2_28=\u5733\u57D4\u838E\u8FEA
|
||||||
|
NGram.KANJI_2_29=\u50ED\u5F29\u62EE\u6A9C\u7BC6\u80F1\u8129\u8171\u822B\u8AEB
|
||||||
|
NGram.KANJI_2_30=\u4EB3\u4F15\u4FB7\u5006\u509A\u50A2\u5102\u5109\u5115\u5137\u5138\u513C\u524B\u524E\u5277\u528A\u52E6\u52FB\u5331\u5436\u5443\u54FD\u5538\u555E\u55C6\u55C7\u5679\u5690\u5695\u56C9\u56D1\u56EA\u588A\u58E2\u5AFB\u5B2A\u5B43\u5B7F\u5BE2\u5C37\u5D27\u5D84\u5D87\u5DD4\u5EC1\u5EDD\u5F12\u5FA0\u60F1\u616B\u61F5\u61F6\u61FE\u62DA\u6371\u6399\u63C0\u6451\u647B\u6493\u64BB\u64BF\u64C4\u64F1\u64F7\u650F\u652C\u665D\u6684\u6688\u66EC\u672E\u68E7\u69A6\u69ED\u69F3\u6A01\u6AAF\u6AE5\u6BA4\u6BAE\u6BAF\u6BC6\u6C08\u6C2C\u6C59\u6D87\u6EBC\u6ECC\u6EF7\u6F6F\u6F80\u6F86\u6FD8\u6FF0\u6FFA\u7006\u7018\u7030\u7051\u7192\u71C9\u71D9\u71F4\u71FE\u7274\u7377\u74A3\u750C\u7613\u7627\u7661\u7662\u7665\u766E\u7671\u7672\u76BA\u775E\u776A\u778C\u78E7\u7955\u7A08\u7AC5\u7B4D\u7C2B\u7C6C\u7CF0\u7D02\u7D1C\u7D73\u7DA2\u7DB5\u7DDE\u7E09\u7E0A\u7E37\u7E43\u7E61\u7E7D\u7E93\u7F3D\u7FF9\u81A9\u8271\u83F8\u84C0\u8514\u85BA\u86A9\u86FB\u879E\u8814\u8836\u889E\u8932\u896A\u896F\u8993\u89B2\u8A15\u8A16\u8A1D\u8A5B\u8A6C\u8A6D\u8A7C\u8AA1\u8AA3\u8AA5\u8B0A\u8B4F\u8B59\u8B96\u8C48\u8C54\u8CBD\u8CFA\u8D13\u8E89\u8E8B\u8EAA\u8EC0\u8EDB\u8EFC\u8F12\u8F1F\u8F3E\u8F45\u8FFA\u9015\u9183\u919E\u91A3\u91D7\u91F5\u9209\u9215\u923E\u9240\u9251\u9257\u927B\u9293\u92A8\u92C5\u92C7\u92F0\u9333\u935A\u9382\u938A\u9398\u93B3\u93D7\u93DF\u93E2\u93FD\u942B\u942E\u9433\u9463\u9470\u9472\u947E\u95D0\u96CB\u97C3\u97CC\u981C\u9839\u986B\u98B6\u98EA\u9909\u991A\u9935\u993E\u9951\u99A5\u99B1\u99D9\u99DD\u99F1\u9A2B\u9A62\u9A65\u9AAF\u9AD2\u9AEF\u9B0D\u9B28\u9B77\u9BFD\u9C49\u9C5F\u9C78\u9D3F\u9D72\u9DD7\u9E1B\u9EB4\u9EF4\u9F66\u9F94
|
||||||
|
NGram.KANJI_2_31=\u5DBD\u63C6\u6E3E\u7587\u8AF1\u8B5A\u9695
|
||||||
|
NGram.KANJI_2_32=\u53A5\u589F\u5CD9\u7109\u7F79\u8006\u8654\u8944\u968B\u96CD
|
||||||
|
NGram.KANJI_2_35=\u4F47\u4F91\u4FCE\u4FDF\u527D\u535E\u55DA\u56A5\u5879\u5A11\u5B7A\u5CAB\u5CF4\u5EBE\u5F7F\u5FA8\u601B\u606B\u60B8\u610D\u6134\u619A\u61FA\u6369\u6523\u65CC\u66C4\u6727\u6968\u6A05\u6A48\u6B59\u6BEC\u6D35\u6D38\u6E19\u701F\u7064\u711C\u716C\u71A8\u71E7\u7258\u743A\u746F\u75BD\u75D9\u75F2\u7669\u766C\u76DE\u7729\u77BC\u78EC\u792A\u7A37\u7A62\u7BE6\u7C2A\u7C50\u7D07\u7DD8\u7E5A\u7F8B\u7FD5\u7FF3\u8151\u81CD\u8317\u83F4\u85EA\u85FA\u8823\u895E\u89F4\u8A0C\u8A41\u8AA8\u8ACD\u8B10\u8CC1\u8D05\u8D73\u8E4A\u8E85\u8E91\u8EFB\u8F13\u9087\u914A\u91C9\u923F\u93B0\u9403\u95A8\u95AD\u9730\u9865\u9903\u9945\u9949\u99AD\u99E2\u9A6A\u9D26\u9E1E\u9EDD\u9F2C\u9F72
|
||||||
|
NGram.KANJI_2_36=\u4E9E\u4F86\u5011\u50B3\u5152\u5169\u5340\u5718\u5B78\u5BE6\u5BF6\u5C07\u5EE3\u61C9\u6230\u6703\u689D\u6A02\u6C23\u7063\u7368\u756B\u7576\u767C\u7A31\u7D93\u7E23\u7E3D\u81FA\u8207\u842C\u85DD\u865F\u8B49\u8B80\u8CFD\u908A\u9435\u95DC\u965D\u9AD4\u9EE8
|
||||||
|
NGram.KANJI_2_37=\u5480\u5580\u5C39\u67EF\u68B5\u6D85\u8521\u90B1
|
||||||
|
NGram.KANJI_2_38=\u4E1F\u4F96\u4FE0\u50F1\u5118\u522A\u5291\u52C1\u52DB\u52F3\u52F5\u52F8\u53B2\u55CE\u562F\u580A\u5862\u58AE\u58D8\u58DF\u58E9\u58EF\u5925\u593E\u599D\u5ABD\u5C62\u5EC2\u5EDA\u5EE2\u5F4E\u5F65\u6085\u6158\u61FC\u6200\u62CB\u633E\u6416\u6436\u6490\u64CB\u64E0\u64FA\u6514\u651C\u6524\u6558\u6583\u66B1\u66C6\u66C9\u66E0\u6A11\u6A1E\u6A38\u6A62\u6AB3\u6B16\u6B98\u6BBC\u6C2B\u6DDA\u6DE8\u6DEA\u6DFA\u6EEF\u6EFE\u6F32\u6F51\u6F5B\u700F\u71D2\u7210\u7246\u7260\u72A7\u72F9\u7375\u7378\u758A\u760B\u76DC\u76EA\u77DA\u77FD\u78DA\u7919\u797F\u79AA\u7A05\u7A4C\u7ACA\u7C72\u7D81\u7DDD\u7E31\u7E69\u7E6B\u7E73\u7E96\u7E9C\u81BD\u81C9\u81DF\u8259\u8277\u8396\u83A7\u8523\u8525\u860A\u863F\u8667\u87A2\u87F2\u881F\u883B\u89F8\u8B20\u8B74\u8B9A\u8C4E\u8C6C\u8C93\u8CEC\u8D0A\u8D0F\u8D95\u8E10\u8F4E\u8FAF\u8FF4\u905E\u9072\u9081\u908F\u91AC\u91C0\u91C1\u91D0\u921E\u9223\u9245\u929C\u92B3\u92C1\u9336\u934A\u93C8\u9444\u9452\u947C\u947F\u9592\u95B1\u95C6\u95D6\u95E1\u95E2\u96DE\u9742\u978F\u984F\u9871\u98B1\u98C4\u99ED\u9A37\u9A45\u9A5F\u9AEE\u9B27\u9BCA\u9C77\u9D51\u9D5D\u9E79\u9E7C\u9E7D\u9EB5\u9EBC\u9F61\u9F63\u9F90\u9F9C
|
||||||
|
NGram.KANJI_3_1=\u5283\u7562\u7DEC\u88E1\u8F2F
|
||||||
|
NGram.KANJI_3_2=\u5009\u502B\u5049\u5075\u507D\u5091\u5098\u50B5\u50B7\u50BE\u5100\u5104\u511F\u518A\u525B\u5289\u5442\u5805\u589C\u58C7\u5922\u596A\u5A66\u5B6B\u5BE7\u5BE9\u5DBA\u5E63\u5E7E\u5FB9\u6163\u616E\u6176\u61B2\u61B6\u61F8\u639B\u63DA\u63EE\u640D\u64B2\u64C1\u64EC\u6557\u6575\u6607\u66AB\u68C4\u6A39\u6C96\u6CC1\u6E1B\u6E6F\u6E9D\u6EC5\u6F01\u6F64\u6FC3\u7058\u707D\u7344\u7642\u76E4\u7832\u790E\u7B46\u7D05\u7D0B\u7D14\u7D19\u7D1B\u7D39\u7D61\u7DB1\u7DCA\u7DD2\u7DE0\u7DE9\u7DEF\u7DF4\u7E2E\u7E3E\u8105\u8108\u81E8\u8266\u84CB\u84EE\u85A9\u885D\u88DC\u8972\u8A02\u8A0E\u8A13\u8A17\u8A2A\u8A34\u8A3A\u8A3C\u8A69\u8A73\u8A95\u8AA0\u8AA4\u8AB2\u8AC7\u8ACB\u8B00\u8B1B\u8B1D\u8B5C\u8C9D\u8C9E\u8CA2\u8CA8\u8CA9\u8CAB\u8CAC\u8CB7\u8CBF\u8CC0\u8CDE\u8CE2\u8CFC\u8D08\u8DE1\u8E8D\u8ECC\u8EDF\u8EF8\u8F14\u8F1D\u8F2A\u8F44\u9055\u9069\u9077\u907C\u90F5\u91DD\u9285\u92FC\u9326\u932F\u9375\u9396\u93AE\u93E1\u9451\u9589\u95A3\u9663\u9670\u9673\u96BB\u9801\u9802\u9803\u9806\u9808\u9810\u983B\u984D\u9858\u9867\u98EF\u98F2\u98FE\u990A\u99D0\u9A0E\u9A5A\u9B5A\u9CE5\u9DB4\u9E97\u9F8D
|
||||||
|
NGram.KANJI_3_3=\u543E\u5BEE\u5F18\u6590\u725F\u83C5\u85E9\u9E93
|
||||||
|
NGram.KANJI_3_4=\u5016\u53AD\u5606\u5629\u58BE\u5F14\u6065\u6144\u646F\u647A\u67F5\u6953\u6C3E\u6F2C\u6F97\u6FB1\u7169\u71E6\u71ED\u74BD\u79BF\u7A1C\u7A4E\u7AAF\u7CDE\u7D17\u7D43\u7E55\u7FA8\u807E\u8139\u8490\u8569\u856A\u87FB\u8A23\u8AB9\u8AE6\u8AFA\u8B2C\u8CD1\u91D8\u92F8\u9318\u96DB\u99B4\u9BC9\u9C2D\u9CF6\u9D61\u9DFA
|
||||||
|
NGram.KANJI_3_5=\u4E26\u4F75\u4FC2\u500B\u5074\u5099\u512A\u5225\u5247\u5275\u5287\u52D5\u52D9\u52DD\u52E2\u5354\u54E1\u554F\u5712\u57F7\u5831\u5834\u5BAE\u5C0E\u5C64\u5CA1\u5CF6\u5E2B\u5E79\u5EAB\u5F35\u5F37\u5F8C\u5FA9\u611B\u614B\u63A1\u63DB\u6642\u66F8\u6771\u696D\u6975\u69CB\u6A19\u6A4B\u6A5F\u6BBA\u6C7A\u6E2C\u6E96\u6F22\u70BA\u7121\u71B1\u7372\u73FE\u74B0\u7570\u76E3\u78BA\u7A2E\u7A4D\u7AF6\u7BC0\u7BC4\u7BC9\u7C21\u7D00\u7D04\u7D0D\u7D1A\u7D30\u7D42\u7D44\u7D50\u7D66\u7D71\u7DAD\u7DDA\u7DE8\u7E54\u7F85\u7FA9\u7FD2\u8056\u805E\u8077\u8208\u83EF\u8449\u8853\u885B\u88FD\u8907\u898B\u898F\u8996\u89AA\u8A08\u8A18\u8A2D\u8A31\u8A55\u8A5E\u8A66\u8A71\u8A72\u8A8C\u8A8D\u8A9E\u8ABF\u8AD6\u8AF8\u8B58\u8B70\u8B77\u8CA0\u8CA1\u8CB4\u8CBB\u8CC7\u8CEA\u8ECA\u8ECD\u8F03\u8F09\u8F38\u8FB2\u9023\u9031\u9032\u904A\u904B\u904E\u9054\u9060\u9078\u907A\u9084\u9280\u9577\u9580\u958B\u9593\u9678\u967D\u968A\u968E\u969B\u96E2\u96E3\u96F2\u96FB\u97D3\u97FF\u9805\u9818\u982D\u984C\u985E\u98A8\u98DB\u9928\u99AC\u9BAE
|
||||||
|
NGram.KANJI_3_8=\u5F6B\u6C4E\u7B87\u8A70
|
||||||
|
NGram.KANJI_3_9=\u540B\u5B5C\u826E
|
||||||
|
NGram.KANJI_3_11=\u4F83\u4FF8\u51CB\u52BE\u53F1\u548B\u558B\u5CB1\u5D69\u5F3C\u620E\u621F\u64E2\u67DA\u6854\u69CC\u6A35\u6C8C\u6E1A\u6F15\u6FE0\u717D\u7252\u7AFA\u82D3\u83DF\u8431\u9041\u9149\u9798
|
||||||
|
NGram.KANJI_3_12=\u4ED5\u55E3\u572D\u57A3\u587E\u5983\u5A9B\u5C90\u5E61\u672D\u6960\u6F5F\u72D9\u72E9\u757F\u7949\u7950\u7E82\u7FCC\u82B8\u90B8\u91DC\u961C\u9B45
|
||||||
|
NGram.KANJI_3_13=\u55AB\u6249\u643E\u6841\u68B1\u725D\u7B8B\u7C95\u7E1E\u7F36\u8A03\u8A6B\u8E74\u95A4
|
||||||
|
NGram.KANJI_3_15=\u50AD\u50D1\u5132\u51F1\u55AC\u5617\u5687\u584A\u59EA\u5B30\u5BF5\u5C0B\u5C4D\u5EDF\u6182\u61A4\u64AB\u64FE\u66A2\u6897\u694A\u69CD\u6B3D\u6BC0\u6D29\u6F38\u7015\u7149\u71C8\u723A\u7336\u7345\u755D\u76C3\u78A9\u798D\u7AAE\u7DFB\u7E2B\u7F75\u7F77\u81E5\u834A\u852D\u85CD\u8755\u8A3B\u8A54\u8AE7\u8B02\u8B39\u8CAA\u8CE6\u8DA8\u8E5F\u8F5F\u905C\u912D\u919C\u92D2\u932B\u937E\u9418\u9583\u9812\u985B\u9905\u99B3\u99C1\u99D5\u9A30\u9CF3\u9D3B\u9D6C
|
||||||
|
NGram.KANJI_3_16=\u6D6C\u72FD\u77A5\u8956\u9C0D
|
||||||
|
NGram.KANJI_3_18=\u5919\u5F4A\u6063\u63AC\u649A\u6715\u6AD3\u71D0\u758B\u834F\u85F7\u88DF\u8F61\u93D1\u98F4\u9D60
|
||||||
|
NGram.KANJI_3_19=\u4F50\u7DB2\u962A
|
||||||
|
NGram.KANJI_3_22=\u5E96\u75D4\u91C6
|
||||||
|
NGram.KANJI_3_23=\u5E9A\u6C40\u821C\u839E\u8FED\u9EDB
|
||||||
|
NGram.KANJI_3_27=\u5F01\u66DC
|
||||||
|
NGram.KANJI_3_29=\u5023\u5208\u531D\u536F\u53E9\u54C9\u598A\u59BE\u5A20\u5D6F\u5DF3\u66C7\u66D6\u66F3\u6775\u6A3D\u6ADB\u6B86\u6C72\u6E25\u73EA\u7435\u760D\u7656\u7825\u78D0\u7A14\u7A6B\u7B20\u7BE0\u7CF8\u7DAC\u7DBB\u7DBE\u80E4\u80F4\u837B\u8466\u8568\u867B\u8A63\u91E7\u9320\u935B\u9591\u965B\u98E2\u990C\u9913\u9BAB
|
||||||
|
NGram.KANJI_3_30=\u60B6\u8AD2\u8CC2\u9237\u9328\u934D\u9397\u9830
|
||||||
|
NGram.KANJI_3_31=\u4FB6\u50D5\u51CD\u559A\u55AA\u5674\u5857\u585A\u5875\u58B3\u596E\u59E6\u5A41\u5D50\u5E25\u5E33\u5F59\u61C7\u61F2\u6368\u6383\u65AC\u68DF\u68F2\u6A3A\u6B04\u6DBC\u6DF5\u6E26\u6E4A\u6E67\u6F54\u6F70\u6FC1\u6FEB\u7159\u727D\u7652\u77EF\u78EF\u798E\u7A40\u7AAA\u7BE4\u7C60\u7CE7\u7CFE\u7D21\u7D33\u7D5E\u7D79\u7DB4\u7DBF\u7E1B\u7E8F\u7F70\u814E\u816B\u8178\u819A\u84BC\u85A6\u865C\u8766\u8A1F\u8A50\u8A60\u8A6E\u8A87\u8A98\u8AB0\u8ADC\u8AED\u8AEE\u8B0E\u8B19\u8CA7\u8CAF\u8CB8\u8CBC\u8CC3\u8CC4\u8CCA\u8CDC\u8CE0\u8CED\u8ED2\u8F29\u8F3F\u91E3\u920D\u9234\u925B\u9298\u9310\u934B\u958F\u95A5\u9727\u97FB\u9811\u984E\u98FC\u98FD\u99D2\u99FF\u9B31\u9BE8\u9C57\u9CE9\u9CF4\u9D28\u9DF9
|
||||||
|
NGram.KANJI_3_32=\u4E1E\u502D\u51A5\u5321\u58EC\u5A3C\u5BC5\u5CE8\u61A9\u620A\u65A1\u6714\u6853\u6893\u6C50\u6C5D\u7436\u745A\u745B\u773A\u7941\u7947\u8543\u865E\u8C5A\u914B\u99A8\u9AB8
|
||||||
|
NGram.KANJI_3_35=\u4E99\u5BA5\u5DFD\u608C\u60C7\u60DA\u6190\u61A7\u6753\u6777\u6787\u6B4E\u6F23\u6FE1\u6FEF\u7337\u7827\u786F\u7893\u7ABA\u7B94\u7BB8\u7C3E\u7D62\u7E6D\u80B1\u81BF\u81C6\u821B\u82E7\u83F0\u84D1\u86ED\u8888\u8B01\u8B04\u8F4D\u9291\u92E4\u932E\u9354\u936C\u939A\u9957\u9AED\u9BAA\u9BAD\u9BD6\u9BDB\u9C3B\u9D1B
|
||||||
|
NGram.KANJI_3_36=\u50C5\u53E2\u5EE0\u65BC\u70CF\u723E\u7D10\u7D9C\u806F\u8607\u862D\u8A0A\u8AFE\u8CD3\u9019\u9813\u9B6F
|
||||||
|
NGram.KANJI_3_37=\u4EA8\u4F3D\u5384\u5EFF\u60DF\u66DD\u6E5B\u8087\u82D1\u8FE6\u9640\u9E9F
|
||||||
|
NGram.KANJI_3_38=\u5147\u525D\u5678\u617E\u6372\u79A6\u8ABC\u92EA\u9438\u9817
|
||||||
|
NGram.KANJI_4_0=\u6D3C\u718F\u74EE\u8712
|
||||||
|
NGram.KANJI_4_9=\u4F84\u54C6\u5565\u68F1\u6D82\u83C7
|
||||||
|
NGram.KANJI_4_10=\u4FE9\u4FED\u51FF\u523D\u5300\u5364\u538C\u5450\u5455\u545C\u54D1\u54D7\u5578\u56A3\u58F6\u592F\u5CE6\u5D2D\u5E90\u6073\u607C\u60EB\u61D2\u62E2\u62E3\u631A\u6320\u6323\u6361\u63B7\u63B8\u63BA\u6405\u65A9\u65F7\u6619\u6655\u67A3\u67E0\u6805\u6808\u6866\u6868\u6869\u6A71\u6BE1\u6C79\u6CA5\u6CDE\u6DA4\u6DA7\u6DA9\u6E85\u70DB\u70E6\u70EB\u7115\u724D\u7410\u759F\u75AE\u75EA\u75F9\u762B\u763E\u76B1\u77EB\u783E\u79C3\u7A8D\u7A9C\u7B5D\u7BF1\u7EC5\u7ED2\u7EDE\u7EE3\u7EF7\u7EF8\u7EFD\u7F00\u7F0E\u7F15\u7F1A\u7F20\u7F24\u7F28\u7FA1\u7FD8\u8038\u803B\u804B\u80AE\u817B\u82C7\u8327\u835E\u8367\u83BA\u8424\u864F\u8681\u8682\u8715\u8717\u8721\u8747\u874E\u8845\u886C\u889C\u88E4\u89C5\u8BB6\u8BB9\u8BC0\u8BC5\u8BE1\u8BEB\u8BEC\u8BF5\u8C0E\u8C1A\u8D2E\u8D31\u8D43\u8D4E\u8D58\u8F67\u8F7F\u9489\u9499\u949D\u94A0\u94A5\u94AE\u94BE\u94D0\u94DB\u94F2\u9508\u950C\u951A\u9525\u952D\u952F\u9530\u953B\u9540\u9550\u9570\u9576\u95F0\u960E\u9668\u96CF\u97E7\u9885\u988A\u98A4\u9965\u9975\u997A\u997F\u9985\u998D\u998F\u9A6E\u9A6F\u9A74\u9A79\u9A7C\u9A82\u9A87\u9CA4\u9CC4\u9CCD\u9CD6\u9E20\u9E25\u9E35\u9E3D\u9E45\u9E49\u9E4A\u9E66
|
||||||
|
NGram.KANJI_4_16=\u576F\u579B\u6345\u78B4\u79EB\u79F8
|
||||||
|
NGram.KANJI_4_17=\u4E13\u4E1A\u4E1C\u4E24\u4E25\u4E2A\u4E3E\u4E49\u4E50\u4E66\u4E9A\u4EA7\u4EBF\u4ECE\u4EEC\u4EF7\u4F17\u4F20\u5170\u5173\u519B\u51B3\u51E4\u51FB\u5219\u521B\u522B\u529E\u52A1\u52A8\u52BF\u534F\u5355\u536B\u5386\u53BF\u53D1\u53D8\u542F\u5458\u54CD\u56E2\u56ED\u56F4\u56FE\u573A\u5904\u590D\u5934\u5B81\u5B9E\u5BF9\u5BFC\u5C14\u5C9B\u5E26\u5E7F\u5E94\u5F00\u5F20\u5F3A\u603B\u6218\u65E0\u65F6\u663E\u672F\u6743\u6784\u6807\u6C14\u6C49\u707E\u70ED\u73AF\u73B0\u7535\u76D1\u786E\u79CD\u79EF\u7B80\u7C7B\u7EA2\u7EA6\u7EA7\u7EAA\u7EBF\u7EC4\u7EC7\u7ED3\u7EDF\u7EE7\u7EED\u7EF4\u7F16\u7F57\u804C\u8054\u817E\u8282\u82CF\u83B7\u8425\u89C1\u89C2\u89C4\u89C6\u8BA1\u8BA4\u8BAE\u8BAF\u8BB0\u8BB8\u8BBA\u8BBE\u8BC1\u8BC4\u8BD1\u8BDD\u8BE5\u8BED\u8BF4\u8C03\u8D22\u8D23\u8D28\u8D39\u8D44\u8D5B\u8F66\u8F6C\u8F83\u8FBE\u8FC7\u8FD0\u8FD8\u8FD9\u8FDB\u8FDE\u9009\u94C1\u957F\u95E8\u95EE\u95F4\u95FB\u961F\u9633\u9645\u9646\u96BE\u9879\u9884\u9886\u9898\u98CE\u9A6C\u9F99
|
||||||
|
NGram.KANJI_4_18=\u51DB\u67B7
|
||||||
|
NGram.KANJI_4_22=\u4FA5\u545B\u5499\u5520\u5570\u56F1\u5A76\u5C96\u60AF\u60ED\u618B\u61A8\u62A0\u62A1\u62E7\u6363\u6390\u63B0\u6400\u6402\u6512\u6748\u70C1\u732C\u765E\u7663\u76CF\u7741\u781A\u7980\u79C6\u79FD\u7AA5\u7B0B\u7B8D\u7BA9\u7BAB\u7BD3\u7CAA\u7EAB\u7ECA\u7EE2\u7F2D\u7F30\u8110\u8113\u81CA\u835A\u8360\u84D6\u852B\u87E5\u8869\u8A8A\u8BA5\u8BF2\u8C05\u8C12\u8D30\u8D4A\u8D61\u8DF7\u8E6D\u8E8F\u8F95\u8F99\u8FAB\u94B3\u94C6\u94E3\u9504\u954A\u9563\u95FA\u9893\u9981\u9992\u9AA1\u9CAB\u9E2F\u9E33\u9EB8
|
||||||
|
NGram.KANJI_4_24=\u4E22\u4E8F\u4F1E\u4FA3\u5151\u517D\u51BB\u51D1\u5220\u529D\u52CB\u5367\u5389\u5395\u53E0\u53F9\u5413\u548F\u5524\u575E\u575F\u5784\u5792\u57A6\u57AB\u58F3\u5986\u5988\u5A04\u5A07\u5BA0\u5C18\u5C82\u5DE9\u5E10\u5E1C\u5F2F\u60E9\u6124\u629B\u6321\u6324\u635E\u63FD\u6401\u644A\u6491\u655B\u658B\u6635\u67AB\u67DC\u680B\u692D\u6984\u6A31\u6B7C\u6BD9\u6C22\u6CA6\u6CA7\u6CEA\u6CFB\u6CFC\u6D46\u6D47\u6D4A\u6D51\u6DA1\u6E0A\u6E83\u6EE4\u6EE5\u6F9C\u6FD2\u70C2\u7237\u727A\u730E\u7574\u75AF\u7792\u7816\u7845\u78B1\u7A77\u7A91\u7A9D\u7AD6\u7B3C\u7B5B\u7CAE\u7EA4\u7EB1\u7EBA\u7ECE\u7ED1\u7EF0\u7EF3\u7F14\u7F1D\u7F34\u7F62\u8042\u806A\u80A0\u80A4\u80BE\u80BF\u80C0\u810F\u8138\u8231\u8270\u829C\u82CD\u8350\u83B9\u841D\u8574\u8680\u8BB3\u8BBC\u8BBD\u8BC8\u8BF1\u8BFD\u8C0A\u8C0D\u8C1C\u8C24\u8C26\u8C2C\u8C2D\u8C34\u8D1E\u8D2C\u8D3C\u8D41\u8D42\u8D4C\u8D50\u8D5A\u8F69\u8F88\u8F90\u8FA9\u915D\u9171\u9493\u949E\u94A7\u94A9\u94BB\u94C3\u94C5\u94DD\u94F8\u9505\u9510\u9523\u9524\u95EF\u95F7\u95F9\u9600\u9610\u96F3\u97F5\u987D\u9882\u9888\u9896\u98D8\u9971\u9972\u9976\u997C\u9A84\u9A86\u9A8F\u9A97\u9A9A\u9AA4\u9CB8\u9CDE\u9E26\u9E43\u9E64\u9E70\u9F7F\u9F9F
|
||||||
|
NGram.KANJI_4_28=\u534E\u62A5\u7ECF\u7F51
|
||||||
|
NGram.KANJI_4_34=\u4E34\u4E3D\u4E4C\u4E54\u4E60\u4E61\u4E70\u4EB2\u4EC5\u4EEA\u4F18\u4F1F\u4F24\u4F26\u4FA7\u50A8\u513F\u5174\u517B\u518C\u519C\u51B5\u51CF\u5218\u521A\u5267\u52B3\u5356\u5382\u5385\u538B\u53A6\u5434\u5706\u5723\u5757\u575A\u575B\u575D\u5907\u591F\u593A\u5956\u5B59\u5BA1\u5BAB\u5BBD\u5BBE\u5BFB\u5C42\u5C81\u5E01\u5E08\u5E86\u5E93\u5F02\u5F39\u5F52\u5F55\u5F7B\u6000\u6001\u6076\u620F\u6237\u6267\u6269\u626C\u62A2\u62A4\u62DF\u62E5\u62E9\u6325\u635F\u6362\u6444\u6653\u6682\u6740\u6742\u6768\u6781\u6811\u6837\u6865\u68C0\u6B22\u6BC1\u6BD5\u6C47\u6C9F\u6CAA\u6CFD\u6D4B\u6DA8\u6E10\u6EE1\u6EE8\u706D\u7075\u70DF\u7231\u739B\u7597\u76D6\u76D8\u77FF\u7801\u7840\u79BB\u7A33\u7ADE\u7B14\u7B7E\u7CA4\u7D27\u7EB3\u7EBD\u7EC3\u7EC6\u7EC8\u7ECD\u7ED5\u7ED9\u7EDC\u7EDD\u7EE9\u7EFC\u7EFF\u7F13\u7F29\u8083\u80DC\u8111\u814A\u8230\u827A\u8363\u836F\u8428\u84DD\u867D\u8865\u88AD\u89C8\u8BA2\u8BA8\u8BA9\u8BAD\u8BB2\u8BBF\u8BC6\u8BCD\u8BD5\u8BEF\u8BF7\u8BF8\u8BFA\u8BFB\u8C08\u8D1D\u8D1F\u8D21\u8D25\u8D27\u8D2D\u8D2F\u8D35\u8D38\u8DC3\u8F6E\u8F6F\u8F7B\u8F7D\u8F86\u8F91\u8F93\u8F96\u8FB9\u8FBD\u8FC1\u8FDC\u8FDD\u9002\u9057\u90BB\u90D1\u91CA\u9488\u949F\u94A2\u94B1\u94F6\u9500\u9526\u9547\u9614\u9634\u9635\u9636\u9648\u9655\u9669\u9690\u97E9\u9875\u9876\u987A\u987B\u987E\u987F\u9891\u989D\u98DE\u9986\u9A7B\u9A8C\u9C81\u9C9C\u9F50
|
||||||
|
NGram.KANJI_4_39=\u4E1B\u4E1D\u4E27\u4EA9\u4ED1\u4ED3\u4F2A\u4FA6\u4FA8\u503A\u503E\u507F\u5188\u51AF\u51C0\u51C9\u51ED\u51EF\u5242\u5251\u52B2\u5362\u53A2\u5415\u5417\u5428\u55B7\u5760\u5899\u5939\u594B\u5987\u5A31\u5A74\u5BAA\u5C1D\u5C7F\u5C97\u5CAD\u5E05\u5E2E\u5E99\u5E9E\u5E9F\u5F03\u5FC6\u5FE7\u60AC\u60CA\u60EF\u626B\u6270\u629A\u62E6\u62E8\u6446\u6447\u654C\u67AA\u680F\u6863\u68A6\u6C64\u6D01\u6D53\u6D9D\u6DA6\u6E14\u6E17\u6EDA\u6EE9\u707F\u70BC\u70E7\u7275\u72B9\u72EE\u72F1\u743C\u7545\u76D0\u7855\u7978\u7B79\u7BEE\u7EA0\u7EAC\u7EAF\u7EB2\u7EB5\u7EB7\u7EB8\u7EB9\u7ED8\u7EEA\u7EF5\u7F05\u7F06\u7F18\u7F5A\u80C1\u80F6\u8109\u8206\u8273\u82F9\u8346\u8361\u83B2\u8427\u8651\u867E\u8854\u89C9\u8BC9\u8BCA\u8BD7\u8BDA\u8BDE\u8BE2\u8BE6\u8BFE\u8C01\u8C0B\u8C10\u8C13\u8C22\u8C23\u8C28\u8C31\u8D24\u8D26\u8D29\u8D2A\u8D2B\u8D34\u8D37\u8D3A\u8D3E\u8D3F\u8D4B\u8D4F\u8D54\u8D56\u8D5E\u8D60\u8D62\u8D75\u8D76\u8D8B\u8F68\u8F70\u8F74\u8F85\u8F89\u8FC8\u8FDF\u900A\u9012\u903B\u9093\u90AE\u917F\u9274\u94A6\u94DC\u94ED\u94FA\u94FE\u9501\u950B\u9519\u9521\u952E\u955C\u95EA\u95ED\u95F2\u95F8\u95FD\u9601\u9605\u9647\u96B6\u96FE\u9877\u9881\u9887\u9897\u989C\u98A0\u996D\u996E\u9970\u9A70\u9A71\u9A73\u9A76\u9A7E\u9A91\u9C7C\u9E1F\u9E21\u9E23\u9E2D\u9E3F\u9E4F\u9F84
|
||||||
|
NGram.KANJI_5_10=\u5239\u8EAF
|
||||||
|
NGram.KANJI_5_11=\u51C4\u8471
|
||||||
|
NGram.KANJI_5_12=\u6DC0\u7C98
|
||||||
|
NGram.KANJI_5_13=\u5631\u5815\u8695
|
||||||
|
NGram.KANJI_5_14=\u4E71\u4FA0\u5265\u52B1\u5374\u53A8\u53D9\u58EE\u5BDD\u5BFF\u5C3D\u5C4A\u5CE1\u5F25\u5F84\u604B\u60A6\u60E7\u60E8\u631F\u636E\u643A\u663C\u664B\u67A2\u6816\u697C\u6B8B\u6BB4\u6D45\u6E7F\u6EDE\u6F5C\u706F\u7089\u72ED\u732A\u732B\u76D7\u793C\u7977\u7A0E\u7A83\u80C6\u811A\u8131\u82A6\u830E\u848B\u865A\u866B\u86EE\u89E6\u8A89\u8DF5\u8E0A\u8E2A\u8F9E\u9065\u968F\u9759\u9EA6
|
||||||
|
NGram.KANJI_5_18=\u601C\u75D2
|
||||||
|
NGram.KANJI_5_26=\u4E07\u4E0E\u4E89\u4F1A\u4F53\u515A\u5185\u5199\u533A\u533B\u53C2\u53CC\u53F7\u58F0\u5965\u5B66\u5B9D\u5C06\u5C5E\u5F53\u62C5\u6570\u65AD\u65E7\u6761\u6765\u6A2A\u6B27\u6CA1\u6E29\u6E7E\u70B9\u72B6\u72EC\u732E\u753B\u79F0\u88C5\u9EC4
|
||||||
|
NGram.KANJI_5_29=\u693F\u82EB
|
||||||
|
NGram.KANJI_5_34=\u53F6\u6D9B\u83B1
|
||||||
|
NGram.KANJI_5_39=\u5C61\u788D
|
||||||
|
NGram.KANJI_6_0=\u4E10\u4E52\u4EC6\u4F88\u4FD0\u51F3\u533E\u53ED\u53EE\u5406\u541D\u5429\u5435\u5440\u5490\u5495\u54B1\u54C4\u54FC\u557C\u55D3\u5669\u56E4\u5777\u5992\u59E8\u5B7D\u5BDE\u5BE5\u5C79\u5C94\u5DCD\u5E18\u5E1A\u5E54\u5FF1\u604D\u6064\u60F6\u6127\u6177\u6233\u6252\u625B\u6273\u6296\u62C2\u62C7\u62F4\u638F\u6396\u63E3\u63EA\u6413\u6479\u64A9\u64C2\u659F\u667E\u6760\u6845\u6963\u6A90\u6B83\u6C13\u6C5E\u6D8E\u6D95\u6DCC\u6ED4\u6F13\u6F3E\u6FA1\u7076\u70D8\u710A\u71CE\u7239\u72E1\u73B7\u7599\u759A\u75A4\u75CA\u7629\u7682\u76C5\u76EF\u778E\u77AA\u787C\u7889\u788C\u78BE\u79E7\u7A96\u7A98\u7B77\u7C7D\u7CB1\u7D0A\u7D6E\u7F94\u7FCE\u8116\u814B\u814C\u819B\u828D\u82DF\u8301\u83E0\u85D5\u8611\u86A3\u8708\u8822\u8C4C\u8DB4\u8DEA\u8E42\u8E66\u8E72\u8EBA\u901B\u9157\u970E\u97ED
|
||||||
|
NGram.KANJI_6_3=\u62FC\u88D4\u9B4F
|
||||||
|
NGram.KANJI_6_9=\u4ED7\u4F63\u4FCF\u5018\u50BB\u50F5\u5154\u5201\u522E\u5254\u527F\u5306\u5462\u5492\u5496\u54A8\u54AA\u554A\u5561\u5564\u5566\u5885\u5938\u5AC2\u5AE9\u5CED\u5F64\u6084\u608D\u60A8\u60D5\u61C2\u61C8\u6254\u626F\u62AC\u6346\u634D\u640F\u6454\u6487\u6495\u64D2\u6746\u6789\u68B3\u68F5\u695E\u6986\u6995\u69A8\u6A44\u6AAC\u6B79\u6C28\u6C2E\u6CF5\u6DE4\u6E34\u6E3A\u6E89\u6F29\u70AB\u70AC\u7130\u715E\u7184\u71AC\u7238\u7281\u72E0\u74E3\u74F7\u7529\u7578\u761F\u7626\u76D4\u775B\u7779\u7784\u77BB\u780C\u780D\u7838\u7898\u78C5\u78F7\u7AED\u7B28\u7BE1\u7C07\u7CD5\u7CD9\u7CEF\u7F38\u800D\u8084\u809A\u8165\u816E\u832B\u8334\u840D\u8774\u886B\u888D\u88D9\u88F9\u8C41\u8D81\u8D9F\u8E22\u8E29\u8EB2\u8F9C\u9165\u918B\u9631\u964B\u964C\u9661\u9709\u9739\u9776\u9AD3\u9ED4
|
||||||
|
NGram.KANJI_6_10=\u4E53\u5582\u5600\u6342\u7B06
|
||||||
|
NGram.KANJI_6_11=\u5288\u543C\u5475\u5486\u54EE\u5598\u56BC\u5962\u5A36\u5A9A\u5B75\u5BA6\u5C38\u5C4E\u5F8A\u5F98\u627C\u62CC\u62D7\u63C9\u6930\u6954\u69D0\u6BEF\u6C90\u6CBD\u6CBE\u6F31\u6F88\u70D9\u7329\u75BC\u75F0\u7737\u77D7\u7B19\u7FB9\u803F\u80D6\u813E\u81C0\u8205\u8309\u83BD\u846B\u8517\u868C\u8759\u8815\u8859\u8B6C\u8E81\u8EAC\u90A2\u9698\u9B44
|
||||||
|
NGram.KANJI_6_12=\u722C\u7FD4
|
||||||
|
NGram.KANJI_6_16=\u5228\u5315\u542E\u54CE\u5509\u5527\u5543\u55B3\u55E1\u5636\u568E\u5FFF\u61E6\u6376\u642A\u6726\u74E4\u76F9\u7736\u7BD9\u8019\u80F0\u80F3\u812F\u818A\u8200\u8214\u8638\u869C\u86C0\u86C6\u86D4\u87C6\u88B1\u8902\u8C7A\u8E4B\u9119
|
||||||
|
NGram.KANJI_6_18=\u67D2\u6ED3\u87C0\u87CB\u8DDB\u901E\u9163
|
||||||
|
NGram.KANJI_6_20=\u4F5B\u52D2\u54C8\u62FF\u66FC\u6D59\u704C\u7586\u9ECE
|
||||||
|
NGram.KANJI_6_21=\u4E48\u4EFF\u4F19\u4FF1\u5021\u5077\u5195\u5212\u5269\u5401\u541E\u5427\u54EA\u5587\u558A\u55BB\u566A\u573E\u574E\u5783\u57AE\u584C\u58E4\u5960\u5976\u59CA\u5A1C\u5DE2\u5F99\u600E\u6015\u6263\u626D\u6293\u62C6\u62D6\u62EF\u62F1\u6316\u632A\u6380\u6389\u63D2\u641E\u64C5\u64CE\u65F1\u6664\u6735\u6770\u67EC\u6846\u684C\u68AD\u6B47\u6B49\u6B67\u6C1B\u6C27\u6C2F\u6C5B\u6C89\u6DF9\u6EAF\u70AE\u70E4\u731C\u7334\u73BB\u7470\u76FC\u788E\u789F\u78B0\u78B3\u7A0D\u7A3B\u7A57\u7CB9\u7F69\u8335\u8354\u84BF\u8DCC\u8DD1\u904F\u90A8\u9189\u9677\u9738\u978B
|
||||||
|
NGram.KANJI_6_22=\u5162\u53E8\u542D\u5501\u552C\u5639\u563F\u56B7\u6043\u60B4\u6194\u61CA\u634E\u63CD\u6414\u64AC\u6DAE\u6E43\u6F66\u7095\u7316\u733E\u7728\u7830\u78D5\u7ABF\u7FE9\u8018\u80EF\u8198\u8693\u86AA\u86AF\u874C\u8783\u879F\u8892\u8E6C
|
||||||
|
NGram.KANJI_6_23=\u4FD8\u4FEF\u501A\u5085\u5180\u526A\u5323\u54ED\u5634\u56CA\u58A9\u58F9\u5955\u5978\u59DA\u5A49\u5B55\u5BC7\u5BE8\u5D4C\u5E62\u6467\u64BC\u6500\u655E\u6572\u658C\u6670\u68CD\u68D5\u68E0\u6912\u6A0A\u6BB7\u6C9B\u6D3D\u6DC6\u6E23\u6F8E\u7011\u7092\u714C\u73AB\u7405\u7624\u76D2\u7960\u79C9\u7A20\u7BF7\u7F50\u804A\u8086\u81C2\u8292\u82DE\u852C\u857E\u859B\u8760\u8C6B\u8DBE\u8E48\u8F9F\u96A7
|
||||||
|
NGram.KANJI_6_25=\u4E8E\u5DF2\u5FB7\u7AD9
|
||||||
|
NGram.KANJI_6_28=\u4E58\u4ECD\u4EFD\u4F30\u4F60\u4F69\u503C\u5047\u51B0\u51F0\u5361\u5377\u53E6\u54E5\u552E\u5708\u5740\u5761\u57C3\u5821\u589E\u5979\u59C6\u5B69\u5B83\u5E15\u5E76\u5F17\u5F88\u6208\u622A\u624E\u627E\u62D4\u62DC\u63ED\u641C\u6536\u6548\u65C1\u665A\u6668\u67E5\u6B65\u6BCF\u6C61\u6CDB\u6D4E\u6D89\u6DB5\u6E38\u6EAA\u6FB3\u70B8\u745F\u7538\u7A97\u7F3A\u7F55\u805A\u8258\u827E\u82AC\u8303\u83F2\u8482\u85CF\u8DDF\u903E\u9080\u970D\u9760\u9ED1\u9ED8
|
||||||
|
NGram.KANJI_6_29=\u634F\u6518\u7B50\u809B
|
||||||
|
NGram.KANJI_6_30=\u54A7\u57C2\u5AB3\u60CB\u6886\u8378\u85D0\u8671
|
||||||
|
NGram.KANJI_6_32=\u5080\u5121\u51A4\u54AC\u55DC\u592D\u5DEB\u6292\u68D8\u69B4\u6A59\u6E24\u7FC5\u80DA\u8180\u86DB\u8700\u8DCB\u9761
|
||||||
|
NGram.KANJI_6_34=\u4E30\u51E0\u542C\u613F
|
||||||
|
NGram.KANJI_6_35=\u4E56\u547B\u55FD\u5C41\u606C\u6115\u6CAE\u7119\u795F\u7CDC\u86C9\u86F9\u8713\u873B\u8757\u8925\u892A\u96F9
|
||||||
|
NGram.KANJI_6_37=\u51B2\u5308\u5398\u54B8\u59DC\u5C4F\u5D14\u5F6D\u60E0\u6241\u6350\u699C\u6BEB\u6C6A\u6CC4\u6DEE\u6F58\u6F6D\u7199\u77EE\u7ADF\u8058\u820D\u8212\u8389\u8587\u884D\u8881\u8FA8\u8FF9\u96D5
|
||||||
|
NGram.KANJI_6_39=\u574F\u6251\u6302
|
||||||
|
NGram.KANJI_7_0=\u52FA\u5544\u60F0\u6994\u86A4\u86E4
|
||||||
|
NGram.KANJI_7_3=\u4E59\u4E7E\u4EAD\u4EF0\u4EF2\u4F0F\u4F10\u4FAF\u4FCA\u500D\u501F\u5076\u508D\u50E7\u5112\u5146\u5192\u51AC\u51DD\u51FD\u5200\u5237\u524A\u52A3\u52C3\u52C7\u52DF\u5351\u5352\u5353\u5378\u537F\u53E5\u5439\u54FA\u574A\u5782\u57CB\u5893\u58C1\u5915\u5937\u5949\u5951\u5974\u59B9\u5A18\u5A5A\u5ACC\u5B54\u5B5D\u5B64\u5B8F\u5BBF\u5BD2\u5C3A\u5C6F\u5CB3\u5D07\u5DE7\u5E84\u5E8A\u5F26\u5F69\u5F70\u5F90\u5FAA\u5FCD\u6012\u6016\u602A\u60A0\u60B2\u60BC\u6148\u6162\u6170\u6291\u6298\u62AB\u62BC\u62BD\u62D2\u62D3\u62D8\u62F3\u6311\u638C\u6398\u63E1\u642C\u6458\u64A4\u654F\u656C\u659C\u65E2\u65E8\u65EC\u6606\u6614\u6676\u6691\u6696\u66F9\u6749\u676F\u679A\u679D\u67CF\u67D4\u67F1\u67F3\u67F4\u6817\u6842\u6843\u6851\u68A8\u68CB\u68D2\u6B20\u6B32\u6BBF\u6C57\u6C88\u6CCA\u6D17\u6D1E\u6D69\u6D6E\u6D78\u6DE1\u6DFB\u6E58\u6EB6\u6F0F\u6F20\u7070\u708E\u70AD\u7126\u718A\u71C3\u7267\u72C2\u731B\u7384\u73A9\u73CD\u7434\u75AB\u75DB\u76C6\u76FE\u773C\u7891\u78C1\u795D\u7965\u79D2\u79DF\u79E6\u7A00\u7B11\u7B51\u7B54\u7C89\u7C92\u7CD6\u7D2B\u7F8A\u7FBD\u7FFC\u8010\u80A5\u80CE\u8150\u8179\u819C\u8247\u829D\u82B3\u82D7\u82E6\u8302\u8336\u8352\u83CA\u83CC\u83DC\u845B\u846C\u84B2\u84B8\u84C4\u8584\u864E\u86C7\u8861\u8863\u8870\u888B\u8896\u88D5\u8986\u8C46\u8DA3\u8E0F\u8F9B\u8FC5\u8FEB\u8FF7\u9003\u9006\u902E\u9042\u9063\u90ED\u963B\u9676\u96EA\u9756\u9B3C\u9B42\u9F3B
|
||||||
|
NGram.KANJI_7_6=\u4E01\u4E03\u4E45\u4E5D\u4E88\u4E92\u4EA1\u4ECB\u4EE4\u4F01\u4F0A\u4F2F\u4F3C\u4F4E\u4F4F\u4F55\u4F8B\u4F9D\u4FBF\u4FEE\u505C\u50CF\u516B\u516D\u5175\u5177\u5178\u5207\u520A\u5224\u526F\u529F\u52A9\u5343\u5348\u535A\u5370\u53BB\u53CB\u53F3\u5409\u542B\u544A\u547C\u5584\u5747\u5802\u590F\u592B\u5931\u5947\u597D\u5A01\u5A92\u5B63\u5B8C\u5B97\u5BA2\u5BA3\u5BA4\u5BB3\u5BB9\u5BC6\u5BCC\u5BDF\u5C04\u5C1A\u5C45\u5C4B\u5CB8\u5DE6\u5E0C\u5E1D\u5E2D\u5E55\u5E8F\u5E95\u5E97\u5EA7\u5EB7\u5EF6\u5F8B\u5FAE\u5FC5\u5FD7\u5FF5\u601D\u6025\u606F\u60F3\u611F\u623F\u6253\u6279\u627F\u6295\u6297\u62EC\u6388\u6392\u63F4\u6545\u6551\u6574\u6599\u65C5\u65E9\u6613\u6620\u6625\u666E\u666F\u66B4\u66F4\u670D\u671B\u6728\u672B\u6751\u677E\u67B6\u6838\u6839\u6848\u68EE\u690D\u6982\u6A21\u6B4C\u6B62\u6B66\u6BB5\u6BCD\u6C0F\u6C38\u6C42\u6CBF\u6CE2\u6CE8\u6D0B\u6D3E\u6D88\u6DF1\u6E05\u6E56\u706B\u7167\u7206\u7236\u7247\u7387\u7530\u7537\u7559\u7565\u7591\u75C5\u767B\u767D\u767E\u7687\u76DB\u76DF\u771F\u7763\u77ED\u7834\u79FB\u7A81\u7AE0\u7AEF\u7B56\u7B97\u7C4D\u7CBE\u7D20\u7D22\u7F72\u7FA4\u8001\u8003\u81F4\u822A\u826F\u82B1\u8349\u843D\u878D\u8857\u89D2\u8B66\u8C37\u8D70\u8D85\u8D8A\u8DB3\u8FF0\u8FFD\u9001\u901F\u90A3\u90A6\u914D\u91CE\u9632\u963F\u9644\u964D\u9664\u96C4\u96E8\u9752\u9769\u98DF
|
||||||
|
NGram.KANJI_7_7=\u4E09\u4E0A\u4E0B\u4E0D\u4E16\u4E3B\u4E8B\u4E8C\u4EE3\u4EE5\u4F4D\u4F5C\u4F7F\u5165\u5168\u516C\u5171\u51FA\u5206\u5229\u5236\u524D\u529B\u52A0\u5316\u5317\u5357\u539F\u53CA\u53F0\u5408\u540C\u540D\u548C\u5730\u57FA\u5916\u591A\u5929\u5B50\u5B9A\u5BB6\u5C0F\u5C71\u5DDE\u5DE5\u5E02\u5E73\u5EA6\u5EFA\u5F0F\u6027\u6210\u6240\u6307\u653F\u6587\u65B0\u65B9\u660E\u6700\u6709\u671F\u672C\u6B21\u6B63\u6C11\u6CBB\u6CD5\u6D77\u7269\u7279\u7406\u751F\u7528\u7531\u754C\u76EE\u76F8\u793E\u79D1\u7ACB\u7B2C\u7B49\u7CFB\u8005\u80FD\u81EA\u82F1\u884C\u8868\u897F\u8981\u901A\u9053\u90E8\u90FD\u91CD\u9AD8
|
||||||
|
NGram.KANJI_7_9=\u4E4D\u4F36\u5319\u6A61\u6DCB\u7194
|
||||||
|
NGram.KANJI_7_11=\u4E5E\u4F43\u5026\u50FB\u515C\u5243\u5420\u5446\u54B3\u54BD\u553E\u55A7\u5703\u5984\u5AC9\u5B09\u5C51\u5DFE\u5ED3\u5F1B\u6055\u618E\u62D9\u65A7\u6652\u6977\u6EBA\u707C\u75D8\u79E4\u7AFF\u7B4F\u7CA5\u808B\u8098\u80B4\u8235\u82DB\u849C\u8549\u868A\u86FE\u8718\u914C
|
||||||
|
NGram.KANJI_7_12=\u4E08\u4E38\u4F8D\u50DA\u5203\u5256\u52C9\u52D8\u52FE\u5320\u533F\u5375\u53D4\u540F\u54E8\u56DA\u5806\u5996\u5999\u59A5\u59A8\u59FF\u5AE1\u5BB0\u5BF8\u5C09\u5C3F\u5C48\u5C65\u5D29\u5E06\u5E4C\u5EB5\u5EB6\u5EB8\u5F13\u5FCC\u5FD8\u6052\u606D\u609F\u60D1\u614E\u6247\u62B1\u6349\u64E6\u6577\u65ED\u6674\u6734\u67C4\u6850\u690E\u6A58\u6B3A\u6B89\u6C41\u6CBC\u6CCC\u6CF3\u6D74\u6DAF\u6DF3\u6ECB\u6F02\u6F84\u71E5\u7261\u7272\u72AC\u72FC\u733F\u7409\u755C\u76F2\u7720\u77AC\u77E2\u7802\u786B\u78E8\u7901\u7948\u79E9\u7A1A\u7A74\u7AE3\u7B4B\u7B52\u7BB1\u7C3F\u8015\u8096\u809D\u80A2\u80A9\u80AA\u80BA\u80F8\u8102\u810A\u8154\u8155\u8170\u817A\u81A8\u81ED\u820C\u8236\u82BD\u8305\u83E9\u83F1\u840C\u85FB\u8650\u8702\u8A93\u8E44\u8FB0\u9038\u9091\u90AA\u916C\u9175\u9177\u9685\u96C0\u96C7\u96CC\u97AD
|
||||||
|
NGram.KANJI_7_13=\u63D6\u803D
|
||||||
|
NGram.KANJI_7_16=\u602F\u7566
|
||||||
|
NGram.KANJI_7_18=\u634C\u7C38
|
||||||
|
NGram.KANJI_7_19=\u4E18\u4E73\u4E95\u4EAB\u4EC1\u4ED8\u4ED9\u4F11\u4F34\u4F38\u4F59\u4FB5\u4FC3\u4FD7\u5012\u5019\u5065\u50AC\u5144\u5145\u514D\u517C\u51A0\u51B7\u5211\u5238\u523A\u523B\u5272\u52E4\u5360\u5371\u539A\u541B\u5426\u5438\u5473\u54F2\u5510\u552F\u5531\u559C\u5609\u56F0\u56FA\u591C\u5948\u594F\u59BB\u59D3\u5B85\u5B87\u5B88\u5B99\u5B9C\u5BC4\u5BFA\u5C0A\u5C3E\u5CA9\u5D0E\u5DE1\u5DE8\u5DEE\u5DF1\u5E45\u5E78\u5E7B\u5E7C\u5EAD\u5EF7\u5F1F\u5F31\u5F79\u5F7C\u5F85\u5F92\u5FA1\u5FE0\u6050\u60A3\u6212\u62DB\u632F\u6355\u63A2\u63AA\u63CF\u642D\u6469\u64CD\u653B\u6563\u660C\u662D\u667A\u6697\u66FF\u6750\u675F\u677F\u6790\u67D3\u682A\u6885\u68B0\u6B8A\u6B96\u6BDB\u6C60\u6CB9\u6CC9\u6D25\u6D66\u6DB2\u6DF7\u6E21\u6ED1\u6F2B\u6F6E\u6FC0\u7235\u725B\u72AF\u7389\u7532\u7533\u756A\u75BE\u75C7\u76AE\u76CA\u7740\u786C\u7956\u7968\u796D\u7981\u79C0\u79C1\u79CB\u79D8\u7A3F\u7AE5\u7AF9\u7E41\u7F6A\u7FFB\u8089\u80CC\u80DE\u81E3\u821E\u8239\u82E5\u8328\u8377\u85E4\u8840\u88C1\u88C2\u8C6A\u8D64\u8DDD\u8FCE\u8FD4\u9000\u9014\u907F\u90CA\u90CE\u90E1\u9152\u9178\u9686\u9694\u969C\u9707\u9732\u9AA8\u9B54\u9E7F\u9EBB
|
||||||
|
NGram.KANJI_7_20=\u4E39\u4E43\u4EAE\u4F73\u504F\u505A\u51C6\u51CC\u52AA\u5339\u5347\u53EB\u53EC\u5448\u5766\u57F9\u5854\u585E\u58A8\u5B8B\u5C01\u5CF0\u5E72\u5EC9\u5F80\u5F81\u5FBD\u5FEB\u6069\u6211\u624D\u628A\u62B5\u62CD\u6309\u63A7\u64AD\u6566\u6597\u65CB\u65D7\u6628\u6717\u6731\u674E\u675C\u683D\u6881\u6B3E\u6BD2\u6C7D\u6C99\u6CE5\u6CF0\u6D1B\u6D2A\u70C8\u719F\u724C\u7259\u73E0\u73ED\u745E\u74E6\u7518\u751A\u7686\u770B\u7B26\u8033\u80A1\u80E1\u821F\u83AB\u8499\u8D74\u8DE8\u900F\u9010\u9047\u904D\u906D\u9675\u96C5\u96F6\u96F7\u9700\u9F13
|
||||||
|
NGram.KANJI_7_21=\u5764\u59D0\u5A03\u6062\u6108\u68C9\u7164\u79BE\u7BAD\u903C
|
||||||
|
NGram.KANJI_7_23=\u4EA5\u50B2\u532A\u5366\u543B\u54E9\u5632\u59D1\u5BB5\u5DF7\u5F6A\u5F6C\u5FFD\u6070\u6168\u61BE\u63A0\u63A9\u6478\u65A4\u68A7\u6A1F\u6CAB\u70F9\u711A\u723D\u7262\u72F8\u751C\u754F\u75B9\u76C8\u7709\u7897\u7CCA\u7F9E\u8299\u82AD\u82B9\u82D4\u8304\u84C9\u84EC\u854A\u85AF\u86D9\u8FA3\u9187\u97A0
|
||||||
|
NGram.KANJI_7_25=\u4E14\u4E5F\u4F46\u514B\u5176\u5230\u5373\u53EA\u540E\u5982\u5C3C\u5DF4\u6216\u62C9\u65AF\u66FE\u6B64\u6D32\u6D6A\u7BC7\u800C
|
||||||
|
NGram.KANJI_7_28=\u4E4E\u4E9B\u4EA6\u4EC0\u4FC4\u5403\u5957\u5C24\u6089\u6258\u67D0\u758F\u7FF0\u8D6B
|
||||||
|
NGram.KANJI_7_29=\u4FAE\u5944\u5A29\u6101\u62ED\u6328\u637B\u6666\u6687\u66AE\u673D\u6756\u67FF\u6813\u68A2\u699B\u7078\u708A\u7396\u7422\u7525\u75E2\u76BF\u7766\u77B3\u7A3C\u7A92\u819D\u81FC\u8237\u8338\u8511\u88F3\u8FC2
|
||||||
|
NGram.KANJI_7_32=\u4E11\u4F3A\u4F51\u5197\u51B6\u51F9\u52FF\u541F\u5507\u5589\u5993\u5A7F\u5AC1\u5B9B\u5BC2\u5BE1\u5F04\u5F0A\u5F27\u6020\u6028\u6068\u6094\u6109\u611A\u614C\u621A\u62B9\u62D0\u62F7\u62FE\u632B\u633D\u6367\u660F\u6627\u6643\u66D9\u674F\u6795\u67AF\u67D1\u6876\u68DA\u68FA\u6905\u69FD\u6A80\u6B6A\u6CB8\u6CE3\u6DD1\u6DEB\u6E9C\u6EA2\u6EF4\u6F06\u714E\u716E\u722A\u7280\u74A7\u752B\u75B2\u75D5\u75F4\u77AD\u77E9\u785D\u79BD\u7A3D\u7A9F\u7B1B\u7B95\u7C9F\u7CDF\u80C3\u8106\u817F\u818F\u81B3\u828B\u82A5\u82AF\u840E\u851A\u853D\u8776\u87F9\u8877\u8910\u8912\u8C79\u8D66\u8FB1\u9017\u90C1\u916A\u9699\u96C1\u971C\u9774\u978D
|
||||||
|
NGram.KANJI_7_33=\u4E4B\u4E86\u4E94\u4EA4\u4EAC\u4ECA\u4ED6\u4EF6\u4EFB\u4F9B\u4FDD\u4FE1\u5143\u5148\u5149\u518D\u5217\u521D\u5305\u5341\u534A\u53C8\u53CD\u53D6\u53D7\u53E3\u53E4\u53EF\u53F2\u53F8\u5404\u5411\u5468\u547D\u54C1\u5546\u5668\u56DB\u56DE\u56E0\u571F\u578B\u57CE\u57DF\u5883\u58EB\u592A\u592E\u5973\u59CB\u59D4\u5B57\u5B58\u5B89\u5B98\u5C11\u5C31\u5C40\u5C55\u5DDD\u5E03\u5E38\u5E9C\u5F15\u5F62\u5F71\u5F97\u5FC3\u60C5\u610F\u624B\u6280\u6301\u63A5\u63A8\u63D0\u652F\u6539\u653E\u6559\u65BD\u65CF\u661F\u66F2\u671D\u672A\u6797\u679C\u6821\u683C\u6B7B\u6BD4\u6C34\u6C5F\u6CB3\u6D3B\u6D41\u6E2F\u6E90\u6F14\u7136\u7248\u738B\u7403\u76F4\u7701\u77E5\u77F3\u7814\u793A\u795E\u798F\u7A0B\u7A76\u7A7A\u7BA1\u7C73\u7F6E\u7F8E\u80B2\u81F3\u822C\u8272\u8457\u88AB\u89E3\u8A00\u8C61\u8D77\u8DEF\u8EAB\u8FD1\u9020\u91CC\u91CF\u91D1\u9650\u9662\u96C6\u975E\u9762\u97F3\u9996\u9999
|
||||||
|
NGram.KANJI_7_35=\u55C5\u57A2\u58D5\u59E5\u637A\u74E2\u7CE0\u895F
|
||||||
|
NGram.KANJI_7_37=\u4E19\u4E32\u4E4F\u4E91\u4EC7\u4ED4\u4F0D\u5141\u51E1\u51F6\u51F8\u52AB\u535C\u53C9\u53DB\u540A\u5410\u54C0\u559D\u5750\u5751\u576A\u57E0\u5824\u582A\u5830\u5835\u5851\u5858\u586B\u5954\u59FB\u5A46\u5B5F\u5BB4\u5BD3\u5C16\u5C60\u5CFB\u5D16\u5E16\u5E3D\u5E7D\u5E87\u5ECA\u5FD9\u60DC\u60F9\u6155\u6167\u6234\u626E\u6276\u6284\u633A\u6377\u6492\u649E\u64B0\u6562\u6591\u65A5\u65E6\u65FA\u6602\u670B\u676D\u68AF\u695A\u6B23\u6BC5\u6C70\u6C83\u6CE1\u6D8C\u6DD8\u6E20\u71D5\u72D0\u72D7\u73B2\u73CA\u7433\u7483\u74DC\u74F6\u7554\u764C\u7761\u77DB\u78A7\u7A46\u7A7F\u7A84\u7C97\u7D2F\u7FC1\u7FE0\u8000\u8017\u808C\u80AF\u8404\u8461\u8463\u8475\u8513\u85AA\u8679\u86CB\u871C\u87BA\u88F8\u8C8C\u8DF3\u8FC4\u901D\u9022\u906E\u9075\u9192\u91C7\u966A\u971E\u9910\u9B41\u9F0E\u9F20
|
|
@ -8,6 +8,8 @@ using System.Threading.Tasks;
|
||||||
using MediaBrowser.Model.MediaInfo;
|
using MediaBrowser.Model.MediaInfo;
|
||||||
using MediaBrowser.Model.Logging;
|
using MediaBrowser.Model.Logging;
|
||||||
using UniversalDetector;
|
using UniversalDetector;
|
||||||
|
using NLangDetect.Core;
|
||||||
|
using MediaBrowser.Model.Serialization;
|
||||||
|
|
||||||
namespace Emby.Common.Implementations.TextEncoding
|
namespace Emby.Common.Implementations.TextEncoding
|
||||||
{
|
{
|
||||||
|
@ -15,11 +17,13 @@ namespace Emby.Common.Implementations.TextEncoding
|
||||||
{
|
{
|
||||||
private readonly IFileSystem _fileSystem;
|
private readonly IFileSystem _fileSystem;
|
||||||
private readonly ILogger _logger;
|
private readonly ILogger _logger;
|
||||||
|
private IJsonSerializer _json;
|
||||||
|
|
||||||
public TextEncoding(IFileSystem fileSystem, ILogger logger)
|
public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
|
||||||
{
|
{
|
||||||
_fileSystem = fileSystem;
|
_fileSystem = fileSystem;
|
||||||
_logger = logger;
|
_logger = logger;
|
||||||
|
_json = json;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Encoding GetASCIIEncoding()
|
public Encoding GetASCIIEncoding()
|
||||||
|
@ -63,6 +67,7 @@ namespace Emby.Common.Implementations.TextEncoding
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private bool _langDetectInitialized;
|
||||||
public string GetDetectedEncodingName(byte[] bytes, string language)
|
public string GetDetectedEncodingName(byte[] bytes, string language)
|
||||||
{
|
{
|
||||||
var encoding = GetInitialEncoding(bytes);
|
var encoding = GetInitialEncoding(bytes);
|
||||||
|
@ -72,6 +77,22 @@ namespace Emby.Common.Implementations.TextEncoding
|
||||||
return "utf-8";
|
return "utf-8";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!_langDetectInitialized)
|
||||||
|
{
|
||||||
|
_langDetectInitialized = true;
|
||||||
|
LanguageDetector.Initialize(_json);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (string.IsNullOrWhiteSpace(language))
|
||||||
|
{
|
||||||
|
language = DetectLanguage(bytes);
|
||||||
|
|
||||||
|
if (!string.IsNullOrWhiteSpace(language))
|
||||||
|
{
|
||||||
|
_logger.Debug("Text language detected as {0}", language);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
var charset = DetectCharset(bytes, language);
|
var charset = DetectCharset(bytes, language);
|
||||||
|
|
||||||
if (!string.IsNullOrWhiteSpace(charset))
|
if (!string.IsNullOrWhiteSpace(charset))
|
||||||
|
@ -95,6 +116,35 @@ namespace Emby.Common.Implementations.TextEncoding
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private string DetectLanguage(byte[] bytes)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
return LanguageDetector.DetectLanguage(Encoding.UTF8.GetString(bytes));
|
||||||
|
}
|
||||||
|
catch (NLangDetectException ex)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
return LanguageDetector.DetectLanguage(Encoding.ASCII.GetString(bytes));
|
||||||
|
}
|
||||||
|
catch (NLangDetectException ex)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
return LanguageDetector.DetectLanguage(Encoding.Unicode.GetString(bytes));
|
||||||
|
}
|
||||||
|
catch (NLangDetectException ex)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
public Encoding GetEncodingFromCharset(string charset)
|
public Encoding GetEncodingFromCharset(string charset)
|
||||||
{
|
{
|
||||||
if (string.IsNullOrWhiteSpace(charset))
|
if (string.IsNullOrWhiteSpace(charset))
|
||||||
|
@ -136,22 +186,29 @@ namespace Emby.Common.Implementations.TextEncoding
|
||||||
case "cze":
|
case "cze":
|
||||||
case "ces":
|
case "ces":
|
||||||
case "slo":
|
case "slo":
|
||||||
case "slk":
|
|
||||||
case "slv":
|
|
||||||
case "srp":
|
case "srp":
|
||||||
case "hrv":
|
case "hrv":
|
||||||
case "rum":
|
case "rum":
|
||||||
case "ron":
|
case "ron":
|
||||||
case "rup":
|
case "rup":
|
||||||
|
return "windows-1250";
|
||||||
|
// albanian
|
||||||
case "alb":
|
case "alb":
|
||||||
case "sqi":
|
case "sqi":
|
||||||
return "windows-1250";
|
return "windows-1250";
|
||||||
|
// slovak
|
||||||
|
case "slk":
|
||||||
|
case "slv":
|
||||||
|
return "windows-1250";
|
||||||
case "ara":
|
case "ara":
|
||||||
return "windows-1256";
|
return "windows-1256";
|
||||||
case "heb":
|
case "heb":
|
||||||
return "windows-1255";
|
return "windows-1255";
|
||||||
case "grc":
|
case "grc":
|
||||||
|
return "windows-1253";
|
||||||
|
// greek
|
||||||
case "gre":
|
case "gre":
|
||||||
|
case "ell":
|
||||||
return "windows-1253";
|
return "windows-1253";
|
||||||
case "crh":
|
case "crh":
|
||||||
case "ota":
|
case "ota":
|
||||||
|
|
|
@ -561,7 +561,7 @@ namespace Emby.Server.Core
|
||||||
StringExtensions.LocalizationManager = LocalizationManager;
|
StringExtensions.LocalizationManager = LocalizationManager;
|
||||||
RegisterSingleInstance(LocalizationManager);
|
RegisterSingleInstance(LocalizationManager);
|
||||||
|
|
||||||
ITextEncoding textEncoding = new TextEncoding(FileSystemManager, LogManager.GetLogger("TextEncoding"));
|
ITextEncoding textEncoding = new TextEncoding(FileSystemManager, LogManager.GetLogger("TextEncoding"), JsonSerializer);
|
||||||
RegisterSingleInstance(textEncoding);
|
RegisterSingleInstance(textEncoding);
|
||||||
Utilities.EncodingHelper = textEncoding;
|
Utilities.EncodingHelper = textEncoding;
|
||||||
RegisterSingleInstance<IBlurayExaminer>(() => new BdInfoExaminer(FileSystemManager, textEncoding));
|
RegisterSingleInstance<IBlurayExaminer>(() => new BdInfoExaminer(FileSystemManager, textEncoding));
|
||||||
|
|
|
@ -82,16 +82,16 @@ namespace MediaBrowser.Api.UserLibrary
|
||||||
[ApiMember(Name = "AiredDuringSeason", Description = "Gets all episodes that aired during a season, including specials.", IsRequired = false, DataType = "int", ParameterType = "query", Verb = "GET")]
|
[ApiMember(Name = "AiredDuringSeason", Description = "Gets all episodes that aired during a season, including specials.", IsRequired = false, DataType = "int", ParameterType = "query", Verb = "GET")]
|
||||||
public int? AiredDuringSeason { get; set; }
|
public int? AiredDuringSeason { get; set; }
|
||||||
|
|
||||||
[ApiMember(Name = "MinPremiereDate", Description = "Optional. The minimum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "POST")]
|
[ApiMember(Name = "MinPremiereDate", Description = "Optional. The minimum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "GET")]
|
||||||
public string MinPremiereDate { get; set; }
|
public string MinPremiereDate { get; set; }
|
||||||
|
|
||||||
[ApiMember(Name = "MinDateLastSaved", Description = "Optional. The minimum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "POST")]
|
[ApiMember(Name = "MinDateLastSaved", Description = "Optional. The minimum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "GET")]
|
||||||
public string MinDateLastSaved { get; set; }
|
public string MinDateLastSaved { get; set; }
|
||||||
|
|
||||||
[ApiMember(Name = "MinDateLastSavedForUser", Description = "Optional. The minimum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "POST")]
|
[ApiMember(Name = "MinDateLastSavedForUser", Description = "Optional. The minimum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "GET")]
|
||||||
public string MinDateLastSavedForUser { get; set; }
|
public string MinDateLastSavedForUser { get; set; }
|
||||||
|
|
||||||
[ApiMember(Name = "MaxPremiereDate", Description = "Optional. The maximum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "POST")]
|
[ApiMember(Name = "MaxPremiereDate", Description = "Optional. The maximum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "GET")]
|
||||||
public string MaxPremiereDate { get; set; }
|
public string MaxPremiereDate { get; set; }
|
||||||
|
|
||||||
[ApiMember(Name = "HasOverview", Description = "Optional filter by items that have an overview or not.", IsRequired = false, DataType = "bool", ParameterType = "query", Verb = "GET")]
|
[ApiMember(Name = "HasOverview", Description = "Optional filter by items that have an overview or not.", IsRequired = false, DataType = "bool", ParameterType = "query", Verb = "GET")]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user