272 lines
8.6 KiB
C#
272 lines
8.6 KiB
C#
using System;
|
|
using System.Text;
|
|
using MediaBrowser.Model.IO;
|
|
using MediaBrowser.Model.Serialization;
|
|
using MediaBrowser.Model.Text;
|
|
using Microsoft.Extensions.Logging;
|
|
using NLangDetect.Core;
|
|
using UniversalDetector;
|
|
|
|
namespace Emby.Server.Implementations.TextEncoding
|
|
{
|
|
public class TextEncoding : ITextEncoding
|
|
{
|
|
private readonly IFileSystem _fileSystem;
|
|
private readonly ILogger _logger;
|
|
private IJsonSerializer _json;
|
|
|
|
public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
|
|
{
|
|
_fileSystem = fileSystem;
|
|
_logger = logger;
|
|
_json = json;
|
|
}
|
|
|
|
public Encoding GetASCIIEncoding()
|
|
{
|
|
return Encoding.ASCII;
|
|
}
|
|
|
|
private static Encoding GetInitialEncoding(byte[] buffer, int count)
|
|
{
|
|
if (count >= 3)
|
|
{
|
|
if (buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
|
|
return Encoding.UTF8;
|
|
}
|
|
|
|
if (count >= 2)
|
|
{
|
|
if (buffer[0] == 0xfe && buffer[1] == 0xff)
|
|
return Encoding.Unicode;
|
|
}
|
|
|
|
if (count >= 4)
|
|
{
|
|
if (buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
|
|
return Encoding.UTF32;
|
|
}
|
|
|
|
if (count >= 3)
|
|
{
|
|
if (buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
|
|
return Encoding.UTF7;
|
|
}
|
|
|
|
var result = new TextEncodingDetect().DetectEncoding(buffer, count);
|
|
|
|
switch (result)
|
|
{
|
|
case TextEncodingDetect.CharacterEncoding.Ansi:
|
|
return Encoding.ASCII;
|
|
case TextEncodingDetect.CharacterEncoding.Ascii:
|
|
return Encoding.ASCII;
|
|
case TextEncodingDetect.CharacterEncoding.Utf16BeBom:
|
|
return Encoding.UTF32;
|
|
case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom:
|
|
return Encoding.UTF32;
|
|
case TextEncodingDetect.CharacterEncoding.Utf16LeBom:
|
|
return Encoding.UTF32;
|
|
case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom:
|
|
return Encoding.UTF32;
|
|
case TextEncodingDetect.CharacterEncoding.Utf8Bom:
|
|
return Encoding.UTF8;
|
|
case TextEncodingDetect.CharacterEncoding.Utf8Nobom:
|
|
return Encoding.UTF8;
|
|
default:
|
|
return null;
|
|
}
|
|
}
|
|
|
|
private bool _langDetectInitialized;
|
|
public string GetDetectedEncodingName(byte[] bytes, int count, string language, bool enableLanguageDetection)
|
|
{
|
|
var index = 0;
|
|
|
|
var encoding = GetInitialEncoding(bytes, count);
|
|
|
|
if (encoding != null && encoding.Equals(Encoding.UTF8))
|
|
{
|
|
return "utf-8";
|
|
}
|
|
|
|
if (string.IsNullOrWhiteSpace(language) && enableLanguageDetection)
|
|
{
|
|
if (!_langDetectInitialized)
|
|
{
|
|
_langDetectInitialized = true;
|
|
LanguageDetector.Initialize(_json);
|
|
}
|
|
|
|
language = DetectLanguage(bytes, index, count);
|
|
|
|
if (!string.IsNullOrWhiteSpace(language))
|
|
{
|
|
_logger.LogDebug("Text language detected as {0}", language);
|
|
}
|
|
}
|
|
|
|
var charset = DetectCharset(bytes, index, count, language);
|
|
|
|
if (!string.IsNullOrWhiteSpace(charset))
|
|
{
|
|
if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
|
|
{
|
|
return "utf-8";
|
|
}
|
|
|
|
if (!string.Equals(charset, "windows-1252", StringComparison.OrdinalIgnoreCase))
|
|
{
|
|
return charset;
|
|
}
|
|
}
|
|
|
|
if (!string.IsNullOrWhiteSpace(language))
|
|
{
|
|
return GetFileCharacterSetFromLanguage(language);
|
|
}
|
|
|
|
return null;
|
|
}
|
|
|
|
private string DetectLanguage(byte[] bytes, int index, int count)
|
|
{
|
|
try
|
|
{
|
|
return LanguageDetector.DetectLanguage(Encoding.UTF8.GetString(bytes, index, count));
|
|
}
|
|
catch (NLangDetectException ex)
|
|
{
|
|
_logger.LogDebug(ex, "LanguageDetector.DetectLanguage threw a NLangDetectException.");
|
|
}
|
|
|
|
try
|
|
{
|
|
return LanguageDetector.DetectLanguage(Encoding.ASCII.GetString(bytes, index, count));
|
|
}
|
|
catch (NLangDetectException ex)
|
|
{
|
|
_logger.LogDebug(ex, "LanguageDetector.DetectLanguage threw a NLangDetectException.");
|
|
}
|
|
|
|
try
|
|
{
|
|
return LanguageDetector.DetectLanguage(Encoding.Unicode.GetString(bytes, index, count));
|
|
}
|
|
catch (NLangDetectException ex)
|
|
{
|
|
_logger.LogDebug(ex, "LanguageDetector.DetectLanguage threw a NLangDetectException.");
|
|
}
|
|
|
|
return null;
|
|
}
|
|
|
|
public Encoding GetEncodingFromCharset(string charset)
|
|
{
|
|
if (string.IsNullOrWhiteSpace(charset))
|
|
{
|
|
throw new ArgumentNullException(nameof(charset));
|
|
}
|
|
|
|
_logger.LogDebug("Getting encoding object for character set: {0}", charset);
|
|
|
|
try
|
|
{
|
|
return Encoding.GetEncoding(charset);
|
|
}
|
|
catch (ArgumentException)
|
|
{
|
|
charset = charset.Replace("-", string.Empty);
|
|
_logger.LogDebug("Getting encoding object for character set: {0}", charset);
|
|
|
|
return Encoding.GetEncoding(charset);
|
|
}
|
|
}
|
|
|
|
public Encoding GetDetectedEncoding(byte[] bytes, int size, string language, bool enableLanguageDetection)
|
|
{
|
|
var charset = GetDetectedEncodingName(bytes, size, language, enableLanguageDetection);
|
|
|
|
return GetEncodingFromCharset(charset);
|
|
}
|
|
|
|
private static string GetFileCharacterSetFromLanguage(string language)
|
|
{
|
|
// https://developer.xamarin.com/api/type/System.Text.Encoding/
|
|
|
|
switch (language.ToLower())
|
|
{
|
|
case "tha":
|
|
return "windows-874";
|
|
case "hun":
|
|
return "windows-1252";
|
|
case "pol":
|
|
case "cze":
|
|
case "ces":
|
|
case "slo":
|
|
case "srp":
|
|
case "hrv":
|
|
case "rum":
|
|
case "ron":
|
|
case "rom":
|
|
case "rup":
|
|
return "windows-1250";
|
|
// albanian
|
|
case "alb":
|
|
case "sqi":
|
|
return "windows-1250";
|
|
// slovak
|
|
case "slk":
|
|
case "slv":
|
|
return "windows-1250";
|
|
case "ara":
|
|
return "windows-1256";
|
|
case "heb":
|
|
return "windows-1255";
|
|
case "grc":
|
|
return "windows-1253";
|
|
// greek
|
|
case "gre":
|
|
case "ell":
|
|
return "windows-1253";
|
|
case "crh":
|
|
case "ota":
|
|
case "tur":
|
|
return "windows-1254";
|
|
// bulgarian
|
|
case "bul":
|
|
case "bgr":
|
|
return "windows-1251";
|
|
case "rus":
|
|
return "windows-1251";
|
|
case "vie":
|
|
return "windows-1258";
|
|
case "kor":
|
|
return "cp949";
|
|
default:
|
|
return "windows-1252";
|
|
}
|
|
}
|
|
|
|
private static string DetectCharset(byte[] bytes, int index, int count, string language)
|
|
{
|
|
var detector = new CharsetDetector();
|
|
detector.Feed(bytes, index, count);
|
|
detector.DataEnd();
|
|
|
|
var charset = detector.Charset;
|
|
|
|
// This is often incorrectly indetected. If this happens, try to use other techniques instead
|
|
if (string.Equals("x-mac-cyrillic", charset, StringComparison.OrdinalIgnoreCase))
|
|
{
|
|
if (!string.IsNullOrWhiteSpace(language))
|
|
{
|
|
return null;
|
|
}
|
|
}
|
|
|
|
return charset;
|
|
}
|
|
}
|
|
}
|