jellyfin/Emby.Server.Implementations/TextEncoding/TextEncoding.cs

249 lines
7.8 KiB
C#
Raw Normal View History

2017-04-18 05:53:39 +00:00
using System;
using System.Text;
2016-11-01 04:07:12 +00:00
using MediaBrowser.Model.IO;
2017-04-18 05:53:39 +00:00
using MediaBrowser.Model.Logging;
2017-06-17 22:59:17 +00:00
using MediaBrowser.Model.Serialization;
using MediaBrowser.Model.Text;
using NLangDetect.Core;
using UniversalDetector;
2016-10-30 07:11:37 +00:00
namespace Emby.Server.Implementations.TextEncoding
2016-10-30 07:11:37 +00:00
{
2016-11-08 18:44:23 +00:00
public class TextEncoding : ITextEncoding
2016-10-30 07:11:37 +00:00
{
2016-11-01 04:07:12 +00:00
// File-system abstraction supplied by the host. Stored here, but no member visible in this class reads it.
private readonly IFileSystem _fileSystem;

// Receives the debug-level diagnostics written by the detection methods.
private readonly ILogger _logger;

// Handed to LanguageDetector.Initialize the first time language detection is requested.
// Only ever assigned in the constructor, hence readonly like its siblings.
private readonly IJsonSerializer _json;

/// <summary>
/// Initializes a new instance of the <see cref="TextEncoding"/> class.
/// </summary>
/// <param name="fileSystem">The file system abstraction.</param>
/// <param name="logger">The logger used for diagnostics.</param>
/// <param name="json">The JSON serializer passed on to the language detector.</param>
public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
{
    _fileSystem = fileSystem;
    _logger = logger;
    _json = json;
}
2016-11-08 18:44:23 +00:00
/// <summary>
/// Gets the ASCII encoding.
/// </summary>
/// <returns><see cref="Encoding.ASCII"/>.</returns>
public Encoding GetASCIIEncoding() => Encoding.ASCII;
2016-11-01 04:07:12 +00:00
2017-04-18 05:53:39 +00:00
/// <summary>
/// Makes a first guess at the encoding of <paramref name="buffer"/> by looking for a leading
/// byte-order mark, and falls back to <see cref="TextEncodingDetect"/> when none matches.
/// </summary>
/// <param name="buffer">The raw bytes to inspect. May be null, empty, or shorter than a byte-order mark.</param>
/// <returns>
/// The guessed encoding, or null when the buffer is null or empty or the detector reports
/// a result that is not mapped below.
/// </returns>
private Encoding GetInitialEncoding(byte[] buffer)
{
    if (buffer == null || buffer.Length == 0)
    {
        return null;
    }

    // Every indexed comparison below is guarded by a length check so that short inputs
    // (for example a one-byte file containing 0xEF) cannot index past the end of the array.
    var length = buffer.Length;

    // EF BB BF: UTF-8 byte-order mark.
    if (length >= 3 && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
    {
        return Encoding.UTF8;
    }

    // FE FF: UTF-16 big-endian byte-order mark. Encoding.Unicode is little-endian,
    // so the big-endian variant has to be returned here.
    if (length >= 2 && buffer[0] == 0xfe && buffer[1] == 0xff)
    {
        return Encoding.BigEndianUnicode;
    }

    // 00 00 FE FF: UTF-32 big-endian byte-order mark. Encoding.UTF32 is little-endian,
    // so an explicit big-endian instance is created.
    if (length >= 4 && buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
    {
        return new UTF32Encoding(true, true);
    }

    // 2B 2F 76: UTF-7 signature.
    if (length >= 3 && buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
    {
        return Encoding.UTF7;
    }

    var result = new TextEncodingDetect().DetectEncoding(buffer, length);

    switch (result)
    {
        case TextEncodingDetect.CharacterEncoding.Ansi:
        case TextEncodingDetect.CharacterEncoding.Ascii:
            return Encoding.ASCII;
        case TextEncodingDetect.CharacterEncoding.Utf16BeBom:
        case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom:
            return Encoding.BigEndianUnicode;
        case TextEncodingDetect.CharacterEncoding.Utf16LeBom:
        case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom:
            return Encoding.Unicode;
        case TextEncodingDetect.CharacterEncoding.Utf8Bom:
        case TextEncodingDetect.CharacterEncoding.Utf8Nobom:
            return Encoding.UTF8;
        default:
            return null;
    }
}
2017-06-17 22:59:17 +00:00
// Set to true just before this instance calls LanguageDetector.Initialize, so the call is made at most once per instance.
private bool _langDetectInitialized;

/// <summary>
/// Works out the name of the character set that <paramref name="bytes"/> are most likely encoded in.
/// </summary>
/// <param name="bytes">The raw text content.</param>
/// <param name="language">The language code of the text, if known. May be null or blank.</param>
/// <param name="enableLanguageDetection">
/// When true and <paramref name="language"/> is blank, the language is detected from the content.
/// </param>
/// <returns>
/// "utf-8" when the content looks like UTF-8; otherwise the detected character set unless it is
/// "windows-1252"; otherwise the character set associated with the language; otherwise null.
/// </returns>
public string GetDetectedEncodingName(byte[] bytes, string language, bool enableLanguageDetection)
{
    var initialGuess = GetInitialEncoding(bytes);
    if (initialGuess != null && initialGuess.Equals(Encoding.UTF8))
    {
        return "utf-8";
    }

    var shouldDetectLanguage = enableLanguageDetection && string.IsNullOrWhiteSpace(language);
    if (shouldDetectLanguage)
    {
        EnsureLanguageDetectorInitialized();

        language = DetectLanguage(bytes);
        if (!string.IsNullOrWhiteSpace(language))
        {
            _logger.Debug("Text language detected as {0}", language);
        }
    }

    var detectedCharset = DetectCharset(bytes, language);
    var charsetFound = !string.IsNullOrWhiteSpace(detectedCharset);

    if (charsetFound && string.Equals(detectedCharset, "utf-8", StringComparison.OrdinalIgnoreCase))
    {
        return "utf-8";
    }

    // A "windows-1252" result is not returned directly; the language-based lookup is tried instead.
    if (charsetFound && !string.Equals(detectedCharset, "windows-1252", StringComparison.OrdinalIgnoreCase))
    {
        return detectedCharset;
    }

    var languageKnown = !string.IsNullOrWhiteSpace(language);
    return languageKnown ? GetFileCharacterSetFromLanguage(language) : null;
}

// Calls LanguageDetector.Initialize on first use; the flag is raised before the call is made.
private void EnsureLanguageDetectorInitialized()
{
    if (_langDetectInitialized)
    {
        return;
    }

    _langDetectInitialized = true;
    LanguageDetector.Initialize(_json);
}
2017-04-18 05:53:39 +00:00
2017-06-17 22:59:17 +00:00
/// <summary>
/// Attempts to detect the language of the text in <paramref name="bytes"/> by decoding it as
/// UTF-8, then ASCII, then UTF-16 (little-endian), and handing each decoded string to
/// <see cref="LanguageDetector"/>.
/// </summary>
/// <param name="bytes">The raw text content.</param>
/// <returns>
/// The result of the first detection attempt that does not throw
/// <see cref="NLangDetectException"/>, or null when every attempt throws.
/// </returns>
private string DetectLanguage(byte[] bytes)
{
    // Tried in this order; the first attempt that completes without throwing wins.
    Encoding[] candidates = { Encoding.UTF8, Encoding.ASCII, Encoding.Unicode };

    foreach (var candidate in candidates)
    {
        try
        {
            return LanguageDetector.DetectLanguage(candidate.GetString(bytes));
        }
        catch (NLangDetectException ex)
        {
            // Detection is best-effort: record why this attempt failed and move on to the next decoding.
            _logger.Debug("Language detection failed for text decoded as {0}: {1}", candidate.WebName, ex.Message);
        }
    }

    return null;
}
2017-04-18 05:53:39 +00:00
/// <summary>
/// Resolves a character set name to an <see cref="Encoding"/> instance.
/// </summary>
/// <param name="charset">The character set name, for example "windows-1250".</param>
/// <returns>The matching encoding.</returns>
/// <exception cref="ArgumentNullException"><paramref name="charset"/> is null, empty or whitespace.</exception>
public Encoding GetEncodingFromCharset(string charset)
{
    if (string.IsNullOrWhiteSpace(charset))
    {
        throw new ArgumentNullException("charset");
    }

    const string logTemplate = "Getting encoding object for character set: {0}";

    _logger.Debug(logTemplate, charset);

    Encoding resolved;
    try
    {
        resolved = Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        // The name was rejected as given; retry once with the hyphens removed.
        // A failure of this second lookup propagates to the caller.
        var withoutHyphens = charset.Replace("-", string.Empty);
        _logger.Debug(logTemplate, withoutHyphens);
        resolved = Encoding.GetEncoding(withoutHyphens);
    }

    return resolved;
}
2017-06-18 07:11:55 +00:00
/// <summary>
/// Detects the character set of <paramref name="bytes"/> and returns the corresponding encoding.
/// </summary>
/// <param name="bytes">The raw text content.</param>
/// <param name="language">The language code of the text, if known. May be null or blank.</param>
/// <param name="enableLanguageDetection">Whether the language may be detected from the content when none is given.</param>
/// <returns>The encoding for the detected character set.</returns>
/// <exception cref="ArgumentNullException">
/// No character set name could be determined; <see cref="GetEncodingFromCharset"/> rejects the null name.
/// </exception>
public Encoding GetDetectedEncoding(byte[] bytes, string language, bool enableLanguageDetection)
{
    return GetEncodingFromCharset(GetDetectedEncodingName(bytes, language, enableLanguageDetection));
}
/// <summary>
/// Maps a three-letter language code to the character set name assumed for text in that language.
/// </summary>
/// <param name="language">The language code; matched case-insensitively. Must not be null.</param>
/// <returns>The character set name, or "windows-1252" for any code not listed below.</returns>
private string GetFileCharacterSetFromLanguage(string language)
{
    // https://developer.xamarin.com/api/type/System.Text.Encoding/

    // Lower-case with the invariant culture: a culture-sensitive ToLower() turns "I" into a
    // dotless "ı" under Turkish/Azeri cultures, so codes such as "VIE" or "CRH" would no
    // longer match their case labels.
    switch (language.ToLowerInvariant())
    {
        // NOTE(review): Hungarian is listed explicitly with the same value as the default; kept as in the original.
        case "hun":
            return "windows-1252";

        case "pol":
        case "cze":
        case "ces":
        case "slo":
        case "srp":
        case "hrv":
        case "rum":
        case "ron":
        case "rup":
        // albanian
        case "alb":
        case "sqi":
        // slovak, slovenian
        case "slk":
        case "slv":
            return "windows-1250";

        case "ara":
            return "windows-1256";

        case "heb":
            return "windows-1255";

        // greek
        case "grc":
        case "gre":
        case "ell":
            return "windows-1253";

        case "crh":
        case "ota":
        case "tur":
            return "windows-1254";

        // bulgarian
        case "bul":
        case "bgr":
        case "rus":
            return "windows-1251";

        case "vie":
            return "windows-1258";

        case "kor":
            return "cp949";

        default:
            return "windows-1252";
    }
}
/// <summary>
/// Runs <see cref="CharsetDetector"/> over <paramref name="bytes"/> and returns the character set it reports.
/// </summary>
/// <param name="bytes">The raw text content.</param>
/// <param name="language">The language code of the text, if known. May be null or blank.</param>
/// <returns>
/// The detector's character set, or null when it reports "x-mac-cyrillic" and a language is available.
/// </returns>
private string DetectCharset(byte[] bytes, string language)
{
    var detector = new CharsetDetector();
    detector.Feed(bytes, 0, bytes.Length);
    detector.DataEnd();

    var detected = detector.Charset;

    // "x-mac-cyrillic" is often detected incorrectly. When a language is known, discard the
    // result so the caller can use other techniques instead.
    var isMacCyrillic = string.Equals("x-mac-cyrillic", detected, StringComparison.OrdinalIgnoreCase);
    var languageKnown = !string.IsNullOrWhiteSpace(language);

    return (isMacCyrillic && languageKnown) ? null : detected;
}
2016-10-30 07:11:37 +00:00
}
}