using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;

namespace Emby.Common.Implementations.TextEncoding
{
    // Copyright 2015-2016 Jonathan Bennett
    //
    // https://www.autoitscript.com
    //
    // Licensed under the Apache License, Version 2.0 (the "License");
    // you may not use this file except in compliance with the License.
    // You may obtain a copy of the License at
    //
    //    http://www.apache.org/licenses/LICENSE-2.0
    //
    // Unless required by applicable law or agreed to in writing, software
    // distributed under the License is distributed on an "AS IS" BASIS,
    // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    // See the License for the specific language governing permissions and
    // limitations under the License.

    /// <summary>
    /// Heuristically detects the text encoding of a byte buffer (BOM check, UTF-8 validation,
    /// UTF-16 newline/null-distribution checks, then ANSI/binary fallback).
    /// Credit: https://github.com/AutoIt/text-encoding-detect
    /// </summary>
    public class TextEncodingDetect
    {
        // The BOM signatures never change, so they are shared by all instances.
        private static readonly byte[] Utf16BeBomBytes = { 0xFE, 0xFF };
        private static readonly byte[] Utf16LeBomBytes = { 0xFF, 0xFE };
        private static readonly byte[] Utf8BomBytes = { 0xEF, 0xBB, 0xBF };

        private bool _nullSuggestsBinary = true;
        private double _utf16ExpectedNullPercent = 70;
        private double _utf16UnexpectedNullPercent = 10;

        /// <summary>
        /// The encodings this detector can report.
        /// </summary>
        public enum CharacterEncoding
        {
            None, // Unknown or binary
            Ansi, // 0-255
            Ascii, // 0-127
            Utf8Bom, // UTF8 with BOM
            Utf8Nobom, // UTF8 without BOM
            Utf16LeBom, // UTF16 LE with BOM
            Utf16LeNoBom, // UTF16 LE without BOM
            Utf16BeBom, // UTF16-BE with BOM
            Utf16BeNoBom // UTF16-BE without BOM
        }

        /// <summary>
        /// Gets or sets whether the presence of nulls in a buffer indicates the buffer is
        /// binary data rather than text. Defaults to <c>true</c>.
        /// </summary>
        public bool NullSuggestsBinary
        {
            get { return _nullSuggestsBinary; }
            set { _nullSuggestsBinary = value; }
        }

        /// <summary>
        /// Gets or sets the percentage of nulls that must be exceeded at the "expected" byte
        /// positions for a buffer to be treated as UTF-16. Defaults to 70.
        /// Values outside the open interval (0, 100) are ignored.
        /// </summary>
        public double Utf16ExpectedNullPercent
        {
            get { return _utf16ExpectedNullPercent; }
            set
            {
                if (value > 0 && value < 100)
                {
                    _utf16ExpectedNullPercent = value;
                }
            }
        }

        /// <summary>
        /// Gets or sets the percentage of nulls that must not be reached at the "unexpected"
        /// byte positions for a buffer to be treated as UTF-16. Defaults to 10.
        /// Values outside the open interval (0, 100) are ignored.
        /// </summary>
        public double Utf16UnexpectedNullPercent
        {
            get { return _utf16UnexpectedNullPercent; }
            set
            {
                if (value > 0 && value < 100)
                {
                    _utf16UnexpectedNullPercent = value;
                }
            }
        }

        /// <summary>
        /// Gets the BOM length for a given Encoding mode.
        /// </summary>
        /// <param name="encoding">The encoding to query.</param>
        /// <returns>The BOM length in bytes (0 for encodings without a BOM).</returns>
        public static int GetBomLengthFromEncodingMode(CharacterEncoding encoding)
        {
            switch (encoding)
            {
                case CharacterEncoding.Utf16BeBom:
                case CharacterEncoding.Utf16LeBom:
                    return 2;

                case CharacterEncoding.Utf8Bom:
                    return 3;

                default:
                    return 0;
            }
        }

        /// <summary>
        /// Checks for a BOM sequence in a byte buffer.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The number of bytes of <paramref name="buffer"/> to examine.</param>
        /// <returns>Encoding type or <see cref="CharacterEncoding.None"/> if no BOM.</returns>
        public CharacterEncoding CheckBom(byte[] buffer, int size)
        {
            if (size >= 2)
            {
                if (buffer[0] == Utf16LeBomBytes[0] && buffer[1] == Utf16LeBomBytes[1])
                {
                    return CharacterEncoding.Utf16LeBom;
                }

                if (buffer[0] == Utf16BeBomBytes[0] && buffer[1] == Utf16BeBomBytes[1])
                {
                    return CharacterEncoding.Utf16BeBom;
                }
            }

            if (size >= 3
                && buffer[0] == Utf8BomBytes[0]
                && buffer[1] == Utf8BomBytes[1]
                && buffer[2] == Utf8BomBytes[2])
            {
                return CharacterEncoding.Utf8Bom;
            }

            return CharacterEncoding.None;
        }

        /// <summary>
        /// Automatically detects the Encoding type of a given byte buffer.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The size of the byte buffer.</param>
        /// <returns>The Encoding type or <see cref="CharacterEncoding.None"/> if unknown.</returns>
        public CharacterEncoding DetectEncoding(byte[] buffer, int size)
        {
            // First check if we have a BOM and return that if so
            CharacterEncoding encoding = CheckBom(buffer, size);
            if (encoding != CharacterEncoding.None)
            {
                return encoding;
            }

            // Now check for valid UTF8 (also reports pure ASCII)
            encoding = CheckUtf8(buffer, size);
            if (encoding != CharacterEncoding.None)
            {
                return encoding;
            }

            // Now try UTF16: newline characters first, then the null distribution
            encoding = CheckUtf16NewlineChars(buffer, size);
            if (encoding != CharacterEncoding.None)
            {
                return encoding;
            }

            encoding = CheckUtf16Ascii(buffer, size);
            if (encoding != CharacterEncoding.None)
            {
                return encoding;
            }

            // ANSI or None (binary) then
            if (!DoesContainNulls(buffer, size))
            {
                return CharacterEncoding.Ansi;
            }

            // Found a null, return based on the NullSuggestsBinary preference
            return _nullSuggestsBinary ? CharacterEncoding.None : CharacterEncoding.Ansi;
        }

        /// <summary>
        /// Checks if a buffer contains text that looks like utf16 by scanning for
        /// newline chars that would be present even in non-english text.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The size of the byte buffer.</param>
        /// <returns>
        /// <see cref="CharacterEncoding.None"/>, <see cref="CharacterEncoding.Utf16LeNoBom"/> or
        /// <see cref="CharacterEncoding.Utf16BeNoBom"/>.
        /// </returns>
        private static CharacterEncoding CheckUtf16NewlineChars(byte[] buffer, int size)
        {
            if (size < 2)
            {
                return CharacterEncoding.None;
            }

            var leControlChars = 0;
            var beControlChars = 0;

            // Walk complete byte pairs only; a trailing odd byte is ignored.
            for (int i = 0; i + 1 < size; i += 2)
            {
                byte first = buffer[i];
                byte second = buffer[i + 1];

                if (first == 0)
                {
                    if (second == 0x0a || second == 0x0d)
                    {
                        ++beControlChars;
                    }
                }
                else if (second == 0)
                {
                    if (first == 0x0a || first == 0x0d)
                    {
                        ++leControlChars;
                    }
                }

                // If we are getting both LE and BE control chars then this file is not utf16
                if (leControlChars > 0 && beControlChars > 0)
                {
                    return CharacterEncoding.None;
                }
            }

            if (leControlChars > 0)
            {
                return CharacterEncoding.Utf16LeNoBom;
            }

            return beControlChars > 0 ? CharacterEncoding.Utf16BeNoBom : CharacterEncoding.None;
        }

        /// <summary>
        /// Checks if a buffer contains any nulls. Used to check for binary vs text data.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The size of the byte buffer.</param>
        /// <returns><c>true</c> if any of the first <paramref name="size"/> bytes is zero.</returns>
        private static bool DoesContainNulls(byte[] buffer, int size)
        {
            for (int i = 0; i < size; i++)
            {
                if (buffer[i] == 0)
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Checks if a buffer contains text that looks like utf16. This is done based
        /// on the use of nulls which in ASCII/script like text can be useful to identify.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The size of the byte buffer.</param>
        /// <returns>
        /// <see cref="CharacterEncoding.None"/>, <see cref="CharacterEncoding.Utf16LeNoBom"/> or
        /// <see cref="CharacterEncoding.Utf16BeNoBom"/>.
        /// </returns>
        private CharacterEncoding CheckUtf16Ascii(byte[] buffer, int size)
        {
            // Fewer than two bytes cannot hold a UTF-16 code unit. Without this guard a lone
            // 0x00 byte yields a 200% "even null" ratio and is misreported as UTF-16 BE, and
            // an empty buffer divides by zero.
            if (size < 2)
            {
                return CharacterEncoding.None;
            }

            var numOddNulls = 0;
            var numEvenNulls = 0;

            for (int i = 0; i < size; i++)
            {
                if (buffer[i] != 0)
                {
                    continue;
                }

                if ((i & 1) == 0)
                {
                    numEvenNulls++;
                }
                else
                {
                    numOddNulls++;
                }
            }

            // Each parity class covers roughly half the buffer, hence the factor of two.
            double evenNullRatio = numEvenNulls * 2.0 / size;
            double oddNullRatio = numOddNulls * 2.0 / size;
            double expectedNullThreshold = _utf16ExpectedNullPercent / 100.0;
            double unexpectedNullThreshold = _utf16UnexpectedNullPercent / 100.0;

            // Lots of odd nulls, low number of even nulls
            if (evenNullRatio < unexpectedNullThreshold && oddNullRatio > expectedNullThreshold)
            {
                return CharacterEncoding.Utf16LeNoBom;
            }

            // Lots of even nulls, low number of odd nulls
            if (oddNullRatio < unexpectedNullThreshold && evenNullRatio > expectedNullThreshold)
            {
                return CharacterEncoding.Utf16BeNoBom;
            }

            // Don't know
            return CharacterEncoding.None;
        }

        /// <summary>
        /// Checks if a buffer contains valid utf8.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The size of the byte buffer.</param>
        /// <returns>
        /// <see cref="CharacterEncoding.None"/> (invalid UTF8, or a null byte was found while
        /// nulls suggest binary), <see cref="CharacterEncoding.Utf8Nobom"/> (valid utf8 multibyte
        /// strings) or <see cref="CharacterEncoding.Ascii"/> (data in 0-127 range).
        /// </returns>
        private CharacterEncoding CheckUtf8(byte[] buffer, int size)
        {
            // UTF8 Valid sequences
            // 0xxxxxxx  ASCII
            // 110xxxxx 10xxxxxx  2-byte
            // 1110xxxx 10xxxxxx 10xxxxxx  3-byte
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  4-byte
            //
            // Width in UTF8
            // Decimal      Width
            // 0-127        1 byte
            // 194-223      2 bytes
            // 224-239      3 bytes
            // 240-244      4 bytes
            //
            // Subsequent chars are in the range 128-191
            var onlySawAsciiRange = true;
            int pos = 0;

            while (pos < size)
            {
                byte ch = buffer[pos++];

                if (ch == 0 && _nullSuggestsBinary)
                {
                    return CharacterEncoding.None;
                }

                int moreChars;
                if (ch <= 127)
                {
                    // 1 byte
                    moreChars = 0;
                }
                else if (ch >= 194 && ch <= 223)
                {
                    // 2 Byte
                    moreChars = 1;
                }
                else if (ch >= 224 && ch <= 239)
                {
                    // 3 Byte
                    moreChars = 2;
                }
                else if (ch >= 240 && ch <= 244)
                {
                    // 4 Byte
                    moreChars = 3;
                }
                else
                {
                    return CharacterEncoding.None; // Not utf8
                }

                if (moreChars > 0)
                {
                    // A lead byte is itself outside 0-127. Record that here rather than in the
                    // continuation loop, otherwise a buffer that ends on a lead byte would be
                    // reported as pure ASCII.
                    onlySawAsciiRange = false;
                }

                // Check secondary chars are in range if we are expecting any.
                // A sequence cut off by the end of the buffer is tolerated
                // (presumably because the buffer may be a prefix of a larger file).
                while (moreChars > 0 && pos < size)
                {
                    ch = buffer[pos++];
                    if (ch < 128 || ch > 191)
                    {
                        return CharacterEncoding.None; // Not utf8
                    }

                    --moreChars;
                }
            }

            // If we get to here then only valid UTF-8 sequences have been processed.
            // If we only saw chars in the range 0-127 then we can't assume UTF8 (the caller will need to decide)
            return onlySawAsciiRange ? CharacterEncoding.Ascii : CharacterEncoding.Utf8Nobom;
        }
    }
}