// Listing metadata from the source viewer (not part of the program): 375 lines, 12 KiB, C#.
using System;
|
|
|
|
namespace NLangDetect.Core.Extensions
{
    /// <summary>
    /// Helpers that classify a character or code point by the Unicode block it falls in.
    /// </summary>
    /// <remarks>
    /// The lookup is driven by two parallel tables of equal length:
    /// <c>_unicodeBlockStarts[i]</c> is the first code point of range <c>i</c> and
    /// <c>_unicodeBlocks[i]</c> is the block assigned to that range, or <c>null</c>
    /// when the table marks the range as unassigned. Both tables must be edited together
    /// and the starts must stay sorted in strictly ascending order.
    /// </remarks>
    public static class CharExtensions
    {
        // Inclusive bounds of the code point range accepted by the lookup.
        private const int MinCodePoint = 0x000000;
        private const int MaxCodePoint = 0x10FFFF;

        // First code point of every range, ascending. Index-aligned with _unicodeBlocks.
        private static readonly int[] _unicodeBlockStarts =
        {
            #region Unicode block starts

            0x0000, // Basic Latin
            0x0080, // Latin-1 Supplement
            0x0100, // Latin Extended-A
            0x0180, // Latin Extended-B
            0x0250, // IPA Extensions
            0x02B0, // Spacing Modifier Letters
            0x0300, // Combining Diacritical Marks
            0x0370, // Greek and Coptic
            0x0400, // Cyrillic
            0x0500, // Cyrillic Supplementary
            0x0530, // Armenian
            0x0590, // Hebrew
            0x0600, // Arabic
            0x0700, // Syriac
            0x0750, // unassigned
            0x0780, // Thaana
            0x07C0, // unassigned
            0x0900, // Devanagari
            0x0980, // Bengali
            0x0A00, // Gurmukhi
            0x0A80, // Gujarati
            0x0B00, // Oriya
            0x0B80, // Tamil
            0x0C00, // Telugu
            0x0C80, // Kannada
            0x0D00, // Malayalam
            0x0D80, // Sinhala
            0x0E00, // Thai
            0x0E80, // Lao
            0x0F00, // Tibetan
            0x1000, // Myanmar
            0x10A0, // Georgian
            0x1100, // Hangul Jamo
            0x1200, // Ethiopic
            0x1380, // unassigned
            0x13A0, // Cherokee
            0x1400, // Unified Canadian Aboriginal Syllabics
            0x1680, // Ogham
            0x16A0, // Runic
            0x1700, // Tagalog
            0x1720, // Hanunoo
            0x1740, // Buhid
            0x1760, // Tagbanwa
            0x1780, // Khmer
            0x1800, // Mongolian
            0x18B0, // unassigned
            0x1900, // Limbu
            0x1950, // Tai Le
            0x1980, // unassigned
            0x19E0, // Khmer Symbols
            0x1A00, // unassigned
            0x1D00, // Phonetic Extensions
            0x1D80, // unassigned
            0x1E00, // Latin Extended Additional
            0x1F00, // Greek Extended
            0x2000, // General Punctuation
            0x2070, // Superscripts and Subscripts
            0x20A0, // Currency Symbols
            0x20D0, // Combining Diacritical Marks for Symbols
            0x2100, // Letterlike Symbols
            0x2150, // Number Forms
            0x2190, // Arrows
            0x2200, // Mathematical Operators
            0x2300, // Miscellaneous Technical
            0x2400, // Control Pictures
            0x2440, // Optical Character Recognition
            0x2460, // Enclosed Alphanumerics
            0x2500, // Box Drawing
            0x2580, // Block Elements
            0x25A0, // Geometric Shapes
            0x2600, // Miscellaneous Symbols
            0x2700, // Dingbats
            0x27C0, // Miscellaneous Mathematical Symbols-A
            0x27F0, // Supplemental Arrows-A
            0x2800, // Braille Patterns
            0x2900, // Supplemental Arrows-B
            0x2980, // Miscellaneous Mathematical Symbols-B
            0x2A00, // Supplemental Mathematical Operators
            0x2B00, // Miscellaneous Symbols and Arrows
            0x2C00, // unassigned
            0x2E80, // CJK Radicals Supplement
            0x2F00, // Kangxi Radicals
            0x2FE0, // unassigned
            0x2FF0, // Ideographic Description Characters
            0x3000, // CJK Symbols and Punctuation
            0x3040, // Hiragana
            0x30A0, // Katakana
            0x3100, // Bopomofo
            0x3130, // Hangul Compatibility Jamo
            0x3190, // Kanbun
            0x31A0, // Bopomofo Extended
            0x31C0, // unassigned
            0x31F0, // Katakana Phonetic Extensions
            0x3200, // Enclosed CJK Letters and Months
            0x3300, // CJK Compatibility
            0x3400, // CJK Unified Ideographs Extension A
            0x4DC0, // Yijing Hexagram Symbols
            0x4E00, // CJK Unified Ideographs
            0xA000, // Yi Syllables
            0xA490, // Yi Radicals
            0xA4D0, // unassigned
            0xAC00, // Hangul Syllables
            0xD7B0, // unassigned
            0xD800, // High Surrogates
            0xDB80, // High Private Use Surrogates
            0xDC00, // Low Surrogates
            0xE000, // Private Use
            0xF900, // CJK Compatibility Ideographs
            0xFB00, // Alphabetic Presentation Forms
            0xFB50, // Arabic Presentation Forms-A
            0xFE00, // Variation Selectors
            0xFE10, // unassigned
            0xFE20, // Combining Half Marks
            0xFE30, // CJK Compatibility Forms
            0xFE50, // Small Form Variants
            0xFE70, // Arabic Presentation Forms-B
            0xFF00, // Halfwidth and Fullwidth Forms
            0xFFF0, // Specials
            0x10000, // Linear B Syllabary
            0x10080, // Linear B Ideograms
            0x10100, // Aegean Numbers
            0x10140, // unassigned
            0x10300, // Old Italic
            0x10330, // Gothic
            0x10350, // unassigned
            0x10380, // Ugaritic
            0x103A0, // unassigned
            0x10400, // Deseret
            0x10450, // Shavian
            0x10480, // Osmanya
            0x104B0, // unassigned
            0x10800, // Cypriot Syllabary
            0x10840, // unassigned
            0x1D000, // Byzantine Musical Symbols
            0x1D100, // Musical Symbols
            0x1D200, // unassigned
            0x1D300, // Tai Xuan Jing Symbols
            0x1D360, // unassigned
            0x1D400, // Mathematical Alphanumeric Symbols
            0x1D800, // unassigned
            0x20000, // CJK Unified Ideographs Extension B
            0x2A6E0, // unassigned
            0x2F800, // CJK Compatibility Ideographs Supplement
            0x2FA20, // unassigned
            0xE0000, // Tags
            0xE0080, // unassigned
            0xE0100, // Variation Selectors Supplement
            0xE01F0, // unassigned
            0xF0000, // Supplementary Private Use Area-A
            0x100000, // Supplementary Private Use Area-B

            #endregion
        };

        // Block for every range in _unicodeBlockStarts; null marks a range the table
        // lists as unassigned.
        private static readonly UnicodeBlock?[] _unicodeBlocks =
        {
            #region Unicode blocks

            UnicodeBlock.BasicLatin,
            UnicodeBlock.Latin1Supplement,
            UnicodeBlock.LatinExtendedA,
            UnicodeBlock.LatinExtendedB,
            UnicodeBlock.IpaExtensions,
            UnicodeBlock.SpacingModifierLetters,
            UnicodeBlock.CombiningDiacriticalMarks,
            UnicodeBlock.Greek,
            UnicodeBlock.Cyrillic,
            UnicodeBlock.CyrillicSupplementary,
            UnicodeBlock.Armenian,
            UnicodeBlock.Hebrew,
            UnicodeBlock.Arabic,
            UnicodeBlock.Syriac,
            null,
            UnicodeBlock.Thaana,
            null,
            UnicodeBlock.Devanagari,
            UnicodeBlock.Bengali,
            UnicodeBlock.Gurmukhi,
            UnicodeBlock.Gujarati,
            UnicodeBlock.Oriya,
            UnicodeBlock.Tamil,
            UnicodeBlock.Telugu,
            UnicodeBlock.Kannada,
            UnicodeBlock.Malayalam,
            UnicodeBlock.Sinhala,
            UnicodeBlock.Thai,
            UnicodeBlock.Lao,
            UnicodeBlock.Tibetan,
            UnicodeBlock.Myanmar,
            UnicodeBlock.Georgian,
            UnicodeBlock.HangulJamo,
            UnicodeBlock.Ethiopic,
            null,
            UnicodeBlock.Cherokee,
            UnicodeBlock.UnifiedCanadianAboriginalSyllabics,
            UnicodeBlock.Ogham,
            UnicodeBlock.Runic,
            UnicodeBlock.Tagalog,
            UnicodeBlock.Hanunoo,
            UnicodeBlock.Buhid,
            UnicodeBlock.Tagbanwa,
            UnicodeBlock.Khmer,
            UnicodeBlock.Mongolian,
            null,
            UnicodeBlock.Limbu,
            UnicodeBlock.TaiLe,
            null,
            UnicodeBlock.KhmerSymbols,
            null,
            UnicodeBlock.PhoneticExtensions,
            null,
            UnicodeBlock.LatinExtendedAdditional,
            UnicodeBlock.GreekExtended,
            UnicodeBlock.GeneralPunctuation,
            UnicodeBlock.SuperscriptsAndSubscripts,
            UnicodeBlock.CurrencySymbols,
            UnicodeBlock.CombiningMarksForSymbols,
            UnicodeBlock.LetterlikeSymbols,
            UnicodeBlock.NumberForms,
            UnicodeBlock.Arrows,
            UnicodeBlock.MathematicalOperators,
            UnicodeBlock.MiscellaneousTechnical,
            UnicodeBlock.ControlPictures,
            UnicodeBlock.OpticalCharacterRecognition,
            UnicodeBlock.EnclosedAlphanumerics,
            UnicodeBlock.BoxDrawing,
            UnicodeBlock.BlockElements,
            UnicodeBlock.GeometricShapes,
            UnicodeBlock.MiscellaneousSymbols,
            UnicodeBlock.Dingbats,
            UnicodeBlock.MiscellaneousMathematicalSymbolsA,
            UnicodeBlock.SupplementalArrowsA,
            UnicodeBlock.BraillePatterns,
            UnicodeBlock.SupplementalArrowsB,
            UnicodeBlock.MiscellaneousMathematicalSymbolsB,
            UnicodeBlock.SupplementalMathematicalOperators,
            UnicodeBlock.MiscellaneousSymbolsAndArrows,
            null,
            UnicodeBlock.CjkRadicalsSupplement,
            UnicodeBlock.KangxiRadicals,
            null,
            UnicodeBlock.IdeographicDescriptionCharacters,
            UnicodeBlock.CjkSymbolsAndPunctuation,
            UnicodeBlock.Hiragana,
            UnicodeBlock.Katakana,
            UnicodeBlock.Bopomofo,
            UnicodeBlock.HangulCompatibilityJamo,
            UnicodeBlock.Kanbun,
            UnicodeBlock.BopomofoExtended,
            null,
            UnicodeBlock.KatakanaPhoneticExtensions,
            UnicodeBlock.EnclosedCjkLettersAndMonths,
            UnicodeBlock.CjkCompatibility,
            UnicodeBlock.CjkUnifiedIdeographsExtensionA,
            UnicodeBlock.YijingHexagramSymbols,
            UnicodeBlock.CjkUnifiedIdeographs,
            UnicodeBlock.YiSyllables,
            UnicodeBlock.YiRadicals,
            null,
            UnicodeBlock.HangulSyllables,
            null,
            UnicodeBlock.HighSurrogates,
            UnicodeBlock.HighPrivateUseSurrogates,
            UnicodeBlock.LowSurrogates,
            UnicodeBlock.PrivateUseArea,
            UnicodeBlock.CjkCompatibilityIdeographs,
            UnicodeBlock.AlphabeticPresentationForms,
            UnicodeBlock.ArabicPresentationFormsA,
            UnicodeBlock.VariationSelectors,
            null,
            UnicodeBlock.CombiningHalfMarks,
            UnicodeBlock.CjkCompatibilityForms,
            UnicodeBlock.SmallFormVariants,
            UnicodeBlock.ArabicPresentationFormsB,
            UnicodeBlock.HalfwidthAndFullwidthForms,
            UnicodeBlock.Specials,
            UnicodeBlock.LinearBSyllabary,
            UnicodeBlock.LinearBIdeograms,
            UnicodeBlock.AegeanNumbers,
            null,
            UnicodeBlock.OldItalic,
            UnicodeBlock.Gothic,
            null,
            UnicodeBlock.Ugaritic,
            null,
            UnicodeBlock.Deseret,
            UnicodeBlock.Shavian,
            UnicodeBlock.Osmanya,
            null,
            UnicodeBlock.CypriotSyllabary,
            null,
            UnicodeBlock.ByzantineMusicalSymbols,
            UnicodeBlock.MusicalSymbols,
            null,
            UnicodeBlock.TaiXuanJingSymbols,
            null,
            UnicodeBlock.MathematicalAlphanumericSymbols,
            null,
            UnicodeBlock.CjkUnifiedIdeographsExtensionB,
            null,
            UnicodeBlock.CjkCompatibilityIdeographsSupplement,
            null,
            UnicodeBlock.Tags,
            null,
            UnicodeBlock.VariationSelectorsSupplement,
            null,
            UnicodeBlock.SupplementaryPrivateUseAreaA,
            UnicodeBlock.SupplementaryPrivateUseAreaB,

            #endregion
        };

        #region Public methods

        /// <summary>
        /// Returns the Unicode block that contains the given UTF-16 code unit.
        /// </summary>
        /// <param name="ch">The character (a single UTF-16 code unit) to classify.</param>
        /// <returns>
        /// The block whose range contains <paramref name="ch"/>, or <c>null</c> when the
        /// lookup table marks that range as unassigned.
        /// </returns>
        /// <remarks>
        /// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
        /// <para>
        /// A <c>char</c> only spans 0x0000-0xFFFF, so this overload can never fail validation
        /// and never reaches the supplementary-plane entries of the table. Use the
        /// <c>int</c> overload for full code points.
        /// </para>
        /// </remarks>
        public static UnicodeBlock? GetUnicodeBlock(this char ch)
        {
            return GetUnicodeBlock((int)ch);
        }

        /// <summary>
        /// Returns the Unicode block that contains the given code point.
        /// </summary>
        /// <param name="codePoint">
        /// A code point between 0x000000 and 0x10FFFF inclusive.
        /// </param>
        /// <returns>
        /// The block whose range contains <paramref name="codePoint"/>, or <c>null</c> when
        /// the lookup table marks that range as unassigned.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="codePoint"/> is negative or greater than 0x10FFFF.
        /// </exception>
        public static UnicodeBlock? GetUnicodeBlock(int codePoint)
        {
            if (!IsValidCodePoint(codePoint))
            {
                throw new ArgumentOutOfRangeException("codePoint", codePoint, "Argument is not a valid code point.");
            }

            // BinarySearch returns the exact index on a hit. On a miss it returns the bitwise
            // complement of the index of the first larger start (or of Length when there is
            // none), so the containing range is the entry just before that position.
            int index = Array.BinarySearch(_unicodeBlockStarts, codePoint);

            if (index < 0)
            {
                // The first start equals MinCodePoint and codePoint has been validated,
                // so the complement is at least 1 and the result cannot go negative.
                index = ~index - 1;
            }

            return _unicodeBlocks[index];
        }

        #endregion

        #region Private helper methods

        /// <summary>
        /// Tells whether <paramref name="codePoint"/> lies within the inclusive range
        /// <c>MinCodePoint</c>..<c>MaxCodePoint</c>.
        /// </summary>
        private static bool IsValidCodePoint(int codePoint)
        {
            return codePoint >= MinCodePoint && codePoint <= MaxCodePoint;
        }

        #endregion
    }
}
|