jellyfin/Emby.Common.Implementations/TextEncoding/NLangDetect/Detector.cs

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using NLangDetect.Core.Extensions;
using NLangDetect.Core.Utils;

namespace NLangDetect.Core
{
    public class Detector
    {
        private const double _AlphaDefault = 0.5;
        private const double _AlphaWidth = 0.05;

        private const int _IterationLimit = 1000;
        private const double _ProbThreshold = 0.1;
        private const double _ConvThreshold = 0.99999;
        private const int _BaseFreq = 10000;

        private static readonly Regex _UrlRegex = new Regex("https?://[-_.?&~;+=/#0-9A-Za-z]+", RegexOptions.Compiled);
        private static readonly Regex _MailRegex = new Regex("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+", RegexOptions.Compiled);

        private readonly Dictionary<string, ProbVector> _wordLangProbMap;
        private readonly List<string> _langlist;

        private StringBuilder _text;
        private double[] _langprob;

        private double _alpha = _AlphaDefault;
        private const int _trialsCount = 7;
        private int _maxTextLength = 10000;
        private double[] _priorMap;
        private int? _seed;

        #region Constructor(s)

        public Detector(DetectorFactory factory)
        {
            _wordLangProbMap = factory.WordLangProbMap;
            _langlist = factory.Langlist;
            _text = new StringBuilder();
            _seed = factory.Seed;
        }

        #endregion

        #region Public methods

        public void SetAlpha(double alpha)
        {
            _alpha = alpha;
        }

        public void SetPriorMap(Dictionary<string, double> priorMap)
        {
            _priorMap = new double[_langlist.Count];

            double sump = 0;

            for (int i = 0; i < _priorMap.Length; i++)
            {
                string lang = _langlist[i];

                if (priorMap.ContainsKey(lang))
                {
                    double p = priorMap[lang];

                    if (p < 0)
                    {
                        throw new NLangDetectException("Prior probability must be non-negative.", ErrorCode.InitParamError);
                    }

                    _priorMap[i] = p;
                    sump += p;
                }
            }

            if (sump <= 0)
            {
                throw new NLangDetectException("More one of prior probability must be non-zero.", ErrorCode.InitParamError);
            }

            for (int i = 0; i < _priorMap.Length; i++)
            {
                _priorMap[i] /= sump;
            }
        }

        public void SetMaxTextLength(int max_text_length)
        {
            _maxTextLength = max_text_length;
        }

        // TODO IMM HI: TextReader?
        public void Append(StreamReader streamReader)
        {
            var buf = new char[_maxTextLength / 2];

            while (_text.Length < _maxTextLength && !streamReader.EndOfStream)
            {
                int length = streamReader.Read(buf, 0, buf.Length);

                Append(new string(buf, 0, length));
            }
        }

        public void Append(string text)
        {
            text = _UrlRegex.Replace(text, " ");
            text = _MailRegex.Replace(text, " ");

            char pre = '\0';

            for (int i = 0; i < text.Length && i < _maxTextLength; i++)
            {
                char c = NGram.Normalize(text[i]);

                if (c != ' ' || pre != ' ')
                {
                    _text.Append(c);
                }

                pre = c;
            }
        }

        private void CleanText()
        {
            int latinCount = 0, nonLatinCount = 0;

            for (int i = 0; i < _text.Length; i++)
            {
                char c = _text[i];

                if (c <= 'z' && c >= 'A')
                {
                    latinCount++;
                }
                else if (c >= '\u0300' && c.GetUnicodeBlock() != UnicodeBlock.LatinExtendedAdditional)
                {
                    nonLatinCount++;
                }
            }

            if (latinCount * 2 < nonLatinCount)
            {
                var textWithoutLatin = new StringBuilder();

                for (int i = 0; i < _text.Length; i++)
                {
                    char c = _text[i];

                    if (c > 'z' || c < 'A')
                    {
                        textWithoutLatin.Append(c);
                    }
                }

                _text = textWithoutLatin;
            }
        }

        public string Detect()
        {
            List<Language> probabilities = GetProbabilities();

            return
              probabilities.Count > 0
                ? probabilities[0].Name
                : null;
        }

        public List<Language> GetProbabilities()
        {
            if (_langprob == null)
            {
                DetectBlock();
            }

            List<Language> list = SortProbability(_langprob);

            return list;
        }

        #endregion

        #region Private helper methods

        private static double NormalizeProb(double[] probs)
        {
            double maxp = 0, sump = 0;

            sump += probs.Sum();

            for (int i = 0; i < probs.Length; i++)
            {
                double p = probs[i] / sump;

                if (maxp < p)
                {
                    maxp = p;
                }

                probs[i] = p;
            }

            return maxp;
        }

        private static string UnicodeEncode(string word)
        {
            var resultSb = new StringBuilder();

            foreach (char ch in word)
            {
                if (ch >= '\u0080')
                {
                    string st = string.Format("{0:x}", 0x10000 + ch);

                    while (st.Length < 4)
                    {
                        st = "0" + st;
                    }

                    resultSb
                      .Append("\\u")
                      .Append(st.SubSequence(1, 5));
                }
                else
                {
                    resultSb.Append(ch);
                }
            }

            return resultSb.ToString();
        }

        private void DetectBlock()
        {
            CleanText();

            List<string> ngrams = ExtractNGrams();

            if (ngrams.Count == 0)
            {
                throw new NLangDetectException("no features in text", ErrorCode.CantDetectError);
            }

            _langprob = new double[_langlist.Count];

            Random rand = (_seed.HasValue ? new Random(_seed.Value) : new Random());

            for (int t = 0; t < _trialsCount; t++)
            {
                double[] prob = InitProbability();

                // TODO IMM HI: verify it works
                double alpha = _alpha + rand.NextGaussian() * _AlphaWidth;

                for (int i = 0; ; i++)
                {
                    int r = rand.Next(ngrams.Count);

                    UpdateLangProb(prob, ngrams[r], alpha);

                    if (i % 5 == 0)
                    {
                        if (NormalizeProb(prob) > _ConvThreshold || i >= _IterationLimit)
                        {
                            break;
                        }
                    }
                }

                for (int j = 0; j < _langprob.Length; j++)
                {
                    _langprob[j] += prob[j] / _trialsCount;
                }
            }
        }

        private double[] InitProbability()
        {
            var prob = new double[_langlist.Count];

            if (_priorMap != null)
            {
                for (int i = 0; i < prob.Length; i++)
                {
                    prob[i] = _priorMap[i];
                }
            }
            else
            {
                for (int i = 0; i < prob.Length; i++)
                {
                    prob[i] = 1.0 / _langlist.Count;
                }
            }
            return prob;
        }

        private List<string> ExtractNGrams()
        {
            var list = new List<string>();
            NGram ngram = new NGram();

            for (int i = 0; i < _text.Length; i++)
            {
                ngram.AddChar(_text[i]);

                for (int n = 1; n <= NGram.GramsCount; n++)
                {
                    string w = ngram.Get(n);

                    if (w != null && _wordLangProbMap.ContainsKey(w))
                    {
                        list.Add(w);
                    }
                }
            }

            return list;
        }

        private void UpdateLangProb(double[] prob, string word, double alpha)
        {
            if (word == null || !_wordLangProbMap.ContainsKey(word))
            {
                return;
            }

            ProbVector langProbMap = _wordLangProbMap[word];
            double weight = alpha / _BaseFreq;

            for (int i = 0; i < prob.Length; i++)
            {
                prob[i] *= weight + langProbMap[i];
            }
        }

        private List<Language> SortProbability(double[] prob)
        {
            var list = new List<Language>();

            for (int j = 0; j < prob.Length; j++)
            {
                double p = prob[j];

                if (p > _ProbThreshold)
                {
                    for (int i = 0; i <= list.Count; i++)
                    {
                        if (i == list.Count || list[i].Probability < p)
                        {
                            list.Insert(i, new Language(_langlist[j], p));

                            break;
                        }
                    }
                }
            }

            return list;
        }

        #endregion
    }
}