diff options
Diffstat (limited to 'kmail/encodingdetector.cpp')
-rw-r--r-- | kmail/encodingdetector.cpp | 1377 |
1 files changed, 1377 insertions, 0 deletions
diff --git a/kmail/encodingdetector.cpp b/kmail/encodingdetector.cpp new file mode 100644 index 000000000..e5881d6f7 --- /dev/null +++ b/kmail/encodingdetector.cpp @@ -0,0 +1,1377 @@ +/* + This file was taken from the KDE 4.x libraries and backported to Qt 3. + + Copyright (C) 1999 Lars Knoll (knoll@kde.org) + Copyright (C) 2003 Dirk Mueller (mueller@kde.org) + Copyright (C) 2003 Apple Computer, Inc. + Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net) + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public License + along with this library; see the file COPYING.LIB. If not, write to + the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. +*/ +//---------------------------------------------------------------------------- +// +// decoder for input stream + +#include "encodingdetector.h" + +#undef DECODE_DEBUG +//#define DECODE_DEBUG + +#define MAX_BUFFER 16*1024 + +#include <assert.h> +#include <stdlib.h> + +#include "encodingdetector_ja_p.h" + +#include <qregexp.h> +#include <qtextcodec.h> + +#include <kglobal.h> +#include <kcharsets.h> +#include <kdebug.h> +#include <klocale.h> + +#include <ctype.h> + +// The following table was taken from libpango 1.19.3 and slightly modified. +// Multiple scripts per language were removed and the entries were reordered so +// that simple substring matching will work. For example, bam was put before ba +// so that the first match will be likely the right match. Otherwise "ba" would +// match "bam" but we would have to search on to find "bam" which is what we want. +// The original file is called pango-script-lang-table.h + +/* pango-script-lang-table.h: + * + * Generated by gen-script-for-lang-new.c + * Date: 2007-10-26 + * Source: fontconfig-2.4.91 + * + * Do not edit. // I did. Sue me ;) + */ +typedef struct _PangoScriptForLang { + const char lang[6]; + EncodingDetector::AutoDetectScript scripts[1]; +} PangoScriptForLang; + +//Unfortunately EncodingDetector does not know all scripts that Pango knows. +//Also, using EncodingDetector::CentralEuropean for the appropriate countries +//might give better results in some cases. +//One especially important (many speakers/literates) omission is the lack of +//Indian scripts. + +#define PANGO_SCRIPT_ARMENIAN EncodingDetector::None +#define PANGO_SCRIPT_BENGALI EncodingDetector::None +#define PANGO_SCRIPT_CANADIAN_ABORIGINAL EncodingDetector::None +#define PANGO_SCRIPT_CHEROKEE EncodingDetector::None +#define PANGO_SCRIPT_DEVANAGARI EncodingDetector::None +#define PANGO_SCRIPT_ETHIOPIC EncodingDetector::None +#define PANGO_SCRIPT_GUJARATI EncodingDetector::None +#define PANGO_SCRIPT_GURMUKHI EncodingDetector::None +#define PANGO_SCRIPT_KANNADA EncodingDetector::None +#define PANGO_SCRIPT_KHMER EncodingDetector::None +#define PANGO_SCRIPT_LAO EncodingDetector::None +#define PANGO_SCRIPT_MALAYALAM EncodingDetector::None +#define PANGO_SCRIPT_MONGOLIAN EncodingDetector::None +#define PANGO_SCRIPT_MYANMAR EncodingDetector::None +#define PANGO_SCRIPT_ORIYA EncodingDetector::None +#define PANGO_SCRIPT_SINHALA EncodingDetector::None +#define PANGO_SCRIPT_SYRIAC EncodingDetector::None +#define PANGO_SCRIPT_TAGALOG EncodingDetector::None +#define PANGO_SCRIPT_TAMIL EncodingDetector::None +#define PANGO_SCRIPT_TIBETAN EncodingDetector::None +#define PANGO_SCRIPT_TELUGU EncodingDetector::None + +//Instead of changing the table even more... +#define PANGO_SCRIPT_ARABIC EncodingDetector::Arabic +#define PANGO_SCRIPT_CYRILLIC EncodingDetector::Cyrillic +#define PANGO_SCRIPT_GEORGIAN EncodingDetector::SouthEasternEurope +#define PANGO_SCRIPT_GREEK EncodingDetector::Greek +#define PANGO_SCRIPT_HEBREW EncodingDetector::Hebrew +#define PANGO_SCRIPT_LATIN EncodingDetector::WesternEuropean +#define PANGO_SCRIPT_THAI EncodingDetector::Thai + + +static const PangoScriptForLang pango_script_for_lang[] = { + { "aa", { PANGO_SCRIPT_LATIN/*62*/ } }, + { "ab", { PANGO_SCRIPT_CYRILLIC/*90*/ } }, + { "af", { PANGO_SCRIPT_LATIN/*69*/ } }, + { "am", { PANGO_SCRIPT_ETHIOPIC/*218*/ } }, + { "ar", { PANGO_SCRIPT_ARABIC/*125*/ } }, + { "as", { PANGO_SCRIPT_BENGALI/*89*/ } }, + { "ast", { PANGO_SCRIPT_LATIN/*66*/ } }, + { "ava", { PANGO_SCRIPT_CYRILLIC/*67*/ } }, + { "ay", { PANGO_SCRIPT_LATIN/*60*/ } }, + { "az-ir", { PANGO_SCRIPT_ARABIC/*129*/ } }, + { "az", { PANGO_SCRIPT_CYRILLIC/*80*/ } }, //, PANGO_SCRIPT_LATIN/*68*/ } }, + { "bam", { PANGO_SCRIPT_LATIN/*60*/ } }, + { "ba", { PANGO_SCRIPT_CYRILLIC/*82*/ } }, + { "be", { PANGO_SCRIPT_CYRILLIC/*68*/ } }, + { "bg", { PANGO_SCRIPT_CYRILLIC/*60*/ } }, + { "bh", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, + { "bho", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, + { "bi", { PANGO_SCRIPT_LATIN/*58*/ } }, + { "bin", { PANGO_SCRIPT_LATIN/*76*/ } }, + { "bn", { PANGO_SCRIPT_BENGALI/*89*/ } }, + { "bo", { PANGO_SCRIPT_TIBETAN/*95*/ } }, + { "br", { PANGO_SCRIPT_LATIN/*64*/ } }, + { "bs", { PANGO_SCRIPT_LATIN/*62*/ } }, + { "bua", { PANGO_SCRIPT_CYRILLIC/*70*/ } }, + { "ca", { PANGO_SCRIPT_LATIN/*74*/ } }, + { "ce", { PANGO_SCRIPT_CYRILLIC/*67*/ } }, + { "chm", { PANGO_SCRIPT_CYRILLIC/*76*/ } }, + { "chr", { PANGO_SCRIPT_CHEROKEE/*85*/ } }, + { "ch", { PANGO_SCRIPT_LATIN/*58*/ } }, + { "co", { PANGO_SCRIPT_LATIN/*84*/ } }, + { "cs", { PANGO_SCRIPT_LATIN/*82*/ } }, + { "cu", { PANGO_SCRIPT_CYRILLIC/*103*/ } }, + { "cv", { PANGO_SCRIPT_CYRILLIC/*72*/ } }, //, PANGO_SCRIPT_LATIN/*2*/ } }, + { "cy", { PANGO_SCRIPT_LATIN/*78*/ } }, + { "da", { PANGO_SCRIPT_LATIN/*70*/ } }, + { "de", { PANGO_SCRIPT_LATIN/*59*/ } }, + { "dz", { PANGO_SCRIPT_TIBETAN/*95*/ } }, + { "el", { PANGO_SCRIPT_GREEK/*69*/ } }, + { "en", { PANGO_SCRIPT_LATIN/*72*/ } }, + { "eo", { PANGO_SCRIPT_LATIN/*64*/ } }, + { "es", { PANGO_SCRIPT_LATIN/*66*/ } }, +// { "et", { PANGO_SCRIPT_LATIN/*64*/ } }, + { "et", { EncodingDetector::Baltic } }, + { "eu", { PANGO_SCRIPT_LATIN/*56*/ } }, + { "fa", { PANGO_SCRIPT_ARABIC/*129*/ } }, + { "fi", { PANGO_SCRIPT_LATIN/*62*/ } }, + { "fj", { PANGO_SCRIPT_LATIN/*52*/ } }, + { "fo", { PANGO_SCRIPT_LATIN/*68*/ } }, + { "fr", { PANGO_SCRIPT_LATIN/*84*/ } }, + { "ful", { PANGO_SCRIPT_LATIN/*62*/ } }, + { "fur", { PANGO_SCRIPT_LATIN/*66*/ } }, + { "fy", { PANGO_SCRIPT_LATIN/*75*/ } }, + { "ga", { PANGO_SCRIPT_LATIN/*80*/ } }, + { "gd", { PANGO_SCRIPT_LATIN/*70*/ } }, + { "gez", { PANGO_SCRIPT_ETHIOPIC/*218*/ } }, + { "gl", { PANGO_SCRIPT_LATIN/*66*/ } }, + { "gn", { PANGO_SCRIPT_LATIN/*70*/ } }, + { "gu", { PANGO_SCRIPT_GUJARATI/*78*/ } }, + { "gv", { PANGO_SCRIPT_LATIN/*54*/ } }, + { "ha", { PANGO_SCRIPT_LATIN/*60*/ } }, + { "haw", { PANGO_SCRIPT_LATIN/*62*/ } }, + { "he", { PANGO_SCRIPT_HEBREW/*27*/ } }, + { "hi", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, + { "ho", { PANGO_SCRIPT_LATIN/*52*/ } }, + { "hr", { PANGO_SCRIPT_LATIN/*62*/ } }, + { "hu", { PANGO_SCRIPT_LATIN/*70*/ } }, + { "hy", { PANGO_SCRIPT_ARMENIAN/*77*/ } }, + { "ia", { PANGO_SCRIPT_LATIN/*52*/ } }, + { "ibo", { PANGO_SCRIPT_LATIN/*58*/ } }, + { "id", { PANGO_SCRIPT_LATIN/*54*/ } }, + { "ie", { PANGO_SCRIPT_LATIN/*52*/ } }, + { "ik", { PANGO_SCRIPT_CYRILLIC/*68*/ } }, + { "io", { PANGO_SCRIPT_LATIN/*52*/ } }, + { "is", { PANGO_SCRIPT_LATIN/*70*/ } }, + { "it", { PANGO_SCRIPT_LATIN/*72*/ } }, + { "iu", { PANGO_SCRIPT_CANADIAN_ABORIGINAL/*161*/ } }, +// { "ja", { PANGO_SCRIPT_HAN/*6356*/, PANGO_SCRIPT_KATAKANA/*88*/, PANGO_SCRIPT_HIRAGANA/*85*/ } }, + { "ja", { EncodingDetector::Japanese } }, + { "kaa", { PANGO_SCRIPT_CYRILLIC/*78*/ } }, + { "ka", { PANGO_SCRIPT_GEORGIAN/*33*/ } }, + { "ki", { PANGO_SCRIPT_LATIN/*56*/ } }, + { "kk", { PANGO_SCRIPT_CYRILLIC/*77*/ } }, + { "kl", { PANGO_SCRIPT_LATIN/*81*/ } }, + { "km", { PANGO_SCRIPT_KHMER/*70*/ } }, + { "kn", { PANGO_SCRIPT_KANNADA/*80*/ } }, +// { "ko", { PANGO_SCRIPT_HANGUL/*2443*/ } }, + { "ko", { EncodingDetector::Korean } }, + { "kok", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, + { "ks", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, + { "ku-ir", { PANGO_SCRIPT_ARABIC/*32*/ } }, + { "ku", { PANGO_SCRIPT_CYRILLIC/*60*/ } }, //, PANGO_SCRIPT_LATIN/*4*/ } }, + { "kum", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, + { "kv", { PANGO_SCRIPT_CYRILLIC/*70*/ } }, + { "kw", { PANGO_SCRIPT_LATIN/*64*/ } }, + { "ky", { PANGO_SCRIPT_CYRILLIC/*70*/ } }, + { "la", { PANGO_SCRIPT_LATIN/*68*/ } }, + { "lb", { PANGO_SCRIPT_LATIN/*75*/ } }, + { "lez", { PANGO_SCRIPT_CYRILLIC/*67*/ } }, + { "ln", { PANGO_SCRIPT_LATIN/*78*/ } }, + { "lo", { PANGO_SCRIPT_LAO/*65*/ } }, +// { "lt", { PANGO_SCRIPT_LATIN/*70*/ } }, + { "lt", { EncodingDetector::Baltic } }, +// { "lv", { PANGO_SCRIPT_LATIN/*78*/ } }, + { "lv", { EncodingDetector::Baltic } }, + { "mg", { PANGO_SCRIPT_LATIN/*56*/ } }, + { "mh", { PANGO_SCRIPT_LATIN/*62*/ } }, + { "mi", { PANGO_SCRIPT_LATIN/*64*/ } }, + { "mk", { PANGO_SCRIPT_CYRILLIC/*42*/ } }, + { "ml", { PANGO_SCRIPT_MALAYALAM/*78*/ } }, + { "mn", { PANGO_SCRIPT_MONGOLIAN/*130*/ } }, + { "mo", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, //, PANGO_SCRIPT_LATIN/*62*/ } }, + { "mr", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, + { "mt", { PANGO_SCRIPT_LATIN/*72*/ } }, + { "my", { PANGO_SCRIPT_MYANMAR/*48*/ } }, + { "nb", { PANGO_SCRIPT_LATIN/*70*/ } }, + { "nds", { PANGO_SCRIPT_LATIN/*59*/ } }, + { "ne", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, + { "nl", { PANGO_SCRIPT_LATIN/*82*/ } }, + { "nn", { PANGO_SCRIPT_LATIN/*76*/ } }, + { "no", { PANGO_SCRIPT_LATIN/*70*/ } }, + { "nr", { PANGO_SCRIPT_LATIN/*52*/ } }, + { "nso", { PANGO_SCRIPT_LATIN/*58*/ } }, + { "ny", { PANGO_SCRIPT_LATIN/*54*/ } }, + { "oc", { PANGO_SCRIPT_LATIN/*70*/ } }, + { "om", { PANGO_SCRIPT_LATIN/*52*/ } }, + { "or", { PANGO_SCRIPT_ORIYA/*79*/ } }, + { "os", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, + { "pa", { PANGO_SCRIPT_GURMUKHI/*63*/ } }, + { "pl", { PANGO_SCRIPT_LATIN/*70*/ } }, + { "ps-af", { PANGO_SCRIPT_ARABIC/*49*/ } }, + { "ps-pk", { PANGO_SCRIPT_ARABIC/*49*/ } }, + { "pt", { PANGO_SCRIPT_LATIN/*82*/ } }, + { "rm", { PANGO_SCRIPT_LATIN/*66*/ } }, + { "ro", { PANGO_SCRIPT_LATIN/*62*/ } }, + { "ru", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, + { "sah", { PANGO_SCRIPT_CYRILLIC/*76*/ } }, + { "sa", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, + { "sco", { PANGO_SCRIPT_LATIN/*56*/ } }, + { "sel", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, + { "se", { PANGO_SCRIPT_LATIN/*66*/ } }, + { "sh", { PANGO_SCRIPT_CYRILLIC/*76*/ } }, + { "si", { PANGO_SCRIPT_SINHALA/*77*/ } }, + { "sk", { PANGO_SCRIPT_LATIN/*86*/ } }, + { "sl", { PANGO_SCRIPT_LATIN/*62*/ } }, + { "sma", { PANGO_SCRIPT_LATIN/*60*/ } }, + { "smj", { PANGO_SCRIPT_LATIN/*60*/ } }, + { "smn", { PANGO_SCRIPT_LATIN/*68*/ } }, + { "sms", { PANGO_SCRIPT_LATIN/*80*/ } }, + { "sm", { PANGO_SCRIPT_LATIN/*52*/ } }, + { "so", { PANGO_SCRIPT_LATIN/*52*/ } }, + { "sq", { PANGO_SCRIPT_LATIN/*56*/ } }, + { "sr", { PANGO_SCRIPT_CYRILLIC/*76*/ } }, + { "ss", { PANGO_SCRIPT_LATIN/*52*/ } }, + { "st", { PANGO_SCRIPT_LATIN/*52*/ } }, + { "sv", { PANGO_SCRIPT_LATIN/*68*/ } }, + { "sw", { PANGO_SCRIPT_LATIN/*52*/ } }, + { "syr", { PANGO_SCRIPT_SYRIAC/*45*/ } }, + { "ta", { PANGO_SCRIPT_TAMIL/*48*/ } }, + { "te", { PANGO_SCRIPT_TELUGU/*80*/ } }, + { "tg", { PANGO_SCRIPT_CYRILLIC/*78*/ } }, + { "th", { PANGO_SCRIPT_THAI/*86*/ } }, + { "ti-er", { PANGO_SCRIPT_ETHIOPIC/*255*/ } }, + { "ti-et", { PANGO_SCRIPT_ETHIOPIC/*255*/ } }, + { "tig", { PANGO_SCRIPT_ETHIOPIC/*221*/ } }, + { "tk", { PANGO_SCRIPT_CYRILLIC/*74*/ } }, + { "tl", { PANGO_SCRIPT_TAGALOG/*19*/ } }, + { "tn", { PANGO_SCRIPT_LATIN/*58*/ } }, + { "to", { PANGO_SCRIPT_LATIN/*52*/ } }, +// { "tr", { PANGO_SCRIPT_LATIN/*70*/ } }, + { "tr", { EncodingDetector::Turkish } }, + { "ts", { PANGO_SCRIPT_LATIN/*52*/ } }, + { "tt", { PANGO_SCRIPT_CYRILLIC/*76*/ } }, + { "tw", { PANGO_SCRIPT_LATIN/*70*/ } }, + { "tyv", { PANGO_SCRIPT_CYRILLIC/*70*/ } }, + { "ug", { PANGO_SCRIPT_ARABIC/*125*/ } }, + { "uk", { PANGO_SCRIPT_CYRILLIC/*72*/ } }, + { "ur", { PANGO_SCRIPT_ARABIC/*145*/ } }, + { "uz", { PANGO_SCRIPT_CYRILLIC/*68*/ } }, + { "ven", { PANGO_SCRIPT_LATIN/*62*/ } }, + { "vi", { PANGO_SCRIPT_LATIN/*186*/ } }, + { "vot", { PANGO_SCRIPT_LATIN/*62*/ } }, + { "vo", { PANGO_SCRIPT_LATIN/*54*/ } }, + { "wa", { PANGO_SCRIPT_LATIN/*70*/ } }, + { "wen", { PANGO_SCRIPT_LATIN/*76*/ } }, + { "wo", { PANGO_SCRIPT_LATIN/*66*/ } }, + { "xh", { PANGO_SCRIPT_LATIN/*52*/ } }, + { "yap", { PANGO_SCRIPT_LATIN/*58*/ } }, + { "yi", { PANGO_SCRIPT_HEBREW/*27*/ } }, + { "yo", { PANGO_SCRIPT_LATIN/*114*/ } }, +// { "zh-cn", { PANGO_SCRIPT_HAN/*6763*/ } }, + { "zh-cn", { EncodingDetector::ChineseSimplified } }, +// { "zh-hk", { PANGO_SCRIPT_HAN/*2213*/ } }, + { "zh-hk", { EncodingDetector::ChineseTraditional } }, +// { "zh-mo", { PANGO_SCRIPT_HAN/*2213*/ } }, + { "zh-mo", { EncodingDetector::ChineseTraditional } }, +// { "zh-sg", { PANGO_SCRIPT_HAN/*6763*/ } }, + { "zh-sg", { EncodingDetector::ChineseSimplified } }, +// { "zh-tw", { PANGO_SCRIPT_HAN/*13063*/ } }, + { "zh-tw", { EncodingDetector::ChineseTraditional } }, + { "zu", { PANGO_SCRIPT_LATIN/*52*/ } }, + { "\x00", { EncodingDetector::None } } //end mark +}; + +enum MIB +{ + MibLatin1 = 4, + Mib8859_8 = 85, + MibUtf8 = 106, + MibUcs2 = 1000, + MibUtf16 = 1015, + MibUtf16BE = 1013, + MibUtf16LE = 1014 +}; + +static bool is16Bit(QTextCodec* codec) +{ + switch (codec->mibEnum()) + { + case MibUtf16: + case MibUtf16BE: + case MibUtf16LE: + case MibUcs2: + return true; + default: + return false; + } +} + +class EncodingDetectorPrivate +{ +public: + QTextCodec *m_codec; + QTextDecoder *m_decoder; // utf16 + QTextCodec *m_defaultCodec; + QCString m_storeDecoderName; + + EncodingDetector::EncodingChoiceSource m_source; + EncodingDetector::AutoDetectScript m_autoDetectLanguage; + + bool m_visualRTL : 1; + bool m_seenBody : 1; + bool m_writtingHappened : 1; + bool m_analyzeCalled : 1; //for decode() + int m_multiByte; + + QCString m_bufferForDefferedEncDetection; + + EncodingDetectorPrivate() + : m_codec(QTextCodec::codecForMib(MibLatin1)) + , m_decoder(m_codec->makeDecoder()) + , m_defaultCodec(m_codec) + , m_source(EncodingDetector::DefaultEncoding) + , m_autoDetectLanguage(EncodingDetector::SemiautomaticDetection) + , m_visualRTL(false) + , m_seenBody(false) + , m_writtingHappened(false) + , m_analyzeCalled(false) + , m_multiByte(0) + { + } + + EncodingDetectorPrivate(QTextCodec* codec,EncodingDetector::EncodingChoiceSource source, EncodingDetector::AutoDetectScript script) + : m_codec(codec) + , m_decoder(m_codec->makeDecoder()) + , m_defaultCodec(m_codec) + , m_source(source) + , m_autoDetectLanguage(script) + , m_visualRTL(false) + , m_seenBody(false) + , m_writtingHappened(false) + , m_analyzeCalled(false) + , m_multiByte(0) + { + } + + ~EncodingDetectorPrivate() + { + delete m_decoder; + } +}; + + +static QCString automaticDetectionForArabic( const unsigned char* ptr, int size ) +{ + for ( int i = 0; i < size; ++i ) { + if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3 + || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA ) + || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0 + || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) { + return "cp1256"; + } + } + + return "iso-8859-6"; +} + +static QCString automaticDetectionForBaltic( const unsigned char* ptr, int size ) +{ + for ( int i = 0; i < size; ++i ) { + if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) ) + return "cp1257"; + + if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 ) + return "iso-8859-13"; + } + + return "iso-8859-13"; +} + +static QCString automaticDetectionForCentralEuropean(const unsigned char* ptr, int size ) +{ + QCString charset; + for ( int i = 0; i < size; ++i ) { + if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) { + if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 ) + return "ibm852"; + + if ( i + 1 > size ) + return "cp1250"; + else { // maybe ibm852 ? + charset = "cp1250"; + continue; + } + } + if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) { + if ( i + 1 > size ) + return "iso-8859-2"; + else { // maybe ibm852 ? + if ( charset.isNull() ) + charset = "iso-8859-2"; + continue; + } + } + } + + if ( charset.isNull() ) + charset = "iso-8859-3"; + + return charset.data(); +} + +static QCString automaticDetectionForCyrillic( const unsigned char* ptr, int size) +{ +#ifdef DECODE_DEBUG + kWarning() << "EncodingDetector: Cyr heuristics"; +#endif + +// if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf) +// return "utf8"; + int utf8_mark=0; + int koi_score=0; + int cp1251_score=0; + + int koi_st=0; + int cp1251_st=0; + +// int koi_na=0; +// int cp1251_na=0; + + int koi_o_capital=0; + int koi_o=0; + int cp1251_o_capital=0; + int cp1251_o=0; + + int koi_a_capital=0; + int koi_a=0; + int cp1251_a_capital=0; + int cp1251_a=0; + + int koi_s_capital=0; + int koi_s=0; + int cp1251_s_capital=0; + int cp1251_s=0; + + int koi_i_capital=0; + int koi_i=0; + int cp1251_i_capital=0; + int cp1251_i=0; + + int cp1251_small_range=0; + int koi_small_range=0; + int ibm866_small_range=0; + + int i; + for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i) + { + if (ptr[i]>0xdf) + { + ++cp1251_small_range; + + if (ptr[i]==0xee)//small o + ++cp1251_o; + else if (ptr[i]==0xe0)//small a + ++cp1251_a; + else if (ptr[i]==0xe8)//small i + ++cp1251_i; + else if (ptr[i]==0xf1)//small s + ++cp1251_s; + else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)//small st + ++cp1251_st; + + else if (ptr[i]==0xef) + ++koi_o_capital; + else if (ptr[i]==0xe1) + ++koi_a_capital; + else if (ptr[i]==0xe9) + ++koi_i_capital; + else if (ptr[i]==0xf3) + ++koi_s_capital; + + } + else if (ptr[i]>0xbf) + { + ++koi_small_range; + + if (ptr[i]==0xd0||ptr[i]==0xd1)//small o + ++utf8_mark; + else if (ptr[i]==0xcf)//small o + ++koi_o; + else if (ptr[i]==0xc1)//small a + ++koi_a; + else if (ptr[i]==0xc9)//small i + ++koi_i; + else if (ptr[i]==0xd3)//small s + ++koi_s; + else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)//small st + ++koi_st; + + else if (ptr[i]==0xce) + ++cp1251_o_capital; + else if (ptr[i]==0xc0) + ++cp1251_a_capital; + else if (ptr[i]==0xc8) + ++cp1251_i_capital; + else if (ptr[i]==0xd1) + ++cp1251_s_capital; + } + else if (ptr[i]>0x9f && ptr[i]<0xb0) //first 16 letterz is 60% + ++ibm866_small_range; + + } + + //cannot decide? + if (cp1251_small_range+koi_small_range+ibm866_small_range<8) + { + return ""; + } + + if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range) + { +#ifdef DECODE_DEBUG + kWarning() << "Cyr Enc Detection: UTF8"; +#endif + return "UTF-8"; + } + + if (ibm866_small_range>cp1251_small_range+koi_small_range) + return "ibm866"; + +// QCString koi_string = "koi8-u"; +// QCString cp1251_string = "cp1251"; + + if (cp1251_st==0 && koi_st>1) + koi_score+=10; + else if (koi_st==0 && cp1251_st>1) + cp1251_score+=10; + + if (cp1251_st && koi_st) + { + if (cp1251_st/koi_st>2) + cp1251_score+=20; + else if (koi_st/cp1251_st>2) + koi_score+=20; + } + + if (cp1251_a>koi_a) + cp1251_score+=10; + else if (cp1251_a || koi_a) + koi_score+=10; + + if (cp1251_o>koi_o) + cp1251_score+=10; + else if (cp1251_o || koi_o) + koi_score+=10; + + if (cp1251_i>koi_i) + cp1251_score+=10; + else if (cp1251_i || koi_i) + koi_score+=10; + + if (cp1251_s>koi_s) + cp1251_score+=10; + else if (cp1251_s || koi_s) + koi_score+=10; + + if (cp1251_a_capital>koi_a_capital) + cp1251_score+=9; + else if (cp1251_a_capital || koi_a_capital) + koi_score+=9; + + if (cp1251_o_capital>koi_o_capital) + cp1251_score+=9; + else if (cp1251_o_capital || koi_o_capital) + koi_score+=9; + + if (cp1251_i_capital>koi_i_capital) + cp1251_score+=9; + else if (cp1251_i_capital || koi_i_capital) + koi_score+=9; + + if (cp1251_s_capital>koi_s_capital) + cp1251_score+=9; + else if (cp1251_s_capital || koi_s_capital) + koi_score+=9; +#ifdef DECODE_DEBUG + kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score; +#endif + if (abs(koi_score-cp1251_score)<10) + { + //fallback... + cp1251_score=cp1251_small_range; + koi_score=koi_small_range; + } + if (cp1251_score>koi_score) + return "cp1251"; + else + return "koi8-u"; + + +// if (cp1251_score>koi_score) +// setEncoding("cp1251",AutoDetectedEncoding); +// else +// setEncoding("koi8-u",AutoDetectedEncoding); +// return true; + +} + +static QCString automaticDetectionForGreek( const unsigned char* ptr, int size ) +{ + for ( int i = 0; i < size; ++i ) { + if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B + || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4 + || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) { + return "cp1253"; + } + } + + return "iso-8859-7"; +} + +static QCString automaticDetectionForHebrew( const unsigned char* ptr, int size ) +{ + for ( int i = 0; i < size; ++i ) { + if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B + || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 ) + || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) { + return "cp1255"; + } + + if ( ptr[ i ] == 0xDF ) + return "iso-8859-8-i"; + } + + return "iso-8859-8-i"; +} + +static QCString automaticDetectionForJapanese( const unsigned char* ptr, int size ) +{ + JapaneseCode kc; + + switch ( kc.guess_jp( (const char*)ptr, size ) ) { + case JapaneseCode::JIS: + return "jis7"; + case JapaneseCode::EUC: + return "eucjp"; + case JapaneseCode::SJIS: + return "sjis"; + case JapaneseCode::UTF8: + return "utf8"; + default: + break; + } + + return ""; +} + +static QCString automaticDetectionForTurkish( const unsigned char* ptr, int size ) +{ + for ( int i = 0; i < size; ++i ) { + if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) { + return "cp1254"; + } + } + + return "iso-8859-9"; +} + +static QCString automaticDetectionForWesternEuropean( const unsigned char* ptr, int size ) +{ + uint nonansi_count=0; + for (int i=0; i<size; ++i) + { + if (ptr[i]>0x79) + { + ++nonansi_count; + if ( ptr[i]>0xc1 && ptr[i]<0xf0 && i+1<size && ptr[i+1]>0x7f && ptr[i+1]<0xc0) + { + return "UTF-8"; + } + if (ptr[i] >= 0x78 && ptr[i] <= 0x9 ) + { + return "cp1252"; + } + } + + } + + if (nonansi_count>0) + return "iso-8859-15"; + + return ""; +} + +// Other browsers allow comments in the head section, so we need to also. +// It's important not to look for tags inside the comments. +static void skipComment(const char *&ptr, const char *pEnd) +{ + const char *p = ptr; + // Allow <!-->; other browsers do. + if (*p=='>') + { + p++; + } + else + { + while (p!=pEnd) + { + if (*p=='-') + { + // This is the real end of comment, "-->". + if (p[1]=='-' && p[2]=='>') + { + p += 3; + break; + } + // This is the incorrect end of comment that other browsers allow, "--!>". + if (p[1] == '-' && p[2] == '!' && p[3] == '>') + { + p += 4; + break; + } + } + p++; + } + } + ptr=p; +} + +// Returns the position of the encoding string. +static int findXMLEncoding(const QCString &str, int &encodingLength) +{ + int len = str.length(); + int pos = str.find("encoding"); + if (pos == -1) + return -1; + pos += 8; + + // Skip spaces and stray control characters. + while (pos<len && str[pos]<=' ') + ++pos; + + //Bail out if nothing after + // Skip equals sign. + if (pos>=len || str[pos] != '=') + return -1; + ++pos; + + // Skip spaces and stray control characters. + while (pos<len && str[pos]<=' ') + ++pos; + + //Bail out if nothing after + if (pos >= len) + return -1; + + // Skip quotation mark. + char quoteMark = str[pos]; + if (quoteMark != '"' && quoteMark != '\'') + return -1; + ++pos; + + // Find the trailing quotation mark. + int end=pos; + while (end<len && str[end]!=quoteMark) + ++end; + + if (end>=len) + return -1; + + encodingLength = end-pos; + return pos; +} + + +bool EncodingDetector::errorsIfUtf8 (const char* data, int length) +{ + if (d->m_codec->mibEnum()!=MibUtf8) + return false; //means no errors +// #define highest1Bits (unsigned char)0x80 +// #define highest2Bits (unsigned char)0xC0 +// #define highest3Bits (unsigned char)0xE0 +// #define highest4Bits (unsigned char)0xF0 +// #define highest5Bits (unsigned char)0xF8 +static const unsigned char highest1Bits = 0x80; +static const unsigned char highest2Bits = 0xC0; +static const unsigned char highest3Bits = 0xE0; +static const unsigned char highest4Bits = 0xF0; +static const unsigned char highest5Bits = 0xF8; + + for (int i=0; i<length; ++i) + { + unsigned char c = data[i]; + + if (d->m_multiByte>0) + { + if ((c & highest2Bits) == 0x80) + { + --(d->m_multiByte); + continue; + } +#ifdef DECODE_DEBUG + kWarning() << "EncDetector: Broken UTF8"; +#endif + return true; + } + + // most significant bit zero, single char + if ((c & highest1Bits) == 0x00) + continue; + + // 110xxxxx => init 1 following bytes + if ((c & highest3Bits) == 0xC0) + { + d->m_multiByte = 1; + continue; + } + + // 1110xxxx => init 2 following bytes + if ((c & highest4Bits) == 0xE0) + { + d->m_multiByte = 2; + continue; + } + + // 11110xxx => init 3 following bytes + if ((c & highest5Bits) == 0xF0) + { + d->m_multiByte = 3; + continue; + } +#ifdef DECODE_DEBUG + kWarning() << "EncDetector:_Broken UTF8"; +#endif + return true; + } + return false; +} + +EncodingDetector::EncodingDetector() : d(new EncodingDetectorPrivate) +{ +} + +EncodingDetector::EncodingDetector(QTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script) : + d(new EncodingDetectorPrivate(codec,source,script)) +{ +} + +EncodingDetector::~EncodingDetector() +{ + delete d; +} + +void EncodingDetector::setAutoDetectLanguage( EncodingDetector::AutoDetectScript lang) +{ + d->m_autoDetectLanguage=lang; +} +EncodingDetector::AutoDetectScript EncodingDetector::autoDetectLanguage() const +{ + return d->m_autoDetectLanguage; +} + +EncodingDetector::EncodingChoiceSource EncodingDetector::encodingChoiceSource() const +{ + return d->m_source; +} + +const char* EncodingDetector::encoding() const +{ + d->m_storeDecoderName = d->m_codec->name(); + return d->m_storeDecoderName.data(); +} + +bool EncodingDetector::visuallyOrdered() const +{ + return d->m_visualRTL; +} + +// const QTextCodec* EncodingDetector::codec() const +// { +// return d->m_codec; +// } + +QTextDecoder* EncodingDetector::decoder() +{ + return d->m_decoder; +} + +bool EncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type) +{ + QTextCodec *codec; + QCString enc(_encoding); + if(/*enc.isNull() || */enc.isEmpty()) + { + if (type==DefaultEncoding) + codec=d->m_defaultCodec; + else + return false; + } + else + { + //QString->QTextCodec + + enc = enc.lower(); + // hebrew visually ordered + if(enc=="visual") + enc="iso8859-8"; + bool b; + codec = KGlobal::charsets()->codecForName(enc, b); + if (!b) + return false; + } + + if (d->m_codec->mibEnum()==codec->mibEnum()) + return true; + + if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec)) + { + //Sometimes the codec specified is absurd, i.e. UTF-16 despite + //us decoding a meta tag as ASCII. In that case, ignore it. + return false; + } + + if (codec->mibEnum() == Mib8859_8) + { + //We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself. + codec = QTextCodec::codecForName("iso8859-8-i"); + + // visually ordered unless one of the following + if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical")) + d->m_visualRTL = true; + } + + d->m_codec = codec; + d->m_source = type; + delete d->m_decoder; + d->m_decoder = d->m_codec->makeDecoder(); +#ifdef DECODE_DEBUG + kDebug(6005) << "EncodingDetector::encoding used is" << d->m_codec->name(); +#endif + return true; +} + +bool EncodingDetector::analyze(const QByteArray &data) +{ + return analyze( data.data(), data.size() ); +} + +bool EncodingDetector::analyze(const char *data, int len) +{ + // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding. + // maximumBOMLength = 10 + // Even if the user has chosen utf16 we still need to auto-detect the endianness + if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec))) + { + // Extract the first three bytes. + const uchar *udata = (const uchar *)data; + uchar c1 = *udata++; + uchar c2 = *udata++; + uchar c3 = *udata++; + + // Check for the BOM + const char *autoDetectedEncoding; + if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) + { + autoDetectedEncoding = "ISO-10646-UCS-2"; + } + else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) + { + autoDetectedEncoding = "UTF-8"; + } + else if (c1 == 0x00 || c2 == 0x00) + { + uchar c4 = *udata++; + uchar c5 = *udata++; + uchar c6 = *udata++; + uchar c7 = *udata++; + uchar c8 = *udata++; + uchar c9 = *udata++; + uchar c10 = *udata++; + + int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0); + int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0); + if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0)) + autoDetectedEncoding = "ISO-10646-UCS-2"; + else + autoDetectedEncoding = 0; + } + else + { + autoDetectedEncoding = 0; + } + + // If we found a BOM, use the encoding it implies. + if (autoDetectedEncoding != 0) + { + d->m_source = BOM; + d->m_codec = QTextCodec::codecForName(autoDetectedEncoding); + assert(d->m_codec); + //enc = d->m_codec->name(); + delete d->m_decoder; + d->m_decoder = d->m_codec->makeDecoder(); +#ifdef DECODE_DEBUG + kWarning() << "Detection by BOM"; +#endif + if (is16Bit(d->m_codec) && c2==0x00) + { + // utf16LE, we need to put the decoder in LE mode + char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00}; + d->m_decoder->toUnicode(reverseUtf16, 2); + } + return true; + } + } + + //exit from routine in case it was called to only detect byte order for utf-16 + if (d->m_source==UserChosenEncoding) + { +#ifdef DECODE_DEBUG + kWarning() << "EncodingDetector: UserChosenEncoding exit "; +#endif + + if (errorsIfUtf8(data, len)) + setEncoding("",DefaultEncoding); + return true; + } +#if 0 //This is for plaintext, so don't try to parse HTML headers -- ahartmetz + if (!d->m_seenBody) + { + // we still don't have an encoding, and are in the head + // the following tags are allowed in <head>: + // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE + const char *ptr = data; + const char *pEnd = data+len; + + while(ptr != pEnd) + { + if(*ptr!='<') + { + ++ptr; + continue; + } + ++ptr; + // Handle comments. + if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') + { + ptr += 3; + skipComment(ptr, pEnd); + continue; + } + + // Handle XML header, which can have encoding in it. + if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l') + { + const char *end = ptr; + while (*end != '>' && end < pEnd) + end++; + if (*end == '\0' || end == pEnd) + break; + QCString str(ptr, end - ptr + 1); + int length; + int pos = findXMLEncoding(str, length); + // also handles the case when specified encoding aint correct + if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader)) + { + return true; + } + } + + //look for <meta>, stop if we reach <body> + while ( + !((*ptr >= 'a') && (*ptr <= 'z') || + (*ptr >= 'A') && (*ptr <= 'Z')) + && ptr < pEnd + ) + ++ptr; + + char tmp[5]; + int length=0; + const char* max=ptr+4; + if (pEnd<max) + max=pEnd; + while ( + ((*ptr >= 'a') && (*ptr <= 'z') || + (*ptr >= 'A') && (*ptr <= 'Z') || + (*ptr >= '0') && (*ptr <= '9')) + && ptr < max + ) + { + tmp[length] = tolower( *ptr ); + ++ptr; + ++length; + } + tmp[length] = 0; + if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a') + { + // found a meta tag... + const char* end = ptr; + while(*end != '>' && *end != '\0' && end<pEnd) + end++; + //if ( *end == '\0' ) break; + QCString str( ptr, (end-ptr)+1); + str = str.lower(); + int pos=0; + //if( (pos = str.find("http-equiv", pos)) == -1) break; + //if( (pos = str.find("content-type", pos)) == -1) break; + if( (pos = str.find("charset")) == -1) + continue; + pos+=6; + // skip to '=' + if( (pos = str.find('=', pos)) == -1) + continue; + + // skip whitespace before encoding itself + while (pos < (int)str.length() && str[pos] <= ' ') + ++pos; + if ( pos == (int)str.length()) + continue; + + int endpos = pos; + while( endpos < str.length() && + (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\'' + && str[endpos] != ';' && str[endpos] != '>') ) + ++endpos; + #ifdef DECODE_DEBUG + kDebug( 6005 ) << "EncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data(); + #endif + if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag)) + return true; + } + else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y') + { + d->m_seenBody=true; + break; + } + } + } + + if (d->m_source==EncodingFromHTTPHeader) + return true; +#endif + //if (len<20) //make a guess even if the file is short -- ahartmetz + if (len < 1) + { + setEncoding("",DefaultEncoding); + return false; + } +#ifdef DECODE_DEBUG + kDebug( 6005 ) << "EncodingDetector: using heuristics (" << strlen(data) << ")"; +#endif + + switch ( d->m_autoDetectLanguage ) + { + case EncodingDetector::Arabic: + return setEncoding(automaticDetectionForArabic( (const unsigned char*) data, len ), AutoDetectedEncoding); +// break; + case EncodingDetector::Baltic: + return setEncoding(automaticDetectionForBaltic( (const unsigned char*) data, len ), AutoDetectedEncoding); +// break; + case EncodingDetector::CentralEuropean: + return setEncoding(automaticDetectionForCentralEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding); + break; + case EncodingDetector::Cyrillic: + return setEncoding(automaticDetectionForCyrillic( (const unsigned char*) data, len), AutoDetectedEncoding); +// break; + case EncodingDetector::Greek: + return setEncoding(automaticDetectionForGreek( (const unsigned char*) data, len ), AutoDetectedEncoding); +// break; + case EncodingDetector::Hebrew: + return setEncoding(automaticDetectionForHebrew( (const unsigned char*) data, len ), AutoDetectedEncoding); +// break; + case EncodingDetector::Japanese: + return setEncoding(automaticDetectionForJapanese( (const unsigned char*) data, len ), AutoDetectedEncoding); +// break; + case EncodingDetector::Turkish: + return setEncoding(automaticDetectionForTurkish( (const unsigned char*) data, len ), AutoDetectedEncoding); +// break; + case EncodingDetector::WesternEuropean: + if (setEncoding(automaticDetectionForWesternEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding)) + return true; + else if (d->m_defaultCodec->mibEnum()==MibLatin1) //detection for khtml + { + return setEncoding("iso-8859-15",AutoDetectedEncoding); + } + else //use default provided by eg katepart + { + return setEncoding("",DefaultEncoding); + } +// break; + case EncodingDetector::SemiautomaticDetection: + case EncodingDetector::ChineseSimplified: + case EncodingDetector::ChineseTraditional: + case EncodingDetector::Korean: + case EncodingDetector::Thai: + case EncodingDetector::Unicode: + case EncodingDetector::NorthernSaami: + case EncodingDetector::SouthEasternEurope: + case EncodingDetector::None: + // huh. somethings broken in this code ### FIXME + //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback. + break; + } + + setEncoding("",DefaultEncoding); + return true; +} + + +EncodingDetector::AutoDetectScript EncodingDetector::scriptForName(const QString& lang) +{ + if (lang.isEmpty()) + return EncodingDetector::None; + else if (lang==i18n("@item Text character set", "Unicode")) + return EncodingDetector::Unicode; + else if (lang==i18n("@item Text character set", "Cyrillic")) + return EncodingDetector::Cyrillic; + else if (lang==i18n("@item Text character set", "Western European")) + return EncodingDetector::WesternEuropean; + else if (lang==i18n("@item Text character set", "Central European")) + return EncodingDetector::CentralEuropean; + else if (lang==i18n("@item Text character set", "Greek")) + return EncodingDetector::Greek; + else if (lang==i18n("@item Text character set", "Hebrew")) + return EncodingDetector::Hebrew; + else if (lang==i18n("@item Text character set", "Turkish")) + return EncodingDetector::Turkish; + else if (lang==i18n("@item Text character set", "Japanese")) + return EncodingDetector::Japanese; + else if (lang==i18n("@item Text character set", "Baltic")) + return EncodingDetector::Baltic; + else if (lang==i18n("@item Text character set", "Arabic")) + return EncodingDetector::Arabic; + + return EncodingDetector::None; +} + +bool EncodingDetector::hasAutoDetectionForScript(EncodingDetector::AutoDetectScript script) +{ + switch (script) + { + case EncodingDetector::Arabic: + return true; + case EncodingDetector::Baltic: + return true; + case EncodingDetector::CentralEuropean: + return true; + case EncodingDetector::Cyrillic: + return true; + case EncodingDetector::Greek: + return true; + case EncodingDetector::Hebrew: + return true; + case EncodingDetector::Japanese: + return true; + case EncodingDetector::Turkish: + return true; + case EncodingDetector::WesternEuropean: + return true; + case EncodingDetector::ChineseTraditional: + return true; + case EncodingDetector::ChineseSimplified: + return true; + case EncodingDetector::Unicode: + return true; + break; + default: + return false; + } +} + +QString EncodingDetector::nameForScript(EncodingDetector::AutoDetectScript script) +{ + switch (script) + { + case EncodingDetector::Arabic: + return i18n("@item Text character set", "Arabic"); + break; + case EncodingDetector::Baltic: + return i18n("@item Text character set", "Baltic"); + break; + case EncodingDetector::CentralEuropean: + return i18n("@item Text character set", "Central European"); + break; + case EncodingDetector::Cyrillic: + return i18n("@item Text character set", "Cyrillic"); + break; + case EncodingDetector::Greek: + return i18n("@item Text character set", "Greek"); + break; + case EncodingDetector::Hebrew: + return i18n("@item Text character set", "Hebrew"); + break; + case EncodingDetector::Japanese: + return i18n("@item Text character set", "Japanese"); + break; + case EncodingDetector::Turkish: + return i18n("@item Text character set", "Turkish"); + break; + case EncodingDetector::WesternEuropean: + return i18n("@item Text character set", "Western European"); + break; + case EncodingDetector::ChineseTraditional: + return i18n("@item Text character set", "Chinese Traditional"); + break; + case EncodingDetector::ChineseSimplified: + return i18n("@item Text character set", "Chinese Simplified"); + break; + case EncodingDetector::Korean: + return i18n("@item Text character set", "Korean"); + break; + case EncodingDetector::Thai: + return i18n("@item Text character set", "Thai"); + break; + case EncodingDetector::Unicode: + return i18n("@item Text character set", "Unicode"); + break; + //case EncodingDetector::SemiautomaticDetection: + default: + return QString(); + + } +} + +EncodingDetector::AutoDetectScript EncodingDetector::scriptForLanguageCode(const QString &lc) +{ + // It might make sense to do something special if the locale ends with + // ".UTF-8" or "@utf8" + const char *langStr = pango_script_for_lang[0].lang; + // There is obvious optimization potential... + for ( int i = 0; langStr; i++ ) { + langStr = pango_script_for_lang[i].lang; + // startsWith() works for empty strings: every string "starts with" an empty string. + if ( lc.startsWith( QString::fromAscii( langStr ) ) ) + return pango_script_for_lang[i].scripts[0]; + } + return None; +} + +#undef DECODE_DEBUG + |