From e16866e072f94410321d70daedbcb855ea878cac Mon Sep 17 00:00:00 2001 From: Timothy Pearson Date: Sun, 6 Nov 2011 15:56:40 -0600 Subject: Actually move the kde files that were renamed in the last commit --- kdecore/kcharsets.cpp | 664 -------------------------------------------------- 1 file changed, 664 deletions(-) delete mode 100644 kdecore/kcharsets.cpp (limited to 'kdecore/kcharsets.cpp') diff --git a/kdecore/kcharsets.cpp b/kdecore/kcharsets.cpp deleted file mode 100644 index b78345276..000000000 --- a/kdecore/kcharsets.cpp +++ /dev/null @@ -1,664 +0,0 @@ -/* This file is part of the KDE libraries - Copyright (C) 1999 Lars Knoll (knoll@kde.org) - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public License - along with this library; see the file COPYING.LIB. If not, write to - the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - Boston, MA 02110-1301, USA. -*/ -#include "kcharsets.h" - -#include "kqiodevicegzip_p.h" -#include "kentities.c" - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -static const char * const language_names[] = { - I18N_NOOP( "Other" ), - I18N_NOOP( "Arabic" ), - I18N_NOOP( "Baltic" ), - I18N_NOOP( "Central European" ), - I18N_NOOP( "Chinese Simplified" ), - I18N_NOOP( "Chinese Traditional" ), - I18N_NOOP( "Cyrillic" ), - I18N_NOOP( "Greek" ), - I18N_NOOP( "Hebrew" ), - I18N_NOOP( "Japanese" ), - I18N_NOOP( "Korean" ), - I18N_NOOP( "Thai" ), - I18N_NOOP( "Turkish" ), - I18N_NOOP( "Western European" ), - I18N_NOOP( "Tamil" ), - I18N_NOOP( "Unicode" ), - I18N_NOOP( "Northern Saami" ), - I18N_NOOP( "Vietnamese" ), - I18N_NOOP( "South-Eastern Europe" ) -}; - -// This list gives the charsets that can be used to display a file given in a certain encoding. -// The list should be in order of preference -static const char* const charsets_for_encoding[] = { - "koi8-r", - "koi8-u", - "iso 8859-1", - "iso 8859-2", - "iso 8859-3", - "iso 8859-4", - "iso 8859-5", - "iso 8859-6", - "iso 8859-7", - "iso 8859-8", - "iso 8859-8-i", - "iso 8859-9", - "iso 8859-11", - "iso 8859-13", - "iso 8859-14", - "iso 8859-15", - "iso 8859-16", - "utf8", - "utf16", - "iso-10646-ucs-2", - "cp 1250", - "cp 1251", - "cp 1252", - "cp 1253", - "cp 1254", - "cp 1255", - "cp 1256", - "cp 1257", - "cp 1258", - "ibm850", - "ibm852", - "ibm866", - "tis620", - "eucjp", - "sjis", - "jis7", - "big5", - "big5-hkscs", - "gbk", - "gb18030", - "gb2312", - "euckr", - "tscii", -// "pt 154", - "winsami2", - "cp 874", - 0 }; // extra 0 for end - -// 0 other -// 1 Arabic -// 2 Baltic -// 3 Central European -// 4 Chinese Simplified -// 5 Chinese Traditional -// 6 Cyrillic -// 7 Greek -// 8 Hebrew -// 9 Japanese -// 10 Korean -// 11 Thai -// 12 Turkish -// 13 Western European -// 14 Tamil -// 15 Unicode -// 16 Northern Sami -// 17 Vietnamese -// 18 South-Eastern Europe -// ### FIXME KDE4: the name of the encodings should mostly be uppercase -static struct LanguageForEncoding - { - const char* index; - int data; - } const language_for_encoding[] = { - { "iso 8859-1", 13 }, - { "iso 8859-15", 13 }, - { "iso 8859-14", 13 }, - { "cp 1252", 13 }, - { "ibm850", 13 }, - { "iso 8859-2", 3 }, - { "iso 8859-3", 3 }, - { "iso 8859-4", 2 }, - { "iso 8859-13", 2 }, - { "iso 8859-16", 18 }, - { "cp 1250", 3 }, - { "cp 1254", 12 }, - { "cp 1257", 2 }, - { "ibm852", 3 }, - { "koi8-r", 6 }, - { "iso 8859-5", 6 }, - { "cp 1251", 6 }, - { "koi8-u", 6 }, -// { "pt 154", 6 }, - { "ibm866", 6 }, - { "big5", 5 }, - { "big5-hkscs", 5 }, - { "gb18030", 4 }, - { "gbk", 4 }, - { "gb2312", 4 }, - { "euckr", 10 }, - { "sjis", 9 }, - { "jis7", 9 }, - { "eucjp", 9 }, - { "iso 8859-7", 7 }, - { "cp 1253", 7 }, - { "iso 8859-6", 1 }, - { "cp 1256", 1 }, - { "iso 8859-8", 8 }, - { "iso 8859-8-i", 8 }, - { "cp 1255", 8 }, - { "iso 8859-9", 12 }, - { "tis620", 11 }, - { "iso 8859-11", 11 }, - { "cp 874", 11 }, - { "cp 1258", 17 }, - { "tscii", 14 }, - { "utf8", 15 }, - { "utf16", 15 }, - { "utf7", 15 }, // ### FIXME: UTF-7 is not in Qt - { "ucs2", 15 }, - { "iso-10646-ucs-2", 15 }, - { "winsami2", 16}, - { 0, 0 } }; - -// defines some different names for codecs that are built into Qt. -static struct Builtin - { - const char* index; - const char* data; - } const builtin[] = { - { "iso-ir-111", "koi8-r" }, - { "koi8-ru", "koi8-u" }, // ### Qt 3.3 maps it to koi8-r - { "koi unified", "koi8-r" }, // ### FIXME: Qt 3.3 seems to map this to EUC-KR, so this mapping is too late :-( - // Using ISO-8859-1 for ASCII is an approximation at write - { "us-ascii", "iso 8859-1" }, - { "usascii", "iso 8859-1" }, - { "ascii", "iso 8859-1" }, - { "x-utf-8", "utf-8" }, - { "x-utf-7", "utf-7" }, // ### FIXME: UTF-7 is not in Qt - { "tqunicode-1-1-utf-7", "utf-7" }, // ### FIXME: UTF-7 is not in Qt - { "utf-16", "iso-10646-ucs-2" }, - { "utf16", "iso-10646-ucs-2" }, - { "ucs2", "iso-10646-ucs-2" }, - { "iso10646-1", "iso-10646-ucs-2" }, - { "gb18030.2000-1", "gb18030" }, - { "gb18030.2000-0", "gb18030" }, - { "gbk-0", "gbk" }, - { "gb2312.1980-0", "gbk" }, - { "gb_2312-80", "gbk" },/* this one is not official, but MS is using it :/ */ - { "x-euc-kr", "euckr" }, - { "jisx0201.1976-0", "eucjp" }, - { "jisx0208.1983-0", "eucjp" }, - { "jisx0208.1990-0", "eucjp" }, - { "jisx0208.1997-0", "eucjp" }, - { "jisx0212.1990-0", "eucjp" }, - { "jisx0213.2000-1", "eucjp" }, - { "jisx0213.2000-2", "eucjp" }, - { "windows850", "ibm850" }, - { "windows866", "ibm866" }, - { "windows1251", "cp 1251" }, - { "windows1252", "cp 1252" }, - { "windows1253", "cp 1253" }, - { "windows1254", "cp 1254" }, - { "windows1255", "cp 1255" }, - { "windows1256", "cp 1256" }, - { "windows1257", "cp 1257" }, - { "windows1258", "cp 1258" }, - { "windows-850", "ibm850" }, - { "windows-866", "ibm866" }, - { "x-windows-850", "ibm850" }, - { "x-windows-866", "ibm866" }, - { "x-windows-1250", "cp 1250" }, - { "x-windows-1251", "cp 1251" }, - { "x-windows-1252", "cp 1252" }, - { "x-windows-1253", "cp 1253" }, - { "x-windows-1254", "cp 1254" }, - { "x-windows-1255", "cp 1255" }, - { "x-windows-1256", "cp 1256" }, - { "x-windows-1257", "cp 1257" }, - { "x-windows-1258", "cp 1258" }, - { "cp819", "iso 8859-1" }, - { "cp850", "ibm850" }, - { "cp866", "ibm866" }, - { "cp-819", "iso 8859-1" }, - { "cp-850", "ibm850" }, - { "cp-866", "ibm866" }, - { "cp-1250", "cp 1250" }, - { "cp-1251", "cp 1251" }, - { "cp-1252", "cp 1252" }, - { "cp-1253", "cp 1253" }, - { "cp-1254", "cp 1254" }, - { "cp-1255", "cp 1255" }, - { "cp-1256", "cp 1256" }, - { "cp-1257", "cp 1257" }, - { "cp-1258", "cp 1258" }, - { "cp-10000", "apple roman" }, - { "x-cp-850", "ibm850" }, - { "x-cp-866", "ibm866" }, - { "x-cp-1250", "cp 1250" }, - { "x-cp-1251", "cp 1251" }, - { "x-cp-1252", "cp 1252" }, - { "x-cp-1253", "cp 1253" }, - { "x-cp-1254", "cp 1254" }, - { "x-cp-1255", "cp 1255" }, - { "x-cp-1256", "cp 1256" }, - { "x-cp-1257", "cp 1257" }, - { "x-cp-1258", "cp 1258" }, - { "x-cp-10000", "apple roman" }, - { "ibm819", "iso 8859-1" }, - { "thai-tis620", "iso 8859-11" }, - { "windows-874", "cp 874" }, - { "windows874", "cp 874" }, - { "x-windows-874", "cp 874" }, - { "x-cp-874", "cp 874" }, - { "ibm 874", "cp 874" }, - { "ibm874", "cp 874" }, // Qt4 name - { "x-ibm874", "cp 874" }, - { "ksc5601.1987-0", "euckr" }, - { "x-winsami2", "winsami2" }, - { "x-mac-roman", "apple roman" }, - { "macintosh", "apple roman" }, - { "mac", "apple roman" }, - { "csiso2022jp", "jis7" }, // See bug #77243 - { "big5-eten", "big5-hkscs" }, - { "cp950", "big5-hkscs" }, - { 0, 0 }}; - -// some different names for the encodings defined in the charmaps files. -// even though the charmap file names are all uppercase, the names are all lowercase here. -static struct Aliases - { - const char* index; - const char* data; - } const aliases[] = { - { "cp852", "ibm852" }, - { "cp-852", "ibm852" }, - { "x-cp-852", "ibm852" }, - { "windows852", "ibm852" }, - { "windows-852", "ibm852" }, - { "x-windows-852", "ibm852" }, - { 0, 0 }}; - -// some last resort hints in case the charmap file couldn't be found. This gives at least a partial conversion -// and helps making things readable. -// the name used as input here is already converted to the more canonical name as defined in the aliases array. -static struct ConversionHints - { - const char* index; - const char* data; - } const conversion_hints[] = { - { "cp1250", "iso-8859-2" }, - { "koi8-r", "iso-8859-5" }, - { "koi8-u", "koi8-r" }, - // KDE had always "CP 1251" as best fallback to PT 154. Now that Qt does not offer this encoding anymore, it is our fallback. - { "pt 154", "cp 1251" }, - { "paratype-154", "cp 1251" }, - { "pt-154", "cp 1251" }, - { 0, 0 }}; - - -// search an array of items index/data, index is const char*, data is T, find first matching index -// and return data, or return 0 -template< typename T, typename Data > -static Data kcharsets_array_search( const T* start, const char* entry ) -{ - for( const T* pos = start; - pos->index != 0; - ++pos ) - if( qstrcmp( pos->index, entry ) == 0 ) - return pos->data; - return 0; -} - - -class KCharsetsPrivate -{ -public: - KCharsetsPrivate(KCharsets* _kc) - : codecForNameDict(43, false) // case insensitive - { - db = 0; - kc = _kc; - } - ~KCharsetsPrivate() - { - delete db; - } - TQFontDatabase *db; - TQAsciiDict codecForNameDict; - KCharsets* kc; -}; - -// -------------------------------------------------------------------------- - -KCharsets::KCharsets() -{ - d = new KCharsetsPrivate(this); -} - -KCharsets::~KCharsets() -{ - delete d; -} - -TQChar KCharsets::fromEntity(const TQString &str) -{ - TQChar res = TQChar::null; - - int pos = 0; - if(str[pos] == (QChar)'&') pos++; - - // Check for '�' or '�' sequence - if (str[pos] == (QChar)'#' && str.length()-pos > 1) { - bool ok; - pos++; - if (str[pos] == (QChar)'x' || str[pos] == (QChar)'X') { - pos++; - // '�', hexadeciaml character reference - TQString tmp(str.tqunicode()+pos, str.length()-pos); - res = tmp.toInt(&ok, 16); - } else { - // '�', decimal character reference - TQString tmp(str.tqunicode()+pos, str.length()-pos); - res = tmp.toInt(&ok, 10); - } - return res; - } - - const entity *e = kde_findEntity(str.ascii(), str.length()); - - if(!e) - { - //kdDebug( 0 ) << "unknown entity " << str <<", len = " << str.length() << endl; - return TQChar::null; - } - //kdDebug() << "got entity " << str << " = " << e->code << endl; - - return TQChar(e->code); -} - -TQChar KCharsets::fromEntity(const TQString &str, int &len) -{ - // entities are never longer than 8 chars... we start from - // that length and work backwards... - len = 8; - while(len > 0) - { - TQString tmp = str.left(len); - TQChar res = fromEntity(tmp); - if( res != (QChar)TQChar::null ) return res; - len--; - } - return TQChar::null; -} - - -TQString KCharsets::toEntity(const TQChar &ch) -{ - TQString ent; - ent.sprintf("�x%x;", ch.tqunicode()); - return ent; -} - -TQString KCharsets::resolveEntities( const TQString &input ) -{ - TQString text = input; - const TQChar *p = text.tqunicode(); - const TQChar *end = p + text.length(); - const TQChar *ampersand = 0; - bool scanForSemicolon = false; - - for ( ; p < end; ++p ) { - const TQChar ch = *p; - - if ( ch == (QChar)'&' ) { - ampersand = p; - scanForSemicolon = true; - continue; - } - - if ( ch != (QChar)';' || scanForSemicolon == false ) - continue; - - assert( ampersand ); - - scanForSemicolon = false; - - const TQChar *entityBegin = ampersand + 1; - - const uint entityLength = p - entityBegin; - if ( entityLength == 0 ) - continue; - - const TQChar entityValue = KCharsets::fromEntity( TQConstString( entityBegin, entityLength ).string() ); - if ( entityValue.isNull() ) - continue; - - const uint ampersandPos = ampersand - text.tqunicode(); - - text[ (int)ampersandPos ] = entityValue; - text.remove( ampersandPos + 1, entityLength + 1 ); - p = text.tqunicode() + ampersandPos; - end = text.tqunicode() + text.length(); - ampersand = 0; - } - - return text; -} - -TQStringList KCharsets::availableEncodingNames() -{ - TQStringList available; - for ( const char* const* pos = charsets_for_encoding; *pos; ++pos ) { - //kdDebug(0) << *charsets << " available" << endl; - available.append( TQString::tqfromLatin1( *pos )); - } - return available; -} - -TQString KCharsets::languageForEncoding( const TQString &encoding ) -{ - int lang = kcharsets_array_search< LanguageForEncoding, int > - ( language_for_encoding, encoding.latin1()); - return i18n( language_names[lang] ); -} - -TQString KCharsets::encodingForName( const TQString &descriptiveName ) -{ - const int left = descriptiveName.findRev( '(' ); - - if (left<0) // No parenthesis, so assume it is a normal encoding name - return descriptiveName.stripWhiteSpace(); - - TQString name(descriptiveName.mid(left+1)); - - const int right = name.findRev( ')' ); - - if (right<0) - return name; - - return name.left(right).stripWhiteSpace(); -} - -TQStringList KCharsets::descriptiveEncodingNames() -{ - // As we are sorting, we can directly read the array language_for_encoding - TQStringList encodings; - for ( const LanguageForEncoding* pos = language_for_encoding; pos->index; ++pos ) { - const TQString name = TQString::tqfromLatin1( pos->index ); - const TQString description = i18n( language_names[ pos->data ] ); - encodings.append( i18n("Descriptive Encoding Name", "%1 ( %2 )"). arg ( description ). arg( name ) ); - } - encodings.sort(); - return encodings; -} - -TQTextCodec *KCharsets::codecForName(const TQString &n) const -{ - bool b; - return codecForName( n, b ); -} - -TQTextCodec *KCharsets::codecForName(const TQString &n, bool &ok) const -{ - ok = true; - - TQTextCodec* codec = 0; - // dict lookup is case insensitive anyway - if((codec = d->codecForNameDict[n.isEmpty() ? "->locale<-" : n.latin1()])) - return codec; // cache hit, return - - if (n.isEmpty()) { - codec = KGlobal::locale()->codecForEncoding(); - d->codecForNameDict.replace("->locale<-", codec); - return codec; - } - - TQCString name = n.lower().latin1(); - TQCString key = name; - if (name.right(8) == "_charset") - name.truncate(name.length()-8); - - if (name.isEmpty()) { - ok = false; - return TQTextCodec::codecForName("iso8859-1"); - } - - codec = TQTextCodec::codecForName(name); - - if(codec) { - d->codecForNameDict.replace(key, codec); - return codec; - } - - // these codecs are built into Qt, but the name given for the codec is different, - // so TQTextCodec did not recognize it. - TQCString cname = kcharsets_array_search< Builtin, const char* >( builtin, name.data()); - - if(!cname.isEmpty()) - codec = TQTextCodec::codecForName(cname); - - if(codec) - { - d->codecForNameDict.replace(key, codec); - return codec; - } - - TQString dir; - { - KConfigGroupSaver cfgsav( KGlobal::config(), "i18n" ); - dir = KGlobal::config()->readPathEntry("i18ndir", TQString::tqfromLatin1("/usr/share/i18n/charmaps")); - } - - // these are codecs not included in Qt. They can be build up if the corresponding charmap - // is available in the charmap directory. - cname = kcharsets_array_search< Aliases, const char* >( aliases, name.data()); - - if(cname.isEmpty()) - cname = name; - cname = cname.upper(); - - const TQString basicName = TQString::tqfromLatin1(cname); - kdDebug() << k_funcinfo << endl << " Trying to find " << cname << " in " << dir << endl; - - TQString charMapFileName; - bool gzipped = false; - TQDir qdir(dir); - if (!qdir.exists()) { - // The directory for the charmaps does not even exist... (That is common!) - } - else if (qdir.exists(basicName, false)) { - charMapFileName = basicName; - } - else if (qdir.exists(basicName+".gz", false)) { - charMapFileName = basicName + ".gz"; - gzipped = true; - } - else { - // Check if we are asking a code page - // If yes, then check "CP99999" and "IBM99999" - // First we need to find the number of the codepage - TQRegExp regexp("^(X-)?(CP|IBM)(-| )?(0-9)+"); - if ( regexp.search(basicName) != -1) { - const TQString num = regexp.cap(4); - if (num.isEmpty()) { - // No number, not a code page (or something went wrong) - } - else if (qdir.exists("IBM"+num)) { - charMapFileName = "IBM"+num; - } - else if (qdir.exists("IBM"+num+".gz")) { - charMapFileName = "IBM"+num+".gz"; - gzipped = true; - } - else if (qdir.exists("CP"+num)) { - charMapFileName = "CP"+num; - } - else if (qdir.exists("CP"+num+".gz")) { - charMapFileName = "CP"+num+".gz"; - gzipped = true; - } - } - } - - if (gzipped && !charMapFileName.isEmpty()) { - KQIODeviceGZip gzip(dir + "/" + charMapFileName); - if (gzip.open(IO_ReadOnly)) { - kdDebug() << "Loading gzipped charset..." << endl; - codec = TQTextCodec::loadCharmap(&gzip); - gzip.close(); - } - else - kdWarning() << "Could not open gzipped charset!" << endl; - } - else if (!charMapFileName.isEmpty()) { - codec = TQTextCodec::loadCharmapFile(dir + "/" + charMapFileName); - } - - if(codec) { - d->codecForNameDict.replace(key, codec); - return codec; - } - - // this also failed, the last resort is now to take some compatibility charmap - - cname = kcharsets_array_search< ConversionHints, const char* >( conversion_hints, (const char*)name.data() ); - - if(!cname.isEmpty()) - codec = TQTextCodec::codecForName(cname); - - if(codec) { - d->codecForNameDict.replace(key, codec); - return codec; - } - - // could not assign a codec, let's return Latin1 - ok = false; - return TQTextCodec::codecForName("iso8859-1"); -} -- cgit v1.2.1