summaryrefslogtreecommitdiffstats
path: root/kdecore/kcharsets.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'kdecore/kcharsets.cpp')
-rw-r--r--kdecore/kcharsets.cpp664
1 files changed, 0 insertions, 664 deletions
diff --git a/kdecore/kcharsets.cpp b/kdecore/kcharsets.cpp
deleted file mode 100644
index b78345276..000000000
--- a/kdecore/kcharsets.cpp
+++ /dev/null
@@ -1,664 +0,0 @@
-/* This file is part of the KDE libraries
- Copyright (C) 1999 Lars Knoll (knoll@kde.org)
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Library General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Library General Public License for more details.
-
- You should have received a copy of the GNU Library General Public License
- along with this library; see the file COPYING.LIB. If not, write to
- the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
- Boston, MA 02110-1301, USA.
-*/
-#include "kcharsets.h"
-
-#include "kqiodevicegzip_p.h"
-#include "kentities.c"
-
-#include <kapplication.h>
-#include <kglobal.h>
-#include <klocale.h>
-#include <kconfig.h>
-
-#include <tqfontinfo.h>
-#include <tqstrlist.h>
-#include <tqfontdatabase.h>
-#include <kdebug.h>
-
-#include <tqtextcodec.h>
-#include <tqmap.h>
-#include <tqcstring.h>
-#include <tqdir.h>
-#include <tqregexp.h>
-
-#include <assert.h>
-
-static const char * const language_names[] = {
- I18N_NOOP( "Other" ),
- I18N_NOOP( "Arabic" ),
- I18N_NOOP( "Baltic" ),
- I18N_NOOP( "Central European" ),
- I18N_NOOP( "Chinese Simplified" ),
- I18N_NOOP( "Chinese Traditional" ),
- I18N_NOOP( "Cyrillic" ),
- I18N_NOOP( "Greek" ),
- I18N_NOOP( "Hebrew" ),
- I18N_NOOP( "Japanese" ),
- I18N_NOOP( "Korean" ),
- I18N_NOOP( "Thai" ),
- I18N_NOOP( "Turkish" ),
- I18N_NOOP( "Western European" ),
- I18N_NOOP( "Tamil" ),
- I18N_NOOP( "Unicode" ),
- I18N_NOOP( "Northern Saami" ),
- I18N_NOOP( "Vietnamese" ),
- I18N_NOOP( "South-Eastern Europe" )
-};
-
-// This list gives the charsets that can be used to display a file given in a certain encoding.
-// The list should be in order of preference
-static const char* const charsets_for_encoding[] = {
- "koi8-r",
- "koi8-u",
- "iso 8859-1",
- "iso 8859-2",
- "iso 8859-3",
- "iso 8859-4",
- "iso 8859-5",
- "iso 8859-6",
- "iso 8859-7",
- "iso 8859-8",
- "iso 8859-8-i",
- "iso 8859-9",
- "iso 8859-11",
- "iso 8859-13",
- "iso 8859-14",
- "iso 8859-15",
- "iso 8859-16",
- "utf8",
- "utf16",
- "iso-10646-ucs-2",
- "cp 1250",
- "cp 1251",
- "cp 1252",
- "cp 1253",
- "cp 1254",
- "cp 1255",
- "cp 1256",
- "cp 1257",
- "cp 1258",
- "ibm850",
- "ibm852",
- "ibm866",
- "tis620",
- "eucjp",
- "sjis",
- "jis7",
- "big5",
- "big5-hkscs",
- "gbk",
- "gb18030",
- "gb2312",
- "euckr",
- "tscii",
-// "pt 154",
- "winsami2",
- "cp 874",
- 0 }; // extra 0 for end
-
-// 0 other
-// 1 Arabic
-// 2 Baltic
-// 3 Central European
-// 4 Chinese Simplified
-// 5 Chinese Traditional
-// 6 Cyrillic
-// 7 Greek
-// 8 Hebrew
-// 9 Japanese
-// 10 Korean
-// 11 Thai
-// 12 Turkish
-// 13 Western European
-// 14 Tamil
-// 15 Unicode
-// 16 Northern Sami
-// 17 Vietnamese
-// 18 South-Eastern Europe
-// ### FIXME KDE4: the name of the encodings should mostly be uppercase
-static struct LanguageForEncoding
- {
- const char* index;
- int data;
- } const language_for_encoding[] = {
- { "iso 8859-1", 13 },
- { "iso 8859-15", 13 },
- { "iso 8859-14", 13 },
- { "cp 1252", 13 },
- { "ibm850", 13 },
- { "iso 8859-2", 3 },
- { "iso 8859-3", 3 },
- { "iso 8859-4", 2 },
- { "iso 8859-13", 2 },
- { "iso 8859-16", 18 },
- { "cp 1250", 3 },
- { "cp 1254", 12 },
- { "cp 1257", 2 },
- { "ibm852", 3 },
- { "koi8-r", 6 },
- { "iso 8859-5", 6 },
- { "cp 1251", 6 },
- { "koi8-u", 6 },
-// { "pt 154", 6 },
- { "ibm866", 6 },
- { "big5", 5 },
- { "big5-hkscs", 5 },
- { "gb18030", 4 },
- { "gbk", 4 },
- { "gb2312", 4 },
- { "euckr", 10 },
- { "sjis", 9 },
- { "jis7", 9 },
- { "eucjp", 9 },
- { "iso 8859-7", 7 },
- { "cp 1253", 7 },
- { "iso 8859-6", 1 },
- { "cp 1256", 1 },
- { "iso 8859-8", 8 },
- { "iso 8859-8-i", 8 },
- { "cp 1255", 8 },
- { "iso 8859-9", 12 },
- { "tis620", 11 },
- { "iso 8859-11", 11 },
- { "cp 874", 11 },
- { "cp 1258", 17 },
- { "tscii", 14 },
- { "utf8", 15 },
- { "utf16", 15 },
- { "utf7", 15 }, // ### FIXME: UTF-7 is not in Qt
- { "ucs2", 15 },
- { "iso-10646-ucs-2", 15 },
- { "winsami2", 16},
- { 0, 0 } };
-
-// defines some different names for codecs that are built into Qt.
-static struct Builtin
- {
- const char* index;
- const char* data;
- } const builtin[] = {
- { "iso-ir-111", "koi8-r" },
- { "koi8-ru", "koi8-u" }, // ### Qt 3.3 maps it to koi8-r
- { "koi unified", "koi8-r" }, // ### FIXME: Qt 3.3 seems to map this to EUC-KR, so this mapping is too late :-(
- // Using ISO-8859-1 for ASCII is an approximation at write
- { "us-ascii", "iso 8859-1" },
- { "usascii", "iso 8859-1" },
- { "ascii", "iso 8859-1" },
- { "x-utf-8", "utf-8" },
- { "x-utf-7", "utf-7" }, // ### FIXME: UTF-7 is not in Qt
- { "tqunicode-1-1-utf-7", "utf-7" }, // ### FIXME: UTF-7 is not in Qt
- { "utf-16", "iso-10646-ucs-2" },
- { "utf16", "iso-10646-ucs-2" },
- { "ucs2", "iso-10646-ucs-2" },
- { "iso10646-1", "iso-10646-ucs-2" },
- { "gb18030.2000-1", "gb18030" },
- { "gb18030.2000-0", "gb18030" },
- { "gbk-0", "gbk" },
- { "gb2312.1980-0", "gbk" },
- { "gb_2312-80", "gbk" },/* this one is not official, but MS is using it :/ */
- { "x-euc-kr", "euckr" },
- { "jisx0201.1976-0", "eucjp" },
- { "jisx0208.1983-0", "eucjp" },
- { "jisx0208.1990-0", "eucjp" },
- { "jisx0208.1997-0", "eucjp" },
- { "jisx0212.1990-0", "eucjp" },
- { "jisx0213.2000-1", "eucjp" },
- { "jisx0213.2000-2", "eucjp" },
- { "windows850", "ibm850" },
- { "windows866", "ibm866" },
- { "windows1251", "cp 1251" },
- { "windows1252", "cp 1252" },
- { "windows1253", "cp 1253" },
- { "windows1254", "cp 1254" },
- { "windows1255", "cp 1255" },
- { "windows1256", "cp 1256" },
- { "windows1257", "cp 1257" },
- { "windows1258", "cp 1258" },
- { "windows-850", "ibm850" },
- { "windows-866", "ibm866" },
- { "x-windows-850", "ibm850" },
- { "x-windows-866", "ibm866" },
- { "x-windows-1250", "cp 1250" },
- { "x-windows-1251", "cp 1251" },
- { "x-windows-1252", "cp 1252" },
- { "x-windows-1253", "cp 1253" },
- { "x-windows-1254", "cp 1254" },
- { "x-windows-1255", "cp 1255" },
- { "x-windows-1256", "cp 1256" },
- { "x-windows-1257", "cp 1257" },
- { "x-windows-1258", "cp 1258" },
- { "cp819", "iso 8859-1" },
- { "cp850", "ibm850" },
- { "cp866", "ibm866" },
- { "cp-819", "iso 8859-1" },
- { "cp-850", "ibm850" },
- { "cp-866", "ibm866" },
- { "cp-1250", "cp 1250" },
- { "cp-1251", "cp 1251" },
- { "cp-1252", "cp 1252" },
- { "cp-1253", "cp 1253" },
- { "cp-1254", "cp 1254" },
- { "cp-1255", "cp 1255" },
- { "cp-1256", "cp 1256" },
- { "cp-1257", "cp 1257" },
- { "cp-1258", "cp 1258" },
- { "cp-10000", "apple roman" },
- { "x-cp-850", "ibm850" },
- { "x-cp-866", "ibm866" },
- { "x-cp-1250", "cp 1250" },
- { "x-cp-1251", "cp 1251" },
- { "x-cp-1252", "cp 1252" },
- { "x-cp-1253", "cp 1253" },
- { "x-cp-1254", "cp 1254" },
- { "x-cp-1255", "cp 1255" },
- { "x-cp-1256", "cp 1256" },
- { "x-cp-1257", "cp 1257" },
- { "x-cp-1258", "cp 1258" },
- { "x-cp-10000", "apple roman" },
- { "ibm819", "iso 8859-1" },
- { "thai-tis620", "iso 8859-11" },
- { "windows-874", "cp 874" },
- { "windows874", "cp 874" },
- { "x-windows-874", "cp 874" },
- { "x-cp-874", "cp 874" },
- { "ibm 874", "cp 874" },
- { "ibm874", "cp 874" }, // Qt4 name
- { "x-ibm874", "cp 874" },
- { "ksc5601.1987-0", "euckr" },
- { "x-winsami2", "winsami2" },
- { "x-mac-roman", "apple roman" },
- { "macintosh", "apple roman" },
- { "mac", "apple roman" },
- { "csiso2022jp", "jis7" }, // See bug #77243
- { "big5-eten", "big5-hkscs" },
- { "cp950", "big5-hkscs" },
- { 0, 0 }};
-
-// some different names for the encodings defined in the charmaps files.
-// even though the charmap file names are all uppercase, the names are all lowercase here.
-static struct Aliases
- {
- const char* index;
- const char* data;
- } const aliases[] = {
- { "cp852", "ibm852" },
- { "cp-852", "ibm852" },
- { "x-cp-852", "ibm852" },
- { "windows852", "ibm852" },
- { "windows-852", "ibm852" },
- { "x-windows-852", "ibm852" },
- { 0, 0 }};
-
-// some last resort hints in case the charmap file couldn't be found. This gives at least a partial conversion
-// and helps making things readable.
-// the name used as input here is already converted to the more canonical name as defined in the aliases array.
-static struct ConversionHints
- {
- const char* index;
- const char* data;
- } const conversion_hints[] = {
- { "cp1250", "iso-8859-2" },
- { "koi8-r", "iso-8859-5" },
- { "koi8-u", "koi8-r" },
- // KDE had always "CP 1251" as best fallback to PT 154. Now that Qt does not offer this encoding anymore, it is our fallback.
- { "pt 154", "cp 1251" },
- { "paratype-154", "cp 1251" },
- { "pt-154", "cp 1251" },
- { 0, 0 }};
-
-
-// search an array of items index/data, index is const char*, data is T, find first matching index
-// and return data, or return 0
-template< typename T, typename Data >
-static Data kcharsets_array_search( const T* start, const char* entry )
-{
- for( const T* pos = start;
- pos->index != 0;
- ++pos )
- if( qstrcmp( pos->index, entry ) == 0 )
- return pos->data;
- return 0;
-}
-
-
-class KCharsetsPrivate
-{
-public:
- KCharsetsPrivate(KCharsets* _kc)
- : codecForNameDict(43, false) // case insensitive
- {
- db = 0;
- kc = _kc;
- }
- ~KCharsetsPrivate()
- {
- delete db;
- }
- TQFontDatabase *db;
- TQAsciiDict<TQTextCodec> codecForNameDict;
- KCharsets* kc;
-};
-
-// --------------------------------------------------------------------------
-
-KCharsets::KCharsets()
-{
- d = new KCharsetsPrivate(this);
-}
-
-KCharsets::~KCharsets()
-{
- delete d;
-}
-
-TQChar KCharsets::fromEntity(const TQString &str)
-{
- TQChar res = TQChar::null;
-
- int pos = 0;
- if(str[pos] == (QChar)'&') pos++;
-
- // Check for '&#000' or '&#x0000' sequence
- if (str[pos] == (QChar)'#' && str.length()-pos > 1) {
- bool ok;
- pos++;
- if (str[pos] == (QChar)'x' || str[pos] == (QChar)'X') {
- pos++;
- // '&#x0000', hexadeciaml character reference
- TQString tmp(str.tqunicode()+pos, str.length()-pos);
- res = tmp.toInt(&ok, 16);
- } else {
- // '&#0000', decimal character reference
- TQString tmp(str.tqunicode()+pos, str.length()-pos);
- res = tmp.toInt(&ok, 10);
- }
- return res;
- }
-
- const entity *e = kde_findEntity(str.ascii(), str.length());
-
- if(!e)
- {
- //kdDebug( 0 ) << "unknown entity " << str <<", len = " << str.length() << endl;
- return TQChar::null;
- }
- //kdDebug() << "got entity " << str << " = " << e->code << endl;
-
- return TQChar(e->code);
-}
-
-TQChar KCharsets::fromEntity(const TQString &str, int &len)
-{
- // entities are never longer than 8 chars... we start from
- // that length and work backwards...
- len = 8;
- while(len > 0)
- {
- TQString tmp = str.left(len);
- TQChar res = fromEntity(tmp);
- if( res != (QChar)TQChar::null ) return res;
- len--;
- }
- return TQChar::null;
-}
-
-
-TQString KCharsets::toEntity(const TQChar &ch)
-{
- TQString ent;
- ent.sprintf("&#0x%x;", ch.tqunicode());
- return ent;
-}
-
-TQString KCharsets::resolveEntities( const TQString &input )
-{
- TQString text = input;
- const TQChar *p = text.tqunicode();
- const TQChar *end = p + text.length();
- const TQChar *ampersand = 0;
- bool scanForSemicolon = false;
-
- for ( ; p < end; ++p ) {
- const TQChar ch = *p;
-
- if ( ch == (QChar)'&' ) {
- ampersand = p;
- scanForSemicolon = true;
- continue;
- }
-
- if ( ch != (QChar)';' || scanForSemicolon == false )
- continue;
-
- assert( ampersand );
-
- scanForSemicolon = false;
-
- const TQChar *entityBegin = ampersand + 1;
-
- const uint entityLength = p - entityBegin;
- if ( entityLength == 0 )
- continue;
-
- const TQChar entityValue = KCharsets::fromEntity( TQConstString( entityBegin, entityLength ).string() );
- if ( entityValue.isNull() )
- continue;
-
- const uint ampersandPos = ampersand - text.tqunicode();
-
- text[ (int)ampersandPos ] = entityValue;
- text.remove( ampersandPos + 1, entityLength + 1 );
- p = text.tqunicode() + ampersandPos;
- end = text.tqunicode() + text.length();
- ampersand = 0;
- }
-
- return text;
-}
-
-TQStringList KCharsets::availableEncodingNames()
-{
- TQStringList available;
- for ( const char* const* pos = charsets_for_encoding; *pos; ++pos ) {
- //kdDebug(0) << *charsets << " available" << endl;
- available.append( TQString::tqfromLatin1( *pos ));
- }
- return available;
-}
-
-TQString KCharsets::languageForEncoding( const TQString &encoding )
-{
- int lang = kcharsets_array_search< LanguageForEncoding, int >
- ( language_for_encoding, encoding.latin1());
- return i18n( language_names[lang] );
-}
-
-TQString KCharsets::encodingForName( const TQString &descriptiveName )
-{
- const int left = descriptiveName.findRev( '(' );
-
- if (left<0) // No parenthesis, so assume it is a normal encoding name
- return descriptiveName.stripWhiteSpace();
-
- TQString name(descriptiveName.mid(left+1));
-
- const int right = name.findRev( ')' );
-
- if (right<0)
- return name;
-
- return name.left(right).stripWhiteSpace();
-}
-
-TQStringList KCharsets::descriptiveEncodingNames()
-{
- // As we are sorting, we can directly read the array language_for_encoding
- TQStringList encodings;
- for ( const LanguageForEncoding* pos = language_for_encoding; pos->index; ++pos ) {
- const TQString name = TQString::tqfromLatin1( pos->index );
- const TQString description = i18n( language_names[ pos->data ] );
- encodings.append( i18n("Descriptive Encoding Name", "%1 ( %2 )"). arg ( description ). arg( name ) );
- }
- encodings.sort();
- return encodings;
-}
-
-TQTextCodec *KCharsets::codecForName(const TQString &n) const
-{
- bool b;
- return codecForName( n, b );
-}
-
-TQTextCodec *KCharsets::codecForName(const TQString &n, bool &ok) const
-{
- ok = true;
-
- TQTextCodec* codec = 0;
- // dict lookup is case insensitive anyway
- if((codec = d->codecForNameDict[n.isEmpty() ? "->locale<-" : n.latin1()]))
- return codec; // cache hit, return
-
- if (n.isEmpty()) {
- codec = KGlobal::locale()->codecForEncoding();
- d->codecForNameDict.replace("->locale<-", codec);
- return codec;
- }
-
- TQCString name = n.lower().latin1();
- TQCString key = name;
- if (name.right(8) == "_charset")
- name.truncate(name.length()-8);
-
- if (name.isEmpty()) {
- ok = false;
- return TQTextCodec::codecForName("iso8859-1");
- }
-
- codec = TQTextCodec::codecForName(name);
-
- if(codec) {
- d->codecForNameDict.replace(key, codec);
- return codec;
- }
-
- // these codecs are built into Qt, but the name given for the codec is different,
- // so TQTextCodec did not recognize it.
- TQCString cname = kcharsets_array_search< Builtin, const char* >( builtin, name.data());
-
- if(!cname.isEmpty())
- codec = TQTextCodec::codecForName(cname);
-
- if(codec)
- {
- d->codecForNameDict.replace(key, codec);
- return codec;
- }
-
- TQString dir;
- {
- KConfigGroupSaver cfgsav( KGlobal::config(), "i18n" );
- dir = KGlobal::config()->readPathEntry("i18ndir", TQString::tqfromLatin1("/usr/share/i18n/charmaps"));
- }
-
- // these are codecs not included in Qt. They can be build up if the corresponding charmap
- // is available in the charmap directory.
- cname = kcharsets_array_search< Aliases, const char* >( aliases, name.data());
-
- if(cname.isEmpty())
- cname = name;
- cname = cname.upper();
-
- const TQString basicName = TQString::tqfromLatin1(cname);
- kdDebug() << k_funcinfo << endl << " Trying to find " << cname << " in " << dir << endl;
-
- TQString charMapFileName;
- bool gzipped = false;
- TQDir qdir(dir);
- if (!qdir.exists()) {
- // The directory for the charmaps does not even exist... (That is common!)
- }
- else if (qdir.exists(basicName, false)) {
- charMapFileName = basicName;
- }
- else if (qdir.exists(basicName+".gz", false)) {
- charMapFileName = basicName + ".gz";
- gzipped = true;
- }
- else {
- // Check if we are asking a code page
- // If yes, then check "CP99999" and "IBM99999"
- // First we need to find the number of the codepage
- TQRegExp regexp("^(X-)?(CP|IBM)(-| )?(0-9)+");
- if ( regexp.search(basicName) != -1) {
- const TQString num = regexp.cap(4);
- if (num.isEmpty()) {
- // No number, not a code page (or something went wrong)
- }
- else if (qdir.exists("IBM"+num)) {
- charMapFileName = "IBM"+num;
- }
- else if (qdir.exists("IBM"+num+".gz")) {
- charMapFileName = "IBM"+num+".gz";
- gzipped = true;
- }
- else if (qdir.exists("CP"+num)) {
- charMapFileName = "CP"+num;
- }
- else if (qdir.exists("CP"+num+".gz")) {
- charMapFileName = "CP"+num+".gz";
- gzipped = true;
- }
- }
- }
-
- if (gzipped && !charMapFileName.isEmpty()) {
- KQIODeviceGZip gzip(dir + "/" + charMapFileName);
- if (gzip.open(IO_ReadOnly)) {
- kdDebug() << "Loading gzipped charset..." << endl;
- codec = TQTextCodec::loadCharmap(&gzip);
- gzip.close();
- }
- else
- kdWarning() << "Could not open gzipped charset!" << endl;
- }
- else if (!charMapFileName.isEmpty()) {
- codec = TQTextCodec::loadCharmapFile(dir + "/" + charMapFileName);
- }
-
- if(codec) {
- d->codecForNameDict.replace(key, codec);
- return codec;
- }
-
- // this also failed, the last resort is now to take some compatibility charmap
-
- cname = kcharsets_array_search< ConversionHints, const char* >( conversion_hints, (const char*)name.data() );
-
- if(!cname.isEmpty())
- codec = TQTextCodec::codecForName(cname);
-
- if(codec) {
- d->codecForNameDict.replace(key, codec);
- return codec;
- }
-
- // could not assign a codec, let's return Latin1
- ok = false;
- return TQTextCodec::codecForName("iso8859-1");
-}