diff options
Diffstat (limited to 'src/codecs/qjiscodec.cpp')
-rw-r--r-- | src/codecs/qjiscodec.cpp | 706 |
1 files changed, 706 insertions, 0 deletions
diff --git a/src/codecs/qjiscodec.cpp b/src/codecs/qjiscodec.cpp new file mode 100644 index 000000000..ae7d258af --- /dev/null +++ b/src/codecs/qjiscodec.cpp @@ -0,0 +1,706 @@ +/**************************************************************************** +** +** Implementation of TQJisCodec class +** +** Created : 990225 +** +** Copyright (C) 2000-2008 Trolltech ASA. All rights reserved. +** +** This file is part of the tools module of the TQt GUI Toolkit. +** +** This file may be used under the terms of the GNU General +** Public License versions 2.0 or 3.0 as published by the Free +** Software Foundation and appearing in the files LICENSE.GPL2 +** and LICENSE.GPL3 included in the packaging of this file. +** Alternatively you may (at your option) use any later version +** of the GNU General Public License if such license has been +** publicly approved by Trolltech ASA (or its successors, if any) +** and the KDE Free TQt Foundation. +** +** Please review the following information to ensure GNU General +** Public Licensing requirements will be met: +** http://trolltech.com/products/qt/licenses/licensing/opensource/. +** If you are unsure which license is appropriate for your use, please +** review the following information: +** http://trolltech.com/products/qt/licenses/licensing/licensingoverview +** or contact the sales department at sales@trolltech.com. +** +** This file may be used under the terms of the Q Public License as +** defined by Trolltech ASA and appearing in the file LICENSE.TQPL +** included in the packaging of this file. Licensees holding valid TQt +** Commercial licenses may use this file in accordance with the TQt +** Commercial License Agreement provided with the Software. +** +** This file is provided "AS IS" with NO WARRANTY OF ANY KIND, +** INCLUDING THE WARRANTIES OF DESIGN, MERCHANTABILITY AND FITNESS FOR +** A PARTICULAR PURPOSE. Trolltech reserves all rights not granted +** herein. 
+** +**********************************************************************/ + +// Most of the code here was originally written by Serika Kurusugawa, +// a.k.a. Junji Takagi, and is included in TQt with the author's permission +// and the grateful thanks of the Trolltech team. + +/*! \class TQJisCodec + \reentrant + \ingroup i18n + + \brief The TQJisCodec class provides conversion to and from JIS character sets. + + More precisely, the TQJisCodec class subclasses TQTextCodec to + provide support for JIS X 0201 Latin, JIS X 0201 Kana, JIS X 0208 + and JIS X 0212. + + The environment variable UNICODEMAP_JP can be used to fine-tune + TQJisCodec, TQSjisCodec and TQEucJpCodec. The mapping names are as for + the Japanese XML working group's \link + http://www.y-adagio.com/public/standards/tr_xml_jpf/toc.htm XML + Japanese Profile\endlink, because it names and explains all the + widely used mappings. Here are brief descriptions, written by + Serika Kurusugawa: + + \list + + \i "unicode-0.9" or "unicode-0201" for Unicode style. This assumes + JISX0201 for 0x00-0x7f. (0.9 is a table version of jisx02xx mapping + used for Unicode spec version 1.1.) + + \i "unicode-ascii" This assumes US-ASCII for 0x00-0x7f; some + chars (JISX0208 0x2140 and JISX0212 0x2237) are different from + Unicode 1.1 to avoid conflict. + + \i "open-19970715-0201" ("open-0201" for convenience) or + "jisx0221-1995" for JISX0221-JISX0201 style. JIS X 0221 is the JIS + version of Unicode, but a few chars (0x5c, 0x7e, 0x2140, 0x216f, + 0x2131) are different from Unicode 1.1. This is used when 0x5c is + treated as YEN SIGN. + + \i "open-19970715-ascii" ("open-ascii" for convenience) for + JISX0221-ASCII style. This is used when 0x5c is treated as REVERSE + SOLIDUS. + + \i "open-19970715-ms" ("open-ms" for convenience) or "cp932" for + Microsoft Windows style. Windows Code Page 932. Some chars (0x2140, + 0x2141, 0x2142, 0x215d, 0x2171, 0x2172) are different from Unicode + 1.1. 
+ + \i "jdk1.1.7" for Sun's JDK style. Same as Unicode 1.1, except that + JIS 0x2140 is mapped to UFF3C. Either ASCII or JISX0201 can be used + for 0x00-0x7f. + + \endlist + + In addition, the extensions "nec-vdc", "ibm-vdc" and "udc" are + supported. + + For example, if you want to use Unicode style conversion but with + NEC's extension, set \c UNICODEMAP_JP to + <nobr>\c {unicode-0.9, nec-vdc}.</nobr> (You will probably + need to quote that in a shell command.) + + Most of the code here was written by Serika Kurusugawa, + a.k.a. Junji Takagi, and is included in TQt with the author's + permission and the grateful thanks of the Trolltech team. Here is + the copyright statement for that code: + + \legalese + + Copyright (C) 1999 Serika Kurusugawa. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + \list 1 + \i Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + \i Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + \endlist + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS". + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  SUCH DAMAGE.
*/

#include "qjiscodec.h"

#ifndef QT_NO_BIG_CODECS

// Control bytes recognised by the encoder and the decoders in this file.
static const uchar Esc = 0x1b; // Escape: first byte of every designation sequence below
static const uchar So = 0x0e; // Shift Out
static const uchar Si = 0x0f; // Shift In

// 0x5c and 0x7e carry two names each: one for the ASCII reading and one for
// the JIS X 0201 Latin reading of the same byte value.
static const uchar ReverseSolidus = 0x5c;
static const uchar YenSign = 0x5c;
static const uchar Tilde = 0x7e;
static const uchar Overline = 0x7e;

// True when c lies in 0xa1..0xdf (inclusive).
#define IsKana(c) (((c) >= 0xa1) && ((c) <= 0xdf))
// True when c lies in 0x21..0x7e (inclusive).
#define IsJisChar(c) (((c) >= 0x21) && ((c) <= 0x7e))

// Wraps a converter result: a zero code becomes TQChar::replacement, any
// other value becomes the TQChar with that code. Note that the argument is
// evaluated twice.
#define TQValidChar(u) ((u) ? TQChar((ushort)(u)) : TQChar::replacement)

// Character set currently selected by the escape/shift state machine.
// MinState..MaxState are the states that have an entry in Esc_SEQ (which is
// indexed with "state - MinState"); UnknownState has none.
enum Iso2022State{ Ascii, MinState = Ascii,
                   JISX0201_Latin, JISX0201_Kana,
                   JISX0208_1978, JISX0208_1983,
                   JISX0212, MaxState = JISX0212,
                   UnknownState };

// Bytes the escape-sequence parsers in this file accept (via strchr) directly
// after ESC, or after ESC '$', before the final byte of the sequence.
static const char Esc_CHARS[] = "()*+-./";

// NUL-terminated escape sequences, one per Iso2022State in MinState..MaxState.
static const char Esc_Ascii[] = {Esc, '(', 'B', 0 };
static const char Esc_JISX0201_Latin[] = {Esc, '(', 'J', 0 };
static const char Esc_JISX0201_Kana[] = {Esc, '(', 'I', 0 };
static const char Esc_JISX0208_1978[] = {Esc, '$', '@', 0 };
static const char Esc_JISX0208_1983[] = {Esc, '$', 'B', 0 };
static const char Esc_JISX0212[] = {Esc, '$', '(', 'D', 0 };
// Same order as the Iso2022State enumerators, so Esc_SEQ[state - MinState]
// is the sequence that selects "state".
static const char * const Esc_SEQ[] = { Esc_Ascii,
                                        Esc_JISX0201_Latin,
                                        Esc_JISX0201_Kana,
                                        Esc_JISX0208_1978,
                                        Esc_JISX0208_1983,
                                        Esc_JISX0212 };

/*! \internal */
// Initialises the codec with the converter returned by
// TQJpUnicodeConv::newConverter(TQJpUnicodeConv::Default).
TQJisCodec::TQJisCodec() : conv(TQJpUnicodeConv::newConverter(TQJpUnicodeConv::Default))
{
}


/*! \internal */
// Destroys the converter created in the constructor.
TQJisCodec::~TQJisCodec()
{
    // NOTE(review): the cast presumably strips const from the member's
    // declared type (declaration is in qjiscodec.h, not visible here).
    delete (TQJpUnicodeConv*)conv;
    conv = 0;
}


/*!
\internal */ +int TQJisCodec::mibEnum() const +{ + /* + Name: JIS_Encoding + MIBenum: 16 + Source: JIS X 0202-1991. Uses ISO 2022 escape sequences to + shift code sets as documented in JIS X 0202-1991. + Alias: csJISEncoding + */ + return 16; +} + +/*! \internal */ +TQCString TQJisCodec::fromUnicode(const TQString& uc, int& lenInOut) const +{ + int l = TQMIN((int)uc.length(),lenInOut); + TQCString result; + Iso2022State state = Ascii; + Iso2022State prev = Ascii; + for (int i=0; i<l; i++) { + TQChar ch = uc[i]; + uint j; + if ( ch.row() == 0x00 && ch.cell() < 0x80 ) { + // Ascii + if (state != JISX0201_Latin || + ch.cell() == ReverseSolidus || ch.cell() == Tilde) { + state = Ascii; + } + j = ch.cell(); + } else if ((j = conv->unicodeToJisx0201(ch.row(), ch.cell())) != 0) { + if (j < 0x80) { + // JIS X 0201 Latin + if (state != Ascii || + ch.cell() == YenSign || ch.cell() == Overline) { + state = JISX0201_Latin; + } + } else { + // JIS X 0201 Kana + state = JISX0201_Kana; + j &= 0x7f; + } + } else if ((j = conv->unicodeToJisx0208(ch.row(), ch.cell())) != 0) { + // JIS X 0208 + state = JISX0208_1983; + } else if ((j = conv->unicodeToJisx0212(ch.row(), ch.cell())) != 0) { + // JIS X 0212 + state = JISX0212; + } else { + // Invalid + state = UnknownState; + j = '?'; + } + if (state != prev) { + if (state == UnknownState) { + result += Esc_Ascii; + } else { + result += Esc_SEQ[state - MinState]; + } + prev = state; + } + if (j < 0x0100) { + result += j & 0xff; + } else { + result += (j >> 8) & 0xff; + result += j & 0xff; + } + } + if (prev != Ascii) { + result += Esc_Ascii; + } + lenInOut = result.length(); + return result; +} + +/*! 
\internal */ +TQString TQJisCodec::toUnicode(const char* chars, int len) const +{ + TQString result; + Iso2022State state = Ascii, prev = Ascii; + for (int i=0; i<len; i++) { + uchar ch = chars[i]; + if ( ch == Esc ) { + // Escape sequence + state = UnknownState; + if ( i < len-1 ) { + uchar c2 = chars[++i]; + if (c2 == '$') { + if ( i < len-1 ) { + uchar c3 = chars[++i]; + if (strchr(Esc_CHARS, c3)) { + if ( i < len-1 ) { + uchar c4 = chars[++i]; + if (c4 == '(') { + switch (c4) { + case 'D': + state = JISX0212; // Esc $ ( D + break; + } + } + } + } else { + switch (c3) { + case '@': + state = JISX0208_1978; // Esc $ @ + break; + case 'B': + state = JISX0208_1983; // Esc $ B + break; + } + } + } + } else { + if (strchr(Esc_CHARS, c2)) { + if ( i < len-1 ) { + uchar c3 = chars[++i]; + if (c2 == '(') { + switch (c3) { + case 'B': + state = Ascii; // Esc ( B + break; + case 'I': + state = JISX0201_Kana; // Esc ( I + break; + case 'J': + state = JISX0201_Latin; // Esc ( J + break; + } + } + } + } + } + } + } else if (ch == So) { + // Shift out + prev = state; + state = JISX0201_Kana; + } else if (ch == Si) { + // Shift in + if (prev == Ascii || prev == JISX0201_Latin) { + state = prev; + } else { + state = Ascii; + } + } else { + uint u; + switch (state) { + case Ascii: + if (ch < 0x80) { + result += TQChar(ch); + break; + } + /* fall throught */ + case JISX0201_Latin: + u = conv->jisx0201ToUnicode(ch); + result += TQValidChar(u); + break; + case JISX0201_Kana: + u = conv->jisx0201ToUnicode(ch | 0x80); + result += TQValidChar(u); + break; + case JISX0208_1978: + case JISX0208_1983: + if ( i < len-1 ) { + uchar c2 = chars[++i]; + u = conv->jisx0208ToUnicode(ch & 0x7f, c2 & 0x7f); + result += TQValidChar(u); + } + break; + case JISX0212: + if ( i < len-1 ) { + uchar c2 = chars[++i]; + u = conv->jisx0212ToUnicode(ch & 0x7f, c2 & 0x7f); + result += TQValidChar(u); + } + break; + default: + result += TQChar::replacement; + break; + } + } + } + return result; +} + +/*! 
\internal */ +const char* TQJisCodec::name() const +{ + return "JIS7"; +} + +/*! + Returns the codec's mime name. +*/ +const char* TQJisCodec::mimeName() const +{ + return "ISO-2022-JP"; +} + +/*! \internal */ +int TQJisCodec::heuristicNameMatch(const char* hint) const +{ + if ( qstrnicmp( hint, "ISO-2022-JP", 11 ) == 0 ) + return 10000; + if ( simpleHeuristicNameMatch( "ISO-2022-JP-2", hint ) > 0 ) + return 10; + + int score = 0; + bool ja = FALSE; + if (qstrnicmp(hint, "ja_JP", 5) == 0 || qstrnicmp(hint, "japan", 5) == 0) { + score += 3; + ja = TRUE; + } else if (qstrnicmp(hint, "ja", 2) == 0) { + score += 2; + ja = TRUE; + } + const char *p; + if (ja) { + p = strchr(hint, '.'); + if (p == 0) { + return score - 2; + } + p++; + } else { + p = hint; + } + if (p) { + if ((qstricmp(p, "JIS") == 0) || + (qstricmp(p, "JIS7") == 0) || + (simpleHeuristicNameMatch("ISO-2022-JP", p) > 0)) { + return score + 4; + } + } + return TQTextCodec::heuristicNameMatch(hint); +} + +/*! \internal */ +int TQJisCodec::heuristicContentMatch(const char* chars, int len) const +{ + int score = 0; + Iso2022State state = Ascii, prev = Ascii; + for (int i=0; i<len; i++) { + uchar ch = chars[i]; + // No nulls allowed. 
+ if ( !ch ) + return -1; + if ( ch == Esc ) { + // Escape sequence + state = UnknownState; + if ( i < len-1 ) { + uchar c2 = chars[++i]; + if (c2 == '$') { + if ( i < len-1 ) { + uchar c3 = chars[++i]; + if (strchr(Esc_CHARS, c3)) { + if ( i < len-1 ) { + uchar c4 = chars[++i]; + if (c4 == '(') { + switch (c4) { + case 'D': + state = JISX0212; // Esc $ ( D + score++; + break; + } + } + } + score++; + } else { + switch (c3) { + case '@': + state = JISX0208_1978; // Esc $ @ + score++; + break; + case 'B': + state = JISX0208_1983; // Esc $ B + score++; + break; + } + } + } + score++; + } else { + if (strchr(Esc_CHARS, c2)) { + if ( i < len-1 ) { + uchar c3 = chars[++i]; + if (c2 == '(') { + switch (c3) { + case 'B': + state = Ascii; // Esc ( B + score++; + break; + case 'I': + state = JISX0201_Kana; // Esc ( I + score++; + break; + case 'J': + state = JISX0201_Latin; // Esc ( J + score++; + break; + } + } + } + score++; + } + } + } + if ( state == UnknownState ) { + return -1; + } + score++; + } else if (ch == So) { + // Shift out + prev = state; + state = JISX0201_Kana; + score++; + } else if (ch == Si) { + // Shift in + if (prev == Ascii || prev == JISX0201_Latin) { + state = prev; + } else { + state = Ascii; + } + score++; + } else { + switch (state) { + case Ascii: + case JISX0201_Latin: + if ( ch < 32 && ch != '\t' && ch != '\n' && ch != '\r' ) { + // Suspicious + if ( score ) + score--; + } else { + // Inconclusive + } + break; + case JISX0201_Kana: + if ( !IsKana(ch | 0x80) ) { + return -1; + } + score++; + break; + case JISX0208_1978: + case JISX0208_1983: + case JISX0212: + if ( !IsJisChar(ch) ) { + // Invalid + return -1; + } + if ( i < len-1 ) { + uchar c2 = chars[++i]; + if ( !IsJisChar(c2) ) { + // Invalid + return -1; + } + score++; + } + score++; + break; + default: + return -1; + } + } + } + return score; +} + +class TQJisDecoder : public TQTextDecoder { + uchar buf[4]; + int nbuf; + Iso2022State state, prev; + bool esc; + const TQJpUnicodeConv * 
const conv; +public: + TQJisDecoder(const TQJpUnicodeConv *c) : nbuf(0), state(Ascii), prev(Ascii), esc(FALSE), conv(c) + { + } + + TQString toUnicode(const char* chars, int len) + { + TQString result; + for (int i=0; i<len; i++) { + uchar ch = chars[i]; + if (esc) { + // Escape sequence + state = UnknownState; + switch (nbuf) { + case 0: + if (ch == '$' || strchr(Esc_CHARS, ch)) { + buf[nbuf++] = ch; + } else { + nbuf = 0; + esc = FALSE; + } + break; + case 1: + if (buf[0] == '$') { + if (strchr(Esc_CHARS, ch)) { + buf[nbuf++] = ch; + } else { + switch (ch) { + case '@': + state = JISX0208_1978; // Esc $ @ + break; + case 'B': + state = JISX0208_1983; // Esc $ B + break; + } + nbuf = 0; + esc = FALSE; + } + } else { + if (buf[0] == '(') { + switch (ch) { + case 'B': + state = Ascii; // Esc ( B + break; + case 'I': + state = JISX0201_Kana; // Esc ( I + break; + case 'J': + state = JISX0201_Latin; // Esc ( J + break; + } + } + nbuf = 0; + esc = FALSE; + } + break; + case 2: + if (buf[1] == '(') { + switch (ch) { + case 'D': + state = JISX0212; // Esc $ ( D + break; + } + } + nbuf = 0; + esc = FALSE; + break; + } + } else { + if (ch == Esc) { + // Escape sequence + nbuf = 0; + esc = TRUE; + } else if (ch == So) { + // Shift out + prev = state; + state = JISX0201_Kana; + nbuf = 0; + } else if (ch == Si) { + // Shift in + if (prev == Ascii || prev == JISX0201_Latin) { + state = prev; + } else { + state = Ascii; + } + nbuf = 0; + } else { + uint u; + switch (nbuf) { + case 0: + switch (state) { + case Ascii: + if (ch < 0x80) { + result += TQChar(ch); + break; + } + /* fall throught */ + case JISX0201_Latin: + u = conv->jisx0201ToUnicode(ch); + result += TQValidChar(u); + break; + case JISX0201_Kana: + u = conv->jisx0201ToUnicode(ch | 0x80); + result += TQValidChar(u); + break; + case JISX0208_1978: + case JISX0208_1983: + case JISX0212: + buf[nbuf++] = ch; + break; + default: + result += TQChar::replacement; + break; + } + break; + case 1: + switch (state) { + case 
JISX0208_1978: + case JISX0208_1983: + u = conv->jisx0208ToUnicode(buf[0] & 0x7f, ch & 0x7f); + result += TQValidChar(u); + break; + case JISX0212: + u = conv->jisx0212ToUnicode(buf[0] & 0x7f, ch & 0x7f); + result += TQValidChar(u); + break; + default: + result += TQChar::replacement; + break; + } + nbuf = 0; + break; + } + } + } + } + return result; + } +}; + +/*! \internal */ +TQTextDecoder* TQJisCodec::makeDecoder() const +{ + return new TQJisDecoder(conv); +} + +#endif |