summaryrefslogtreecommitdiffstats
path: root/kiten/dict.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'kiten/dict.cpp')
-rw-r--r--kiten/dict.cpp876
1 files changed, 876 insertions, 0 deletions
diff --git a/kiten/dict.cpp b/kiten/dict.cpp
new file mode 100644
index 00000000..ad282807
--- /dev/null
+++ b/kiten/dict.cpp
@@ -0,0 +1,876 @@
+/**
+ This file is part of Kiten, a KDE Japanese Reference Tool...
+ Copyright (C) 2001 Jason Katz-Brown <jason@katzbrown.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
+ USA
+**/
+
+#include <kdebug.h>
+#include <klocale.h>
+#include <kmessagebox.h>
+#include <kprocess.h>
+#include <kstandarddirs.h>
+
+#include <qfileinfo.h>
+#include <qregexp.h>
+#include <qtextcodec.h>
+
+#include "dict.h"
+
+#include <iostream>
+#include <cassert>
+#include <sys/mman.h>
+#include <stdio.h>
+
+namespace
+{
+void msgerr(const QString &msg, const QString &dict = QString::null)
+{
+ QString output = msg;
+ if (!dict.isNull())
+ output = msg.arg(dict);
+ KMessageBox::error(0, output);
+}
+}
+
+using namespace Dict;
+
+TextType Dict::textType(const QString &text)
+{
+ ushort first = text.at(0).unicode();
+
+ if (first < 0x3000)
+ return Text_Latin;
+ // else if (first < 0x3040) // CJK Symbols and Punctuation
+ // return Text_Kana;
+ // else if (first < 0x30A0) // Hiragana
+ // return Text_Kana;
+ else if (first < 0x3100) // Katakana
+ return Text_Kana;
+
+ else /*if (first >= 0x3400 && first < 0x4DC0)*/ // CJK Unified Ideographs Extension A
+ return Text_Kanji;
+}
+
+File::File(QString path, QString n)
+ : myName(n)
+ , dictFile(path)
+ , dictPtr((const unsigned char *)MAP_FAILED)
+ , indexFile(KGlobal::dirs()->saveLocation("data", "kiten/xjdx/", true) + QFileInfo(path).baseName() + ".xjdx")
+ , indexPtr((const uint32_t *)MAP_FAILED)
+ , valid(false)
+{
+ bool forceUpdate = false;
+
+ bool indexFileExists = indexFile.exists();
+ if (indexFileExists)
+ {
+ // ### change this if need be!!
+ const int indexFileVersion = 14;
+
+ // this up-to-date code from xjdservcomm.c
+ // we need to check if the index needs to
+ // remade
+
+ int dictionaryLength;
+ QFile dictionary(path);
+ dictionaryLength = dictionary.size();
+ dictionaryLength++;
+ //kdDebug() << "dictionaryLength = " << dictionaryLength << endl;
+
+ int32_t testWord[1];
+ fread(&testWord[0], sizeof(int32_t), 1, fopen(indexFile.name().latin1(), "rb"));
+
+ //kdDebug() << "testWord[0] = " << testWord[0] << endl;
+
+ if (testWord[0] != (dictionaryLength + indexFileVersion))
+ forceUpdate = true;
+ }
+
+ if (!indexFileExists || forceUpdate)
+ {
+ //kdDebug() << "creating " << indexFile.name() << endl;
+ // find the index generator executable
+ KProcess proc;
+ proc << KStandardDirs::findExe("kitengen") << path << indexFile.name();
+ // TODO: put up a status dialog and event loop instead of blocking
+ proc.start(KProcess::Block, KProcess::NoCommunication);
+ }
+
+ if (!dictFile.open(IO_ReadOnly))
+ {
+ msgerr(i18n("Could not open dictionary %1."), path);
+ return;
+ }
+
+ dictPtr = (const unsigned char *)mmap(0, dictFile.size(), PROT_READ, MAP_SHARED, dictFile.handle(), 0);
+ if (dictPtr == (unsigned char*) MAP_FAILED)
+ {
+ msgerr(i18n("Memory error when loading dictionary %1."), path);
+ return;
+ }
+
+ if (!indexFile.open(IO_ReadOnly))
+ {
+ msgerr(i18n("Could not open index for dictionary %1."), path);
+ return;
+ }
+
+ indexPtr = (const uint32_t*)mmap(0, indexFile.size(), PROT_READ, MAP_SHARED, indexFile.handle(), 0);
+ if (indexPtr == (uint32_t*) MAP_FAILED)
+ {
+ msgerr(i18n("Memory error when loading dictionary %1's index file."), path);
+ return;
+ }
+
+ valid = true;
+}
+
+File::~File(void)
+{
+ if (dictPtr != (unsigned char*) MAP_FAILED)
+ munmap((char *)dictPtr, dictFile.size());
+ dictFile.close();
+
+ if (indexPtr != (uint32_t*) MAP_FAILED)
+ munmap((char *)indexPtr, indexFile.size());
+ indexFile.close();
+}
+
+QString File::name(void)
+{
+ return myName;
+}
+
+Array<const unsigned char> File::dict(void)
+{
+ assert(valid);
+ return Array<const unsigned char>(dictPtr, dictFile.size());
+}
+
+Array<const uint32_t> File::index(void)
+{
+ assert(valid);
+ return Array<const uint32_t>(indexPtr, indexFile.size());
+}
+
+int File::dictLength(void)
+{
+ return dictFile.size();
+}
+
+int File::indexLength(void)
+{
+ return indexFile.size();
+}
+
+bool File::isValid(void)
+{
+ return valid;
+}
+
+// returns specified character from a dictionary
+unsigned char File::lookup(unsigned i, int offset)
+{
+ uint32_t pos = indexPtr[i] + offset - 1;
+ if (pos > dictFile.size()) return 10;
+ return dictPtr[pos];
+}
+
+QCString File::lookup(unsigned i)
+{
+ uint32_t start = indexPtr[i] - 1;
+ uint32_t pos = start;
+ const unsigned size = dictFile.size();
+ // get the whole word
+ while(pos <= size && dictPtr[pos] != 0 && dictPtr[pos] != 0x0a)
+ ++pos;
+ // put the word in the QCString
+ QCString retval((const char *)(dictPtr + start), pos - start);
+ // tack on a null
+ char null = 0;
+ retval.append(&null);
+ // and away we go
+ return retval;
+}
+
+// And last, Index itself is the API presented to the rest of Kiten
+Index::Index()
+ : QObject()
+{
+ dictFiles.setAutoDelete(true);
+ kanjiDictFiles.setAutoDelete(true);
+}
+
+Index::~Index()
+{
+}
+
+void Index::setDictList(const QStringList &list, const QStringList &names)
+{
+ loadDictList(dictFiles, list, names);
+}
+
+void Index::setKanjiDictList(const QStringList &list, const QStringList &names)
+{
+ loadDictList(kanjiDictFiles, list, names);
+}
+
+void Index::loadDictList(QPtrList<File> &fileList, const QStringList &dictList, const QStringList &dictNameList)
+{
+ fileList.clear();
+
+ // check if we have a dict
+ if (dictList.size() < 1)
+ {
+ msgerr(i18n("No dictionaries in list!"));
+ return;
+ }
+
+ QStringList::ConstIterator it;
+ QStringList::ConstIterator dictIt;
+ for (it = dictList.begin(), dictIt = dictNameList.begin(); it != dictList.end(); ++it, ++dictIt)
+ {
+ File *f = new File(*it, *dictIt);
+ // our ugly substitute for exceptions
+ if (f->isValid())
+ fileList.append(f);
+ else
+ delete f;
+ }
+}
+
+QStringList Index::doSearch(File &file, const QString &text)
+{
+ // Do a binary search to find an entry that matches text
+ QTextCodec &codec = *QTextCodec::codecForName("eucJP");
+ QCString eucString = codec.fromUnicode(text);
+
+ QString prevResult;
+
+ Array<const uint32_t> index = file.index();
+ Array<const unsigned char> dict = file.dict();
+ int lo = 0;
+ int hi = index.size() - 1;
+ unsigned cur;
+ int comp = 0;
+
+ do
+ {
+ cur = (hi + lo) / 2;
+ comp = stringCompare(file, cur, eucString);
+
+ if (comp < 0)
+ hi = cur - 1;
+ else if (comp > 0)
+ lo = cur + 1;
+ }
+ while(hi >= lo && comp != 0 && !(hi == 0 && lo == 0));
+ QStringList results;
+ // A match?
+ if (comp == 0)
+ {
+ // wheel back to make sure we get the first matching entry
+ while(cur - 1 && 0 == stringCompare(file, cur - 1, eucString))
+ --cur;
+
+ // output every matching entry
+ while(cur < index.size() && 0 == stringCompare(file, cur, eucString))
+ {
+ // because the index doesn't point
+ // to the start of the line, find the
+ // start of the line:
+ int i = 0;
+ while(file.lookup(cur, i - 1) != 0x0a) --i;
+
+ QByteArray bytes(0);
+ while(file.lookup(cur, i) != 0x0a) // get to end of our line
+ {
+ const char eucchar = file.lookup(cur, i);
+ bytes.resize(bytes.size() + 1);
+ bytes[bytes.size() - 1] = eucchar;
+ ++i;
+ }
+
+ QString result = codec.toUnicode(bytes) + QString("\n");
+ if (prevResult != result)
+ {
+ results.append(result);
+ prevResult = result;
+ }
+
+ ++cur;
+ }
+ }
+
+ // return all the entries found, or null if no match
+ return results;
+}
+
+SearchResult Index::scanResults(QRegExp regexp, QStringList results, bool common)
+{
+ unsigned int num = 0;
+ unsigned int fullNum = 0;
+
+ SearchResult ret;
+
+ //ret.results = results; //not here..
+
+ for (QStringList::Iterator itr = results.begin(); itr != results.end(); ++itr)
+ {
+ if ((*itr).left(5) == "DICT " || (*itr).left(8) == "HEADING ")
+ {
+ ret.list.append(parse(*itr));
+ continue;
+ }
+
+ int found = regexp.search(*itr);
+
+ if (found >= 0)
+ {
+ ++fullNum;
+ if ((*itr).find(QString("(P)")) >= 0 || !common)
+ {
+ // we append HERE, so we get the exact
+ // results we have in ret.list
+
+ ret.results.append(*itr);
+ ret.list.append(parse(*itr));
+ ++num;
+ }
+ }
+ }
+
+ ret.count = num;
+ ret.outOf = fullNum;
+ ret.common = common;
+ return ret;
+}
+
+SearchResult Index::search(QRegExp regexp, const QString &text, bool common)
+{
+ QStringList results;
+ for (QPtrListIterator<File> file(dictFiles); *file; ++file)
+ {
+ results.append(QString("DICT ") + (*file)->name());
+
+ results += doSearch(**file, text);
+ }
+
+ SearchResult res = scanResults(regexp, results, common);
+ res.text = text;
+ return res;
+}
+
+SearchResult Index::scanKanjiResults(QRegExp regexp, QStringList results, bool common)
+{
+ unsigned int num = 0;
+ unsigned int fullNum = 0;
+ const bool jmyCount = false; // don't count JinMeiYou as common
+ SearchResult ret;
+ ret.results = results;
+
+ for (QStringList::Iterator itr = results.begin(); itr != results.end(); ++itr)
+ {
+ if ((*itr).left(5) == "DICT " || (*itr).left(8) == "HEADING ")
+ {
+ ret.list.append(kanjiParse(*itr));
+ continue;
+ }
+
+ int found = regexp.search(*itr);
+
+ if (found >= 0)
+ {
+ ++fullNum;
+ // common entries have G[1-8] (jouyou)
+ QRegExp comregexp(jmyCount ? "G[1-9]" : "G[1-8]");
+ if ((*itr).find(comregexp) >= 0 || !common)
+ {
+ ret.list.append(kanjiParse(*itr));
+ ++num;
+ }
+ }
+ }
+
+ ret.count = num;
+ ret.outOf = fullNum;
+ ret.common = common;
+ return ret;
+}
+
+SearchResult Index::searchKanji(QRegExp regexp, const QString &text, bool common)
+{
+ QStringList results;
+ for (QPtrListIterator<File> file(kanjiDictFiles); *file; ++file)
+ {
+ results.append(QString("DICT ") + (*file)->name());
+
+ results += doSearch(**file, text);
+ }
+
+ SearchResult res = scanKanjiResults(regexp, results, common);
+ res.text = text;
+ return res;
+}
+
+SearchResult Index::searchPrevious(QRegExp regexp, const QString &text, SearchResult list, bool common)
+{
+ SearchResult res;
+
+ if (firstEntry(list).extendedKanjiInfo())
+ res = scanKanjiResults(regexp, list.results, common);
+ else
+ res = scanResults(regexp, list.results, common);
+
+ res.text = text;
+ return res;
+}
+
+QRegExp Dict::Index::createRegExp(SearchType type, const QString &text, DictionaryType dictionaryType, bool caseSensitive)
+{
+ QString regExp;
+ switch (type)
+ {
+ case Search_Beginning:
+ switch (textType(text))
+ {
+ case Dict::Text_Latin:
+ regExp = "\\W%1";
+ break;
+
+ case Dict::Text_Kana:
+ if (dictionaryType == Kanjidict)
+ regExp = "\\W%1";
+ else // edict
+ regExp = "\\[%1";
+ break;
+
+ case Dict::Text_Kanji:
+ regExp = "^%1";
+ }
+ break;
+
+ case Search_FullWord:
+ switch (textType(text))
+ {
+ case Dict::Text_Latin:
+ regExp = "\\W%1\\W";
+ break;
+
+ case Dict::Text_Kana:
+ if (dictionaryType == Kanjidict)
+ regExp = " %1 ";
+ else // edict
+ regExp = "\\[%1\\]";
+ break;
+
+ case Dict::Text_Kanji:
+ regExp = "^%1\\W";
+ }
+ break;
+
+ case Search_Anywhere:
+ regExp = "%1";
+ }
+
+ return QRegExp(regExp.arg(text), caseSensitive);
+}
+
+int Index::stringCompare(File &file, int index, QCString str)
+{
+ return eucStringCompare(file.lookup(index), str);
+}
+
+// effectively does a strnicmp on two "strings"
+// except it will make katakana and hiragana match (EUC A4 & A5)
+int Dict::eucStringCompare(const char *str, const char *str2)
+{
+ for (unsigned i = 0; ; ++i)
+ {
+ unsigned char c = static_cast<unsigned char>(str[i]);
+ unsigned char c2 = static_cast<unsigned char>(str2[i]);
+ if ((c2 == '\0') || (c == '\0'))
+ return 0;
+
+ if ((i % 2) == 0)
+ {
+ if (c2 == 0xA5)
+ c2 = 0xA4;
+
+ if (c == 0xA5)
+ c = 0xA4;
+ }
+
+ if ((c2 >= 'A') && (c2 <= 'Z')) c2 |= 0x20; /*fix ucase*/
+ if ((c >= 'A') && (c <= 'Z')) c |= 0x20;
+
+ if (c2 != c)
+ return (int)c2 - (int)c;
+ }
+
+ return 0;
+}
+
+bool Dict::isEUC(unsigned char c)
+{
+ return (c & 0x80);
+}
+
+Entry Dict::parse(const QString &raw)
+{
+ unsigned int length = raw.length();
+ if (raw.left(5) == "DICT ")
+ return Entry(raw.right(length - 5));
+ if (raw.left(8) == "HEADING ")
+ return Entry(raw.right(length - 8), true);
+
+ QString reading;
+ QString kanji;
+ QStringList meanings;
+ QString curmeaning;
+ bool firstmeaning = true;
+ QCString parsemode("kanji");
+
+ unsigned int i;
+ for (i = 0; i < length; i++)
+ {
+ QChar ichar(raw.at(i));
+
+ if (ichar == '[')
+ {
+ parsemode = "reading";
+ }
+ else if (ichar == ']')
+ {
+ // do nothing
+ }
+ else if (ichar == '/')
+ {
+ if (!firstmeaning)
+ {
+ meanings.append(curmeaning);
+ curmeaning = "";
+ }
+ else
+ {
+ firstmeaning = false;
+ parsemode = "meaning";
+ }
+ }
+ else if (ichar == ' ')
+ {
+ if (parsemode == "meaning") // only one that needs the space
+ curmeaning += ' ';
+ }
+ else if (parsemode == "kanji")
+ {
+ kanji += ichar;
+ }
+ else if (parsemode == "meaning")
+ {
+ curmeaning += ichar;
+ }
+ else if (parsemode == "reading")
+ {
+ reading += ichar;
+ }
+ }
+
+ return (Entry(kanji, reading, meanings));
+}
+
+Entry Dict::kanjiParse(const QString &raw)
+{
+ unsigned int length = raw.length();
+ if (raw.left(5) == "DICT ")
+ return Entry(raw.right(length - 5));
+ if (raw.left(8) == "HEADING ")
+ return Entry(raw.right(length - 8), true);
+
+ QStringList readings;
+ QString kanji;
+ QStringList meanings;
+ QString curmeaning;
+ QString curreading;
+
+ QString strfreq;
+ QString strgrade;
+ QString strstrokes;
+ QString strmiscount = "";
+
+ bool prevwasspace = true;
+ QChar detailname;
+ QCString parsemode("kanji");
+
+ // if there are two S entries, second is common miscount
+ bool strokesset = false;
+
+ unsigned int i;
+ QChar ichar;
+ for (i = 0; i < length; i++)
+ {
+ ichar = raw.at(i);
+
+ if (ichar == ' ')
+ {
+ if (parsemode == "reading")
+ {
+ readings.append(curreading);
+ curreading = "";
+ }
+ else if (parsemode == "kanji")
+ {
+ parsemode = "misc";
+ }
+ else if (parsemode == "detail")
+ {
+ if (detailname == 'S')
+ strokesset = true;
+
+ parsemode = "misc";
+ }
+ else if (parsemode == "meaning")
+ {
+ curmeaning += ichar;
+ }
+ prevwasspace = true;
+ }
+ else if (ichar == '{')
+ {
+ parsemode = "meaning";
+ }
+ else if (ichar == '}')
+ {
+ meanings.append(curmeaning);
+ curmeaning = "";
+ }
+ else if (parsemode == "detail")
+ {
+ if (detailname == 'G')
+ {
+ strgrade += ichar;
+ }
+ else if (detailname == 'F')
+ {
+ strfreq += ichar;
+ }
+ else if (detailname == 'S')
+ {
+ if (strokesset)
+ strmiscount += ichar;
+ else
+ strstrokes += ichar;
+ }
+ prevwasspace = false;
+ }
+ else if (parsemode == "kanji")
+ {
+ kanji += ichar;
+ }
+ else if (parsemode == "meaning")
+ {
+ curmeaning += ichar;
+ }
+ else if (parsemode == "reading")
+ {
+ curreading += ichar;
+ }
+ else if (parsemode == "misc" && prevwasspace)
+ {
+ if (QRegExp("[A-Za-z0-9]").search(QString(ichar)) >= 0)
+ // is non-japanese?
+ {
+ detailname = ichar;
+ parsemode = "detail";
+ }
+ else
+ {
+ curreading = QString(ichar);
+ parsemode = "reading";
+ }
+ }
+ }
+
+ return (Entry(kanji, readings, meanings, strgrade.toUInt(), strfreq.toUInt(), strstrokes.toUInt(), strmiscount.toUInt()));
+}
+
+QString Dict::prettyMeaning(QStringList Meanings)
+{
+ QString meanings;
+ QStringList::Iterator it;
+ for (it = Meanings.begin(); it != Meanings.end(); ++it)
+ meanings.append((*it).stripWhiteSpace()).append("; ");
+
+ meanings.truncate(meanings.length() - 2);
+ return meanings;
+}
+
+QString Dict::prettyKanjiReading(QStringList Readings)
+{
+ QStringList::Iterator it;
+ QString html;
+
+ for (it = Readings.begin(); it != Readings.end(); ++it)
+ {
+ if ((*it) == "T1")
+ html += i18n("In names: ");
+ else
+ {
+ if ((*it) == "T2")
+ html += i18n("As radical: ");
+ else
+ {
+ html += (*it).stripWhiteSpace();
+ html += ", ";
+ }
+ }
+ }
+ html.truncate(html.length() - 2); // get rid of last ,
+
+ return html;
+}
+
+Dict::Entry Dict::firstEntry(Dict::SearchResult result)
+{
+ for (QValueListIterator<Dict::Entry> it = result.list.begin(); it != result.list.end(); ++it)
+ {
+ if ((*it).dictName() == "__NOTSET" && (*it).header() == "__NOTSET")
+ return (*it);
+ }
+
+ return Dict::Entry("__NOTHING");
+}
+
+QString Dict::firstEntryText(Dict::SearchResult result)
+{
+ for (QStringList::Iterator it = result.results.begin(); it != result.results.end(); ++it)
+ {
+ if ((*it).left(5) != "DICT " && (*it).left(7) != "HEADER ")
+ return (*it);
+ }
+
+ return QString("NONE ");
+}
+
+///////////////////////////////////////////////////////////////
+
+Entry::Entry(const QString & kanji, const QString & reading, const QStringList &meanings)
+ : DictName(QString::fromLatin1("__NOTSET"))
+ , Header(QString::fromLatin1("__NOTSET"))
+ , Meanings(meanings)
+ , Kanji(kanji)
+ , KanaOnly(reading.isEmpty())
+ , Readings(KanaOnly ? kanji : reading)
+ , ExtendedKanjiInfo(false)
+ , Grade(0)
+ , Strokes(0)
+ , Miscount(0)
+ , Freq(0)
+{
+}
+
+Entry::Entry(const QString &kanji, QStringList &readings, QStringList &meanings, unsigned int grade, unsigned int freq, unsigned int strokes, unsigned int miscount)
+ : DictName(QString::fromLatin1("__NOTSET"))
+ , Header(QString::fromLatin1("__NOTSET"))
+ , Meanings(meanings)
+ , Kanji(kanji)
+ , KanaOnly(false)
+ , Readings(readings)
+ , ExtendedKanjiInfo(true)
+ , Grade(grade)
+ , Strokes(strokes)
+ , Miscount(miscount)
+ , Freq(freq)
+{
+}
+
+Entry::Entry(const QString &dictname)
+ : KanaOnly(true)
+ , ExtendedKanjiInfo(false)
+{
+ DictName = dictname;
+}
+
+Entry::Entry(const QString &headername, bool)
+ : DictName(QString::fromLatin1("__NOTSET"))
+ , Header(headername)
+ , KanaOnly(true)
+ , ExtendedKanjiInfo(false)
+{
+}
+
+QString Entry::dictName()
+{
+ return DictName;
+}
+
+QString Entry::header()
+{
+ return Header;
+}
+
+bool Entry::kanaOnly()
+{
+ return KanaOnly;
+}
+
+QString Entry::kanji()
+{
+ return Kanji;
+}
+
+QStringList Entry::readings()
+{
+ return Readings;
+}
+
+QString Entry::firstReading()
+{
+ return *Readings.at(0);
+}
+
+QStringList Entry::meanings()
+{
+ return Meanings;
+}
+
+unsigned int Entry::grade()
+{
+ return Grade;
+}
+
+unsigned int Entry::freq()
+{
+ return Freq;
+}
+
+unsigned int Entry::miscount()
+{
+ return Miscount;
+}
+
+unsigned int Entry::strokes()
+{
+ return Strokes;
+}
+
+bool Entry::extendedKanjiInfo()
+{
+ return ExtendedKanjiInfo;
+}
+
+#include "dict.moc"