1 files changed, 876 insertions, 0 deletions
diff --git a/kiten/dict.cpp b/kiten/dict.cpp
new file mode 100644
index 00000000..ad282807
--- /dev/null
+++ b/kiten/dict.cpp
@@ -0,0 +1,876 @@
+/**
+ This file is part of Kiten, a KDE Japanese Reference Tool...
+ Copyright (C) 2001  Jason Katz-Brown <jason@katzbrown.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301
+ USA
+**/
+
+#include <kdebug.h>
+#include <klocale.h>
+#include <kmessagebox.h>
+#include <kprocess.h>
+#include <kstandarddirs.h>
+
+#include <qfileinfo.h> 
+#include <qregexp.h>
+#include <qtextcodec.h>
+
+#include "dict.h"
+
+#include <iostream>
+#include <cassert>
+#include <sys/mman.h> 
+#include <stdio.h>
+
+namespace
+{
+/**
+ * Show an error message box.
+ *
+ * @param msg  text to display; run through QString::arg() when @p dict is given
+ * @param dict value substituted into @p msg; a null string (the default)
+ *             leaves @p msg untouched
+ */
+void msgerr(const QString &msg, const QString &dict = QString::null)
+{
+	const QString text = dict.isNull() ? msg : msg.arg(dict);
+	KMessageBox::error(0, text);
+}
+}
+
+using namespace Dict;
+
+/**
+ * Classify @p text by the Unicode value of its first character:
+ * below 0x3000 is Latin, 0x3000..0x30FF (CJK punctuation, hiragana,
+ * katakana) is kana, and anything from 0x3100 upwards counts as kanji.
+ */
+TextType Dict::textType(const QString &text)
+{
+	const ushort code = text.at(0).unicode();
+
+	if (code >= 0x3100)
+		return Text_Kanji;
+	if (code >= 0x3000)
+		return Text_Kana;
+	return Text_Latin;
+}
+
+/**
+ * Open the dictionary at @p path together with its .xjdx index and map
+ * both files read-only into memory.
+ *
+ * The index is (re)generated by running the external "kitengen" program,
+ * blocking until it finishes, when the index file does not exist or when
+ * its header word does not match the dictionary size and index version.
+ *
+ * On any failure an error box is shown and the object stays invalid
+ * (isValid() returns false); the destructor releases whatever was mapped.
+ *
+ * @param path location of the dictionary file
+ * @param n    name reported by name()
+ */
+File::File(QString path, QString n)
+	: myName(n)
+	, dictFile(path)
+	, dictPtr((const unsigned char *)MAP_FAILED)
+	, indexFile(KGlobal::dirs()->saveLocation("data", "kiten/xjdx/", true) + QFileInfo(path).baseName() + ".xjdx")
+	, indexPtr((const uint32_t *)MAP_FAILED)
+	, valid(false)
+{
+	bool forceUpdate = false;
+
+	const bool indexFileExists = indexFile.exists();
+	if (indexFileExists)
+	{
+		// ### change this if need be!!
+		const int indexFileVersion = 14;
+
+		// this up-to-date check comes from xjdservcomm.c: the first
+		// 32-bit word of the index must equal
+		// (dictionary size + 1) + indexFileVersion, otherwise the
+		// index needs to be remade
+		QFile dictionary(path);
+		int dictionaryLength = dictionary.size();
+		dictionaryLength++;
+
+		// Read the header word through a scoped QFile so the handle is
+		// always closed again, and treat an unreadable or truncated
+		// index as stale instead of comparing an uninitialised value.
+		int32_t testWord = 0;
+		bool headerRead = false;
+		QFile header(indexFile.name());
+		if (header.open(IO_ReadOnly))
+		{
+			headerRead = header.readBlock((char *)&testWord, sizeof(testWord)) == (Q_LONG)sizeof(testWord);
+			header.close();
+		}
+
+		if (!headerRead || testWord != (dictionaryLength + indexFileVersion))
+			forceUpdate = true;
+	}
+
+	if (!indexFileExists || forceUpdate)
+	{
+		// run the index generator executable
+		KProcess proc;
+		proc << KStandardDirs::findExe("kitengen") << path << indexFile.name();
+		// TODO: put up a status dialog and event loop instead of blocking
+		proc.start(KProcess::Block, KProcess::NoCommunication);
+	}
+
+	if (!dictFile.open(IO_ReadOnly))
+	{
+		msgerr(i18n("Could not open dictionary %1."), path);
+		return;
+	}
+
+	dictPtr = (const unsigned char *)mmap(0, dictFile.size(), PROT_READ, MAP_SHARED, dictFile.handle(), 0);
+	if (dictPtr == (unsigned char*) MAP_FAILED)
+	{
+		msgerr(i18n("Memory error when loading dictionary %1."), path);
+		return;
+	}
+
+	// a failed generator run shows up here as a missing index file
+	if (!indexFile.open(IO_ReadOnly))
+	{
+		msgerr(i18n("Could not open index for dictionary %1."), path);
+		return;
+	}
+
+	indexPtr = (const uint32_t*)mmap(0, indexFile.size(), PROT_READ, MAP_SHARED, indexFile.handle(), 0);
+	if (indexPtr == (uint32_t*) MAP_FAILED)
+	{
+		msgerr(i18n("Memory error when loading dictionary %1's index file."), path);
+		return;
+	}
+
+	valid = true;
+}
+
+/**
+ * Unmap whichever of the two files the constructor managed to map,
+ * then close both files.
+ */
+File::~File()
+{
+	const bool dictMapped = (dictPtr != (unsigned char*) MAP_FAILED);
+	if (dictMapped)
+		munmap((char *)dictPtr, dictFile.size());
+	dictFile.close();
+
+	const bool indexMapped = (indexPtr != (uint32_t*) MAP_FAILED);
+	if (indexMapped)
+		munmap((char *)indexPtr, indexFile.size());
+	indexFile.close();
+}
+
+QString File::name(void) // Returns the name that was passed to the constructor.
+{
+	return myName;
+}
+
+Array<const unsigned char> File::dict(void) // Wraps the mapped dictionary bytes and the file size in an Array.
+{
+	assert(valid); // only meaningful after the constructor succeeded
+	return Array<const unsigned char>(dictPtr, dictFile.size());
+}
+
+Array<const uint32_t> File::index(void) // Wraps the mapped index words and the index file size in an Array.
+{
+	assert(valid); // only meaningful after the constructor succeeded
+	return Array<const uint32_t>(indexPtr, indexFile.size()); // NOTE(review): size is the file size in bytes, not the number of uint32_t words - confirm what Array expects
+}
+
+int File::dictLength(void) // Returns the size of the dictionary file as reported by QFile::size().
+{
+	return dictFile.size();
+}
+
+int File::indexLength(void) // Returns the size of the index file as reported by QFile::size() (not a count of index words).
+{
+	return indexFile.size();
+}
+
+bool File::isValid(void) // True only when the constructor opened and mapped both the dictionary and its index.
+{
+	return valid;
+}
+
+/**
+ * Return one byte of the dictionary, addressed relative to an index entry.
+ *
+ * The byte is read at position indexPtr[i] + offset - 1 of the mapped
+ * dictionary.
+ *
+ * @param i      position in the index
+ * @param offset signed distance from the byte the index entry refers to
+ * @return the byte at that position, or 10 (newline) when the position
+ *         lies outside the dictionary, so that callers scanning for line
+ *         boundaries stop at either end of the file
+ */
+unsigned char File::lookup(unsigned i, int offset)
+{
+	const uint32_t pos = indexPtr[i] + offset - 1;
+	// valid positions are 0 .. size - 1; pos == size is already one
+	// byte past the mapping and must be rejected as well
+	if (pos >= dictFile.size())
+		return 10;
+	return dictPtr[pos];
+}
+
+/**
+ * Return the dictionary text that index entry @p i refers to.
+ *
+ * The text starts at position indexPtr[i] - 1 of the mapped dictionary
+ * and runs up to, but not including, the next NUL byte, newline, or the
+ * end of the file.
+ *
+ * @param i position in the index
+ * @return the bytes as a NUL-terminated QCString; an empty QCString when
+ *         the entry points outside the dictionary
+ */
+QCString File::lookup(unsigned i)
+{
+	const uint32_t start = indexPtr[i] - 1;
+	const unsigned size = dictFile.size();
+
+	// an entry pointing outside the file has no text; bail out before
+	// any pointer arithmetic past the mapping
+	if (start >= size)
+		return QCString();
+
+	// find the end of the text; never touch dictPtr[size]
+	uint32_t pos = start;
+	while (pos < size && dictPtr[pos] != 0 && dictPtr[pos] != 0x0a)
+		++pos;
+
+	// QCString(str, maxsize) counts the terminating NUL in maxsize, so
+	// one extra byte is requested to keep the final character; the
+	// constructor appends the terminator itself
+	return QCString((const char *)(dictPtr + start), pos - start + 1);
+}
+
+// And last, Index itself is the API presented to the rest of Kiten.
+/**
+ * Turn on auto-delete for both dictionary lists so that File objects
+ * are deleted when they are removed from a list.
+ */
+Index::Index()
+	: QObject()
+{
+	kanjiDictFiles.setAutoDelete(true);
+	dictFiles.setAutoDelete(true);
+}
+
+Index::~Index() // Nothing to release here by hand.
+{
+}
+
+void Index::setDictList(const QStringList &list, const QStringList &names) // Reloads the word dictionaries from the paths in list, named by names.
+{
+	loadDictList(dictFiles, list, names);
+}
+
+void Index::setKanjiDictList(const QStringList &list, const QStringList &names) // Reloads the kanji dictionaries from the paths in list, named by names.
+{
+	loadDictList(kanjiDictFiles, list, names);
+}
+
+/**
+ * Replace the contents of @p fileList with a File for every path in
+ * @p dictList that loads successfully.
+ *
+ * @param fileList     list to refill; it is cleared first
+ * @param dictList     dictionary file paths; an error box is shown and
+ *                     nothing is loaded when this is empty
+ * @param dictNameList names matched to @p dictList by position; when it
+ *                     is shorter than @p dictList, the path itself is
+ *                     used as the name for the remaining dictionaries
+ */
+void Index::loadDictList(QPtrList<File> &fileList, const QStringList &dictList, const QStringList &dictNameList)
+{
+	fileList.clear();
+
+	// check if we have a dict
+	if (dictList.isEmpty())
+	{
+		msgerr(i18n("No dictionaries in list!"));
+		return;
+	}
+
+	QStringList::ConstIterator nameIt = dictNameList.begin();
+	for (QStringList::ConstIterator it = dictList.begin(); it != dictList.end(); ++it)
+	{
+		// never step the name iterator past its own end
+		QString dictName = *it;
+		if (nameIt != dictNameList.end())
+		{
+			dictName = *nameIt;
+			++nameIt;
+		}
+
+		File *f = new File(*it, dictName);
+		// our ugly substitute for exceptions: a File that failed to
+		// load reports !isValid() and is thrown away
+		if (f->isValid())
+			fileList.append(f);
+		else
+			delete f;
+	}
+}
+
+/**
+ * Look @p text up in one dictionary via a binary search over its index.
+ *
+ * @p text is converted to EUC-JP and compared with stringCompare(), which
+ * reports a match as soon as either string ends, so every index entry
+ * that @p text is a prefix of (or vice versa) counts as a hit.
+ *
+ * @param file dictionary to search; must be valid
+ * @param text search string
+ * @return the full dictionary line of every matching index entry, each
+ *         with "\n" appended and with directly repeated lines dropped;
+ *         an empty list when nothing matches or the eucJP codec is
+ *         unavailable
+ */
+QStringList Index::doSearch(File &file, const QString &text)
+{
+	QStringList results;
+
+	// codecForName() returns 0 when the codec is not available
+	QTextCodec *codec = QTextCodec::codecForName("eucJP");
+	if (!codec)
+		return results;
+	const QCString eucString = codec->fromUnicode(text);
+
+	assert(file.isValid());
+
+	// Number of 32-bit words in the index file. Word 0 is the header
+	// that File::File() checks against the dictionary size, so the
+	// searchable entries occupy positions 1 .. entryCount - 1.
+	const int entryCount = (int)(file.indexLength() / sizeof(uint32_t));
+	if (entryCount < 2)
+		return results;
+
+	int lo = 1;
+	int hi = entryCount - 1;
+	int cur = lo;
+	int comp = 1;
+
+	while (lo <= hi)
+	{
+		cur = lo + (hi - lo) / 2;
+		comp = stringCompare(file, cur, eucString);
+
+		if (comp == 0)
+			break;
+		if (comp < 0)
+			hi = cur - 1;
+		else
+			lo = cur + 1;
+	}
+
+	// no match
+	if (comp != 0)
+		return results;
+
+	// wheel back to make sure we get the first matching entry
+	while (cur > 1 && 0 == stringCompare(file, cur - 1, eucString))
+		--cur;
+
+	// output every matching entry
+	QString prevResult;
+	for (; cur < entryCount && 0 == stringCompare(file, cur, eucString); ++cur)
+	{
+		// the index doesn't point to the start of the line, so walk
+		// back to the byte just after the previous newline ...
+		int begin = 0;
+		while (file.lookup(cur, begin - 1) != 0x0a)
+			--begin;
+
+		// ... and forward to the newline that ends the line
+		// (lookup() reports a newline at both ends of the file)
+		int end = begin;
+		while (file.lookup(cur, end) != 0x0a)
+			++end;
+
+		// copy the line in one go now that its length is known
+		QByteArray bytes(end - begin);
+		for (int k = begin; k < end; ++k)
+			bytes[k - begin] = file.lookup(cur, k);
+
+		const QString result = codec->toUnicode(bytes) + QString("\n");
+		if (prevResult != result)
+		{
+			results.append(result);
+			prevResult = result;
+		}
+	}
+
+	return results;
+}
+
+/**
+ * Filter raw dictionary lines through @p regexp and parse the survivors.
+ *
+ * "DICT " and "HEADING " marker lines are always parsed into the entry
+ * list. Every other line must match @p regexp; when @p common is set it
+ * must also contain "(P)". Lines that pass are stored in both
+ * ret.results and ret.list.
+ *
+ * @return the result with count (lines kept), outOf (lines matching
+ *         @p regexp) and common filled in
+ */
+SearchResult Index::scanResults(QRegExp regexp, QStringList results, bool common)
+{
+	SearchResult ret;
+	unsigned int kept = 0;
+	unsigned int matched = 0;
+
+	QStringList::Iterator line = results.begin();
+	while (line != results.end())
+	{
+		const QString &raw = *line;
+		++line;
+
+		const bool marker = raw.left(5) == "DICT " || raw.left(8) == "HEADING ";
+		if (marker)
+		{
+			ret.list.append(parse(raw));
+			continue;
+		}
+
+		if (regexp.search(raw) < 0)
+			continue;
+
+		++matched;
+
+		// common-only searches keep just the lines flagged "(P)"
+		if (common && raw.find(QString("(P)")) < 0)
+			continue;
+
+		// appended here so that ret.results holds exactly the lines
+		// that were parsed into ret.list
+		ret.results.append(raw);
+		ret.list.append(parse(raw));
+		++kept;
+	}
+
+	ret.count = kept;
+	ret.outOf = matched;
+	ret.common = common;
+	return ret;
+}
+
+/**
+ * Search every loaded word dictionary for @p text and filter the hits
+ * with scanResults().
+ *
+ * A "DICT <name>" marker line precedes the raw hits of each dictionary.
+ */
+SearchResult Index::search(QRegExp regexp, const QString &text, bool common)
+{
+	QStringList gathered;
+
+	QPtrListIterator<File> it(dictFiles);
+	while (File *dictionary = it.current())
+	{
+		gathered.append(QString("DICT ") + dictionary->name());
+		gathered += doSearch(*dictionary, text);
+		++it;
+	}
+
+	SearchResult found = scanResults(regexp, gathered, common);
+	found.text = text;
+	return found;
+}
+
+/**
+ * Filter raw kanji-dictionary lines through @p regexp and parse the
+ * survivors with kanjiParse().
+ *
+ * "DICT " and "HEADING " marker lines are always parsed into the entry
+ * list. Every other line must match @p regexp; when @p common is set it
+ * must also carry a grade field G1..G8. ret.results receives the
+ * complete, unfiltered input list.
+ *
+ * @return the result with count (lines kept), outOf (lines matching
+ *         @p regexp) and common filled in
+ */
+SearchResult Index::scanKanjiResults(QRegExp regexp, QStringList results, bool common)
+{
+	unsigned int num = 0;
+	unsigned int fullNum = 0;
+	const bool jmyCount = false; // don't count JinMeiYou as common
+	// common entries have G[1-8] (jouyou); the pattern never changes,
+	// so it is built once rather than for every matching line
+	const QRegExp comregexp(jmyCount ? "G[1-9]" : "G[1-8]");
+	SearchResult ret;
+	ret.results = results;
+
+	for (QStringList::Iterator itr = results.begin(); itr != results.end(); ++itr)
+	{
+		if ((*itr).left(5) == "DICT " || (*itr).left(8) == "HEADING ")
+		{
+			ret.list.append(kanjiParse(*itr));
+			continue;
+		}
+
+		int found = regexp.search(*itr);
+
+		if (found >= 0)
+		{
+			++fullNum;
+			if ((*itr).find(comregexp) >= 0 || !common)
+			{
+				ret.list.append(kanjiParse(*itr));
+				++num;
+			}
+		}
+	}
+
+	ret.count = num;
+	ret.outOf = fullNum;
+	ret.common = common;
+	return ret;
+}
+
+/**
+ * Search every loaded kanji dictionary for @p text and filter the hits
+ * with scanKanjiResults().
+ *
+ * A "DICT <name>" marker line precedes the raw hits of each dictionary.
+ */
+SearchResult Index::searchKanji(QRegExp regexp, const QString &text,  bool common)
+{
+	QStringList gathered;
+
+	QPtrListIterator<File> it(kanjiDictFiles);
+	while (File *dictionary = it.current())
+	{
+		gathered.append(QString("DICT ") + dictionary->name());
+		gathered += doSearch(*dictionary, text);
+		++it;
+	}
+
+	SearchResult found = scanKanjiResults(regexp, gathered, common);
+	found.text = text;
+	return found;
+}
+
+/**
+ * Re-filter the raw lines of an earlier result with a new @p regexp.
+ *
+ * The kanji scanner is used when the first real entry of @p list carries
+ * extended kanji information, the word scanner otherwise.
+ */
+SearchResult Index::searchPrevious(QRegExp regexp, const QString &text, SearchResult list, bool common)
+{
+	const bool wasKanjiSearch = firstEntry(list).extendedKanjiInfo();
+
+	SearchResult refined;
+	if (!wasKanjiSearch)
+		refined = scanResults(regexp, list.results, common);
+	else
+		refined = scanKanjiResults(regexp, list.results, common);
+
+	refined.text = text;
+	return refined;
+}
+
+/**
+ * Build the regular expression used to filter dictionary lines.
+ *
+ * The pattern template depends on the search type, on the script of
+ * @p text as reported by textType(), and - for kana - on the kind of
+ * dictionary. @p text is inserted into the template as-is.
+ *
+ * @param type           where in the entry the text has to appear
+ * @param text           search text
+ * @param dictionaryType Kanjidict or the word dictionary
+ * @param caseSensitive  passed through to QRegExp
+ */
+QRegExp Dict::Index::createRegExp(SearchType type, const QString &text, DictionaryType dictionaryType, bool caseSensitive)
+{
+	QString pattern;
+
+	if (type == Search_Anywhere)
+	{
+		pattern = "%1";
+	}
+	else if (type == Search_Beginning || type == Search_FullWord)
+	{
+		const bool fullWord = (type == Search_FullWord);
+		const bool kanjidic = (dictionaryType == Kanjidict);
+
+		switch (textType(text))
+		{
+		case Dict::Text_Latin:
+			pattern = fullWord ? "\\W%1\\W" : "\\W%1";
+			break;
+
+		case Dict::Text_Kana:
+			if (kanjidic)
+				pattern = fullWord ? " %1 " : "\\W%1";
+			else // edict
+				pattern = fullWord ? "\\[%1\\]" : "\\[%1";
+			break;
+
+		case Dict::Text_Kanji:
+			pattern = fullWord ? "^%1\\W" : "^%1";
+			break;
+		}
+	}
+
+	return QRegExp(pattern.arg(text), caseSensitive);
+}
+
+/**
+ * Compare the dictionary text behind index position @p index with
+ * @p str using eucStringCompare().
+ */
+int Index::stringCompare(File &file, int index, QCString str)
+{
+	const QCString entry = file.lookup(index);
+	return eucStringCompare(entry, str);
+}
+
+/**
+ * Case-insensitive prefix comparison of two byte strings.
+ *
+ * Comparison stops with 0 as soon as either string ends, so a string
+ * always equals any longer string it is a prefix of. ASCII upper case is
+ * folded to lower case, and a byte 0xA5 at an even offset is treated as
+ * 0xA4 so that katakana and hiragana (EUC A5 and A4) match.
+ *
+ * @return 0 on a match, otherwise the difference between the first
+ *         mismatching bytes of @p str2 and @p str (str2 minus str)
+ */
+int Dict::eucStringCompare(const char *str, const char *str2)
+{
+	unsigned i = 0;
+	while (true)
+	{
+		unsigned char a = static_cast<unsigned char>(str[i]);
+		unsigned char b = static_cast<unsigned char>(str2[i]);
+
+		if (b == '\0' || a == '\0')
+			return 0;
+
+		const bool evenOffset = (i % 2) == 0;
+		if (evenOffset)
+		{
+			if (b == 0xA5)
+				b = 0xA4;
+			if (a == 0xA5)
+				a = 0xA4;
+		}
+
+		// fold ASCII upper case
+		if (b >= 'A' && b <= 'Z')
+			b |= 0x20;
+		if (a >= 'A' && a <= 'Z')
+			a |= 0x20;
+
+		if (b != a)
+			return (int)b - (int)a;
+
+		++i;
+	}
+}
+
+/** True when the high bit of @p c is set. */
+bool Dict::isEUC(unsigned char c)
+{
+	return (c & 0x80) != 0;
+}
+
+/**
+ * Turn one raw word-dictionary line into an Entry.
+ *
+ * "DICT x" and "HEADING x" marker lines become dictionary-name and
+ * header entries. For any other line, the text before '[' is the kanji,
+ * the text after '[' is the reading, and from the first '/' onwards
+ * every further '/' closes one meaning. Spaces are kept only inside
+ * meanings; ']' is ignored.
+ */
+Entry Dict::parse(const QString &raw)
+{
+	const unsigned int length = raw.length();
+	if (raw.left(5) == "DICT ")
+		return Entry(raw.right(length - 5));
+	if (raw.left(8) == "HEADING ")
+		return Entry(raw.right(length - 8), true);
+
+	enum Field { InKanji, InReading, InMeaning };
+	Field field = InKanji;
+	bool seenSlash = false;
+
+	QString kanji;
+	QString reading;
+	QString meaning;
+	QStringList meanings;
+
+	for (unsigned int pos = 0; pos < length; ++pos)
+	{
+		const QChar ch(raw.at(pos));
+
+		if (ch == '[')
+		{
+			field = InReading;
+			continue;
+		}
+		if (ch == ']')
+			continue;
+
+		if (ch == '/')
+		{
+			if (seenSlash)
+			{
+				// every slash after the first ends a meaning
+				meanings.append(meaning);
+				meaning = "";
+			}
+			else
+			{
+				seenSlash = true;
+				field = InMeaning;
+			}
+			continue;
+		}
+
+		if (ch == ' ')
+		{
+			// meanings are the only field that needs the space
+			if (field == InMeaning)
+				meaning += ' ';
+			continue;
+		}
+
+		switch (field)
+		{
+		case InKanji:
+			kanji += ch;
+			break;
+		case InMeaning:
+			meaning += ch;
+			break;
+		case InReading:
+			reading += ch;
+			break;
+		}
+	}
+
+	return Entry(kanji, reading, meanings);
+}
+
+/**
+ * Turn one raw kanji-dictionary line into an Entry.
+ *
+ * "DICT x" and "HEADING x" marker lines become dictionary-name and
+ * header entries. Any other line is read as space-separated fields:
+ * first the kanji, then detail fields that start with an ASCII letter or
+ * digit (G = grade, F = frequency, S = stroke count; a second S field is
+ * the common miscount), then readings, which start with the first field
+ * that begins with any other character. Meanings are the texts between
+ * '{' and '}'.
+ */
+Entry Dict::kanjiParse(const QString &raw)
+{
+	unsigned int length = raw.length();
+	if (raw.left(5) == "DICT ")
+		return Entry(raw.right(length - 5));
+	if (raw.left(8) == "HEADING ")
+		return Entry(raw.right(length - 8), true);
+
+	QStringList readings;
+	QString kanji;
+	QStringList meanings;
+	QString curmeaning;
+	QString curreading;
+
+	QString strfreq;
+	QString strgrade;
+	QString strstrokes;
+	QString strmiscount = "";
+
+	bool prevwasspace = true;
+	QChar detailname;
+	QCString parsemode("kanji");
+
+	// if there are two S entries, second is common miscount
+	bool strokesset = false;
+
+	// recognises the first character of a detail field; built once
+	// instead of once per character
+	QRegExp detailStart("[A-Za-z0-9]");
+
+	unsigned int i;
+	QChar ichar;
+	for (i = 0; i < length; i++)
+	{
+		ichar = raw.at(i);
+
+		if (ichar == ' ')
+		{
+			if (parsemode == "reading")
+			{
+				// the mode stays "reading": once readings have
+				// started, every further field is a reading
+				readings.append(curreading);
+				curreading = "";
+			}
+			else if (parsemode == "kanji")
+			{
+				parsemode = "misc";
+			}
+			else if (parsemode == "detail")
+			{
+				if (detailname == 'S')
+					strokesset = true;
+
+				parsemode = "misc";
+			}
+			else if (parsemode == "meaning")
+			{
+				curmeaning += ichar;
+			}
+			prevwasspace = true;
+		}
+		else if (ichar == '{')
+		{
+			parsemode = "meaning";
+		}
+		else if (ichar == '}')
+		{
+			meanings.append(curmeaning);
+			curmeaning = "";
+		}
+		else if (parsemode == "detail")
+		{
+			// detailname holds the field's first character; the
+			// characters after it are the field's value
+			if (detailname == 'G')
+			{
+				strgrade += ichar;
+			}
+			else if (detailname == 'F')
+			{
+				strfreq += ichar;
+			}
+			else if (detailname == 'S')
+			{
+				if (strokesset)
+					strmiscount += ichar;
+				else
+					strstrokes += ichar;
+			}
+			prevwasspace = false;
+		}
+		else if (parsemode == "kanji")
+		{
+			kanji += ichar;
+		}
+		else if (parsemode == "meaning")
+		{
+			curmeaning += ichar;
+		}
+		else if (parsemode == "reading")
+		{
+			curreading += ichar;
+		}
+		else if (parsemode == "misc" && prevwasspace)
+		{
+			if (detailStart.search(QString(ichar)) >= 0)
+				   // is non-japanese?
+			{
+				detailname = ichar;
+				parsemode = "detail";
+			}
+			else
+			{
+				curreading = QString(ichar);
+				parsemode = "reading";
+			}
+		}
+	}
+
+	// fields that never appeared are empty strings, which toUInt() turns into 0
+	return (Entry(kanji, readings, meanings, strgrade.toUInt(), strfreq.toUInt(), strstrokes.toUInt(), strmiscount.toUInt()));
+}
+
+/**
+ * Join the meanings into one string, each stripped of surrounding
+ * whitespace and separated by "; ".
+ */
+QString Dict::prettyMeaning(QStringList Meanings)
+{
+	QString joined;
+
+	QStringList::Iterator cur = Meanings.begin();
+	while (cur != Meanings.end())
+	{
+		joined.append((*cur).stripWhiteSpace());
+		joined.append("; ");
+		++cur;
+	}
+
+	// drop the separator left behind the final meaning
+	joined.truncate(joined.length() - 2);
+	return joined;
+}
+
+/**
+ * Join kanji readings into one display string separated by ", ".
+ *
+ * The special items "T1" and "T2" are replaced by the translated labels
+ * "In names: " and "As radical: "; no separator follows a label.
+ */
+QString Dict::prettyKanjiReading(QStringList Readings)
+{
+	QString html;
+
+	for (QStringList::Iterator it = Readings.begin(); it != Readings.end(); ++it)
+	{
+		const QString &item = *it;
+
+		if (item == "T1")
+		{
+			html += i18n("In names: ");
+		}
+		else if (item == "T2")
+		{
+			html += i18n("As radical: ");
+		}
+		else
+		{
+			html += item.stripWhiteSpace();
+			html += ", ";
+		}
+	}
+
+	// get rid of last ,
+	html.truncate(html.length() - 2);
+	return html;
+}
+
+/**
+ * Return the first entry of @p result whose dictionary name and header
+ * are both still "__NOTSET", i.e. the first entry built by one of the
+ * two content constructors. When there is none, an entry constructed
+ * from the name "__NOTHING" is returned.
+ */
+Dict::Entry Dict::firstEntry(Dict::SearchResult result)
+{
+	QValueListIterator<Dict::Entry> it = result.list.begin();
+	while (it != result.list.end())
+	{
+		Dict::Entry &candidate = *it;
+		const bool isMarker = candidate.dictName() != "__NOTSET" || candidate.header() != "__NOTSET";
+		if (!isMarker)
+			return candidate;
+		++it;
+	}
+
+	return Dict::Entry("__NOTHING");
+}
+
+/**
+ * Return the first raw result line that is not a "DICT " or "HEADING "
+ * marker line, or "NONE " when there is no such line.
+ */
+QString Dict::firstEntryText(Dict::SearchResult result)
+{
+	for (QStringList::Iterator it = result.results.begin(); it != result.results.end(); ++it)
+	{
+		// header lines use the "HEADING " prefix everywhere else in
+		// this file, so that is the prefix to skip here
+		if ((*it).left(5) != "DICT " && (*it).left(8) != "HEADING ")
+			return (*it);
+	}
+
+	return QString("NONE ");
+}
+
+///////////////////////////////////////////////////////////////
+
+/**
+ * Build a word-dictionary entry.
+ *
+ * When @p reading is empty the entry is flagged kana-only and @p kanji
+ * doubles as its single reading. Dictionary name and header are set to
+ * "__NOTSET"; the kanji-specific numbers are zeroed.
+ */
+Entry::Entry(const QString & kanji, const QString & reading, const QStringList &meanings)
+	: DictName(QString::fromLatin1("__NOTSET"))
+	, Header(QString::fromLatin1("__NOTSET"))
+	, Meanings(meanings)
+	, Kanji(kanji)
+	, KanaOnly(reading.isEmpty())
+	// decided from the parameter rather than from the KanaOnly member, so
+	// the result does not depend on the order in which members are declared
+	, Readings(reading.isEmpty() ? kanji : reading)
+	, ExtendedKanjiInfo(false)
+	, Grade(0)
+	, Strokes(0)
+	, Miscount(0)
+	, Freq(0)
+{
+}
+
+Entry::Entry(const QString &kanji, QStringList &readings, QStringList &meanings, unsigned int grade, unsigned int freq, unsigned int strokes, unsigned int miscount) // Builds a kanji-dictionary entry.
+	: DictName(QString::fromLatin1("__NOTSET")) // "__NOTSET" is what firstEntry() looks for to tell content entries from marker entries
+	, Header(QString::fromLatin1("__NOTSET"))
+	, Meanings(meanings)
+	, Kanji(kanji)
+	, KanaOnly(false)
+	, Readings(readings)
+	, ExtendedKanjiInfo(true) // this is the only constructor that sets the flag
+	, Grade(grade)
+	, Strokes(strokes)
+	, Miscount(miscount)
+	, Freq(freq)
+{
+}
+
+/**
+ * Build a marker entry that only carries a dictionary name.
+ *
+ * The header is left as a null string. The numeric fields are zeroed so
+ * that grade(), freq(), strokes() and miscount() never return
+ * indeterminate values.
+ */
+Entry::Entry(const QString &dictname)
+	: DictName(dictname)
+	, KanaOnly(true)
+	, ExtendedKanjiInfo(false)
+	, Grade(0)
+	, Strokes(0)
+	, Miscount(0)
+	, Freq(0)
+{
+}
+
+/**
+ * Build a marker entry that only carries a header text.
+ *
+ * The unnamed bool parameter only selects this overload. The numeric
+ * fields are zeroed so that grade(), freq(), strokes() and miscount()
+ * never return indeterminate values.
+ */
+Entry::Entry(const QString &headername, bool)
+	: DictName(QString::fromLatin1("__NOTSET"))
+	, Header(headername)
+	, KanaOnly(true)
+	, ExtendedKanjiInfo(false)
+	, Grade(0)
+	, Strokes(0)
+	, Miscount(0)
+	, Freq(0)
+{
+}
+
+QString Entry::dictName() // Dictionary name; "__NOTSET" unless the entry was built from a dictionary name.
+{
+	return DictName;
+}
+
+QString Entry::header() // Header text; "__NOTSET" for content entries, null for dictionary-name entries.
+{
+	return Header;
+}
+
+bool Entry::kanaOnly() // True for marker entries and for word entries that were given an empty reading.
+{
+	return KanaOnly;
+}
+
+QString Entry::kanji() // The kanji text passed to the constructor.
+{
+	return Kanji;
+}
+
+QStringList Entry::readings() // All readings; empty for marker entries.
+{
+	return Readings;
+}
+
+/**
+ * @return the first reading, or a null string when the entry has no
+ *         readings (as is the case for dictionary-name and header entries)
+ */
+QString Entry::firstReading()
+{
+	// at(0) on an empty list yields end(), which must not be dereferenced
+	if (Readings.isEmpty())
+		return QString();
+	return *Readings.at(0);
+}
+
+QStringList Entry::meanings() // All meanings passed to the constructor.
+{
+	return Meanings;
+}
+
+unsigned int Entry::grade() // Value parsed from the G field of a kanji line; 0 for word entries.
+{
+	return Grade;
+}
+
+unsigned int Entry::freq() // Value parsed from the F field of a kanji line; 0 for word entries.
+{
+	return Freq;
+}
+
+unsigned int Entry::miscount() // Value parsed from a second S field of a kanji line; 0 for word entries.
+{
+	return Miscount;
+}
+
+unsigned int Entry::strokes() // Value parsed from the first S field of a kanji line; 0 for word entries.
+{
+	return Strokes;
+}
+
+bool Entry::extendedKanjiInfo() // True only for entries built by the kanji-dictionary constructor.
+{
+	return ExtendedKanjiInfo;
+}
+
+#include "dict.moc"