summaryrefslogtreecommitdiffstats
path: root/kbabel/kbabeldict/modules/dbsearchengine2/database.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'kbabel/kbabeldict/modules/dbsearchengine2/database.cpp')
-rw-r--r--kbabel/kbabeldict/modules/dbsearchengine2/database.cpp752
1 files changed, 752 insertions, 0 deletions
diff --git a/kbabel/kbabeldict/modules/dbsearchengine2/database.cpp b/kbabel/kbabeldict/modules/dbsearchengine2/database.cpp
new file mode 100644
index 00000000..ea0e8379
--- /dev/null
+++ b/kbabel/kbabeldict/modules/dbsearchengine2/database.cpp
@@ -0,0 +1,752 @@
+/*
+
+DBSE 3
+(c) 2000-2003 Andrea Rizzi
+License: GPLv2
+
+*/
+#include <math.h>
+#include "database.h"
+
+#include <qregexp.h>
+#include <qdict.h>
+#include <kapplication.h>
+#include <kdebug.h>
+#include <kmessagebox.h>
+
+#define i18n (const char*) // i18n("x") expands to (const char*)("x") in this file: a plain cast, no translation lookup
+
+
+
+
+
+
+/*
+ * Prepares a handle for one database; nothing is opened here.
+ * The file name becomes "<dbpath>.<dblang>.db" and dbname is remembered
+ * as the database name that open() hands to Db::open().
+ */
+DataBase::DataBase(QString dbpath,QString dbname, QString dblang) : Db(0,DB_CXX_NO_EXCEPTIONS)
+{
+    database=dbname;
+
+    filename=dbpath;
+    filename+=".";
+    filename+=dblang;
+    filename+=".db";
+}
+
+/*
+ * Opens the database named in the constructor with the given access
+ * method and flags (file mode 0644) and remembers the access method in
+ * mytype. Returns whatever Db::open() returned.
+ */
+int DataBase::open(DBTYPE type,unsigned int flags)
+{
+    // 8-bit encoded copies of the names, kept alive for the whole call
+    const QCString file8=filename.local8Bit();
+    const QCString name8=database.local8Bit();
+
+#if DB_VERSION_MINOR > 0
+    // these library versions take one extra leading argument, passed as NULL
+    const int status=Db::open(NULL,file8.data(),name8.data(),type,flags,0644);
+#else
+    const int status=Db::open(file8.data(),name8.data(),type,flags,0644);
+#endif
+
+    mytype=type;
+    return status;
+}
+
+/*
+ * Returns the record number of the last record of a DB_RECNO database.
+ * Returns 0 for any other access method, when no cursor can be obtained
+ * or when there is no last record to read.
+ */
+unsigned int DataBase::getLast()
+{
+    if(mytype!=DB_RECNO)
+        return 0;
+
+    Dbc *cur=0;
+    if(cursor(0,&cur,0)!=0 || cur==0)
+        return 0;
+
+    DBItemNum index;
+    DBItemMainKey key;
+    const int ret=cur->get(&index,&key,DB_LAST);
+
+    // the cursor was never released before; close it on every path
+    cur->close();
+
+    if(ret!=0)
+        return 0;
+    return index.getNum();
+}
+
+
+
+
+
+/*
+ * Result that carries only a translation. The score is set to 0 so that
+ * it is never read uninitialised.
+ */
+QueryResult::QueryResult(QString r)
+{
+    res=r;
+    sco=0;
+}
+/*
+ * Full result: translation r, original o and score s. The rich variants
+ * of both strings start out as copies of the plain ones.
+ */
+QueryResult::QueryResult(QString r,QString o,int s)
+{
+    sco=s;
+
+    orig=o;
+    richo=o;
+
+    res=r;
+    richr=r;
+}
+
+/*
+ * Empty result: empty translation and a score of 0 (the score used to be
+ * left uninitialised).
+ */
+QueryResult::QueryResult()
+{
+    res="";
+    sco=0;
+}
+
+
+
+
+/*
+ * Opens the seven databases below dir (asking to create them when they
+ * are missing, see openMyDataBase()) and stores the settings pointer.
+ * The language code is currently hard-wired to "it".
+ */
+DataBaseInterface::DataBaseInterface(QString dir, DBSESettings *sets)
+{
+    //FIXME Better db names!!
+    const QString lang("it");
+    const QString base=dir+"/test";
+
+    main       = openMyDataBase(base+"m","main",lang,DB_BTREE);
+    alpha      = openMyDataBase(base+"a","alpha",lang,DB_BTREE);
+    numindex   = openMyDataBase(base+"n","numindex",lang,DB_RECNO);
+    wordsindex = openMyDataBase(base+"w","wordsindex",lang,DB_BTREE);
+    sentence   = openMyDataBase(base+"s","sentence",lang,DB_BTREE);
+    corr       = openMyDataBase(base+"c","corr",lang,DB_BTREE);
+    transword  = openMyDataBase(base+"t","transword",lang,DB_RECNO);
+
+    settings=sets;
+    _stopNow=false;
+}
+
+/*
+ * Closes and frees every database handle obtained in the constructor.
+ * Handles that are null (openMyDataBase() returned 0) are skipped.
+ */
+DataBaseInterface::~DataBaseInterface()
+{
+    if(main){
+        main->close(0);
+        delete main;
+    }
+    if(numindex){
+        numindex->close(0);
+        delete numindex;
+    }
+    if(alpha){
+        alpha->close(0);
+        delete alpha;
+    }
+    if(wordsindex){
+        wordsindex->close(0);
+        delete wordsindex;
+    }
+    if(sentence){
+        sentence->close(0);
+        delete sentence;
+    }
+    // corr and transword are opened by the constructor too, but were
+    // never released before.
+    if(corr){
+        corr->close(0);
+        delete corr;
+    }
+    if(transword){
+        transword->close(0);
+        delete transword;
+    }
+}
+
+/*
+ * Opens the database "<prefix>.<l>.db" / name with access method tt.
+ *
+ * If the plain open fails the user is asked once per process (the answer
+ * is kept in a function-local static) whether missing databases should be
+ * created; on "yes" the open is retried with DB_CREATE.
+ *
+ * Returns the handle, or 0 when creating the database failed. In that
+ * case the handle is now closed and freed instead of being leaked.
+ */
+DataBase *DataBaseInterface::openMyDataBase(const QString& prefix,const QString& name,const QString& l,DBTYPE tt)
+{
+    DataBase *aDb = new DataBase(prefix,name,l);
+    if(aDb==0)
+        return 0;
+
+    if(aDb->open(tt)==0)
+        return aDb;
+
+    kdDebug(0) << "Database '"<< name <<"'do not exist, I try to create it.." << endl;
+    //ask only the first time.
+    static bool create=( KMessageBox::questionYesNo(0,"Database do not exist. Do you want to create it now?",
+        i18n("Create Database"), i18n("Create"), i18n("Do Not Create"))==KMessageBox::Yes);
+
+    if(!create)
+    {
+        // NOTE(review): as before, the handle is returned although it was
+        // never opened successfully; callers only test it against 0.
+        return aDb;
+    }
+
+    if(aDb->open(tt,DB_CREATE)!=0)
+    {
+        kdDebug(0) << "...cannot create!!"<< endl;
+        aDb->close(0);
+        delete aDb;
+        return 0;
+    }
+
+    kdDebug(0) << "...done!" << endl;
+    return aDb;
+}
+
+/*
+ * query functions.
+ *
+ */
+
+
+/*
+ * Looks query up in the main database and returns the (key, data) pair.
+ * The result of the lookup is not checked and filter is not used.
+ * Every fifth call pending application events are processed.
+ */
+DataBaseInterface::MainEntry DataBaseInterface::get(const QString& query,SearchFilter *filter)
+{
+    static int counter=1;
+
+    DBItemMainKey k(query);
+    DBItemMainData d;
+    main->get(&k,&d);
+
+    if((++counter)%5==0)
+        kapp->processEvents(100);
+
+    return qMakePair(k,d);
+}
+
+/*
+ * put functions
+ * *
+ */
+
+
+/*
+ * Stores the pair (original, translated) for the catalog referenced by
+ * info->ref().
+ *
+ * When original is not yet in the main database a new record number is
+ * allocated and the numeric, alpha and word indexes are updated. When
+ * both strings split into the same number (more than one) of sentences,
+ * the sentence database is updated pairwise. Finally the translation is
+ * attached to the main record, replacing what info->ref() had stored.
+ *
+ * Returns true when the final write to the main database succeeds; the
+ * results of the index writes are not checked.
+ */
+bool DataBaseInterface::addEntry(QString original,QString translated,InputInfo *info)
+{
+    DBItemMainKey mk(original);
+    DBItemMainData md;
+    QMap<QString, int> correlationDiff;
+
+    kdDebug(0) << "Inserting the pair:" << endl;
+    kdDebug(0) << "ORIGINAL:" << original << endl;
+    kdDebug(0) << "TRANSLATED:" << translated << endl;
+
+    //try to get
+    const bool newentry=(main->get(&mk,&md)==DB_NOTFOUND);
+    if(!newentry)
+    {
+        kdDebug(0) << "It exists!" <<endl;
+    }
+    else
+    {
+        kdDebug(0) << "new entry" << endl;
+
+        //This is a new entry, create index entry
+        int newid=numindex->getLast()+1;
+        DBItemNum nind(newid);
+        numindex->put(&nind,&mk);
+
+        md.clear();
+        md.setIndexnumber(newid);
+
+        //Update secondary index alpha
+        DBItemMainKey ka(simple(original));
+        DBItemMultiIndex in;
+        if(alpha->get(&ka,&in)==DB_NOTFOUND)
+            in.clear();
+        in.addEntry(newid);
+        alpha->put(&ka,&in);
+
+        kdDebug(0) << "Updating the word index " << endl;
+        //Update words index
+        QStringList ws=words(original);
+        for(QStringList::iterator wordIt=ws.begin(); wordIt!=ws.end(); ++wordIt)
+        {
+            DBItemMainKey word(*wordIt);
+            DBItemMultiIndex win;
+            if(wordsindex->get(&word,&win)==DB_NOTFOUND)
+                win.clear();
+            win.addEntry(newid);
+            wordsindex->put(&word,&win);
+        }
+
+        kdDebug(0) << "new entry preparation DONE" << endl;
+    }
+
+    //Update sentence index
+    QStringList so=sentences(original);
+    QStringList st=sentences(translated);
+    //we already have a database for single strings, so only multi-sentence pairs go here
+    if(st.count()>1 && so.count()==st.count())
+    {
+        kdDebug(0) << "inside sentence loop" << endl;
+        for(uint i=0; i<so.count(); ++i)
+        {
+            DBItemMainKey sk(so[i]);
+            DBItemMainData sd;
+            const bool missing=(sentence->get(&sk,&sd)==DB_NOTFOUND);
+            if(missing && !newentry)
+                kdDebug(0) << "Warning: new sentence for old entry, do we changed sentence definition? " << endl;
+
+            kdDebug(0) << "here alive" << endl;
+
+            // if(clean)
+            sd.removeRef(info->ref());
+            kdDebug(0) << "now alive" << endl;
+            sd.addTranslation(st[i],info->ref());
+            kdDebug(0) << "still alive" << endl;
+
+            sentence->put(&sk,&sd);
+        }
+    }
+    kdDebug(0) << "Fuzzy sentence archive updated" << endl;
+
+    //Add that translation, link to ref for information on that translation
+    if(!translated.isEmpty())
+    {
+        //words of all translations stored so far count negatively...
+        //(QMap::operator[] creates a missing entry with value 0 first)
+        QStringList tmpTranslations=md.getTranslations();
+        for(QStringList::iterator trIt=tmpTranslations.begin(); trIt!=tmpTranslations.end(); ++trIt)
+        {
+            QStringList wt=words(*trIt);
+            for(QStringList::iterator wIt=wt.begin(); wIt!=wt.end(); ++wIt)
+                correlationDiff[*wIt]-=1;
+        }
+
+        //clean so that we have only one translation per catalog.
+        md.removeRef(info->ref());
+        md.addTranslation(translated,info->ref());
+
+        //...and words of the translations after the update count positively
+        tmpTranslations=md.getTranslations();
+        for(QStringList::iterator trIt=tmpTranslations.begin(); trIt!=tmpTranslations.end(); ++trIt)
+        {
+            QStringList wt=words(*trIt);
+            for(QStringList::iterator wIt=wt.begin(); wIt!=wt.end(); ++wIt)
+                correlationDiff[*wIt]+=1;
+        }
+
+        //FIXME: use the correlationDiff map somehow
+    }
+
+    //finally put!
+    const int status=main->put(&mk,&md);
+    return status==0;
+}
+
+
+/*
+ * Not implemented yet: looks original up in the main database and always
+ * returns false.
+ */
+bool DataBaseInterface::removeEntry(QString original)
+{
+    //FIXME implement remove
+    DBItemMainKey mk(original);
+    DBItemMainData md;
+
+    //try to get
+    if(main->get(&mk,&md)==DB_NOTFOUND)
+    {
+        // Nothing is done here yet. The previous placeholder was a
+        // commented-out copy of the index creation code of addEntry()
+        // (numindex, alpha, wordsindex and sentence updates).
+    }
+
+    return false;
+}
+
+
+
+/*
+ * Scores the words found in the translations of every entry whose
+ * original contains word.
+ *
+ * For each such entry and each of its translations, a translation word
+ * receives a weight of 100000/sqrt(10*words+1) when it occurs exactly as
+ * often in the translation as word occurs in the original. The summed
+ * weights are divided by a background term and only words scoring above
+ * minSign are returned. notify is not used.
+ *
+ * Returns an empty map when word is not in the word index.
+ *
+ * The key/index objects now live on the stack and both dictionaries own
+ * their counters (auto-delete); previously all of them were leaked.
+ */
+QMap<QString,float> DataBaseInterface::correlation(QString word,SearchFilter *filter,bool notify, float minSign)
+{
+    QDict<unsigned int> res;
+    res.setAutoDelete(true);
+    QMap<QString, float>final;
+    unsigned int background=0;
+
+    const QString sword=simple(word);
+    DBItemMainKey k(sword);
+    DBItemMultiIndex d;
+    if(wordsindex->get(&k,&d)==DB_NOTFOUND)
+        return final;
+
+    DBItemMultiIndex::IndexList il=d.getList();
+    kdDebug(0) << il.count()<<endl;
+
+    for(QValueList<unsigned int>::iterator it=il.begin();it!=il.end();++it)
+    {
+        // k is reused: it now receives the original stored under this index
+        numindex->get(*it,&k);
+
+        MainEntry e=get(k.getString(),filter);
+        QStringList trad=e.second.getTranslations();
+
+        // occurrences of the searched word in this original
+        const unsigned int nocck=words(k.getString()).contains(sword);
+
+        for(QStringList::iterator it2=trad.begin();it2!=trad.end();++it2)
+        {
+            QStringList w=words(*it2);
+            unsigned int numWords = w.count()*10+1;
+            unsigned int wei=100000/sqrt((double)numWords); //weight (is the best one?)
+
+            background+=(numWords-nocck)*wei;
+
+            // occurrences of every word inside this translation
+            QDict<uint> count;
+            count.setAutoDelete(true);
+            for(QStringList::iterator it1=w.begin();it1!=w.end();++it1)
+            {
+                uint *ip=count[*it1];
+                if(!ip)
+                    count.insert(*it1,new uint(1));
+                else
+                    (*ip)++;
+            }
+
+            for(QStringList::iterator it1=w.begin();it1!=w.end();++it1)
+            {
+                //add only if same number of entry (it cuts articles)
+                if(*(count[*it1])!=nocck)
+                    continue;
+
+                uint *ip=res[*it1];
+                if(!ip)
+                    res.insert(*it1,new uint(wei));
+                else
+                    (*ip)+=wei;
+            }
+        }
+    }
+
+    // NOTE(review): the square root is truncated to an integer as before;
+    // a value of 0 would make every score below infinite - confirm intent.
+    unsigned int sqrBG=sqrt((1.0*background+1)/10000);
+
+    for(QDictIterator<uint> it(res) ; it.current(); ++it)
+    {
+        float sign=1.0*(*(it.current()))/(10000.0*sqrBG);
+        if(sign >minSign){
+            final[it.currentKey()]=sign;
+            kdDebug(0) << it.currentKey() <<" Score:" << 1.0*(*(it.current()))/10000 << "/" <<sqrBG << " = " <<sign << endl;
+        }
+    }
+
+    kdDebug(0) << "final count " <<final.count()<< endl;
+
+    return final;
+}
+
+/*
+ * Splits s, after normalising it with simple(), at white space and
+ * returns the non-empty pieces in order.
+ */
+QStringList DataBaseInterface::words(QString s)
+{
+    const QRegExp whiteSpace("\\s");
+    QString rest=simple(s);
+    QStringList list;
+
+    int pos;
+    do {
+        pos=rest.find(whiteSpace);
+        // with pos == -1 left() yields the whole remaining string
+        const QString head=rest.left(pos);
+        if(!head.isEmpty())
+            list.append(head);
+        rest.remove(0,pos+1);
+    } while(pos != -1 && !rest.isEmpty());
+
+    return list;
+}
+
+/*
+ * Normalises a string for use as an index key.
+ *
+ * - lower-cases it unless ck ("case keep") is true;
+ * - replaces a <tag>...</tag> pair by its content;
+ * - keeps letters, digits and '%';
+ * - turns a run of white space, '-', '\'' or '_' that follows a kept
+ *   character into one blank and drops every other character;
+ * - removes the single trailing blank that can remain.
+ *
+ * The trailing blank used to be tested at the index of the last input
+ * character, so it survived whenever any character had been dropped
+ * (and the index underflowed for an empty string).
+ */
+QString DataBaseInterface::simple(QString str,bool ck)
+{
+    QString res;
+    if(ck)
+        res=str; //case keep
+    else
+        res=str.lower(); //lowercase
+    //FIXME: check the speed of the following line
+    res=res.replace(QRegExp("(<(.*)>)(.*)(</\\2>)"),"\\3"); //remove enclosing tags
+
+    //Done by hand to get rid of regexps.
+    QString r;
+    bool wasSpace=true; // true at the start so that leading separators are dropped
+    const uint len=res.length();
+    for(uint i=0; i<len; i++)
+    {
+        const QChar c=res[i];
+        if(c.isLetterOrNumber() || c=='%')
+        {
+            r+=c;
+            wasSpace=false;
+        }
+        else if(!wasSpace && (c.isSpace() || c=='-' || c=='\'' || c=='_'))
+        {
+            r+=' ';
+            wasSpace=true;
+        }
+        // every other character is dropped
+    }
+
+    // blanks are never doubled, so at most one can be left at the end
+    const uint rlen=r.length();
+    if(rlen>0 && r[rlen-1].isSpace())
+        r.truncate(rlen-1);
+
+    return r;
+}
+
+/*
+ * Cuts s into sentences. A sentence ends at '.', ';', '?', '!' or ':'
+ * when that character is followed by a blank, the end of the string or
+ * the other alternative of the pattern below. The separators are not
+ * part of the returned pieces; each piece is stripped of surrounding
+ * white space before it is appended.
+ */
+QStringList DataBaseInterface::sentences(QString s)
+{
+    QRegExp re("((\\.|;|\\?|\\!|:)( |$|\\\\n\\n))");
+    QString rest=s;
+    QStringList list;
+
+    int pos;
+    do {
+        pos=re.search(rest);
+        // with pos == -1 left() yields the whole remaining string
+        const QString head=rest.left(pos);
+        if(!head.isEmpty())
+            list.append(head.stripWhiteSpace());
+
+        kdDebug(0) << head << endl;
+
+        rest.remove(0,pos+re.cap(1).length());
+    } while(pos != -1 && !rest.isEmpty());
+
+    return list;
+}
+
+/*
+ * Returns, in order, the separators matched in s: one of ".:?!;"
+ * followed by a blank, the end of the string or the other alternative
+ * of the pattern below.
+ */
+QStringList DataBaseInterface::sentencesSeparator(QString s)
+{
+    QRegExp re("([.:?!;]( |$|\\\\n\\n))");
+    QString rest=s;
+    QStringList list;
+
+    int pos;
+    do {
+        pos=re.search(rest);
+        if(pos!=-1)
+            list.append(re.cap(1));
+
+        // drops the text up to and including the first matched character
+        rest.remove(0,pos+1);
+    } while(pos != -1 && !rest.isEmpty());
+
+    return list;
+}
+
+/* True when upper-casing s leaves it unchanged (also true for characters without case). */
+bool DataBaseInterface::isUpper(QChar s)
+{
+    const QChar upperForm=s.upper();
+    return upperForm==s;
+}
+
+/* True when lower-casing s leaves it unchanged (also true for characters without case). */
+bool DataBaseInterface::isLower(QChar s)
+{
+    const QChar lowerForm=s.lower();
+    return lowerForm==s;
+}
+
+
+
+/*
+ * Makes _s look like the template t and returns the result:
+ *  - the first character takes the case of the first word character of t;
+ *  - the whole string is upper-cased when t is all upper case;
+ *  - when t contains an accelerator ("&x", but not "&&") an '&' is put
+ *    in front of the first occurrence of that character, or at the very
+ *    start when the character does not occur;
+ *  - trailing punctuation and an enclosing tag pair of t are copied by
+ *    the two formatRegExp() calls.
+ *
+ * Fixes: an empty _s no longer gets a character written at index 0; the
+ * accelerator character is searched case-insensitively (the old call
+ * passed `false` as the start index, not as the case flag); a template
+ * without any word character no longer indexes position -1.
+ */
+QString DataBaseInterface::format(QString _s,QString t)
+{
+    //FIXME use settings
+    //FIXME use regexp
+
+    QString s=_s;
+    // NOTE(review): QString::replace() works in place, so from here on t
+    // itself has lost its enclosing tag pair. Kept as it was - confirm
+    // whether the tag-restoring formatRegExp() call below relies on it.
+    QString noTagT=t.replace(QRegExp("(<(.*)>)(.*)(</\\2>)"),"\\3");
+
+    // stays a null character when t has no word character; isUpper()
+    // reports true for it, as the out-of-range read did before
+    QChar first;
+    const int firstWordPos=noTagT.find(QRegExp("\\w"));
+    if(firstWordPos>=0)
+        first=noTagT[firstWordPos];
+    const bool firstCapital=isUpper(first);
+
+    const bool allupper=(t.upper()==t);
+
+    if(!s.isEmpty())
+    {
+        if(firstCapital)
+            s[0]=s[0].upper();
+        else
+            s[0]=s[0].lower();
+    }
+
+    if(allupper)
+        s=s.upper();
+
+    int pos=t.find('&');
+    if(pos>=0) {
+        QChar accel=t[pos+1];
+        if(accel!='&')
+        {
+            pos=s.find(accel,0,false);
+            if(pos<0)
+                pos=0;
+            s.insert(pos,"&");
+        }
+    }
+
+    // copy trailing "...", ":", ">>", "<<", "." or "?" from the template
+    s=formatRegExp(s,t,".*(\\.\\.\\.|:|>>|<<|\\.|\\?)$",
+                   "^(.*)$",
+                   "\\1@CAP1@");
+    // copy an enclosing tag pair (and punctuation before the closing tag)
+    s=formatRegExp(s,t,"(<(.*)>).*(\\.\\.\\.|:|>>|<<|\\.|\\?)*(</\\2>)$",
+                   "^(.*)$",
+                   "@CAP1@\\1@CAP3@@CAP4@");
+
+    return s;
+}
+
+
+/*
+ * Template-driven search and replace.
+ *
+ * When the template t matches tre completely, every "@CAPn@" in
+ * stringSearch and stringReplace is replaced by the n-th captured text
+ * of that match (n = 0 is the whole match); then stringSearch is
+ * applied to _s as a regular expression with stringReplace as the
+ * replacement ("\\1", "\\2", ... refer to captures in _s).
+ * When t does not match, _s is returned unchanged.
+ */
+QString DataBaseInterface::formatRegExp(QString _s, QString t, QString tre,QString stringSearch,QString stringReplace)
+{
+    QRegExp templateRegExp(tre);
+    if(!templateRegExp.exactMatch(t))
+        return _s;
+
+    const QStringList caps=templateRegExp.capturedTexts();
+    int capIndex=0;
+    for(QStringList::ConstIterator capit=caps.begin(); capit!=caps.end(); ++capit, ++capIndex)
+    {
+        const QRegExp placeHolder("(?!\\\\)@CAP"+QString::number(capIndex)+"@");
+        stringReplace.replace(placeHolder,*capit);
+        stringSearch.replace(placeHolder,*capit);
+    }
+
+    QString s=_s;
+    s.replace(QRegExp(stringSearch),stringReplace);
+    return s;
+}
+
+/*
+ * Returns the index list stored in the alpha database under the
+ * simple() form of query. The result of the lookup is not checked.
+ * Key and data objects now live on the stack; they used to be allocated
+ * with new and never freed.
+ */
+DBItemMultiIndex::IndexList DataBaseInterface::getAlpha( const QString & query )
+{
+    DBItemMainKey k(simple(query));
+    DBItemMultiIndex d;
+    alpha->get(&k,&d);
+
+    return d.getList();
+}
+
+/*
+ * Resolves record number i to its original string through numindex and
+ * returns the main entry stored for that string.
+ */
+DataBaseInterface::MainEntry DataBaseInterface::getFromIndex( uint i )
+{
+    DBItemMainKey key;
+    numindex->get(i,&key);
+
+    const QString original=key.getString();
+    //FIXME: this is a BUG right now but the filter should be removed
+    return get(original,0);
+}
+
+/*
+ * Looks query up in the sentence database and returns the (key, data)
+ * pair; the result of the lookup is not checked. Every fifth call
+ * pending application events are processed.
+ */
+DataBaseInterface::MainEntry DataBaseInterface::getSentence( const QString & query )
+{
+    static int counter=1;
+
+    DBItemMainKey k(query);
+    DBItemMainData d;
+    sentence->get(&k,&d);
+
+    if((++counter)%5==0)
+        kapp->processEvents(100);
+
+    return qMakePair(k,d);
+}
+
+/*
+ * Returns the index list stored in the word index under query, or an
+ * empty list when the word is not found. query is used as given, it is
+ * not passed through simple().
+ */
+DBItemMultiIndex::IndexList DataBaseInterface::getWordIndex( const QString & query )
+{
+    DBItemMainKey k(query);
+    DBItemMultiIndex d;
+
+    if(wordsindex->get(&k,&d)==DB_NOTFOUND)
+        return QValueList<unsigned int>();
+
+    return d.getList();
+}
+
+
+
+//#include "database.moc.cpp"
+