diff options
Diffstat (limited to 'src/translators/pdfimporter.cpp')
-rw-r--r-- | src/translators/pdfimporter.cpp | 281 |
1 files changed, 281 insertions, 0 deletions
diff --git a/src/translators/pdfimporter.cpp b/src/translators/pdfimporter.cpp new file mode 100644 index 0000000..2d59b33 --- /dev/null +++ b/src/translators/pdfimporter.cpp @@ -0,0 +1,281 @@ +/*************************************************************************** + copyright : (C) 2007 by Robby Stephenson + email : robby@periapsis.org + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of version 2 of the GNU General Public License as * + * published by the Free Software Foundation; * + * * + ***************************************************************************/ + +#include "pdfimporter.h" +#include "tellicoimporter.h" +#include "xslthandler.h" +#include "../collections/bibtexcollection.h" +#include "../xmphandler.h" +#include "../filehandler.h" +#include "../imagefactory.h" +#include "../tellico_kernel.h" +#include "../fetch/fetchmanager.h" +#include "../fetch/crossreffetcher.h" +#include "../tellico_utils.h" +#include "../progressmanager.h" +#include "../core/netaccess.h" +#include "../tellico_debug.h" + +#include <kstandarddirs.h> +#include <kmessagebox.h> + +#include <config.h> +#ifdef HAVE_POPPLER +#include <poppler-qt.h> +#endif + +namespace { + static const int PDF_FILE_PREVIEW_SIZE = 196; +} + +using Tellico::Import::PDFImporter; + +PDFImporter::PDFImporter(const KURL::List& urls_) : Importer(urls_), m_cancelled(false) { +} + +bool PDFImporter::canImport(int type_) const { + return type_ == Data::Collection::Bibtex; +} + +Tellico::Data::CollPtr PDFImporter::collection() { + QString xsltfile = ::locate("appdata", QString::fromLatin1("xmp2tellico.xsl")); + if(xsltfile.isEmpty()) { + kdWarning() << "DropHandler::handleURL() - can not locate xmp2tellico.xsl" << endl; + return 0; + } + + ProgressItem& item = ProgressManager::self()->newProgressItem(this, progressLabel(), true); + item.setTotalSteps(urls().count()); + connect(&item, SIGNAL(signalCancelled(ProgressItem*)), SLOT(slotCancel())); + ProgressItem::Done done(this); + const bool showProgress = options() & ImportProgress; + + KURL u; + u.setPath(xsltfile); + + XSLTHandler xsltHandler(u); + if(!xsltHandler.isValid()) { + kdWarning() << "DropHandler::handleURL() - invalid xslt in xmp2tellico.xsl" << endl; + return 0; + } + + bool hasDOI = false; + bool hasArxiv = false; + + uint j = 0; + + Data::CollPtr coll; + XMPHandler xmpHandler; + KURL::List list = urls(); + for(KURL::List::Iterator it = list.begin(); it != list.end() && !m_cancelled; ++it, ++j) { + FileHandler::FileRef* ref = FileHandler::fileRef(*it); + if(!ref) { + continue; + } + + Data::CollPtr newColl; + Data::EntryPtr entry; + + QString xmp = xmpHandler.extractXMP(ref->fileName()); + // myDebug() << xmp << endl; + if(xmp.isEmpty()) { + setStatusMessage(i18n("Tellico was unable to read any metadata from the PDF file.")); + } else { + setStatusMessage(QString()); + + Import::TellicoImporter importer(xsltHandler.applyStylesheet(xmp)); + newColl = importer.collection(); + if(!newColl || newColl->entryCount() == 0) { + kdWarning() << "DropHandler::handleURL() - no collection found" << endl; + setStatusMessage(i18n("Tellico was unable to read any metadata from the PDF file.")); + } else { + entry = newColl->entries().front(); + hasDOI |= !entry->field(QString::fromLatin1("doi")).isEmpty(); + } + } + + if(!newColl) { + newColl = new Data::BibtexCollection(true); + } + if(!entry) { + entry = new Data::Entry(newColl); + newColl->addEntries(entry); + } + +#ifdef HAVE_POPPLER + + // now load from poppler + Poppler::Document* doc = Poppler::Document::load(ref->fileName()); + if(doc && !doc->isLocked()) { + // now the question is, do we overwrite XMP data with Poppler data? + // for now, let's say yes conditionally + QString s = doc->getInfo(QString::fromLatin1("Title")).simplifyWhiteSpace(); + if(!s.isEmpty()) { + entry->setField(QString::fromLatin1("title"), s); + } + // author could be separated by commas, "and" or whatever + // we're not going to overwrite it + if(entry->field(QString::fromLatin1("author")).isEmpty()) { + QRegExp rx(QString::fromLatin1("\\s*(and|,|;)\\s*")); + QStringList authors = QStringList::split(rx, doc->getInfo(QString::fromLatin1("Author")).simplifyWhiteSpace()); + entry->setField(QString::fromLatin1("author"), authors.join(QString::fromLatin1("; "))); + } + s = doc->getInfo(QString::fromLatin1("Keywords")).simplifyWhiteSpace(); + if(!s.isEmpty()) { + // keywords are also separated by semi-colons in poppler + entry->setField(QString::fromLatin1("keyword"), s); + } + + // now parse the first page text and try to guess + Poppler::Page* page = doc->getPage(0); + if(page) { + // a null rectangle means get all text on page + QString text = page->getText(Poppler::Rectangle()); + // borrowed from Referencer + QRegExp rx(QString::fromLatin1("(?:" + "(?:[Dd][Oo][Ii]:? *)" + "|" + "(?:[Dd]igital *[Oo]bject *[Ii]dentifier:? *)" + ")" + "(" + "[^\\.\\s]+" + "\\." + "[^\\/\\s]+" + "\\/" + "[^\\s]+" + ")")); + if(rx.search(text) > -1) { + QString doi = rx.cap(1); + myDebug() << "PDFImporter::collection() - in PDF file, found DOI: " << doi << endl; + entry->setField(QString::fromLatin1("doi"), doi); + hasDOI = true; + } + rx = QRegExp(QString::fromLatin1("arXiv:" + "(" + "[^\\/\\s]+" + "[\\/\\.]" + "[^\\s]+" + ")")); + if(rx.search(text) > -1) { + QString arxiv = rx.cap(1); + myDebug() << "PDFImporter::collection() - in PDF file, found arxiv: " << arxiv << endl; + if(entry->collection()->fieldByName(QString::fromLatin1("arxiv")) == 0) { + Data::FieldPtr field = new Data::Field(QString::fromLatin1("arxiv"), i18n("arXiv ID")); + field->setCategory(i18n("Publishing")); + entry->collection()->addField(field); + } + entry->setField(QString::fromLatin1("arxiv"), arxiv); + hasArxiv = true; + } + + delete page; + } + } else { + myDebug() << "PDFImporter::collection() - unable to read PDF info (poppler)" << endl; + } + delete doc; +#endif + + entry->setField(QString::fromLatin1("url"), (*it).url()); + // always an article? + entry->setField(QString::fromLatin1("entry-type"), QString::fromLatin1("article")); + + QPixmap pix = NetAccess::filePreview(ref->fileName(), PDF_FILE_PREVIEW_SIZE); + delete ref; // removes temp file + + if(!pix.isNull()) { + // is png best option? + QString id = ImageFactory::addImage(pix, QString::fromLatin1("PNG")); + if(!id.isEmpty()) { + Data::FieldPtr field = newColl->fieldByName(QString::fromLatin1("cover")); + if(!field && !newColl->imageFields().isEmpty()) { + field = newColl->imageFields().front(); + } else if(!field) { + field = new Data::Field(QString::fromLatin1("cover"), i18n("Front Cover"), Data::Field::Image); + newColl->addField(field); + } + entry->setField(field, id); + } + } + if(coll) { + coll->addEntries(newColl->entries()); + } else { + coll = newColl; + } + + if(showProgress) { + ProgressManager::self()->setProgress(this, j); + kapp->processEvents(); + } + } + + if(m_cancelled) { + return 0; + } + + if(hasDOI) { + myDebug() << "looking for DOI" << endl; + Fetch::FetcherVec vec = Fetch::Manager::self()->createUpdateFetchers(coll->type(), Fetch::DOI); + if(vec.isEmpty()) { + GUI::CursorSaver cs(Qt::arrowCursor); + KMessageBox::information(Kernel::self()->widget(), + i18n("Tellico is able to download information about entries with a DOI from " + "CrossRef.org. However, you must create an CrossRef account and add a new " + "data source with your account information."), + QString::null, + QString::fromLatin1("CrossRefSourceNeeded")); + } else { + Data::EntryVec entries = coll->entries(); + for(Fetch::FetcherVec::Iterator fetcher = vec.begin(); fetcher != vec.end(); ++fetcher) { + for(Data::EntryVecIt entry = entries.begin(); entry != entries.end(); ++entry) { + fetcher->updateEntrySynchronous(entry); + } + } + } + } + + if(m_cancelled) { + return 0; + } + + if(hasArxiv) { + Data::EntryVec entries = coll->entries(); + Fetch::FetcherVec vec = Fetch::Manager::self()->createUpdateFetchers(coll->type(), Fetch::ArxivID); + for(Fetch::FetcherVec::Iterator fetcher = vec.begin(); fetcher != vec.end(); ++fetcher) { + for(Data::EntryVecIt entry = entries.begin(); entry != entries.end(); ++entry) { + fetcher->updateEntrySynchronous(entry); + } + } + } + +// finally + Data::EntryVec entries = coll->entries(); + for(Data::EntryVecIt entry = entries.begin(); entry != entries.end(); ++entry) { + if(entry->title().isEmpty()) { + // use file name + KURL u = entry->field(QString::fromLatin1("url")); + entry->setField(QString::fromLatin1("title"), u.fileName()); + } + } + + if(m_cancelled) { + return 0; + } + return coll; +} + +void PDFImporter::slotCancel() { + m_cancelled = true; +} + +#include "pdfimporter.moc" |