summaryrefslogtreecommitdiffstats
path: root/konq-plugins/webarchiver/archivedialog.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'konq-plugins/webarchiver/archivedialog.cpp')
-rw-r--r--konq-plugins/webarchiver/archivedialog.cpp565
1 files changed, 565 insertions, 0 deletions
diff --git a/konq-plugins/webarchiver/archivedialog.cpp b/konq-plugins/webarchiver/archivedialog.cpp
new file mode 100644
index 0000000..71390c2
--- /dev/null
+++ b/konq-plugins/webarchiver/archivedialog.cpp
@@ -0,0 +1,565 @@
+/*
+ Copyright (C) 2001 Andreas Schlapbach <schlpbch@iam.unibe.ch>
+ Copyright (C) 2003 Antonio Larrosa <larrosa@kde.org>
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to
+ the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ Boston, MA 02110-1301, USA.
+*/
+
+#include "archivedialog.h"
+#include <qwidget.h>
+#include <khtml_part.h>
+#include "archiveviewbase.h"
+#include <kinstance.h>
+#include <ktempfile.h>
+#include <ktar.h>
+
+#include <kfiledialog.h>
+#include <kmessagebox.h>
+#include <kpassivepopup.h>
+#include <klocale.h>
+#include <kio/netaccess.h>
+#include <khtml_part.h>
+#include <kdebug.h>
+#include <kgenericfactory.h>
+#include <kactivelabel.h>
+#include <qstylesheet.h>
+#include <qiodevice.h>
+#include <klistview.h>
+#include <kio/job.h>
+#include <kapplication.h>
+#include <kurllabel.h>
+#include <kprogress.h>
+#include <kstringhandler.h>
+#include <qpushbutton.h>
+
+#undef DEBUG_WAR
+
+#define CONTENT_TYPE "<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">"
+
+/*
+ * Builds the modeless "Web Archiver" dialog for the page shown in `part`.
+ *
+ * parent   - parent widget handed to KDialogBase
+ * filename - path of the archive that will be written; also shown in the dialog
+ * part     - KHTML part whose URL and document are archived (must not be null,
+ *            it is dereferenced in the initializer list)
+ *
+ * The constructor only prepares the dialog and the KTar object; archive()
+ * starts the actual work.
+ */
+ArchiveDialog::ArchiveDialog(QWidget *parent, const QString &filename,
+                             KHTMLPart *part) :
+  KDialogBase(parent, "WebArchiveDialog", false, i18n("Web Archiver"),
+              KDialogBase::Ok | KDialogBase::Cancel | KDialogBase::User1 ),
+  m_bPreserveWS(false), m_tmpFile(0), m_url(part->url())
+{
+  m_widget=new ArchiveViewBase(this);
+  setMainWidget(m_widget);
+  setWFlags(getWFlags() | WDestructiveClose);
+
+  // The label texts are assembled as HTML markup, so the visible part has to
+  // be escaped: a '&' or '<' in the URL or file name would otherwise be
+  // interpreted as markup. The href targets are left untouched.
+  const QString shownUrl = QStyleSheet::escape(KStringHandler::csqueeze( m_url.url(), 80 ));
+  const QString shownTarget = QStyleSheet::escape(KStringHandler::csqueeze( filename, 80 ));
+  m_widget->urlLabel->setText(QString("<a href=\"")+m_url.url()+"\">"+shownUrl+"</a>");
+  m_widget->targetLabel->setText(QString("<a href=\"")+filename+"\">"+shownTarget+"</a>");
+
+  // Prefer the owner document when the part's document reports one.
+  if(part->document().ownerDocument().isNull())
+    m_document = part->document();
+  else
+    m_document = part->document().ownerDocument();
+
+  // The Ok button acts as "Close" and stays disabled until archiving is done.
+  enableButtonOK( false );
+  showButton( KDialogBase::User1, false );
+  setButtonOK( KStdGuiItem::close() );
+
+  m_tarBall = new KTar(filename,"application/x-gzip");
+}
+
+/*
+ * Starts an archiving run: resets the download bookkeeping, opens the
+ * tarball for writing and hands over to saveFile(). If the tarball cannot
+ * be opened, a message box is shown and nothing else happens.
+ */
+void ArchiveDialog::archive()
+{
+  m_iterator = 0;
+  m_currentLVI = 0;
+
+  if (!m_tarBall->open(IO_WriteOnly)) {
+    const QString title = i18n( "Unable to Open Web-Archive" );
+    const QString text = i18n( "Unable to open \n %1 \n for writing." ).arg(m_tarBall->fileName());
+    KMessageBox::sorry( 0L, text, title );
+    return;
+  }
+
+#ifdef DEBUG_WAR
+  kdDebug(90110) << "Web Archive opened " << endl;
+#endif
+
+  // Reserve the name of the main document before any download can claim it.
+  m_linkDict.insert(QString("index.html"), QString(""));
+  saveFile("index.html");
+}
+
+/*
+ * Releases the objects owned by the dialog.
+ */
+ArchiveDialog::~ArchiveDialog()
+{
+  // m_tmpFile is otherwise only freed once a download has finished; if the
+  // dialog goes away while a download is pending it would be leaked.
+  // Deleting a null pointer is a no-op, so no check is needed.
+  delete m_tmpFile;
+  m_tmpFile = 0;
+
+  delete m_tarBall;
+}
+
+/*
+ * First pass over the document: walk the DOM tree once in the Retrieving
+ * state so that handleLink() queues every referenced URL, then switch to the
+ * Downloading state and start fetching the queue. The markup produced by
+ * this pass is written to a local string that is never read again; the real
+ * index.html is generated later by setSavingState().
+ *
+ * The (unnamed) file name parameter is not used.
+ */
+
+void ArchiveDialog::saveFile( const QString&)
+{
+  // The temporary file only serves as a probe that temp files can be
+  // created. Ask KTempFile to remove it again, otherwise every run leaves a
+  // stray file behind.
+  KTempFile tmpFile;
+  tmpFile.setAutoDelete(true);
+
+  if (tmpFile.status() != 0) {
+    const QString title = i18n( "Could Not Open Temporary File" );
+    const QString text = i18n( "Could not open a temporary file" );
+    KMessageBox::sorry( 0, text, title );
+    return;
+  }
+
+  m_state=Retrieving;
+  {
+    // Stack-allocated stream: no new/delete pair to get wrong.
+    QString temp;
+    QTextStream tempStream(&temp, IO_ReadOnly);
+    saveToArchive(&tempStream);
+  }
+
+  m_downloadedURLDict.clear();
+
+  m_state=Downloading;
+  m_widget->progressBar->setTotalSteps(m_urlsToDownload.count());
+  m_widget->progressBar->setProgress(0);
+  downloadNext();
+}
+
+/* Tells the user that the temporary file needed for index.html is unusable. */
+static void reportTempFileError()
+{
+  const QString title = i18n( "Could Not Open Temporary File" );
+  const QString text = i18n( "Could not open a temporary file" );
+  KMessageBox::sorry( 0, text, title );
+}
+
+/*
+ * Second pass over the document, run once all downloads are done: serialize
+ * the DOM tree in the Saving state (links are rewritten to the names used
+ * inside the archive), store the result as "index.html" in the tarball,
+ * close the tarball and switch the dialog buttons to their "finished" state.
+ *
+ * If the temporary file cannot be created or read back, an error message is
+ * shown and the dialog is left as it is (Cancel stays available).
+ */
+void ArchiveDialog::setSavingState()
+{
+  KTempFile tmpFile;
+  QTextStream* textStream = tmpFile.textStream();
+  // Without this check a failed temp file creation ends in a null dereference.
+  if (tmpFile.status() != 0 || !textStream) {
+    reportTempFileError();
+    return;
+  }
+  textStream->setEncoding(QTextStream::UnicodeUTF8);
+
+  m_widget->progressBar->setProgress(m_widget->progressBar->totalSteps());
+
+  m_state=Saving;
+  saveToArchive(textStream);
+
+  tmpFile.close();
+
+  QString fileName="index.html";
+  QFile file(tmpFile.name());
+  const bool readable = file.open(IO_ReadOnly);
+  if (readable) {
+    const QByteArray html = file.readAll();
+    m_tarBall->writeFile(fileName, QString::null, QString::null, html.size(), html.data());
+#ifdef DEBUG_WAR
+    kdDebug(90110) << "HTML-file written: " << fileName << endl;
+#endif
+    file.close();
+  }
+
+  // Cleaning up
+  file.remove();
+  m_tarBall->close();
+
+  if (!readable) {
+    // index.html did not make it into the archive; do not claim success.
+    reportTempFileError();
+    return;
+  }
+
+  KPassivePopup::message( m_url.prettyURL() , i18n( "Archiving webpage completed." ), this );
+
+  enableButtonOK(true);
+  setEscapeButton(Ok);
+  actionButton(Ok)->setFocus();
+  enableButtonCancel(false);
+}
+
+/*
+ * Writes a "saved from" HTML comment followed by the serialized DOM tree of
+ * the document to the given stream. Does nothing for a null stream.
+ * Exceptions raised while walking the tree are caught and only logged.
+ */
+
+void ArchiveDialog::saveToArchive(QTextStream* _textStream)
+{
+  if (_textStream == 0)
+    return;
+
+  QTextStream &out = *_textStream;
+
+  // Leading comment recording where the page came from
+  out << "<!-- saved from:" << endl;
+  out << m_url.url() << " -->" << endl;
+
+  try
+  {
+    saveArchiveRecursive(m_document.documentElement(), m_url, _textStream, 0);
+  }
+  catch (...)
+  {
+    kdDebug(90110) << "exception" << endl;
+  }
+}
+
+/*
+ * Returns true if the element has an attribute whose upper-cased name equals
+ * attrName and whose upper-cased value equals attrValue. Callers therefore
+ * have to pass both strings in upper case.
+ */
+static bool hasAttribute(const DOM::Node &pNode, const QString &attrName, const QString &attrValue)
+{
+  const DOM::Element elem = static_cast<const DOM::Element>(pNode);
+  DOM::NamedNodeMap attributeMap = elem.attributes();
+  const unsigned long count = attributeMap.length();
+
+  unsigned int idx = 0;
+  while (idx < count) {
+    DOM::Attr current = static_cast<DOM::Attr>(attributeMap.item(idx));
+    const bool nameMatches = (current.name().string().upper() == attrName);
+    if (nameMatches && (current.value().string().upper() == attrValue))
+      return true;
+    ++idx;
+  }
+  return false;
+}
+
+/*
+ * Returns true if one of the direct children of pNode has a node name that,
+ * upper-cased, equals nodeName.
+ */
+static bool hasChildNode(const DOM::Node &pNode, const QString &nodeName)
+{
+  DOM::Node cursor;
+  try
+  {
+    // Fetching the first child might throw a DOM exception
+    cursor = pNode.firstChild();
+  }
+  catch (...)
+  {
+    // Treat it as "no children"
+    cursor = DOM::Node();
+  }
+
+  for (; !cursor.isNull(); cursor = cursor.nextSibling()) {
+    if (cursor.nodeName().string().upper() == nodeName)
+      return true;
+  }
+  return false;
+}
+
+/* Transform DOM-Tree to HTML
+ *
+ * Serializes pNode and, recursively, all of its children to _textStream.
+ *
+ * pNode - node to write; element nodes produce an opening tag, their
+ * children and (unless forbidden) a closing tag, all other nodes
+ * produce their node value
+ * baseURL - URL that links found in attributes and inline CSS are resolved
+ * against (passed on to handleLink / analyzeInternalCSS)
+ * _textStream - output stream; dereferenced without a null check
+ * indent - number of spaces put in front of the output of this node;
+ * children are written with indent+2
+ *
+ * The member m_bPreserveWS is switched on when a PRE element is opened and
+ * off again when it is closed; while it is set, neither indentation nor
+ * trailing newlines are written.
+ */
+
+void ArchiveDialog::saveArchiveRecursive(const DOM::Node &pNode, const KURL& baseURL,
+ QTextStream* _textStream, int indent)
+{
+ const QString nodeNameOrig(pNode.nodeName().string());
+ const QString nodeName(pNode.nodeName().string().upper());
+ QString text;
+ QString strIndent;
+ strIndent.fill(' ', indent);
+ const DOM::Element element = (const DOM::Element) pNode;
+ DOM::Node child;
+
+ // Part 1: build the text written before the children (opening tag or node value).
+ if ( !element.isNull() ) {
+ if (nodeName.at(0)=='-') {
+ /* Don't save khtml internal tags '-konq..'
+ * Approximating it with <DIV>
+ */
+ text += "<DIV> <!-- -KONQ_BLOCK -->";
+ } else if (nodeName == "BASE") {
+ /* Skip BASE, everything is relative to index.html
+ * Saving SCRIPT but they can cause trouble!
+ */
+ } else if ((nodeName == "META") && hasAttribute(pNode, "HTTP-EQUIV", "CONTENT-TYPE")) {
+ /* Skip content-type meta tag, we provide our own.
+ */
+ } else {
+ if (!m_bPreserveWS) {
+ if (nodeName == "PRE") {
+ m_bPreserveWS = true;
+ }
+ text = strIndent;
+ }
+ // The tag keeps its original spelling; attribute names are written upper-cased.
+ text += "<" + nodeNameOrig;
+ QString attributes;
+ QString attrNameOrig, attrName, attrValue;
+ DOM::Attr attr;
+ DOM::NamedNodeMap attrs = element.attributes();
+ unsigned long lmap = attrs.length();
+ for( unsigned int j=0; j<lmap; j++ ) {
+ attr = static_cast<DOM::Attr>(attrs.item(j));
+ attrNameOrig = attr.name().string();
+ attrName = attrNameOrig.upper();
+ attrValue = attr.value().string();
+
+#if 0
+ if ((nodeName == "FRAME" || nodeName == "IFRAME") && attrName == "SRC") {
+ //attrValue = handleLink(baseURL, attrValue);
+
+ /* Going recursively down creating a DOM-Tree for the Frame, second Level of recursion */
+ //## Add Termination criteria, on the other hand frames are not indefinetly nested, are they :)
+
+ KHTMLPart* part = new KHTMLPart();
+ KURL absoluteURL = getAbsoluteURL(baseURL, attrValue);
+ part->openURL(absoluteURL);
+ saveFile(getUniqueFileName(absoluteURL.fileName()), part);
+ delete part;
+
+ } else if
+#endif
+ if ((nodeName == "LINK" && attrName == "HREF") || // Down load stylesheets, js-script, ..
+ ((nodeName == "FRAME" || nodeName == "IFRAME") && attrName == "SRC") ||
+ ((nodeName == "IMG" || nodeName == "INPUT" || nodeName == "SCRIPT") && attrName == "SRC") ||
+ ((nodeName == "BODY" || nodeName == "TABLE" || nodeName == "TH" || nodeName == "TD") && attrName == "BACKGROUND")) {
+ // Some people use carriage return in file names and browsers support that!
+ // The attribute value is replaced by whatever handleLink() returns.
+ attrValue = handleLink(baseURL, attrValue.replace(QRegExp("\\s"), ""));
+ }
+ /*
+ * ## Make recursion level configurable
+ */
+ /*
+ } else if (nodeName == "A" && attrName == "HREF") {
+ attrValue = handleLink(baseURL, attrValue);
+ */
+
+ // NOTE(review): attrValue is written as-is, without escaping quotes or '&'.
+ attributes += " " + attrName + "=\"" + attrValue + "\"";
+ }
+ if (!(attributes.isEmpty())){
+ text += " ";
+ }
+ // NOTE(review): simplifyWhiteSpace() is applied to the whole attribute string,
+ // so it presumably also collapses whitespace inside attribute values - confirm.
+ text += attributes.simplifyWhiteSpace();
+ text += ">";
+
+ if (nodeName == "HTML") {
+ /* Search for a HEAD tag, if not found, generate one.
+ */
+ if (!hasChildNode(pNode, "HEAD"))
+ text += "\n" + strIndent + " <HEAD>" CONTENT_TYPE "</HEAD>";
+ }
+ else if (nodeName == "HEAD") {
+ text += "\n" + strIndent + " " + CONTENT_TYPE;
+ }
+ }
+ } else {
+ // Not an element: write the node value, treated according to the parent element.
+ const QString& nodeValue(pNode.nodeValue().string());
+ if (!(nodeValue.isEmpty())) {
+ // Don't escape < > in JS or CSS
+ QString parentNodeName = pNode.parentNode().nodeName().string().upper();
+ if (parentNodeName == "STYLE") {
+ text = analyzeInternalCSS(baseURL, pNode.nodeValue().string());
+ } else if (m_bPreserveWS) {
+ // Checked before SCRIPT: while whitespace is preserved the value is escaped but not indented.
+ text = QStyleSheet::escape(pNode.nodeValue().string());
+ } else if (parentNodeName == "SCRIPT") {
+ text = pNode.nodeValue().string();
+ } else {
+ text = strIndent + QStyleSheet::escape(pNode.nodeValue().string());
+ }
+ }
+ }
+
+#ifdef DEBUG_WAR
+ kdDebug(90110) << "text:" << text << endl;
+#endif
+ if (!(text.isEmpty())) {
+ (*_textStream) << text;
+ if (!m_bPreserveWS) {
+ (*_textStream) << endl;
+ }
+ }
+
+ // Part 2: write all children, two columns further in.
+ try
+ {
+ // We might throw a DOM exception
+ child = pNode.firstChild();
+ }
+ catch (...)
+ {
+ // No children, stop recursion here
+ child = DOM::Node();
+ }
+
+ while(!child.isNull()) {
+ saveArchiveRecursive(child, baseURL, _textStream, indent+2);
+ child = child.nextSibling();
+ }
+
+ // Part 3: closing tag (elements only).
+ if (!(element.isNull())) {
+ if (nodeName == "AREA" || nodeName == "BASE" || nodeName == "BASEFONT" ||
+ nodeName == "BR" || nodeName == "COL" || nodeName == "FRAME" ||
+ nodeName == "HR" || nodeName == "IMG" || nodeName == "INPUT" ||
+ nodeName == "ISINDEX" || nodeName == "META" || nodeName == "PARAM") {
+
+ /* Closing Tag is forbidden, see HTML 4.01 Specs: Index of Elements */
+
+ } else {
+ if (!m_bPreserveWS) {
+ text = strIndent;
+ } else {
+ text ="";
+ }
+ if (nodeName.at(0)=='-') {
+ text += "</DIV> <!-- -KONQ_BLOCK -->";
+ } else {
+ text += "</" + pNode.nodeName().string() + ">";
+ if (nodeName == "PRE") {
+ // Leaving PRE: indentation and newlines are written again from here on.
+ m_bPreserveWS = false;
+ }
+ }
+#ifdef DEBUG_WAR
+ kdDebug(90110) << text << endl;
+#endif
+ if (!(text.isEmpty())) {
+ (*_textStream) << text;
+ if (!m_bPreserveWS) {
+ (*_textStream) << endl;
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Resolves _link against _url and, if the "redirect" URL action is
+ * authorized, acts according to the current state:
+ *  - Retrieving: the absolute URL is queued for download, an empty string
+ *    is returned
+ *  - Saving: the name recorded for the URL in m_downloadedURLDict is returned
+ * In every other case an empty string is returned.
+ */
+
+QString ArchiveDialog::handleLink(const KURL& _url, const QString& _link)
+{
+  const KURL absolute(getAbsoluteURL(_url, _link));
+
+  if (!kapp->authorizeURLAction("redirect", _url, absolute))
+    return QString();
+
+  if (m_state==Retrieving) {
+    m_urlsToDownload.append(absolute);
+    return QString();
+  }
+
+  if (m_state==Saving)
+    return m_downloadedURLDict[absolute.url()];
+
+  return QString();
+}
+
+/*
+ * Starts the download of the next queued URL that has not been fetched yet.
+ * When the queue is exhausted, setSavingState() is called instead.
+ * finishedDownloadingURL() is connected to the job's result signal and calls
+ * back into this function.
+ */
+void ArchiveDialog::downloadNext()
+{
+  // Only download each file once. Duplicates are skipped in a loop rather
+  // than by recursion, so a page that references the same resource many
+  // times does not build up one stack frame per duplicate. Skipped entries
+  // still count as progress: the total step count includes them.
+  while (!(m_iterator>=m_urlsToDownload.count()) &&
+         m_downloadedURLDict.contains(m_urlsToDownload[m_iterator].url())) {
+#ifdef DEBUG_WAR
+    kdDebug(90110) << "File already downloaded: " << m_urlsToDownload[m_iterator].url()
+                   << m_downloadedURLDict.count() << endl;
+#endif
+    m_widget->progressBar->advance(1);
+    m_iterator++;
+  }
+
+  if (m_iterator>=m_urlsToDownload.count())
+  {
+    // We've already downloaded all the files we wanted, let's save them
+    setSavingState();
+    return;
+  }
+
+  KURL url=m_urlsToDownload[m_iterator];
+
+#ifdef DEBUG_WAR
+  kdDebug(90110) << "URL : " << url.url() << endl;
+#endif
+
+  // A KTempFile is created only to obtain a fresh file name; the file itself
+  // is removed again before the copy job is started (presumably because the
+  // job is asked not to overwrite its destination - confirm against the
+  // KIO::file_copy documentation).
+  delete m_tmpFile;
+  m_tmpFile=new KTempFile();
+  m_tmpFile->close();
+  QFile::remove(m_tmpFile->name());
+  kdDebug(90110) << "downloading: " << url.url() << " to: " << m_tmpFile->name() << endl;
+  KURL dsturl;
+  dsturl.setPath(m_tmpFile->name());
+  KIO::Job *job=KIO::file_copy(url, dsturl, -1, false, false, false);
+  job->addMetaData("cache", "cache"); // Use entry from cache if available.
+  connect(job, SIGNAL(result( KIO::Job *)), this, SLOT(finishedDownloadingURL( KIO::Job *)) );
+
+  // One list row per download; its second column shows the status.
+  m_currentLVI=new QListViewItem(m_widget->listView, url.prettyURL());
+  m_widget->listView->insertItem( m_currentLVI );
+  m_currentLVI->setText(1,i18n("Downloading"));
+}
+
+/*
+ * Slot connected to the result signal of the copy job started by
+ * downloadNext(). Updates the status column of the current list row, stores
+ * the downloaded temporary file in the tarball under a unique name, records
+ * that name for the URL and continues with the next queued URL.
+ *
+ * NOTE(review): the tarball entry is written even when the job reported an
+ * error or the temporary file cannot be opened - confirm whether that is
+ * intended.
+ */
+void ArchiveDialog::finishedDownloadingURL( KIO::Job *job )
+{
+  const QString status = job->error() ? i18n("Error") : i18n("Ok");
+  m_currentLVI->setText(1,status);
+
+  m_widget->progressBar->advance(1);
+
+  const KURL source=m_urlsToDownload[m_iterator];
+  const QString archiveName = getUniqueFileName(source.fileName());
+
+  // Copy the downloaded data into the Tar-Ball
+  QFile downloaded(m_tmpFile->name());
+  downloaded.open(IO_ReadOnly);
+  m_tarBall->writeFile(archiveName, QString::null, QString::null, downloaded.size(), downloaded.readAll());
+  downloaded.close();
+
+  // The temporary file is no longer needed
+  m_tmpFile->unlink();
+  delete m_tmpFile;
+  m_tmpFile=0;
+
+  // Remember which archive name the URL got, and that the name is taken
+  m_downloadedURLDict.insert(source.url(), archiveName);
+  m_linkDict.insert(archiveName, QString(""));
+
+  ++m_iterator;
+  downloadNext();
+}
+
+/* Combine a base URL and a link into the URL that is actually downloaded */
+
+KURL ArchiveDialog::getAbsoluteURL(const KURL& _url, const QString& _link)
+{
+  // The two-argument KURL constructor does the combining
+  KURL resolved(_url, _link);
+  return resolved;
+}
+
+/*
+ * Returns fileName if it is non-empty and not yet present in m_linkDict.
+ * Otherwise a running number is put in front of fileName until the result
+ * is unused. The counter is a function-local static, so it keeps counting
+ * across calls.
+ */
+
+QString ArchiveDialog::getUniqueFileName(const QString& fileName)
+{
+  static int id=2;
+
+#ifdef DEBUG_WAR
+  kdDebug(90110) << "getUniqueFileName(..): [" << fileName << "]" << endl;
+#endif
+
+  QString candidate(fileName);
+  for (;;) {
+    if (!candidate.isEmpty() && !m_linkDict.contains(candidate))
+      break;
+    // Name clash (or no name at all) -> try the next id
+    candidate = QString::number(id++) + fileName;
+  }
+
+  return candidate;
+}
+
+/*
+ * Search for url(...) references in CSS, pass each one to handleLink() and
+ * replace it by the string handleLink() returns.
+ *
+ * _url   - base URL the references are resolved against
+ * string - CSS text to process
+ *
+ * Returns the CSS text with every complete url(...) reference rewritten.
+ * Whitespace and one pair of quotes around the reference are left in place;
+ * only the reference itself is replaced. Text following a "url(" that has no
+ * closing ")" is returned unchanged.
+ */
+
+QString ArchiveDialog::analyzeInternalCSS(const KURL& _url, const QString& string)
+{
+#ifdef DEBUG_WAR
+  kdDebug () << "analyzeInternalCSS" << endl;
+#endif
+
+  QString str(string);
+  int pos = 0;
+  for (;;) {
+    pos = str.find("url(", pos);
+    if (pos == -1)
+      break;
+    pos += 4; // url(
+
+    const int closing = str.find(")", pos);
+    if (closing == -1)
+      break; // unterminated url( - nothing sensible to extract
+
+    int startUrl = pos;
+    int endUrl = closing;
+
+    // CSS allows whitespace between the parentheses and the reference
+    while (startUrl < endUrl && str[startUrl].isSpace())
+      startUrl++;
+    while (endUrl > startUrl && str[endUrl-1].isSpace())
+      endUrl--;
+
+    // ... and the reference may be quoted (CSS 'feature')
+    if (startUrl < endUrl && (str[startUrl]=='"' || str[startUrl]=='\''))
+      startUrl++;
+    if (endUrl > startUrl && (str[endUrl-1]=='"' || str[endUrl-1]=='\''))
+      endUrl--;
+
+    QString url = str.mid(startUrl, endUrl-startUrl);
+
+#ifdef DEBUG_WAR
+    kdDebug () << "url: " << url << endl;
+#endif
+
+    url = handleLink(_url, url);
+
+#ifdef DEBUG_WAR
+    kdDebug () << "url: " << url << endl;
+#endif
+
+    str.replace(startUrl, endUrl-startUrl, url);
+
+    // Continue behind the inserted text. The replacement usually has a
+    // different length, so positions taken before the replace are stale.
+    pos = startUrl + static_cast<int>(url.length());
+  }
+  return str;
+}
+
+#include "archivedialog.moc"