diff options
Diffstat (limited to 'klinkstatus/src/engine')
-rw-r--r-- | klinkstatus/src/engine/Makefile.am | 9 | ||||
-rw-r--r-- | klinkstatus/src/engine/linkchecker.cpp | 703 | ||||
-rw-r--r-- | klinkstatus/src/engine/linkchecker.h | 128 | ||||
-rw-r--r-- | klinkstatus/src/engine/linkfilter.cpp | 46 | ||||
-rw-r--r-- | klinkstatus/src/engine/linkfilter.h | 49 | ||||
-rw-r--r-- | klinkstatus/src/engine/linkstatus.cpp | 214 | ||||
-rw-r--r-- | klinkstatus/src/engine/linkstatus.h | 187 | ||||
-rw-r--r-- | klinkstatus/src/engine/linkstatus_impl.h | 417 | ||||
-rw-r--r-- | klinkstatus/src/engine/searchmanager.cpp | 916 | ||||
-rw-r--r-- | klinkstatus/src/engine/searchmanager.h | 193 | ||||
-rw-r--r-- | klinkstatus/src/engine/searchmanager_impl.h | 158 |
11 files changed, 3020 insertions, 0 deletions
diff --git a/klinkstatus/src/engine/Makefile.am b/klinkstatus/src/engine/Makefile.am new file mode 100644 index 00000000..1bd3ba88 --- /dev/null +++ b/klinkstatus/src/engine/Makefile.am @@ -0,0 +1,9 @@ +INCLUDES = -I$(top_srcdir)/src/ui $(all_includes) +METASOURCES = AUTO +noinst_HEADERS = linkchecker.h linkstatus.h linkstatus_impl.h searchmanager.h \ + searchmanager_impl.h linkfilter.h +libengine_la_LDFLAGS = $(all_libraries) +noinst_LTLIBRARIES = libengine.la +libengine_la_SOURCES = linkchecker.cpp linkstatus.cpp searchmanager.cpp \ + linkfilter.cpp +libengine_la_LIBADD = $(LIB_KHTML) diff --git a/klinkstatus/src/engine/linkchecker.cpp b/klinkstatus/src/engine/linkchecker.cpp new file mode 100644 index 00000000..bcc503ad --- /dev/null +++ b/klinkstatus/src/engine/linkchecker.cpp @@ -0,0 +1,703 @@ +/*************************************************************************** + * Copyright (C) 2004 by Puto Moura * + * mojo@localhost.localdomain * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + ***************************************************************************/ +#include "linkchecker.h" +#include "searchmanager.h" +#include "../utils/utils.h" +#include "../parser/htmlparser.h" + +#include <qstring.h> +#include <qtimer.h> +#include <qtextcodec.h> +#include <qcstring.h> + +#include <kio/netaccess.h> +#include <kio/global.h> +#include <kio/job.h> +#include <kio/scheduler.h> +#include <kio/slave.h> +#include <kmimetype.h> +#include <kapplication.h> +#include <klocale.h> +#include <khtml_part.h> +#include <dom/html_misc.h> +#include <dom/dom_node.h> +#include <dom/dom_string.h> + + +int LinkChecker::count_ = 0; + +LinkChecker::LinkChecker(LinkStatus* linkstatus, int time_out, + QObject *parent, const char *name) + : QObject(parent, name), search_manager_(0), + linkstatus_(linkstatus), t_job_(0), time_out_(time_out), checker_(0), document_charset_(), + redirection_(false), header_checked_(false), finnished_(false), + parsing_(false), is_charset_checked_(false), has_defined_charset_(false) +{ + Q_ASSERT(linkstatus_); + Q_ASSERT(!linkstatus_->checked()); + + kdDebug(23100) << endl << ++count_ << ": " << "Checking " << linkstatus_->absoluteUrl().url() << endl; +} + +LinkChecker::~LinkChecker() +{} + +void LinkChecker::setSearchManager(SearchManager* search_manager) +{ + Q_ASSERT(search_manager); + search_manager_ = search_manager; +} + +void LinkChecker::check() +{ + Q_ASSERT(!finnished_); + + KURL url(linkStatus()->absoluteUrl()); + Q_ASSERT(url.isValid()); + + if(url.hasRef()) { + KMimeType::Ptr mimeType = KMimeType::findByURL(url); + if(mimeType->is("text/html") || mimeType->is("application/xml")) { + checkRef(); + return; + } + } + + t_job_ = KIO::get(url, false, false); + + t_job_->addMetaData("PropagateHttpHeader", "true"); // to have the http header + + if (linkstatus_->parent()) { + t_job_->addMetaData("referrer", linkstatus_->parent()->absoluteUrl().prettyURL()); + } + + if(search_manager_->sendIdentification()) + { + 
t_job_->addMetaData("SendUserAgent", "true");
+        t_job_->addMetaData("UserAgent", search_manager_->userAgent());
+    }
+    else
+        t_job_->addMetaData("SendUserAgent", "false");
+
+
+    // Route the job's data, mimetype, result and redirection signals to this checker.
+    QObject::connect(t_job_, SIGNAL(data(KIO::Job *, const QByteArray &)),
+                     this, SLOT(slotData(KIO::Job *, const QByteArray &)));
+    QObject::connect(t_job_, SIGNAL(mimetype(KIO::Job *, const QString &)),
+                     this, SLOT(slotMimetype(KIO::Job *, const QString &)));
+    QObject::connect(t_job_, SIGNAL(result(KIO::Job *)),
+                     this, SLOT(slotResult(KIO::Job *)));
+    QObject::connect(t_job_, SIGNAL(redirection(KIO::Job *, const KURL &)),
+                     this, SLOT(slotRedirection(KIO::Job *, const KURL &)));
+
+    // time_out_ is multiplied by 1000 to get the single-shot interval;
+    // slotTimeOut() ignores the shot if the check already finished.
+    QTimer::singleShot( time_out_ * 1000, this, SLOT(slotTimeOut()) );
+
+    t_job_->setInteractive(false);
+}
+
+/**
+ * Fired by the single-shot timer armed in check().
+ * Does nothing when the check has already finished or the document is
+ * currently being parsed (parsing_ is set around the parser in slotResult()).
+ */
+void LinkChecker::slotTimeOut()
+{
+    if(!finnished_ && !parsing_)
+    {
+        kdDebug(23100) << "timeout: " << linkstatus_->absoluteUrl().url() << endl;
+        // Both the job and its slave may be missing at this point.
+        if(t_job_ && t_job_->slave())
+            kdDebug(23100) << " - " << t_job_->slave() << "/" << t_job_->slave()->slave_pid() << endl;
+        else
+            kdDebug(23100) << endl;
+
+
+//         Q_ASSERT(t_job_); // can happen: e.g.
bad result signal
+        // t_job_ may be null here (see the check a few lines above), so only
+        // query the job's error code when there is a job. A missing job is
+        // treated like a job that was not cancelled by the user.
+        if(!t_job_ || t_job_->error() != KIO::ERR_USER_CANCELED)
+        {
+            linkstatus_->setErrorOccurred(true);
+            linkstatus_->setChecked(true);
+            linkstatus_->setError(i18n("Timeout"));
+            linkstatus_->setStatus(LinkStatus::TIMEOUT);
+
+            killJob(); // no-op when there is no job
+            finnish();
+        }
+    }
+}
+
+/**
+ * Receives the mime type reported by the transfer job.
+ *
+ * Stores the type in the link status. For non-http(s) URLs the check is
+ * finished here: always when only the header is wanted, otherwise when the
+ * type is not text/html. http(s) URLs are left to slotData().
+ *
+ * @param type mime type reported by the job.
+ */
+void LinkChecker::slotMimetype (KIO::Job* /*job*/, const QString &type)
+{
+    if(finnished_)
+        return;
+
+//     kdDebug(23100) << "LinkChecker::slotMimetype:" << type << "-> " << linkstatus_->absoluteUrl().url()
+//     << " - " << t_job_->slave() << "/" << t_job_->slave()->slave_pid() << endl;
+
+    Q_ASSERT(t_job_);
+    if(!t_job_) // same guard as slotResult(): never dereference a missing job
+        return;
+
+    LinkStatus* ls = 0;
+/*    if(redirection_)
+        ls = linkStatus()->redirection();
+    else*/
+        ls = linkstatus_;
+    Q_ASSERT(ls);
+
+    ls->setMimeType(type);
+    KURL url = ls->absoluteUrl();
+
+    // we do nothing here if the file is http or https because we need the header,
+    // which is only available in the data response
+    if(!t_job_->error()) // if an error happened let result() handle that
+    {
+        if(ls->onlyCheckHeader())
+        {
+            //kdDebug(23100) << "only check header: " << ls->absoluteUrl().prettyURL() << endl;
+
+            // file is OK (http can have an error page though job->error() is false)
+            if(!url.protocol().startsWith("http"))
+            {
+                ls->setStatusText("OK");
+                ls->setStatus(LinkStatus::SUCCESSFULL);
+
+                killJob();
+                finnish();
+            }
+        }
+        else // !ls->onlyCheckHeader()
+        {
+            //kdDebug(23100) << "NOT only check header: " << ls->absoluteUrl().prettyURL() << endl;
+
+            // file is OK (http can have an error page though job->error() is false)
+            if(!url.protocol().startsWith("http")) // if not, it has to go through slotData to get the http header
+            {
+                // it's not an html page, so we don't want the file content
+                if(type != "text/html"/* && type != "text/plain"*/)
+                {
+                    //kdDebug(23100) << "mimetype: " << type << endl;
+                    ls->setStatusText("OK");
+                    ls->setStatus(LinkStatus::SUCCESSFULL);
+
+                    killJob();
+                    finnish();
+                }
+            }
+        }
+    }
+}
+
+void LinkChecker::slotData(KIO::Job* /*job*/, const QByteArray& data)
+ {
+
if(finnished_) + return; + + kdDebug(23100) << "LinkChecker::slotData -> " << linkstatus_->absoluteUrl().url() + << " - " << t_job_->slave() << "/" << t_job_->slave()->slave_pid() << endl; + + Q_ASSERT(t_job_); + + LinkStatus* ls = 0; +/* if(redirection_) + ls = linkStatus()->redirection(); + else*/ + ls = linkstatus_; + Q_ASSERT(ls); + + KURL url = ls->absoluteUrl(); + + if(!t_job_->error()) + { + if(ls->onlyCheckHeader()) + { + Q_ASSERT(header_checked_ == false); + // the job should have been killed in slotMimetype + Q_ASSERT(url.protocol() == "http" || url.protocol() == "https"); + + // get the header and quit + if(url.protocol().startsWith("http")) + { + // get the header + ls->setHttpHeader(getHttpHeader(t_job_)); + + if(t_job_->isErrorPage()) + ls->setIsErrorPage(true); + + if(header_checked_) + { + killJob(); + linkstatus_->setStatus(getHttpStatus()); + linkstatus_->setChecked(true); + finnish(); + return; + } + } + } + else + { + if(url.protocol().startsWith("http")) + { + if(!header_checked_) + { + ls->setHttpHeader(getHttpHeader(t_job_)); + } + if(ls->mimeType() != "text/html" && header_checked_) + { + //kdDebug(23100) << "mimetype of " << ls->absoluteUrl().prettyURL() << ": " << ls->mimeType() << endl; + ls->setStatus(getHttpStatus()); + killJob(); + finnish(); // if finnish is called before kill what you get is a segfault, don't know why + return; + } + else if(t_job_->isErrorPage() && header_checked_) + { + //kdDebug(23100) << "ERROR PAGE" << endl; + ls->setIsErrorPage(true); + ls->setStatus(getHttpStatus()); + killJob(); + finnish(); + return; + } + } + else + { + Q_ASSERT(ls->mimeType() == "text/html"); + } + if(!is_charset_checked_) + findDocumentCharset(data); + + QTextCodec* codec = 0; + if(has_defined_charset_) + codec = QTextCodec::codecForName(document_charset_); + if(!codec) + codec = QTextCodec::codecForName("iso8859-1"); // default + + doc_html_ += codec->toUnicode(data); + } + } +} + +void LinkChecker::findDocumentCharset(QString const& 
doc)
+{
+    // Determines document_charset_ once: first from the HTTP header (only if a
+    // header was received), then from the document's meta elements.
+    // Sets has_defined_charset_ when a non-empty charset was found.
+    Q_ASSERT(!is_charset_checked_);
+
+    is_charset_checked_ = true; // only check the first stream of data
+
+    if(header_checked_)
+        document_charset_ = linkstatus_->httpHeader().charset();
+
+    // try to look in the meta elements
+    if(document_charset_.isNull() || document_charset_.isEmpty())
+        document_charset_ = HtmlParser::findCharsetInMetaElement(doc);
+
+    if(!document_charset_.isNull() && !document_charset_.isEmpty())
+        has_defined_charset_ = true;
+}
+
+// only comes here if an error happened or in case of a clean html page
+// if onlyCheckHeader is false
+//
+// Handles a pending redirection first, then releases the job pointer and
+// emits jobFinnished() before evaluating the job's outcome.
+void LinkChecker::slotResult(KIO::Job* /*job*/)
+{
+    if(finnished_)
+        return;
+
+    kdDebug(23100) << "LinkChecker::slotResult -> " << linkstatus_->absoluteUrl().url() << endl;
+
+    Q_ASSERT(t_job_);
+    if(!t_job_)
+        return;
+
+    if(redirection_) {
+        // processRedirection() returns false when the target is invalid or
+        // already known to the search manager; the check then ends here.
+        // NOTE(review): jobFinnished() is not emitted on this path - confirm this is intended.
+        if(!processRedirection(redirection_url_)) {
+            t_job_ = 0;
+            linkstatus_->setChecked(true);
+            finnish();
+            return;
+        }
+    }
+
+    // Keep a local handle: finnish() asserts that t_job_ is already cleared.
+    KIO::TransferJob* job = t_job_;
+    t_job_ = 0;
+
+    emit jobFinnished(this);
+
+    if(job->error() == KIO::ERR_USER_CANCELED)
+    {
+        // FIXME This can happen! If the job is non interactive...
+        kdWarning(23100) << endl << "Job killed quietly, yet signal result was emited..."
<< endl;
+        kdDebug(23100) << linkstatus_->toString() << endl;
+        finnish();
+        return;
+    }
+
+    // After a redirection the result belongs to the redirection target.
+    LinkStatus* ls = 0;
+    if(redirection_)
+        ls = linkStatus()->redirection();
+    else
+        ls = linkstatus_;
+    Q_ASSERT(ls);
+
+    // Log the link before the assertion below fires for the same condition.
+    if(!(!ls->onlyCheckHeader() ||
+            job->error() ||
+            !header_checked_))
+        kdWarning(23100) << ls->toString() << endl;
+
+    Q_ASSERT(!ls->onlyCheckHeader() || job->error() || !header_checked_);
+
+    if(ls->isErrorPage())
+        kdWarning(23100) << "\n\n" << ls->toString() << endl << endl;
+
+    Q_ASSERT(!job->isErrorPage());
+
+    if(job->error())
+    {
+        kdDebug(23100) << "Job error: " <<  job->errorString() << endl;
+        kdDebug(23100) << "Job error code: " <<  job->error() << endl;
+
+        // A directory is reported as a job error but counts as a working link.
+        if(job->error() == KIO::ERR_IS_DIRECTORY)
+        {
+            ls->setStatusText("OK");
+            ls->setStatus(LinkStatus::SUCCESSFULL);
+        }
+        else
+        {
+            ls->setErrorOccurred(true);
+            if(job->error() == KIO::ERR_SERVER_TIMEOUT)
+                ls->setStatus(LinkStatus::TIMEOUT);
+            else
+                ls->setStatus(LinkStatus::BROKEN);
+
+            if(job->errorString().isEmpty())
+                kdWarning(23100) << "\n\nError string is empty, error = " << job->error() << "\n\n\n";
+            // ERR_NO_CONTENT gets a fixed, translated message instead of the job's error string.
+            if(job->error() != KIO::ERR_NO_CONTENT)
+                ls->setError(job->errorString());
+            else
+                ls->setError(i18n("No Content"));
+        }
+    }
+
+    else
+    {
+        // Non-http(s) links that finished without a job error are successful.
+        if(!ls->absoluteUrl().protocol().startsWith("http")) {
+            ls->setStatusText("OK");
+            ls->setStatus(LinkStatus::SUCCESSFULL);
+        }
+        else
+        {
+            if(!header_checked_)
+            {
+                kdDebug(23100) << "\n\nheader not received...
checking again...\n\n\n";
+                //check again
+                // NOTE(review): doc_html_ is not cleared before the retry and the
+                // number of retries is not bounded here - confirm this is intended.
+                check();
+                return;
+            }
+            Q_ASSERT(header_checked_);
+
+            ls->setStatus(getHttpStatus());
+        }
+
+        // Parse the accumulated document, if any, and store its base URL,
+        // title and child nodes in the link status.
+        if(!doc_html_.isNull() && !doc_html_.isEmpty())
+        {
+            ls->setDocHtml(doc_html_);
+
+            parsing_ = true; // makes slotTimeOut() a no-op while parsing
+            HtmlParser parser(doc_html_);
+
+            if(parser.hasBaseUrl())
+                ls->setBaseURI(KURL(parser.baseUrl().url()));
+            if(parser.hasTitle())
+                ls->setHtmlDocTitle(parser.title().attributeTITLE());
+
+            ls->setChildrenNodes(parser.nodes());
+            parsing_ = false;
+        }
+    }
+    finnish();
+}
+
+
+// Only records the redirection target; it is processed in slotResult().
+void LinkChecker::slotRedirection (KIO::Job* /*job*/, const KURL &url)
+{
+    kdDebug(23100) << "LinkChecker::slotRedirection -> " <<
+    linkstatus_->absoluteUrl().url()  << " -> " << url.url() << endl;
+//             << " - " << t_job_->slave() << "/" << t_job_->slave()->slave_pid() << endl;
+
+    redirection_ = true;
+    redirection_url_ = url;
+}
+
+/**
+ * Marks the current link as an HTTP redirection and attaches a new
+ * LinkStatus for the target URL to it.
+ *
+ * @param toUrl redirection target.
+ * @return false if toUrl is invalid or the search manager reports that the
+ *         URL already exists; true otherwise (also when already finished).
+ */
+bool LinkChecker::processRedirection(KURL const& toUrl)
+{
+    if(finnished_)
+        return true;
+
+    kdDebug(23100) << "LinkChecker::processRedirection -> " << linkstatus_->absoluteUrl().url() << " -> " << toUrl.url() << endl;
+
+    Q_ASSERT(t_job_);
+    Q_ASSERT(linkstatus_->absoluteUrl().protocol().startsWith("http"));
+    Q_ASSERT(redirection_);
+
+    // remember_check = false: header_checked_ is not set to true by this call
+    linkstatus_->setHttpHeader(getHttpHeader(t_job_, false));
+    linkstatus_->setIsRedirection(true);
+    linkstatus_->setStatusText("redirection");
+    linkstatus_->setStatus(LinkStatus::HTTP_REDIRECTION);
+    linkstatus_->setChecked(true);
+
+    // The target starts as a copy of the redirecting link.
+    LinkStatus* ls_red = new LinkStatus(*linkstatus_);
+    ls_red->setAbsoluteUrl(toUrl);
+    ls_red->setRootUrl(linkstatus_->rootUrl());
+
+    if(!linkstatus_->onlyCheckHeader())
+        ls_red->setOnlyCheckHeader(false);
+
+    linkstatus_->setRedirection(ls_red);
+    ls_red->setParent(linkstatus_);
+    ls_red->setOriginalUrl(toUrl.url());
+
+    Q_ASSERT(search_manager_);
+
+    // A target in the local domain gets depth -1; otherwise the depth is
+    // derived from the redirecting link (+1 when that link is local).
+    if(search_manager_->localDomain(ls_red->absoluteUrl()))
+        ls_red->setExternalDomainDepth(-1);
+    else
+    {
+        if(search_manager_->localDomain(linkstatus_->absoluteUrl()))
+
ls_red->setExternalDomainDepth(linkstatus_->externalDomainDepth() + 1);
+        else
+            ls_red->setExternalDomainDepth(linkstatus_->externalDomainDepth());
+    }
+
+    if(!toUrl.isValid() || search_manager_->existUrl(toUrl, linkstatus_->absoluteUrl()))
+    {
+        ls_red->setChecked(false);
+        return false;
+    }
+    else
+    {
+        ls_red->setChecked(true);
+        return true;
+    }
+}
+
+/**
+ * Ends the check exactly once: marks the link as checked (unless a
+ * redirection already did) and emits transactionFinished().
+ * The job pointer must already be cleared (asserted).
+ */
+void LinkChecker::finnish()
+{
+    Q_ASSERT(!t_job_);
+
+    if(!finnished_)
+    {
+        kdDebug(23100) << "LinkChecker::finnish -> " << linkstatus_->absoluteUrl().url() << endl;
+
+        finnished_ = true;
+
+        if(redirection_)
+            Q_ASSERT(linkstatus_->checked());
+        else
+            linkstatus_->setChecked(true);
+
+        emit transactionFinished(linkstatus_, this);
+    }
+}
+
+/**
+ * Builds an HttpResponseHeader from the job's "HTTP-Headers" meta data.
+ *
+ * @param remember_check when true and the header string is non-empty,
+ *        header_checked_ is set to true.
+ * @return the parsed header; built from an empty string when no header
+ *         was available (header_checked_ is then reset to false).
+ */
+HttpResponseHeader LinkChecker::getHttpHeader(KIO::Job* /*job*/, bool remember_check)
+{
+    //kdDebug(23100) << "LinkChecker::getHttpHeader -> " << linkstatus_->absoluteUrl().url() << endl;
+
+    Q_ASSERT(!finnished_);
+    Q_ASSERT(t_job_);
+
+    QString header_string = t_job_->queryMetaData("HTTP-Headers");
+    //    Q_ASSERT(!header_string.isNull() && !header_string.isEmpty());
+//     kdDebug(23100) << "HTTP header: " << endl << header_string << endl;
+//     kdDebug(23100) << "Keys: " << HttpResponseHeader(header_string).keys() << endl;
+//     kdDebug(23100) << "Content-type: " << HttpResponseHeader(header_string).contentType() << endl;
+//     kdDebug(23100) << "Content-type: " << HttpResponseHeader(header_string).value("content-type") << endl;
+
+    if(header_string.isNull() || header_string.isEmpty())
+    {
+        header_checked_ = false;
+        kdWarning(23100) << "header_string.isNull() || header_string.isEmpty(): "
+        << linkstatus_->toString() << endl;
+    }
+    else if(remember_check)
+        header_checked_ = true;
+
+    return HttpResponseHeader(header_string);
+}
+
+/**
+ * Checks a URL that carries a fragment (#...).
+ * An empty fragment or "top" is accepted without looking at any document.
+ */
+void LinkChecker::checkRef()
+{
+    KURL url(linkStatus()->absoluteUrl());
+    Q_ASSERT(url.hasRef());
+
+    QString ref = url.ref();
+    if(ref == "" || ref == "top") {
+        linkstatus_->setStatusText("OK");
+        linkstatus_->setStatus(LinkStatus::SUCCESSFULL);
+        finnish();
+        return;
+    }
+
+    // Split off the fragment up front so that both branches below work on the
+    // same fragment-free base URL. Previously the index of '#' was only
+    // computed in the else branch; for links written as "#anchor" without a
+    // parent it stayed -1 and the fragment was kept in the URL passed on.
+    QString const full_url = url.url();
+    int const i_ref = full_url.find("#");
+    QString const url_base = (i_ref == -1) ? full_url : full_url.left(i_ref);
+    //kdDebug(23100) << "url_base: " << url_base << endl;
+
+    LinkStatus const* ls_parent = 0;
+
+    if(linkStatus()->originalUrl().startsWith("#"))
+    {
+        // the anchor lives in the document that contains the link
+        ls_parent = linkStatus()->parent();
+    }
+    else
+    {
+        Q_ASSERT(search_manager_);
+
+        // maybe the target document was already fetched during this search
+        ls_parent = search_manager_->linkStatus(url_base);
+    }
+
+    if(ls_parent)
+        checkRef(ls_parent);
+    else
+        checkRef(KURL::fromPathOrURL(url_base)); // not known yet: download it
+}
+
+/**
+ * Checks the fragment against the document at @p url, downloading and
+ * caching the document in the search manager when it is not cached yet.
+ *
+ * @param url document URL; its string form is the cache key.
+ */
+void LinkChecker::checkRef(KURL const& url)
+{
+    Q_ASSERT(search_manager_);
+
+    QString url_string = url.url();
+    KHTMLPart* html_part = search_manager_->htmlPart(url_string);
+    if(!html_part)
+    {
+        kdDebug() << "new KHTMLPart: " +  url_string << endl;
+
+        html_part = new KHTMLPart();
+        html_part->setOnlyLocalReferences(true);
+
+        QString tmpFile;
+        if(KIO::NetAccess::download(url, tmpFile, 0))
+        {
+            QString doc_html = FileManager::read(tmpFile);
+            html_part->begin();
+            html_part->write(doc_html);
+            html_part->end();
+
+            KIO::NetAccess::removeTempFile(tmpFile);
+        }
+        else
+        {
+            kdDebug(23100) << KIO::NetAccess::lastErrorString() << endl;
+        }
+
+        // The part is handed to the search manager even when the download
+        // failed; the anchor lookup below then runs on an empty part.
+        search_manager_->addHtmlPart(url_string, html_part);
+    }
+
+    if(hasAnchor(html_part, linkStatus()->absoluteUrl().ref()))
+    {
+        linkstatus_->setStatusText("OK");
+        linkstatus_->setStatus(LinkStatus::SUCCESSFULL);
+    }
+    else
+    {
+        linkstatus_->setErrorOccurred(true);
+        linkstatus_->setError(i18n( "Link destination not found."
));
+        linkstatus_->setStatus(LinkStatus::BROKEN);
+    }
+
+    finnish();
+}
+
+/**
+ * Checks the fragment against an already fetched parent document, building
+ * and caching a KHTMLPart from the parent's stored HTML when needed.
+ *
+ * @param linkstatus_parent link status holding the document; must be non-null.
+ */
+void LinkChecker::checkRef(LinkStatus const* linkstatus_parent)
+{
+    Q_ASSERT(search_manager_);
+
+    QString url_string = linkstatus_parent->absoluteUrl().url();
+    KHTMLPart* html_part = search_manager_->htmlPart(url_string);
+    if(!html_part)
+    {
+        kdDebug() << "new KHTMLPart: " +  url_string << endl;
+
+        html_part = new KHTMLPart();
+        html_part->setOnlyLocalReferences(true);
+
+        html_part->begin();
+        html_part->write(linkstatus_parent->docHtml());
+        html_part->end();
+
+        search_manager_->addHtmlPart(url_string, html_part);
+    }
+
+    if(hasAnchor(html_part, linkStatus()->absoluteUrl().ref()))
+    {
+        linkstatus_->setStatusText("OK");
+        linkstatus_->setStatus(LinkStatus::SUCCESSFULL);
+    }
+    else
+    {
+        linkstatus_->setErrorOccurred(true);
+        linkstatus_->setError(i18n( "Link destination not found." ));
+        linkstatus_->setStatus(LinkStatus::BROKEN);
+    }
+
+    finnish();
+}
+
+/**
+ * Tells whether the document held by @p html_part contains @p anchor.
+ *
+ * The anchors collection is searched by name first; if nothing is found the
+ * document is searched for an element with that id.
+ *
+ * @return true if a matching node exists; false otherwise or when
+ *         @p html_part is null.
+ */
+bool LinkChecker::hasAnchor(KHTMLPart* html_part, QString const& anchor)
+{
+    // public static helper: be defensive about the part it is handed
+    if(!html_part)
+        return false;
+
+    DOM::HTMLDocument htmlDocument = html_part->htmlDocument();
+    DOM::HTMLCollection anchors = htmlDocument.anchors();
+
+    DOM::DOMString name_ref(anchor);
+    Q_ASSERT(!name_ref.isNull());
+
+    DOM::Node node = anchors.namedItem(name_ref);
+    if(node.isNull())
+    {
+        node = htmlDocument.getElementById(name_ref);
+    }
+
+    return !node.isNull();
+}
+
+/**
+ * Quietly kills the running transfer job, if any.
+ * t_job_ is cleared and the job disconnected from this object before the
+ * kill, so none of the slots above is invoked for it afterwards.
+ */
+void LinkChecker::killJob()
+{
+    if(!t_job_)
+        return;
+
+    KIO::TransferJob* aux = t_job_;
+    t_job_ = 0;
+    aux->disconnect(this);
+    aux->kill(true); // quietly
+}
+
+/**
+ * Maps the first digit of the stored HTTP status code to a LinkStatus::Status
+ * (2 -> SUCCESSFULL, 3 -> HTTP_REDIRECTION, 4 -> HTTP_CLIENT_ERROR,
+ * 5 -> HTTP_SERVER_ERROR, anything else -> UNDETERMINED).
+ */
+LinkStatus::Status LinkChecker::getHttpStatus() const
+{
+    QString status_code = QString::number(linkstatus_->httpHeader().statusCode());
+
+    if(status_code[0] == '2')
+        return LinkStatus::SUCCESSFULL;
+    else if(status_code[0] == '3')
+        return LinkStatus::HTTP_REDIRECTION;
+    else if(status_code[0] == '4')
+        return LinkStatus::HTTP_CLIENT_ERROR;
+    else if(status_code[0] == '5')
+        return LinkStatus::HTTP_SERVER_ERROR;
+    else
+ return LinkStatus::UNDETERMINED; +} + +#include "linkchecker.moc" diff --git a/klinkstatus/src/engine/linkchecker.h b/klinkstatus/src/engine/linkchecker.h new file mode 100644 index 00000000..a992e5fd --- /dev/null +++ b/klinkstatus/src/engine/linkchecker.h @@ -0,0 +1,128 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * moura@kdewebdev.org * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*
+ ***************************************************************************/
+#ifndef LINKCHECKER_H
+#define LINKCHECKER_H
+
+#include <qobject.h>
+#include <qthread.h>
+#include <qstring.h>
+
+#include <kio/jobclasses.h>
+class KHTMLPart;
+
+#include "../parser/http.h"
+#include "linkstatus.h"
+class SearchManager;
+
+#include <iostream>
+using namespace std;
+
+/**
+Checks a single link: starts the transfer (or anchor lookup) in check() and
+reports the outcome through the transactionFinished() signal.
+
+@author Paulo Moura Guedes
+*/
+class LinkChecker : public QObject
+{
+    Q_OBJECT
+public:
+    /**
+     * @param linkstatus link to check.
+     * @param time_out   timeout in seconds.
+     */
+    LinkChecker(LinkStatus* linkstatus, int time_out = 50,
+                QObject *parent = 0, const char *name = 0);
+    ~LinkChecker();
+
+    //virtual void run();
+    /** Starts the check. */
+    void check();
+    void setSearchManager(SearchManager* search_manager);
+
+    /** The link this checker works on. */
+    LinkStatus const* linkStatus() const;
+
+    /** Whether the document in @p html_part contains the given anchor. */
+    static bool hasAnchor(KHTMLPart* html_part, QString const& anchor);
+
+signals:
+
+    /** Emitted by finnish() once the check is complete. */
+    void transactionFinished(const LinkStatus * linkstatus,
+                             LinkChecker * checker);
+    /** Emitted by slotResult() when the job's result is being processed. */
+    void jobFinnished(LinkChecker * checker);
+
+protected slots:
+
+    void slotData(KIO::Job *, const QByteArray &data);
+    void slotRedirection (KIO::Job *, const KURL &url);
+    void slotMimetype(KIO::Job *, const QString &type);
+    void slotResult(KIO::Job* job);
+    void slotTimeOut();
+
+protected:
+
+    void finnish();
+    HttpResponseHeader getHttpHeader(KIO::Job* job, bool remember_check = true);
+    void checkRef(); // #...
+
+private:
+
+    LinkStatus::Status getHttpStatus() const;
+    void checkRef(LinkStatus const* linkstatus_parent);
+    void checkRef(KURL const& url);
+    void killJob();
+    /**
+     * @param url
+     * @return false if the redirection was already checked by the search manager
+     */
+    bool processRedirection(KURL const& url);
+
+    void findDocumentCharset(QString const& data);
+
+private:
+
+    SearchManager* search_manager_;
+    LinkStatus* const linkstatus_;
+    KIO::TransferJob* t_job_;
+    int time_out_;
+    LinkChecker* checker_;
+    QString document_charset_;
+/* A redirection has happened, with the current URL.
Several redirections
+    can happen until the final URL is reached.*/
+    bool redirection_;
+    KURL redirection_url_;
+    QString doc_html_;
+    bool header_checked_;
+    bool finnished_;
+    bool parsing_;
+
+    /**
+     * Whether the charset of the document is already checked.
+     * (e.g. <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>)
+     */
+    bool is_charset_checked_;
+    /**
+     * Whether the page defines the encoding (latin1, utf8, etc).
+     * According to the spec (http://www.w3.org/TR/html4/charset.html),
+     * the server response is checked first and then the info in the html meta element.
+     */
+    bool has_defined_charset_;
+
+    static int count_; // debug attribute that counts how many links were checked
+};
+
+inline LinkStatus const* LinkChecker::linkStatus() const
+{
+    return linkstatus_;
+}
+
+
+#endif
diff --git a/klinkstatus/src/engine/linkfilter.cpp b/klinkstatus/src/engine/linkfilter.cpp
new file mode 100644
index 00000000..4d15f2e6
--- /dev/null
+++ b/klinkstatus/src/engine/linkfilter.cpp
@@ -0,0 +1,46 @@
+/***************************************************************************
+ * Copyright (C) 2004 by Paulo Moura Guedes *
+ * moura@kdewebdev.org *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* + ***************************************************************************/ +#include "linkfilter.h" + +#include "linkstatus.h" + + +LinkMatcher::LinkMatcher(QString const& text, ResultView::Status status) + : m_text(text), m_status(status) +{ +} + +LinkMatcher::~LinkMatcher() +{ +} + +bool LinkMatcher::matches(LinkStatus const& link ) const +{ +/* kdDebug() << link.absoluteUrl().url() << endl; + kdDebug() << link.label() << endl; + kdDebug() << link.absoluteUrl().url().contains(m_text) << endl; + kdDebug() << link.label().contains(m_text) << endl; + */ + return (link.absoluteUrl().url().contains(m_text, false) || link.label().contains(m_text, false)) && + ResultView::displayableWithStatus(&link, m_status); +} + + + diff --git a/klinkstatus/src/engine/linkfilter.h b/klinkstatus/src/engine/linkfilter.h new file mode 100644 index 00000000..84da16cb --- /dev/null +++ b/klinkstatus/src/engine/linkfilter.h @@ -0,0 +1,49 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * moura@kdewebdev.org * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*
+ ***************************************************************************/
+#ifndef LINKFILTER_H
+#define LINKFILTER_H
+
+#include "../ui/resultview.h"
+
+/**
+    Filter made of a search text and a ResultView status; matches() tests a
+    LinkStatus against both.
+
+    @author Paulo Moura Guedes <moura@kdewebdev.org>
+*/
+class LinkMatcher
+{
+public:
+    LinkMatcher(QString const& text, ResultView::Status status);
+    ~LinkMatcher();
+
+    /** Whether @p link satisfies both the text and the status criterion. */
+    bool matches(LinkStatus const& link) const;
+
+    void setText(const QString& text) { m_text = text; }
+    QString text() const { return m_text; }
+
+    void setStatus(ResultView::Status status) { m_status = status; }
+    ResultView::Status status() const { return m_status; }
+
+    /** True when the text is empty and the status is ResultView::none. */
+    bool nullFilter() const { return m_text.isEmpty() && m_status == ResultView::none; }
+
+private:
+    QString m_text;              // text searched for in URL and label
+    ResultView::Status m_status; // status passed to ResultView::displayableWithStatus()
+};
+
+#endif
diff --git a/klinkstatus/src/engine/linkstatus.cpp b/klinkstatus/src/engine/linkstatus.cpp
new file mode 100644
index 00000000..c8b359ed
--- /dev/null
+++ b/klinkstatus/src/engine/linkstatus.cpp
@@ -0,0 +1,214 @@
+/***************************************************************************
+ * Copyright (C) 2004 by Paulo Moura Guedes *
+ * moura@kdewebdev.org *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* + ***************************************************************************/ + +#include "linkstatus.h" +#include "../parser/node.h" +#include "../ui/treeview.h" + +#include <klocale.h> +#include <kcharsets.h> + +#include <qdom.h> + + +LinkStatus::~LinkStatus() +{ + //kdDebug(23100) << "|"; + + for(uint i = 0; i != children_nodes_.size(); ++i) + { + if(children_nodes_[i]) + { + delete children_nodes_[i]; + children_nodes_[i] = 0; + } + } + + children_nodes_.clear(); + + if(isRedirection()) + { + if(redirection_) + { + delete redirection_; + redirection_ = 0; + } + } +} + +void LinkStatus::reset() +{ + depth_ = -1; + external_domain_depth_ = -1; + is_root_ = false; + error_occurred_ = false; + is_redirection_ = false; + checked_ = false; + only_check_header_ = true; + malformed_ = false; + Q_ASSERT(!node_); + has_base_URI_ = false; + label_ = ""; + absolute_url_ = ""; + doc_html_ = ""; + http_header_ = HttpResponseHeader(); + error_ = ""; + + for(uint i = 0; i != children_nodes_.size(); ++i) + { + if(children_nodes_[i]) + { + delete children_nodes_[i]; + children_nodes_[i] = 0; + } + } + + children_nodes_.clear(); + + if(isRedirection()) + { + if(redirection_) + { + delete redirection_; + redirection_ = 0; + } + } + Q_ASSERT(!parent_); + base_URI_ = ""; +} + +QString const LinkStatus::toString() const +{ + QString aux; + + if(!is_root_) + { + Q_ASSERT(parent_); + aux += i18n( "Parent: %1" ).arg( parent()->absoluteUrl().prettyURL() ) + "\n"; + } + Q_ASSERT(!original_url_.isNull()); + + aux += i18n( "URL: %1" ).arg( absoluteUrl().prettyURL() ) + "\n"; + aux += i18n( "Original URL: %1" ).arg( originalUrl() ) + "\n"; + if(node()) + aux += i18n( "Node: %1" ).arg( node()->content() ) + "\n"; + + return aux; +} + + +LinkStatus* LinkStatus::lastRedirection(LinkStatus* ls) +{ + if(ls->isRedirection()) + if(ls->redirection()) + return lastRedirection(ls->redirection()); + else + return ls; + else + return ls; +} + +void LinkStatus::loadNode() +{ + Q_ASSERT(node_); + + 
setOriginalUrl(node_->url()); + setLabel(node_->linkLabel()); + + if(malformed()) + { + setErrorOccurred(true); + setError(i18n( "Malformed" )); + setStatus(LinkStatus::MALFORMED); + kdDebug(23100) << "Malformed:" << endl; + kdDebug(23100) << "Node: " << node()->content() << endl; + //kdDebug(23100) << toString() << endl; // probable segfault + } +} + +bool LinkStatus::malformed() const // don't inline please (#include "node.h") +{ + return (malformed_ || node_->malformed()); +} + +void LinkStatus::setChildrenNodes(vector<Node*> const& nodes) // don't inline please (#include "node.h") +{ + children_nodes_.reserve(nodes.size()); + children_nodes_ = nodes; +} + +void LinkStatus::setMalformed(bool flag) +{ + malformed_ = flag; + if(flag) + { + setErrorOccurred(true); + setError(i18n( "Malformed" )); + setStatus(LinkStatus::MALFORMED); + kdDebug(23100) << "Malformed!" << endl; + kdDebug(23100) << node()->content() << endl; + //kdDebug(23100) << toString() << endl; // probable segfault + } + else if(error() == i18n( "Malformed" )) + { + setErrorOccurred(false); + setError(""); + setStatus(LinkStatus::UNDETERMINED); + } +} + +void LinkStatus::save(QDomElement& element) const +{ + QDomElement child_element = element.ownerDocument().createElement("link"); + + // <url> + QDomElement tmp_1 = element.ownerDocument().createElement("url"); + tmp_1.appendChild(element.ownerDocument().createTextNode(absoluteUrl().prettyURL())); + child_element.appendChild(tmp_1); + + // <status> + tmp_1 = element.ownerDocument().createElement("status"); + tmp_1.setAttribute("broken", + ResultView::displayableWithStatus(this, ResultView::bad) ? 
+ "true" : "false"); + tmp_1.appendChild(element.ownerDocument().createTextNode(statusText())); + child_element.appendChild(tmp_1); + + // <label> + tmp_1 = element.ownerDocument().createElement("label"); + tmp_1.appendChild(element.ownerDocument().createTextNode(KCharsets::resolveEntities(label()))); + child_element.appendChild(tmp_1); + + // <referers> + tmp_1 = element.ownerDocument().createElement("referrers"); + + for(QValueVector<KURL>::const_iterator it = referrers_.begin(); it != referrers_.end(); ++it) + { + QDomElement tmp_2 = element.ownerDocument().createElement("url"); + tmp_2.appendChild(element.ownerDocument().createTextNode(it->prettyURL())); + + tmp_1.appendChild(tmp_2); + } + Q_ASSERT(!referrers_.isEmpty()); + child_element.appendChild(tmp_1); + + element.appendChild(child_element); +} + diff --git a/klinkstatus/src/engine/linkstatus.h b/klinkstatus/src/engine/linkstatus.h new file mode 100644 index 00000000..e7567460 --- /dev/null +++ b/klinkstatus/src/engine/linkstatus.h @@ -0,0 +1,187 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * moura@kdewebdev.org * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + ***************************************************************************/

#ifndef LINKSTATUS_H
#define LINKSTATUS_H

#include "../parser/http.h"
#include "../utils/mvector.h"

#include <kurl.h>
#include <klocale.h>
#include <kdebug.h>
class TreeView;
class TreeViewItem;

#include <qstring.h>
#include <qobject.h>
#include <qvaluevector.h>
class QDomElement;

#include <vector>
#include <iostream>

using namespace std;


class Node;

/**
 * Record describing one link: its URLs, its place in the link tree (parent,
 * depth, children nodes, redirection target) and the outcome of checking it
 * (status, HTTP header, error text).
 *
 * The destructor deletes the nodes held in children_nodes_ and, when
 * isRedirection() is true, the object pointed to by redirection_.
 *
 * NOTE(review): the class holds owning raw pointers but declares no copy
 * constructor or assignment operator; copying an instance would presumably
 * end in a double delete. Confirm that no caller copies it.
 */
class LinkStatus
{
public:

    /** Outcome of checking the link. */
    enum Status {
        UNDETERMINED,
        SUCCESSFULL,
        BROKEN,
        HTTP_REDIRECTION,
        HTTP_CLIENT_ERROR,
        HTTP_SERVER_ERROR,
        TIMEOUT,
        NOT_SUPPORTED,
        MALFORMED
    };

    LinkStatus();
    LinkStatus(KURL const& absolute_url);
    /** Builds a child link from @p node; depth and root URL are taken from @p parent. */
    LinkStatus(Node* node, LinkStatus* parent);
    ~LinkStatus();

    /** Appends a <link> element describing this link to @p element. */
    void save(QDomElement& element) const;

    // ---- setters ----
    void reset();
    void setRootUrl(KURL const& url);
    void setStatus(Status status);
    void setDepth(uint depth);
    void setParent(LinkStatus* parent); // also registers the parent's URL as a referrer
    void setOriginalUrl(QString const& url_original);
    void setLabel(QString const& label);
    void setAbsoluteUrl(KURL const& url_absoluto);
    void setDocHtml(QString const& doc_html);
    void setHttpHeader(HttpResponseHeader const& cabecalho_http);
    void setStatusText(QString const& statusText); // FIXME Legacy. This should be eliminated in favor of LinkStatus::Status
    void setError(QString const& error);
    void setIsRoot(bool flag);
    void setErrorOccurred(bool houve_error);
    void setIsRedirection(bool e_redirection);
    void setRedirection(LinkStatus* redirection);
    void setNode(Node* node);
    void setChildrenNodes(vector<Node*> const& nodes);
    void addChildNode(Node* node);
    void reserveMemoryForChildrenNodes(int n);
    void setChecked(bool flag);
    void setExternalDomainDepth(int p);
    void setOnlyCheckHeader(bool flag);
    void setMalformed(bool flag = true);
    void setHasBaseURI(bool flag = true);
    void setHasHtmlDocTitle(bool flag = true);
    void setBaseURI(KURL const& base_url);
    void setHtmlDocTitle(QString const& title);
    void setIgnored(bool flag = true);
    void setMimeType(QString const& mimetype);
    void setIsErrorPage(bool flag);
    void setIsLocalRestrict(bool flag);
    void setTreeViewItem(TreeViewItem* tree_view_item);
    void addReferrer(KURL const& url);

    // ---- getters ----
    KURL const& rootUrl() const;
    Status const& status() const;
    uint depth() const;
    bool local() const; // linkstatus.paradigma.co.pt == paradigma.co.pt
    bool isLocalRestrict() const; // linkstatus.paradigma.co.pt != paradigma.co.pt
    LinkStatus const* parent() const;
    QString const& originalUrl() const;
    QString const& label() const;
    KURL const& absoluteUrl() const;
    QString const& docHtml() const;
    HttpResponseHeader const& httpHeader() const;
    HttpResponseHeader& httpHeader();
    QString statusText() const; // FIXME Legacy. This should be eliminated in favor of LinkStatus::Status
    QString const& error() const;
    bool isRoot() const;
    bool errorOccurred() const;
    bool isRedirection() const;
    LinkStatus* redirection() const;
    Node* node() const;
    vector<Node*> const& childrenNodes() const;
    QString const toString() const;
    bool checked() const;
    int externalDomainDepth() const;
    bool onlyCheckHeader() const;
    bool malformed() const;
    bool hasBaseURI() const;
    bool hasHtmlDocTitle() const;
    KURL const& baseURI() const;
    QString const& htmlDocTitle() const;
    bool ignored() const;
    bool redirectionExists(KURL const& url) const; // to avoid cyclic links
    QString mimeType() const;
    bool isErrorPage() const;
    TreeViewItem* treeViewItem() const;
    QValueVector<KURL> const& referrers() const;

    /** Returns the last element of the redirection chain that starts at @p ls. */
    static LinkStatus* lastRedirection(LinkStatus* ls);

private:

    /**
      Loads some attributes (original URL, label, malformed state) from the
      node this link was created from.
    */
    void loadNode();

private:

    KURL root_url_; // The URL which made the search start
    Status status_;
    int depth_;
    int external_domain_depth_; // Lets the search explore other (external) domains up to depth n
    QString original_url_;
    QString label_;
    KURL absolute_url_;
    QString doc_html_;
    HttpResponseHeader http_header_;
    QString status_text_; // FIXME Legacy. This should be eliminated in favor of LinkStatus::Status
    QString error_;
    bool is_root_;
    bool error_occurred_;
    bool is_redirection_;
    vector<Node*> children_nodes_; // deleted by the destructor
    LinkStatus* parent_;
    LinkStatus* redirection_; // deleted by the destructor when is_redirection_ is set
    bool checked_;
    bool only_check_header_;
    bool malformed_;
    Node* node_;
    bool has_base_URI_;
    bool has_html_doc_title_;
    KURL base_URI_;
    QString html_doc_title_;
    bool ignored_;
    QString mimetype_;
    bool is_error_page_;
    bool is_local_restrict_;
    TreeViewItem* tree_view_item_;
    QValueVector<KURL> referrers_;
};

#include "../parser/url.h"
#include "linkstatus_impl.h"

#endif
diff --git a/klinkstatus/src/engine/linkstatus_impl.h b/klinkstatus/src/engine/linkstatus_impl.h new file mode 100644 index 00000000..3359664c --- /dev/null +++ b/klinkstatus/src/engine/linkstatus_impl.h @@ -0,0 +1,417 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * moura@kdewebdev.org * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + ***************************************************************************/ + +inline LinkStatus::LinkStatus() + : status_(LinkStatus::UNDETERMINED), depth_(-1), external_domain_depth_(-1), is_root_(false), + error_occurred_(false), is_redirection_(false), parent_(0), redirection_(0), checked_(false), + only_check_header_(true), malformed_(false), + node_(0), has_base_URI_(false), has_html_doc_title_(false), ignored_(false), + mimetype_(""), is_error_page_(false), tree_view_item_(0) +{} + +inline LinkStatus::LinkStatus(KURL const& absolute_url) + : status_(LinkStatus::UNDETERMINED), depth_(-1), external_domain_depth_(-1), is_root_(false), + error_occurred_(false), is_redirection_(false), parent_(0), redirection_(0), checked_(false), + only_check_header_(true), malformed_(false), + node_(0), has_base_URI_(false), has_html_doc_title_(false), ignored_(false), + mimetype_(""), is_error_page_(false), tree_view_item_(0) +{ + setAbsoluteUrl(absolute_url); +} + +inline LinkStatus::LinkStatus(Node* node, LinkStatus* parent) + : status_(LinkStatus::UNDETERMINED), depth_(-1), external_domain_depth_(-1), is_root_(false), + error_occurred_(false), is_redirection_(false), parent_(0), redirection_(0), checked_(false), + only_check_header_(true), malformed_(false), + node_(node), has_base_URI_(false), has_html_doc_title_(false), ignored_(false), + mimetype_(""), is_error_page_(false), tree_view_item_(0) +{ + loadNode(); + + setDepth(parent->depth() + 1); + setParent(parent); + setRootUrl(parent->rootUrl()); +} + +inline void LinkStatus::setRootUrl(KURL const& url) +{ + root_url_ = url; +} + +inline void LinkStatus::setStatus(Status status) +{ + status_ = status; +} + +inline void LinkStatus::setDepth(uint depth) +{ + depth_ = depth; +} + +inline void LinkStatus::setParent(LinkStatus* parent) +{ + Q_ASSERT(parent); + + parent_ = parent; + addReferrer(parent->absoluteUrl()); +} + +inline void LinkStatus::setAbsoluteUrl(KURL const& url_absoluto) +{ + absolute_url_ = 
url_absoluto; +} + +inline void LinkStatus::setOriginalUrl(QString const& url_original) +{ + original_url_ = url_original; +} + +inline void LinkStatus::setLabel(QString const& label) +{ + label_ = label; +} + +inline void LinkStatus::setDocHtml(QString const& doc_html) +{ + Q_ASSERT(!doc_html.isEmpty()); + doc_html_ = doc_html; +} + +inline void LinkStatus::setHttpHeader(HttpResponseHeader const& cabecalho_http) +{ + http_header_ = cabecalho_http; +} + +inline void LinkStatus::setStatusText(QString const& status) +{ + Q_ASSERT(!status.isEmpty()); + status_text_ = status; +} + +inline void LinkStatus::setError(QString const& error) +{ + Q_ASSERT(!error.isEmpty()); + error_ = error; +} + +inline void LinkStatus::setErrorOccurred(bool houve_error) +{ + error_occurred_ = houve_error; +} + +inline void LinkStatus::setIsRoot(bool flag) +{ + is_root_ = flag; + label_ = i18n("ROOT"); +} + +inline void LinkStatus::setRedirection(LinkStatus* redirection) +{ + Q_ASSERT(redirection != NULL); + Q_ASSERT(isRedirection()); + redirection_ = redirection; +} + +inline void LinkStatus::setIsRedirection(bool e_redirection) +{ + is_redirection_ = e_redirection; +} + +inline void LinkStatus::addChildNode(Node* node) +{ + children_nodes_.push_back(node); +} + +inline void LinkStatus::reserveMemoryForChildrenNodes(int n) +{ + Q_ASSERT(n > 0); + children_nodes_.reserve(n); +} + +inline void LinkStatus::setChecked(bool flag) +{ + checked_ = flag; +} + +inline void LinkStatus::setExternalDomainDepth(int p) +{ + Q_ASSERT(p >= -1); + external_domain_depth_ = p; +} + +inline void LinkStatus::setOnlyCheckHeader(bool flag) +{ + only_check_header_= flag; +} + +inline void LinkStatus::setHasBaseURI(bool flag) +{ + has_base_URI_ = flag; +} + +inline void LinkStatus::setHasHtmlDocTitle(bool flag) +{ + has_html_doc_title_ = flag; +} + +inline void LinkStatus::setBaseURI(KURL const& base_url) +{ + if(!base_url.isValid()) + { + kdWarning(23100) << "base url not valid: " << endl + << "parent: " << 
parent()->absoluteUrl().prettyURL() << endl + << "url: " << absoluteUrl().prettyURL() << endl + << "base url resolved: " << base_url.prettyURL() << endl; + } + + Q_ASSERT(base_url.isValid()); + has_base_URI_ = true; + base_URI_ = base_url; +} + +inline void LinkStatus::setHtmlDocTitle(QString const& title) +{ + if(title.isNull() || title.isEmpty()) + { + kdError(23100) << "HTML doc title is null or empty!" << endl + << toString() << endl; + } + Q_ASSERT(!title.isNull() && !title.isEmpty()); + + has_html_doc_title_ = true; + html_doc_title_ = title; +} + +inline void LinkStatus::setIgnored(bool flag) +{ + ignored_ = flag; +} + +inline void LinkStatus::setMimeType(QString const& mimetype) +{ + Q_ASSERT(!mimetype.isNull() && !mimetype.isEmpty()); + mimetype_ = mimetype; +} + +inline void LinkStatus::setIsErrorPage(bool flag) +{ + is_error_page_ = flag; +} + +inline void LinkStatus::setIsLocalRestrict(bool flag) +{ + is_local_restrict_ = flag; +} + +inline void LinkStatus::setTreeViewItem(TreeViewItem* tree_view_item) +{ + Q_ASSERT(tree_view_item); + tree_view_item_ = tree_view_item; +} + +inline void LinkStatus::addReferrer(KURL const& url) +{ + Q_ASSERT(url.isValid()); + + referrers_.push_back(url); +} + + + + +inline KURL const& LinkStatus::rootUrl() const +{ + return root_url_; +} + +inline LinkStatus::Status const& LinkStatus::status() const +{ + return status_; +} + +inline uint LinkStatus::depth() const +{ + return depth_; +} + +inline bool LinkStatus::local() const +{ + return external_domain_depth_ == -1; +} + +inline bool LinkStatus::isLocalRestrict() const +{ + return is_local_restrict_; +} + +inline LinkStatus const* LinkStatus::parent() const +{ + return parent_; +} + +inline QString const& LinkStatus::originalUrl() const +{ + return original_url_; +} + +inline QString const& LinkStatus::label() const +{ + return label_; +} + +inline KURL const& LinkStatus::absoluteUrl() const +{ + return absolute_url_; +} + +inline QString const& LinkStatus::docHtml() 
const +{ + return doc_html_; +} + +inline HttpResponseHeader const& LinkStatus::httpHeader() const +{ + return http_header_; +} + +inline HttpResponseHeader& LinkStatus::httpHeader() +{ + return http_header_; +} + +inline QString LinkStatus::statusText() const +{ + if(errorOccurred()) + return error(); + else if(!absoluteUrl().protocol().startsWith("http")) + return status_text_; + else + { + QString string_code = QString::number(httpHeader().statusCode()); + if(absoluteUrl().hasRef()) // ref URL + return status_text_; + else if(string_code == "200"/* or string_code == "304"*/) + return "OK"; + else + return string_code; + } +} + +inline QString const& LinkStatus::error() const +{ + return error_; +} + +inline bool LinkStatus::isRoot() const +{ + return is_root_; +} + +inline bool LinkStatus::errorOccurred() const +{ + return error_occurred_; +} + +inline bool LinkStatus::isRedirection() const +{ + return is_redirection_; +} + +inline LinkStatus* LinkStatus::redirection() const +{ + Q_ASSERT(isRedirection()); + + return redirection_; +} + +inline Node* LinkStatus::node() const +{ + //Q_ASSERT(node_); + return node_; +} + +inline vector<Node*> const& LinkStatus::childrenNodes() const +{ + return children_nodes_; +} + +inline bool LinkStatus::checked() const +{ + return checked_; +} + +inline int LinkStatus::externalDomainDepth() const +{ + return external_domain_depth_; +} + +inline bool LinkStatus::onlyCheckHeader() const +{ + return only_check_header_; +} + +inline bool LinkStatus::hasBaseURI() const +{ + return has_base_URI_; +} + +inline bool LinkStatus::hasHtmlDocTitle() const +{ + return has_html_doc_title_; +} + +inline KURL const& LinkStatus::baseURI() const +{ + Q_ASSERT(hasBaseURI()); + return base_URI_; +} + +inline QString const& LinkStatus::htmlDocTitle() const +{ + Q_ASSERT(has_html_doc_title_); + return html_doc_title_; +} + +inline bool LinkStatus::ignored() const +{ + return ignored_; +} + +inline QString LinkStatus::mimeType() const +{ + 
Q_ASSERT(!mimetype_.isNull()); + return mimetype_; +} + +inline bool LinkStatus::isErrorPage() const +{ + return is_error_page_; +} + +inline TreeViewItem* LinkStatus::treeViewItem() const +{ + return tree_view_item_; +} + +inline QValueVector<KURL> const& LinkStatus::referrers() const +{ + return referrers_; +} + diff --git a/klinkstatus/src/engine/searchmanager.cpp b/klinkstatus/src/engine/searchmanager.cpp new file mode 100644 index 00000000..81562a7a --- /dev/null +++ b/klinkstatus/src/engine/searchmanager.cpp @@ -0,0 +1,916 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * moura@kdewebdev.org * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + ***************************************************************************/ + +#include <kapplication.h> +#include <kdebug.h> +#include <klocale.h> +#include <khtml_part.h> +#include <kprotocolmanager.h> + +#include <qstring.h> +#include <qvaluevector.h> +#include <qdom.h> + +#include <iostream> +#include <unistd.h> + +#include "searchmanager.h" +#include "../parser/mstring.h" +#include "../cfg/klsconfig.h" + + +SearchManager::SearchManager(int max_simultaneous_connections, int time_out, + QObject *parent, const char *name) + : QObject(parent, name), + max_simultaneous_connections_(max_simultaneous_connections), has_document_root_(false), + depth_(-1), current_depth_(0), external_domain_depth_(0), + current_node_(0), current_index_(0), links_being_checked_(0), + finished_connections_(max_simultaneous_connections_), + maximum_current_connections_(-1), general_domain_(false), + checked_general_domain_(false), time_out_(time_out), current_connections_(0), + send_identification_(true), canceled_(false), searching_(false), checked_links_(0), ignored_links_(0), + check_parent_dirs_(true), check_external_links_(true), check_regular_expressions_(false), + number_of_level_links_(0), number_of_links_to_check_(0) +{ + root_.setIsRoot(true); + + if (KLSConfig::userAgent().isEmpty()) { + KLSConfig::setUserAgent(KProtocolManager::defaultUserAgent()); + } + user_agent_ = KLSConfig::userAgent(); +} + +void SearchManager::reset() +{ + kdDebug(23100) << "SearchManager::reset()" << endl; + + //Q_ASSERT(not links_being_checked_); + + root_.reset(); + cleanItems(); + depth_ = -1; + current_depth_ = 0; + current_node_ = 0; + current_index_ = 0; + finished_connections_ = max_simultaneous_connections_; + domain_ = ""; + maximum_current_connections_ = -1; + general_domain_ = false; + checked_general_domain_ = false; + check_regular_expressions_ = false; + current_connections_ = 0; + canceled_ = false; + searching_ = false; + checked_links_ = 0; + if(KLSConfig::userAgent().isEmpty()) { 
+ KLSConfig::setUserAgent(KProtocolManager::defaultUserAgent()); + } + user_agent_ = KLSConfig::userAgent(); + + removeHtmlParts(); +} + +SearchManager::~SearchManager() +{ + reset(); +} + +void SearchManager::cleanItems() +{ + for(uint i = 0; i != search_results_.size(); ++i) + { + for(uint j = 0; j != search_results_[i].size() ; ++j) + { + for(uint l = 0; l != (search_results_[i])[j].size(); ++l) + { + if(((search_results_[i])[j])[l] != 0) + { + delete ((search_results_[i])[j])[l]; + ((search_results_[i])[j])[l] = 0; + } + else + kdDebug(23100) << "LinkStatus NULL!!" << endl; + } + search_results_[i][j].clear(); + } + search_results_[i].clear(); + } + search_results_.clear(); + kdDebug(23100) << endl; +} + +void SearchManager::startSearch(KURL const& root, SearchMode const& modo) +{ + canceled_ = false; + + //time_.restart(); + time_.start(); + + Q_ASSERT(root.isValid()); + //Q_ASSERT(root.protocol() == "http" || root.protocol() == "https"); + + if(root.hasHost() && (domain_.isNull() || domain_.isEmpty())) + { + setDomain(root.host() + root.directory()); + kdDebug(23100) << "Domain: " << domain_ << endl; + } + root_.setIsRoot(true); + root_.setDepth(0); + root_.setOriginalUrl(root.prettyURL()); + root_.setAbsoluteUrl(root); + root_.setOnlyCheckHeader(false); + root_.setRootUrl(root); + + search_mode_ = modo; + if(modo == depth) + Q_ASSERT(depth_ != -1); + else if(modo == domain) + Q_ASSERT(depth_ == -1); + else + Q_ASSERT(depth_ != -1); + + searching_ = true; + + //Q_ASSERT(domain_ != QString::null); + checkRoot(); +} + +void SearchManager::resume() +{ + searching_ = true; + canceled_ = false; + continueSearch(); +} + +void SearchManager::finnish() +{ + searching_ = false; + while(links_being_checked_) + { + kdDebug(23100) << "links_being_checked_: " << links_being_checked_ << endl; + sleep(1); + } + emit signalSearchFinished(); +} + +void SearchManager::pause() +{ + searching_ = false; + while(links_being_checked_) + { + kdDebug(23100) << "links_being_checked_: 
" << links_being_checked_ << endl; + sleep(1); + } + emit signalSearchPaused(); +} + +void SearchManager::cancelSearch() +{ + canceled_ = true; +} + +void SearchManager::checkRoot() +{ + LinkChecker* checker = new LinkChecker(&root_, time_out_, this, "link_checker"); + checker->setSearchManager(this); + + connect(checker, SIGNAL(transactionFinished(const LinkStatus *, LinkChecker *)), + this, SLOT(slotRootChecked(const LinkStatus *, LinkChecker *))); + /* + connect(checker, SIGNAL(jobFinnished(LinkChecker *)), + this, SLOT(slotLinkCheckerFinnished(LinkChecker *))); + */ + checker->check(); +} + +void SearchManager::slotRootChecked(const LinkStatus * link, LinkChecker * checker) +{ + kdDebug(23100) << "SearchManager::slotRootChecked:" << endl; + kdDebug(23100) << link->absoluteUrl().url() << " -> " << + LinkStatus::lastRedirection(&root_)->absoluteUrl().url() << endl; + + Q_ASSERT(checked_links_ == 0); + Q_ASSERT(search_results_.size() == 0); + + ++checked_links_; + //kdDebug(23100) << "++checked_links_: SearchManager::slotRootChecked" << endl; + emit signalRootChecked(link, checker); + + if(search_mode_ != depth || depth_ > 0) + { + current_depth_ = 1; + + vector<LinkStatus*> no = children(LinkStatus::lastRedirection(&root_)); + + emit signalLinksToCheckTotalSteps(no.size()); + + vector< vector<LinkStatus*> > nivel; + nivel.push_back(no); + + search_results_.push_back(nivel); + + if(search_results_.size() != 1) + { + kdDebug(23100) << "search_results_.size() != 1:" << endl; + kdDebug(23100) << "size: " << search_results_.size() << endl; + } + Q_ASSERT(search_results_.size() == 1); + + if(no.size() > 0) + { + startSearch(); + } + else + { + kdDebug(23100) << "SearchManager::slotRootChecked#1" << endl; + finnish(); + } + } + + else + { + Q_ASSERT(search_results_.size() == 0); + kdDebug(23100) << "SearchManager::slotRootChecked#2" << endl; + finnish(); + } + + delete checker; + checker = 0; +} + +vector<LinkStatus*> SearchManager::children(LinkStatus* link) +{ + 
vector<LinkStatus*> children; + + if(!link || link->absoluteUrl().hasRef()) + return children; + + vector<Node*> const& nodes = link->childrenNodes(); + + int count = 0; + for(uint i = 0; i != nodes.size(); ++i) + { + ++count; + + Node* node = nodes[i]; + KURL url; + if(node->url().isEmpty()) + url = ""; + else + url = Url::normalizeUrl(node->url(), *link, documentRoot().path()); + + if( (node->isLink() && + checkable(url, *link) && + !Url::existUrl(url, children) && + !node->url().isEmpty()) + || + node->malformed() ) + { + LinkStatus* ls = new LinkStatus(node, link); + ls->setAbsoluteUrl(url); + + if(localDomain(ls->absoluteUrl())) + ls->setExternalDomainDepth(-1); + else + ls->setExternalDomainDepth(link->externalDomainDepth() + 1); + + //ls->setIsLocalRestrict(localDomain(url)); + ls->setIsLocalRestrict(ls->local()); // @todo clean this nonsense + + if(!validUrl(url)) { + ls->setMalformed(true); + ls->setErrorOccurred(true); + } + + ls->setOnlyCheckHeader(onlyCheckHeader(ls)); + + if(link->externalDomainDepth() > external_domain_depth_) + { + kdDebug(23100) << "link->externalDomainDepth() > external_domain_depth_: " + << link->externalDomainDepth() << endl; + kdDebug(23100) << "link: " << endl << link->toString() << endl; + kdDebug(23100) << "child: " << endl << ls->toString() << endl; + } + Q_ASSERT(link->externalDomainDepth() <= external_domain_depth_); + + children.push_back(ls); + } + if(count == 50) + { + kapp->processEvents(); + count = 0; + } + } + + return children; +} + +bool SearchManager::existUrl(KURL const& url, KURL const& url_parent) const +{ + if(url.prettyURL().isEmpty() || root_.originalUrl() == url.prettyURL()) + return true; + + for(uint i = 0; i != search_results_.size(); ++i) + for(uint j = 0; j != search_results_[i].size(); ++j) + for(uint l = 0; l != (search_results_[i])[j].size(); ++l) + { + LinkStatus* tmp = search_results_[i][j][l]; + Q_ASSERT(tmp); + if(tmp->absoluteUrl() == url) + { // URL exists + QValueVector<KURL> 
referrers(tmp->referrers()); + + // Add new referrer + for(uint i = 0; i != referrers.size(); ++i) + { + if(referrers[i] == url_parent) + return true; + } + tmp->addReferrer(url_parent); + + return true; + } + } + + return false; +} + +LinkStatus const* SearchManager::linkStatus(QString const& s_url) const +{ + Q_ASSERT(!s_url.isEmpty()); + + if(root_.absoluteUrl().url() == s_url) + return &root_; + + int count = 0; + for(uint i = 0; i != search_results_.size(); ++i) + for(uint j = 0; j != search_results_[i].size(); ++j) + for(uint l = 0; l != (search_results_[i])[j].size(); ++l) + { + ++count; + + LinkStatus* ls = search_results_[i][j][l]; + Q_ASSERT(ls); + if(ls->absoluteUrl().url() == s_url && ls->checked()) + return ls; + + if(count == 50) + { + count = 0; + kapp->processEvents(); + } + + } + + return 0; +} + + +void SearchManager::startSearch() +{ + Q_ASSERT(current_depth_ == 1); + Q_ASSERT(search_results_[current_depth_ - 1].size() == 1); + Q_ASSERT(current_node_ == 0); + + if( (int)current_depth_ <= depth_ || search_mode_ != depth ) + checkVectorLinks(nodeToAnalize()); + else + { + kdDebug(23100) << "Search Finished! 
(SearchManager::comecaPesquisa)" << endl; + finnish(); + } +} + +void SearchManager::continueSearch() +{ + Q_ASSERT(!links_being_checked_); + + vector<LinkStatus*> const& no = nodeToAnalize(); + + if((uint)current_index_ < no.size()) + checkVectorLinks(no); + + else + { + current_index_ = 0; + kdDebug(23100) << "Next node_____________________\n\n"; + ++current_node_; + if( (uint)current_node_ < (search_results_[current_depth_ - 1]).size() ) + checkVectorLinks(nodeToAnalize()); + else + { + kdDebug(23100) << "Next Level_____________________________________________________________________________________\n\n\n"; + if(search_mode_ == SearchManager::domain || + current_depth_ < depth_) + { + current_node_ = 0; + ++current_depth_; + + addLevel(); + + if( (uint)current_depth_ == search_results_.size() ) + checkVectorLinks(nodeToAnalize()); + else + { + kdDebug(23100) << "Search Finished! (SearchManager::continueSearch#1)" << endl; + finnish(); + } + } + else + { + kdDebug(23100) << "Search Finished! 
(SearchManager::continueSearch#2)" << endl; + finnish(); + } + } + } +} + +vector<LinkStatus*> const& SearchManager::nodeToAnalize() const +{ + Q_ASSERT( (uint)current_depth_ == search_results_.size() ); + Q_ASSERT( (uint)current_node_ < (search_results_[current_depth_ - 1]).size() ); + + return (search_results_[current_depth_ - 1])[current_node_]; +} + +void SearchManager::checkVectorLinks(vector<LinkStatus*> const& links) +{ + checkLinksSimultaneously(chooseLinks(links)); +} + +vector<LinkStatus*> SearchManager::chooseLinks(vector<LinkStatus*> const& links) +{ + vector<LinkStatus*> escolha; + for(int i = 0; i != max_simultaneous_connections_; ++i) + { + if((uint)current_index_ < links.size()) + escolha.push_back(links[current_index_++]); + } + return escolha; +} + +void SearchManager::checkLinksSimultaneously(vector<LinkStatus*> const& links) +{ + Q_ASSERT(finished_connections_ <= max_simultaneous_connections_); + finished_connections_ = 0; + links_being_checked_ = 0; + maximum_current_connections_ = -1; + + if(links.size() < (uint)max_simultaneous_connections_) + maximum_current_connections_ = links.size(); + else + maximum_current_connections_ = max_simultaneous_connections_; + + for(uint i = 0; i != links.size(); ++i) + { + LinkStatus* ls(links[i]); + Q_ASSERT(ls); + + QString protocol = ls->absoluteUrl().protocol(); + + ++links_being_checked_; + Q_ASSERT(links_being_checked_ <= max_simultaneous_connections_); + + if(ls->malformed()) + { + Q_ASSERT(ls->errorOccurred()); + Q_ASSERT(ls->status() == LinkStatus::MALFORMED); + + ls->setChecked(true); + slotLinkChecked(ls, 0); + } + + else if(ls->absoluteUrl().prettyURL().contains("javascript:", false)) + { + ++ignored_links_; + ls->setIgnored(true); + ls->setErrorOccurred(true); + ls->setError(i18n( "Javascript not supported" )); + ls->setStatus(LinkStatus::NOT_SUPPORTED); + ls->setChecked(true); + slotLinkChecked(ls, 0); + } + /* + else if(!(protocol == "http" || protocol == "https")) + { + ++ignored_links_; + 
ls->setIgnored(true); + ls->setErrorOccurred(true); + ls->setError(i18n("Protocol %1 not supported").arg(protocol)); + ls->setStatus(LinkStatus::MALFORMED); + ls->setChecked(true); + slotLinkChecked(ls, 0); + } + */ + else + { + LinkChecker* checker = new LinkChecker(ls, time_out_, this, "link_checker"); + checker->setSearchManager(this); + + connect(checker, SIGNAL(transactionFinished(const LinkStatus *, LinkChecker *)), + this, SLOT(slotLinkChecked(const LinkStatus *, LinkChecker *))); + /* + connect(checker, SIGNAL(jobFinnished(LinkChecker *)), + this, SLOT(slotLinkCheckerFinnished(LinkChecker *))); + */ + checker->check(); + } + } +} + +void SearchManager::slotLinkChecked(const LinkStatus * link, LinkChecker * checker) +{ + kdDebug(23100) << "SearchManager::slotLinkChecked:" << endl; +// kdDebug(23100) << link->absoluteUrl().url() << " -> " << +// LinkStatus::lastRedirection((const_cast<LinkStatus*> (link)))->absoluteUrl().url() << endl; + + Q_ASSERT(link); + emit signalLinkChecked(link, checker); + ++checked_links_; + ++finished_connections_; + --links_being_checked_; + + if(links_being_checked_ < 0) + kdDebug(23100) << link->toString() << endl; + Q_ASSERT(links_being_checked_ >= 0); + + if(canceled_ && searching_ && !links_being_checked_) + { + pause(); + } + + else if(!canceled_ && finished_connections_ == maximumCurrentConnections() ) + { + continueSearch(); + return; + } + /* + delete checker; + checker = 0; + */ +} + +void SearchManager::addLevel() +{ + search_results_.push_back(vector< vector <LinkStatus*> >()); + vector< vector <LinkStatus*> >& ultimo_nivel(search_results_[search_results_.size() - 2]); + + number_of_level_links_ = 0; + number_of_links_to_check_ = 0; + uint end = ultimo_nivel.size(); + + for(uint i = 0; i != end; ++i) // nodes + { + uint end_sub1 = ultimo_nivel[i].size(); + for(uint j = 0; j != end_sub1; ++j) // links + ++number_of_level_links_; + } + + if(number_of_level_links_) + emit 
signalAddingLevelTotalSteps(number_of_level_links_); + + for(uint i = 0; i != end; ++i) // nodes + { + uint end_sub1 = ultimo_nivel[i].size(); + for(uint j = 0; j != end_sub1; ++j) // links + { + vector <LinkStatus*> f(children( LinkStatus::lastRedirection(((ultimo_nivel[i])[j])) )); + if(f.size() != 0) + { + search_results_[search_results_.size() - 1].push_back(f); + number_of_links_to_check_ += f.size(); + } + + emit signalAddingLevelProgress(); +// kapp->processEvents(); + } + } + if( (search_results_[search_results_.size() - 1]).size() == 0 ) + search_results_.pop_back(); + else + emit signalLinksToCheckTotalSteps(number_of_links_to_check_); +} + +bool SearchManager::checkable(KURL const& url, LinkStatus const& link_parent) const +{ + if(existUrl(url, link_parent.absoluteUrl())) + return false; + + if(!checkableByDomain(url, link_parent)) + return false; + + if(!check_parent_dirs_) + { + if(Url::parentDir(root_.absoluteUrl(), url)) + return false; + } + if(!check_external_links_) + { + if(Url::externalLink(root_.absoluteUrl(), url)) + return false; + } + if(check_regular_expressions_) + { + Q_ASSERT(!reg_exp_.isEmpty()); + + if(reg_exp_.search(url.url()) != -1) + return false; + } + + //kdDebug(23100) << "url " << url.url() << " is checkable!" 
<< endl; + return true; +} + +bool SearchManager::checkableByDomain(KURL const& url, LinkStatus const& link_parent) const +{ + bool result = false; + if(localDomain(url)) + result = true; + else if( (link_parent.externalDomainDepth() + 1) < external_domain_depth_ ) + result = true; + else + result = false; + /* + if(!result) + kdDebug(23100) << "\n\nURL " << url.url() << " is not checkable by domain\n\n" << endl; + */ + return result; +} +/* +bool SearchManager::localDomain(KURL const& url) const + { + KURL url_root = root_.absoluteUrl(); + + if(url_root.protocol() != url.protocol()) + return false; + + if(url_root.hasHost()) + { + if(generalDomain()) + { + return equalHost(domain_, url.host()); + } + else + { + vector<QString> referencia = tokenizeWordsSeparatedBy(domain_, QChar('/')); + vector<QString> a_comparar = tokenizeWordsSeparatedBy(url.host() + url.directory(), QChar('/')); + + if(a_comparar.size() < referencia.size()) + return false; + else + { + for(uint i = 0; i != referencia.size(); ++i) + { + if(i == 0) + { // host, deal with specific function + if(!equalHost(referencia[i], a_comparar[i], !check_parent_dirs_)) + return false; + } + else if(referencia[i] != a_comparar[i]) + return false; + } + } + return true; + } + } + else if(checkParentDirs()) + return true; + else + return url_root.isParentOf(url); + } +*/ + +/** + The same as SearchManager::localDomain(), but only for http or https. 
+ http://linkstatus.paradigma.co.pt != http://paradigma.co.pt +*/ +/* +bool SearchManager::isLocalRestrict(KURL const& url) const + { + Q_ASSERT(url.protocol() == "http" || url.protocol() == "https"); + + KURL url_root = root_.absoluteUrl(); + + if(url_root.protocol() != url.protocol()) + return false; + + if(url_root.hasHost()) + { + vector<QString> referencia = tokenizeWordsSeparatedBy(domain_, QChar('/')); + vector<QString> a_comparar = tokenizeWordsSeparatedBy(url.host() + url.directory(), QChar('/')); + + if(a_comparar.size() < referencia.size()) + return false; + else + { + for(uint i = 0; i != referencia.size(); ++i) + { + if(i == 0) + { // host, deal with specific function + if(!equalHost(referencia[i], a_comparar[i], true)) + return false; + } + else if(referencia[i] != a_comparar[i]) + return false; + } + } + return true; + } + else + return false; + } +*/ +bool SearchManager::generalDomain() const +{ + if(checked_general_domain_) + return general_domain_; + + else + { + Q_ASSERT(!domain_.isEmpty()); + + if(!check_parent_dirs_) + return false; + + int barra = domain_.find('/'); + if(barra != -1 && (uint)barra != domain_.length() - 1) + { + kdDebug(23100) << "Domain nao vago" << endl; + return false; + } + else + { + vector<QString> palavras = tokenizeWordsSeparatedByDots(domain_); + Q_ASSERT(palavras.size() >= 1); // host might be localhost + + QString primeira_palavra = palavras[0]; + if(primeira_palavra == "www") + { + Q_ASSERT(palavras.size() >= 3); + kdDebug(23100) << "Domain vago" << endl; + return true; + } + else if(palavras.size() == 2) + { + kdDebug(23100) << "Domain vago" << endl; + return true; + } + else + { + kdDebug(23100) << "Domain nao vago" << endl; + return false; + } + } + } +} + +bool SearchManager::onlyCheckHeader(LinkStatus* ls) const +{ + if(search_mode_ == depth) + return current_depth_ == depth_; + + else if(search_mode_ == domain) + return !ls->local() && + ls->externalDomainDepth() == external_domain_depth_ - 1; + + else + 
return + current_depth_ == depth_ || + (!ls->local() && + ls->externalDomainDepth() == external_domain_depth_ - 1); +} + +void SearchManager::slotSearchFinished() +{} + +void SearchManager::slotLinkCheckerFinnished(LinkChecker * checker) +{ + kdDebug(23100) << "deleting linkchecker" << endl; + + Q_ASSERT(checker); + //Q_ASSERT(checker->linkStatus()->checked()); + + delete checker; + checker = 0; +} + +KHTMLPart* SearchManager::htmlPart(QString const& key_url) const +{ + if(!html_parts_.contains(key_url)) + return 0; + + return html_parts_[key_url]; +} + +void SearchManager::addHtmlPart(QString const& key_url, KHTMLPart* html_part) +{ + Q_ASSERT(!key_url.isEmpty()); + Q_ASSERT(html_part); + + // FIXME configurable + if(html_parts_.count() > 150) + removeHtmlParts(); + + html_parts_.insert(key_url, html_part); +} + +void SearchManager::removeHtmlParts() +{ + KHTMLPartMap::Iterator it; + for(it = html_parts_.begin(); it != html_parts_.end(); ++it) + { + delete it.data(); + it.data() = 0; + } + + html_parts_.clear(); +} + +void SearchManager::save(QDomElement& element) const +{ + // <url> + QDomElement child_element = element.ownerDocument().createElement("url"); + child_element.appendChild(element.ownerDocument().createTextNode(root_.absoluteUrl().prettyURL())); + element.appendChild(child_element); + + // <recursively> + bool recursively = searchMode() == domain || depth_ > 0; + child_element = element.ownerDocument().createElement("recursively"); + child_element.appendChild(element.ownerDocument().createTextNode(recursively ? "true" : "false")); + element.appendChild(child_element); + + // <depth> + child_element = element.ownerDocument().createElement("depth"); + child_element.appendChild(element.ownerDocument(). + createTextNode(searchMode() == domain ? 
QString("Unlimited") : QString::number(depth_))); + element.appendChild(child_element); + + // <check_parent_folders> + child_element = element.ownerDocument().createElement("check_parent_folders"); + child_element.appendChild(element.ownerDocument(). + createTextNode(checkParentDirs() ? "true" : "false")); + element.appendChild(child_element); + + // <check_external_links> + child_element = element.ownerDocument().createElement("check_external_links"); + child_element.appendChild(element.ownerDocument(). + createTextNode(checkExternalLinks() ? "true" : "false")); + element.appendChild(child_element); + + // <check_regular_expression> + child_element = element.ownerDocument().createElement("check_regular_expression"); + child_element.setAttribute("check", checkRegularExpressions() ? "true" : "false"); + if(checkRegularExpressions()) + child_element.appendChild(element.ownerDocument(). + createTextNode(reg_exp_.pattern())); + element.appendChild(child_element); + + child_element = element.ownerDocument().createElement("link_list"); + element.appendChild(child_element); + + for(uint i = 0; i != search_results_.size(); ++i) + { + for(uint j = 0; j != search_results_[i].size() ; ++j) + { + for(uint l = 0; l != (search_results_[i])[j].size(); ++l) + { + LinkStatus* ls = ((search_results_[i])[j])[l]; + if(ls->checked()) + ls->save(child_element); + } + } + } +} + +QString SearchManager::toXML() const +{ + QDomDocument doc; + doc.appendChild(doc.createProcessingInstruction( "xml", + "version=\"1.0\" encoding=\"UTF-8\"")); + + QDomElement root = doc.createElement("klinkstatus"); + doc.appendChild(root); + + save(root); + + return doc.toString(4); +} + +#include "searchmanager.moc" diff --git a/klinkstatus/src/engine/searchmanager.h b/klinkstatus/src/engine/searchmanager.h new file mode 100644 index 00000000..135d267a --- /dev/null +++ b/klinkstatus/src/engine/searchmanager.h @@ -0,0 +1,193 @@ +/*************************************************************************** + * 
Copyright (C) 2004 by Paulo Moura Guedes * + * moura@kdewebdev.org * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + ***************************************************************************/ + +#ifndef GESTOR_PESQUISA_H +#define GESTOR_PESQUISA_H + +#include <kurl.h> + +#include <qobject.h> +#include <qstring.h> +#include <qdatetime.h> +#include <qregexp.h> +#include <qmap.h> +class QDomElement; + +#include <vector> + +#include "linkstatus.h" +#include "linkchecker.h" +#include "../parser/node.h" +#include "../parser/url.h" + +using namespace std; + +typedef QMap<QString, KHTMLPart*> KHTMLPartMap; + +class SearchManager: public QObject +{ + Q_OBJECT + +public: + + enum SearchMode { + depth, + domain, + depth_and_domain + }; + + SearchManager(int max_simultaneous_connections = 3, int time_out = 50, + QObject *parent = 0, const char *name = 0); + ~SearchManager(); + + QString toXML() const; + void save(QDomElement& element) const; + + KHTMLPartMap const& htmlParts() const { return html_parts_; } + + KHTMLPart* htmlPart(QString const& key_url) const; + void addHtmlPart(QString const& key_url, KHTMLPart* html_part); + void removeHtmlParts(); + + void startSearch(KURL const& root); + void startSearch(KURL const& root, SearchMode const& modo); + void resume(); 
+ void cancelSearch(); + + bool hasDocumentRoot() const; + KURL const& documentRoot() const; + void setDocumentRoot(KURL const& url); + + void setSearchMode(SearchMode modo); + void setDepth(int depth); + void setExternalDomainDepth(int depth); + void setDomain(QString const& domain); + void setCheckParentDirs(bool flag); + void setCheckExternalLinks(bool flag); + void setCheckRegularExpressions(bool flag); + void setRegularExpression(QString const& reg_exp, bool case_sensitive); + void setTimeOut(int time_out); + + void cleanItems(); + void reset(); + + bool searching() const; + bool localDomain(KURL const& url, bool restrict = true) const; + //bool isLocalRestrict(KURL const& url) const; + SearchMode const& searchMode() const; + bool checkRegularExpressions() const { return check_regular_expressions_; } + bool existUrl(KURL const& url, KURL const& url_parent) const; + LinkStatus const* linkStatus(QString const& s_url) const; + int checkedLinks() const; + QTime timeElapsed() const; + bool checkParentDirs() const; + bool checkExternalLinks() const; + LinkStatus const* linkStatusRoot() const; + int maxSimultaneousConnections() const; + int timeOut() const; + + bool sendIdentification() const { return send_identification_; } + QString const& userAgent() const { return user_agent_; } + +private: + + void checkRoot(); + void checkVectorLinks(vector<LinkStatus*> const& links); // corresponde a um no de um nivel de depth + vector<LinkStatus*> children(LinkStatus* link); + void startSearch(); + void continueSearch(); + void finnish(); + void pause(); + vector<LinkStatus*> const& nodeToAnalize() const; + vector<LinkStatus*> chooseLinks(vector<LinkStatus*> const& links); + void checkLinksSimultaneously(vector<LinkStatus*> const& links); + void addLevel(); + bool checkableByDomain(KURL const& url, LinkStatus const& link_parent) const; + bool checkable(KURL const& url, LinkStatus const& link_parent) const; + int maximumCurrentConnections() const; + bool 
onlyCheckHeader(LinkStatus* ls) const; + + /* + Entende-se por domain vago um domain do tipo www.google.pt ou google.pt, pelo que, + por exemplo, imagens.google.pt, e considerado estar no mesmo domain. + pwp.netcabo.pt ou www.google.pt/imagens nao sao considerados domains vagos. + */ + bool generalDomain() const; + bool generalDomainChecked() const; // Para garantir que o procedimento generalDomain() so e chamado uma vez + +private slots: + + void slotRootChecked(const LinkStatus * link, LinkChecker * checker); + void slotLinkChecked(const LinkStatus * link, LinkChecker * checker); + void slotSearchFinished(); + void slotLinkCheckerFinnished(LinkChecker * checker); + +signals: + + void signalRootChecked(const LinkStatus * link, LinkChecker * checker); + void signalLinkChecked(const LinkStatus * link, LinkChecker * checker); + void signalSearchFinished(); + void signalSearchPaused(); + void signalAddingLevelTotalSteps(uint number_of_links); + void signalAddingLevelProgress(); + void signalLinksToCheckTotalSteps(uint links_to_check); + //void signalLinksToCheckProgress(); + +private: + + int max_simultaneous_connections_; + SearchMode search_mode_; + LinkStatus root_; + bool has_document_root_; + KURL document_root_url_; // in case of non http protocols the document root must be explicitly given + int depth_; + int current_depth_; + int external_domain_depth_; + int current_node_; + int current_index_; + int links_being_checked_; + int finished_connections_; + int maximum_current_connections_; + QRegExp reg_exp_; + QString domain_; + bool general_domain_; + bool checked_general_domain_; + int time_out_; + int current_connections_; + bool send_identification_; // user-agent + QString user_agent_; + + bool canceled_; + bool searching_; + int checked_links_; + QTime time_; + int ignored_links_; + bool check_parent_dirs_; + bool check_external_links_; + bool check_regular_expressions_; + uint number_of_level_links_; + uint number_of_links_to_check_; + vector< vector< 
vector <LinkStatus*> > > search_results_; + KHTMLPartMap html_parts_; +}; + +#include "searchmanager_impl.h" + +#endif diff --git a/klinkstatus/src/engine/searchmanager_impl.h b/klinkstatus/src/engine/searchmanager_impl.h new file mode 100644 index 00000000..eaa5e572 --- /dev/null +++ b/klinkstatus/src/engine/searchmanager_impl.h @@ -0,0 +1,158 @@ +/*************************************************************************** + * Copyright (C) 2004 by Paulo Moura Guedes * + * moura@kdewebdev.org * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + ***************************************************************************/ + + + + +inline int SearchManager::maximumCurrentConnections() const +{ + Q_ASSERT(maximum_current_connections_ != -1); + return maximum_current_connections_; +} + +inline SearchManager::SearchMode const& SearchManager::searchMode() const +{ + return search_mode_; +} + +inline int SearchManager::checkedLinks() const +{ + Q_ASSERT(checked_links_ > 0); + return checked_links_; +} + +inline QTime SearchManager::timeElapsed() const +{ + int ms = time_.elapsed(); + //kdDebug(23100) << "Time elapsed (ms): " << ms << endl; + return QTime(0, 0).addMSecs(ms); +} + +inline void SearchManager::startSearch(KURL const& root) +{ + startSearch(root, search_mode_); +} + +inline void SearchManager::setSearchMode(SearchMode modo) +{ + search_mode_ = modo; +} + +inline void SearchManager::setDepth(int depth) +{ + depth_ = depth; +} + +inline void SearchManager::setExternalDomainDepth(int depth) +{ + external_domain_depth_ = depth; +} + +inline void SearchManager::setDomain(QString const& domain) +{ + Q_ASSERT(domain.find("http://") == -1); + domain_ = domain; + general_domain_ = generalDomain(); + checked_general_domain_ = true; +} + +inline void SearchManager::setCheckParentDirs(bool flag) +{ + check_parent_dirs_ = flag; +} + +inline void SearchManager::setCheckExternalLinks(bool flag) +{ + check_external_links_ = flag; +} + +inline void SearchManager::setCheckRegularExpressions(bool flag) +{ + check_regular_expressions_ = flag; +} + +inline void SearchManager::setRegularExpression(QString const& reg_exp, bool case_sensitive) +{ + reg_exp_ = QRegExp(reg_exp, case_sensitive); +} + +inline void SearchManager::setTimeOut(int time_out) +{ + Q_ASSERT(time_out > 0); + time_out_ = time_out; +} + + + +inline bool SearchManager::checkParentDirs() const +{ + return check_parent_dirs_; +} + +inline bool SearchManager::checkExternalLinks() const +{ + return check_external_links_; +} + +inline LinkStatus const* 
SearchManager::linkStatusRoot() const +{ + return &root_; +} + +inline bool SearchManager::searching() const +{ + return searching_; +} + +inline bool SearchManager::localDomain(KURL const& url, bool restrict) const +{ + return Url::localDomain(root_.absoluteUrl(), url, restrict); +} + +inline int SearchManager::maxSimultaneousConnections() const +{ + return max_simultaneous_connections_; +} + +inline int SearchManager::timeOut() const +{ + return time_out_; +} + +inline bool SearchManager::hasDocumentRoot() const +{ + return has_document_root_; +} + +inline KURL const& SearchManager::documentRoot() const +{ + return document_root_url_; +} + +inline void SearchManager::setDocumentRoot(KURL const& url) +{ + Q_ASSERT(url.isValid()); // includes empty URLs + Q_ASSERT(!url.protocol().startsWith("http")); + + document_root_url_ = url; + has_document_root_ = true; +} + + |