/* This file is part of Akregator. Copyright (C) 2004 Teemu Rytilahti This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. As a special exception, permission is given to link this program with any edition of TQt, and distribute the resulting executable, without including the source code for TQt in the source distribution. */ #include #include #include #include #include #include #include "feeddetector.h" using namespace RSS; FeedDetectorEntryList FeedDetector::extractFromLinkTags(const TQString& s) { //reduce all sequences of spaces, newlines etc. to one space: TQString str = s.simplifyWhiteSpace(); // extracts tags TQRegExp reLinkTag("<[\\s]?LINK[^>]*REL[\\s]?=[\\s]?\\\"[\\s]?(ALTERNATE|SERVICE\\.FEED)[\\s]?\\\"[^>]*>", false); // extracts the URL (href="url") TQRegExp reHref("HREF[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false); // extracts type attribute TQRegExp reType("TYPE[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false); // extracts the title (title="title") TQRegExp reTitle("TITLE[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false); int pos = 0; int matchpos = 0; // get all tags TQStringList linkTags; //int strlength = str.length(); while ( matchpos != -1 ) { matchpos = reLinkTag.search(str, pos); if (matchpos != -1) { linkTags.append( str.mid(matchpos, reLinkTag.matchedLength()) ); pos = matchpos + reLinkTag.matchedLength(); } } FeedDetectorEntryList list; for ( TQStringList::Iterator it = linkTags.begin(); it != linkTags.end(); ++it ) { TQString type; int pos = reType.search(*it, 0); if (pos != -1) type = reType.cap(1).lower(); // we accept only type attributes indicating a feed if ( type != "application/rss+xml" && type != "application/rdf+xml" && type != "application/atom+xml" && type != "text/xml" ) continue; TQString title; pos = reTitle.search(*it, 0); if (pos != -1) title = reTitle.cap(1); title = KCharsets::resolveEntities(title); TQString url; pos = reHref.search(*it, 0); if (pos != -1) url = reHref.cap(1); url = KCharsets::resolveEntities(url); // if feed has no title, use the url as preliminary title (until feed is parsed) if ( title.isEmpty() ) title = url; if ( !url.isEmpty() ) list.append(FeedDetectorEntry(url, title) ); } return list; } TQStringList FeedDetector::extractBruteForce(const TQString& s) { TQString str = s.simplifyWhiteSpace(); TQRegExp reAhrefTag("<[\\s]?A[^>]?HREF=[\\s]?\\\"[^\\\"]*\\\"[^>]*>", false); // extracts the URL (href="url") TQRegExp reHref("HREF[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false); TQRegExp rssrdfxml(".*(RSS|RDF|XML)", false); int pos = 0; int matchpos = 0; // get all tags and capture url TQStringList list; //int strlength = str.length(); while ( matchpos != -1 ) { matchpos = reAhrefTag.search(str, pos); if ( matchpos != -1 ) { TQString ahref = str.mid(matchpos, reAhrefTag.matchedLength()); int hrefpos = reHref.search(ahref, 0); if ( hrefpos != -1 ) { TQString url = reHref.cap(1); url = KCharsets::resolveEntities(url); if ( rssrdfxml.exactMatch(url) ) list.append(url); } pos = matchpos + reAhrefTag.matchedLength(); } } return list; } TQString FeedDetector::fixRelativeURL(const TQString &s, const KURL &baseurl) { TQString s2=s; KURL u; if (KURL::isRelativeURL(s2)) { if (s2.startsWith("//")) { s2=s2.prepend(baseurl.protocol()+":"); u=s2; } else if (s2.startsWith("/")) { KURL b2(baseurl); b2.setPath(TQString()); // delete path and query, so that only protocol://host remains b2.setQuery(TQString()); u = KURL(b2, s2.remove(0,1)); // remove leading "/" } else { u = KURL(baseurl, s2); } } else u=s2; u.cleanPath(); //kdDebug() << "AKREGATOR_PLUGIN_FIXURL: " << "url=" << s << " baseurl=" << baseurl.url() << " fixed=" << u.url() << //endl; return u.url(); }