/* This file is part of the KDE libraries Copyright (C) 1997 Martin Jones (mjones@kde.org) (C) 1997 Torben Weis (weis@kde.org) (C) 1998 Waldo Bastian (bastian@kde.org) (C) 2001 Dirk Mueller (mueller@kde.org) This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ //---------------------------------------------------------------------------- // // KDE HTML Widget -- Tokenizers #ifndef HTMLTOKENIZER_H #define HTMLTOKENIZER_H #include #include #include #include "misc/loader_client.h" #include "misc/htmltags.h" #include "misc/stringit.h" #include "xml/dom_stringimpl.h" #include "xml/xml_tokenizer.h" #include "xml/dom_elementimpl.h" #include "xml/dom_docimpl.h" class KCharsets; class KHTMLView; namespace DOM { class DocumentImpl; class DocumentFragmentImpl; } namespace khtml { class CachedScript; class KHTMLParser; /** * @internal * represents one HTML tag. Consists of a numerical id, and the list * of attributes. Can also represent text. In this case the id = 0 and * text contains the text. */ class Token { public: Token() { tid = 0; attrs = 0; text = 0; flat = false; //tqDebug("new token, creating %08lx", attrs); } ~Token() { if(attrs) attrs->deref(); if(text) text->deref(); } void addAttribute(DocumentImpl* doc, TQChar* buffer, const TQString& attrName, const DOMString& v) { DOMStringImpl *value = 0; NodeImpl::Id tid = 0; if(buffer->unicode()) { tid = buffer->unicode(); value = v.implementation(); } else if ( !attrName.isEmpty() && attrName != "/" ) { tid = doc->getId(NodeImpl::AttributeId, DOMString(attrName).implementation(), false, true); value = v.implementation(); } if (value && tid) { if(!attrs) { attrs = new DOM::NamedAttrMapImpl(0); attrs->ref(); } if (!attrs->getValue(tid)) attrs->setValue(tid,value); } } void reset() { if(attrs) { attrs->deref(); attrs = 0; } tid = 0; if(text) { text->deref(); text = 0; } flat = false; } DOM::NamedAttrMapImpl* attrs; DOMStringImpl* text; ushort tid; bool flat; }; // The count of spaces used for each tab. #define TAB_SIZE 8 //----------------------------------------------------------------------------- class HTMLTokenizer : public Tokenizer, public CachedObjectClient { friend class KHTMLParser; public: HTMLTokenizer(DOM::DocumentImpl *, KHTMLView * = 0); HTMLTokenizer(DOM::DocumentImpl *, DOM::DocumentFragmentImpl *frag); virtual ~HTMLTokenizer(); void begin(); void write( const khtml::TokenizerString &str, bool appendData ); void end(); void finish(); void timerEvent( TQTimerEvent *e ); virtual void setOnHold(bool _onHold); void abort() { m_abort = true; } virtual void setAutoClose(bool b=true); virtual bool isWaitingForScripts() const; virtual bool isExecutingScript() const; protected: void reset(); void addPending(); void processToken(); void processListing(khtml::TokenizerString list); void parseComment(khtml::TokenizerString &str); void parseServer(khtml::TokenizerString &str); void parseText(khtml::TokenizerString &str); void parseListing(khtml::TokenizerString &str); void parseSpecial(khtml::TokenizerString &str); void parseTag(khtml::TokenizerString &str); void parseEntity(khtml::TokenizerString &str, TQChar *&dest, bool start = false); void parseProcessingInstruction(khtml::TokenizerString &str); void scriptHandler(); void scriptExecution(const TQString& script, const TQString& scriptURL = TQString::null, int baseLine = 0); void setSrc(const TokenizerString& source); // check if we have enough space in the buffer. // if not enlarge it inline void checkBuffer(int len = 10) { if ( (dest - buffer) > size-len ) enlargeBuffer(len); } inline void checkScriptBuffer(int len = 10) { if ( scriptCodeSize + len >= scriptCodeMaxSize ) enlargeScriptBuffer(len); } void enlargeBuffer(int len); void enlargeScriptBuffer(int len); // from CachedObjectClient void notifyFinished(khtml::CachedObject *finishedObj); protected: // Internal buffers /////////////////// TQChar *buffer; TQChar *dest; khtml::Token currToken; // the size of buffer int size; // Tokenizer flags ////////////////// // are we in quotes within a html tag enum { NoQuote = 0, SingleQuote, DoubleQuote } tquote; enum { NonePending = 0, SpacePending, LFPending, TabPending } pending; enum { NoneDiscard = 0, SpaceDiscard, // Discard spaces after '=' within tags LFDiscard, // Discard line breaks immediately after start-tags AllDiscard // discard all spaces, LF's etc until next non white char } discard; // Discard the LF part of CRLF sequence bool skipLF; // Flag to say that we have the '<' but not the character following it. bool startTag; // Flag to say, we are just parsing a tag, meaning, we are in the middle // of ... block bool script; TQChar EntityChar; // Are we in a
 ... 
block bool pre; // if 'pre == true' we track in which column we are int prePos; // Are we in a block bool style; // Are we in a block bool select; // Are we in a ... block bool xmp; // Are we in a ... block bool title; // Are we in plain textmode ? bool plaintext; // XML processing instructions. Ignored at the moment bool processingInstruction; // Area we in a block bool comment; // Are we in a block bool textarea; // was the previous character escaped ? bool escaped; // are we in a server includes statement? bool server; bool brokenServer; bool brokenScript; // name of an unknown attribute TQString attrName; // Used to store the code of a srcipting sequence TQChar *scriptCode; // Size of the script sequenze stored in scriptCode int scriptCodeSize; // Maximal size that can be stored in scriptCode int scriptCodeMaxSize; // resync point of script code size int scriptCodeResync; // Stores characters if we are scanning for a string like "" TQChar searchBuffer[ 10 ]; // Counts where we are in the string we are scanning for int searchCount; // The string we are searching for const TQChar *searchFor; // the stopper string const char* searchStopper; // the stopper len int searchStopperLen; // if no more data is coming, just parse what we have (including ext scripts that // may be still downloading) and finish bool noMoreData; // URL to get source code of script from TQString scriptSrc; TQString scriptSrcCharset; bool javascript; // the HTML code we will parse after the external script we are waiting for has loaded TokenizerQueue pendingQueue; // true if we are executing a script while parsing a document. This causes the parsing of // the output of the script to be postponed until after the script has finished executing int m_executingScript; TQPtrQueue cachedScript; // you can pause the tokenizer if you need to display a dialog or something bool onHold; // you can ask the tokenizer to abort the current write() call, e.g. to redirect somewhere else bool m_abort; // if we found one broken comment, there are most likely others as well // store a flag to get rid of the O(n^2) behavior in such a case. bool brokenComments; // current line number int lineno; // line number at which the current