summaryrefslogtreecommitdiffstats
path: root/khtml/html/htmltokenizer.h
diff options
context:
space:
mode:
Diffstat (limited to 'khtml/html/htmltokenizer.h')
-rw-r--r--khtml/html/htmltokenizer.h358
1 files changed, 358 insertions, 0 deletions
diff --git a/khtml/html/htmltokenizer.h b/khtml/html/htmltokenizer.h
new file mode 100644
index 000000000..5e4186d58
--- /dev/null
+++ b/khtml/html/htmltokenizer.h
@@ -0,0 +1,358 @@
+/*
+ This file is part of the KDE libraries
+
+ Copyright (C) 1997 Martin Jones (mjones@kde.org)
+ (C) 1997 Torben Weis (weis@kde.org)
+ (C) 1998 Waldo Bastian (bastian@kde.org)
+ (C) 2001 Dirk Mueller (mueller@kde.org)
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public License
+ along with this library; see the file COPYING.LIB. If not, write to
+ the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ Boston, MA 02110-1301, USA.
+*/
+//----------------------------------------------------------------------------
+//
+// KDE HTML Widget -- Tokenizers
+
+#ifndef HTMLTOKENIZER_H
+#define HTMLTOKENIZER_H
+
+#include <qstring.h>
+#include <qobject.h>
+#include <qptrqueue.h>
+
+#include "misc/loader_client.h"
+#include "misc/htmltags.h"
+#include "misc/stringit.h"
+#include "xml/dom_stringimpl.h"
+#include "xml/xml_tokenizer.h"
+#include "xml/dom_elementimpl.h"
+#include "xml/dom_docimpl.h"
+
+class KCharsets;
+class KHTMLView;
+
+namespace DOM {
+ class DocumentImpl;
+ class DocumentFragmentImpl;
+}
+
+namespace khtml {
+ class CachedScript;
+ class KHTMLParser;
+
+ /**
+  * @internal
+  * A single HTML tag, stored as a numerical id (tid) together with the
+  * list of its attributes (attrs). The same class doubles as a text
+  * token: in that case tid is 0 and text holds the text.
+  */
+ class Token
+ {
+ public:
+     // An empty token: no id, no attribute map, no text, not flat.
+     Token() : attrs(0), text(0), tid(0), flat(false) {}
+
+     // Gives up the references held on the attribute map and on the text.
+     ~Token()
+     {
+         if (attrs)
+             attrs->deref();
+         if (text)
+             text->deref();
+     }
+
+     // Records one attribute on this token. The attribute id is the value
+     // stored in *buffer when that is non-zero; otherwise it is looked up
+     // from attrName through doc->getId(), unless attrName is empty or "/".
+     // Nothing is stored without both an id and a value, and an id that
+     // already has a value in the map keeps it (the first occurrence wins).
+     void addAttribute(DocumentImpl* doc, QChar* buffer, const QString& attrName, const DOMString& v)
+     {
+         NodeImpl::Id attrId = 0;
+         const ushort presetId = buffer->unicode();
+         if (presetId)
+             attrId = presetId;
+         else if (attrName.isEmpty() || attrName == "/")
+             return;
+         else
+             attrId = doc->getId(NodeImpl::AttributeId, DOMString(attrName).implementation(), false, true);
+
+         DOMStringImpl* const attrValue = v.implementation();
+         if (!attrValue || !attrId)
+             return;
+
+         // Create the attribute map on first use and take a reference on it.
+         if (!attrs) {
+             attrs = new DOM::NamedAttrMapImpl(0);
+             attrs->ref();
+         }
+         if (!attrs->getValue(attrId))
+             attrs->setValue(attrId, attrValue);
+     }
+
+     // Puts the token back into its freshly constructed state, releasing
+     // the attribute map and the text if there are any.
+     void reset()
+     {
+         if (attrs)
+             attrs->deref();
+         attrs = 0;
+         tid = 0;
+         if (text)
+             text->deref();
+         text = 0;
+         flat = false;
+     }
+
+     DOM::NamedAttrMapImpl* attrs; // attribute map, created lazily by addAttribute(); 0 if none
+     DOMStringImpl* text;          // text of a text token; 0 if none
+     ushort tid;                   // numerical tag id; 0 for text
+     bool flat;
+ };
+
+// Number of space characters that one tab counts as.
+#define TAB_SIZE 8
+
+//-----------------------------------------------------------------------------
+
+class HTMLTokenizer : public Tokenizer, public CachedObjectClient // the HTML tokenizer; also a CachedObjectClient (see notifyFinished)
+{
+ friend class KHTMLParser;
+public:
+ HTMLTokenizer(DOM::DocumentImpl *, KHTMLView * = 0);
+ HTMLTokenizer(DOM::DocumentImpl *, DOM::DocumentFragmentImpl *frag);
+ virtual ~HTMLTokenizer();
+
+ void begin();
+ void write( const khtml::TokenizerString &str, bool appendData );
+ void end();
+ void finish();
+ void timerEvent( QTimerEvent *e );
+ virtual void setOnHold(bool _onHold);
+ void abort() { m_abort = true; } // only raises the m_abort flag
+ virtual void setAutoClose(bool b=true);
+ virtual bool isWaitingForScripts() const;
+ virtual bool isExecutingScript() const;
+
+protected:
+ void reset();
+ void addPending();
+ void processToken();
+ void processListing(khtml::TokenizerString list);
+
+ void parseComment(khtml::TokenizerString &str);
+ void parseServer(khtml::TokenizerString &str);
+ void parseText(khtml::TokenizerString &str);
+ void parseListing(khtml::TokenizerString &str);
+ void parseSpecial(khtml::TokenizerString &str);
+ void parseTag(khtml::TokenizerString &str);
+ void parseEntity(khtml::TokenizerString &str, QChar *&dest, bool start = false);
+ void parseProcessingInstruction(khtml::TokenizerString &str);
+ void scriptHandler();
+ void scriptExecution(const QString& script, const QString& scriptURL = QString::null, int baseLine = 0);
+ void setSrc(const TokenizerString& source);
+
+ // Check that the buffer still has room for len more characters after dest;
+ // if it does not, enlarge it through enlargeBuffer(len).
+ inline void checkBuffer(int len = 10)
+ {
+ if ( (dest - buffer) > size-len )
+ enlargeBuffer(len);
+ }
+ inline void checkScriptBuffer(int len = 10) // same check for scriptCode: grows it once scriptCodeSize + len reaches scriptCodeMaxSize
+ {
+ if ( scriptCodeSize + len >= scriptCodeMaxSize )
+ enlargeScriptBuffer(len);
+ }
+
+ void enlargeBuffer(int len);
+ void enlargeScriptBuffer(int len);
+
+ // from CachedObjectClient
+ void notifyFinished(khtml::CachedObject *finishedObj);
+
+protected:
+ // Internal buffers
+ ///////////////////
+ QChar *buffer; // start of the buffer
+ QChar *dest; // position inside buffer; checkBuffer() compares dest - buffer against size
+
+ khtml::Token currToken;
+
+ // the size of buffer
+ int size;
+
+ // Tokenizer flags
+ //////////////////
+ // are we in quotes within an HTML tag
+ enum
+ {
+ NoQuote = 0,
+ SingleQuote,
+ DoubleQuote
+ } tquote;
+
+ enum
+ {
+ NonePending = 0,
+ SpacePending,
+ LFPending,
+ TabPending
+ } pending;
+
+ enum
+ {
+ NoneDiscard = 0,
+ SpaceDiscard, // Discard spaces after '=' within tags
+ LFDiscard, // Discard line breaks immediately after start-tags
+ AllDiscard // discard all spaces, LF's etc until next non white char
+ } discard;
+
+ // Discard the LF part of CRLF sequence
+ bool skipLF;
+
+ // Flag to say that we have the '<' but not the character following it.
+ bool startTag;
+
+ // State to say that we are currently parsing a tag, i.e. we are in the
+ // middle of <tag...
+ enum {
+ NoTag = 0,
+ TagName,
+ SearchAttribute,
+ AttributeName,
+ SearchEqual,
+ SearchValue,
+ QuotedValue,
+ Value,
+ SearchEnd
+ } tag;
+
+ // Are we in a &... character entity description?
+ enum {
+ NoEntity = 0,
+ SearchEntity,
+ NumericSearch,
+ Hexadecimal,
+ Decimal,
+ EntityName,
+ SearchSemicolon
+ } Entity;
+
+ // are we in a <script> ... </script> block
+ bool script;
+
+ QChar EntityChar;
+
+ // Are we in a <pre> ... </pre> block
+ bool pre;
+
+ // if 'pre == true' we track in which column we are
+ int prePos;
+
+ // Are we in a <style> ... </style> block
+ bool style;
+
+ // Are we in a <select> ... </select> block
+ bool select;
+
+ // Are we in a <xmp> ... </xmp> block
+ bool xmp;
+
+ // Are we in a <title> ... </title> block
+ bool title;
+
+ // Are we in plain text mode?
+ bool plaintext;
+
+ // XML processing instructions. Ignored at the moment
+ bool processingInstruction;
+
+ // Are we in a <!-- comment --> block
+ bool comment;
+
+ // Are we in a <textarea> ... </textarea> block
+ bool textarea;
+
+ // was the previous character escaped?
+ bool escaped;
+
+ // are we in a server includes statement?
+ bool server;
+
+ bool brokenServer;
+
+ bool brokenScript;
+
+ // name of an unknown attribute
+ QString attrName;
+
+ // Used to store the code of a scripting sequence
+ QChar *scriptCode;
+ // Size of the script sequence stored in scriptCode
+ int scriptCodeSize;
+ // Maximal size that can be stored in scriptCode
+ int scriptCodeMaxSize;
+ // resync point of script code size
+ int scriptCodeResync;
+
+ // Stores characters if we are scanning for a string like "</script>"
+ QChar searchBuffer[ 10 ];
+ // Counts where we are in the string we are scanning for
+ int searchCount;
+ // The string we are searching for
+ const QChar *searchFor;
+ // the stopper string
+ const char* searchStopper;
+ // the length of the stopper string
+ int searchStopperLen;
+ // if no more data is coming, just parse what we have (including ext scripts that
+ // may be still downloading) and finish
+ bool noMoreData;
+ // URL to get source code of script from
+ QString scriptSrc;
+ QString scriptSrcCharset;
+ bool javascript;
+ // the HTML code we will parse after the external script we are waiting for has loaded
+ TokenizerQueue pendingQueue;
+ // non-zero while we are executing a script while parsing a document. This causes the parsing of
+ // the output of the script to be postponed until after the script has finished executing
+ int m_executingScript; // an int, not a bool — presumably a nesting count; TODO confirm in the .cpp
+ QPtrQueue<khtml::CachedScript> cachedScript; // presumably the external scripts still being waited for — confirm
+ // you can pause the tokenizer if you need to display a dialog or something
+ bool onHold;
+ // you can ask the tokenizer to abort the current write() call, e.g. to redirect somewhere else
+ bool m_abort;
+
+ // if we found one broken comment, there are most likely others as well
+ // store a flag to get rid of the O(n^2) behavior in such a case.
+ bool brokenComments;
+ // current line number
+ int lineno;
+ // line number at which the current <script> started
+ int scriptStartLineno;
+ int tagStartLineno; // presumably the line number at which the current tag started — TODO confirm
+ // autoClose mode is used when the tokenizer was created by a script document.writing
+ // on an already loaded document
+ int m_autoCloseTimer; // presumably a timer id (cf. timerEvent()) — TODO confirm
+
+#define CBUFLEN 1024
+ char cBuffer[CBUFLEN+2]; // fixed-size char buffer; cBufferPos presumably indexes into it — confirm
+ unsigned int cBufferPos;
+ unsigned int entityLen;
+
+ khtml::TokenizerString src;
+
+ KCharsets *charsets;
+ KHTMLParser *parser;
+
+ KHTMLView *view;
+};
+
+} // namespace khtml
+
+#endif // HTMLTOKENIZER_H
+