From dfe289850f068f19ba4a83ab4e7e22a7e09c13c9 Mon Sep 17 00:00:00 2001 From: Timothy Pearson Date: Sat, 26 Jan 2013 13:17:21 -0600 Subject: Rename a number of libraries and executables to avoid conflicts with KDE4 --- tdehtml/html/htmltokenizer.cpp | 1798 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1798 insertions(+) create mode 100644 tdehtml/html/htmltokenizer.cpp (limited to 'tdehtml/html/htmltokenizer.cpp') diff --git a/tdehtml/html/htmltokenizer.cpp b/tdehtml/html/htmltokenizer.cpp new file mode 100644 index 000000000..83bfd4bd5 --- /dev/null +++ b/tdehtml/html/htmltokenizer.cpp @@ -0,0 +1,1798 @@ +/* + This file is part of the KDE libraries + + Copyright (C) 1997 Martin Jones (mjones@kde.org) + (C) 1997 Torben Weis (weis@kde.org) + (C) 1998 Waldo Bastian (bastian@kde.org) + (C) 1999 Lars Knoll (knoll@kde.org) + (C) 1999 Antti Koivisto (koivisto@kde.org) + (C) 2001-2003 Dirk Mueller (mueller@kde.org) + (C) 2004 Apple Computer, Inc. + (C) 2006 Germain Garand (germain@ebooksfrance.org) + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public License + along with this library; see the file COPYING.LIB. If not, write to + the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. +*/ +//---------------------------------------------------------------------------- +// +// KDE HTML Widget - Tokenizers + +//#define TOKEN_DEBUG 1 +//#define TOKEN_DEBUG 2 + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "html/htmltokenizer.h" +#include "html/html_documentimpl.h" +#include "html/htmlparser.h" +#include "html/dtd.h" + +#include "misc/loader.h" +#include "misc/htmlhashes.h" + +#include "tdehtmlview.h" +#include "tdehtml_part.h" +#include "xml/dom_docimpl.h" +#include "css/csshelper.h" +#include "ecma/kjs_proxy.h" +#include +#include +#include +#include +#include +#include +#include + +#include "kentities.c" + +using namespace tdehtml; + +static const TQChar commentStart [] = { '<','!','-','-', TQChar::null }; + +static const char scriptEnd [] = "deref(this); + + if ( buffer ) + KHTML_DELETE_QCHAR_VEC(buffer); + buffer = dest = 0; + size = 0; + + if ( scriptCode ) + KHTML_DELETE_QCHAR_VEC(scriptCode); + scriptCode = 0; + scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0; + + if (m_autoCloseTimer) { + killTimer(m_autoCloseTimer); + m_autoCloseTimer = 0; + } + + currToken.reset(); +} + +void HTMLTokenizer::begin() +{ + m_executingScript = 0; + onHold = false; + reset(); + size = 254; + buffer = KHTML_ALLOC_QCHAR_VEC( 255 ); + dest = buffer; + tag = NoTag; + pending = NonePending; + discard = NoneDiscard; + pre = false; + prePos = 0; + plaintext = false; + xmp = false; + processingInstruction = false; + script = false; + escaped = false; + style = false; + skipLF = false; + select = false; + comment = false; + server = false; + textarea = false; + title = false; + startTag = false; + tquote = NoQuote; + searchCount = 0; + Entity = NoEntity; + noMoreData = false; + brokenComments = false; + brokenServer = false; + brokenScript = false; + lineno = 0; + scriptStartLineno = 0; + tagStartLineno = 0; +} + +void HTMLTokenizer::processListing(TokenizerString list) +{ + bool old_pre = pre; + + // This function adds the listing 'list' as + // preformatted text-tokens to the token-collection + // thereby converting TABs. + if(!style) pre = true; + prePos = 0; + + while ( !list.isEmpty() ) + { + checkBuffer(3*TAB_SIZE); + + if (skipLF && ( *list != '\n' )) + { + skipLF = false; + } + + if (skipLF) + { + skipLF = false; + ++list; + } + else if (( *list == '\n' ) || ( *list == '\r' )) + { + if (discard == LFDiscard) + { + // Ignore this LF + discard = NoneDiscard; // We have discarded 1 LF + } + else + { + // Process this LF + if (pending) + addPending(); + + // we used to do it not at all and we want to have + // it fixed for textarea. So here we are + if ( textarea ) { + prePos++; + *dest++ = *list; + } else + pending = LFPending; + } + /* Check for MS-DOS CRLF sequence */ + if (*list == '\r') + { + skipLF = true; + } + ++list; + } + else if (( *list == ' ' ) || ( *list == '\t')) + { + if (pending) + addPending(); + if (*list == ' ') + pending = SpacePending; + else + pending = TabPending; + + ++list; + } + else + { + discard = NoneDiscard; + if (pending) + addPending(); + + prePos++; + *dest++ = *list; + ++list; + } + + } + + if ((pending == SpacePending) || (pending == TabPending)) + addPending(); + else + pending = NonePending; + + prePos = 0; + pre = old_pre; +} + +void HTMLTokenizer::parseSpecial(TokenizerString &src) +{ + assert( textarea || title || !Entity ); + assert( !tag ); + assert( xmp+textarea+title+style+script == 1 ); + if (script) + scriptStartLineno = lineno+src.lineCount(); + + if ( comment ) parseComment( src ); + + while ( !src.isEmpty() ) { + checkScriptBuffer(); + unsigned char ch = src->latin1(); + if ( !scriptCodeResync && !brokenComments && !textarea && !xmp && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && TQConstString( scriptCode+scriptCodeSize-3, 3 ).string() == "' ) ) { + ++src; + scriptCodeSize = scriptCodeResync-1; + scriptCodeResync = 0; + scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0; + if ( script ) + scriptHandler(); + else { + processListing(TokenizerString(scriptCode, scriptCodeSize)); + processToken(); + if ( style ) { currToken.tid = ID_STYLE + ID_CLOSE_TAG; } + else if ( textarea ) { currToken.tid = ID_TEXTAREA + ID_CLOSE_TAG; } + else if ( title ) { currToken.tid = ID_TITLE + ID_CLOSE_TAG; } + else if ( xmp ) { currToken.tid = ID_XMP + ID_CLOSE_TAG; } + processToken(); + script = style = textarea = title = xmp = false; + tquote = NoQuote; + scriptCodeSize = scriptCodeResync = 0; + } + return; + } + // possible end of tagname, lets check. + if ( !scriptCodeResync && !escaped && !src.escaped() && ( ch == '>' || ch == '/' || ch <= ' ' ) && ch && + scriptCodeSize >= searchStopperLen && + !TQConstString( scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen ).string().find( searchStopper, 0, false )) { + scriptCodeResync = scriptCodeSize-searchStopperLen+1; + tquote = NoQuote; + continue; + } + if ( scriptCodeResync && !escaped ) { + if(ch == '\"') + tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote); + else if(ch == '\'') + tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote; + else if (tquote != NoQuote && (ch == '\r' || ch == '\n')) + tquote = NoQuote; + } + escaped = ( !escaped && ch == '\\' ); + if (!scriptCodeResync && (textarea||title) && !src.escaped() && ch == '&') { + TQChar *scriptCodeDest = scriptCode+scriptCodeSize; + ++src; + parseEntity(src,scriptCodeDest,true); + scriptCodeSize = scriptCodeDest-scriptCode; + } + else { + scriptCode[ scriptCodeSize++ ] = *src; + ++src; + } + } +} + +void HTMLTokenizer::scriptHandler() +{ + TQString currentScriptSrc = scriptSrc; + scriptSrc = TQString::null; + + processListing(TokenizerString(scriptCode, scriptCodeSize)); + TQString exScript( buffer, dest-buffer ); + + processToken(); + currToken.tid = ID_SCRIPT + ID_CLOSE_TAG; + processToken(); + + // Scripts following a frameset element should not be executed or even loaded in the case of extern scripts. + bool followingFrameset = (parser->doc()->body() && parser->doc()->body()->id() == ID_FRAMESET); + bool effectiveScript = !parser->skipMode() && !followingFrameset; + bool deferredScript = false; + + if ( effectiveScript ) { + CachedScript* cs = 0; + + // forget what we just got, load from src url instead + if ( !currentScriptSrc.isEmpty() && javascript && + (cs = parser->doc()->docLoader()->requestScript(currentScriptSrc, scriptSrcCharset) )) { + cachedScript.enqueue(cs); + } + + if (cs) { + pendingQueue.push(src); + uint scriptCount = cachedScript.count(); + setSrc(TokenizerString()); + scriptCodeSize = scriptCodeResync = 0; + cs->ref(this); + if (cachedScript.count() == scriptCount) + deferredScript = true; + } + else if (currentScriptSrc.isEmpty() && view && javascript ) { + pendingQueue.push(src); + setSrc(TokenizerString()); + scriptCodeSize = scriptCodeResync = 0; + scriptExecution( exScript, TQString::null, tagStartLineno /*scriptStartLineno*/ ); + } else { + // script was filtered or disallowed + effectiveScript = false; + } + } + + script = false; + scriptCodeSize = scriptCodeResync = 0; + + if ( !effectiveScript ) + return; + + if ( !m_executingScript && cachedScript.isEmpty() ) { + src.append(pendingQueue.pop()); + } else if ( cachedScript.isEmpty() ) { + write( pendingQueue.pop(), false ); + } else if ( !deferredScript && pendingQueue.count() > 1) { + TokenizerString t = pendingQueue.pop(); + pendingQueue.top().prepend( t ); + } +} + +void HTMLTokenizer::scriptExecution( const TQString& str, const TQString& scriptURL, + int baseLine) +{ + bool oldscript = script; + m_executingScript++; + script = false; + TQString url; + if (scriptURL.isNull() && view) + url = static_cast(view->part()->document().handle())->URL().url(); + else + url = scriptURL; + + if (view) + view->part()->executeScript(url,baseLine+1,Node(),str); + m_executingScript--; + script = oldscript; +} + +void HTMLTokenizer::parseComment(TokenizerString &src) +{ + // SGML strict + bool strict = parser->doc()->inStrictMode() && parser->doc()->htmlMode() != DocumentImpl::XHtml && !script && !style; + int delimiterCount = 0; + bool canClose = false; + + checkScriptBuffer(src.length()); + while ( src.length() ) { + scriptCode[ scriptCodeSize++ ] = *src; + +#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1 + tqDebug("comment is now: *%s*", src.toString().left(16).latin1()); +#endif + + if (strict) + { + if (src->unicode() == '-') { + delimiterCount++; + if (delimiterCount == 2) { + delimiterCount = 0; + canClose = !canClose; + } + } + else + delimiterCount = 0; + } + + if ((!strict || canClose) && src->unicode() == '>') + { + bool handleBrokenComments = brokenComments && !( script || style ); + bool scriptEnd=false; + if (!strict) + { + if ( scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' && + scriptCode[scriptCodeSize-2] == '-' ) + scriptEnd=true; + } + + if (canClose || handleBrokenComments || scriptEnd ){ + ++src; + if ( !( title || script || xmp || textarea || style) ) { +#ifdef COMMENTS_IN_DOM + checkScriptBuffer(); + scriptCode[ scriptCodeSize ] = 0; + scriptCode[ scriptCodeSize + 1 ] = 0; + currToken.tid = ID_COMMENT; + processListing(DOMStringIt(scriptCode, scriptCodeSize - 2)); + processToken(); + currToken.tid = ID_COMMENT + ID_CLOSE_TAG; + processToken(); +#endif + scriptCodeSize = 0; + } + comment = false; + return; // Finished parsing comment + } + } + ++src; + } +} + +void HTMLTokenizer::parseServer(TokenizerString &src) +{ + checkScriptBuffer(src.length()); + while ( !src.isEmpty() ) { + scriptCode[ scriptCodeSize++ ] = *src; + if (src->unicode() == '>' && + scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%') { + ++src; + server = false; + scriptCodeSize = 0; + return; // Finished parsing server include + } + ++src; + } +} + +void HTMLTokenizer::parseProcessingInstruction(TokenizerString &src) +{ + char oldchar = 0; + while ( !src.isEmpty() ) + { + unsigned char chbegin = src->latin1(); + if(chbegin == '\'') { + tquote = tquote == SingleQuote ? NoQuote : SingleQuote; + } + else if(chbegin == '\"') { + tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote; + } + // Look for '?>' + // some crappy sites omit the "?" before it, so + // we look for an unquoted '>' instead. (IE compatible) + else if ( chbegin == '>' && ( !tquote || oldchar == '?' ) ) + { + // We got a '?>' sequence + processingInstruction = false; + ++src; + discard=LFDiscard; + return; // Finished parsing comment! + } + ++src; + oldchar = chbegin; + } +} + +void HTMLTokenizer::parseText(TokenizerString &src) +{ + while ( !src.isEmpty() ) + { + // do we need to enlarge the buffer? + checkBuffer(); + + // ascii is okay because we only do ascii comparisons + unsigned char chbegin = src->latin1(); + + if (skipLF && ( chbegin != '\n' )) + { + skipLF = false; + } + + if (skipLF) + { + skipLF = false; + ++src; + } + else if (( chbegin == '\n' ) || ( chbegin == '\r' )) + { + if (chbegin == '\r') + skipLF = true; + + *dest++ = '\n'; + ++src; + } + else { + *dest++ = *src; + ++src; + } + } +} + + +void HTMLTokenizer::parseEntity(TokenizerString &src, TQChar *&dest, bool start) +{ + if( start ) + { + cBufferPos = 0; + entityLen = 0; + Entity = SearchEntity; + } + + while( !src.isEmpty() ) + { + ushort cc = src->unicode(); + switch(Entity) { + case NoEntity: + return; + + break; + case SearchEntity: + if(cc == '#') { + cBuffer[cBufferPos++] = cc; + ++src; + Entity = NumericSearch; + } + else + Entity = EntityName; + + break; + + case NumericSearch: + if(cc == 'x' || cc == 'X') { + cBuffer[cBufferPos++] = cc; + ++src; + Entity = Hexadecimal; + } + else if(cc >= '0' && cc <= '9') + Entity = Decimal; + else + Entity = SearchSemicolon; + + break; + + case Hexadecimal: + { + int uc = EntityChar.unicode(); + int ll = kMin(src.length(), 8); + while(ll--) { + TQChar csrc(src->lower()); + cc = csrc.cell(); + + if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) { + break; + } + uc = uc*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10)); + cBuffer[cBufferPos++] = cc; + ++src; + } + EntityChar = TQChar(uc); + Entity = SearchSemicolon; + break; + } + case Decimal: + { + int uc = EntityChar.unicode(); + int ll = kMin(src.length(), 9-cBufferPos); + while(ll--) { + cc = src->cell(); + + if(src->row() || !(cc >= '0' && cc <= '9')) { + Entity = SearchSemicolon; + break; + } + + uc = uc * 10 + (cc - '0'); + cBuffer[cBufferPos++] = cc; + ++src; + } + EntityChar = TQChar(uc); + if(cBufferPos == 9) Entity = SearchSemicolon; + break; + } + case EntityName: + { + int ll = kMin(src.length(), 9-cBufferPos); + while(ll--) { + TQChar csrc = *src; + cc = csrc.cell(); + + if(csrc.row() || !((cc >= 'a' && cc <= 'z') || + (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) { + Entity = SearchSemicolon; + break; + } + + cBuffer[cBufferPos++] = cc; + ++src; + + // be IE compatible and interpret even unterminated entities + // outside tags. like "foo  stuff bla". + if ( tag == NoTag ) { + const entity* e = kde_findEntity(cBuffer, cBufferPos); + if ( e && e->code < 256 ) { + EntityChar = e->code; + entityLen = cBufferPos; + } + } + } + if(cBufferPos == 9) Entity = SearchSemicolon; + if(Entity == SearchSemicolon) { + if(cBufferPos > 1) { + const entity *e = kde_findEntity(cBuffer, cBufferPos); + // IE only accepts unterminated entities < 256, + // Gecko accepts them all, but only outside tags + if(e && ( tag == NoTag || e->code < 256 || *src == ';' )) { + EntityChar = e->code; + entityLen = cBufferPos; + } + } + } + break; + } + case SearchSemicolon: +#ifdef TOKEN_DEBUG + kdDebug( 6036 ) << "ENTITY " << EntityChar.unicode() << endl; +#endif + fixUpChar(EntityChar); + + if (*src == ';') + ++src; + + if ( !EntityChar.isNull() ) { + checkBuffer(); + if (entityLen > 0 && entityLen < cBufferPos) { + int rem = cBufferPos - entityLen; + src.prepend( TokenizerString(TQString::fromAscii(cBuffer+entityLen, rem)) ); + } + src.push( EntityChar ); + } else { +#ifdef TOKEN_DEBUG + kdDebug( 6036 ) << "unknown entity!" << endl; +#endif + checkBuffer(11); + // ignore the sequence, add it to the buffer as plaintext + *dest++ = '&'; + for(unsigned int i = 0; i < cBufferPos; i++) + dest[i] = cBuffer[i]; + dest += cBufferPos; + if (pre) + prePos += cBufferPos+1; + } + + Entity = NoEntity; + EntityChar = TQChar::null; + return; + }; + } +} + +void HTMLTokenizer::parseTag(TokenizerString &src) +{ + assert(!Entity ); + checkScriptBuffer( src.length() ); + + while ( !src.isEmpty() ) + { + checkBuffer(); +#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1 + uint l = 0; + while(l < src.length() && (src.toString()[l]).latin1() != '>') + l++; + tqDebug("src is now: *%s*, tquote: %d", + src.toString().left(l).latin1(), tquote); +#endif + switch(tag) { + case NoTag: + return; + case TagName: + { +#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1 + tqDebug("TagName"); +#endif + if (searchCount > 0) + { + if (*src == commentStart[searchCount]) + { + searchCount++; + if (searchCount == 4) + { +#ifdef TOKEN_DEBUG + kdDebug( 6036 ) << "Found comment" << endl; +#endif + // Found ' + searchCount = 1; // Look for ' " << name << " id = " << currToken.tid << endl; + if (currToken.flat) + kdDebug( 6036 ) << "Token is FLAT!" << endl; + if(!text.isNull()) + kdDebug( 6036 ) << "text: \"" << text << "\"" << endl; + unsigned long l = currToken.attrs ? currToken.attrs->length() : 0; + if(l) { + kdDebug( 6036 ) << "Attributes: " << l << endl; + for (unsigned long i = 0; i < l; ++i) { + NodeImpl::Id tid = currToken.attrs->idAt(i); + DOMString value = currToken.attrs->valueAt(i); + kdDebug( 6036 ) << " " << tid << " " << parser->doc()->getDocument()->getName(NodeImpl::AttributeId, tid).string() + << "=\"" << value.string() << "\"" << endl; + } + } + kdDebug( 6036 ) << endl; +#endif + + // In some cases, parseToken() can cause javascript code to be executed + // (for example, when setting an attribute that causes an event handler + // to be created). So we need to protect against re-entrancy into the parser + m_executingScript++; + + // pass the token over to the parser, the parser DOES NOT delete the token + parser->parseToken(&currToken); + + m_executingScript--; + + if ( currToken.flat && currToken.tid != ID_TEXT && !parser->noSpaces() ) + discard = NoneDiscard; + + currToken.reset(); + if (jsProxy) + jsProxy->setEventHandlerLineno(1); +} + + +HTMLTokenizer::~HTMLTokenizer() +{ + reset(); + delete parser; +} + + +void HTMLTokenizer::enlargeBuffer(int len) +{ + int newsize = kMax(size*2, size+len); + int oldoffs = (dest - buffer); + + buffer = KHTML_REALLOC_QCHAR_VEC(buffer, newsize); + dest = buffer + oldoffs; + size = newsize; +} + +void HTMLTokenizer::enlargeScriptBuffer(int len) +{ + int newsize = kMax(scriptCodeMaxSize*2, scriptCodeMaxSize+len); + scriptCode = KHTML_REALLOC_QCHAR_VEC(scriptCode, newsize); + scriptCodeMaxSize = newsize; +} + +void HTMLTokenizer::notifyFinished(CachedObject* /*finishedObj*/) +{ + assert(!cachedScript.isEmpty()); + bool done = false; + while (!done && cachedScript.head()->isLoaded()) { + + kdDebug( 6036 ) << "Finished loading an external script" << endl; + + CachedScript* cs = cachedScript.dequeue(); + DOMString scriptSource = cs->script(); +#ifdef TOKEN_DEBUG + kdDebug( 6036 ) << "External script is:" << endl << scriptSource.string() << endl; +#endif + setSrc(TokenizerString()); + + // make sure we forget about the script before we execute the new one + // infinite recursion might happen otherwise + TQString cachedScriptUrl( cs->url().string() ); + cs->deref(this); + + scriptExecution( scriptSource.string(), cachedScriptUrl ); + + done = cachedScript.isEmpty(); + + // 'script' is true when we are called synchronously from + // scriptHandler(). In that case scriptHandler() will take care + // of 'scriptOutput'. + if ( !script ) { + while (pendingQueue.count() > 1) { + TokenizerString t = pendingQueue.pop(); + pendingQueue.top().prepend( t ); + } + if (done) { + write(pendingQueue.pop(), false); + } + // we might be deleted at this point, do not + // access any members. + } + } +} + +bool HTMLTokenizer::isWaitingForScripts() const +{ + return cachedScript.count(); +} + +bool HTMLTokenizer::isExecutingScript() const +{ + return (m_executingScript > 0); +} + +void HTMLTokenizer::setSrc(const TokenizerString& source) +{ + lineno += src.lineCount(); + src = source; + src.resetLineCount(); +} + +void HTMLTokenizer::setOnHold(bool _onHold) +{ + if (onHold == _onHold) return; + onHold = _onHold; +} + -- cgit v1.2.1