Diffstat (limited to 'mimelib/token.cpp')
-rw-r--r--  mimelib/token.cpp  617
1 files changed, 617 insertions, 0 deletions
diff --git a/mimelib/token.cpp b/mimelib/token.cpp
new file mode 100644
index 000000000..654d764f7
--- /dev/null
+++ b/mimelib/token.cpp
@@ -0,0 +1,617 @@
+//=============================================================================
+// File: token.cpp
+// Contents: Definitions for DwTokenizer, DwRfc822Tokenizer
+// Maintainer: Doug Sauder <dwsauder@fwb.gulf.net>
+// WWW: http://www.fwb.gulf.net/~dwsauder/mimepp.html
+//
+// Copyright (c) 1996, 1997 Douglas W. Sauder
+// All rights reserved.
+//
+// IN NO EVENT SHALL DOUGLAS W. SAUDER BE LIABLE TO ANY PARTY FOR DIRECT,
+// INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
+// THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF DOUGLAS W. SAUDER
+// HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// DOUGLAS W. SAUDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT
+// NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS"
+// BASIS, AND DOUGLAS W. SAUDER HAS NO OBLIGATION TO PROVIDE MAINTENANCE,
+// SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+//
+//=============================================================================
+
+#define DW_IMPLEMENTATION
+
+#include <mimelib/config.h>
+#include <mimelib/debug.h>
+#include <assert.h>
+#include <ctype.h>
+#include <mimelib/string.h>
+#include <mimelib/token.h>
+
+
+std::ostream* DwTokenizer::mDebugOut = 0;
+
+
+DwTokenizer::DwTokenizer(const DwString& aStr)
+ : mString(aStr)
+{
+ mTokenStart = 0;
+ mTokenLength = 0;
+ mNextStart = 0;
+ mTkType = eTkError;
+}
+
+
+DwTokenizer::DwTokenizer(const char* aCStr)
+ : mString(aCStr)
+{
+ mTokenStart = 0;
+ mTokenLength = 0;
+ mNextStart = 0;
+ mTkType = eTkError;
+}
+
+
+DwTokenizer::~DwTokenizer()
+{
+}
+
+
+void DwTokenizer::StripDelimiters()
+{
+ if (mTokenLength < 2) return;
+ // const ref -- avoids copy on write when using operator[]
+ const DwString& token = mToken;
+ switch (mTkType) {
+ case eTkQuotedString:
+ if (token[0] == '"') {
+ mToken = mToken.substr(1);
+ ++mTokenStart;
+ --mTokenLength;
+ }
+ if (mTokenLength > 0 && token[mTokenLength-1] == '"') {
+ mToken = mToken.substr(0, mTokenLength-1);
+ --mTokenLength;
+ }
+ break;
+ case eTkDomainLiteral:
+ if (token[0] == '[') {
+ mToken = mToken.substr(1);
+ ++mTokenStart;
+ --mTokenLength;
+ }
+ if (mTokenLength > 0 && token[mTokenLength-1] == ']') {
+ mToken = mToken.substr(0, mTokenLength-1);
+ --mTokenLength;
+ }
+ break;
+ case eTkComment:
+ if (token[0] == '(') {
+ mToken = mToken.substr(1);
+ ++mTokenStart;
+ --mTokenLength;
+ }
+ if (mTokenLength > 0 && token[mTokenLength-1] == ')') {
+ mToken = mToken.substr(0, mTokenLength-1);
+ --mTokenLength;
+ }
+ break;
+ }
+}
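+
+// Example (illustrative only): if the current token is the comment
+// "(a comment)" with mTkType == eTkComment, StripDelimiters() trims the
+// outer '(' and ')' so that mToken becomes "a comment", mTokenStart is
+// advanced by one, and mTokenLength shrinks by two.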
+
+
+void DwTokenizer::ParseQuotedString()
+{
+ size_t pos = mTokenStart;
+ while (1) {
+ ++pos;
+ if (pos >= mString.length()) {
+ // Ran out of string
+ mTokenLength = 0;
+ mToken = "";
+ mNextStart = pos;
+ mTkType = eTkError;
+ break;
+ }
+ else if (mString[pos] == '\\') {
+ // Quoted character
+ ++pos;
+ if (pos >= mString.length()) {
+ // Ran out of string
+ mTokenLength = 0;
+ mToken = "";
+ mNextStart = pos;
+ mTkType = eTkError;
+ break;
+ }
+ }
+ else if (mString[pos] == '"') {
+ // End of quoted string
+ ++pos;
+ mTokenLength = pos - mTokenStart;
+ mToken = mString.substr(mTokenStart, mTokenLength);
+ mNextStart = pos;
+ break;
+ }
+ }
+}
+
+
+void DwTokenizer::ParseComment()
+{
+ size_t pos = mTokenStart;
+ int level = 1;
+ while (1) {
+ ++pos;
+ if (pos >= mString.length()) {
+ // Ran out of string
+ mTokenLength = 0;
+ mToken = "";
+ mNextStart = pos;
+ mTkType = eTkError;
+ break;
+ }
+ else if (mString[pos] == '\\') {
+ // Quoted character
+ ++pos;
+ if (pos >= mString.length()) {
+ // Ran out of string
+ mTokenLength = 0;
+ mToken = "";
+ mNextStart = pos;
+ mTkType = eTkError;
+ break;
+ }
+ }
+ else if (mString[pos] == ')') {
+ --level;
+ if (level == 0) {
+ // End of comment
+ ++pos;
+ mTokenLength = pos - mTokenStart;
+ mToken = mString.substr(mTokenStart, mTokenLength);
+ mNextStart = pos;
+ break;
+ }
+ }
+ else if (mString[pos] == '(') {
+ ++level;
+ }
+ }
+}
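+
+// Note (illustrative): RFC 822 comments may nest, so "(a (nested) one)" is
+// returned as a single eTkComment token; the level counter above tracks the
+// nesting depth, and the token ends only when it drops back to zero.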
+
+
+void DwTokenizer::ParseDomainLiteral()
+{
+ size_t pos = mTokenStart;
+ while (1) {
+ ++pos;
+ if (pos >= mString.length()) {
+ // Ran out of string
+ mTokenLength = 0;
+ mToken = "";
+ mNextStart = pos;
+ mTkType = eTkError;
+ break;
+ }
+ else if (mString[pos] == '\\') {
+ // Quoted character
+ ++pos;
+ if (pos >= mString.length()) {
+ // Ran out of string
+ mTokenLength = 0;
+ mToken = "";
+ mNextStart = pos;
+ mTkType = eTkError;
+ break;
+ }
+ }
+ else if (mString[pos] == ']') {
+ // End of domain literal
+ ++pos;
+ mTokenLength = pos - mTokenStart;
+ mToken = mString.substr(mTokenStart, mTokenLength);
+ mNextStart = pos;
+ break;
+ }
+ }
+}
+
+
+void DwTokenizer::PrintToken(std::ostream* aOut)
+{
+ if (!aOut) return;
+ const char* type = 0;
+ switch (mTkType) {
+ case eTkError:
+ type = "error ";
+ break;
+ case eTkNull:
+ type = "null ";
+ break;
+ case eTkSpecial:
+ type = "special ";
+ break;
+ case eTkAtom:
+ type = "atom ";
+ break;
+ case eTkComment:
+ type = "comment ";
+ break;
+ case eTkQuotedString:
+ type = "quoted string ";
+ break;
+ case eTkDomainLiteral:
+ type = "domain literal ";
+ break;
+ case eTkTspecial:
+ type = "tspecial ";
+ break;
+ case eTkToken:
+ type = "token ";
+ break;
+ default:
+ type = "unknown ";
+ break;
+ }
+ *aOut << type << mToken << '\n';
+}
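+
+// Debugging sketch (illustrative only; assumes mDebugOut, the static member
+// defined near the top of this file, is accessible from client code):
+// pointing it at a stream makes ParseToken() echo every recognized token
+// through PrintToken().
+//
+//   #include <iostream>
+//   DwTokenizer::mDebugOut = &std::cerr;
+//   DwRfc822Tokenizer tkzr("jdoe@example.com (Home)");
+//   while (tkzr.Type() != eTkNull && tkzr.Type() != eTkError)
+//       ++tkzr;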
+
+
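+// True for the RFC 822 special characters, space, and the control
+// characters matched by the range check below; DwRfc822Tokenizer::ParseAtom()
+// uses it to find where an atom ends.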
+static inline bool isspecialorspaceorcntrl( int c )
+{
+ switch ( c ) {
+ case '(':
+ case ')':
+ case '<':
+ case '>':
+ case '@':
+ case ',':
+ case ';':
+ case ':':
+ case '\\':
+ case '"':
+ case '.':
+ case '[':
+ case ']':
+ // isspace()
+ case ' ':
+ return true;
+ //case '\r': included in iscntrl()
+ //case '\f': included in iscntrl()
+ //case '\t': included in iscntrl()
+ //case '\n': included in iscntrl()
+ //case '\v': included in iscntrl()
+ // iscntrl()
+ default:
+ return ( (c >= 0 && c <= 15) || (c >= 17 && c <= 31) );
+ }
+}
+
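+// True for any character that is neither a space nor a control character;
+// the ParseToken() routines use it to skip leading white space (including
+// CR and LF from folding) before the next token.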
+static inline bool isnotspaceorcntrl( int c )
+{
+ switch ( c ) {
+ // isspace()
+ case ' ':
+ //case '\r': included in iscntrl()
+ //case '\f': included in iscntrl()
+ //case '\t': included in iscntrl()
+ //case '\n': included in iscntrl()
+ //case '\v': included in iscntrl()
+ // iscntrl()
+ return false;
+ default:
+ return !( (c >= 0 && c <= 15) || (c >= 17 && c <= 31) );
+ }
+}
+
+DwRfc822Tokenizer::DwRfc822Tokenizer(const DwString& aStr)
+ : DwTokenizer(aStr)
+{
+ ParseToken();
+}
+
+
+DwRfc822Tokenizer::DwRfc822Tokenizer(const char* aCStr)
+ : DwTokenizer(aCStr)
+{
+ ParseToken();
+}
+
+
+DwRfc822Tokenizer::~DwRfc822Tokenizer()
+{
+}
+
+
+int DwRfc822Tokenizer::Restart()
+{
+ mNextStart = 0;
+ ParseToken();
+ return mTkType;
+}
+
+
+int DwRfc822Tokenizer::operator ++ ()
+{
+ ParseToken();
+ return mTkType;
+}
+
+
+void DwRfc822Tokenizer::ParseToken()
+{
+ // Assume the field body has already been extracted. That is, we don't
+ // have to watch for the end of the field body or folding. We just
+ // treat any CRs or LFs as white space.
+ mTokenStart = mNextStart;
+ mTokenLength = 0;
+ mTkType = eTkNull;
+ // Skip leading space. Also, since control chars are not permitted
+ // in atoms, skip these, too.
+ while (1) {
+ if (mTokenStart >= mString.length()) {
+ return;
+ }
+ if (isnotspaceorcntrl(mString[mTokenStart]))
+ break;
+ ++mTokenStart;
+ }
+ char ch = mString[mTokenStart];
+ switch (ch) {
+ // Quoted string
+ case '"':
+ mTkType = eTkQuotedString;
+ ParseQuotedString();
+ break;
+ // Comment
+ case '(':
+ mTkType = eTkComment;
+ ParseComment();
+ break;
+ // Domain literal
+ case '[':
+ mTkType = eTkDomainLiteral;
+ ParseDomainLiteral();
+ break;
+ // Special
+ case ')':
+ case '<':
+ case '>':
+ case '@':
+ case ',':
+ case ';':
+ case ':':
+ case '\\':
+ case '.':
+ case ']':
+ mTkType = eTkSpecial;
+ mTokenLength = 1;
+ mToken = mString.substr(mTokenStart, 1);
+ mNextStart = mTokenStart + 1;
+ break;
+ default:
+ mTkType = eTkAtom;
+ ParseAtom();
+ break;
+ }
+ if (mDebugOut) PrintToken(mDebugOut);
+}
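+
+// Usage sketch (illustrative only): the constructor parses the first token,
+// and operator++ advances to the next one, so a field body is typically
+// walked like this:
+//
+//   DwRfc822Tokenizer tkzr("John Doe <jdoe@example.com> (Engineering)");
+//   while (tkzr.Type() != eTkNull && tkzr.Type() != eTkError) {
+//       // Yields eTkAtom for "John", "Doe", "jdoe", "example" and "com",
+//       // eTkSpecial for '<', '@', '.' and '>', and eTkComment for
+//       // "(Engineering)".
+//       ++tkzr;
+//   }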
+
+
+void DwRfc822Tokenizer::ParseAtom()
+{
+ size_t pos = mTokenStart;
+ while (1) {
+ ++pos;
+ char ch = (pos < mString.length()) ? mString[pos] : (char) 0;
+ if (pos >= mString.length()
+ || isspecialorspaceorcntrl(ch)) {
+
+ mTokenLength = pos - mTokenStart;
+ mToken = mString.substr(mTokenStart, mTokenLength);
+ mNextStart = pos;
+ break;
+ }
+ }
+}
+
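+// True for the RFC 1521 (MIME) tspecials, space, and the control characters
+// matched by the range check below; DwRfc1521Tokenizer::ParseAtom() uses it
+// to find where a token ends.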
+static inline bool istspecialorspaceorcntrl( int c )
+{
+ switch ( c ) {
+ case '(':
+ case ')':
+ case '<':
+ case '>':
+ case '@':
+ case ',':
+ case ';':
+ case ':':
+ case '\\':
+ case '"':
+ case '/':
+ case '[':
+ case ']':
+ case '?':
+ case '=':
+ // isspace()
+ case ' ':
+ return true;
+ //case '\r': included in iscntrl()
+ //case '\f': included in iscntrl()
+ //case '\t': included in iscntrl()
+ //case '\n': included in iscntrl()
+ //case '\v': included in iscntrl()
+ // iscntrl()
+ default:
+ return ( ( c >= 0 && c <= 15) || (c >= 17 && c <= 31) );
+ }
+ }
+
+DwRfc1521Tokenizer::DwRfc1521Tokenizer(const DwString& aStr)
+ : DwTokenizer(aStr)
+{
+ ParseToken();
+}
+
+
+DwRfc1521Tokenizer::DwRfc1521Tokenizer(const char* aCStr)
+ : DwTokenizer(aCStr)
+{
+ ParseToken();
+}
+
+
+DwRfc1521Tokenizer::~DwRfc1521Tokenizer()
+{
+}
+
+
+int DwRfc1521Tokenizer::Restart()
+{
+ mNextStart = 0;
+ ParseToken();
+ return mTkType;
+}
+
+
+int DwRfc1521Tokenizer::operator ++ ()
+{
+ ParseToken();
+ return mTkType;
+}
+
+
+void DwRfc1521Tokenizer::ParseToken()
+{
+ // Assume the field body has already been extracted. That is, we don't
+ // have to watch for the end of the field body or folding. We just
+ // treat any CRs or LFs as white space.
+ mTokenStart = mNextStart;
+ mTokenLength = 0;
+ mTkType = eTkNull;
+ // Skip leading space. Also, since control chars are not permitted
+ // in atoms, skip these, too.
+ while (1) {
+ if (mTokenStart >= mString.length()) {
+ return;
+ }
+ if (isnotspaceorcntrl(mString[mTokenStart]))
+ break;
+ ++mTokenStart;
+ }
+ char ch = mString[mTokenStart];
+ switch (ch) {
+ // Quoted string
+ case '"':
+ mTkType = eTkQuotedString;
+ ParseQuotedString();
+ break;
+ // Comment
+ case '(':
+ mTkType = eTkComment;
+ ParseComment();
+ break;
+ // Domain literal
+ case '[':
+ mTkType = eTkDomainLiteral;
+ ParseDomainLiteral();
+ break;
+ // Special
+ case ')':
+ case '<':
+ case '>':
+ case '@':
+ case ',':
+ case ';':
+ case ':':
+ case '\\':
+ case '/':
+ case ']':
+ case '?':
+ case '=':
+ mTkType = eTkTspecial;
+ mTokenLength = 1;
+ mToken = mString.substr(mTokenStart, 1);
+ mNextStart = mTokenStart + 1;
+ break;
+ default:
+ mTkType = eTkToken;
+ ParseAtom();
+ break;
+ }
+ if (mDebugOut) PrintToken(mDebugOut);
+}
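+
+// Usage sketch (illustrative only): on a Content-Type field body such as
+// "text/plain; charset=us-ascii" this tokenizer yields eTkToken for "text",
+// "plain", "charset" and "us-ascii", and eTkTspecial for '/', ';' and '='.
+//
+//   DwRfc1521Tokenizer tkzr("text/plain; charset=us-ascii");
+//   while (tkzr.Type() != eTkNull && tkzr.Type() != eTkError)
+//       ++tkzr;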
+
+
+void DwRfc1521Tokenizer::ParseAtom()
+{
+ size_t pos = mTokenStart;
+ while (1) {
+ ++pos;
+ char ch = (pos < mString.length()) ? mString[pos] : (char) 0;
+ if (pos >= mString.length()
+ || istspecialorspaceorcntrl(ch)) {
+
+ mTokenLength = pos - mTokenStart;
+ mToken = mString.substr(mTokenStart, mTokenLength);
+ mNextStart = pos;
+ break;
+ }
+ }
+}
+
+
+DwTokenString::DwTokenString(const DwString& aStr)
+ : mString(aStr)
+{
+ mTokensStart = 0;
+ mTokensLength = 0;
+}
+
+
+DwTokenString::~DwTokenString()
+{
+}
+
+
+void DwTokenString::SetFirst(const DwTokenizer& aTkzr)
+{
+ switch (aTkzr.Type()) {
+ case eTkError:
+ case eTkNull:
+ mTokensStart = aTkzr.mTokenStart;
+ mTokensLength = 0;
+ break;
+ case eTkComment:
+ case eTkDomainLiteral:
+ case eTkQuotedString:
+ case eTkSpecial:
+ case eTkAtom:
+ case eTkTspecial:
+ case eTkToken:
+ mTokensStart = aTkzr.mTokenStart;
+ mTokensLength = aTkzr.mTokenLength;
+ break;
+ }
+ mTokens = mString.substr(mTokensStart, mTokensLength);
+}
+
+
+void DwTokenString::SetLast(const DwTokenizer& aTkzr)
+{
+ assert(aTkzr.mTokenStart >= mTokensStart);
+ if (aTkzr.mTokenStart < mTokensStart) return;
+ mTokensLength = aTkzr.mTokenStart + aTkzr.mTokenLength - mTokensStart;
+ mTokens = mString.substr(mTokensStart, mTokensLength);
+}
+
+
+void DwTokenString::ExtendTo(const DwTokenizer& aTkzr)
+{
+ assert(aTkzr.mTokenStart >= mTokensStart);
+ if (aTkzr.mTokenStart < mTokensStart) return;
+ mTokensLength = aTkzr.mTokenStart - mTokensStart;
+ mTokens = mString.substr(mTokensStart, mTokensLength);
+}
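+
+// Usage sketch for DwTokenString (illustrative only): SetFirst() anchors the
+// span at the tokenizer's current token, ExtendTo() grows it up to (but not
+// including) the tokenizer's current token, and SetLast() grows it through
+// that token. Collecting the display-name part of an address:
+//
+//   DwString str("Jane Doe <jdoe@example.com>");
+//   DwRfc822Tokenizer tkzr(str);
+//   DwTokenString tokStr(str);
+//   tokStr.SetFirst(tkzr);           // span starts at the atom "Jane"
+//   while (tkzr.Type() == eTkAtom)   // advance until the '<' special
+//       ++tkzr;
+//   tokStr.ExtendTo(tkzr);           // span now ends just before '<'
+//   // mTokens now holds "Jane Doe " (reading it back assumes an accessor
+//   // such as Tokens() declared in token.h; that accessor is an assumption
+//   // here, not shown in this file).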