summaryrefslogtreecommitdiffstats
path: root/debian/uncrustify-trinity/uncrustify-trinity-0.74.0/src/tokenize.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'debian/uncrustify-trinity/uncrustify-trinity-0.74.0/src/tokenize.cpp')
-rw-r--r--debian/uncrustify-trinity/uncrustify-trinity-0.74.0/src/tokenize.cpp2863
1 files changed, 0 insertions, 2863 deletions
diff --git a/debian/uncrustify-trinity/uncrustify-trinity-0.74.0/src/tokenize.cpp b/debian/uncrustify-trinity/uncrustify-trinity-0.74.0/src/tokenize.cpp
deleted file mode 100644
index f412b6f1..00000000
--- a/debian/uncrustify-trinity/uncrustify-trinity-0.74.0/src/tokenize.cpp
+++ /dev/null
@@ -1,2863 +0,0 @@
-/**
- * @file tokenize.cpp
- * This file breaks up the text stream into tokens or chunks.
- *
- * Each routine needs to set pc.len and pc.type.
- *
- * @author Ben Gardner
- * @license GPL v2+
- */
-
-#include "tokenize.h"
-
-#include "keywords.h"
-#include "prototypes.h"
-#include "punctuators.h"
-#include "unc_ctype.h"
-
-#include <regex>
-#include <stack>
-
-
-#define LE_COUNT(x) cpd.le_counts[static_cast<size_t>(LE_ ## x)]
-
-constexpr static auto LCURRENT = LTOK;
-
-using namespace std;
-using namespace uncrustify;
-
-
-struct tok_info
-{
- tok_info()
- : last_ch(0)
- , idx(0)
- , row(1)
- , col(1)
- {
- }
-
- size_t last_ch;
- size_t idx;
- size_t row;
- size_t col;
-};
-
-
-struct tok_ctx
-{
- tok_ctx(const deque<int> &d)
- : data(d)
- {
- }
-
-
- //! save before trying to parse something that may fail
- void save()
- {
- save(s);
- }
-
-
- void save(tok_info &info)
- {
- info = c;
- }
-
-
- //! restore previous saved state
- void restore()
- {
- restore(s);
- }
-
-
- void restore(const tok_info &info)
- {
- c = info;
- }
-
-
- bool more()
- {
- return(c.idx < data.size());
- }
-
-
- size_t peek()
- {
- return(more() ? data[c.idx] : 0);
- }
-
-
- size_t peek(size_t idx)
- {
- idx += c.idx;
- return((idx < data.size()) ? data[idx] : 0);
- }
-
-
- size_t get()
- {
- if (more())
- {
- size_t ch = data[c.idx++];
-
- switch (ch)
- {
- case '\t':
- log_rule_B("input_tab_size");
- c.col = calc_next_tab_column(c.col, options::input_tab_size());
- break;
-
- case '\n':
-
- if (c.last_ch != '\r')
- {
- c.row++;
- c.col = 1;
- }
- break;
-
- case '\r':
- c.row++;
- c.col = 1;
- break;
-
- default:
- c.col++;
- break;
- }
- c.last_ch = ch;
- return(ch);
- }
- return(0);
- }
-
-
- bool expect(size_t ch)
- {
- if (peek() == ch)
- {
- get();
- return(true);
- }
- return(false);
- }
-
-
- const deque<int> &data;
- tok_info c; //! current
- tok_info s; //! saved
-};
-
-
-/**
- * Count the number of characters in a quoted string.
- * The next bit of text starts with a quote char " or ' or <.
- * Count the number of characters until the matching character.
- *
- * @param pc The structure to update, str is an input.
- *
- * @return Whether a string was parsed
- */
-static bool parse_string(tok_ctx &ctx, chunk_t &pc, size_t quote_idx, bool allow_escape);
-
-
-/**
- * Literal string, ends with single "
- * Two "" don't end the string.
- *
- * @param pc The structure to update, str is an input.
- *
- * @return Whether a string was parsed
- */
-static bool parse_cs_string(tok_ctx &ctx, chunk_t &pc);
-
-
-/**
- * VALA verbatim string, ends with three quotes (""")
- *
- * @param pc The structure to update, str is an input.
- */
-static void parse_verbatim_string(tok_ctx &ctx, chunk_t &pc);
-
-
-static bool tag_compare(const deque<int> &d, size_t a_idx, size_t b_idx, size_t len);
-
-
-/**
- * Parses a C++0x 'R' string. R"( xxx )" R"tag( )tag" u8R"(x)" uR"(x)"
- * Newlines may be in the string.
- *
- * @param pc structure to update, str is an input.
- */
-static bool parse_cr_string(tok_ctx &ctx, chunk_t &pc, size_t q_idx);
-
-
-/**
- * Count the number of whitespace characters.
- *
- * @param pc The structure to update, str is an input.
- *
- * @return Whether whitespace was parsed
- */
-static bool parse_whitespace(tok_ctx &ctx, chunk_t &pc);
-
-
-/**
- * Called when we hit a backslash.
- * If there is nothing but whitespace until the newline, then this is a
- * backslash newline
- *
- * @param pc structure to update, str is an input
- */
-static bool parse_bs_newline(tok_ctx &ctx, chunk_t &pc);
-
-
-/**
- * Parses any number of tab or space chars followed by a newline.
- * Does not change pc.len if a newline isn't found.
- * This is not the same as parse_whitespace() because it only consumes until
- * a single newline is encountered.
- */
-static bool parse_newline(tok_ctx &ctx);
-
-
-/**
- * PAWN #define is different than C/C++.
- * #define PATTERN REPLACEMENT_TEXT
- * The PATTERN may not contain a space or '[' or ']'.
- * A generic whitespace check should be good enough.
- * Do not change the pattern.
- *
- * @param pc structure to update, str is an input
- */
-static void parse_pawn_pattern(tok_ctx &ctx, chunk_t &pc, c_token_t tt);
-
-
-static bool parse_ignored(tok_ctx &ctx, chunk_t &pc);
-
-
-/**
- * Skips the next bit of whatever and returns the type of block.
- *
- * pc.str is the input text.
- * pc.len in the output length.
- * pc.type is the output type
- * pc.column is output column
- *
- * @param pc The structure to update, str is an input.
- * @param prev_pc The previous structure
- *
- * @return true/false - whether anything was parsed
- */
-static bool parse_next(tok_ctx &ctx, chunk_t &pc, const chunk_t *prev_pc);
-
-
-/**
- * Parses all legal D string constants.
- *
- * Quoted strings:
- * r"Wysiwyg" # WYSIWYG string
- * x"hexstring" # Hexadecimal array
- * `Wysiwyg` # WYSIWYG string
- * 'char' # single character
- * "reg_string" # regular string
- *
- * Non-quoted strings:
- * \x12 # 1-byte hex constant
- * \u1234 # 2-byte hex constant
- * \U12345678 # 4-byte hex constant
- * \123 # octal constant
- * \&amp; # named entity
- * \n # single character
- *
- * @param pc The structure to update, str is an input.
- *
- * @return Whether a string was parsed
- */
-static bool d_parse_string(tok_ctx &ctx, chunk_t &pc);
-
-
-/**
- * Figure of the length of the comment at text.
- * The next bit of text starts with a '/', so it might be a comment.
- * There are three types of comments:
- * - C comments that start with '/ *' and end with '* /'
- * - C++ comments that start with //
- * - D nestable comments '/+' '+/'
- *
- * @param pc The structure to update, str is an input.
- *
- * @return Whether a comment was parsed
- */
-static bool parse_comment(tok_ctx &ctx, chunk_t &pc);
-
-
-/**
- * Figure of the length of the code placeholder at text, if present.
- * This is only for Xcode which sometimes inserts temporary code placeholder chunks, which in plaintext <#look like this#>.
- *
- * @param pc The structure to update, str is an input.
- *
- * @return Whether a placeholder was parsed.
- */
-static bool parse_code_placeholder(tok_ctx &ctx, chunk_t &pc);
-
-
-/**
- * Parse any attached suffix, which may be a user-defined literal suffix.
- * If for a string, explicitly exclude common format and scan specifiers, ie,
- * PRIx32 and SCNx64.
- */
-static void parse_suffix(tok_ctx &ctx, chunk_t &pc, bool forstring);
-
-
-//! check if a symbol holds a boolean value
-static bool is_bin(int ch);
-static bool is_bin_(int ch);
-
-
-//! check if a symbol holds a octal value
-static bool is_oct(int ch);
-static bool is_oct_(int ch);
-
-
-//! check if a symbol holds a decimal value;
-static bool is_dec(int ch);
-static bool is_dec_(int ch);
-
-
-//! check if a symbol holds a hexadecimal value
-static bool is_hex(int ch);
-static bool is_hex_(int ch);
-
-
-/**
- * Count the number of characters in the number.
- * The next bit of text starts with a number (0-9 or '.'), so it is a number.
- * Count the number of characters in the number.
- *
- * This should cover all number formats for all languages.
- * Note that this is not a strict parser. It will happily parse numbers in
- * an invalid format.
- *
- * For example, only D allows underscores in the numbers, but they are
- * allowed in all formats.
- *
- * @param[in,out] pc The structure to update, str is an input.
- *
- * @return Whether a number was parsed
- */
-static bool parse_number(tok_ctx &ctx, chunk_t &pc);
-
-
-static bool d_parse_string(tok_ctx &ctx, chunk_t &pc)
-{
- size_t ch = ctx.peek();
-
- if ( ch == '"'
- || ch == '\'')
- {
- return(parse_string(ctx, pc, 0, true));
- }
-
- if (ch == '`')
- {
- return(parse_string(ctx, pc, 0, false));
- }
-
- if ( ( ch == 'r'
- || ch == 'x')
- && ctx.peek(1) == '"')
- {
- return(parse_string(ctx, pc, 1, false));
- }
-
- if (ch != '\\')
- {
- return(false);
- }
- ctx.save();
- int cnt;
-
- pc.str.clear();
-
- while (ctx.peek() == '\\')
- {
- pc.str.append(ctx.get());
-
- // Check for end of file
- switch (ctx.peek())
- {
- case 'x': // \x HexDigit HexDigit
- cnt = 3;
-
- while (cnt--)
- {
- pc.str.append(ctx.get());
- }
- break;
-
- case 'u': // \u HexDigit (x4)
- cnt = 5;
-
- while (cnt--)
- {
- pc.str.append(ctx.get());
- }
- break;
-
- case 'U': // \U HexDigit (x8)
- cnt = 9;
-
- while (cnt--)
- {
- pc.str.append(ctx.get());
- }
- break;
-
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- // handle up to 3 octal digits
- pc.str.append(ctx.get());
- ch = ctx.peek();
-
- if ( (ch >= '0')
- && (ch <= '7'))
- {
- pc.str.append(ctx.get());
- ch = ctx.peek();
-
- if ( (ch >= '0')
- && (ch <= '7'))
- {
- pc.str.append(ctx.get());
- }
- }
- break;
-
- case '&':
- // \& NamedCharacterEntity ;
- pc.str.append(ctx.get());
-
- while (unc_isalpha(ctx.peek()))
- {
- pc.str.append(ctx.get());
- }
-
- if (ctx.peek() == ';')
- {
- pc.str.append(ctx.get());
- }
- break;
-
- default:
- // Everything else is a single character
- pc.str.append(ctx.get());
- break;
- } // switch
- }
-
- if (pc.str.size() < 1)
- {
- ctx.restore();
- return(false);
- }
- set_chunk_type(&pc, CT_STRING);
- return(true);
-} // d_parse_string
-
-
-#if 0
-
-
-//! A string-in-string search. Like strstr() with a haystack length.
-static const char *str_search(const char *needle, const char *haystack, int haystack_len)
-{
- int needle_len = strlen(needle);
-
- while (haystack_len-- >= needle_len)
- {
- if (memcmp(needle, haystack, needle_len) == 0)
- {
- return(haystack);
- }
- haystack++;
- }
- return(NULL);
-}
-#endif
-
-
-static bool parse_comment(tok_ctx &ctx, chunk_t &pc)
-{
- bool is_d = language_is_set(LANG_D);
- bool is_cs = language_is_set(LANG_CS);
- size_t d_level = 0;
-
- // does this start with '/ /' or '/ *' or '/ +' (d)
- if ( (ctx.peek() != '/')
- || ( (ctx.peek(1) != '*')
- && (ctx.peek(1) != '/')
- && ( (ctx.peek(1) != '+')
- || !is_d)))
- {
- return(false);
- }
- ctx.save();
-
- // account for opening two chars
- pc.str = ctx.get(); // opening '/'
- size_t ch = ctx.get();
-
- pc.str.append(ch); // second char
-
- if (ch == '/')
- {
- set_chunk_type(&pc, CT_COMMENT_CPP);
-
- while (true)
- {
- int bs_cnt = 0;
-
- while (ctx.more())
- {
- ch = ctx.peek();
-
- if ( (ch == '\r')
- || (ch == '\n'))
- {
- break;
- }
-
- if ( (ch == '\\')
- && !is_cs) // backslashes aren't special in comments in C#
- {
- bs_cnt++;
- }
- else
- {
- bs_cnt = 0;
- }
- pc.str.append(ctx.get());
- }
-
- /*
- * If we hit an odd number of backslashes right before the newline,
- * then we keep going.
- */
- if ( ((bs_cnt & 1) == 0)
- || !ctx.more())
- {
- break;
- }
-
- if (ctx.peek() == '\r')
- {
- pc.str.append(ctx.get());
- }
-
- if (ctx.peek() == '\n')
- {
- pc.str.append(ctx.get());
- }
- pc.nl_count++;
- cpd.did_newline = true;
- }
- }
- else if (!ctx.more())
- {
- // unexpected end of file
- ctx.restore();
- return(false);
- }
- else if (ch == '+')
- {
- set_chunk_type(&pc, CT_COMMENT);
- d_level++;
-
- while ( d_level > 0
- && ctx.more())
- {
- if ( (ctx.peek() == '+')
- && (ctx.peek(1) == '/'))
- {
- pc.str.append(ctx.get()); // store the '+'
- pc.str.append(ctx.get()); // store the '/'
- d_level--;
- continue;
- }
-
- if ( (ctx.peek() == '/')
- && (ctx.peek(1) == '+'))
- {
- pc.str.append(ctx.get()); // store the '/'
- pc.str.append(ctx.get()); // store the '+'
- d_level++;
- continue;
- }
- ch = ctx.get();
- pc.str.append(ch);
-
- if ( (ch == '\n')
- || (ch == '\r'))
- {
- set_chunk_type(&pc, CT_COMMENT_MULTI);
- pc.nl_count++;
-
- if (ch == '\r')
- {
- if (ctx.peek() == '\n')
- {
- ++LE_COUNT(CRLF);
- pc.str.append(ctx.get()); // store the '\n'
- }
- else
- {
- ++LE_COUNT(CR);
- }
- }
- else
- {
- ++LE_COUNT(LF);
- }
- }
- }
- }
- else // must be '/ *'
- {
- set_chunk_type(&pc, CT_COMMENT);
-
- while (ctx.more())
- {
- if ( (ctx.peek() == '*')
- && (ctx.peek(1) == '/'))
- {
- pc.str.append(ctx.get()); // store the '*'
- pc.str.append(ctx.get()); // store the '/'
-
- tok_info ss;
- ctx.save(ss);
- size_t oldsize = pc.str.size();
-
- // If there is another C comment right after this one, combine them
- while ( (ctx.peek() == ' ')
- || (ctx.peek() == '\t'))
- {
- pc.str.append(ctx.get());
- }
-
- if ( (ctx.peek() != '/')
- || (ctx.peek(1) != '*'))
- {
- // undo the attempt to join
- ctx.restore(ss);
- pc.str.resize(oldsize);
- break;
- }
- }
- ch = ctx.get();
- pc.str.append(ch);
-
- if ( (ch == '\n')
- || (ch == '\r'))
- {
- set_chunk_type(&pc, CT_COMMENT_MULTI);
- pc.nl_count++;
-
- if (ch == '\r')
- {
- if (ctx.peek() == '\n')
- {
- ++LE_COUNT(CRLF);
- pc.str.append(ctx.get()); // store the '\n'
- }
- else
- {
- ++LE_COUNT(CR);
- }
- }
- else
- {
- ++LE_COUNT(LF);
- }
- }
- }
- }
-
- if (cpd.unc_off)
- {
- bool found_enable_marker = (find_enable_processing_comment_marker(pc.str) >= 0);
-
- if (found_enable_marker)
- {
- const auto &ontext = options::enable_processing_cmt();
-
- LOG_FMT(LBCTRL, "%s(%d): Found '%s' on line %zu\n",
- __func__, __LINE__, ontext.c_str(), pc.orig_line);
- cpd.unc_off = false;
- }
- }
- else
- {
- auto position_disable_processing_cmt = find_disable_processing_comment_marker(pc.str);
- bool found_disable_marker = (position_disable_processing_cmt >= 0);
-
- if (found_disable_marker)
- {
- /**
- * the user may wish to disable processing part of a multiline comment,
- * in which case we'll handle at a late time. Check to see if processing
- * is re-enabled elsewhere in this comment
- */
- auto position_enable_processing_cmt = find_enable_processing_comment_marker(pc.str);
-
- if (position_enable_processing_cmt < position_disable_processing_cmt)
- {
- const auto &offtext = options::disable_processing_cmt();
-
- LOG_FMT(LBCTRL, "%s(%d): Found '%s' on line %zu\n",
- __func__, __LINE__, offtext.c_str(), pc.orig_line);
- cpd.unc_off = true;
- // Issue #842
- cpd.unc_off_used = true;
- }
- }
- }
- return(true);
-} // parse_comment
-
-
-static bool parse_code_placeholder(tok_ctx &ctx, chunk_t &pc)
-{
- if ( (ctx.peek() != '<')
- || (ctx.peek(1) != '#'))
- {
- return(false);
- }
- ctx.save();
-
- // account for opening two chars '<#'
- pc.str = ctx.get();
- pc.str.append(ctx.get());
-
- // grab everything until '#>', fail if not found.
- size_t last1 = 0;
-
- while (ctx.more())
- {
- size_t last2 = last1;
- last1 = ctx.get();
- pc.str.append(last1);
-
- if ( (last2 == '#')
- && (last1 == '>'))
- {
- set_chunk_type(&pc, CT_WORD);
- return(true);
- }
- }
- ctx.restore();
- return(false);
-}
-
-
-static void parse_suffix(tok_ctx &ctx, chunk_t &pc, bool forstring = false)
-{
- if (CharTable::IsKw1(ctx.peek()))
- {
- size_t slen = 0;
- size_t oldsize = pc.str.size();
-
- // don't add the suffix if we see L" or L' or S"
- size_t p1 = ctx.peek();
- size_t p2 = ctx.peek(1);
-
- if ( forstring
- && ( ( (p1 == 'L')
- && ( (p2 == '"')
- || (p2 == '\'')))
- || ( (p1 == 'S')
- && (p2 == '"'))))
- {
- return;
- }
- tok_info ss;
- ctx.save(ss);
-
- while ( ctx.more()
- && CharTable::IsKw2(ctx.peek()))
- {
- slen++;
- pc.str.append(ctx.get());
- }
-
- if ( forstring
- && slen >= 4
- && ( pc.str.startswith("PRI", oldsize)
- || pc.str.startswith("SCN", oldsize)))
- {
- ctx.restore(ss);
- pc.str.resize(oldsize);
- }
- }
-}
-
-
-static bool is_bin(int ch)
-{
- return( (ch == '0')
- || (ch == '1'));
-}
-
-
-static bool is_bin_(int ch)
-{
- return( is_bin(ch)
- || ch == '_'
- || ch == '\'');
-}
-
-
-static bool is_oct(int ch)
-{
- return( (ch >= '0')
- && (ch <= '7'));
-}
-
-
-static bool is_oct_(int ch)
-{
- return( is_oct(ch)
- || ch == '_'
- || ch == '\'');
-}
-
-
-static bool is_dec(int ch)
-{
- return( (ch >= '0')
- && (ch <= '9'));
-}
-
-
-static bool is_dec_(int ch)
-{
- // number separators: JAVA: "_", C++14: "'"
- return( is_dec(ch)
- || (ch == '_')
- || (ch == '\''));
-}
-
-
-static bool is_hex(int ch)
-{
- return( ( (ch >= '0')
- && (ch <= '9'))
- || ( (ch >= 'a')
- && (ch <= 'f'))
- || ( (ch >= 'A')
- && (ch <= 'F')));
-}
-
-
-static bool is_hex_(int ch)
-{
- return( is_hex(ch)
- || ch == '_'
- || ch == '\'');
-}
-
-
-static bool parse_number(tok_ctx &ctx, chunk_t &pc)
-{
- /*
- * A number must start with a digit or a dot, followed by a digit
- * (signs handled elsewhere)
- */
- if ( !is_dec(ctx.peek())
- && ( (ctx.peek() != '.')
- || !is_dec(ctx.peek(1))))
- {
- return(false);
- }
- bool is_float = (ctx.peek() == '.');
-
- if ( is_float
- && (ctx.peek(1) == '.')) // make sure it isn't '..'
- {
- return(false);
- }
- /*
- * Check for Hex, Octal, or Binary
- * Note that only D, C++14 and Pawn support binary
- * Fixes the issue # 1591
- * In c# the numbers starting with 0 are not treated as octal numbers.
- */
- bool did_hex = false;
-
- if ( ctx.peek() == '0'
- && !language_is_set(LANG_CS))
- {
- size_t ch;
- chunk_t pc_temp;
-
- pc.str.append(ctx.get()); // store the '0'
- pc_temp.str.append('0');
-
- // MS constant might have an "h" at the end. Look for it
- ctx.save();
-
- while ( ctx.more()
- && CharTable::IsKw2(ctx.peek()))
- {
- ch = ctx.get();
- pc_temp.str.append(ch);
- }
- ch = pc_temp.str[pc_temp.len() - 1];
- ctx.restore();
- LOG_FMT(LGUY, "%s(%d): pc_temp:%s\n", __func__, __LINE__, pc_temp.text());
-
- if (ch == 'h') // TODO can we combine this in analyze_character
- {
- // we have an MS hexadecimal number with "h" at the end
- LOG_FMT(LGUY, "%s(%d): MS hexadecimal number\n", __func__, __LINE__);
- did_hex = true;
-
- do
- {
- pc.str.append(ctx.get()); // store the rest
- } while (is_hex_(ctx.peek()));
-
- pc.str.append(ctx.get()); // store the h
- LOG_FMT(LGUY, "%s(%d): pc:%s\n", __func__, __LINE__, pc.text());
- }
- else
- {
- switch (unc_toupper(ctx.peek()))
- {
- case 'X': // hex
- did_hex = true;
-
- do
- {
- pc.str.append(ctx.get()); // store the 'x' and then the rest
- } while (is_hex_(ctx.peek()));
-
- break;
-
- case 'B': // binary
-
- do
- {
- pc.str.append(ctx.get()); // store the 'b' and then the rest
- } while (is_bin_(ctx.peek()));
-
- break;
-
- case '0': // octal or decimal
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
-
- do
- {
- pc.str.append(ctx.get());
- } while (is_oct_(ctx.peek()));
-
- break;
-
- default:
- // either just 0 or 0.1 or 0UL, etc
- break;
- } // switch
- }
- }
- else
- {
- // Regular int or float
- while (is_dec_(ctx.peek()))
- {
- pc.str.append(ctx.get());
- }
- }
-
- // Check if we stopped on a decimal point & make sure it isn't '..'
- if ( (ctx.peek() == '.')
- && (ctx.peek(1) != '.'))
- {
- // Issue #1265, 5.clamp()
- tok_info ss;
- ctx.save(ss);
-
- while ( ctx.more()
- && CharTable::IsKw2(ctx.peek(1)))
- {
- // skip characters to check for paren open
- ctx.get();
- }
-
- if (ctx.peek(1) == '(')
- {
- ctx.restore(ss);
- set_chunk_type(&pc, CT_NUMBER);
- return(true);
- }
- else
- {
- ctx.restore(ss);
- }
- pc.str.append(ctx.get());
- is_float = true;
-
- if (did_hex)
- {
- while (is_hex_(ctx.peek()))
- {
- pc.str.append(ctx.get());
- }
- }
- else
- {
- while (is_dec_(ctx.peek()))
- {
- pc.str.append(ctx.get());
- }
- }
- }
- /*
- * Check exponent
- * Valid exponents per language (not that it matters):
- * C/C++/D/Java: eEpP
- * C#/Pawn: eE
- */
- size_t tmp = unc_toupper(ctx.peek());
-
- if ( (tmp == 'E')
- || (tmp == 'P'))
- {
- is_float = true;
- pc.str.append(ctx.get());
-
- if ( (ctx.peek() == '+')
- || (ctx.peek() == '-'))
- {
- pc.str.append(ctx.get());
- }
-
- while (is_dec_(ctx.peek()))
- {
- pc.str.append(ctx.get());
- }
- }
-
- /*
- * Check the suffixes
- * Valid suffixes per language (not that it matters):
- * Integer Float
- * C/C++: uUlL64 lLfF
- * C#: uUlL fFdDMm
- * D: uUL ifFL
- * Java: lL fFdD
- * Pawn: (none) (none)
- *
- * Note that i, f, d, and m only appear in floats.
- */
- while (1)
- {
- size_t tmp2 = unc_toupper(ctx.peek());
-
- if ( (tmp2 == 'I')
- || (tmp2 == 'F')
- || (tmp2 == 'D')
- || (tmp2 == 'M'))
- {
- is_float = true;
- }
- else if ( (tmp2 != 'L')
- && (tmp2 != 'U'))
- {
- break;
- }
- pc.str.append(ctx.get());
- }
-
- // skip the Microsoft-specific '32' and '64' suffix
- if ( ( (ctx.peek() == '3')
- && (ctx.peek(1) == '2'))
- || ( (ctx.peek() == '6')
- && (ctx.peek(1) == '4')))
- {
- pc.str.append(ctx.get());
- pc.str.append(ctx.get());
- }
- set_chunk_type(&pc, is_float ? CT_NUMBER_FP : CT_NUMBER);
-
- /*
- * If there is anything left, then we are probably dealing with garbage or
- * some sick macro junk. Eat it.
- */
- parse_suffix(ctx, pc);
-
- return(true);
-} // parse_number
-
-
-static bool parse_string(tok_ctx &ctx, chunk_t &pc, size_t quote_idx, bool allow_escape)
-{
- log_rule_B("string_escape_char");
- const size_t escape_char = options::string_escape_char();
-
- log_rule_B("string_escape_char2");
- const size_t escape_char2 = options::string_escape_char2();
-
- log_rule_B("string_replace_tab_chars");
- const bool should_escape_tabs = ( allow_escape
- && options::string_replace_tab_chars()
- && language_is_set(LANG_ALLC));
-
- pc.str.clear();
-
- while (quote_idx-- > 0)
- {
- pc.str.append(ctx.get());
- }
- set_chunk_type(&pc, CT_STRING);
- const size_t termination_character = CharTable::Get(ctx.peek()) & 0xff;
-
- pc.str.append(ctx.get()); // store the "
-
- bool escaped = false;
-
- while (ctx.more())
- {
- const size_t ch = ctx.get();
-
- // convert char 9 (\t) to chars \t
- if ( (ch == '\t')
- && should_escape_tabs)
- {
- const size_t lastcol = ctx.c.col - 1;
- ctx.c.col = lastcol + 2;
- pc.str.append(escape_char);
- pc.str.append('t');
- continue;
- }
- pc.str.append(ch);
-
- if (ch == '\n')
- {
- pc.nl_count++;
- set_chunk_type(&pc, CT_STRING_MULTI);
- }
- else if ( ch == '\r'
- && ctx.peek() != '\n')
- {
- pc.str.append(ctx.get());
- pc.nl_count++;
- set_chunk_type(&pc, CT_STRING_MULTI);
- }
-
- // if last char in prev loop was escaped the one in the current loop isn't
- if (escaped)
- {
- escaped = false;
- continue;
- }
-
- // see if the current char is a escape char
- if (allow_escape)
- {
- if (ch == escape_char)
- {
- escaped = (escape_char != 0);
- continue;
- }
-
- if ( ch == escape_char2
- && (ctx.peek() == termination_character))
- {
- escaped = allow_escape;
- continue;
- }
- }
-
- if (ch == termination_character)
- {
- break;
- }
- }
- parse_suffix(ctx, pc, true);
- return(true);
-} // parse_string
-
-enum cs_string_t
-{
- CS_STRING_NONE = 0,
- CS_STRING_STRING = 1 << 0, // is any kind of string
- CS_STRING_VERBATIM = 1 << 1, // @"" style string
- CS_STRING_INTERPOLATED = 1 << 2, // $"" or $@"" style string
-};
-
-static cs_string_t operator|=(cs_string_t &value, cs_string_t other)
-{
- return(value = static_cast<cs_string_t>(value | other));
-}
-
-
-static cs_string_t parse_cs_string_start(tok_ctx &ctx, chunk_t &pc)
-{
- cs_string_t stringType = CS_STRING_NONE;
- int offset = 0;
-
- if (ctx.peek(offset) == '$')
- {
- stringType |= CS_STRING_INTERPOLATED;
- ++offset;
- }
-
- if (ctx.peek(offset) == '@')
- {
- stringType |= CS_STRING_VERBATIM;
- ++offset;
- }
-
- if (ctx.peek(offset) == '"')
- {
- stringType |= CS_STRING_STRING;
-
- set_chunk_type(&pc, CT_STRING);
-
- for (int i = 0; i <= offset; ++i)
- {
- pc.str.append(ctx.get());
- }
- }
- else
- {
- stringType = CS_STRING_NONE;
- }
- return(stringType);
-} // parse_cs_string_start
-
-
-struct CsStringParseState
-{
- cs_string_t type;
- int braceDepth;
-
-
- CsStringParseState(cs_string_t stringType)
- {
- type = stringType;
- braceDepth = 0;
- }
-};
-
-
-/**
- * C# strings are complex enough (mostly due to interpolation and nesting) that they need a custom parser.
- */
-static bool parse_cs_string(tok_ctx &ctx, chunk_t &pc)
-{
- cs_string_t stringType = parse_cs_string_start(ctx, pc);
-
- if (stringType == 0)
- {
- return(false);
- }
- // an interpolated string can contain {expressions}, which can contain $"strings", which in turn
- // can contain {expressions}, so we must track both as they are interleaved, in order to properly
- // parse the outermost string.
-
- std::stack<CsStringParseState> parseState; // each entry is a nested string
-
- parseState.push(CsStringParseState(stringType));
-
- log_rule_B("string_replace_tab_chars");
- bool should_escape_tabs = options::string_replace_tab_chars();
-
- while (ctx.more())
- {
- if (parseState.top().braceDepth > 0)
- {
- // all we can do when in an expr is look for expr close with }, or a new string opening. must do this first
- // so we can peek and potentially consume chars for new string openings, before the ch=get() happens later,
- // which is needed for newline processing.
-
- if (ctx.peek() == '}')
- {
- pc.str.append(ctx.get());
-
- if (ctx.peek() == '}')
- {
- pc.str.append(ctx.get()); // in interpolated string, `}}` is escape'd `}`
- }
- else
- {
- --parseState.top().braceDepth;
- }
- continue;
- }
- stringType = parse_cs_string_start(ctx, pc);
-
- if (stringType)
- {
- parseState.push(CsStringParseState(stringType));
- continue;
- }
- }
- int lastcol = ctx.c.col;
- int ch = ctx.get();
-
- pc.str.append(ch);
-
- if (ch == '\n')
- {
- set_chunk_type(&pc, CT_STRING_MULTI);
- pc.nl_count++;
- }
- else if (ch == '\r')
- {
- set_chunk_type(&pc, CT_STRING_MULTI);
- }
- else if (parseState.top().braceDepth > 0)
- {
- // do nothing. if we're in a brace, we only want the newline handling, and skip the rest.
- }
- else if ( (ch == '\t')
- && should_escape_tabs)
- {
- if (parseState.top().type & CS_STRING_VERBATIM)
- {
- if (!cpd.warned_unable_string_replace_tab_chars)
- {
- cpd.warned_unable_string_replace_tab_chars = true;
-
- log_rule_B("warn_level_tabs_found_in_verbatim_string_literals");
- log_sev_t warnlevel = (log_sev_t)options::warn_level_tabs_found_in_verbatim_string_literals();
-
- /*
- * a tab char can't be replaced with \\t because escapes don't
- * work in here-strings. best we can do is warn.
- */
- LOG_FMT(warnlevel, "%s(%d): %s: orig_line is %zu, orig_col is %zu, Detected non-replaceable tab char in literal string\n",
- __func__, __LINE__, cpd.filename.c_str(), pc.orig_line, pc.orig_col);
- LOG_FMT(warnlevel, "%s(%d): Warning is given if doing tab-to-\\t replacement and we have found one in a C# verbatim string literal.\n",
- __func__, __LINE__);
-
- if (warnlevel < LWARN)
- {
- cpd.error_count++;
- }
- }
- }
- else
- {
- ctx.c.col = lastcol + 2;
- pc.str.pop_back(); // remove \t
- pc.str.append("\\t");
-
- continue;
- }
- }
- else if ( ch == '\\'
- && !(parseState.top().type & CS_STRING_VERBATIM))
- {
- // catch escaped quote in order to avoid ending string (but also must handle \\ to avoid accidental 'escape' seq of `\\"`)
- if ( ctx.peek() == '"'
- || ctx.peek() == '\\')
- {
- pc.str.append(ctx.get());
- }
- }
- else if (ch == '"')
- {
- if ( (parseState.top().type & CS_STRING_VERBATIM)
- && (ctx.peek() == '"'))
- {
- // in verbatim string, `""` is escape'd `"`
- pc.str.append(ctx.get());
- }
- else
- {
- // end of string
- parseState.pop();
-
- if (parseState.empty())
- {
- break;
- }
- }
- }
- else if (parseState.top().type & CS_STRING_INTERPOLATED)
- {
- if (ch == '{')
- {
- if (ctx.peek() == '{')
- {
- pc.str.append(ctx.get()); // in interpolated string, `{{` is escape'd `{`
- }
- else
- {
- ++parseState.top().braceDepth;
- }
- }
- }
- }
- return(true);
-} // parse_cs_string
-
-
-static void parse_verbatim_string(tok_ctx &ctx, chunk_t &pc)
-{
- set_chunk_type(&pc, CT_STRING);
-
- // consume the initial """
- pc.str = ctx.get();
- pc.str.append(ctx.get());
- pc.str.append(ctx.get());
-
- // go until we hit a zero (end of file) or a """
- while (ctx.more())
- {
- size_t ch = ctx.get();
- pc.str.append(ch);
-
- if ( (ch == '"')
- && (ctx.peek() == '"')
- && (ctx.peek(1) == '"'))
- {
- pc.str.append(ctx.get());
- pc.str.append(ctx.get());
- break;
- }
-
- if ( (ch == '\n')
- || (ch == '\r'))
- {
- set_chunk_type(&pc, CT_STRING_MULTI);
- pc.nl_count++;
- }
- }
-}
-
-
-static bool tag_compare(const deque<int> &d, size_t a_idx, size_t b_idx, size_t len)
-{
- if (a_idx != b_idx)
- {
- while (len-- > 0)
- {
- if (d[a_idx] != d[b_idx])
- {
- return(false);
- }
- }
- }
- return(true);
-}
-
-
-static bool parse_cr_string(tok_ctx &ctx, chunk_t &pc, size_t q_idx)
-{
- size_t tag_idx = ctx.c.idx + q_idx + 1;
- size_t tag_len = 0;
-
- ctx.save();
-
- // Copy the prefix + " to the string
- pc.str.clear();
- int cnt = q_idx + 1;
-
- while (cnt--)
- {
- pc.str.append(ctx.get());
- }
-
- // Add the tag and get the length of the tag
- while ( ctx.more()
- && (ctx.peek() != '('))
- {
- tag_len++;
- pc.str.append(ctx.get());
- }
-
- if (ctx.peek() != '(')
- {
- ctx.restore();
- return(false);
- }
- set_chunk_type(&pc, CT_STRING);
-
- while (ctx.more())
- {
- if ( (ctx.peek() == ')')
- && (ctx.peek(tag_len + 1) == '"')
- && tag_compare(ctx.data, tag_idx, ctx.c.idx + 1, tag_len))
- {
- cnt = tag_len + 2; // for the )"
-
- while (cnt--)
- {
- pc.str.append(ctx.get());
- }
- parse_suffix(ctx, pc);
- return(true);
- }
-
- if (ctx.peek() == '\n')
- {
- pc.str.append(ctx.get());
- pc.nl_count++;
- set_chunk_type(&pc, CT_STRING_MULTI);
- }
- else
- {
- pc.str.append(ctx.get());
- }
- }
- ctx.restore();
- return(false);
-} // parse_cr_string
-
-
-/**
- * Count the number of characters in a word.
- * The first character is already valid for a keyword
- *
- * @param pc The structure to update, str is an input.
- * @return Whether a word was parsed (always true)
- */
-static bool parse_word(tok_ctx &ctx, chunk_t &pc, bool skipcheck)
-{
- static unc_text intr_txt("@interface");
-
- // The first character is already valid
- pc.str.clear();
- pc.str.append(ctx.get());
-
- while (ctx.more())
- {
- size_t ch = ctx.peek();
-
- if (CharTable::IsKw2(ch))
- {
- pc.str.append(ctx.get());
- }
- else if ( (ch == '\\')
- && (unc_tolower(ctx.peek(1)) == 'u'))
- {
- pc.str.append(ctx.get());
- pc.str.append(ctx.get());
- skipcheck = true;
- }
- else
- {
- break;
- }
-
- // HACK: Non-ASCII character are only allowed in identifiers
- if (ch > 0x7f)
- {
- skipcheck = true;
- }
- }
- set_chunk_type(&pc, CT_WORD);
-
- if (skipcheck)
- {
- return(true);
- }
-
- // Detect pre-processor functions now
- if ( cpd.in_preproc == CT_PP_DEFINE
- && cpd.preproc_ncnl_count == 1)
- {
- if (ctx.peek() == '(')
- {
- set_chunk_type(&pc, CT_MACRO_FUNC);
- }
- else
- {
- set_chunk_type(&pc, CT_MACRO);
-
- log_rule_B("pp_ignore_define_body");
-
- if (options::pp_ignore_define_body())
- {
- /*
- * We are setting the PP_IGNORE preproc state because the following
- * chunks are part of the macro body and will have to be ignored.
- */
- cpd.in_preproc = CT_PP_IGNORE;
- }
- }
- }
- else
- {
- // '@interface' is reserved, not an interface itself
- if ( language_is_set(LANG_JAVA)
- && pc.str.startswith("@")
- && !pc.str.equals(intr_txt))
- {
- set_chunk_type(&pc, CT_ANNOTATION);
- }
- else
- {
- // Turn it into a keyword now
- // Issue #1460 will return "COMMENT_CPP"
- set_chunk_type(&pc, find_keyword_type(pc.text(), pc.str.size()));
-
- /* Special pattern: if we're trying to redirect a preprocessor directive to PP_IGNORE,
- * then ensure we're actually part of a preprocessor before doing the swap, or we'll
- * end up with a function named 'define' as PP_IGNORE. This is necessary because with
- * the config 'set' feature, there's no way to do a pair of tokens as a word
- * substitution. */
- if ( pc.type == CT_PP_IGNORE
- && !cpd.in_preproc)
- {
- set_chunk_type(&pc, find_keyword_type(pc.text(), pc.str.size()));
- }
- else if (pc.type == CT_COMMENT_CPP) // Issue #1460
- {
- size_t ch;
- bool is_cs = language_is_set(LANG_CS);
-
- // read until EOL
- while (true)
- {
- int bs_cnt = 0;
-
- while (ctx.more())
- {
- ch = ctx.peek();
-
- if ( (ch == '\r')
- || (ch == '\n'))
- {
- break;
- }
-
- if ( (ch == '\\')
- && !is_cs) // backslashes aren't special in comments in C#
- {
- bs_cnt++;
- }
- else
- {
- bs_cnt = 0;
- }
- pc.str.append(ctx.get());
- }
-
- /*
- * If we hit an odd number of backslashes right before the newline,
- * then we keep going.
- */
- if ( ((bs_cnt & 1) == 0)
- || !ctx.more())
- {
- break;
- }
-
- if (ctx.peek() == '\r')
- {
- pc.str.append(ctx.get());
- }
-
- if (ctx.peek() == '\n')
- {
- pc.str.append(ctx.get());
- }
- pc.nl_count++;
- cpd.did_newline = true;
- }
- // Store off the end column
- pc.orig_col_end = ctx.c.col;
- }
- }
- }
- return(true);
-} // parse_word
-
-
-static size_t parse_attribute_specifier_sequence(tok_ctx &ctx) // look-ahead only: checks for a '[[ ... ]]' attribute sequence at the current position using ctx.peek(), consuming nothing
-{ // returns the peek offset just past the closing ']]' (the caller uses it as the length to consume), or 0 when no complete sequence is found
- size_t nested = 0; // number of '[[' pairs currently open
- size_t offset = 0; // next look-ahead index handed to ctx.peek()
- size_t parens = 0; // number of '(' currently open
- auto ch1 = ctx.peek(offset++);
-
- while (ch1)
- {
- auto ch2 = ctx.peek(offset++);
- // skip blanks, tabs and line breaks so that ch2 is the next significant character after ch1
- while ( ch2 == ' '
- || ch2 == '\n'
- || ch2 == '\r'
- || ch2 == '\t')
- {
- ch2 = ctx.peek(offset++);
- }
- // while no '[[' is open, the next significant character has to be '[', otherwise give up
- if ( nested == 0
- && ch2 != '[')
- {
- break;
- }
- // parentheses are counted; single brackets are only tolerated while a '(' is open (see below)
- if (ch1 == '(')
- {
- ++parens;
- ch1 = ch2;
- continue;
- }
- // a ')' without a matching '(' aborts the scan
- if (ch1 == ')')
- {
- if (parens == 0)
- {
- break;
- }
- --parens;
- ch1 = ch2;
- continue;
- }
- // any character that is not a bracket is simply stepped over
- if ( ch1 != '['
- && ch1 != ']')
- {
- ch1 = ch2;
- continue;
- }
- // ch1 is a bracket that is not doubled: only acceptable inside parentheses
- if (ch2 != ch1)
- {
- if (parens == 0)
- {
- break;
- }
- ch1 = ch2;
- continue;
- }
- // from here on ch1 == ch2, i.e. we are looking at '[[' or ']]'
- if (ch1 == '[')
- {
- if ( nested != 0 // a further '[[' while one is already open is rejected unless inside parentheses
- && parens == 0)
- {
- break;
- }
- ++nested;
- }
- else if (--nested == 0) // ']]' closes one level; nested is non-zero here because the nested == 0 check above requires ch2 == '['
- {
- return(offset);
- }
- ch1 = ctx.peek(offset++); // both characters of the pair were handled, continue with the character after them
- }
- return(0);
-} // parse_attribute_specifier_sequence
-
-
-/**
- * Consumes 'length' characters from the stream into pc and marks the chunk
- * as CT_ATTRIBUTE.
- *
- * @param ctx     tokenizer context to read from
- * @param pc      chunk that receives the text and the type
- * @param length  number of characters to take from ctx
- *
- * @return always true
- */
-static bool extract_attribute_specifier_sequence(tok_ctx &ctx, chunk_t &pc, size_t length)
-{
-   // the chunk text is rebuilt from scratch
-   pc.str.clear();
-
-   for (size_t remaining = length; remaining > 0; --remaining)
-   {
-      pc.str.append(ctx.get());
-   }
-
-   set_chunk_type(&pc, CT_ATTRIBUTE);
-   return(true);
-} // extract_attribute_specifier_sequence
-
-
-/**
- * Consumes a run of whitespace (including line endings) from the stream.
- *
- * Line endings are tallied per style in the LE_COUNT() counters and
- * pc.orig_prev_sp is updated for blanks and tabs (reset to 0 on a line end).
- *
- * @param ctx  tokenizer context to read from
- * @param pc   chunk to fill in: type CT_NEWLINE when at least one line end
- *             was consumed, CT_WHITESPACE otherwise
- *
- * @return true if whitespace was consumed, false otherwise
- */
-static bool parse_whitespace(tok_ctx &ctx, chunk_t &pc)
-{
-   size_t line_ends = 0;
-   size_t taken     = 0;  // most recently consumed character, stays 0 if nothing was consumed
-
-   // REVISIT: use a better whitespace detector?
-   while (  ctx.more()
-         && unc_isspace(ctx.peek()))
-   {
-      taken = ctx.get(); // throw away the whitespace char
-
-      if (taken == '\r')
-      {
-         if (ctx.expect('\n'))
-         {
-            // CRLF ending
-            ++LE_COUNT(CRLF);
-         }
-         else
-         {
-            // CR ending
-            ++LE_COUNT(CR);
-         }
-         line_ends++;
-         pc.orig_prev_sp = 0;
-      }
-      else if (taken == '\n')
-      {
-         // LF ending
-         ++LE_COUNT(LF);
-         line_ends++;
-         pc.orig_prev_sp = 0;
-      }
-      else if (taken == '\t')
-      {
-         log_rule_B("input_tab_size");
-         pc.orig_prev_sp += calc_next_tab_column(cpd.column, options::input_tab_size()) - cpd.column;
-      }
-      else if (taken == ' ')
-      {
-         pc.orig_prev_sp++;
-      }
-      // any other whitespace character is dropped without bookkeeping
-   }
-
-   if (taken == 0)
-   {
-      return(false);
-   }
-   pc.str.clear();
-   set_chunk_type(&pc, (line_ends != 0) ? CT_NEWLINE : CT_WHITESPACE);
-   pc.nl_count  = line_ends;
-   pc.after_tab = (ctx.c.last_ch == '\t');
-   return(true);
-} // parse_whitespace
-
-
-/**
- * Tries to parse a backslash-newline (line continuation). Whitespace between
- * the backslash and the line end is skipped.
- *
- * The caller is expected to have checked that the current character is '\'.
- *
- * @param ctx  tokenizer context; restored to its entry state on failure
- * @param pc   chunk to fill in (CT_NL_CONT, text "\", nl_count 1) on success
- *
- * @return true if a continuation was found, false otherwise
- */
-static bool parse_bs_newline(tok_ctx &ctx, chunk_t &pc)
-{
-   ctx.save();
-   ctx.get(); // skip the '\'
-
-   while (ctx.more())
-   {
-      const size_t cur = ctx.peek();
-
-      if (!unc_isspace(cur))
-      {
-         break;
-      }
-      ctx.get();
-
-      const bool is_cr = (cur == '\r');
-
-      if (  !is_cr
-         && (cur != '\n'))
-      {
-         // plain blank or tab, keep looking for the line end
-         continue;
-      }
-
-      if (is_cr)
-      {
-         // swallow the LF of a CRLF pair
-         ctx.expect('\n');
-      }
-      set_chunk_type(&pc, CT_NL_CONT);
-      pc.str      = "\\";
-      pc.nl_count = 1;
-      return(true);
-   }
-   // no line end followed the backslash: undo everything
-   ctx.restore();
-   return(false);
-}
-
-
-/**
- * Consumes optional blanks/tabs followed by one line ending.
- *
- * @param ctx  tokenizer context; restored to its entry state when the blanks
- *             are not followed by a line ending
- *
- * @return true if a line ending was consumed, false otherwise
- */
-static bool parse_newline(tok_ctx &ctx)
-{
-   ctx.save();
-
-   // Eat whitespace
-   for (size_t blank = ctx.peek(); (blank == ' ') || (blank == '\t'); blank = ctx.peek())
-   {
-      ctx.get();
-   }
-
-   const size_t eol = ctx.peek();
-
-   if (  (eol != '\r')
-      && (eol != '\n'))
-   {
-      ctx.restore();
-      return(false);
-   }
-
-   if (!ctx.expect('\n'))
-   {
-      // not an LF, so it is a CR: take it and an LF that may follow
-      ctx.get();
-      ctx.expect('\n');
-   }
-   return(true);
-}
-
-
-/**
- * Collects a PAWN pattern into pc: every character up to, but not including,
- * the next whitespace character, an escaped newline, or the end of the input.
- *
- * @param ctx  tokenizer context to read from
- * @param pc   chunk that receives the pattern text
- * @param tt   chunk type to assign to pc
- */
-static void parse_pawn_pattern(tok_ctx &ctx, chunk_t &pc, c_token_t tt)
-{
-   pc.str.clear();
-   set_chunk_type(&pc, tt);
-
-   // ctx.peek() yields 0 once the input is exhausted, and 0 is presumably not
-   // reported as whitespace by unc_isspace(); without the ctx.more() guard a
-   // pattern that runs up to the end of the file would loop forever.
-   while (  ctx.more()
-         && !unc_isspace(ctx.peek()))
-   {
-      // end the pattern on an escaped newline
-      if (ctx.peek() == '\\')
-      {
-         const size_t ch = ctx.peek(1);
-
-         if (  (ch == '\n')
-            || (ch == '\r'))
-         {
-            break;
-         }
-      }
-      pc.str.append(ctx.get());
-   }
-}
-
-
-/**
- * Consumes consecutive newlines/blank lines and reports them as a single
- * CT_NEWLINE chunk.
- *
- * @param ctx  tokenizer context to read from
- * @param pc   chunk to fill in (nl_count and type) when lines were consumed
- *
- * @return true if at least one line ending was consumed, false otherwise
- */
-static bool parse_off_newlines(tok_ctx &ctx, chunk_t &pc)
-{
-   size_t lines_eaten = 0;
-
-   // Parse off newlines/blank lines
-   while (parse_newline(ctx))
-   {
-      ++lines_eaten;
-   }
-
-   if (lines_eaten == 0)
-   {
-      return(false);
-   }
-   pc.nl_count = lines_eaten;
-   set_chunk_type(&pc, CT_NEWLINE);
-   return(true);
-}
-
-
-/**
- * Parses a piece of a multi-line macro so that it can be passed through as
- * CT_IGNORED (used when options::disable_processing_nl_cont() is set).
- *
- * Newlines and comments are handed to their own parsers first. Otherwise text
- * is collected up to a backslash-newline, or up to a plain newline when the
- * previous chunk was a CT_NL_CONT or CT_COMMENT_MULTI.
- *
- * @param ctx      tokenizer context; restored when no ignored text is produced
- * @param pc       chunk to fill in
- * @param prev_pc  previously produced chunk, may be nullptr
- *
- * @return true if a chunk was produced, false otherwise
- */
-static bool parse_macro(tok_ctx &ctx, chunk_t &pc, const chunk_t *prev_pc)
-{
-   if (parse_off_newlines(ctx, pc))
-   {
-      return(true);
-   }
-
-   if (parse_comment(ctx, pc)) // allow CT_COMMENT_MULTI within macros
-   {
-      return(true);
-   }
-   ctx.save();
-   pc.str.clear();
-
-   const bool after_continuation = (  chunk_is_token(prev_pc, CT_NL_CONT)
-                                   || chunk_is_token(prev_pc, CT_COMMENT_MULTI));
-
-   while (ctx.more())
-   {
-      const size_t cur       = ctx.peek();
-      const size_t following = ctx.peek(1);
-      const bool   at_eol    = (cur == '\n') || (cur == '\r');
-      const bool   at_bs_eol = (cur == '\\') && ((following == '\n') || (following == '\r'));
-      const bool   chunk_end = at_bs_eol || (after_continuation && at_eol);
-
-      if (  chunk_end
-         && pc.str.size() > 0)
-      {
-         set_chunk_type(&pc, CT_IGNORED);
-         return(true);
-      }
-
-      if (at_eol)
-      {
-         break;
-      }
-      pc.str.append(ctx.get());
-   }
-   // nothing to ignore on this line: drop what was collected and rewind
-   pc.str.clear();
-   ctx.restore();
-   return(false);
-} // parse_macro
-
-
-/**
- * Produces chunks while processing is switched off (cpd.unc_off is set).
- *
- * Newlines are returned as CT_NEWLINE. A line containing '#endasm', or
- * '#pragma' and 'endasm' each delimited by a blank or a tab, switches
- * processing back on and is left for the regular parsers. Otherwise the line
- * is returned as CT_IGNORED unless it contains the enable_processing_cmt text
- * (or matches it as a regex), in which case leading whitespace and the
- * comment are parsed off individually.
- *
- * @param ctx  tokenizer context to read from
- * @param pc   chunk to fill in
- *
- * @return true if a chunk was produced, false if the caller should continue
- *         with normal parsing (or the input is exhausted)
- */
-static bool parse_ignored(tok_ctx &ctx, chunk_t &pc)
-{
-   if (parse_off_newlines(ctx, pc))
-   {
-      return(true);
-   }
-   // See if the UO_enable_processing_cmt or #pragma endasm / #endasm text is on this line
-   ctx.save();
-   pc.str.clear();
-
-   while (  ctx.more()
-         && (ctx.peek() != '\r')
-         && (ctx.peek() != '\n'))
-   {
-      pc.str.append(ctx.get());
-   }
-
-   if (pc.str.size() == 0)
-   {
-      // end of file?
-      return(false);
-   }
-   auto line_has = [&pc](const char *needle) {
-                      return(pc.str.find(needle) >= 0);
-                   };
-
-   // HACK: turn on if we find '#endasm' or '#pragma' and 'endasm' separated by blanks.
-   // Both a space and a tab count as a blank; the tab is spelled as an escape
-   // so that the two alternatives cannot silently collapse into the same text.
-   const bool has_pragma = (  line_has("#pragma ")
-                           || line_has("#pragma\t"));
-   const bool has_endasm = (  line_has(" endasm")
-                           || line_has("\tendasm"));
-
-   if (  (  has_pragma
-         && has_endasm)
-      || line_has("#endasm"))
-   {
-      cpd.unc_off = false;
-      ctx.restore();
-      pc.str.clear();
-      return(false);
-   }
-   // Note that we aren't actually making sure this is in a comment, yet
-   log_rule_B("enable_processing_cmt");
-   const auto &ontext = options::enable_processing_cmt();
-
-   if (!ontext.empty())
-   {
-      bool found_enable_pattern = false;
-
-      if (  ontext != UNCRUSTIFY_ON_TEXT
-         && options::processing_cmt_as_regex())
-      {
-         std::wstring pc_wstring(pc.str.get().cbegin(),
-                                 pc.str.get().cend());
-         std::wregex  criteria(std::wstring(ontext.cbegin(),
-                                            ontext.cend()));
-
-         found_enable_pattern = std::regex_search(pc_wstring.cbegin(),
-                                                  pc_wstring.cend(),
-                                                  criteria);
-      }
-      else
-      {
-         found_enable_pattern = (pc.str.find(ontext.c_str()) >= 0);
-      }
-
-      if (!found_enable_pattern)
-      {
-         set_chunk_type(&pc, CT_IGNORED);
-         return(true);
-      }
-   }
-   // the enable text is on this line: rewind and take the line apart piece by piece
-   ctx.restore();
-
-   // parse off whitespace leading to the comment
-   if (parse_whitespace(ctx, pc))
-   {
-      set_chunk_type(&pc, CT_IGNORED);
-      return(true);
-   }
-
-   // Look for the ending comment and let it pass
-   if (  parse_comment(ctx, pc)
-      && !cpd.unc_off)
-   {
-      return(true);
-   }
-   // Reset the chunk & scan until a newline
-   pc.str.clear();
-
-   while (  ctx.more()
-         && (ctx.peek() != '\r')
-         && (ctx.peek() != '\n'))
-   {
-      pc.str.append(ctx.get());
-   }
-
-   if (pc.str.size() > 0)
-   {
-      set_chunk_type(&pc, CT_IGNORED);
-      return(true);
-   }
-   return(false);
-} // parse_ignored
-
-
-static bool parse_next(tok_ctx &ctx, chunk_t &pc, const chunk_t *prev_pc) // parses the next chunk from ctx into pc by trying the specialised parsers below in order; prev_pc is the previously produced chunk (may be nullptr)
-{ // returns false only when no input is left; otherwise true (an unrecognised character becomes a CT_UNKNOWN chunk and bumps cpd.error_count)
- if (!ctx.more())
- {
- return(false);
- }
- // Save off the current column
- set_chunk_type(&pc, CT_NONE);
- pc.orig_line = ctx.c.row;
- pc.column = ctx.c.col;
- pc.orig_col = ctx.c.col;
- pc.nl_count = 0;
- pc.flags = PCF_NONE;
-
- // If processing is turned off, parse_ignored() gets the first try; when it declines we fall through to the normal parsers
- if (cpd.unc_off)
- {
- if (parse_ignored(ctx, pc))
- {
- return(true);
- }
- }
- log_rule_B("disable_processing_nl_cont");
-
- // Parse macro blocks
- if (options::disable_processing_nl_cont())
- {
- if (parse_macro(ctx, pc, prev_pc))
- {
- return(true);
- }
- }
-
- // Parse whitespace
- if (parse_whitespace(ctx, pc))
- {
- return(true);
- }
-
- // Handle unknown/unhandled preprocessors
- if ( cpd.in_preproc > CT_PP_BODYCHUNK
- && cpd.in_preproc <= CT_PP_OTHER)
- {
- pc.str.clear();
- tok_info ss; // position just before the most recently appended character
- ctx.save(ss);
- // Chunk to a newline or comment
- set_chunk_type(&pc, CT_PREPROC_BODY);
- size_t last = 0; // last character appended to pc.str
-
- while (ctx.more())
- {
- size_t ch = ctx.peek();
-
- // Fix for issue #1752
- // Ignoring extra spaces after ' \ ' for preproc body continuations
- if ( last == '\\'
- && ch == ' ')
- {
- ctx.get();
- continue;
- }
-
- if ( (ch == '\n')
- || (ch == '\r'))
- {
- // Back off if this is an escaped newline
- if (last == '\\')
- {
- ctx.restore(ss); // rewind to just before the backslash
- pc.str.pop_back(); // and drop the backslash from the chunk text
- }
- break;
- }
-
- // Quit on a C or C++ comment start Issue #1966
- if ( (ch == '/')
- && ( (ctx.peek(1) == '/')
- || (ctx.peek(1) == '*')))
- {
- break;
- }
- last = ch;
- ctx.save(ss);
-
- pc.str.append(ctx.get());
- }
-
- if (pc.str.size() > 0)
- {
- return(true);
- }
- }
-
- // Detect backslash-newline
- if ( (ctx.peek() == '\\')
- && parse_bs_newline(ctx, pc))
- {
- return(true);
- }
-
- // Parse comments
- if (parse_comment(ctx, pc))
- {
- return(true);
- }
-
- // Parse code placeholders
- if (parse_code_placeholder(ctx, pc))
- {
- return(true);
- }
-
- if (language_is_set(LANG_CS))
- {
- if (parse_cs_string(ctx, pc))
- {
- return(true);
- }
-
- // check for non-keyword identifiers such as @if @switch, etc
- if ( (ctx.peek() == '@')
- && CharTable::IsKw1(ctx.peek(1)))
- {
- parse_word(ctx, pc, true);
- return(true);
- }
- }
-
- // handle VALA """ strings """
- if ( language_is_set(LANG_VALA)
- && (ctx.peek() == '"')
- && (ctx.peek(1) == '"')
- && (ctx.peek(2) == '"'))
- {
- parse_verbatim_string(ctx, pc);
- return(true);
- }
- /*
- * handle C++(11) string/char literal prefixes u8|u|U|L|R including all
- * possible combinations and optional R delimiters: R"delim(x)delim"
- */
- auto ch = ctx.peek();
-
- if ( language_is_set(LANG_C | LANG_CPP)
- && ( ch == 'u'
- || ch == 'U'
- || ch == 'R'
- || ch == 'L'))
- {
- auto idx = size_t{}; // length of the literal prefix seen so far
- auto is_real = false; // set when the prefix contains the raw-string 'R'
-
- if ( ch == 'u'
- && ctx.peek(1) == '8')
- {
- idx = 2;
- }
- else if ( unc_tolower(ch) == 'u'
- || ch == 'L')
- {
- idx++;
- }
-
- if ( language_is_set(LANG_C | LANG_CPP) // NOTE(review): same language test as the enclosing if, so always true here
- && ctx.peek(idx) == 'R')
- {
- idx++;
- is_real = true;
- }
- const auto quote = ctx.peek(idx);
-
- if (is_real)
- {
- if ( quote == '"'
- && parse_cr_string(ctx, pc, idx))
- {
- return(true);
- }
- }
- else if ( ( quote == '"'
- || quote == '\'')
- && parse_string(ctx, pc, idx, true))
- {
- return(true);
- }
- }
-
- // PAWN specific stuff
- if (language_is_set(LANG_PAWN))
- {
- if ( cpd.preproc_ncnl_count == 1
- && ( cpd.in_preproc == CT_PP_DEFINE
- || cpd.in_preproc == CT_PP_EMIT))
- {
- parse_pawn_pattern(ctx, pc, CT_MACRO);
- return(true);
- }
-
- // Check for PAWN strings: \"hi" or !"hi" or !\"hi" or \!"hi"
- if ( (ctx.peek() == '\\')
- || (ctx.peek() == '!'))
- {
- if (ctx.peek(1) == '"')
- {
- parse_string(ctx, pc, 1, (ctx.peek() == '!'));
- return(true);
- }
-
- if ( ( (ctx.peek(1) == '\\')
- || (ctx.peek(1) == '!'))
- && (ctx.peek(2) == '"'))
- {
- parse_string(ctx, pc, 2, false);
- return(true);
- }
- }
-
- // handle PAWN preprocessor args %0 .. %9
- if ( cpd.in_preproc == CT_PP_DEFINE
- && (ctx.peek() == '%')
- && unc_isdigit(ctx.peek(1)))
- {
- pc.str.clear();
- pc.str.append(ctx.get());
- pc.str.append(ctx.get());
- set_chunk_type(&pc, CT_WORD);
- return(true);
- }
- }
- // Parse numbers first, then strings and character constants
-
-//parse_word(ctx, pc_temp, true);
-//ctx.restore(ctx.c);
- if (parse_number(ctx, pc))
- {
- return(true);
- }
-
- if (language_is_set(LANG_D))
- {
- // D specific stuff
- if (d_parse_string(ctx, pc))
- {
- return(true);
- }
- }
- else
- {
- // Not D stuff
-
- // Check for L'a', L"abc", 'a', "abc", <abc> strings
- ch = ctx.peek();
- size_t ch1 = ctx.peek(1);
-
- if ( ( ( (ch == 'L')
- || (ch == 'S'))
- && ( (ch1 == '"')
- || (ch1 == '\'')))
- || (ch == '"')
- || (ch == '\'')
- || ( (ch == '<')
- && cpd.in_preproc == CT_PP_INCLUDE))
- {
- parse_string(ctx, pc, unc_isalpha(ch) ? 1 : 0, true); // a one-letter prefix (L or S) shifts the quote to index 1
- set_chunk_parent(&pc, CT_PP_INCLUDE); // NOTE(review): applied to every string parsed here, not only to '<...>' include strings - confirm this is intended
- return(true);
- }
-
- if ( (ch == '<')
- && cpd.in_preproc == CT_PP_DEFINE)
- {
- if (chunk_is_token(chunk_get_tail(), CT_MACRO))
- {
- // We have "#define XXX <", assume '<' starts an include string
- parse_string(ctx, pc, 0, false);
- return(true);
- }
- }
-
- /* Inside clang's __has_include() could be "path/to/file.h" or system-style <path/to/file.h> */
- if ( (ch == '(')
- && (chunk_get_tail() != nullptr)
- && ( chunk_is_token(chunk_get_tail(), CT_CNG_HASINC)
- || chunk_is_token(chunk_get_tail(), CT_CNG_HASINCN)))
- {
- parse_string(ctx, pc, 0, false);
- return(true);
- }
- }
-
- // Check for Objective C literals and VALA identifiers ('@1', '@if')
- if ( language_is_set(LANG_OC | LANG_VALA)
- && (ctx.peek() == '@'))
- {
- size_t nc = ctx.peek(1);
-
- if (nc == 'R') // Issue #2720
- {
- if (ctx.peek(2) == '"')
- {
- if (parse_cr_string(ctx, pc, 2)) // Issue #3027
- {
- return(true);
- }
- // parse string without escaping
- parse_string(ctx, pc, 2, false);
- return(true);
- }
- }
-
- if ( (nc == '"')
- || (nc == '\''))
- {
- // literal string
- parse_string(ctx, pc, 1, true);
- return(true);
- }
-
- if ( (nc >= '0')
- && (nc <= '9'))
- {
- // literal number
- pc.str.append(ctx.get()); // store the '@'
- parse_number(ctx, pc);
- return(true);
- }
- }
-
- // Check for pawn/ObjectiveC/Java and normal identifiers
- if ( CharTable::IsKw1(ctx.peek())
- || ( (ctx.peek() == '\\')
- && (unc_tolower(ctx.peek(1)) == 'u'))
- || ( (ctx.peek() == '@')
- && CharTable::IsKw1(ctx.peek(1))))
- {
- parse_word(ctx, pc, false);
- return(true);
- }
-
- // Check for C++11/14/17/20 attribute specifier sequences
- if ( language_is_set(LANG_CPP)
- && ctx.peek() == '[')
- {
- if ( !language_is_set(LANG_OC)
- || !chunk_is_token(prev_pc, CT_OC_AT))
- {
- if (auto length = parse_attribute_specifier_sequence(ctx)) // non-zero length means a complete '[[ ... ]]' was found by look-ahead
- {
- extract_attribute_specifier_sequence(ctx, pc, length);
- return(true);
- }
- }
- }
- // see if we have a punctuator
- char punc_txt[7]; // the next six characters plus a terminating NUL
-
- punc_txt[0] = ctx.peek();
- punc_txt[1] = ctx.peek(1);
- punc_txt[2] = ctx.peek(2);
- punc_txt[3] = ctx.peek(3);
- punc_txt[4] = ctx.peek(4);
- punc_txt[5] = ctx.peek(5);
- punc_txt[6] = '\0';
- const chunk_tag_t *punc;
-
- if ((punc = find_punctuator(punc_txt, cpd.lang_flags)) != nullptr)
- {
- int cnt = strlen(punc->tag); // consume exactly as many characters as the matched punctuator has
-
- while (cnt--)
- {
- pc.str.append(ctx.get());
- }
- set_chunk_type(&pc, punc->type);
- pc.flags |= PCF_PUNCTUATOR;
- return(true);
- }
- /* When parsing C/C++ files and running into some unknown token,
- * check if matches Objective-C as a last resort, before
- * considering it as garbage.
- */
- int probe_lang_flags = 0;
-
- if (language_is_set(LANG_C | LANG_CPP))
- {
- probe_lang_flags = cpd.lang_flags | LANG_OC;
- }
-
- if (probe_lang_flags != 0)
- {
- if ((punc = find_punctuator(punc_txt, probe_lang_flags)) != NULL)
- {
- cpd.lang_flags = probe_lang_flags; // side effect: LANG_OC stays set in cpd.lang_flags from here on
- int cnt = strlen(punc->tag);
-
- while (cnt--)
- {
- pc.str.append(ctx.get());
- }
- set_chunk_type(&pc, punc->type);
- pc.flags |= PCF_PUNCTUATOR;
- return(true);
- }
- }
- // throw away this character
- set_chunk_type(&pc, CT_UNKNOWN);
- pc.str.append(ctx.get());
-
- LOG_FMT(LWARN, "%s:%zu Garbage in col %d: %x\n",
- cpd.filename.c_str(), pc.orig_line, (int)ctx.c.col, pc.str[0]);
- cpd.error_count++;
- return(true);
-} // parse_next
-
-
-/**
- * Searches 'text', beginning at 'start_idx', for the text configured in
- * options::disable_processing_cmt().
- *
- * The marker is treated as a regular expression when it differs from
- * UNCRUSTIFY_OFF_TEXT and options::processing_cmt_as_regex() is set,
- * otherwise as plain text. On a hit the result is moved back to the start
- * of the line it lies on.
- *
- * @param text       the text to search
- * @param start_idx  index in 'text' at which the search starts
- *
- * @return the index of the start of the line holding the hit, or a negative
- *         value if the marker is empty, 'start_idx' is out of range, or
- *         nothing was found
- */
-int find_disable_processing_comment_marker(const unc_text &text,
-                                           std::size_t    start_idx)
-{
-   log_rule_B("disable_processing_cmt");
-   const auto &marker = options::disable_processing_cmt();
-
-   if (  marker.empty()
-      || start_idx >= text.size())
-   {
-      return(-1);
-   }
-   int pos = -1;
-
-   if (  marker != UNCRUSTIFY_OFF_TEXT
-      && options::processing_cmt_as_regex())
-   {
-      std::wsmatch hit;
-      std::wstring haystack(text.get().cbegin() + start_idx,
-                            text.get().cend());
-      std::wregex  pattern(std::wstring(marker.cbegin(),
-                                        marker.cend()));
-
-      if (std::regex_search(haystack.cbegin(), haystack.cend(), hit, pattern))
-      {
-         // position() is relative to the searched substring
-         pos = int(hit.position() + start_idx);
-      }
-   }
-   else
-   {
-      pos = text.find(marker.c_str(), start_idx);
-
-      if (pos >= 0)
-      {
-         pos += int(marker.size());
-      }
-   }
-
-   // update the position to the start of the current line
-   for ( ; pos > 0 && text[pos - 1] != '\n'; --pos)
-   {
-   }
-
-   return(pos);
-} // find_disable_processing_comment_marker
-
-
-/**
- * Searches 'text', beginning at 'start_idx', for the text configured in
- * options::enable_processing_cmt().
- *
- * The marker is treated as a regular expression when it differs from
- * UNCRUSTIFY_ON_TEXT and options::processing_cmt_as_regex() is set,
- * otherwise as plain text. On a hit the result is advanced from the end of
- * the hit to the end of the line it lies on.
- *
- * @param text       the text to search
- * @param start_idx  index in 'text' at which the search starts
- *
- * @return the index of the '\n' ending the line that holds the end of the
- *         hit (or text.size() if there is none), or a negative value if the
- *         marker is empty, 'start_idx' is out of range, or nothing was found
- */
-int find_enable_processing_comment_marker(const unc_text &text,
-                                          std::size_t    start_idx)
-{
-   log_rule_B("enable_processing_cmt");
-   const auto &ontext = options::enable_processing_cmt();
-   int        idx     = -1;
-
-   if (  !ontext.empty()
-      && start_idx < text.size())
-   {
-      if (  ontext != UNCRUSTIFY_ON_TEXT
-         && options::processing_cmt_as_regex())
-      {
-         std::wsmatch match;
-         std::wstring pc_wstring(text.get().cbegin() + start_idx,
-                                 text.get().cend());
-         std::wregex  criteria(std::wstring(ontext.cbegin(),
-                                            ontext.cend()));
-
-         std::regex_search(pc_wstring.cbegin(),
-                           pc_wstring.cend(),
-                           match,
-                           criteria);
-
-         if (!match.empty())
-         {
-            // Step past the matched text. match.length() is the number of
-            // matched characters; match.size() would be the number of
-            // sub-matches, which is unrelated to any position in the text.
-            idx = int(start_idx + match.position() + match.length());
-         }
-      }
-      else
-      {
-         idx = text.find(ontext.c_str(),
-                         start_idx);
-
-         if (idx >= 0)
-         {
-            idx += int(ontext.size());
-         }
-      }
-
-      /**
-       * update the position to the end of the current line
-       */
-      if (idx >= 0)
-      {
-         while (  idx < int(text.size())
-               && text[idx] != '\n')
-         {
-            ++idx;
-         }
-      }
-   }
-   return(idx);
-} // find_enable_processing_comment_marker
-
-
-void tokenize(const deque<int> &data, chunk_t *ref) // splits 'data' into chunks via parse_next() and adds each one with chunk_add_before(&chunk, ref); also tracks preprocessor state in cpd and finally picks cpd.newline
-{ // when 'ref' is not nullptr every added chunk is flagged PCF_INSERTED
- tok_ctx ctx(data);
- chunk_t chunk; // scratch chunk, reset and refilled on every iteration
- chunk_t *pc = nullptr; // most recently added chunk
- chunk_t *rprev = nullptr; // the chunk added before the current one
- bool last_was_tab = false;
- size_t prev_sp = 0; // orig_prev_sp of a skipped whitespace chunk, handed to the next real chunk
- int num_stripped = 0; // Issue #1966
-
- cpd.unc_stage = unc_stage_e::TOKENIZE;
-
- while (ctx.more())
- {
- chunk.reset();
- chunk.pp_level = 0;
-
- if (!parse_next(ctx, chunk, pc))
- {
- LOG_FMT(LERR, "%s:%zu Bailed before the end?\n",
- cpd.filename.c_str(), ctx.c.row);
- cpd.error_count++;
- break;
- }
- // in Java a '->' tokenized as CT_MEMBER is a lambda arrow
- if ( language_is_set(LANG_JAVA)
- && chunk.type == CT_MEMBER
- && !memcmp(chunk.text(), "->", 2))
- {
- chunk.type = CT_LAMBDA;
- }
-
- // Don't create an entry for whitespace
- if (chunk.type == CT_WHITESPACE)
- {
- last_was_tab = chunk.after_tab;
- prev_sp = chunk.orig_prev_sp;
- continue;
- }
- chunk.orig_prev_sp = prev_sp;
- prev_sp = 0;
-
- if (chunk.type == CT_NEWLINE)
- {
- last_was_tab = chunk.after_tab;
- chunk.after_tab = false;
- chunk.str.clear();
- }
- else if (chunk.type == CT_NL_CONT)
- {
- last_was_tab = chunk.after_tab;
- chunk.after_tab = false;
- chunk.str = "\\\n";
- }
- else
- {
- chunk.after_tab = last_was_tab;
- last_was_tab = false;
- }
-
- if (chunk.type != CT_IGNORED)
- {
- // Issue #1338
- // Strip trailing whitespace (for CPP comments and PP blocks)
- num_stripped = 0; // Issue #1966
-
- while ( (chunk.str.size() > 0)
- && ( (chunk.str[chunk.str.size() - 1] == ' ')
- || (chunk.str[chunk.str.size() - 1] == '\t')))
- {
- // If comment contains backslash '\' followed by whitespace chars, keep last one;
- // this will prevent it from turning '\' into line continuation.
- if ( (chunk.str.size() > 1)
- && (chunk.str[chunk.str.size() - 2] == '\\'))
- {
- break;
- }
- chunk.str.pop_back();
- num_stripped++; // Issue #1966
- }
- }
- // Store off the end column
- chunk.orig_col_end = ctx.c.col;
- // for a comment following a CT_PP_IGNORE chunk, the end column is pulled back by the number of stripped characters
- if ( ( chunk.type == CT_COMMENT_MULTI // Issue #1966
- || chunk.type == CT_COMMENT
- || chunk.type == CT_COMMENT_CPP)
- && (pc != nullptr)
- && chunk_is_token(pc, CT_PP_IGNORE))
- {
- chunk.orig_col_end -= num_stripped;
- }
- // Add the chunk to the list
- rprev = pc;
-
- if (rprev != nullptr)
- {
- chunk_flags_set(pc, rprev->flags & PCF_COPY_FLAGS); // NOTE(review): pc == rprev at this point, so this re-applies the previous chunk's own flags to itself - confirm whether 'chunk' was meant
-
- // a newline can't be in a preprocessor
- if (chunk_is_token(pc, CT_NEWLINE))
- {
- chunk_flags_clr(pc, PCF_IN_PREPROC);
- }
- }
-
- if (ref != nullptr)
- {
- chunk.flags |= PCF_INSERTED;
- }
- else
- {
- chunk.flags &= ~PCF_INSERTED;
- }
- pc = chunk_add_before(&chunk, ref); // from here on pc refers to the newly added chunk
-
- // A newline marks the end of a preprocessor
- if (chunk_is_token(pc, CT_NEWLINE)) // || chunk_is_token(pc, CT_COMMENT_MULTI))
- {
- cpd.in_preproc = CT_NONE;
- cpd.preproc_ncnl_count = 0;
- }
-
- // Disable indentation when #asm directive found
- if (chunk_is_token(pc, CT_PP_ASM))
- {
- LOG_FMT(LBCTRL, "Found a directive %s on line %zu\n", "#asm", pc->orig_line);
- cpd.unc_off = true;
- }
-
- // Special handling for preprocessor stuff
- if (cpd.in_preproc != CT_NONE)
- {
- chunk_flags_set(pc, PCF_IN_PREPROC);
-
- // Count words after the preprocessor
- if ( !chunk_is_comment(pc)
- && !chunk_is_newline(pc))
- {
- cpd.preproc_ncnl_count++;
- }
-
- // Disable indentation if a #pragma asm directive is found
- if (cpd.in_preproc == CT_PP_PRAGMA)
- {
- if (memcmp(pc->text(), "asm", 3) == 0) // NOTE(review): compares only the first three bytes, so any text starting with "asm" matches - confirm
- {
- LOG_FMT(LBCTRL, "Found a pragma %s on line %zu\n", "asm", pc->orig_line);
- cpd.unc_off = true;
- }
- }
-
- // Figure out the type of preprocessor for #include parsing
- if (cpd.in_preproc == CT_PREPROC)
- {
- if ( pc->type < CT_PP_DEFINE
- || pc->type > CT_PP_OTHER)
- {
- set_chunk_type(pc, CT_PP_OTHER);
- }
- cpd.in_preproc = pc->type;
- }
- else if (cpd.in_preproc == CT_PP_IGNORE)
- {
- // ASSERT(options::pp_ignore_define_body());
- if ( !chunk_is_token(pc, CT_NL_CONT)
- && !chunk_is_token(pc, CT_COMMENT_CPP)
- && !chunk_is_token(pc, CT_COMMENT)
- && !chunk_is_token(pc, CT_COMMENT_MULTI)) // Issue #1966
- {
- set_chunk_type(pc, CT_PP_IGNORE);
- }
- }
- else if ( cpd.in_preproc == CT_PP_DEFINE
- && chunk_is_token(pc, CT_PAREN_CLOSE)
- && options::pp_ignore_define_body())
- {
- log_rule_B("pp_ignore_define_body");
- // When we have a PAREN_CLOSE in a PP_DEFINE we should be terminating a MACRO_FUNC
- // arguments list. Therefore we can enter the PP_IGNORE state and ignore next chunks.
- cpd.in_preproc = CT_PP_IGNORE;
- }
- }
- else
- {
- // Check for a preprocessor start
- if ( chunk_is_token(pc, CT_POUND)
- && ( rprev == nullptr
- || chunk_is_token(rprev, CT_NEWLINE)))
- {
- set_chunk_type(pc, CT_PREPROC);
- chunk_flags_set(pc, PCF_IN_PREPROC);
- cpd.in_preproc = CT_PREPROC;
- }
- }
- // trace the chunk that was just added
- if (chunk_is_token(pc, CT_NEWLINE))
- {
- LOG_FMT(LGUY, "%s(%d): orig_line is %zu, orig_col is %zu, <Newline>, nl is %zu\n",
- __func__, __LINE__, pc->orig_line, pc->orig_col, pc->nl_count);
- }
- else if (chunk_is_token(pc, CT_VBRACE_OPEN))
- {
- LOG_FMT(LGUY, "%s(%d): orig_line is %zu, orig_col is %zu, type is %s, orig_col_end is %zu\n",
- __func__, __LINE__, pc->orig_line, pc->orig_col, get_token_name(pc->type), pc->orig_col_end);
- }
- else
- {
- char copy[1000];
- LOG_FMT(LGUY, "%s(%d): orig_line is %zu, orig_col is %zu, text() '%s', type is %s, orig_col_end is %zu\n",
- __func__, __LINE__, pc->orig_line, pc->orig_col, pc->elided_text(copy), get_token_name(pc->type), pc->orig_col_end);
- }
- }
- // Set the cpd.newline string for this file
- log_rule_B("newlines");
- // with LE_AUTO the most frequent line ending seen while tokenizing wins; ties go to LF, then CRLF
- if ( options::newlines() == LE_LF
- || ( options::newlines() == LE_AUTO
- && (LE_COUNT(LF) >= LE_COUNT(CRLF))
- && (LE_COUNT(LF) >= LE_COUNT(CR))))
- {
- // LF line ends
- cpd.newline = "\n";
- LOG_FMT(LLINEENDS, "Using LF line endings\n");
- }
- else if ( options::newlines() == LE_CRLF
- || ( options::newlines() == LE_AUTO
- && (LE_COUNT(CRLF) >= LE_COUNT(LF))
- && (LE_COUNT(CRLF) >= LE_COUNT(CR))))
- {
- // CRLF line ends
- cpd.newline = "\r\n";
- LOG_FMT(LLINEENDS, "Using CRLF line endings\r\n");
- }
- else
- {
- // CR line ends
- cpd.newline = "\r";
- LOG_FMT(LLINEENDS, "Using CR line endings\n");
- }
-} // tokenize