diff options
Diffstat (limited to 'debian/uncrustify-trinity/uncrustify-trinity-0.74.0/src/tokenize.cpp')
-rw-r--r-- | debian/uncrustify-trinity/uncrustify-trinity-0.74.0/src/tokenize.cpp | 2863 |
1 files changed, 0 insertions, 2863 deletions
diff --git a/debian/uncrustify-trinity/uncrustify-trinity-0.74.0/src/tokenize.cpp b/debian/uncrustify-trinity/uncrustify-trinity-0.74.0/src/tokenize.cpp deleted file mode 100644 index f412b6f1..00000000 --- a/debian/uncrustify-trinity/uncrustify-trinity-0.74.0/src/tokenize.cpp +++ /dev/null @@ -1,2863 +0,0 @@ -/** - * @file tokenize.cpp - * This file breaks up the text stream into tokens or chunks. - * - * Each routine needs to set pc.len and pc.type. - * - * @author Ben Gardner - * @license GPL v2+ - */ - -#include "tokenize.h" - -#include "keywords.h" -#include "prototypes.h" -#include "punctuators.h" -#include "unc_ctype.h" - -#include <regex> -#include <stack> - - -#define LE_COUNT(x) cpd.le_counts[static_cast<size_t>(LE_ ## x)] - -constexpr static auto LCURRENT = LTOK; - -using namespace std; -using namespace uncrustify; - - -struct tok_info -{ - tok_info() - : last_ch(0) - , idx(0) - , row(1) - , col(1) - { - } - - size_t last_ch; - size_t idx; - size_t row; - size_t col; -}; - - -struct tok_ctx -{ - tok_ctx(const deque<int> &d) - : data(d) - { - } - - - //! save before trying to parse something that may fail - void save() - { - save(s); - } - - - void save(tok_info &info) - { - info = c; - } - - - //! restore previous saved state - void restore() - { - restore(s); - } - - - void restore(const tok_info &info) - { - c = info; - } - - - bool more() - { - return(c.idx < data.size()); - } - - - size_t peek() - { - return(more() ? data[c.idx] : 0); - } - - - size_t peek(size_t idx) - { - idx += c.idx; - return((idx < data.size()) ? 
data[idx] : 0); - } - - - size_t get() - { - if (more()) - { - size_t ch = data[c.idx++]; - - switch (ch) - { - case '\t': - log_rule_B("input_tab_size"); - c.col = calc_next_tab_column(c.col, options::input_tab_size()); - break; - - case '\n': - - if (c.last_ch != '\r') - { - c.row++; - c.col = 1; - } - break; - - case '\r': - c.row++; - c.col = 1; - break; - - default: - c.col++; - break; - } - c.last_ch = ch; - return(ch); - } - return(0); - } - - - bool expect(size_t ch) - { - if (peek() == ch) - { - get(); - return(true); - } - return(false); - } - - - const deque<int> &data; - tok_info c; //! current - tok_info s; //! saved -}; - - -/** - * Count the number of characters in a quoted string. - * The next bit of text starts with a quote char " or ' or <. - * Count the number of characters until the matching character. - * - * @param pc The structure to update, str is an input. - * - * @return Whether a string was parsed - */ -static bool parse_string(tok_ctx &ctx, chunk_t &pc, size_t quote_idx, bool allow_escape); - - -/** - * Literal string, ends with single " - * Two "" don't end the string. - * - * @param pc The structure to update, str is an input. - * - * @return Whether a string was parsed - */ -static bool parse_cs_string(tok_ctx &ctx, chunk_t &pc); - - -/** - * VALA verbatim string, ends with three quotes (""") - * - * @param pc The structure to update, str is an input. - */ -static void parse_verbatim_string(tok_ctx &ctx, chunk_t &pc); - - -static bool tag_compare(const deque<int> &d, size_t a_idx, size_t b_idx, size_t len); - - -/** - * Parses a C++0x 'R' string. R"( xxx )" R"tag( )tag" u8R"(x)" uR"(x)" - * Newlines may be in the string. - * - * @param pc structure to update, str is an input. - */ -static bool parse_cr_string(tok_ctx &ctx, chunk_t &pc, size_t q_idx); - - -/** - * Count the number of whitespace characters. - * - * @param pc The structure to update, str is an input. 
- * - * @return Whether whitespace was parsed - */ -static bool parse_whitespace(tok_ctx &ctx, chunk_t &pc); - - -/** - * Called when we hit a backslash. - * If there is nothing but whitespace until the newline, then this is a - * backslash newline - * - * @param pc structure to update, str is an input - */ -static bool parse_bs_newline(tok_ctx &ctx, chunk_t &pc); - - -/** - * Parses any number of tab or space chars followed by a newline. - * Does not change pc.len if a newline isn't found. - * This is not the same as parse_whitespace() because it only consumes until - * a single newline is encountered. - */ -static bool parse_newline(tok_ctx &ctx); - - -/** - * PAWN #define is different than C/C++. - * #define PATTERN REPLACEMENT_TEXT - * The PATTERN may not contain a space or '[' or ']'. - * A generic whitespace check should be good enough. - * Do not change the pattern. - * - * @param pc structure to update, str is an input - */ -static void parse_pawn_pattern(tok_ctx &ctx, chunk_t &pc, c_token_t tt); - - -static bool parse_ignored(tok_ctx &ctx, chunk_t &pc); - - -/** - * Skips the next bit of whatever and returns the type of block. - * - * pc.str is the input text. - * pc.len in the output length. - * pc.type is the output type - * pc.column is output column - * - * @param pc The structure to update, str is an input. - * @param prev_pc The previous structure - * - * @return true/false - whether anything was parsed - */ -static bool parse_next(tok_ctx &ctx, chunk_t &pc, const chunk_t *prev_pc); - - -/** - * Parses all legal D string constants. 
- * - * Quoted strings: - * r"Wysiwyg" # WYSIWYG string - * x"hexstring" # Hexadecimal array - * `Wysiwyg` # WYSIWYG string - * 'char' # single character - * "reg_string" # regular string - * - * Non-quoted strings: - * \x12 # 1-byte hex constant - * \u1234 # 2-byte hex constant - * \U12345678 # 4-byte hex constant - * \123 # octal constant - * \& # named entity - * \n # single character - * - * @param pc The structure to update, str is an input. - * - * @return Whether a string was parsed - */ -static bool d_parse_string(tok_ctx &ctx, chunk_t &pc); - - -/** - * Figure of the length of the comment at text. - * The next bit of text starts with a '/', so it might be a comment. - * There are three types of comments: - * - C comments that start with '/ *' and end with '* /' - * - C++ comments that start with // - * - D nestable comments '/+' '+/' - * - * @param pc The structure to update, str is an input. - * - * @return Whether a comment was parsed - */ -static bool parse_comment(tok_ctx &ctx, chunk_t &pc); - - -/** - * Figure of the length of the code placeholder at text, if present. - * This is only for Xcode which sometimes inserts temporary code placeholder chunks, which in plaintext <#look like this#>. - * - * @param pc The structure to update, str is an input. - * - * @return Whether a placeholder was parsed. - */ -static bool parse_code_placeholder(tok_ctx &ctx, chunk_t &pc); - - -/** - * Parse any attached suffix, which may be a user-defined literal suffix. - * If for a string, explicitly exclude common format and scan specifiers, ie, - * PRIx32 and SCNx64. - */ -static void parse_suffix(tok_ctx &ctx, chunk_t &pc, bool forstring); - - -//! check if a symbol holds a boolean value -static bool is_bin(int ch); -static bool is_bin_(int ch); - - -//! check if a symbol holds a octal value -static bool is_oct(int ch); -static bool is_oct_(int ch); - - -//! check if a symbol holds a decimal value; -static bool is_dec(int ch); -static bool is_dec_(int ch); - - -//! 
check if a symbol holds a hexadecimal value -static bool is_hex(int ch); -static bool is_hex_(int ch); - - -/** - * Count the number of characters in the number. - * The next bit of text starts with a number (0-9 or '.'), so it is a number. - * Count the number of characters in the number. - * - * This should cover all number formats for all languages. - * Note that this is not a strict parser. It will happily parse numbers in - * an invalid format. - * - * For example, only D allows underscores in the numbers, but they are - * allowed in all formats. - * - * @param[in,out] pc The structure to update, str is an input. - * - * @return Whether a number was parsed - */ -static bool parse_number(tok_ctx &ctx, chunk_t &pc); - - -static bool d_parse_string(tok_ctx &ctx, chunk_t &pc) -{ - size_t ch = ctx.peek(); - - if ( ch == '"' - || ch == '\'') - { - return(parse_string(ctx, pc, 0, true)); - } - - if (ch == '`') - { - return(parse_string(ctx, pc, 0, false)); - } - - if ( ( ch == 'r' - || ch == 'x') - && ctx.peek(1) == '"') - { - return(parse_string(ctx, pc, 1, false)); - } - - if (ch != '\\') - { - return(false); - } - ctx.save(); - int cnt; - - pc.str.clear(); - - while (ctx.peek() == '\\') - { - pc.str.append(ctx.get()); - - // Check for end of file - switch (ctx.peek()) - { - case 'x': // \x HexDigit HexDigit - cnt = 3; - - while (cnt--) - { - pc.str.append(ctx.get()); - } - break; - - case 'u': // \u HexDigit (x4) - cnt = 5; - - while (cnt--) - { - pc.str.append(ctx.get()); - } - break; - - case 'U': // \U HexDigit (x8) - cnt = 9; - - while (cnt--) - { - pc.str.append(ctx.get()); - } - break; - - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - // handle up to 3 octal digits - pc.str.append(ctx.get()); - ch = ctx.peek(); - - if ( (ch >= '0') - && (ch <= '7')) - { - pc.str.append(ctx.get()); - ch = ctx.peek(); - - if ( (ch >= '0') - && (ch <= '7')) - { - pc.str.append(ctx.get()); - } - } - break; - - case '&': - // \& 
NamedCharacterEntity ; - pc.str.append(ctx.get()); - - while (unc_isalpha(ctx.peek())) - { - pc.str.append(ctx.get()); - } - - if (ctx.peek() == ';') - { - pc.str.append(ctx.get()); - } - break; - - default: - // Everything else is a single character - pc.str.append(ctx.get()); - break; - } // switch - } - - if (pc.str.size() < 1) - { - ctx.restore(); - return(false); - } - set_chunk_type(&pc, CT_STRING); - return(true); -} // d_parse_string - - -#if 0 - - -//! A string-in-string search. Like strstr() with a haystack length. -static const char *str_search(const char *needle, const char *haystack, int haystack_len) -{ - int needle_len = strlen(needle); - - while (haystack_len-- >= needle_len) - { - if (memcmp(needle, haystack, needle_len) == 0) - { - return(haystack); - } - haystack++; - } - return(NULL); -} -#endif - - -static bool parse_comment(tok_ctx &ctx, chunk_t &pc) -{ - bool is_d = language_is_set(LANG_D); - bool is_cs = language_is_set(LANG_CS); - size_t d_level = 0; - - // does this start with '/ /' or '/ *' or '/ +' (d) - if ( (ctx.peek() != '/') - || ( (ctx.peek(1) != '*') - && (ctx.peek(1) != '/') - && ( (ctx.peek(1) != '+') - || !is_d))) - { - return(false); - } - ctx.save(); - - // account for opening two chars - pc.str = ctx.get(); // opening '/' - size_t ch = ctx.get(); - - pc.str.append(ch); // second char - - if (ch == '/') - { - set_chunk_type(&pc, CT_COMMENT_CPP); - - while (true) - { - int bs_cnt = 0; - - while (ctx.more()) - { - ch = ctx.peek(); - - if ( (ch == '\r') - || (ch == '\n')) - { - break; - } - - if ( (ch == '\\') - && !is_cs) // backslashes aren't special in comments in C# - { - bs_cnt++; - } - else - { - bs_cnt = 0; - } - pc.str.append(ctx.get()); - } - - /* - * If we hit an odd number of backslashes right before the newline, - * then we keep going. 
- */ - if ( ((bs_cnt & 1) == 0) - || !ctx.more()) - { - break; - } - - if (ctx.peek() == '\r') - { - pc.str.append(ctx.get()); - } - - if (ctx.peek() == '\n') - { - pc.str.append(ctx.get()); - } - pc.nl_count++; - cpd.did_newline = true; - } - } - else if (!ctx.more()) - { - // unexpected end of file - ctx.restore(); - return(false); - } - else if (ch == '+') - { - set_chunk_type(&pc, CT_COMMENT); - d_level++; - - while ( d_level > 0 - && ctx.more()) - { - if ( (ctx.peek() == '+') - && (ctx.peek(1) == '/')) - { - pc.str.append(ctx.get()); // store the '+' - pc.str.append(ctx.get()); // store the '/' - d_level--; - continue; - } - - if ( (ctx.peek() == '/') - && (ctx.peek(1) == '+')) - { - pc.str.append(ctx.get()); // store the '/' - pc.str.append(ctx.get()); // store the '+' - d_level++; - continue; - } - ch = ctx.get(); - pc.str.append(ch); - - if ( (ch == '\n') - || (ch == '\r')) - { - set_chunk_type(&pc, CT_COMMENT_MULTI); - pc.nl_count++; - - if (ch == '\r') - { - if (ctx.peek() == '\n') - { - ++LE_COUNT(CRLF); - pc.str.append(ctx.get()); // store the '\n' - } - else - { - ++LE_COUNT(CR); - } - } - else - { - ++LE_COUNT(LF); - } - } - } - } - else // must be '/ *' - { - set_chunk_type(&pc, CT_COMMENT); - - while (ctx.more()) - { - if ( (ctx.peek() == '*') - && (ctx.peek(1) == '/')) - { - pc.str.append(ctx.get()); // store the '*' - pc.str.append(ctx.get()); // store the '/' - - tok_info ss; - ctx.save(ss); - size_t oldsize = pc.str.size(); - - // If there is another C comment right after this one, combine them - while ( (ctx.peek() == ' ') - || (ctx.peek() == '\t')) - { - pc.str.append(ctx.get()); - } - - if ( (ctx.peek() != '/') - || (ctx.peek(1) != '*')) - { - // undo the attempt to join - ctx.restore(ss); - pc.str.resize(oldsize); - break; - } - } - ch = ctx.get(); - pc.str.append(ch); - - if ( (ch == '\n') - || (ch == '\r')) - { - set_chunk_type(&pc, CT_COMMENT_MULTI); - pc.nl_count++; - - if (ch == '\r') - { - if (ctx.peek() == '\n') - { - 
++LE_COUNT(CRLF); - pc.str.append(ctx.get()); // store the '\n' - } - else - { - ++LE_COUNT(CR); - } - } - else - { - ++LE_COUNT(LF); - } - } - } - } - - if (cpd.unc_off) - { - bool found_enable_marker = (find_enable_processing_comment_marker(pc.str) >= 0); - - if (found_enable_marker) - { - const auto &ontext = options::enable_processing_cmt(); - - LOG_FMT(LBCTRL, "%s(%d): Found '%s' on line %zu\n", - __func__, __LINE__, ontext.c_str(), pc.orig_line); - cpd.unc_off = false; - } - } - else - { - auto position_disable_processing_cmt = find_disable_processing_comment_marker(pc.str); - bool found_disable_marker = (position_disable_processing_cmt >= 0); - - if (found_disable_marker) - { - /** - * the user may wish to disable processing part of a multiline comment, - * in which case we'll handle at a late time. Check to see if processing - * is re-enabled elsewhere in this comment - */ - auto position_enable_processing_cmt = find_enable_processing_comment_marker(pc.str); - - if (position_enable_processing_cmt < position_disable_processing_cmt) - { - const auto &offtext = options::disable_processing_cmt(); - - LOG_FMT(LBCTRL, "%s(%d): Found '%s' on line %zu\n", - __func__, __LINE__, offtext.c_str(), pc.orig_line); - cpd.unc_off = true; - // Issue #842 - cpd.unc_off_used = true; - } - } - } - return(true); -} // parse_comment - - -static bool parse_code_placeholder(tok_ctx &ctx, chunk_t &pc) -{ - if ( (ctx.peek() != '<') - || (ctx.peek(1) != '#')) - { - return(false); - } - ctx.save(); - - // account for opening two chars '<#' - pc.str = ctx.get(); - pc.str.append(ctx.get()); - - // grab everything until '#>', fail if not found. 
//! true for the digit separators tolerated inside numbers: '_' and '\''
static bool is_digit_separator(int ch)
{
   switch (ch)
   {
   case '_':
   case '\'':
      return(true);

   default:
      return(false);
   }
}


//! check if a symbol is a binary digit (0 or 1)
static bool is_bin(int ch)
{
   switch (ch)
   {
   case '0':
   case '1':
      return(true);

   default:
      return(false);
   }
}


//! binary digit or digit separator
static bool is_bin_(int ch)
{
   if (is_digit_separator(ch))
   {
      return(true);
   }
   return(is_bin(ch));
}


//! check if a symbol is an octal digit (0-7)
static bool is_oct(int ch)
{
   if (ch < '0')
   {
      return(false);
   }
   return(ch <= '7');
}


//! octal digit or digit separator
static bool is_oct_(int ch)
{
   if (is_digit_separator(ch))
   {
      return(true);
   }
   return(is_oct(ch));
}


//! check if a symbol is a decimal digit (0-9)
static bool is_dec(int ch)
{
   if (ch < '0')
   {
      return(false);
   }
   return(ch <= '9');
}


//! decimal digit or digit separator (JAVA: "_", C++14: "'")
static bool is_dec_(int ch)
{
   if (is_digit_separator(ch))
   {
      return(true);
   }
   return(is_dec(ch));
}


//! check if a symbol is a hexadecimal digit (0-9, a-f, A-F)
static bool is_hex(int ch)
{
   if (is_dec(ch))
   {
      return(true);
   }
   const bool lower = (ch >= 'a') && (ch <= 'f');
   const bool upper = (ch >= 'A') && (ch <= 'F');

   return(lower || upper);
}


//! hexadecimal digit or digit separator
static bool is_hex_(int ch)
{
   if (is_digit_separator(ch))
   {
      return(true);
   }
   return(is_hex(ch));
}
&& ( (ctx.peek() != '.') - || !is_dec(ctx.peek(1)))) - { - return(false); - } - bool is_float = (ctx.peek() == '.'); - - if ( is_float - && (ctx.peek(1) == '.')) // make sure it isn't '..' - { - return(false); - } - /* - * Check for Hex, Octal, or Binary - * Note that only D, C++14 and Pawn support binary - * Fixes the issue # 1591 - * In c# the numbers starting with 0 are not treated as octal numbers. - */ - bool did_hex = false; - - if ( ctx.peek() == '0' - && !language_is_set(LANG_CS)) - { - size_t ch; - chunk_t pc_temp; - - pc.str.append(ctx.get()); // store the '0' - pc_temp.str.append('0'); - - // MS constant might have an "h" at the end. Look for it - ctx.save(); - - while ( ctx.more() - && CharTable::IsKw2(ctx.peek())) - { - ch = ctx.get(); - pc_temp.str.append(ch); - } - ch = pc_temp.str[pc_temp.len() - 1]; - ctx.restore(); - LOG_FMT(LGUY, "%s(%d): pc_temp:%s\n", __func__, __LINE__, pc_temp.text()); - - if (ch == 'h') // TODO can we combine this in analyze_character - { - // we have an MS hexadecimal number with "h" at the end - LOG_FMT(LGUY, "%s(%d): MS hexadecimal number\n", __func__, __LINE__); - did_hex = true; - - do - { - pc.str.append(ctx.get()); // store the rest - } while (is_hex_(ctx.peek())); - - pc.str.append(ctx.get()); // store the h - LOG_FMT(LGUY, "%s(%d): pc:%s\n", __func__, __LINE__, pc.text()); - } - else - { - switch (unc_toupper(ctx.peek())) - { - case 'X': // hex - did_hex = true; - - do - { - pc.str.append(ctx.get()); // store the 'x' and then the rest - } while (is_hex_(ctx.peek())); - - break; - - case 'B': // binary - - do - { - pc.str.append(ctx.get()); // store the 'b' and then the rest - } while (is_bin_(ctx.peek())); - - break; - - case '0': // octal or decimal - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - - do - { - pc.str.append(ctx.get()); - } while (is_oct_(ctx.peek())); - - break; - - default: - // either just 0 or 0.1 or 0UL, etc - break; - } // switch - } - 
} - else - { - // Regular int or float - while (is_dec_(ctx.peek())) - { - pc.str.append(ctx.get()); - } - } - - // Check if we stopped on a decimal point & make sure it isn't '..' - if ( (ctx.peek() == '.') - && (ctx.peek(1) != '.')) - { - // Issue #1265, 5.clamp() - tok_info ss; - ctx.save(ss); - - while ( ctx.more() - && CharTable::IsKw2(ctx.peek(1))) - { - // skip characters to check for paren open - ctx.get(); - } - - if (ctx.peek(1) == '(') - { - ctx.restore(ss); - set_chunk_type(&pc, CT_NUMBER); - return(true); - } - else - { - ctx.restore(ss); - } - pc.str.append(ctx.get()); - is_float = true; - - if (did_hex) - { - while (is_hex_(ctx.peek())) - { - pc.str.append(ctx.get()); - } - } - else - { - while (is_dec_(ctx.peek())) - { - pc.str.append(ctx.get()); - } - } - } - /* - * Check exponent - * Valid exponents per language (not that it matters): - * C/C++/D/Java: eEpP - * C#/Pawn: eE - */ - size_t tmp = unc_toupper(ctx.peek()); - - if ( (tmp == 'E') - || (tmp == 'P')) - { - is_float = true; - pc.str.append(ctx.get()); - - if ( (ctx.peek() == '+') - || (ctx.peek() == '-')) - { - pc.str.append(ctx.get()); - } - - while (is_dec_(ctx.peek())) - { - pc.str.append(ctx.get()); - } - } - - /* - * Check the suffixes - * Valid suffixes per language (not that it matters): - * Integer Float - * C/C++: uUlL64 lLfF - * C#: uUlL fFdDMm - * D: uUL ifFL - * Java: lL fFdD - * Pawn: (none) (none) - * - * Note that i, f, d, and m only appear in floats. - */ - while (1) - { - size_t tmp2 = unc_toupper(ctx.peek()); - - if ( (tmp2 == 'I') - || (tmp2 == 'F') - || (tmp2 == 'D') - || (tmp2 == 'M')) - { - is_float = true; - } - else if ( (tmp2 != 'L') - && (tmp2 != 'U')) - { - break; - } - pc.str.append(ctx.get()); - } - - // skip the Microsoft-specific '32' and '64' suffix - if ( ( (ctx.peek() == '3') - && (ctx.peek(1) == '2')) - || ( (ctx.peek() == '6') - && (ctx.peek(1) == '4'))) - { - pc.str.append(ctx.get()); - pc.str.append(ctx.get()); - } - set_chunk_type(&pc, is_float ? 
CT_NUMBER_FP : CT_NUMBER); - - /* - * If there is anything left, then we are probably dealing with garbage or - * some sick macro junk. Eat it. - */ - parse_suffix(ctx, pc); - - return(true); -} // parse_number - - -static bool parse_string(tok_ctx &ctx, chunk_t &pc, size_t quote_idx, bool allow_escape) -{ - log_rule_B("string_escape_char"); - const size_t escape_char = options::string_escape_char(); - - log_rule_B("string_escape_char2"); - const size_t escape_char2 = options::string_escape_char2(); - - log_rule_B("string_replace_tab_chars"); - const bool should_escape_tabs = ( allow_escape - && options::string_replace_tab_chars() - && language_is_set(LANG_ALLC)); - - pc.str.clear(); - - while (quote_idx-- > 0) - { - pc.str.append(ctx.get()); - } - set_chunk_type(&pc, CT_STRING); - const size_t termination_character = CharTable::Get(ctx.peek()) & 0xff; - - pc.str.append(ctx.get()); // store the " - - bool escaped = false; - - while (ctx.more()) - { - const size_t ch = ctx.get(); - - // convert char 9 (\t) to chars \t - if ( (ch == '\t') - && should_escape_tabs) - { - const size_t lastcol = ctx.c.col - 1; - ctx.c.col = lastcol + 2; - pc.str.append(escape_char); - pc.str.append('t'); - continue; - } - pc.str.append(ch); - - if (ch == '\n') - { - pc.nl_count++; - set_chunk_type(&pc, CT_STRING_MULTI); - } - else if ( ch == '\r' - && ctx.peek() != '\n') - { - pc.str.append(ctx.get()); - pc.nl_count++; - set_chunk_type(&pc, CT_STRING_MULTI); - } - - // if last char in prev loop was escaped the one in the current loop isn't - if (escaped) - { - escaped = false; - continue; - } - - // see if the current char is a escape char - if (allow_escape) - { - if (ch == escape_char) - { - escaped = (escape_char != 0); - continue; - } - - if ( ch == escape_char2 - && (ctx.peek() == termination_character)) - { - escaped = allow_escape; - continue; - } - } - - if (ch == termination_character) - { - break; - } - } - parse_suffix(ctx, pc, true); - return(true); -} // parse_string - -enum 
/**
 * Checks whether a C# string literal opens at the current position and, if
 * so, consumes its opener.
 *
 * An opener is an optional '$', followed by an optional '@', followed by a
 * double quote. When the quote is present, the prefix and the quote are
 * appended to pc.str and the chunk type is set to CT_STRING. When it is not,
 * nothing is consumed and pc is left untouched.
 *
 * @param ctx  The input cursor
 * @param pc   The structure to update
 *
 * @return CS_STRING_NONE if no string starts here, otherwise CS_STRING_STRING
 *         combined with CS_STRING_INTERPOLATED and/or CS_STRING_VERBATIM
 */
static cs_string_t parse_cs_string_start(tok_ctx &ctx, chunk_t &pc)
{
   cs_string_t flags      = CS_STRING_NONE;
   int         prefix_len = 0;

   if (ctx.peek(prefix_len) == '$')
   {
      flags |= CS_STRING_INTERPOLATED;
      ++prefix_len;
   }

   if (ctx.peek(prefix_len) == '@')
   {
      flags |= CS_STRING_VERBATIM;
      ++prefix_len;
   }

   // without the opening quote this is not a string, whatever prefix was seen
   if (ctx.peek(prefix_len) != '"')
   {
      return(CS_STRING_NONE);
   }
   flags |= CS_STRING_STRING;

   set_chunk_type(&pc, CT_STRING);

   // take the prefix characters plus the quote itself
   int remaining = prefix_len + 1;

   while (remaining-- > 0)
   {
      pc.str.append(ctx.get());
   }
   return(flags);
} // parse_cs_string_start
- - std::stack<CsStringParseState> parseState; // each entry is a nested string - - parseState.push(CsStringParseState(stringType)); - - log_rule_B("string_replace_tab_chars"); - bool should_escape_tabs = options::string_replace_tab_chars(); - - while (ctx.more()) - { - if (parseState.top().braceDepth > 0) - { - // all we can do when in an expr is look for expr close with }, or a new string opening. must do this first - // so we can peek and potentially consume chars for new string openings, before the ch=get() happens later, - // which is needed for newline processing. - - if (ctx.peek() == '}') - { - pc.str.append(ctx.get()); - - if (ctx.peek() == '}') - { - pc.str.append(ctx.get()); // in interpolated string, `}}` is escape'd `}` - } - else - { - --parseState.top().braceDepth; - } - continue; - } - stringType = parse_cs_string_start(ctx, pc); - - if (stringType) - { - parseState.push(CsStringParseState(stringType)); - continue; - } - } - int lastcol = ctx.c.col; - int ch = ctx.get(); - - pc.str.append(ch); - - if (ch == '\n') - { - set_chunk_type(&pc, CT_STRING_MULTI); - pc.nl_count++; - } - else if (ch == '\r') - { - set_chunk_type(&pc, CT_STRING_MULTI); - } - else if (parseState.top().braceDepth > 0) - { - // do nothing. if we're in a brace, we only want the newline handling, and skip the rest. - } - else if ( (ch == '\t') - && should_escape_tabs) - { - if (parseState.top().type & CS_STRING_VERBATIM) - { - if (!cpd.warned_unable_string_replace_tab_chars) - { - cpd.warned_unable_string_replace_tab_chars = true; - - log_rule_B("warn_level_tabs_found_in_verbatim_string_literals"); - log_sev_t warnlevel = (log_sev_t)options::warn_level_tabs_found_in_verbatim_string_literals(); - - /* - * a tab char can't be replaced with \\t because escapes don't - * work in here-strings. best we can do is warn. 
- */ - LOG_FMT(warnlevel, "%s(%d): %s: orig_line is %zu, orig_col is %zu, Detected non-replaceable tab char in literal string\n", - __func__, __LINE__, cpd.filename.c_str(), pc.orig_line, pc.orig_col); - LOG_FMT(warnlevel, "%s(%d): Warning is given if doing tab-to-\\t replacement and we have found one in a C# verbatim string literal.\n", - __func__, __LINE__); - - if (warnlevel < LWARN) - { - cpd.error_count++; - } - } - } - else - { - ctx.c.col = lastcol + 2; - pc.str.pop_back(); // remove \t - pc.str.append("\\t"); - - continue; - } - } - else if ( ch == '\\' - && !(parseState.top().type & CS_STRING_VERBATIM)) - { - // catch escaped quote in order to avoid ending string (but also must handle \\ to avoid accidental 'escape' seq of `\\"`) - if ( ctx.peek() == '"' - || ctx.peek() == '\\') - { - pc.str.append(ctx.get()); - } - } - else if (ch == '"') - { - if ( (parseState.top().type & CS_STRING_VERBATIM) - && (ctx.peek() == '"')) - { - // in verbatim string, `""` is escape'd `"` - pc.str.append(ctx.get()); - } - else - { - // end of string - parseState.pop(); - - if (parseState.empty()) - { - break; - } - } - } - else if (parseState.top().type & CS_STRING_INTERPOLATED) - { - if (ch == '{') - { - if (ctx.peek() == '{') - { - pc.str.append(ctx.get()); // in interpolated string, `{{` is escape'd `{` - } - else - { - ++parseState.top().braceDepth; - } - } - } - } - return(true); -} // parse_cs_string - - -static void parse_verbatim_string(tok_ctx &ctx, chunk_t &pc) -{ - set_chunk_type(&pc, CT_STRING); - - // consume the initial """ - pc.str = ctx.get(); - pc.str.append(ctx.get()); - pc.str.append(ctx.get()); - - // go until we hit a zero (end of file) or a """ - while (ctx.more()) - { - size_t ch = ctx.get(); - pc.str.append(ch); - - if ( (ch == '"') - && (ctx.peek() == '"') - && (ctx.peek(1) == '"')) - { - pc.str.append(ctx.get()); - pc.str.append(ctx.get()); - break; - } - - if ( (ch == '\n') - || (ch == '\r')) - { - set_chunk_type(&pc, CT_STRING_MULTI); - 
/**
 * Compares two runs of 'len' characters inside the same deque.
 *
 * The comparison walks both runs in step. (The previous implementation never
 * advanced the indices and therefore only ever compared the first character
 * of each run.) A run that would extend past the end of 'd' cannot match.
 *
 * @param d      The character data
 * @param a_idx  Start index of the first run
 * @param b_idx  Start index of the second run
 * @param len    Number of characters to compare
 *
 * @return true if both runs hold the same characters, or if they start at
 *         the same index
 */
static bool tag_compare(const std::deque<int> &d, size_t a_idx, size_t b_idx, size_t len)
{
   // a run trivially equals itself
   if (a_idx == b_idx)
   {
      return(true);
   }

   for (size_t i = 0; i < len; ++i)
   {
      const size_t a = a_idx + i;
      const size_t b = b_idx + i;

      if (  a >= d.size()
         || b >= d.size()
         || d[a] != d[b])
      {
         return(false);
      }
   }

   return(true);
}
- * @return Whether a word was parsed (always true) - */ -static bool parse_word(tok_ctx &ctx, chunk_t &pc, bool skipcheck) -{ - static unc_text intr_txt("@interface"); - - // The first character is already valid - pc.str.clear(); - pc.str.append(ctx.get()); - - while (ctx.more()) - { - size_t ch = ctx.peek(); - - if (CharTable::IsKw2(ch)) - { - pc.str.append(ctx.get()); - } - else if ( (ch == '\\') - && (unc_tolower(ctx.peek(1)) == 'u')) - { - pc.str.append(ctx.get()); - pc.str.append(ctx.get()); - skipcheck = true; - } - else - { - break; - } - - // HACK: Non-ASCII character are only allowed in identifiers - if (ch > 0x7f) - { - skipcheck = true; - } - } - set_chunk_type(&pc, CT_WORD); - - if (skipcheck) - { - return(true); - } - - // Detect pre-processor functions now - if ( cpd.in_preproc == CT_PP_DEFINE - && cpd.preproc_ncnl_count == 1) - { - if (ctx.peek() == '(') - { - set_chunk_type(&pc, CT_MACRO_FUNC); - } - else - { - set_chunk_type(&pc, CT_MACRO); - - log_rule_B("pp_ignore_define_body"); - - if (options::pp_ignore_define_body()) - { - /* - * We are setting the PP_IGNORE preproc state because the following - * chunks are part of the macro body and will have to be ignored. - */ - cpd.in_preproc = CT_PP_IGNORE; - } - } - } - else - { - // '@interface' is reserved, not an interface itself - if ( language_is_set(LANG_JAVA) - && pc.str.startswith("@") - && !pc.str.equals(intr_txt)) - { - set_chunk_type(&pc, CT_ANNOTATION); - } - else - { - // Turn it into a keyword now - // Issue #1460 will return "COMMENT_CPP" - set_chunk_type(&pc, find_keyword_type(pc.text(), pc.str.size())); - - /* Special pattern: if we're trying to redirect a preprocessor directive to PP_IGNORE, - * then ensure we're actually part of a preprocessor before doing the swap, or we'll - * end up with a function named 'define' as PP_IGNORE. This is necessary because with - * the config 'set' feature, there's no way to do a pair of tokens as a word - * substitution. 
/**
 * Looks ahead from the current position for a bracketed sequence that is
 * opened by a doubled '[' and closed by a doubled ']' (whitespace between
 * the two brackets of a pair is tolerated).
 *
 * Only ctx.peek() is used, so no input is consumed.
 *
 * @param ctx  The input cursor
 *
 * @return the look-ahead offset reached when the closing bracket pair brings
 *         the nesting count back to zero, or 0 if the scan is abandoned
 *         (presumably the length handed to
 *         extract_attribute_specifier_sequence() - confirm against the caller)
 */
static size_t parse_attribute_specifier_sequence(tok_ctx &ctx)
{
   size_t nested = 0;   // count of '[[' pairs currently open
   size_t offset = 0;   // look-ahead distance from the current position
   size_t parens = 0;   // count of '(' currently open
   auto   ch1    = ctx.peek(offset++);

   // peek() yields 0 past the end of the input, which ends the scan
   while (ch1)
   {
      // ch2 is the next non-whitespace character after ch1
      auto ch2 = ctx.peek(offset++);

      while (  ch2 == ' '
            || ch2 == '\n'
            || ch2 == '\r'
            || ch2 == '\t')
      {
         ch2 = ctx.peek(offset++);
      }

      // outside of any '[[', the only acceptable follower is a '['
      if (  nested == 0
         && ch2 != '[')
      {
         break;
      }

      if (ch1 == '(')
      {
         ++parens;
         ch1 = ch2;
         continue;
      }

      if (ch1 == ')')
      {
         // unbalanced ')': give up
         if (parens == 0)
         {
            break;
         }
         --parens;
         ch1 = ch2;
         continue;
      }

      // anything that is not a square bracket is skipped over
      if (  ch1 != '['
         && ch1 != ']')
      {
         ch1 = ch2;
         continue;
      }

      // a single (non-doubled) bracket is only tolerated inside parentheses
      if (ch2 != ch1)
      {
         if (parens == 0)
         {
            break;
         }
         ch1 = ch2;
         continue;
      }

      // here ch1 and ch2 form a doubled bracket: '[[' or ']]'
      if (ch1 == '[')
      {
         // a second '[[' while one is already open is only accepted inside parentheses
         if (  nested != 0
            && parens == 0)
         {
            break;
         }
         ++nested;
      }
      else if (--nested == 0)
      {
         // the outermost ']]' was found
         return(offset);
      }
      ch1 = ctx.peek(offset++);
   }
   return(0);
} // parse_attribute_specifier_sequence
extract_attribute_specifier_sequence(tok_ctx &ctx, chunk_t &pc, size_t length) -{ - pc.str.clear(); - - while (length--) - { - pc.str.append(ctx.get()); - } - set_chunk_type(&pc, CT_ATTRIBUTE); - return(true); -} // extract_attribute_specifier_sequence - - -static bool parse_whitespace(tok_ctx &ctx, chunk_t &pc) -{ - size_t nl_count = 0; - size_t ch = 0; - - // REVISIT: use a better whitespace detector? - while ( ctx.more() - && unc_isspace(ctx.peek())) - { - ch = ctx.get(); // throw away the whitespace char - - switch (ch) - { - case '\r': - - if (ctx.expect('\n')) - { - // CRLF ending - ++LE_COUNT(CRLF); - } - else - { - // CR ending - ++LE_COUNT(CR); - } - nl_count++; - pc.orig_prev_sp = 0; - break; - - case '\n': - // LF ending - ++LE_COUNT(LF); - nl_count++; - pc.orig_prev_sp = 0; - break; - - case '\t': - log_rule_B("input_tab_size"); - pc.orig_prev_sp += calc_next_tab_column(cpd.column, options::input_tab_size()) - cpd.column; - break; - - case ' ': - pc.orig_prev_sp++; - break; - - default: - break; - } - } - - if (ch != 0) - { - pc.str.clear(); - set_chunk_type(&pc, nl_count ? 
CT_NEWLINE : CT_WHITESPACE); - pc.nl_count = nl_count; - pc.after_tab = (ctx.c.last_ch == '\t'); - return(true); - } - return(false); -} // parse_whitespace - - -static bool parse_bs_newline(tok_ctx &ctx, chunk_t &pc) -{ - ctx.save(); - ctx.get(); // skip the '\' - - size_t ch; - - while ( ctx.more() - && unc_isspace(ch = ctx.peek())) - { - ctx.get(); - - if ( (ch == '\r') - || (ch == '\n')) - { - if (ch == '\r') - { - ctx.expect('\n'); - } - set_chunk_type(&pc, CT_NL_CONT); - pc.str = "\\"; - pc.nl_count = 1; - return(true); - } - } - ctx.restore(); - return(false); -} - - -static bool parse_newline(tok_ctx &ctx) -{ - ctx.save(); - - // Eat whitespace - while ( (ctx.peek() == ' ') - || (ctx.peek() == '\t')) - { - ctx.get(); - } - - if ( (ctx.peek() == '\r') - || (ctx.peek() == '\n')) - { - if (!ctx.expect('\n')) - { - ctx.get(); - ctx.expect('\n'); - } - return(true); - } - ctx.restore(); - return(false); -} - - -static void parse_pawn_pattern(tok_ctx &ctx, chunk_t &pc, c_token_t tt) -{ - pc.str.clear(); - set_chunk_type(&pc, tt); - - while (!unc_isspace(ctx.peek())) - { - // end the pattern on an escaped newline - if (ctx.peek() == '\\') - { - size_t ch = ctx.peek(1); - - if ( (ch == '\n') - || (ch == '\r')) - { - break; - } - } - pc.str.append(ctx.get()); - } -} - - -static bool parse_off_newlines(tok_ctx &ctx, chunk_t &pc) -{ - size_t nl_count = 0; - - // Parse off newlines/blank lines - while (parse_newline(ctx)) - { - nl_count++; - } - - if (nl_count > 0) - { - pc.nl_count = nl_count; - set_chunk_type(&pc, CT_NEWLINE); - return(true); - } - return(false); -} - - -static bool parse_macro(tok_ctx &ctx, chunk_t &pc, const chunk_t *prev_pc) -{ - if (parse_off_newlines(ctx, pc)) - { - return(true); - } - - if (parse_comment(ctx, pc)) // allow CT_COMMENT_MULTI within macros - { - return(true); - } - ctx.save(); - pc.str.clear(); - - bool continued = ( chunk_is_token(prev_pc, CT_NL_CONT) - || chunk_is_token(prev_pc, CT_COMMENT_MULTI)); - - while (ctx.more()) - { - 
size_t pk = ctx.peek(), pk1 = ctx.peek(1); - bool nl = ( pk == '\n' - || pk == '\r'); - bool nl_cont = ( pk == '\\' - && ( pk1 == '\n' - || pk1 == '\r')); - - if ( ( nl_cont - || ( continued - && nl)) - && pc.str.size() > 0) - { - set_chunk_type(&pc, CT_IGNORED); - return(true); - } - else if (nl) - { - break; - } - pc.str.append(ctx.get()); - } - pc.str.clear(); - ctx.restore(); - return(false); -} // parse_macro - - -static bool parse_ignored(tok_ctx &ctx, chunk_t &pc) -{ - if (parse_off_newlines(ctx, pc)) - { - return(true); - } - // See if the UO_enable_processing_cmt or #pragma endasm / #endasm text is on this line - ctx.save(); - pc.str.clear(); - - while ( ctx.more() - && (ctx.peek() != '\r') - && (ctx.peek() != '\n')) - { - pc.str.append(ctx.get()); - } - - if (pc.str.size() == 0) - { - // end of file? - return(false); - } - - // HACK: turn on if we find '#endasm' or '#pragma' and 'endasm' separated by blanks - if ( ( ( (pc.str.find("#pragma ") >= 0) - || (pc.str.find("#pragma ") >= 0)) - && ( (pc.str.find(" endasm") >= 0) - || (pc.str.find(" endasm") >= 0))) - || (pc.str.find("#endasm") >= 0)) - { - cpd.unc_off = false; - ctx.restore(); - pc.str.clear(); - return(false); - } - // Note that we aren't actually making sure this is in a comment, yet - log_rule_B("enable_processing_cmt"); - const auto &ontext = options::enable_processing_cmt(); - - if (!ontext.empty()) - { - bool found_enable_pattern = false; - - if ( ontext != UNCRUSTIFY_ON_TEXT - && options::processing_cmt_as_regex()) - { - std::wstring pc_wstring(pc.str.get().cbegin(), - pc.str.get().cend()); - std::wregex criteria(std::wstring(ontext.cbegin(), - ontext.cend())); - - found_enable_pattern = std::regex_search(pc_wstring.cbegin(), - pc_wstring.cend(), - criteria); - } - else - { - found_enable_pattern = (pc.str.find(ontext.c_str()) >= 0); - } - - if (!found_enable_pattern) - { - set_chunk_type(&pc, CT_IGNORED); - return(true); - } - } - ctx.restore(); - - // parse off whitespace leading to the 
comment - if (parse_whitespace(ctx, pc)) - { - set_chunk_type(&pc, CT_IGNORED); - return(true); - } - - // Look for the ending comment and let it pass - if ( parse_comment(ctx, pc) - && !cpd.unc_off) - { - return(true); - } - // Reset the chunk & scan to until a newline - pc.str.clear(); - - while ( ctx.more() - && (ctx.peek() != '\r') - && (ctx.peek() != '\n')) - { - pc.str.append(ctx.get()); - } - - if (pc.str.size() > 0) - { - set_chunk_type(&pc, CT_IGNORED); - return(true); - } - return(false); -} // parse_ignored - - -static bool parse_next(tok_ctx &ctx, chunk_t &pc, const chunk_t *prev_pc) -{ - if (!ctx.more()) - { - return(false); - } - // Save off the current column - set_chunk_type(&pc, CT_NONE); - pc.orig_line = ctx.c.row; - pc.column = ctx.c.col; - pc.orig_col = ctx.c.col; - pc.nl_count = 0; - pc.flags = PCF_NONE; - - // If it is turned off, we put everything except newlines into CT_UNKNOWN - if (cpd.unc_off) - { - if (parse_ignored(ctx, pc)) - { - return(true); - } - } - log_rule_B("disable_processing_nl_cont"); - - // Parse macro blocks - if (options::disable_processing_nl_cont()) - { - if (parse_macro(ctx, pc, prev_pc)) - { - return(true); - } - } - - // Parse whitespace - if (parse_whitespace(ctx, pc)) - { - return(true); - } - - // Handle unknown/unhandled preprocessors - if ( cpd.in_preproc > CT_PP_BODYCHUNK - && cpd.in_preproc <= CT_PP_OTHER) - { - pc.str.clear(); - tok_info ss; - ctx.save(ss); - // Chunk to a newline or comment - set_chunk_type(&pc, CT_PREPROC_BODY); - size_t last = 0; - - while (ctx.more()) - { - size_t ch = ctx.peek(); - - // Fix for issue #1752 - // Ignoring extra spaces after ' \ ' for preproc body continuations - if ( last == '\\' - && ch == ' ') - { - ctx.get(); - continue; - } - - if ( (ch == '\n') - || (ch == '\r')) - { - // Back off if this is an escaped newline - if (last == '\\') - { - ctx.restore(ss); - pc.str.pop_back(); - } - break; - } - - // Quit on a C or C++ comment start Issue #1966 - if ( (ch == '/') - && ( 
(ctx.peek(1) == '/') - || (ctx.peek(1) == '*'))) - { - break; - } - last = ch; - ctx.save(ss); - - pc.str.append(ctx.get()); - } - - if (pc.str.size() > 0) - { - return(true); - } - } - - // Detect backslash-newline - if ( (ctx.peek() == '\\') - && parse_bs_newline(ctx, pc)) - { - return(true); - } - - // Parse comments - if (parse_comment(ctx, pc)) - { - return(true); - } - - // Parse code placeholders - if (parse_code_placeholder(ctx, pc)) - { - return(true); - } - - if (language_is_set(LANG_CS)) - { - if (parse_cs_string(ctx, pc)) - { - return(true); - } - - // check for non-keyword identifiers such as @if @switch, etc - if ( (ctx.peek() == '@') - && CharTable::IsKw1(ctx.peek(1))) - { - parse_word(ctx, pc, true); - return(true); - } - } - - // handle VALA """ strings """ - if ( language_is_set(LANG_VALA) - && (ctx.peek() == '"') - && (ctx.peek(1) == '"') - && (ctx.peek(2) == '"')) - { - parse_verbatim_string(ctx, pc); - return(true); - } - /* - * handle C++(11) string/char literal prefixes u8|u|U|L|R including all - * possible combinations and optional R delimiters: R"delim(x)delim" - */ - auto ch = ctx.peek(); - - if ( language_is_set(LANG_C | LANG_CPP) - && ( ch == 'u' - || ch == 'U' - || ch == 'R' - || ch == 'L')) - { - auto idx = size_t{}; - auto is_real = false; - - if ( ch == 'u' - && ctx.peek(1) == '8') - { - idx = 2; - } - else if ( unc_tolower(ch) == 'u' - || ch == 'L') - { - idx++; - } - - if ( language_is_set(LANG_C | LANG_CPP) - && ctx.peek(idx) == 'R') - { - idx++; - is_real = true; - } - const auto quote = ctx.peek(idx); - - if (is_real) - { - if ( quote == '"' - && parse_cr_string(ctx, pc, idx)) - { - return(true); - } - } - else if ( ( quote == '"' - || quote == '\'') - && parse_string(ctx, pc, idx, true)) - { - return(true); - } - } - - // PAWN specific stuff - if (language_is_set(LANG_PAWN)) - { - if ( cpd.preproc_ncnl_count == 1 - && ( cpd.in_preproc == CT_PP_DEFINE - || cpd.in_preproc == CT_PP_EMIT)) - { - parse_pawn_pattern(ctx, pc, 
CT_MACRO); - return(true); - } - - // Check for PAWN strings: \"hi" or !"hi" or !\"hi" or \!"hi" - if ( (ctx.peek() == '\\') - || (ctx.peek() == '!')) - { - if (ctx.peek(1) == '"') - { - parse_string(ctx, pc, 1, (ctx.peek() == '!')); - return(true); - } - - if ( ( (ctx.peek(1) == '\\') - || (ctx.peek(1) == '!')) - && (ctx.peek(2) == '"')) - { - parse_string(ctx, pc, 2, false); - return(true); - } - } - - // handle PAWN preprocessor args %0 .. %9 - if ( cpd.in_preproc == CT_PP_DEFINE - && (ctx.peek() == '%') - && unc_isdigit(ctx.peek(1))) - { - pc.str.clear(); - pc.str.append(ctx.get()); - pc.str.append(ctx.get()); - set_chunk_type(&pc, CT_WORD); - return(true); - } - } - // Parse strings and character constants - -//parse_word(ctx, pc_temp, true); -//ctx.restore(ctx.c); - if (parse_number(ctx, pc)) - { - return(true); - } - - if (language_is_set(LANG_D)) - { - // D specific stuff - if (d_parse_string(ctx, pc)) - { - return(true); - } - } - else - { - // Not D stuff - - // Check for L'a', L"abc", 'a', "abc", <abc> strings - ch = ctx.peek(); - size_t ch1 = ctx.peek(1); - - if ( ( ( (ch == 'L') - || (ch == 'S')) - && ( (ch1 == '"') - || (ch1 == '\''))) - || (ch == '"') - || (ch == '\'') - || ( (ch == '<') - && cpd.in_preproc == CT_PP_INCLUDE)) - { - parse_string(ctx, pc, unc_isalpha(ch) ? 
1 : 0, true); - set_chunk_parent(&pc, CT_PP_INCLUDE); - return(true); - } - - if ( (ch == '<') - && cpd.in_preproc == CT_PP_DEFINE) - { - if (chunk_is_token(chunk_get_tail(), CT_MACRO)) - { - // We have "#define XXX <", assume '<' starts an include string - parse_string(ctx, pc, 0, false); - return(true); - } - } - - /* Inside clang's __has_include() could be "path/to/file.h" or system-style <path/to/file.h> */ - if ( (ch == '(') - && (chunk_get_tail() != nullptr) - && ( chunk_is_token(chunk_get_tail(), CT_CNG_HASINC) - || chunk_is_token(chunk_get_tail(), CT_CNG_HASINCN))) - { - parse_string(ctx, pc, 0, false); - return(true); - } - } - - // Check for Objective C literals and VALA identifiers ('@1', '@if') - if ( language_is_set(LANG_OC | LANG_VALA) - && (ctx.peek() == '@')) - { - size_t nc = ctx.peek(1); - - if (nc == 'R') // Issue #2720 - { - if (ctx.peek(2) == '"') - { - if (parse_cr_string(ctx, pc, 2)) // Issue #3027 - { - return(true); - } - // parse string without escaping - parse_string(ctx, pc, 2, false); - return(true); - } - } - - if ( (nc == '"') - || (nc == '\'')) - { - // literal string - parse_string(ctx, pc, 1, true); - return(true); - } - - if ( (nc >= '0') - && (nc <= '9')) - { - // literal number - pc.str.append(ctx.get()); // store the '@' - parse_number(ctx, pc); - return(true); - } - } - - // Check for pawn/ObjectiveC/Java and normal identifiers - if ( CharTable::IsKw1(ctx.peek()) - || ( (ctx.peek() == '\\') - && (unc_tolower(ctx.peek(1)) == 'u')) - || ( (ctx.peek() == '@') - && CharTable::IsKw1(ctx.peek(1)))) - { - parse_word(ctx, pc, false); - return(true); - } - - // Check for C++11/14/17/20 attribute specifier sequences - if ( language_is_set(LANG_CPP) - && ctx.peek() == '[') - { - if ( !language_is_set(LANG_OC) - || !chunk_is_token(prev_pc, CT_OC_AT)) - { - if (auto length = parse_attribute_specifier_sequence(ctx)) - { - extract_attribute_specifier_sequence(ctx, pc, length); - return(true); - } - } - } - // see if we have a punctuator - 
char punc_txt[7]; - - punc_txt[0] = ctx.peek(); - punc_txt[1] = ctx.peek(1); - punc_txt[2] = ctx.peek(2); - punc_txt[3] = ctx.peek(3); - punc_txt[4] = ctx.peek(4); - punc_txt[5] = ctx.peek(5); - punc_txt[6] = '\0'; - const chunk_tag_t *punc; - - if ((punc = find_punctuator(punc_txt, cpd.lang_flags)) != nullptr) - { - int cnt = strlen(punc->tag); - - while (cnt--) - { - pc.str.append(ctx.get()); - } - set_chunk_type(&pc, punc->type); - pc.flags |= PCF_PUNCTUATOR; - return(true); - } - /* When parsing C/C++ files and running into some unknown token, - * check if matches Objective-C as a last resort, before - * considering it as garbage. - */ - int probe_lang_flags = 0; - - if (language_is_set(LANG_C | LANG_CPP)) - { - probe_lang_flags = cpd.lang_flags | LANG_OC; - } - - if (probe_lang_flags != 0) - { - if ((punc = find_punctuator(punc_txt, probe_lang_flags)) != NULL) - { - cpd.lang_flags = probe_lang_flags; - int cnt = strlen(punc->tag); - - while (cnt--) - { - pc.str.append(ctx.get()); - } - set_chunk_type(&pc, punc->type); - pc.flags |= PCF_PUNCTUATOR; - return(true); - } - } - // throw away this character - set_chunk_type(&pc, CT_UNKNOWN); - pc.str.append(ctx.get()); - - LOG_FMT(LWARN, "%s:%zu Garbage in col %d: %x\n", - cpd.filename.c_str(), pc.orig_line, (int)ctx.c.col, pc.str[0]); - cpd.error_count++; - return(true); -} // parse_next - - -int find_disable_processing_comment_marker(const unc_text &text, - std::size_t start_idx) -{ - log_rule_B("disable_processing_cmt"); - const auto &offtext = options::disable_processing_cmt(); - int idx = -1; - - if ( !offtext.empty() - && start_idx < text.size()) - { - if ( offtext != UNCRUSTIFY_OFF_TEXT - && options::processing_cmt_as_regex()) - { - std::wsmatch match; - std::wstring pc_wstring(text.get().cbegin() + start_idx, - text.get().cend()); - std::wregex criteria(std::wstring(offtext.cbegin(), - offtext.cend())); - - std::regex_search(pc_wstring.cbegin(), - pc_wstring.cend(), - match, - criteria); - - if 
(!match.empty()) - { - idx = int(match.position() + start_idx); - } - } - else - { - idx = text.find(offtext.c_str(), - start_idx); - - if (idx >= 0) - { - idx += int(offtext.size()); - } - } - - /** - * update the position to the start of the current line - */ - while ( idx > 0 - && text[idx - 1] != '\n') - { - --idx; - } - } - return(idx); -} // find_disable_processing_comment_marker - - -int find_enable_processing_comment_marker(const unc_text &text, - std::size_t start_idx) -{ - log_rule_B("enable_processing_cmt"); - const auto &ontext = options::enable_processing_cmt(); - int idx = -1; - - if ( !ontext.empty() - && start_idx < text.size()) - { - if ( ontext != UNCRUSTIFY_ON_TEXT - && options::processing_cmt_as_regex()) - { - std::wsmatch match; - std::wstring pc_wstring(text.get().cbegin() + start_idx, - text.get().cend()); - std::wregex criteria(std::wstring(ontext.cbegin(), - ontext.cend())); - - std::regex_search(pc_wstring.cbegin(), - pc_wstring.cend(), - match, - criteria); - - if (!match.empty()) - { - idx = int(start_idx + match.position() + match.size()); - } - } - else - { - idx = text.find(ontext.c_str(), - start_idx); - - if (idx >= 0) - { - idx += int(ontext.size()); - } - } - - /** - * update the position to the end of the current line - */ - if (idx >= 0) - { - while ( idx < int(text.size()) - && text[idx] != '\n') - { - ++idx; - } - } - } - return(idx); -} // find_enable_processing_comment_marker - - -void tokenize(const deque<int> &data, chunk_t *ref) -{ - tok_ctx ctx(data); - chunk_t chunk; - chunk_t *pc = nullptr; - chunk_t *rprev = nullptr; - bool last_was_tab = false; - size_t prev_sp = 0; - int num_stripped = 0; // Issue #1966 - - cpd.unc_stage = unc_stage_e::TOKENIZE; - - while (ctx.more()) - { - chunk.reset(); - chunk.pp_level = 0; - - if (!parse_next(ctx, chunk, pc)) - { - LOG_FMT(LERR, "%s:%zu Bailed before the end?\n", - cpd.filename.c_str(), ctx.c.row); - cpd.error_count++; - break; - } - - if ( language_is_set(LANG_JAVA) - && 
chunk.type == CT_MEMBER - && !memcmp(chunk.text(), "->", 2)) - { - chunk.type = CT_LAMBDA; - } - - // Don't create an entry for whitespace - if (chunk.type == CT_WHITESPACE) - { - last_was_tab = chunk.after_tab; - prev_sp = chunk.orig_prev_sp; - continue; - } - chunk.orig_prev_sp = prev_sp; - prev_sp = 0; - - if (chunk.type == CT_NEWLINE) - { - last_was_tab = chunk.after_tab; - chunk.after_tab = false; - chunk.str.clear(); - } - else if (chunk.type == CT_NL_CONT) - { - last_was_tab = chunk.after_tab; - chunk.after_tab = false; - chunk.str = "\\\n"; - } - else - { - chunk.after_tab = last_was_tab; - last_was_tab = false; - } - - if (chunk.type != CT_IGNORED) - { - // Issue #1338 - // Strip trailing whitespace (for CPP comments and PP blocks) - num_stripped = 0; // Issue #1966 - - while ( (chunk.str.size() > 0) - && ( (chunk.str[chunk.str.size() - 1] == ' ') - || (chunk.str[chunk.str.size() - 1] == '\t'))) - { - // If comment contains backslash '\' followed by whitespace chars, keep last one; - // this will prevent it from turning '\' into line continuation. 
- if ( (chunk.str.size() > 1) - && (chunk.str[chunk.str.size() - 2] == '\\')) - { - break; - } - chunk.str.pop_back(); - num_stripped++; // Issue #1966 - } - } - // Store off the end column - chunk.orig_col_end = ctx.c.col; - - if ( ( chunk.type == CT_COMMENT_MULTI // Issue #1966 - || chunk.type == CT_COMMENT - || chunk.type == CT_COMMENT_CPP) - && (pc != nullptr) - && chunk_is_token(pc, CT_PP_IGNORE)) - { - chunk.orig_col_end -= num_stripped; - } - // Add the chunk to the list - rprev = pc; - - if (rprev != nullptr) - { - chunk_flags_set(pc, rprev->flags & PCF_COPY_FLAGS); - - // a newline can't be in a preprocessor - if (chunk_is_token(pc, CT_NEWLINE)) - { - chunk_flags_clr(pc, PCF_IN_PREPROC); - } - } - - if (ref != nullptr) - { - chunk.flags |= PCF_INSERTED; - } - else - { - chunk.flags &= ~PCF_INSERTED; - } - pc = chunk_add_before(&chunk, ref); - - // A newline marks the end of a preprocessor - if (chunk_is_token(pc, CT_NEWLINE)) // || chunk_is_token(pc, CT_COMMENT_MULTI)) - { - cpd.in_preproc = CT_NONE; - cpd.preproc_ncnl_count = 0; - } - - // Disable indentation when #asm directive found - if (chunk_is_token(pc, CT_PP_ASM)) - { - LOG_FMT(LBCTRL, "Found a directive %s on line %zu\n", "#asm", pc->orig_line); - cpd.unc_off = true; - } - - // Special handling for preprocessor stuff - if (cpd.in_preproc != CT_NONE) - { - chunk_flags_set(pc, PCF_IN_PREPROC); - - // Count words after the preprocessor - if ( !chunk_is_comment(pc) - && !chunk_is_newline(pc)) - { - cpd.preproc_ncnl_count++; - } - - // Disable indentation if a #pragma asm directive is found - if (cpd.in_preproc == CT_PP_PRAGMA) - { - if (memcmp(pc->text(), "asm", 3) == 0) - { - LOG_FMT(LBCTRL, "Found a pragma %s on line %zu\n", "asm", pc->orig_line); - cpd.unc_off = true; - } - } - - // Figure out the type of preprocessor for #include parsing - if (cpd.in_preproc == CT_PREPROC) - { - if ( pc->type < CT_PP_DEFINE - || pc->type > CT_PP_OTHER) - { - set_chunk_type(pc, CT_PP_OTHER); - } - cpd.in_preproc = 
pc->type; - } - else if (cpd.in_preproc == CT_PP_IGNORE) - { - // ASSERT(options::pp_ignore_define_body()); - if ( !chunk_is_token(pc, CT_NL_CONT) - && !chunk_is_token(pc, CT_COMMENT_CPP) - && !chunk_is_token(pc, CT_COMMENT) - && !chunk_is_token(pc, CT_COMMENT_MULTI)) // Issue #1966 - { - set_chunk_type(pc, CT_PP_IGNORE); - } - } - else if ( cpd.in_preproc == CT_PP_DEFINE - && chunk_is_token(pc, CT_PAREN_CLOSE) - && options::pp_ignore_define_body()) - { - log_rule_B("pp_ignore_define_body"); - // When we have a PAREN_CLOSE in a PP_DEFINE we should be terminating a MACRO_FUNC - // arguments list. Therefore we can enter the PP_IGNORE state and ignore next chunks. - cpd.in_preproc = CT_PP_IGNORE; - } - } - else - { - // Check for a preprocessor start - if ( chunk_is_token(pc, CT_POUND) - && ( rprev == nullptr - || chunk_is_token(rprev, CT_NEWLINE))) - { - set_chunk_type(pc, CT_PREPROC); - chunk_flags_set(pc, PCF_IN_PREPROC); - cpd.in_preproc = CT_PREPROC; - } - } - - if (chunk_is_token(pc, CT_NEWLINE)) - { - LOG_FMT(LGUY, "%s(%d): orig_line is %zu, orig_col is %zu, <Newline>, nl is %zu\n", - __func__, __LINE__, pc->orig_line, pc->orig_col, pc->nl_count); - } - else if (chunk_is_token(pc, CT_VBRACE_OPEN)) - { - LOG_FMT(LGUY, "%s(%d): orig_line is %zu, orig_col is %zu, type is %s, orig_col_end is %zu\n", - __func__, __LINE__, pc->orig_line, pc->orig_col, get_token_name(pc->type), pc->orig_col_end); - } - else - { - char copy[1000]; - LOG_FMT(LGUY, "%s(%d): orig_line is %zu, orig_col is %zu, text() '%s', type is %s, orig_col_end is %zu\n", - __func__, __LINE__, pc->orig_line, pc->orig_col, pc->elided_text(copy), get_token_name(pc->type), pc->orig_col_end); - } - } - // Set the cpd.newline string for this file - log_rule_B("newlines"); - - if ( options::newlines() == LE_LF - || ( options::newlines() == LE_AUTO - && (LE_COUNT(LF) >= LE_COUNT(CRLF)) - && (LE_COUNT(LF) >= LE_COUNT(CR)))) - { - // LF line ends - cpd.newline = "\n"; - LOG_FMT(LLINEENDS, "Using LF line 
endings\n"); - } - else if ( options::newlines() == LE_CRLF - || ( options::newlines() == LE_AUTO - && (LE_COUNT(CRLF) >= LE_COUNT(LF)) - && (LE_COUNT(CRLF) >= LE_COUNT(CR)))) - { - // CRLF line ends - cpd.newline = "\r\n"; - LOG_FMT(LLINEENDS, "Using CRLF line endings\r\n"); - } - else - { - // CR line ends - cpd.newline = "\r"; - LOG_FMT(LLINEENDS, "Using CR line endings\n"); - } -} // tokenize |