summaryrefslogtreecommitdiffstats
path: root/debian/uncrustify-trinity/uncrustify-trinity-0.72.0/src/unicode.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'debian/uncrustify-trinity/uncrustify-trinity-0.72.0/src/unicode.cpp')
-rw-r--r--debian/uncrustify-trinity/uncrustify-trinity-0.72.0/src/unicode.cpp579
1 files changed, 579 insertions, 0 deletions
diff --git a/debian/uncrustify-trinity/uncrustify-trinity-0.72.0/src/unicode.cpp b/debian/uncrustify-trinity/uncrustify-trinity-0.72.0/src/unicode.cpp
new file mode 100644
index 00000000..117d7bcc
--- /dev/null
+++ b/debian/uncrustify-trinity/uncrustify-trinity-0.72.0/src/unicode.cpp
@@ -0,0 +1,579 @@
+/**
+ * @file unicode.cpp
+ * Detects, read and writes characters in the proper format.
+ *
+ * @author Ben Gardner
+ * @license GPL v2+
+ */
+
+#include "unicode.h"
+
+#include "prototypes.h"
+#include "unc_ctype.h"
+#include "uncrustify.h"
+#include "uncrustify_types.h"
+
+#include <cstdlib>
+#include <cstring>
+
+
+using namespace std;
+
+
+//! See if all characters are ASCII (0-127)
+static bool is_ascii(const vector<UINT8> &data, size_t &non_ascii_cnt, size_t &zero_cnt);
+
+
+//! Convert the array of bytes into an array of ints
+static bool decode_bytes(const vector<UINT8> &in_data, deque<int> &out_data);
+
+
+/**
+ * Decode UTF-8 sequences from in_data and put the chars in out_data.
+ * If there are any decoding errors, then return false.
+ */
+static bool decode_utf8(const vector<UINT8> &in_data, deque<int> &out_data);
+
+
+/**
+ * Extract 2 bytes from the stream and increment idx by 2
+ *
+ * @param in byte vector with input data
+ * @param idx index points to working position in vector
+ */
+static int get_word(const vector<UINT8> &in_data, size_t &idx, bool be);
+
+
+/**
+ * Decode a UTF-16 sequence.
+ * Sets enc based on the BOM.
+ * Must have the BOM as the first two bytes.
+ */
+static bool decode_utf16(const vector<UINT8> &in_data, deque<int> &out_data, char_encoding_e &enc);
+
+
+/**
+ * Looks for the BOM of UTF-16 BE/LE and UTF-8.
+ * If found, set enc and return true.
+ * Sets enc to char_encoding_e::e_ASCII and returns false if not found.
+ */
+static bool decode_bom(const vector<UINT8> &in_data, char_encoding_e &enc);
+
+
+//! Write for ASCII and BYTE encoding
+static void write_byte(int ch);
+
+
+//! Writes a single character to a file using UTF-8 encoding
+static void write_utf8(int ch);
+
+
+static void write_utf16(int ch, bool be);
+
+
+static bool is_ascii(const vector<UINT8> &data, size_t &non_ascii_cnt, size_t &zero_cnt)
+{
+ non_ascii_cnt = 0;
+ zero_cnt = 0;
+
+ for (unsigned char value : data)
+ {
+ if (value & 0x80)
+ {
+ non_ascii_cnt++;
+ }
+
+ if (!value)
+ {
+ zero_cnt++;
+ }
+ }
+
+ return((non_ascii_cnt + zero_cnt) == 0);
+}
+
+
+static bool decode_bytes(const vector<UINT8> &in_data, deque<int> &out_data)
+{
+ out_data.resize(in_data.size());
+
+ for (size_t idx = 0; idx < in_data.size(); idx++)
+ {
+ out_data[idx] = in_data[idx];
+ }
+
+ return(true);
+}
+
+
+void encode_utf8(int ch, vector<UINT8> &res)
+{
+ if (ch < 0)
+ {
+ // illegal code - do not store
+ }
+ else if (ch < 0x80)
+ {
+ // 0xxxxxxx
+ res.push_back(ch);
+ }
+ else if (ch < 0x0800)
+ {
+ // 110xxxxx 10xxxxxx
+ res.push_back(0xC0 | (ch >> 6));
+ res.push_back(0x80 | (ch & 0x3f));
+ }
+ else if (ch < 0x10000)
+ {
+ // 1110xxxx 10xxxxxx 10xxxxxx
+ res.push_back(0xE0 | (ch >> 12));
+ res.push_back(0x80 | ((ch >> 6) & 0x3f));
+ res.push_back(0x80 | (ch & 0x3f));
+ }
+ else if (ch < 0x200000)
+ {
+ // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ res.push_back(0xF0 | (ch >> 18));
+ res.push_back(0x80 | ((ch >> 12) & 0x3f));
+ res.push_back(0x80 | ((ch >> 6) & 0x3f));
+ res.push_back(0x80 | (ch & 0x3f));
+ }
+ else if (ch < 0x4000000)
+ {
+ // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ res.push_back(0xF8 | (ch >> 24));
+ res.push_back(0x80 | ((ch >> 18) & 0x3f));
+ res.push_back(0x80 | ((ch >> 12) & 0x3f));
+ res.push_back(0x80 | ((ch >> 6) & 0x3f));
+ res.push_back(0x80 | (ch & 0x3f));
+ }
+ else // (ch <= 0x7fffffff)
+ {
+ // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ res.push_back(0xFC | (ch >> 30));
+ res.push_back(0x80 | ((ch >> 24) & 0x3f));
+ res.push_back(0x80 | ((ch >> 18) & 0x3f));
+ res.push_back(0x80 | ((ch >> 12) & 0x3f));
+ res.push_back(0x80 | ((ch >> 6) & 0x3f));
+ res.push_back(0x80 | (ch & 0x3f));
+ }
+} // encode_utf8
+
+
+static bool decode_utf8(const vector<UINT8> &in_data, deque<int> &out_data)
+{
+ size_t idx = 0;
+ int cnt;
+
+ out_data.clear();
+
+ // check for UTF-8 BOM silliness and skip
+ if (in_data.size() >= 3)
+ {
+ if ( (in_data[0] == 0xef)
+ && (in_data[1] == 0xbb)
+ && (in_data[2] == 0xbf))
+ {
+ idx = 3; // skip it
+ }
+ }
+
+ while (idx < in_data.size())
+ {
+ int ch = in_data[idx++];
+
+ if (ch < 0x80) // 1-byte sequence
+ {
+ out_data.push_back(ch);
+ continue;
+ }
+ else if ((ch & 0xE0) == 0xC0) // 2-byte sequence
+ {
+ ch &= 0x1F;
+ cnt = 1;
+ }
+ else if ((ch & 0xF0) == 0xE0) // 3-byte sequence
+ {
+ ch &= 0x0F;
+ cnt = 2;
+ }
+ else if ((ch & 0xF8) == 0xF0) // 4-byte sequence
+ {
+ ch &= 0x07;
+ cnt = 3;
+ }
+ else if ((ch & 0xFC) == 0xF8) // 5-byte sequence
+ {
+ ch &= 0x03;
+ cnt = 4;
+ }
+ else if ((ch & 0xFE) == 0xFC) // 6-byte sequence
+ {
+ ch &= 0x01;
+ cnt = 5;
+ }
+ else
+ {
+ // invalid UTF-8 sequence
+ return(false);
+ }
+
+ while (cnt-- > 0 && idx < in_data.size())
+ {
+ int tmp = in_data[idx++];
+
+ if ((tmp & 0xC0) != 0x80)
+ {
+ // invalid UTF-8 sequence
+ return(false);
+ }
+ ch = (ch << 6) | (tmp & 0x3f);
+ }
+
+ if (cnt >= 0)
+ {
+ // short UTF-8 sequence
+ return(false);
+ }
+ out_data.push_back(ch);
+ }
+ return(true);
+} // decode_utf8
+
+
+static int get_word(const vector<UINT8> &in_data, size_t &idx, bool be)
+{
+ int ch;
+
+ if ((idx + 2) > in_data.size())
+ {
+ ch = -1;
+ }
+ else if (be)
+ {
+ ch = (in_data[idx] << 8) | in_data[idx + 1];
+ }
+ else
+ {
+ ch = in_data[idx] | (in_data[idx + 1] << 8);
+ }
+ idx += 2;
+ return(ch);
+}
+
+
+static bool decode_utf16(const vector<UINT8> &in_data, deque<int> &out_data, char_encoding_e &enc)
+{
+ out_data.clear();
+
+ if (in_data.size() & 1)
+ {
+ // can't have and odd length
+ return(false);
+ }
+
+ if (in_data.size() < 2)
+ {
+ // we require the BOM or at least 1 char
+ return(false);
+ }
+ size_t idx = 2;
+
+ if ((in_data[0] == 0xfe) && (in_data[1] == 0xff))
+ {
+ enc = char_encoding_e::e_UTF16_BE;
+ }
+ else if ((in_data[0] == 0xff) && (in_data[1] == 0xfe))
+ {
+ enc = char_encoding_e::e_UTF16_LE;
+ }
+ else
+ {
+ /*
+ * If we have a few words, we can take a guess, assuming the first few
+ * chars are ASCII
+ */
+ enc = char_encoding_e::e_ASCII;
+ idx = 0;
+
+ if (in_data.size() >= 6)
+ {
+ if ( (in_data[0] == 0)
+ && (in_data[2] == 0)
+ && (in_data[4] == 0))
+ {
+ enc = char_encoding_e::e_UTF16_BE;
+ }
+ else if ( (in_data[1] == 0)
+ && (in_data[3] == 0)
+ && (in_data[5] == 0))
+ {
+ enc = char_encoding_e::e_UTF16_LE;
+ }
+ }
+
+ if (enc == char_encoding_e::e_ASCII)
+ {
+ return(false);
+ }
+ }
+ bool be = (enc == char_encoding_e::e_UTF16_BE);
+
+ while (idx < in_data.size())
+ {
+ int ch = get_word(in_data, idx, be);
+
+ if ((ch & 0xfc00) == 0xd800)
+ {
+ ch &= 0x3ff;
+ ch <<= 10;
+ int tmp = get_word(in_data, idx, be);
+
+ if ((tmp & 0xfc00) != 0xdc00)
+ {
+ return(false);
+ }
+ ch |= (tmp & 0x3ff);
+ ch += 0x10000;
+ out_data.push_back(ch);
+ }
+ else if ( (ch >= 0 && ch < 0xD800)
+ || ch >= 0xE000)
+ {
+ out_data.push_back(ch);
+ }
+ else
+ {
+ // invalid character
+ return(false);
+ }
+ }
+ return(true);
+} // decode_utf16
+
+
+static bool decode_bom(const vector<UINT8> &in_data, char_encoding_e &enc)
+{
+ enc = char_encoding_e::e_ASCII;
+
+ if (in_data.size() >= 2)
+ {
+ if ((in_data[0] == 0xfe) && (in_data[1] == 0xff))
+ {
+ enc = char_encoding_e::e_UTF16_BE;
+ return(true);
+ }
+
+ if ((in_data[0] == 0xff) && (in_data[1] == 0xfe))
+ {
+ enc = char_encoding_e::e_UTF16_LE;
+ return(true);
+ }
+
+ if ( (in_data.size() >= 3)
+ && (in_data[0] == 0xef)
+ && (in_data[1] == 0xbb)
+ && (in_data[2] == 0xbf))
+ {
+ enc = char_encoding_e::e_UTF8;
+ return(true);
+ }
+ }
+ return(false);
+}
+
+
+bool decode_unicode(const vector<UINT8> &in_data, deque<int> &out_data, char_encoding_e &enc, bool &has_bom)
+{
+ // check for a BOM
+ if (decode_bom(in_data, enc))
+ {
+ has_bom = true;
+
+ if (enc == char_encoding_e::e_UTF8)
+ {
+ return(decode_utf8(in_data, out_data));
+ }
+ return(decode_utf16(in_data, out_data, enc));
+ }
+ has_bom = false;
+
+ // Check for simple ASCII
+ size_t non_ascii_cnt;
+ size_t zero_cnt;
+
+ if (is_ascii(in_data, non_ascii_cnt, zero_cnt))
+ {
+ enc = char_encoding_e::e_ASCII;
+ return(decode_bytes(in_data, out_data));
+ }
+
+ // There are a lot of 0's in UTF-16 (~50%)
+ if ( (zero_cnt > (in_data.size() / 4))
+ && (zero_cnt <= (in_data.size() / 2)))
+ {
+ // likely is UTF-16
+ if (decode_utf16(in_data, out_data, enc))
+ {
+ return(true);
+ }
+ }
+
+ if (decode_utf8(in_data, out_data))
+ {
+ enc = char_encoding_e::e_UTF8;
+ return(true);
+ }
+ // it is an unrecognized byte sequence
+ enc = char_encoding_e::e_BYTE;
+ return(decode_bytes(in_data, out_data));
+} // decode_unicode
+
+
+static void write_byte(int ch)
+{
+ if ((ch & 0xff) == ch)
+ {
+ if (cpd.fout)
+ {
+ fputc(ch, cpd.fout);
+ }
+
+ if (cpd.bout)
+ {
+ cpd.bout->push_back(static_cast<UINT8>(ch));
+ }
+ }
+ else
+ {
+ // illegal code - do not store
+ }
+}
+
+
+static void write_utf8(int ch)
+{
+ vector<UINT8> vv;
+
+ vv.reserve(6);
+
+ encode_utf8(ch, vv);
+
+ for (unsigned char char_val : vv)
+ {
+ write_byte(char_val);
+ }
+}
+
+
+static void write_utf16(int ch, bool be)
+{
+ // U+0000 to U+D7FF and U+E000 to U+FFFF
+ if ( (ch >= 0 && ch < 0xD800)
+ || (ch >= 0xE000 && ch < 0x10000))
+ {
+ if (be)
+ {
+ write_byte(ch >> 8);
+ write_byte(ch & 0xff);
+ }
+ else
+ {
+ write_byte(ch & 0xff);
+ write_byte(ch >> 8);
+ }
+ }
+ else if (ch >= 0x10000 && ch < 0x110000)
+ {
+ int v1 = ch - 0x10000;
+ int w1 = 0xD800 + (v1 >> 10);
+ int w2 = 0xDC00 + (v1 & 0x3ff);
+
+ if (be)
+ {
+ write_byte(w1 >> 8);
+ write_byte(w1 & 0xff);
+ write_byte(w2 >> 8);
+ write_byte(w2 & 0xff);
+ }
+ else
+ {
+ write_byte(w1 & 0xff);
+ write_byte(w1 >> 8);
+ write_byte(w2 & 0xff);
+ write_byte(w2 >> 8);
+ }
+ }
+ else
+ {
+ // illegal code - do not store
+ }
+} // write_utf16
+
+
+void write_bom(void)
+{
+ switch (cpd.enc)
+ {
+ case char_encoding_e::e_UTF8:
+ write_byte(0xef);
+ write_byte(0xbb);
+ write_byte(0xbf);
+ break;
+
+ case char_encoding_e::e_UTF16_LE:
+ write_utf16(0xfeff, false);
+ break;
+
+ case char_encoding_e::e_UTF16_BE:
+ write_utf16(0xfeff, true);
+ break;
+
+ default:
+ // char_encoding_e::e_ASCII
+ // char_encoding_e::e_BYTE
+ // do nothing
+ // Coveralls will complain
+ break;
+ }
+}
+
+
+void write_char(int ch)
+{
+ if (ch >= 0)
+ {
+ switch (cpd.enc)
+ {
+ case char_encoding_e::e_BYTE:
+ write_byte(ch & 0xff);
+ break;
+
+ case char_encoding_e::e_ASCII:
+ default:
+ write_byte(ch);
+ break;
+
+ case char_encoding_e::e_UTF8:
+ write_utf8(ch);
+ break;
+
+ case char_encoding_e::e_UTF16_LE:
+ write_utf16(ch, false);
+ break;
+
+ case char_encoding_e::e_UTF16_BE:
+ write_utf16(ch, true);
+ break;
+ }
+ }
+}
+
+
+void write_string(const unc_text &text)
+{
+ for (size_t idx = 0; idx < text.size(); idx++)
+ {
+ write_char(text[idx]);
+ }
+}