diff options
Diffstat (limited to 'debian/uncrustify-trinity/uncrustify-trinity-0.75.0/src/unicode.cpp')
-rw-r--r-- | debian/uncrustify-trinity/uncrustify-trinity-0.75.0/src/unicode.cpp | 580 |
1 files changed, 580 insertions, 0 deletions
diff --git a/debian/uncrustify-trinity/uncrustify-trinity-0.75.0/src/unicode.cpp b/debian/uncrustify-trinity/uncrustify-trinity-0.75.0/src/unicode.cpp new file mode 100644 index 00000000..172cda28 --- /dev/null +++ b/debian/uncrustify-trinity/uncrustify-trinity-0.75.0/src/unicode.cpp @@ -0,0 +1,580 @@ +/** + * @file unicode.cpp + * Detects, read and writes characters in the proper format. + * + * @author Ben Gardner + * @license GPL v2+ + */ + +#include "unicode.h" + + +using namespace std; + + +//! See if all characters are ASCII (0-127) +static bool is_ascii(const vector<UINT8> &data, size_t &non_ascii_cnt, size_t &zero_cnt); + + +//! Convert the array of bytes into an array of ints +static bool decode_bytes(const vector<UINT8> &in_data, deque<int> &out_data); + + +/** + * Decode UTF-8 sequences from in_data and put the chars in out_data. + * If there are any decoding errors, then return false. + */ +static bool decode_utf8(const vector<UINT8> &in_data, deque<int> &out_data); + + +/** + * Extract 2 bytes from the stream and increment idx by 2 + * + * @param in byte vector with input data + * @param idx index points to working position in vector + */ +static int get_word(const vector<UINT8> &in_data, size_t &idx, bool be); + + +/** + * Decode a UTF-16 sequence. + * Sets enc based on the BOM. + * Must have the BOM as the first two bytes. + */ +static bool decode_utf16(const vector<UINT8> &in_data, deque<int> &out_data, char_encoding_e &enc); + + +/** + * Looks for the BOM of UTF-16 BE/LE and UTF-8. + * If found, set enc and return true. + * Sets enc to char_encoding_e::e_ASCII and returns false if not found. + */ +static bool decode_bom(const vector<UINT8> &in_data, char_encoding_e &enc); + + +//! Write for ASCII and BYTE encoding +static void write_byte(int ch); + + +//! Writes a single character to a file using UTF-8 encoding +static void write_utf8(int ch); + + +static void write_utf16(int ch, bool be); + + +static bool is_ascii(const vector<UINT8> &data, size_t &non_ascii_cnt, size_t &zero_cnt) +{ + non_ascii_cnt = 0; + zero_cnt = 0; + + for (unsigned char value : data) + { + if (value & 0x80) + { + non_ascii_cnt++; + } + + if (!value) + { + zero_cnt++; + } + } + + return((non_ascii_cnt + zero_cnt) == 0); +} + + +static bool decode_bytes(const vector<UINT8> &in_data, deque<int> &out_data) +{ + out_data.resize(in_data.size()); + + for (size_t idx = 0; idx < in_data.size(); idx++) + { + out_data[idx] = in_data[idx]; + } + + return(true); +} + + +void encode_utf8(int ch, vector<UINT8> &res) +{ + if (ch < 0) + { + // illegal code - do not store + } + else if (ch < 0x80) + { + // 0xxxxxxx + res.push_back(ch); + } + else if (ch < 0x0800) + { + // 110xxxxx 10xxxxxx + res.push_back(0xC0 | (ch >> 6)); + res.push_back(0x80 | (ch & 0x3f)); + } + else if (ch < 0x10000) + { + // 1110xxxx 10xxxxxx 10xxxxxx + res.push_back(0xE0 | (ch >> 12)); + res.push_back(0x80 | ((ch >> 6) & 0x3f)); + res.push_back(0x80 | (ch & 0x3f)); + } + else if (ch < 0x200000) + { + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + res.push_back(0xF0 | (ch >> 18)); + res.push_back(0x80 | ((ch >> 12) & 0x3f)); + res.push_back(0x80 | ((ch >> 6) & 0x3f)); + res.push_back(0x80 | (ch & 0x3f)); + } + else if (ch < 0x4000000) + { + // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + res.push_back(0xF8 | (ch >> 24)); + res.push_back(0x80 | ((ch >> 18) & 0x3f)); + res.push_back(0x80 | ((ch >> 12) & 0x3f)); + res.push_back(0x80 | ((ch >> 6) & 0x3f)); + res.push_back(0x80 | (ch & 0x3f)); + } + else // (ch <= 0x7fffffff) + { + // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + res.push_back(0xFC | (ch >> 30)); + res.push_back(0x80 | ((ch >> 24) & 0x3f)); + res.push_back(0x80 | ((ch >> 18) & 0x3f)); + res.push_back(0x80 | ((ch >> 12) & 0x3f)); + res.push_back(0x80 | ((ch >> 6) & 0x3f)); + res.push_back(0x80 | (ch & 0x3f)); + } +} // encode_utf8 + + +static bool decode_utf8(const vector<UINT8> &in_data, deque<int> &out_data) +{ + size_t idx = 0; + int cnt; + + out_data.clear(); + + // check for UTF-8 BOM silliness and skip + if (in_data.size() >= 3) + { + if ( (in_data[0] == 0xef) + && (in_data[1] == 0xbb) + && (in_data[2] == 0xbf)) + { + idx = 3; // skip it + } + } + + while (idx < in_data.size()) + { + int ch = in_data[idx++]; + + if (ch < 0x80) // 1-byte sequence + { + out_data.push_back(ch); + continue; + } + else if ((ch & 0xE0) == 0xC0) // 2-byte sequence + { + ch &= 0x1F; + cnt = 1; + } + else if ((ch & 0xF0) == 0xE0) // 3-byte sequence + { + ch &= 0x0F; + cnt = 2; + } + else if ((ch & 0xF8) == 0xF0) // 4-byte sequence + { + ch &= 0x07; + cnt = 3; + } + else if ((ch & 0xFC) == 0xF8) // 5-byte sequence + { + ch &= 0x03; + cnt = 4; + } + else if ((ch & 0xFE) == 0xFC) // 6-byte sequence + { + ch &= 0x01; + cnt = 5; + } + else + { + // invalid UTF-8 sequence + return(false); + } + + while ( cnt-- > 0 + && idx < in_data.size()) + { + int tmp = in_data[idx++]; + + if ((tmp & 0xC0) != 0x80) + { + // invalid UTF-8 sequence + return(false); + } + ch = (ch << 6) | (tmp & 0x3f); + } + + if (cnt >= 0) + { + // short UTF-8 sequence + return(false); + } + out_data.push_back(ch); + } + return(true); +} // decode_utf8 + + +static int get_word(const vector<UINT8> &in_data, size_t &idx, bool be) +{ + int ch; + + if ((idx + 2) > in_data.size()) + { + ch = -1; + } + else if (be) + { + ch = (in_data[idx] << 8) | in_data[idx + 1]; + } + else + { + ch = in_data[idx] | (in_data[idx + 1] << 8); + } + idx += 2; + return(ch); +} + + +static bool decode_utf16(const vector<UINT8> &in_data, deque<int> &out_data, char_encoding_e &enc) +{ + out_data.clear(); + + if (in_data.size() & 1) + { + // can't have and odd length + return(false); + } + + if (in_data.size() < 2) + { + // we require the BOM or at least 1 char + return(false); + } + size_t idx = 2; + + if ( (in_data[0] == 0xfe) + && (in_data[1] == 0xff)) + { + enc = char_encoding_e::e_UTF16_BE; + } + else if ( (in_data[0] == 0xff) + && (in_data[1] == 0xfe)) + { + enc = char_encoding_e::e_UTF16_LE; + } + else + { + /* + * If we have a few words, we can take a guess, assuming the first few + * chars are ASCII + */ + enc = char_encoding_e::e_ASCII; + idx = 0; + + if (in_data.size() >= 6) + { + if ( (in_data[0] == 0) + && (in_data[2] == 0) + && (in_data[4] == 0)) + { + enc = char_encoding_e::e_UTF16_BE; + } + else if ( (in_data[1] == 0) + && (in_data[3] == 0) + && (in_data[5] == 0)) + { + enc = char_encoding_e::e_UTF16_LE; + } + } + + if (enc == char_encoding_e::e_ASCII) + { + return(false); + } + } + bool be = (enc == char_encoding_e::e_UTF16_BE); + + while (idx < in_data.size()) + { + int ch = get_word(in_data, idx, be); + + if ((ch & 0xfc00) == 0xd800) + { + ch &= 0x3ff; + ch <<= 10; + int tmp = get_word(in_data, idx, be); + + if ((tmp & 0xfc00) != 0xdc00) + { + return(false); + } + ch |= (tmp & 0x3ff); + ch += 0x10000; + out_data.push_back(ch); + } + else if ( ( ch >= 0 + && ch < 0xD800) + || ch >= 0xE000) + { + out_data.push_back(ch); + } + else + { + // invalid character + return(false); + } + } + return(true); +} // decode_utf16 + + +static bool decode_bom(const vector<UINT8> &in_data, char_encoding_e &enc) +{ + enc = char_encoding_e::e_ASCII; + + if (in_data.size() >= 2) + { + if ( (in_data[0] == 0xfe) + && (in_data[1] == 0xff)) + { + enc = char_encoding_e::e_UTF16_BE; + return(true); + } + + if ( (in_data[0] == 0xff) + && (in_data[1] == 0xfe)) + { + enc = char_encoding_e::e_UTF16_LE; + return(true); + } + + if ( (in_data.size() >= 3) + && (in_data[0] == 0xef) + && (in_data[1] == 0xbb) + && (in_data[2] == 0xbf)) + { + enc = char_encoding_e::e_UTF8; + return(true); + } + } + return(false); +} + + +bool decode_unicode(const vector<UINT8> &in_data, deque<int> &out_data, char_encoding_e &enc, bool &has_bom) +{ + // check for a BOM + if (decode_bom(in_data, enc)) + { + has_bom = true; + + if (enc == char_encoding_e::e_UTF8) + { + return(decode_utf8(in_data, out_data)); + } + return(decode_utf16(in_data, out_data, enc)); + } + has_bom = false; + + // Check for simple ASCII + size_t non_ascii_cnt; + size_t zero_cnt; + + if (is_ascii(in_data, non_ascii_cnt, zero_cnt)) + { + enc = char_encoding_e::e_ASCII; + return(decode_bytes(in_data, out_data)); + } + + // There are a lot of 0's in UTF-16 (~50%) + if ( (zero_cnt > (in_data.size() / 4)) + && (zero_cnt <= (in_data.size() / 2))) + { + // likely is UTF-16 + if (decode_utf16(in_data, out_data, enc)) + { + return(true); + } + } + + if (decode_utf8(in_data, out_data)) + { + enc = char_encoding_e::e_UTF8; + return(true); + } + // it is an unrecognized byte sequence + enc = char_encoding_e::e_BYTE; + return(decode_bytes(in_data, out_data)); +} // decode_unicode + + +static void write_byte(int ch) +{ + if ((ch & 0xff) == ch) + { + if (cpd.fout) + { + fputc(ch, cpd.fout); + } + + if (cpd.bout) + { + cpd.bout->push_back(static_cast<UINT8>(ch)); + } + } + else + { + // illegal code - do not store + } +} + + +static void write_utf8(int ch) +{ + vector<UINT8> vv; + + vv.reserve(6); + + encode_utf8(ch, vv); + + for (unsigned char char_val : vv) + { + write_byte(char_val); + } +} + + +static void write_utf16(int ch, bool be) +{ + // U+0000 to U+D7FF and U+E000 to U+FFFF + if ( ( ch >= 0 + && ch < 0xD800) + || ( ch >= 0xE000 + && ch < 0x10000)) + { + if (be) + { + write_byte(ch >> 8); + write_byte(ch & 0xff); + } + else + { + write_byte(ch & 0xff); + write_byte(ch >> 8); + } + } + else if ( ch >= 0x10000 + && ch < 0x110000) + { + int v1 = ch - 0x10000; + int w1 = 0xD800 + (v1 >> 10); + int w2 = 0xDC00 + (v1 & 0x3ff); + + if (be) + { + write_byte(w1 >> 8); + write_byte(w1 & 0xff); + write_byte(w2 >> 8); + write_byte(w2 & 0xff); + } + else + { + write_byte(w1 & 0xff); + write_byte(w1 >> 8); + write_byte(w2 & 0xff); + write_byte(w2 >> 8); + } + } + else + { + // illegal code - do not store + } +} // write_utf16 + + +void write_bom(void) +{ + switch (cpd.enc) + { + case char_encoding_e::e_UTF8: + write_byte(0xef); + write_byte(0xbb); + write_byte(0xbf); + break; + + case char_encoding_e::e_UTF16_LE: + write_utf16(0xfeff, false); + break; + + case char_encoding_e::e_UTF16_BE: + write_utf16(0xfeff, true); + break; + + default: + // char_encoding_e::e_ASCII + // char_encoding_e::e_BYTE + // do nothing + // Coveralls will complain + break; + } +} + + +void write_char(int ch) +{ + if (ch >= 0) + { + switch (cpd.enc) + { + case char_encoding_e::e_BYTE: + write_byte(ch & 0xff); + break; + + case char_encoding_e::e_ASCII: + default: + write_byte(ch); + break; + + case char_encoding_e::e_UTF8: + write_utf8(ch); + break; + + case char_encoding_e::e_UTF16_LE: + write_utf16(ch, false); + break; + + case char_encoding_e::e_UTF16_BE: + write_utf16(ch, true); + break; + } + } +} + + +void write_string(const unc_text &text) +{ + for (size_t idx = 0; idx < text.size(); idx++) + { + write_char(text[idx]); + } +} |