summary refs log tree commit diff stats
path: root/tdecore/tequivchars.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'tdecore/tequivchars.cpp')
-rwxr-xr-x tdecore/tequivchars.cpp 241
1 files changed, 241 insertions, 0 deletions
diff --git a/tdecore/tequivchars.cpp b/tdecore/tequivchars.cpp
new file mode 100755
index 000000000..d259946b2
--- /dev/null
+++ b/tdecore/tequivchars.cpp
@@ -0,0 +1,241 @@
+// Build-time switches for this file.
+//
+// REGEXP_IS_PCRE2: when defined, TEquivChars::replaceChars() also leaves
+// PCRE2-only constructs untouched. It is explicitly undefined here; the name
+// must match the #ifdef tests used throughout this file.
+#undef REGEXP_IS_PCRE2
+// OPTIMIZE_ASCII_LOOKUP: when defined, code points below 128 are handled
+// without consulting the lookup table.
+#define OPTIMIZE_ASCII_LOOKUP
+
+// Report the active switches in the compiler output.
+#ifdef REGEXP_IS_PCRE2
+#pragma message "############ Assuming regular expressions are PCRE2 ############"
+#endif
+
+#ifdef OPTIMIZE_ASCII_LOOKUP
+#pragma message "############ ASCII characters will be processed separately ############"
+#endif
+
+#include "tequivchars.h"
+
+// Other 16-bit character representations that were tried and left disabled:
+//   typedef wchar_t CHAR16;
+//   typedef unsigned short CHAR16;
+// The character type used throughout this file is TQChar.
+using CHAR16 = TQChar;
+
+class TEquivChars_Private // Private data of TEquivChars: holds the character-equivalence lookup table
+{
+public:
+
+ struct defaultCollation { // One row of the lookup table
+ CHAR16 character; // Character to be looked up
+ CHAR16 collatesTo; // Character that TEquivChars::replaceChars() substitutes for 'character'
+ };
+
+ const defaultCollation EquivalentsTable // terminating ';' is provided in include file; the remainder of this declaration comes from that file as well. replaceChars() binary-searches the table, so its rows must be sorted in ascending order of 'character'.
+ #include "tequivchars-mapping.h"
+ uint EquivTableROWS = sizeof(EquivalentsTable)/sizeof(EquivalentsTable[0]); // Number of rows in EquivalentsTable
+};
+
+TEquivChars::TEquivChars() // Constructor: allocates the private data holding the lookup table
+{
+ p = new TEquivChars_Private; // Heap-allocated; the destructor deletes it
+}
+
+TEquivChars::~TEquivChars() // Destructor: frees the private data allocated by the constructor
+{
+ delete p; // Releases the TEquivChars_Private instance created with new
+}
+
+/*
+    Returns a copy of inputString in which each character has been replaced
+    by its equivalent character.
+
+    With OPTIMIZE_ASCII_LOOKUP defined, code points below 128 are handled
+    directly: 'A'..'Z' become 'a'..'z', every other ASCII character maps to
+    itself. All remaining characters are looked up in the private
+    EquivalentsTable with a binary search; a character that is not found in
+    the table is copied unchanged.
+
+    inputString  Text (or regular expression) to convert.
+    isRegex      When true, characters that form regular-expression syntax are
+                 copied unchanged: a backslash and the character it escapes,
+                 the '[' and ']' delimiting a character class, a '^', ']' or
+                 ':' directly after the opening '[', and POSIX bracket
+                 expressions [:___:] inside a character class. With
+                 REGEXP_IS_PCRE2 defined, \Q___\E literals, \N{}, \P{}, \p{}
+                 and \g{} brace expressions, (*___) directives and (?<___>
+                 group names are copied unchanged as well.
+
+    The result always has the same length as inputString.
+*/
+TQString TEquivChars::replaceChars( const TQString &inputString, bool isRegex )
+{
+    int inStrLen = inputString.length();
+    TQString outString = TQString::fromLatin1( "" );
+    outString.reserve( inStrLen );
+    const TQChar *char16 = inputString.unicode();
+
+    bool backSlashed = false; // \_
+    bool startedCharClass = false; // Previous character was starting '[' of character class
+    bool inCharacterClass = false; // [___]
+    bool inPosixBracketExpr = false; // [:___:]
+#ifdef REGEXP_IS_PCRE2
+    bool quoteLiteral = false; // \Q___\E
+    bool inBraceExpr = false; // \c{___} where 'c' is any of: 'N' 'P' 'p' 'g'
+    bool inDirective = false; // (*___)
+    bool inGroupName = false; // (?<___>
+#endif // REGEXP_IS_PCRE2
+    CHAR16 currChar = 0;
+    CHAR16 prevChar = 0;
+    CHAR16 nextChar = 0;
+
+    // The increment clause stores currChar into the output string, so every
+    // 'continue' below still emits the current character (converted or not).
+    for ( int i = 0 ; i < inStrLen ; outString[i] = CHAR16(currChar), i++ ) {
+
+        prevChar = currChar;
+        currChar = char16[i].unicode();
+
+        if ( isRegex ) {
+
+            /*
+                Look for regex characters and character sequences
+                that should never be converted to an equivalent.
+            */
+
+            if ( i < ( inStrLen - 1 ) )
+                nextChar = char16[i+1].unicode();
+            else
+                nextChar = 0;
+
+            // Don't convert backSlashed characters.
+            // This test must come before the test for a backslash itself:
+            // in an escaped backslash (two backslashes in a row) the second
+            // one is the escaped character and must end the escape instead
+            // of starting a new one.
+            if ( backSlashed ) {
+#ifdef REGEXP_IS_PCRE2
+                switch (currChar) {
+                    case 'Q' : quoteLiteral = true; break; // Entering literal \Q___\E
+                    case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
+                    case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
+                    case 'P' : // Entering (negated) Unicode property specification \p{} ?
+                    case 'p' : // Entering Unicode property specification \p{} ?
+                    case 'g' : // Entering a named backreference \g{___} ?
+                        if ( nextChar == '{' ) inBraceExpr = true;
+                        break;
+                }
+#endif // REGEXP_IS_PCRE2
+                backSlashed = false;
+                continue;
+            }
+
+            if ( currChar == '\\' ) {
+                // Start of an escape sequence; the next character is protected
+                backSlashed = true;
+                continue;
+            }
+
+#ifdef REGEXP_IS_PCRE2
+            if ( quoteLiteral )
+                continue;
+
+            if ( inBraceExpr ) {
+                // Is it time to leave brace expression {___} ?
+                // The state is cleared on the last character before '}' so
+                // that the closing brace itself is processed normally.
+                if ( nextChar == '}' ) inBraceExpr = false;
+                continue;
+            }
+#endif // REGEXP_IS_PCRE2
+
+            if ( startedCharClass ) {
+                switch (currChar) {
+                    case '^' : // Negated character class, proceed to next character
+                        continue; // Bypass converting this special character
+                    case ']' : // Treat as part of character class, not as a closure
+                    case ':' : // Treat as part of character class, not as start of bracket expression
+                        startedCharClass = false;
+                        continue; // Bypass converting these special characters
+                }
+                startedCharClass = false;
+            } // startedCharClass
+
+            if ( inCharacterClass ) {
+
+                if ( inPosixBracketExpr ) {
+                    // Is it time to leave POSIX bracket expression [:___:] ?
+                    if ( currChar == ':' && nextChar == ']' ) inPosixBracketExpr = false;
+                    continue;
+                } // inPosixBracketExpr
+
+                else { // ! inPosixBracketExpr
+
+                    if ( prevChar == '[' && currChar == ':' ) {
+                        // Enter POSIX bracket expression [:___:]
+                        inPosixBracketExpr = true;
+                        continue;
+                    }
+
+                    // NOTE(review): the ']' that closes a POSIX bracket
+                    // expression also reaches this test and ends the
+                    // enclosing character class - confirm whether that is
+                    // intended.
+                    if ( currChar == ']' ) {
+                        // Leaving character class [___]
+                        inCharacterClass = false;
+                        continue;
+                    }
+
+                } // ! inPosixBracketExpr
+
+            } // inCharacterClass
+
+            else { // ! inCharacterClass
+
+                switch (currChar) {
+
+                    case '[' :
+                        // Entering a character class [___]
+                        startedCharClass = true;
+                        inCharacterClass = true;
+                        continue;
+                        break;
+#ifdef REGEXP_IS_PCRE2
+                    case '*' :
+                        if ( prevChar != '(' ) continue;
+                        // Entering a PCRE2 directive (*___)
+                        inDirective = true;
+                        continue;
+                        break;
+
+                    case '?' :
+                        if ( prevChar != '(' ) continue;
+                        if ( nextChar != '<' ) continue;
+                        // Entering PCRE2 group name (?<___>)
+                        inGroupName = true;
+                        continue;
+                        break;
+#endif // REGEXP_IS_PCRE2
+                }
+#ifdef REGEXP_IS_PCRE2
+                if ( inDirective ) {
+                    // Is it time to leave PCRE2 directive (*___) ?
+                    if (currChar == ')' ) inDirective = false;
+                    continue;
+                }
+
+                if ( inGroupName ) {
+                    // Is it time to leave PCRE2 group name (?<___>) ?
+                    if (currChar == '>' ) inGroupName = false;
+                    continue;
+                }
+#endif // REGEXP_IS_PCRE2
+            } // ! inCharacterClass
+
+            /*
+                If we have reached here, this regex character is a
+                candidate for potential conversion to an equivalent.
+            */
+
+        } // isRegex
+
+        //-Debug: std::cerr << "Converting '" << TQString(currChar).utf8().data() << "' to '";
+
+#ifdef OPTIMIZE_ASCII_LOOKUP
+        // We can process ASCII quickly without using lookup table
+        unsigned short codepoint = currChar.unicode();
+        if ( codepoint < 128 ) {
+            if ( codepoint > 64 && codepoint < 91 ) // convert upper case ASCII
+                currChar = TQChar(codepoint + 32 ); // to corresponding lower case
+            // All other ASCII characters are equivalent to themselves
+            //-Debug: std::cerr << TQString(currChar).utf8().data() << "' (ascii)" << std::endl;
+            continue;
+        }
+#endif
+
+        // Use a simple binary search to look up an equivalent character.
+        // If no row matches, currChar is left as it is.
+        int low = 0;
+        int high = p->EquivTableROWS - 1;
+        while (low <= high) {
+            int mid = low + (high - low) / 2;
+            if ( currChar == p->EquivalentsTable[mid].character ) {
+                // Found equivalent character, use it instead
+                currChar = p->EquivalentsTable[mid].collatesTo;
+                break;
+            }
+            if ( p->EquivalentsTable[mid].character < currChar )
+                low = mid + 1;
+            else
+                high = mid - 1;
+        }
+        //-Debug: std::cerr << TQString(currChar).utf8().data() << "'" << std::endl;
+
+        /* FIXME: Possible ideas for optimizing table lookup speed
+            (1) Detect & handle ASCII (<128) characters separately. *DONE*
+            (2) Split table into multiple lookup tables and search each
+                in order of descending likelihood of character match.
+        */
+
+    }
+
+    return outString;
+}