From 7740e825a683a9cc84f8422c94109c5fcc4beb8e Mon Sep 17 00:00:00 2001 From: Michele Calgaro Date: Sat, 17 Aug 2024 22:26:29 +0900 Subject: kjs: use libpcre2 instead of libpcre Signed-off-by: Michele Calgaro --- kjs/CMakeLists.txt | 6 +- kjs/Makefile.am | 6 +- kjs/configure.in.in | 51 ++++++------ kjs/regexp.cpp | 229 ++++++++++++++++++++++++---------------------------- kjs/regexp.h | 21 ++--- 5 files changed, 149 insertions(+), 164 deletions(-) (limited to 'kjs') diff --git a/kjs/CMakeLists.txt b/kjs/CMakeLists.txt index 8e9b16849..c74bf1d5b 100644 --- a/kjs/CMakeLists.txt +++ b/kjs/CMakeLists.txt @@ -15,12 +15,12 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_BINARY_DIR} ${CMAKE_SOURCE_DIR}/tdecore - ${LIBPCRE_INCLUDEDIR} + ${LIBPCRE2_INCLUDEDIR} ) link_directories( ${TDECORE_LIBRARY_DIRS} - ${LIBPCRE_LIBDIR} + ${LIBPCRE2_LIBDIR} ) @@ -62,6 +62,6 @@ tde_add_library( ${target} SHARED SOURCES ${${target}_SRCS} VERSION 1.2.0 LINK tdecore-shared - LINK_PRIVATE ${LIBPCRE_LIBRARIES} + LINK_PRIVATE ${LIBPCRE2_LIBRARIES} DESTINATION ${LIB_INSTALL_DIR} ) diff --git a/kjs/Makefile.am b/kjs/Makefile.am index 89f937906..8dc05f656 100644 --- a/kjs/Makefile.am +++ b/kjs/Makefile.am @@ -17,7 +17,7 @@ # Boston, MA 02110-1301, USA. 
YACC = bison -INCLUDES = $(PCRECFLAGS) $(all_includes) +INCLUDES = $(PCRE2CFLAGS) $(all_includes) lib_LTLIBRARIES = libkjs.la @@ -50,7 +50,7 @@ endif libkjs_la_LDFLAGS = -version-info 3:0:2 -no-undefined $(VSCRIPT) \ $(USER_LDFLAGS) $(all_libraries) -libkjs_la_LIBADD = -lm $(LIBPCRE) +libkjs_la_LIBADD = -lm $(LIBPCRE2) EXTRA_DIST = grammar.y @@ -93,7 +93,7 @@ CLEANFILES = $(LUT_FILES) ## test program (in one program for easier profiling/memory debugging) EXTRA_PROGRAMS = testkjs_static testkjs_static_SOURCES = testkjs.cpp -testkjs_static_LDADD = $(LIBPCRE) libkjs.la +testkjs_static_LDADD = $(LIBPCRE2) libkjs.la testkjs_static_LDFLAGS = -static ## test program (linked to libkjs) diff --git a/kjs/configure.in.in b/kjs/configure.in.in index 1c4d3ac52..4e6bd9742 100644 --- a/kjs/configure.in.in +++ b/kjs/configure.in.in @@ -2,52 +2,55 @@ dnl KDE JavaScript specific configure tests AC_CHECK_HEADERS(ieeefp.h float.h) -AC_DEFUN([AC_CHECK_PCREPOSIX], +AC_DEFUN(AC_CHECK_PCRE2POSIX], [ - dnl define the configure option that disables pcre - AC_ARG_ENABLE(pcre,AC_HELP_STRING([--disable-pcre],[don't require libpcre (poor RegExp support in Javascript)]), - with_pcre=$enableval, with_pcre=yes) + dnl define the configure option that disables pcre2 + AC_ARG_ENABLE(pcre2,AC_HELP_STRING([--disable-pcre],[don't require libpcre (poor RegExp support in Javascript)]), + with_pcre2=$enableval, with_pcre2=yes) - if test "$with_pcre" = "yes"; then + if test "$with_pcre2" = "yes"; then - KDE_FIND_PATH(pcre-config, PCRE_CONFIG, [${exec_prefix}/bin ${prefix}/bin], [PCRE_CONFIG="" ]) - if test -n "$PCRE_CONFIG" && $PCRE_CONFIG --libs >/dev/null 2>&1; then - LIBPCRE=`$PCRE_CONFIG --libs-posix | sed -e "s,-L/usr/lib ,," -e "s,[\b-].\+pcreposix[^[:space:]]*\b,,"` - PCRECFLAGS=`$PCRE_CONFIG --cflags` + KDE_FIND_PATH(pcre2-config, PCRE2_CONFIG, [${exec_prefix}/bin ${prefix}/bin], [PCRE2_CONFIG="" ]) + if test -n "$PCRE2_CONFIG" && $PCRE2_CONFIG --libs8 >/dev/null 2>&1; then + LIBPCRE2=`$PCRE2_CONFIG 
--libs-posix | sed -e "s,-L/usr/lib ,," -e "s,[\b-].\+pcreposix[^[:space:]]*\b,,"` + PCRE2CFLAGS=`$PCRE2_CONFIG --cflags` else - LIBPCRE="-lpcre" - PCRECFLAGS= + LIBPCRE2="-lpcre2-8" + PCRE2CFLAGS= fi - AC_CACHE_VAL(ac_cv_have_pcreposix, [ + AC_CACHE_VAL(ac_cv_have_pcre2posix, [ ac_save_libs="$LIBS" - LIBS="$LIBPCRE" + LIBS="$LIBPCRE2" ac_CPPFLAGS_save="$CPPFLAGS" - CPPFLAGS="$CPPFLAGS $PCRECFLAGS $all_includes" + CPPFLAGS="$CPPFLAGS $PCRE2CFLAGS $all_includes" ac_LDFLAGS_save="$LDFLAGS" LDFLAGS="$LDFLAGS $all_libraries" AC_TRY_LINK( - [#include <pcreposix.h>], - [regfree(0);], - [ac_cv_have_pcreposix="yes"], - [ac_cv_have_pcreposix="no"] + [ + #define PCRE2_CODE_UNIT_WIDTH 8 + #include <pcre2posix.h> + ], + [pcre2_regfree(0);], + [ac_cv_have_pcre2posix="yes"], + [ac_cv_have_pcre2posix="no"] ) LIBS="$ac_save_libs" LDFLAGS="$ac_LDFLAGS_save" CPPFLAGS="$ac_CPPFLAGS_save" ]) - if test "$ac_cv_have_pcreposix" = "yes"; then - AC_DEFINE(HAVE_PCREPOSIX, 1, [Define if you have pcreposix libraries and header files.]) + if test "$ac_cv_have_pcre2posix" = "yes"; then + AC_DEFINE(HAVE_PCRE2POSIX, 1, [Define if you have pcre2posix libraries and header files.]) else AC_MSG_ERROR([You're missing libpcre. -Download libpcre from http://www.pcre.org or find a binary package for your platform. +Download libpcre2 from http://www.pcre.org or find a binary package for your platform. 
Alternatively, you can specify --disable-pcre, but some web pages - using regular expressions in Javascript code - will not work correctly, the regexp support being quite limited if libpcre isn't present.]) fi fi ]) -AC_CHECK_PCREPOSIX -AC_SUBST(LIBPCRE) -AC_SUBST(PCRECFLAGS) +AC_CHECK_PCRE2POSIX +AC_SUBST(LIBPCRE2) +AC_SUBST(PCRE2CFLAGS) AM_CONFIG_HEADER([kjs/global.h]) diff --git a/kjs/regexp.cpp b/kjs/regexp.cpp index 0c2675588..a693fdc1a 100644 --- a/kjs/regexp.cpp +++ b/kjs/regexp.cpp @@ -30,21 +30,17 @@ using namespace KJS; -#ifdef PCRE_CONFIG_UTF8 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown; -#endif RegExp::RegExp(const UString &p, int f) : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0) { // Determine whether libpcre has unicode support if need be.. -#ifdef PCRE_CONFIG_UTF8 if (utf8Support == Unknown) { - int supported; - pcre_config(PCRE_CONFIG_UTF8, (void*)&supported); - utf8Support = supported ? Supported : Unsupported; + uint32_t supported; + pcre2_config(PCRE2_CONFIG_COMPILED_WIDTHS, (void*)&supported); + utf8Support = (supported & 0x0001) ? Supported : Unsupported; } -#endif nrSubPatterns = 0; // determined in match() with POSIX regex. @@ -63,33 +59,33 @@ RegExp::RegExp(const UString &p, int f) escape = false; // we only care about \u if (c == 'u') { - // standard unicode escape sequence looks like \uxxxx but - // other browsers also accept less then 4 hex digits - unsigned short u = 0; - int j = 0; - for (j = 0; j < 4; ++j) { - if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) { - u = (u << 4) + Lexer::convertHex(p[i + 1].unicode()); - ++i; - } else { - // sequence incomplete. restore index. - // TODO: cleaner way to propagate warning - fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j); - i -= j; - break; - } - } - if (j < 4) { - // sequence was incomplete. treat \u as u which IE always - // and FF sometimes does. 
- intern.append(UString('u')); - } else { + // standard unicode escape sequence looks like \uxxxx but + // other browsers also accept less then 4 hex digits + unsigned short u = 0; + int j = 0; + for (j = 0; j < 4; ++j) { + if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) { + u = (u << 4) + Lexer::convertHex(p[i + 1].unicode()); + ++i; + } else { + // sequence incomplete. restore index. + // TODO: cleaner way to propagate warning + fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j); + i -= j; + break; + } + } + if (j < 4) { + // sequence was incomplete. treat \u as u which IE always + // and FF sometimes does. + intern.append(UString('u')); + } else { c = UChar(u); switch (u) { case 0: - // Make sure to encode 0, to avoid terminating the string - intern += UString(nil); - break; + // Make sure to encode 0, to avoid terminating the string + intern += UString(nil); + break; case '^': case '$': case '\\': @@ -101,13 +97,13 @@ RegExp::RegExp(const UString &p, int f) case '{': case '}': case '[': case ']': case '|': - // escape pattern characters have to remain escaped - intern.append(UString('\\')); - // intentional fallthrough + // escape pattern characters have to remain escaped + intern.append(UString('\\')); + // intentional fallthrough default: - intern += UString(&c, 1); - break; - } + intern += UString(&c, 1); + break; + } } continue; } @@ -126,45 +122,46 @@ RegExp::RegExp(const UString &p, int f) intern = p; } -#ifdef HAVE_PCREPOSIX - int pcreflags = 0; - const char *perrormsg; - int errorOffset; +#ifdef HAVE_PCRE2POSIX + uint32_t pcre2flags = 0; + int errorCode; + PCRE2_SIZE errorOffset; if (flgs & IgnoreCase) - pcreflags |= PCRE_CASELESS; + pcre2flags |= PCRE2_CASELESS; if (flgs & Multiline) - pcreflags |= PCRE_MULTILINE; + pcre2flags |= PCRE2_MULTILINE; -#ifdef PCRE_CONFIG_UTF8 if (utf8Support == Supported) - pcreflags |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK); -#endif + pcre2flags |= (PCRE2_UTF | PCRE2_NO_UTF_CHECK); // Fill our buffer with an 
encoded version, whether utf-8, or, // if PCRE is incapable, truncated. prepareMatch(intern); - pcregex = pcre_compile(buffer, pcreflags, - &perrormsg, &errorOffset, NULL); + pcregex = pcre2_compile(buffer, PCRE2_ZERO_TERMINATED, pcre2flags, + &errorCode, &errorOffset, NULL); doneMatch(); // Cleanup buffers if (!pcregex) { #ifndef NDEBUG - fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg); + PCRE2_UCHAR errorMsg[256]; + pcre2_get_error_message(errorCode, errorMsg, sizeof(errorMsg)); + fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMsg); #endif valid = false; return; } -#ifdef PCRE_INFO_CAPTURECOUNT // Get number of subpatterns that will be returned - int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns); + int rc = pcre2_pattern_info(pcregex, PCRE2_INFO_CAPTURECOUNT, &nrSubPatterns); if (rc != 0) -#endif + { nrSubPatterns = 0; // fallback. We always need the first pair of offsets. + } -#else /* HAVE_PCREPOSIX */ + match_data = pcre2_match_data_create_from_pattern(pcregex, NULL); +#else int regflags = 0; #ifdef REG_EXTENDED @@ -195,9 +192,15 @@ RegExp::RegExp(const UString &p, int f) RegExp::~RegExp() { doneMatch(); // Be 100% sure buffers are freed -#ifdef HAVE_PCREPOSIX +#ifdef HAVE_PCRE2POSIX + if (match_data) + { + pcre2_match_data_free(match_data); + } if (pcregex) - pcre_free(pcregex); + { + pcre2_code_free(pcregex); + } #else /* TODO: is this really okay after an error ? */ regfree(&preg); @@ -208,7 +211,7 @@ void RegExp::prepareUtf8(const UString& s) { // Allocate a buffer big enough to hold all the characters plus \0 const int length = s.size(); - buffer = new char[length * 3 + 1]; + buffer = new buftype_t[length * 3 + 1]; // Also create buffer for positions. 
We need one extra character in there, // even past the \0 since the non-empty handling may jump one past the end @@ -217,7 +220,7 @@ void RegExp::prepareUtf8(const UString& s) // Convert to runs of 8-bit characters, and generate indeces // Note that we do NOT combine surrogate pairs here, as // regexps operate on them as separate characters - char *p = buffer; + buftype_t *p = buffer; int *posOut = originalPos; const UChar *d = s.data(); for (int i = 0; i != length; ++i) { @@ -225,16 +228,16 @@ void RegExp::prepareUtf8(const UString& s) int sequenceLen; if (c < 0x80) { - *p++ = (char)c; + *p++ = (buftype_t)c; sequenceLen = 1; } else if (c < 0x800) { - *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8 - *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set + *p++ = (buftype_t)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8 + *p++ = (buftype_t)((c | 0x80) & 0xBF); // next 6 bits, with high bit set sequenceLen = 2; } else { - *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8 - *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set - *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set + *p++ = (buftype_t)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8 + *p++ = (buftype_t)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set + *p++ = (buftype_t)((c | 0x80) & 0xBF); // next 6 bits, with high bit set sequenceLen = 3; } @@ -262,7 +265,7 @@ void RegExp::prepareASCII (const UString& s) // when we don't have utf 8 available -- use // truncated version, and pray for the best CString truncated = s.cstring(); - buffer = new char[truncated.size() + 1]; + buffer = new buftype_t[truncated.size() + 1]; memcpy(buffer, truncated.c_str(), truncated.size()); buffer[truncated.size()] = '\0'; // For _compile use bufferSize = truncated.size(); @@ -272,11 +275,9 @@ void RegExp::prepareMatch(const UString &s) { delete[] originalPos; // Just to be sure.. 
delete[] buffer; -#ifdef PCRE_CONFIG_UTF8 if (utf8Support == Supported) prepareUtf8(s); else -#endif prepareASCII(s); #ifndef NDEBUG @@ -308,17 +309,16 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector) if (i > s.size() || s.isNull()) return UString::null; -#ifdef HAVE_PCREPOSIX - int ovecsize = (nrSubPatterns+1)*3; // see pcre docu - if (ovector) *ovector = new int[ovecsize]; - if (!pcregex) +#ifdef HAVE_PCRE2POSIX + if (!pcregex || !match_data) + return UString::null; + if (!ovector) return UString::null; int startPos; int nextPos; - -#ifdef PCRE_CONFIG_UTF8 - if (utf8Support == Supported) { + if (utf8Support == Supported) + { startPos = i; while (originalPos[startPos] < i) ++startPos; @@ -328,53 +328,59 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector) while (originalPos[nextPos] < (i + 1)) ++nextPos; } - } else -#endif + } + else { startPos = i; nextPos = i + (i < s.size() ? 1 : 0); } - int baseFlags = -#ifdef PCRE_CONFIG_UTF8 - utf8Support == Supported ? PCRE_NO_UTF8_CHECK : -#endif - 0; - int numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, startPos, - m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED | baseFlags) : baseFlags, // see man pcretest - ovector ? *ovector : 0L, ovecsize); - if (numMatches < 0) + uint32_t baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0); + if (m_notEmpty) + { + baseFlags |= PCRE2_NOTEMPTY | PCRE2_ANCHORED; + } + int numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, startPos, baseFlags, match_data, NULL); + if (numMatches <= 0) { // Failed to match. - if (numMatches == PCRE_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && ovector && startPos < nextPos) + if (numMatches == PCRE2_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && startPos < nextPos) { // We set m_notEmpty ourselves, to look for a non-empty match - // (see man pcretest or pcretest.c for details). // So we don't stop here, we want to try again at i+1. 
#ifdef KJS_VERBOSE fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n"); #endif m_notEmpty = 0; - numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, baseFlags, - ovector ? *ovector : 0L, ovecsize); - if (numMatches < 0) + baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0); + numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, nextPos, baseFlags, match_data, NULL); + if (numMatches <= 0) return UString::null; } - else // done + else return UString::null; } - // Got a match, proceed with it. - // But fix up the ovector if need be.. - if (ovector && originalPos) { - for (unsigned c = 0; c < 2 * TQMIN((unsigned)numMatches, nrSubPatterns+1); ++c) { - if ((*ovector)[c] != -1) - (*ovector)[c] = originalPos[(*ovector)[c]]; + PCRE2_SIZE *pcre2_ovector = pcre2_get_ovector_pointer(match_data); + if (!pcre2_ovector) + return UString::null; + + uint32_t pcre2_ovecCount = pcre2_get_ovector_count(match_data); + *ovector = new int[pcre2_ovecCount * 2]; + if (originalPos) + { + for (size_t c = 0; c < 2 * pcre2_ovecCount; ++c) + { + (*ovector)[c] = (pcre2_ovector[c] != -1) ? 
originalPos[pcre2_ovector[c]] : -1; + } + } + else + { + for (size_t c = 0; c < 2 * pcre2_ovecCount; ++c) + { + (*ovector)[c] = pcre2_ovector[c]; } } - - if (!ovector) - return UString::null; // don't rely on the return value if you pass ovector==0 #else const uint maxMatch = 10; regmatch_t rmatch[maxMatch]; @@ -419,28 +425,3 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector) } return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]); } - -#if 0 // unused -bool RegExp::test(const UString &s, int) -{ -#ifdef HAVE_PCREPOSIX - int ovector[300]; - CString buffer(s.cstring()); - - if (s.isNull() || - pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0, - 0, ovector, 300) == PCRE_ERROR_NOMATCH) - return false; - else - return true; - -#else - - char *str = strdup(s.ascii()); - int r = regexec(&preg, str, 0, 0, 0); - free(str); - - return r == 0; -#endif -} -#endif diff --git a/kjs/regexp.h b/kjs/regexp.h index 88851260e..e731eb714 100644 --- a/kjs/regexp.h +++ b/kjs/regexp.h @@ -25,13 +25,16 @@ #include "config.h" -#ifdef HAVE_PCREPOSIX -#include <pcre.h> +#ifdef HAVE_PCRE2POSIX +#define PCRE2_CODE_UNIT_WIDTH 8 +#include <pcre2.h> +typedef PCRE2_UCHAR8 buftype_t; #else // POSIX regex - not so good... extern "C" { // bug with some libc5 distributions #include <regex.h> +typedef char buftype_t; } -#endif //HAVE_PCREPOSIX +#endif #include "ustring.h" @@ -61,7 +64,7 @@ namespace KJS { bool valid; // Cached encoding info... - char* buffer; + buftype_t *buffer; int* originalPos; int bufferSize; @@ -71,22 +74,20 @@ namespace KJS { UString originalS; // the original string, used for sanity-checking #endif -#ifndef HAVE_PCREPOSIX +#ifndef HAVE_PCRE2POSIX regex_t preg; #else - pcre *pcregex; + pcre2_code *pcregex; + pcre2_match_data *match_data; enum UTF8SupportState { Unknown, Supported, Unsupported }; - -#ifdef PCRE_CONFIG_UTF8 static UTF8SupportState utf8Support; #endif -#endif - unsigned int nrSubPatterns; + uint32_t nrSubPatterns; RegExp(); }; -- cgit v1.2.1