author     toma <toma@283d02a7-25f6-0310-bc7c-ecb5cbfe19da>  2009-11-25 17:56:58 +0000
committer  toma <toma@283d02a7-25f6-0310-bc7c-ecb5cbfe19da>  2009-11-25 17:56:58 +0000
commit     460c52653ab0dcca6f19a4f492ed2c5e4e963ab0 (patch)
tree       67208f7c145782a7e90b123b982ca78d88cc2c87 /indexlib/tests/tokenizer-test.cpp
download   tdepim-460c52653ab0dcca6f19a4f492ed2c5e4e963ab0.tar.gz
           tdepim-460c52653ab0dcca6f19a4f492ed2c5e4e963ab0.zip
Copy the KDE 3.5 branch to branches/trinity for new KDE 3.5 features.
BUG:215923
git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdepim@1054174 283d02a7-25f6-0310-bc7c-ecb5cbfe19da
Diffstat (limited to 'indexlib/tests/tokenizer-test.cpp')
-rw-r--r--  indexlib/tests/tokenizer-test.cpp  74
1 file changed, 74 insertions, 0 deletions
diff --git a/indexlib/tests/tokenizer-test.cpp b/indexlib/tests/tokenizer-test.cpp
new file mode 100644
index 000000000..372859d90
--- /dev/null
+++ b/indexlib/tests/tokenizer-test.cpp
@@ -0,0 +1,74 @@
+#include <boost/test/unit_test.hpp>
+#include "tokenizer.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <vector>
+
+using namespace ::boost::unit_test;
+namespace indexlib { namespace tests { namespace tokenizer_test {
+
+using indexlib::detail::tokenizer;
+using indexlib::detail::get_tokenizer;
+
+void simple() {
+	std::auto_ptr<tokenizer> tokenizer = get_tokenizer( "latin-1:european" );
+	assert( tokenizer.get() );
+	std::vector<std::string> tokens = tokenizer->string_to_words( "one ,as, ''#`:ThReE, ááàçé" );
+	std::vector<std::string> expected;
+	expected.push_back( "ONE" );
+	expected.push_back( "AS" );
+	expected.push_back( "THREE" );
+	expected.push_back( "AAACE" );
+	std::sort( tokens.begin(), tokens.end() );
+	std::sort( expected.begin(), expected.end() );
+	BOOST_CHECK_EQUAL( expected.size(), tokens.size() );
+	for ( std::size_t i = 0; i < expected.size() && i < tokens.size(); ++i ) {
+		BOOST_CHECK_EQUAL( expected[ i ], tokens[ i ] );
+	}
+}
+
+void with_newlines() {
+	std::auto_ptr<tokenizer> tokenizer = get_tokenizer( "latin-1:european" );
+	assert( tokenizer.get() );
+	std::vector<std::string> tokens = tokenizer->string_to_words( "one\ntwo\nthree" );
+	std::vector<std::string> expected;
+	expected.push_back( "ONE" );
+	expected.push_back( "TWO" );
+	expected.push_back( "THREE" );
+	std::sort( tokens.begin(), tokens.end() );
+	std::sort( expected.begin(), expected.end() );
+	BOOST_CHECK_EQUAL( expected.size(), tokens.size() );
+	for ( std::size_t i = 0; i < expected.size() && i < tokens.size(); ++i ) {
+		BOOST_CHECK_EQUAL( expected.at( i ), tokens.at( i ) );
+	}
+}
+
+void with_numbers() {
+	std::auto_ptr<tokenizer> tokenizer = get_tokenizer( "latin-1:european" );
+	assert( tokenizer.get() );
+	std::vector<std::string> tokens = tokenizer->string_to_words( "one 012 123 four" );
+	std::vector<std::string> expected;
+	expected.push_back( "ONE" );
+	expected.push_back( "012" );
+	expected.push_back( "123" );
+	expected.push_back( "FOUR" );
+	std::sort( tokens.begin(), tokens.end() );
+	std::sort( expected.begin(), expected.end() );
+	BOOST_CHECK_EQUAL( expected.size(), tokens.size() );
+	for ( std::size_t i = 0; i < expected.size() && i < tokens.size(); ++i ) {
+		BOOST_CHECK_EQUAL( expected.at( i ), tokens.at( i ) );
+	}
+}
+
+test_suite* get_suite() {
+	test_suite* test = BOOST_TEST_SUITE( "Tokenizer tests" );
+	test->add( BOOST_TEST_CASE( &simple ) );
+	test->add( BOOST_TEST_CASE( &with_newlines ) );
+	test->add( BOOST_TEST_CASE( &with_numbers ) );
+	return test;
+}
+
+}}} //namespaces
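
Note: get_suite() does not run by itself; it has to be hooked into Boost.Test's master suite by a test driver, which lives elsewhere in indexlib/tests and is not part of this diff. A minimal, hypothetical driver sketch (the entry-point wiring below is an assumption based on the manual-init Boost.Test style of this era, not taken from this commit) could look like:

#include <boost/test/unit_test.hpp>

// Forward declaration of the suite factory defined in tokenizer-test.cpp.
namespace indexlib { namespace tests { namespace tokenizer_test {
	boost::unit_test::test_suite* get_suite();
}}}

// Manual-init Boost.Test entry point (hypothetical, not from this commit):
// add the tokenizer suite to the master suite and return 0.
boost::unit_test::test_suite*
init_unit_test_suite( int /*argc*/, char* /*argv*/[] ) {
	boost::unit_test::framework::master_test_suite().add(
			indexlib::tests::tokenizer_test::get_suite() );
	return 0;
}

Such a driver would be compiled together with tokenizer-test.cpp and linked against the Boost unit test framework library (e.g. -lboost_unit_test_framework).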