MeCab based Japanese word breaking for i18npool, compiled in when
ENABLE_MECAB is set.  When no MeCab tagger can be created the iterator
uses the "ja" xdictionary instead.

NOTE(review): the targets of the bare "#include" lines and the original
leading whitespace are missing from this copy of the patch; restore them
from the repository before applying.

Index: inc/breakiterator_cjk.hxx
===================================================================
RCS file: /cvs/l10n/i18npool/inc/breakiterator_cjk.hxx,v
--- inc/breakiterator_cjk.hxx 7 Sep 2005 16:46:55 -0000 1.5
+++ inc/breakiterator_cjk.hxx 2 Apr 2007 10:11:34 -0000
@@ -38,6 +38,10 @@
 #include
 #include
 
+#ifdef ENABLE_MECAB
+#include
+#endif
+
 namespace com { namespace sun { namespace star { namespace i18n {
 // ----------------------------------------------------
 // class BreakIterator_CJK
@@ -74,9 +78,31 @@
 
 #ifdef BREAKITERATOR_ALL
 BREAKITERATOR_CJK( zh )
-BREAKITERATOR_CJK( ja )
 BREAKITERATOR_CJK( ko )
-#endif
+
+#ifndef ENABLE_MECAB
+BREAKITERATOR_CJK( ja )
+#else // #ifndef ENABLE_MECAB
+// Japanese break iterator that segments words with a MeCab tagger and
+// uses the inherited dictionary when the tagger could not be created.
+class BreakIterator_ja : public BreakIterator_CJK {
+    MeCab::Tagger *tagger;
+public:
+    BreakIterator_ja ();
+    ~BreakIterator_ja ();
+    Boundary SAL_CALL nextWord( const rtl::OUString& Text, sal_Int32 nStartPos,
+        const com::sun::star::lang::Locale& nLocale, sal_Int16 WordType)
+        throw(com::sun::star::uno::RuntimeException);
+    Boundary SAL_CALL previousWord( const rtl::OUString& Text, sal_Int32 nStartPos,
+        const com::sun::star::lang::Locale& nLocale, sal_Int16 WordType)
+        throw(com::sun::star::uno::RuntimeException);
+    Boundary SAL_CALL getWordBoundary( const rtl::OUString& Text, sal_Int32 nPos,
+        const com::sun::star::lang::Locale& nLocale, sal_Int16 WordType, sal_Bool bDirection )
+        throw(com::sun::star::uno::RuntimeException);
+};
+#endif // #ifndef ENABLE_MECAB
+#endif // #ifdef BREAKITERATOR_ALL
+
 #undef BREAKITERATOR__CJK
 
 } } } }
Index: source/breakiterator/breakiterator_cjk.cxx
===================================================================
RCS file: /cvs/l10n/i18npool/source/breakiterator/breakiterator_cjk.cxx,v
--- source/breakiterator/breakiterator_cjk.cxx 24 Oct 2006 13:53:13 -0000 1.14
+++ source/breakiterator/breakiterator_cjk.cxx 2 Apr 2007 10:11:35 -0000
@@ -40,6 +40,15 @@
 #include
 #include
 
+#include
+#include
+
+#ifdef ENABLE_MECAB
+#include
+#include
+#include
+#endif
+
 using namespace ::com::sun::star::uno;
 using namespace ::com::sun::star::lang;
 using namespace ::rtl;
@@ -140,30 +149,225 @@
 }
 
 // ----------------------------------------------------
+// class BreakIterator_ko
+// ----------------------------------------------------;
+BreakIterator_ko::BreakIterator_ko()
+{
+    cBreakIterator = "com.sun.star.i18n.BreakIterator_ko";
+}
+
+BreakIterator_ko::~BreakIterator_ko()
+{
+}
+
+// ----------------------------------------------------
 // class BreakIterator_ja
 // ----------------------------------------------------;
+#ifndef ENABLE_MECAB
+
 BreakIterator_ja::BreakIterator_ja()
 {
         dict = new xdictionary("ja");
         dict->setJapaneseWordBreak();
         cBreakIterator = "com.sun.star.i18n.BreakIterator_ja";
 }
 
 BreakIterator_ja::~BreakIterator_ja()
 {
         delete dict;
 }
 
-// ----------------------------------------------------
-// class BreakIterator_ko
-// ----------------------------------------------------;
-BreakIterator_ko::BreakIterator_ko()
+#else // #ifndef ENABLE_MECAB
+
+// Number of UTF-16 code units in the surface form of a MeCab node.
+static sal_Int32 lcl_surfaceLength( const MeCab::Node *node )
+{
+    return OUString(node->surface, node->length,
+                    RTL_TEXTENCODING_UTF8,
+                    OSTRING_TO_OUSTRING_CVTFLAGS).getLength();
+}
+
+// Stores in rResult the boundary of the token that follows node; nEnd is the
+// offset at which node ends.  If nothing follows, or the next token would
+// start at or past nTextLen, an empty boundary at nTextLen is stored.
+static void lcl_setToFollowingToken( Boundary& rResult, const MeCab::Node *node,
+        sal_Int32 nEnd, sal_Int32 nTextLen )
+{
+    const MeCab::Node *next = node->next;
+    if( !next ) {
+        // Last node of the list: there is no token to move to.
+        rResult.startPos = rResult.endPos = nTextLen;
+        return;
+    }
+
+    // rlength - length is the whitespace MeCab skipped in front of the token.
+    // NOTE(review): that difference is counted in UTF-8 bytes; it matches the
+    // UTF-16 offset only for single-byte whitespace -- confirm.
+    const sal_Int32 nStart = nEnd + (next->rlength - next->length);
+    if( nStart >= nTextLen ) {
+        rResult.startPos = rResult.endPos = nTextLen;
+        return;
+    }
+
+    rResult.startPos = nStart;
+    // Measure the end from the start so the skipped whitespace is not cut
+    // off the end of the word.
+    rResult.endPos = nStart + lcl_surfaceLength( next );
+}
+
+BreakIterator_ja::BreakIterator_ja()
 {
-        cBreakIterator = "com.sun.star.i18n.BreakIterator_ko";
+    // Build the MeCab options: -r <installation>/program/mecabrc
+    OUString aBaseInstallURL;
+    utl::Bootstrap::locateBaseInstallation(aBaseInstallURL);
+    OUString aBaseInstallPath;
+    osl::FileBase::getSystemPathFromFileURL(aBaseInstallURL, aBaseInstallPath);
+
+    // NOTE(review): options are joined with spaces, so an installation path
+    // containing a space may be split by MeCab -- confirm.
+    OUString aMecabOpt =
+        OUString::createFromAscii("-r ") +
+        aBaseInstallPath +
+        OUString::createFromAscii("/program/mecabrc");
+
+    // OOO_MECAB_DIC_DIR, when set, is passed on as the -d option.
+    const char *mecab_dic_dir = getenv( "OOO_MECAB_DIC_DIR" );
+    if (mecab_dic_dir)
+    {
+        aMecabOpt = aMecabOpt +
+            OUString::createFromAscii(" -d ") +
+            OUString::createFromAscii(mecab_dic_dir);
+    }
+
+    tagger = MeCab::createTagger(OUStringToOString( aMecabOpt, RTL_TEXTENCODING_UTF8).getStr ());
+    if( !tagger )
+    {
+        // Report the reason on stderr, then fall back to the xdictionary.
+        const char *e = MeCab::getTaggerError ();
+        fprintf(stderr, "BreakIterator_ja: MeCab tagger not created: %s\n",
+                e ? e : "unknown error");
+        dict = new xdictionary("ja");
+        dict->setJapaneseWordBreak();
+    }
+
+    cBreakIterator = "com.sun.star.i18n.BreakIterator_ja";
 }
 
-BreakIterator_ko::~BreakIterator_ko()
+BreakIterator_ja::~BreakIterator_ja()
+{
+    // The constructor creates the dictionary only when no tagger exists.
+    if( tagger )
+        delete tagger;
+    else
+        delete dict;
+}
+
+// Word before anyPos: MeCab, else xdictionary, else BreakIterator_Unicode.
+Boundary SAL_CALL
+BreakIterator_ja::previousWord(const OUString& text, sal_Int32 anyPos,
+        const lang::Locale& nLocale, sal_Int16 wordType) throw(RuntimeException)
+{
+    if( tagger ) {
+        return getWordBoundary(text, anyPos, nLocale, wordType, sal_False);
+    } else if (dict) {
+        result = dict->previousWord(text.getStr(), anyPos, text.getLength(), wordType);
+        // #109813# for non-CJK, single character word, fallback to ICU breakiterator.
+        if (result.endPos - result.startPos != 1 ||
+                getScriptType(text, result.startPos) == ScriptType::ASIAN)
+            return result;
+        else
+            return BreakIterator_Unicode::getWordBoundary(text, result.startPos, nLocale, wordType, true);
+    }
+    return BreakIterator_Unicode::previousWord(text, anyPos, nLocale, wordType);
+}
+
+// Word after anyPos: MeCab, else xdictionary, else BreakIterator_Unicode.
+Boundary SAL_CALL
+BreakIterator_ja::nextWord(const OUString& text, sal_Int32 anyPos,
+        const lang::Locale& nLocale, sal_Int16 wordType) throw(RuntimeException)
 {
+    if( tagger ) {
+        const sal_Int32 len = text.getLength();
+        // Default: no further word, i.e. an empty boundary at the text end.
+        result.startPos = result.endPos = len;
+        if( anyPos >= len )
+            return result;
+
+        // Keep the UTF-8 copy alive while the node list is walked.
+        const OString aStr = OUStringToOString(text, RTL_TEXTENCODING_UTF8);
+        const MeCab::Node *node = tagger->parseToNode(aStr.getStr());
+
+        for( sal_Int32 cnt = 0; node; node = node->next ) {
+            // cnt advances to the offset at which this node ends.
+            cnt += lcl_surfaceLength( node ) + (node->rlength - node->length);
+            if( cnt > anyPos ) {
+                lcl_setToFollowingToken( result, node, cnt, len );
+                break;
+            }
+        }
+        return result;
+    } else if (dict) {
+        result = dict->nextWord(text.getStr(), anyPos, text.getLength(), wordType);
+        // #109813# for non-CJK, single character word, fallback to ICU breakiterator.
+        if (result.endPos - result.startPos != 1 ||
+                getScriptType(text, result.startPos) == ScriptType::ASIAN)
+            return result;
+        else
+            return BreakIterator_Unicode::getWordBoundary(text, result.startPos, nLocale, wordType, true);
+    }
+    return BreakIterator_Unicode::nextWord(text, anyPos, nLocale, wordType);
 }
 
+// Word containing anyPos (bDirection picks the side when anyPos is on a
+// token end): MeCab, else xdictionary, else BreakIterator_Unicode.
+Boundary SAL_CALL
+BreakIterator_ja::getWordBoundary( const OUString& text, sal_Int32 anyPos,
+        const lang::Locale& nLocale, sal_Int16 wordType, sal_Bool bDirection )
+        throw(RuntimeException)
+{
+    if( tagger ) {
+        const sal_Int32 len = text.getLength();
+
+        if( anyPos < 0 ||
+                (anyPos == 0 && !bDirection) )
+            result.startPos = result.endPos = 0;
+        else if( anyPos > len ||
+                (anyPos == len && bDirection) )
+            result.startPos = result.endPos = len;
+        else {
+            // Defined result even when no node reaches anyPos.
+            result.startPos = result.endPos = len;
+
+            // Keep the UTF-8 copy alive while the node list is walked.
+            const OString aStr = OUStringToOString(text, RTL_TEXTENCODING_UTF8);
+            const MeCab::Node *node = tagger->parseToNode( aStr.getStr() );
+
+            for( sal_Int32 cnt = 0; node; node = node->next ) {
+                const sal_Int32 nChunkLen = lcl_surfaceLength( node );
+                cnt += nChunkLen + (node->rlength - node->length);
+                if( cnt == anyPos && bDirection ) {
+                    // anyPos is a token end; forward selects the next token.
+                    lcl_setToFollowingToken( result, node, cnt, len );
+                    break;
+                } else if( cnt >= anyPos ) {
+                    result.startPos = cnt - nChunkLen;
+                    result.endPos = cnt;
+                    break;
+                }
+            }
+        }
+
+        return result;
+    } else if (dict) {
+        result = dict->getWordBoundary(text.getStr(), anyPos, text.getLength(), wordType, bDirection);
+        // #109813# for non-CJK, single character word, fallback to ICU breakiterator.
+        if (result.endPos - result.startPos != 1 ||
+                getScriptType(text, result.startPos) == ScriptType::ASIAN)
+            return result;
+    }
+    return BreakIterator_Unicode::getWordBoundary(text, anyPos, nLocale, wordType, bDirection);
+}
+
+#endif // #ifndef ENABLE_MECAB
+
 } } } }
Index: source/breakiterator/makefile.mk
===================================================================
RCS file: /cvs/l10n/i18npool/source/breakiterator/makefile.mk,v
--- source/breakiterator/makefile.mk 25 Jan 2007 15:25:25 -0000 1.14
+++ source/breakiterator/makefile.mk 2 Apr 2007 10:11:35 -0000
@@ -72,6 +72,10 @@
 
 APP1STDLIBS = $(SALLIB)
 
+.IF "$(ENABLE_MECAB)" != ""
+ENVCFLAGS+= -DENABLE_MECAB
+.ENDIF # "$(ENABLE_MECAB)" != ""
+
 # --- Targets ------------------------------------------------------
 
 .INCLUDE : target.mk
Index: source/localedata/LocaleNode.cxx
===================================================================
RCS file: /cvs/l10n/i18npool/source/localedata/LocaleNode.cxx,v
--- source/localedata/LocaleNode.cxx 25 Jan 2007 09:35:57 -0000 1.26
+++ source/localedata/LocaleNode.cxx 2 Apr 2007 10:11:36 -0000
@@ -38,7 +38,8 @@
 #include
 #include
 #include
-#include
+// ad-hoc comment out against compilation error
+//#include
 #include
 #include
 
Index: util/makefile.mk
===================================================================
RCS file: /cvs/l10n/i18npool/util/makefile.mk,v
--- util/makefile.mk 19 Dec 2006 18:05:56 -0000 1.21
+++ util/makefile.mk 2 Apr 2007 10:11:36 -0000
@@ -77,7 +77,12 @@
 	$(CPPULIB) \
 	$(SALLIB) \
 	$(ICUINLIB) \
-	$(ICUUCLIB)
+	$(ICUUCLIB) \
+	$(UNOTOOLSLIB)
+
+.IF "$(ENABLE_MECAB)" != ""
+ENVLINKFLAGS+= -lmecab
+.ENDIF # "$(ENABLE_MECAB)" != ""
 
 # --- Targets ------------------------------------------------------------
 