View | Details | Raw Unified | Return to issue 74034
Collapse All | Expand All

(-)inc/breakiterator_cjk.hxx (-2 / +26 lines)
Lines 38-43 Link Here
38
#include <breakiterator_unicode.hxx>
38
#include <breakiterator_unicode.hxx>
39
#include <xdictionary.hxx>
39
#include <xdictionary.hxx>
40
40
41
#ifdef ENABLE_MECAB
42
#include <mecab/mecab.h>
43
#endif
44
41
namespace com { namespace sun { namespace star { namespace i18n {
45
namespace com { namespace sun { namespace star { namespace i18n {
42
//	----------------------------------------------------
46
//	----------------------------------------------------
43
//	class BreakIterator_CJK
47
//	class BreakIterator_CJK
Lines 74-82 Link Here
74
78
75
#ifdef BREAKITERATOR_ALL
79
#ifdef BREAKITERATOR_ALL
76
BREAKITERATOR_CJK( zh )
80
BREAKITERATOR_CJK( zh )
77
BREAKITERATOR_CJK( ja )
78
BREAKITERATOR_CJK( ko )
81
BREAKITERATOR_CJK( ko )
79
#endif
82
83
#ifndef ENABLE_MECAB
84
BREAKITERATOR_CJK( ja )
85
#else // #ifndef ENABLE_MECAB
86
class BreakIterator_ja : public BreakIterator_CJK {
87
    MeCab::Tagger *tagger;
88
public:
89
    BreakIterator_ja ();
90
    ~BreakIterator_ja ();
91
	Boundary SAL_CALL nextWord( const rtl::OUString& Text, sal_Int32 nStartPos,
92
		    const com::sun::star::lang::Locale& nLocale, sal_Int16 WordType)
93
		    throw(com::sun::star::uno::RuntimeException);
94
	Boundary SAL_CALL previousWord( const rtl::OUString& Text, sal_Int32 nStartPos,
95
		    const com::sun::star::lang::Locale& nLocale, sal_Int16 WordType)
96
		    throw(com::sun::star::uno::RuntimeException);
97
	Boundary SAL_CALL getWordBoundary( const rtl::OUString& Text, sal_Int32 nPos,
98
		    const com::sun::star::lang::Locale& nLocale, sal_Int16 WordType, sal_Bool bDirection )
99
		    throw(com::sun::star::uno::RuntimeException);
100
};
101
#endif // #ifndef ENABLE_MECAB
102
#endif // #ifdef BREAKITERATOR_ALL
103
80
#undef BREAKITERATOR__CJK
104
#undef BREAKITERATOR__CJK
81
105
82
} } } }
106
} } } }
(-)source/breakiterator/breakiterator_cjk.cxx (-9 / +193 lines)
Lines 40-45 Link Here
40
#include <breakiterator_cjk.hxx>
40
#include <breakiterator_cjk.hxx>
41
#include <i18nutil/unicode.hxx>
41
#include <i18nutil/unicode.hxx>
42
42
43
#include <rtl/string.hxx>
44
#include <rtl/strbuf.hxx>
45
46
#ifdef ENABLE_MECAB
47
#include <unotools/bootstrap.hxx>
48
#include <osl/file.hxx>
49
#include <mecab/mecab.h>
50
#endif
51
43
using namespace ::com::sun::star::uno;
52
using namespace ::com::sun::star::uno;
44
using namespace ::com::sun::star::lang;
53
using namespace ::com::sun::star::lang;
45
using namespace ::rtl;
54
using namespace ::rtl;
Lines 140-169 Link Here
140
}
149
}
141
150
142
//      ----------------------------------------------------
151
//      ----------------------------------------------------
152
//      class BreakIterator_ko
153
//      ----------------------------------------------------;
154
BreakIterator_ko::BreakIterator_ko()
155
{
156
        cBreakIterator = "com.sun.star.i18n.BreakIterator_ko";
157
}
158
159
BreakIterator_ko::~BreakIterator_ko()
160
{
161
}
162
163
//      ----------------------------------------------------
143
//      class BreakIterator_ja
164
//      class BreakIterator_ja
144
//      ----------------------------------------------------;
165
//      ----------------------------------------------------;
166
#ifndef ENABLE_MECAB
167
145
BreakIterator_ja::BreakIterator_ja()
168
BreakIterator_ja::BreakIterator_ja()
146
{
169
{
147
        dict = new xdictionary("ja");
170
	dict = new xdictionary("ja");
148
        dict->setJapaneseWordBreak();
171
	dict->setJapaneseWordBreak();
149
        cBreakIterator = "com.sun.star.i18n.BreakIterator_ja";
172
        cBreakIterator = "com.sun.star.i18n.BreakIterator_ja";
150
}
173
}
151
174
152
// Destructor for the dictionary-only (non-MeCab) Japanese break iterator.
BreakIterator_ja::~BreakIterator_ja()
{
    // The constructor allocates the dictionary with new, so it must be
    // released here; dropping this delete leaks the xdictionary instance.
    delete dict;
}
156
178
157
//      ----------------------------------------------------
179
#else  // #ifndef ENABLE_MECAB
158
//      class BreakIterator_ko
180
159
//      ----------------------------------------------------;
181
// Constructor for the MeCab-backed Japanese break iterator.
// Builds the MeCab option string ("-r <install>/program/mecabrc", plus
// " -d <dir>" when OOO_MECAB_DIC_DIR is set), creates the tagger, and falls
// back to the "ja" xdictionary when the tagger cannot be created.
BreakIterator_ja::BreakIterator_ja()
{
    // Locate the installation and convert its URL to a system path.
    OUString aInstallURL;
    utl::Bootstrap::locateBaseInstallation(aInstallURL);
    OUString aInstallPath;
    osl::FileBase::getSystemPathFromFileURL(aInstallURL, aInstallPath);

    // Resource file shipped with the installation.
    OUString aOptions = OUString::createFromAscii("-r ");
    aOptions += aInstallPath;
    aOptions += OUString::createFromAscii("/program/mecabrc");

    // Optional dictionary directory override from the environment.
    const char *pDicDir = getenv( "OOO_MECAB_DIC_DIR" );
    if (pDicDir)
    {
        aOptions += OUString::createFromAscii(" -d ");
        aOptions += OUString::createFromAscii(pDicDir);
    }

    const OString aOptionsUtf8 = OUStringToOString( aOptions, RTL_TEXTENCODING_UTF8);
    tagger = MeCab::createTagger(aOptionsUtf8.getStr ());

    if( !tagger )
    {
        // Report why MeCab refused to start, then fall back to the
        // dictionary-based word breaker.
        const char *pError = MeCab::getTaggerError ();
        printf("Exception: %s\n", pError);
        printf("Fallback!?\n");
        dict = new xdictionary("ja");
        dict->setJapaneseWordBreak();
    }

    cBreakIterator = "com.sun.star.i18n.BreakIterator_ja";
}
164
219
165
BreakIterator_ko::~BreakIterator_ko()
220
// Destructor for the MeCab-backed Japanese break iterator.
// Exactly one backend is released: the tagger when it exists, otherwise
// the fallback dictionary.
BreakIterator_ja::~BreakIterator_ja()
{
    if( !tagger )
    {
        // No tagger was created, so the fallback dictionary is what we own.
        delete dict;
        return;
    }

    delete tagger;
}
227
228
// Returns the boundary of the word before anyPos.
// With a MeCab tagger the lookup is delegated to getWordBoundary() in the
// backward direction; otherwise the dictionary is used, and without a
// dictionary the generic Unicode break iterator answers.
Boundary SAL_CALL
BreakIterator_ja::previousWord(const OUString& text, sal_Int32 anyPos,
        const lang::Locale& nLocale, sal_Int16 wordType) throw(RuntimeException)
{
    if( tagger )
        return getWordBoundary(text, anyPos, nLocale, wordType, sal_False);

    if( !dict )
        return BreakIterator_Unicode::previousWord(text, anyPos, nLocale, wordType);

    result = dict->previousWord(text.getStr(), anyPos, text.getLength(), wordType);

    // #109813# for non-CJK, single character word, fallback to ICU breakiterator.
    const bool bSingleNonAsianChar =
        (result.endPos - result.startPos == 1) &&
        (getScriptType(text, result.startPos) != ScriptType::ASIAN);
    if( bSingleNonAsianChar )
        return BreakIterator_Unicode::getWordBoundary(text, result.startPos, nLocale, wordType, true);

    return result;
}
246
247
// Returns the boundary of the word that follows the word containing anyPos.
//
// With a MeCab tagger, the text is converted to UTF-8 and tokenised; the
// node list is walked while tracking the UTF-16 offset of each node's end.
// Without a tagger the dictionary is used, and without a dictionary the
// generic Unicode break iterator answers.
//
// If anyPos is at or past the end of the text, or no following word exists,
// both startPos and endPos are set to the text length.
Boundary SAL_CALL
BreakIterator_ja::nextWord(const OUString& text, sal_Int32 anyPos,
        const lang::Locale& nLocale, sal_Int16 wordType) throw(RuntimeException)
{
    if( tagger ) {
        const sal_Int32 len = text.getLength();

        // Default answer: an empty boundary at the end of the text.
        result.startPos = result.endPos = len;
        if( anyPos >= len )
            return result;

        // Must outlive the loop: MeCab nodes point into this buffer.
        // getStr() is already NUL-terminated, no extra terminator is needed.
        const OString aUtf8( OUStringToOString(text, RTL_TEXTENCODING_UTF8) );

        // cnt is the UTF-16 offset just past the current node, including the
        // whitespace MeCab skipped in front of it (rlength - length).
        sal_Int32 cnt = 0;
        for( const MeCab::Node *node = tagger->parseToNode(aUtf8.getStr());
             node; node = node->next ) {
            const OUString chunk(node->surface, node->length,
                                 RTL_TEXTENCODING_UTF8,
                                 OSTRING_TO_OUSTRING_CVTFLAGS);
            cnt += chunk.getLength() + (node->rlength - node->length);
            if( cnt <= anyPos )
                continue;

            // This node contains anyPos; the answer is the node after it.
            const MeCab::Node *next = node->next;
            if( !next )
                break;  // last node: no following word, keep the default

            const OUString nextChunk(next->surface, next->length,
                                     RTL_TEXTENCODING_UTF8,
                                     OSTRING_TO_OUSTRING_CVTFLAGS);

            result.startPos = cnt + (next->rlength - next->length);
            if( result.startPos >= len )
                result.endPos = result.startPos;
            else
                // Measure the end from startPos so that whitespace skipped
                // before the next word is not subtracted from its length.
                result.endPos = result.startPos + nextChunk.getLength();
            break;
        }

        return result;
    } else if (dict) {
        result = dict->nextWord(text.getStr(), anyPos, text.getLength(), wordType);
        // #109813# for non-CJK, single character word, fallback to ICU breakiterator.
        if (result.endPos - result.startPos != 1 ||
            getScriptType(text, result.startPos) == ScriptType::ASIAN)
            return result;
        else
            return BreakIterator_Unicode::getWordBoundary(text, result.startPos, nLocale, wordType, true);
    }
    return BreakIterator_Unicode::nextWord(text, anyPos, nLocale, wordType);
}
168
295
296
// Returns the boundary of the word at anyPos.
//
// With a MeCab tagger:
//   - anyPos < 0, or anyPos == 0 looking backward, yields [0, 0];
//   - anyPos > length, or anyPos == length looking forward, yields
//     [length, length];
//   - otherwise the text is tokenised and the node list is walked. When
//     anyPos lies exactly on the end of a node and bDirection is set, the
//     following node is reported; otherwise the first node ending at or
//     after anyPos is reported.
//   If tokenising yields nothing, or no node matches, [length, length] is
//   returned instead of whatever an earlier call left in result.
// Without a tagger the dictionary is used; single-character non-Asian
// results and the no-dictionary case go to the Unicode break iterator.
Boundary SAL_CALL
BreakIterator_ja::getWordBoundary( const OUString& text, sal_Int32 anyPos,
        const lang::Locale& nLocale, sal_Int16 wordType, sal_Bool bDirection )
        throw(RuntimeException)
{
    if( tagger ) {
        const sal_Int32 len = text.getLength();

        if( anyPos < 0 ||
            (anyPos == 0 && !bDirection) ) {
            result.startPos = result.endPos = 0;
            return result;
        }

        // Covers the out-of-range case and serves as the default when the
        // walk below finds no matching node.
        result.startPos = result.endPos = len;
        if( anyPos > len ||
            (anyPos == len && bDirection) )
            return result;

        // Must outlive the loop: MeCab nodes point into this buffer.
        // getStr() is already NUL-terminated, no extra terminator is needed.
        const OString aUtf8( OUStringToOString(text, RTL_TEXTENCODING_UTF8) );

        // cnt is the UTF-16 offset just past the current node, including the
        // whitespace MeCab skipped in front of it (rlength - length).
        sal_Int32 cnt = 0;
        for( const MeCab::Node *node = tagger->parseToNode( aUtf8.getStr() );
             node; node = node->next ) {
            const OUString chunk(node->surface, node->length,
                                 RTL_TEXTENCODING_UTF8,
                                 OSTRING_TO_OUSTRING_CVTFLAGS);
            cnt += chunk.getLength() + (node->rlength - node->length);

            if( cnt == anyPos && bDirection ) {
                // anyPos sits on a boundary and the caller wants the word
                // that follows it.
                const MeCab::Node *next = node->next;
                if( !next )
                    break;  // last node: nothing follows, keep the default

                const OUString nextChunk(next->surface, next->length,
                                         RTL_TEXTENCODING_UTF8,
                                         OSTRING_TO_OUSTRING_CVTFLAGS);

                result.startPos = cnt + (next->rlength - next->length);
                if( result.startPos >= len )
                    result.endPos = result.startPos;
                else
                    // Measure the end from startPos so that whitespace
                    // skipped before the next word is not lost.
                    result.endPos = result.startPos + nextChunk.getLength();
                break;
            } else if( cnt >= anyPos ) {
                // First node ending at or after anyPos.
                result.startPos = cnt - chunk.getLength();
                result.endPos = cnt;
                break;
            }
        }

        return result;
    } else if (dict) {
        result = dict->getWordBoundary(text.getStr(), anyPos, text.getLength(), wordType, bDirection);
        // #109813# for non-CJK, single character word, fallback to ICU breakiterator.
        if (result.endPos - result.startPos != 1 ||
            getScriptType(text, result.startPos) == ScriptType::ASIAN)
            return result;
    }
    return BreakIterator_Unicode::getWordBoundary(text, anyPos, nLocale, wordType, bDirection);
}
350
351
#endif // #ifndef ENABLE_MECAB
352
169
} } } }
353
} } } }
(-)source/breakiterator/makefile.mk (+4 lines)
Lines 72-77 Link Here
72
72
73
APP1STDLIBS = $(SALLIB)
73
APP1STDLIBS = $(SALLIB)
74
74
75
.IF "$(ENABLE_MECAB)" != ""
76
ENVCFLAGS= -DENABLE_MECAB
77
.ENDIF # "$(ENABLE_MECAB)" != ""
78
75
# --- Targets ------------------------------------------------------
79
# --- Targets ------------------------------------------------------
76
80
77
.INCLUDE :	target.mk
81
.INCLUDE :	target.mk
(-)source/localedata/LocaleNode.cxx (-1 / +2 lines)
Lines 38-44 Link Here
38
#include <stdio.h>
38
#include <stdio.h>
39
#include <stdlib.h>
39
#include <stdlib.h>
40
#include <string.h>
40
#include <string.h>
41
#include <iostream>
41
// Temporarily commented out to work around a compilation error.
42
//#include <iostream>
42
#include <set>
43
#include <set>
43
44
44
#include <rtl/ustrbuf.hxx>
45
#include <rtl/ustrbuf.hxx>
(-)util/makefile.mk (-1 / +6 lines)
Lines 77-83 Link Here
77
		$(CPPULIB) \
77
		$(CPPULIB) \
78
		$(SALLIB) \
78
		$(SALLIB) \
79
		$(ICUINLIB) \
79
		$(ICUINLIB) \
80
		$(ICUUCLIB)
80
		$(ICUUCLIB) \
81
		$(UNOTOOLSLIB)
82
83
.IF "$(ENABLE_MECAB)" != ""
84
ENVLINKFLAGS= -lmecab
85
.ENDIF # "$(ENABLE_MECAB)" != ""
81
86
82
# --- Targets ------------------------------------------------------------
87
# --- Targets ------------------------------------------------------------
83
88

Return to issue 74034