View | Details | Raw Unified | Return to issue 74034
Collapse All | Expand All

(-)inc/breakiterator_cjk.hxx (-2 / +26 lines)
Lines 38-43 Link Here
38
#include <breakiterator_unicode.hxx>
38
#include <breakiterator_unicode.hxx>
39
#include <xdictionary.hxx>
39
#include <xdictionary.hxx>
40
40
41
#ifdef ENABLE_MECAB
42
#include <mecab.h>
43
#endif
44
41
namespace com { namespace sun { namespace star { namespace i18n {
45
namespace com { namespace sun { namespace star { namespace i18n {
42
//	----------------------------------------------------
46
//	----------------------------------------------------
43
//	class BreakIterator_CJK
47
//	class BreakIterator_CJK
Lines 74-82 Link Here
74
78
75
#ifdef BREAKITERATOR_ALL
79
#ifdef BREAKITERATOR_ALL
76
BREAKITERATOR_CJK( zh )
80
BREAKITERATOR_CJK( zh )
77
BREAKITERATOR_CJK( ja )
78
BREAKITERATOR_CJK( ko )
81
BREAKITERATOR_CJK( ko )
79
#endif
82
83
#ifndef ENABLE_MECAB
84
BREAKITERATOR_CJK( ja )
85
#else // #ifndef ENABLE_MECAB
86
class BreakIterator_ja : public BreakIterator_CJK {
87
    MeCab::Tagger *tagger;
88
public:
89
    BreakIterator_ja ();
90
    ~BreakIterator_ja ();
91
	Boundary SAL_CALL nextWord( const rtl::OUString& Text, sal_Int32 nStartPos,
92
		    const com::sun::star::lang::Locale& nLocale, sal_Int16 WordType)
93
		    throw(com::sun::star::uno::RuntimeException);
94
	Boundary SAL_CALL previousWord( const rtl::OUString& Text, sal_Int32 nStartPos,
95
		    const com::sun::star::lang::Locale& nLocale, sal_Int16 WordType)
96
		    throw(com::sun::star::uno::RuntimeException);
97
	Boundary SAL_CALL getWordBoundary( const rtl::OUString& Text, sal_Int32 nPos,
98
		    const com::sun::star::lang::Locale& nLocale, sal_Int16 WordType, sal_Bool bDirection )
99
		    throw(com::sun::star::uno::RuntimeException);
100
};
101
#endif // #ifndef ENABLE_MECAB
102
#endif // #ifdef BREAKITERATOR_ALL
103
80
#undef BREAKITERATOR__CJK
104
// The helper macro is spelled BREAKITERATOR_CJK (single underscore, see
// its invocations above); undefine that name so it does not leak to
// every file that includes this header.
#undef BREAKITERATOR_CJK
81
105
82
} } } }
106
} } } }
(-)source/breakiterator/breakiterator_cjk.cxx (-9 / +161 lines)
Lines 40-45 Link Here
40
#include <breakiterator_cjk.hxx>
40
#include <breakiterator_cjk.hxx>
41
#include <i18nutil/unicode.hxx>
41
#include <i18nutil/unicode.hxx>
42
42
43
#include <rtl/string.hxx>
44
#include <rtl/strbuf.hxx>
45
// MeCab is optional: only pull in its header when the build enables it,
// otherwise the file cannot be compiled on systems without MeCab.
#ifdef ENABLE_MECAB
#include <mecab.h>
#endif
46
43
using namespace ::com::sun::star::uno;
47
using namespace ::com::sun::star::uno;
44
using namespace ::com::sun::star::lang;
48
using namespace ::com::sun::star::lang;
45
using namespace ::rtl;
49
using namespace ::rtl;
Lines 140-169 Link Here
140
}
144
}
141
145
142
//      ----------------------------------------------------
146
//      ----------------------------------------------------
147
//      class BreakIterator_ko
148
//      ----------------------------------------------------;
149
BreakIterator_ko::BreakIterator_ko()
150
{
151
        cBreakIterator = "com.sun.star.i18n.BreakIterator_ko";
152
}
153
154
BreakIterator_ko::~BreakIterator_ko()
155
{
156
}
157
158
//      ----------------------------------------------------
143
//      class BreakIterator_ja
159
//      class BreakIterator_ja
144
//      ----------------------------------------------------;
160
//      ----------------------------------------------------;
161
#ifndef ENABLE_MECAB
162
145
BreakIterator_ja::BreakIterator_ja()
163
BreakIterator_ja::BreakIterator_ja()
146
{
164
{
147
        dict = new xdictionary("ja");
165
	dict = new xdictionary("ja");
148
        dict->setJapaneseWordBreak();
166
	dict->setJapaneseWordBreak();
149
        cBreakIterator = "com.sun.star.i18n.BreakIterator_ja";
167
        cBreakIterator = "com.sun.star.i18n.BreakIterator_ja";
150
}
168
}
151
169
152
BreakIterator_ja::~BreakIterator_ja()
170
BreakIterator_ja::~BreakIterator_ja()
153
{
171
{
154
        delete dict;
155
}
172
}
156
173
157
//      ----------------------------------------------------
174
#else  // #ifndef ENABLE_MECAB
158
//      class BreakIterator_ko
175
159
//      ----------------------------------------------------;
176
BreakIterator_ja::BreakIterator_ja()
160
BreakIterator_ko::BreakIterator_ko()
161
{
177
{
162
        cBreakIterator = "com.sun.star.i18n.BreakIterator_ko";
178
    tagger = MeCab::createTagger("");
179
    if( !tagger )
180
    {
181
	dict = new xdictionary("ja");
182
	dict->setJapaneseWordBreak();
183
    }
184
    
185
    cBreakIterator = "com.sun.star.i18n.BreakIterator_ja";
163
}
186
}
164
187
165
BreakIterator_ko::~BreakIterator_ko()
188
// Releases whichever back end the constructor set up: the dictionary when
// no MeCab tagger exists, the tagger otherwise.
BreakIterator_ja::~BreakIterator_ja()
{
    if( !tagger )
    {
        delete dict;
        return;
    }

    delete tagger;
}
195
196
// Returns the boundary of the word preceding anyPos.
//
// Back ends are tried in order: MeCab tagger (delegating to
// getWordBoundary in backward direction), then the dictionary, then the
// generic BreakIterator_Unicode implementation.
Boundary SAL_CALL
BreakIterator_ja::previousWord(const OUString& text, sal_Int32 anyPos,
        const lang::Locale& nLocale, sal_Int16 wordType) throw(RuntimeException)
{
    if( tagger )
        return getWordBoundary(text, anyPos, nLocale, wordType, sal_False);

    if( !dict )
        return BreakIterator_Unicode::previousWord(text, anyPos, nLocale, wordType);

    result = dict->previousWord(text.getStr(), anyPos, text.getLength(), wordType);

    // #109813# for non-CJK, single character word, fallback to ICU breakiterator.
    const bool bSingleNonAsianChar =
        (result.endPos - result.startPos == 1) &&
        getScriptType(text, result.startPos) != ScriptType::ASIAN;

    if( bSingleNonAsianChar )
        return BreakIterator_Unicode::getWordBoundary(text, result.startPos, nLocale, wordType, true);

    return result;
}
214
215
// Returns the boundary of the word following the one that contains anyPos.
//
// With a MeCab tagger the text is converted to UTF-8, tokenised, and the
// node list is walked while tracking each node's end offset in UTF-16
// units. Per the MeCab API a node's rlength is its length including
// leading whitespace, so (rlength - length) is the whitespace in front of
// the token; that byte count is added to a UTF-16 offset, which assumes
// the whitespace is single-byte (ASCII) -- TODO confirm for all
// dictionaries.
//
// If no following word exists, both positions are set to the text length.
// Without a tagger the dictionary is used, and without a dictionary the
// generic BreakIterator_Unicode implementation.
Boundary SAL_CALL
BreakIterator_ja::nextWord(const OUString& text, sal_Int32 anyPos,
        const lang::Locale& nLocale, sal_Int16 wordType) throw(RuntimeException)
{
    if( tagger ) {
        const sal_Int32 len = text.getLength();

        if( anyPos >= len ) {
            result.startPos = result.endPos = len;
            return result;
        }

        // OString::getStr() is already NUL-terminated; the buffer must
        // outlive the node list because node->surface points into it.
        const OString aUtf8( OUStringToOString(text, RTL_TEXTENCODING_UTF8) );

        sal_Int32 cnt = 0;  // end offset (UTF-16 units) of the current node
        for( const MeCab::Node *node = tagger->parseToNode(aUtf8.getStr());
             node; node = node->next ) {
            const OUString chunk(node->surface, node->length,
                                 RTL_TEXTENCODING_UTF8,
                                 OSTRING_TO_OUSTRING_CVTFLAGS);
            cnt += chunk.getLength() + (node->rlength - node->length);
            if( cnt <= anyPos )
                continue;

            // The current node contains anyPos; the answer is its successor.
            const MeCab::Node *next = node->next;
            if( !next )
                break;  // last node (e.g. trailing whitespace): no next word

            const OUString nextChunk(next->surface, next->length,
                                     RTL_TEXTENCODING_UTF8,
                                     OSTRING_TO_OUSTRING_CVTFLAGS);

            result.startPos = cnt + (next->rlength - next->length);
            if( result.startPos >= len )
                result.endPos = result.startPos;
            else
                // Measure from startPos so that whitespace skipped in front
                // of the token does not shorten the reported word.
                result.endPos = result.startPos + nextChunk.getLength();

            return result;
        }

        result.startPos = result.endPos = len;
        return result;
    } else if (dict) {
        result = dict->nextWord(text.getStr(), anyPos, text.getLength(), wordType);
        // #109813# for non-CJK, single character word, fallback to ICU breakiterator.
        if (result.endPos - result.startPos != 1 ||
            getScriptType(text, result.startPos) == ScriptType::ASIAN)
            return result;
        else
            return BreakIterator_Unicode::getWordBoundary(text, result.startPos, nLocale, wordType, true);
    }
    return BreakIterator_Unicode::nextWord(text, anyPos, nLocale, wordType);
}
168
263
264
// Returns the boundary of the word at anyPos.
//
// bDirection selects which word is reported when anyPos lies exactly on
// the border between two tokens: true picks the following token, false
// the preceding one. Positions outside the text are clamped to an empty
// boundary at 0 or at the text length.
//
// With a MeCab tagger the text is tokenised and the node list is walked
// while tracking each node's end offset in UTF-16 units. Per the MeCab
// API (rlength - length) is the whitespace in front of a token; adding
// that byte count to a UTF-16 offset assumes single-byte whitespace --
// TODO confirm. If MeCab yields no usable node, the generic
// BreakIterator_Unicode implementation answers instead of returning a
// stale result. Without a tagger the dictionary is tried first.
Boundary SAL_CALL
BreakIterator_ja::getWordBoundary( const OUString& text, sal_Int32 anyPos,
        const lang::Locale& nLocale, sal_Int16 wordType, sal_Bool bDirection )
        throw(RuntimeException)
{
    if( tagger ) {
        const sal_Int32 len = text.getLength();

        if( anyPos < 0 || (anyPos == 0 && !bDirection) ) {
            result.startPos = result.endPos = 0;
            return result;
        }
        if( anyPos > len || (anyPos == len && bDirection) ) {
            result.startPos = result.endPos = len;
            return result;
        }

        // OString::getStr() is already NUL-terminated; the buffer must
        // outlive the node list because node->surface points into it.
        const OString aUtf8( OUStringToOString(text, RTL_TEXTENCODING_UTF8) );

        sal_Int32 cnt = 0;  // end offset (UTF-16 units) of the current node
        for( const MeCab::Node *node = tagger->parseToNode( aUtf8.getStr() );
             node; node = node->next ) {
            const OUString chunk(node->surface, node->length,
                                 RTL_TEXTENCODING_UTF8,
                                 OSTRING_TO_OUSTRING_CVTFLAGS);
            cnt += chunk.getLength() + (node->rlength - node->length);

            if( cnt == anyPos && bDirection ) {
                // anyPos is exactly at the end of this node: report the next one.
                const MeCab::Node *next = node->next;
                if( !next )
                    break;  // nothing follows; let the fallback decide

                const OUString nextChunk(next->surface, next->length,
                                         RTL_TEXTENCODING_UTF8,
                                         OSTRING_TO_OUSTRING_CVTFLAGS);

                result.startPos = cnt + (next->rlength - next->length);
                if( result.startPos >= len )
                    result.endPos = result.startPos;
                else
                    // Measure from startPos so that skipped whitespace
                    // does not shorten the reported word.
                    result.endPos = result.startPos + nextChunk.getLength();
                return result;
            }

            if( cnt >= anyPos ) {
                // anyPos lies inside (or at the end of) this node.
                result.startPos = cnt - chunk.getLength();
                result.endPos = cnt;
                return result;
            }
        }

        // MeCab produced no matching node (e.g. parse failure): do not
        // hand back whatever a previous call left in result.
        return BreakIterator_Unicode::getWordBoundary(text, anyPos, nLocale, wordType, bDirection);
    } else if (dict) {
        result = dict->getWordBoundary(text.getStr(), anyPos, text.getLength(), wordType, bDirection);
        // #109813# for non-CJK, single character word, fallback to ICU breakiterator.
        if (result.endPos - result.startPos != 1 ||
            getScriptType(text, result.startPos) == ScriptType::ASIAN)
            return result;
    }
    return BreakIterator_Unicode::getWordBoundary(text, anyPos, nLocale, wordType, bDirection);
}
318
319
#endif // #ifndef ENABLE_MECAB
320
169
} } } }
321
} } } }
(-)source/breakiterator/makefile.mk (+4 lines)
Lines 72-77 Link Here
72
72
73
APP1STDLIBS = $(SALLIB)
73
APP1STDLIBS = $(SALLIB)
74
74
75
.IF "$(ENABLE_MECAB)" != ""
76
ENVCFLAGS= -DENABLE_MECAB
77
.ENDIF # "$(ENABLE_MECAB)" != ""
78
75
# --- Targets ------------------------------------------------------
79
# --- Targets ------------------------------------------------------
76
80
77
.INCLUDE :	target.mk
81
.INCLUDE :	target.mk
(-)util/makefile.mk (+4 lines)
Lines 79-84 Link Here
79
		$(ICUINLIB) \
79
		$(ICUINLIB) \
80
		$(ICUUCLIB)
80
		$(ICUUCLIB)
81
81
82
.IF "$(ENABLE_MECAB)" != ""
83
ENVLINKFLAGS= -lmecab
84
.ENDIF # "$(ENABLE_MECAB)" != ""
85
82
# --- Targets ------------------------------------------------------------
86
# --- Targets ------------------------------------------------------------
83
87
84
.INCLUDE :	target.mk
88
.INCLUDE :	target.mk

Return to issue 74034