Lines 40-45
Link Here
|
40 |
#include <breakiterator_cjk.hxx> |
40 |
#include <breakiterator_cjk.hxx> |
41 |
#include <i18nutil/unicode.hxx> |
41 |
#include <i18nutil/unicode.hxx> |
42 |
|
42 |
|
|
|
43 |
#include <rtl/string.hxx> |
44 |
#include <rtl/strbuf.hxx> |
45 |
#include <mecab.h> |
46 |
|
43 |
using namespace ::com::sun::star::uno; |
47 |
using namespace ::com::sun::star::uno; |
44 |
using namespace ::com::sun::star::lang; |
48 |
using namespace ::com::sun::star::lang; |
45 |
using namespace ::rtl; |
49 |
using namespace ::rtl; |
Lines 140-169
Link Here
|
140 |
} |
144 |
} |
141 |
|
145 |
|
142 |
// ---------------------------------------------------- |
146 |
// ---------------------------------------------------- |
|
|
147 |
// class BreakIterator_ko |
148 |
// ----------------------------------------------------; |
149 |
/** Construct the Korean break iterator.
 *
 *  The only work done here is recording the implementation name string
 *  in cBreakIterator.
 */
BreakIterator_ko::BreakIterator_ko()
{
    cBreakIterator = "com.sun.star.i18n.BreakIterator_ko";
}
153 |
|
154 |
/** Destroy the Korean break iterator; this class releases nothing itself. */
BreakIterator_ko::~BreakIterator_ko()
{
}
157 |
|
158 |
// ---------------------------------------------------- |
143 |
// class BreakIterator_ja |
159 |
// class BreakIterator_ja |
144 |
// ----------------------------------------------------; |
160 |
// ----------------------------------------------------; |
|
|
161 |
#ifndef ENABLE_MECAB |
162 |
|
145 |
/** Construct the Japanese break iterator (build without ENABLE_MECAB).
 *
 *  Records the implementation name and creates the "ja" dictionary,
 *  switched to Japanese word-break mode, that the word-boundary
 *  methods consult.
 */
BreakIterator_ja::BreakIterator_ja()
{
    cBreakIterator = "com.sun.star.i18n.BreakIterator_ja";

    dict = new xdictionary("ja");
    dict->setJapaneseWordBreak();
}
151 |
|
169 |
|
152 |
/** Destroy the Japanese break iterator (build without ENABLE_MECAB).
 *
 *  The constructor of this build allocates the dictionary with new, so
 *  it has to be released here; omitting the delete leaks one xdictionary
 *  per iterator instance.
 */
BreakIterator_ja::~BreakIterator_ja()
{
    delete dict;
}
156 |
|
173 |
|
157 |
// ---------------------------------------------------- |
174 |
#else // #ifndef ENABLE_MECAB |
158 |
// class BreakIterator_ko |
175 |
|
159 |
// ----------------------------------------------------; |
176 |
/** Construct the Japanese break iterator (build with ENABLE_MECAB).
 *
 *  A MeCab tagger is requested with an empty argument string. When no
 *  tagger could be created, the "ja" dictionary in Japanese word-break
 *  mode is set up instead so that word breaking still works.
 */
BreakIterator_ja::BreakIterator_ja()
{
    cBreakIterator = "com.sun.star.i18n.BreakIterator_ja";

    tagger = MeCab::createTagger("");
    if( tagger )
        return;

    // No tagger available: fall back to the dictionary based word break.
    dict = new xdictionary("ja");
    dict->setJapaneseWordBreak();
}
164 |
|
187 |
|
165 |
BreakIterator_ko::~BreakIterator_ko() |
188 |
/** Destroy the Japanese break iterator (build with ENABLE_MECAB).
 *
 *  Releases whichever backend the constructor set up: the dictionary
 *  when no tagger exists, otherwise the tagger.
 */
BreakIterator_ja::~BreakIterator_ja()
{
    if( !tagger )
        delete dict;
    else
        delete tagger;
}
195 |
|
196 |
/** Return the boundary of the word preceding anyPos.
 *
 *  Backend selection:
 *   - tagger present: delegate to getWordBoundary() with the direction
 *     flag set to sal_False;
 *   - dictionary present: ask the dictionary, but hand single-character
 *     non-Asian results to the generic Unicode iterator (#109813#);
 *   - neither: use the generic Unicode implementation.
 *
 *  @param text      text to inspect
 *  @param anyPos    position (UTF-16 index) to search from
 *  @param nLocale   locale passed through to the Unicode iterator
 *  @param wordType  word type passed through to the backends
 *  @return          start and end position of the word found
 */
Boundary SAL_CALL
BreakIterator_ja::previousWord(const OUString& text, sal_Int32 anyPos,
        const lang::Locale& nLocale, sal_Int16 wordType) throw(RuntimeException)
{
    if( tagger )
        return getWordBoundary(text, anyPos, nLocale, wordType, sal_False);

    if( !dict )
        return BreakIterator_Unicode::previousWord(text, anyPos, nLocale, wordType);

    result = dict->previousWord(text.getStr(), anyPos, text.getLength(), wordType);

    // #109813# for non-CJK, single character word, fallback to ICU breakiterator.
    const bool bSingleChar = (result.endPos - result.startPos == 1);
    if( bSingleChar && getScriptType(text, result.startPos) != ScriptType::ASIAN )
        return BreakIterator_Unicode::getWordBoundary(text, result.startPos, nLocale, wordType, true);

    return result;
}
214 |
|
215 |
/** Return the boundary of the word following anyPos.
 *
 *  With a MeCab tagger the text is converted to UTF-8, tokenised, and
 *  the node list is walked while accumulating the UTF-16 end offset of
 *  each node. The first node ending after anyPos is the current word;
 *  its successor is the word that is returned. Without a tagger the
 *  dictionary is used, and without a dictionary the generic Unicode
 *  iterator.
 *
 *  @param text      text to inspect
 *  @param anyPos    position (UTF-16 index) to search from
 *  @param nLocale   locale passed through to the Unicode iterator
 *  @param wordType  word type passed through to the backends
 *  @return          start and end position of the next word; both equal
 *                   text.getLength() when there is no further word
 */
Boundary SAL_CALL
BreakIterator_ja::nextWord(const OUString& text, sal_Int32 anyPos,
        const lang::Locale& nLocale, sal_Int16 wordType) throw(RuntimeException)
{
    if( tagger ) {
        const sal_Int32 nLen = text.getLength();

        if( anyPos < nLen ) {
            // OString::getStr() is already NUL terminated, so no manual
            // terminator has to be appended for MeCab.
            const OString aUtf8( OUStringToOString(text, RTL_TEXTENCODING_UTF8) );
            sal_Int32 cnt = 0;

            // const pointer: accepted by MeCab versions whose parseToNode()
            // returns Node* as well as those returning const Node*.
            for( const MeCab::Node *node = tagger->parseToNode(aUtf8.getStr());
                 node; node = node->next ) {
                const OUString chunk(node->surface, node->length,
                                     RTL_TEXTENCODING_UTF8,
                                     OSTRING_TO_OUSTRING_CVTFLAGS);
                // rlength - length is the whitespace in front of the token.
                // NOTE(review): that difference is a byte count; this assumes
                // the skipped whitespace is single-byte in UTF-8 - confirm.
                cnt += chunk.getLength() + (node->rlength - node->length);
                if( cnt <= anyPos )
                    continue;

                const MeCab::Node *next = node->next;
                if( !next )
                    break;  // nothing follows: report end of text below

                const OUString nextChunk(next->surface, next->length,
                                         RTL_TEXTENCODING_UTF8,
                                         OSTRING_TO_OUSTRING_CVTFLAGS);

                result.startPos = cnt + (next->rlength - next->length);
                if( result.startPos >= nLen )
                    result.endPos = result.startPos;
                else
                    // The end is measured from the start of the word, i.e.
                    // after the leading whitespace, not from cnt.
                    result.endPos = result.startPos + nextChunk.getLength();

                return result;
            }
        }

        result.startPos = result.endPos = nLen;
        return result;
    } else if (dict) {
        result = dict->nextWord(text.getStr(), anyPos, text.getLength(), wordType);
        // #109813# for non-CJK, single character word, fallback to ICU breakiterator.
        if (result.endPos - result.startPos != 1 ||
                getScriptType(text, result.startPos) == ScriptType::ASIAN)
            return result;
        else
            return BreakIterator_Unicode::getWordBoundary(text, result.startPos, nLocale, wordType, true);
    }
    return BreakIterator_Unicode::nextWord(text, anyPos, nLocale, wordType);
}
168 |
|
263 |
|
|
|
264 |
/** Return the boundary of the word at anyPos.
 *
 *  With a MeCab tagger the text is converted to UTF-8, tokenised, and
 *  the node list is walked while accumulating the UTF-16 end offset of
 *  each node. If a node ends exactly at anyPos and bDirection is set,
 *  the following node is the result; otherwise the first node ending at
 *  or after anyPos is. Without a tagger the dictionary is consulted, and
 *  single-character non-Asian results (or a missing dictionary) are
 *  handled by the generic Unicode iterator.
 *
 *  @param text        text to inspect
 *  @param anyPos      position (UTF-16 index) to look at
 *  @param nLocale     locale passed through to the Unicode iterator
 *  @param wordType    word type passed through to the backends
 *  @param bDirection  when set, a position on a word end selects the
 *                     following word instead of the one ending there
 *  @return            start and end position of the word found
 */
Boundary SAL_CALL
BreakIterator_ja::getWordBoundary( const OUString& text, sal_Int32 anyPos,
        const lang::Locale& nLocale, sal_Int16 wordType, sal_Bool bDirection )
        throw(RuntimeException)
{
    if( tagger ) {
        const sal_Int32 len = text.getLength();

        if( anyPos < 0 ||
            (anyPos == 0 && !bDirection) ) {
            result.startPos = result.endPos = 0;
            return result;
        }

        // End of text is the answer for out-of-range positions and also the
        // defined fallback when the token walk below finds no match, so the
        // member never keeps a value left over from an earlier call.
        result.startPos = result.endPos = len;
        if( anyPos > len ||
            (anyPos == len && bDirection) )
            return result;

        // OString::getStr() is already NUL terminated for MeCab.
        const OString aUtf8( OUStringToOString(text, RTL_TEXTENCODING_UTF8) );
        sal_Int32 cnt = 0;

        // const pointer: accepted by MeCab versions whose parseToNode()
        // returns Node* as well as those returning const Node*.
        for( const MeCab::Node *node = tagger->parseToNode( aUtf8.getStr() );
             node; node = node->next ) {
            const OUString chunk(node->surface, node->length,
                                 RTL_TEXTENCODING_UTF8,
                                 OSTRING_TO_OUSTRING_CVTFLAGS);
            // rlength - length is the whitespace in front of the token.
            // NOTE(review): that difference is a byte count; this assumes
            // the skipped whitespace is single-byte in UTF-8 - confirm.
            cnt += chunk.getLength() + (node->rlength - node->length);

            if( cnt == anyPos && bDirection ) {
                const MeCab::Node *next = node->next;
                if( next ) {
                    const OUString nextChunk(next->surface, next->length,
                                             RTL_TEXTENCODING_UTF8,
                                             OSTRING_TO_OUSTRING_CVTFLAGS);

                    result.startPos = cnt + (next->rlength - next->length);
                    if( result.startPos >= len )
                        result.endPos = result.startPos;
                    else
                        // Measure the end from the word start, i.e. after
                        // the leading whitespace, not from cnt.
                        result.endPos = result.startPos + nextChunk.getLength();
                }
                break;
            } else if( cnt >= anyPos ) {
                result.startPos = cnt - chunk.getLength();
                result.endPos = cnt;
                break;
            }
        }

        return result;
    } else if (dict) {
        result = dict->getWordBoundary(text.getStr(), anyPos, text.getLength(), wordType, bDirection);
        // #109813# for non-CJK, single character word, fallback to ICU breakiterator.
        if (result.endPos - result.startPos != 1 ||
                getScriptType(text, result.startPos) == ScriptType::ASIAN)
            return result;
    }
    return BreakIterator_Unicode::getWordBoundary(text, anyPos, nLocale, wordType, bDirection);
}
318 |
|
319 |
#endif // #ifndef ENABLE_MECAB |
320 |
|
169 |
} } } } |
321 |
} } } } |