00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00034 #include <stdlib.h>
00035 #include <stdio.h>
00036 #include "utrac.h"
00037
00038
00039
00040 #include "debug.h"
00041
/**
 * Tell whether a byte is an "extended" character, i.e. has its high bit set
 * (value 0x80..0xFF once viewed as unsigned).
 *
 * @param c  byte to test (plain char, signedness irrelevant).
 * @return   true when the byte is outside the 7-bit range.
 */
static inline bool is_ext (char c) {
    return ((u_char) c & 0x80) != 0;
}
00046
00047
/**
 * Number of bytes needed to write a code point in UTF-8.
 *
 * @param unicode  code point value.
 * @return 1 for values below 0x80, 2 below 0x800, 3 below 0x10000,
 *         4 up to 0x10FFFF, and 0 for anything larger (not encodable).
 */
int ut_size_unicode (ulong unicode) {
    if (unicode < 0x80)      return 1;
    if (unicode < 0x800)     return 2;
    if (unicode < 0x10000)   return 3;
    if (unicode <= 0x10FFFF) return 4;
    return 0;
}
00099
/**
 * Decode one UTF-8 encoded character and advance the read pointer.
 *
 * @param src_p  address of the read pointer; on return it points just past
 *               the bytes that were consumed.
 * @return the decoded code point, or UT_UNICODE_NONCHAR when the input is
 *         not a well-formed sequence:
 *         - a stray continuation byte (10xxxxxx) or a lead byte 0xF8..0xFF:
 *           one byte is consumed;
 *         - a missing/invalid continuation byte: the lead byte and the valid
 *           continuation bytes seen so far are consumed, the offending byte
 *           is left in place (so a terminating NUL is never skipped);
 *         - an overlong encoding or a value above 0x10FFFF: the whole
 *           sequence is consumed.
 *
 * Overlong forms and out-of-range values are rejected because
 * ut_size_unicode() reports 0 bytes for values above 0x10FFFF, and an
 * overlong form would otherwise smuggle in ASCII code points (including NUL).
 */
ulong ut_utf8c_to_unicode (char ** src_p) {

    unsigned char lead = (unsigned char) **src_p;
    ulong unicode;
    ulong min_value;    /* smallest code point that legitimately needs this length */
    int size;           /* number of continuation bytes expected */

    if (lead < 0x80) {
        /* 0xxxxxxx: plain ASCII */
        (*src_p)++;
        return lead;
    }
    if (lead < 0xC0) {
        /* 10xxxxxx: continuation byte without a lead byte */
        (*src_p)++;
        return UT_UNICODE_NONCHAR;
    }

    if (lead < 0xE0) {          /* 110xxxxx */
        size = 1;
        unicode = lead & 0x1F;
        min_value = 0x80;
    } else if (lead < 0xF0) {   /* 1110xxxx */
        size = 2;
        unicode = lead & 0x0F;
        min_value = 0x800;
    } else if (lead < 0xF8) {   /* 11110xxx */
        size = 3;
        unicode = lead & 0x07;
        min_value = 0x10000;
    } else {
        /* 11111xxx: never valid in UTF-8 */
#if UT_DEBUG > 1
        printf("<%X:%x:%x:", **src_p & 0xFF, (**src_p|0x20), (**src_p|0x10));
        ut_print_binary (**src_p & 0xFF);
        putchar('>');
#endif
        (*src_p)++;
        return UT_UNICODE_NONCHAR;
    }
    (*src_p)++;

    while (size--) {
        /* every trailing byte must look like 10xxxxxx */
        if ((**src_p & 0xC0) != 0x80) return UT_UNICODE_NONCHAR;
        unicode = (unicode << 6) | (ulong) (**src_p & 0x3F);
        (*src_p)++;
    }

    if (unicode < min_value || unicode > 0x10FFFF) return UT_UNICODE_NONCHAR;
    return unicode;
}
00153
/**
 * Encode one code point as UTF-8 and advance the write pointer.
 *
 * Writes 1 to 4 bytes, matching the sizes reported by ut_size_unicode().
 * No terminating NUL is written and no bounds check is done: the caller must
 * provide enough room.
 *
 * @param unicode  code point to encode (0 .. 0x10FFFF).
 * @param dst_p    address of the write pointer; advanced past the bytes
 *                 written. Left untouched when the value is out of range.
 *
 * Values above 0x10FFFF cannot be encoded; they are reported through ERROR
 * and nothing is written.
 */
void ut_unicode_to_utf8c (ulong unicode, char ** dst_p) {

    if (unicode < 0x80) {
        /* 0xxxxxxx */
        *(*dst_p)++ = (char) unicode;
    } else if (unicode < 0x800) {
        /* 110xxxxx 10xxxxxx */
        *(*dst_p)++ = (char) (0xC0 | ((unicode >> 6) & 0x1F));
        *(*dst_p)++ = (char) (0x80 | (unicode & 0x3F));
    } else if (unicode < 0x10000) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        *(*dst_p)++ = (char) (0xE0 | ((unicode >> 12) & 0x0F));
        *(*dst_p)++ = (char) (0x80 | ((unicode >> 6) & 0x3F));
        *(*dst_p)++ = (char) (0x80 | (unicode & 0x3F));
    } else if (unicode <= 0x10FFFF) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        *(*dst_p)++ = (char) (0xF0 | ((unicode >> 18) & 0x07));
        *(*dst_p)++ = (char) (0x80 | ((unicode >> 12) & 0x3F));
        *(*dst_p)++ = (char) (0x80 | ((unicode >> 6) & 0x3F));
        *(*dst_p)++ = (char) (0x80 | (unicode & 0x3F));
    } else {
        ERROR ("unicode value out of UTF-8 range : %lx", unicode);
    }
}
00188
00203 int ut_size_char (char **src_p, UtCharsetIndex src_charset, UtCharsetIndex dst_charset) {
00204
00205 ASSERT (*src_p)
00206 ASSERT (src_charset != UT_UNSET)
00207 if (dst_charset == UT_UNSET) dst_charset = ut_session->charset_default;
00208
00209 ulong unicode;
00210 UtCharset * src_cs = &(ut_session->charset [src_charset]);
00211 UtCharset * dst_cs = &(ut_session->charset [dst_charset]);
00212
00213 if (src_cs->type == UT_CST_ASCII || dst_cs->type == UT_CST_ASCII) {
00214 if (src_cs->type == UT_CST_UTF_8) ut_utf8c_to_unicode(src_p);
00215 else (*src_p)++;
00216 if (ut_session->nomapping_char<0x80) return 1;
00217 else return 0;
00218 } else if (src_cs->type == UT_CST_ASCII_EXTENSION) {
00219 unicode = src_cs->unicode [(u_char) **src_p];
00220 (*src_p)++;
00221 } else if (src_cs->type == UT_CST_UTF_8) {
00222 unicode = ut_utf8c_to_unicode (src_p);
00223 } else {
00224 ERROR ("charset type not managed : %d", src_cs->type)
00225 }
00226
00227 if (unicode==UT_UNICODE_NONCHAR) unicode = ut_session->nomapping_char;
00228
00229 if (dst_cs->type == UT_CST_UTF_8) {
00230 return ut_size_unicode (unicode);
00231 } else if (dst_cs->type == UT_CST_ASCII_EXTENSION) {
00232 return 1;
00233
00234
00235
00236
00237
00238
00239
00240 } else { ERROR ("charset type not managed : %d", src_cs->type) }
00241
00242 }
00243
00259 void ut_conv_char (char ** src_p, char ** dst_p, UtCharsetIndex src_charset, UtCharsetIndex dst_charset) {
00260 ASSERT (*src_p)
00261 ASSERT (*dst_p)
00262 ASSERT (src_charset != UT_UNSET)
00263 if (dst_charset == UT_UNSET) dst_charset = ut_session->charset_default;
00264
00265 ulong unicode;
00266 UtCharset * src_cs = &(ut_session->charset [src_charset]);
00267 UtCharset * dst_cs = &(ut_session->charset [dst_charset]);
00268
00269 if (src_cs->type == UT_CST_ASCII || dst_cs->type == UT_CST_ASCII) {
00270 if (src_cs->type == UT_CST_UTF_8) ut_utf8c_to_unicode(src_p);
00271 else (*src_p)++;
00272 if (ut_session->nomapping_char<0x80) *(*dst_p)++ = (char) ut_session->nomapping_char;
00273 return;
00274 } else if (src_cs->type == UT_CST_ASCII_EXTENSION) {
00275 unicode = src_cs->unicode [(u_char) **src_p];
00276 (*src_p)++;
00277 } else if (src_cs->type == UT_CST_UTF_8) {
00278 unicode = ut_utf8c_to_unicode (src_p);
00279 } else {ERROR ("charset type not managed : %d", src_cs->type) }
00280
00281 if (unicode!=UT_UNICODE_NONCHAR) {
00282 if (dst_cs->type == UT_CST_UTF_8) {
00283 if (unicode==UT_UNICODE_NONCHAR) unicode = ut_session->nomapping_char;
00284 ut_unicode_to_utf8c (unicode, dst_p);
00285 } else if (dst_cs->type == UT_CST_ASCII_EXTENSION) {
00286 if (unicode<0x80) {
00287 *(*dst_p)++ = (char) unicode;
00288 } else {
00289 int i; for (i=0x80; i<0x100; i++) if (unicode==dst_cs->unicode[i]) break;
00290 if(i<0x100) {
00291 *(*dst_p)++ = (char) i;
00292 } else {
00293 if (ut_session->nomapping_char < 0x100) *(*dst_p)++ = (char) ut_session->nomapping_char;
00294 }
00295 }
00296 } else {
00297 ERROR ("charset type not managed : %d", src_cs->type)
00298 }
00299 } else {
00300 if (ut_session->nomapping_char < 0x80) *(*dst_p)++ = (char) ut_session->nomapping_char;
00301 else ERROR ("nomapping char must be < 0x80") ;
00302 }
00303
00304 }
00305
00306
00307 void ut_insert_eol (char ** dst_p, UtEolType dst_eol) {
00308
00309 switch (dst_eol) {
00310 case UT_EOL_CRLF:
00311 DBG3_S ("+CR");
00312 *(*dst_p)++ = 0xD;
00313 case UT_EOL_LF:
00314 DBG3_S ("+LF");
00315 *(*dst_p)++ = 0xA; break;
00316 case UT_EOL_LFCR:
00317 DBG3_S ("+LF");
00318 *(*dst_p)++ = 0xA;
00319 case UT_EOL_CR:
00320 DBG3_S ("+CR");
00321 *(*dst_p)++ = 0xD; break;
00322 case UT_EOL_BSN:
00323 DBG3_S ("+BSN");
00324 *(*dst_p)++ = '\\'; *(*dst_p)++ = 'n'; break;
00325 case UT_EOL_NUL:
00326 DBG3_S ("+NUL");
00327 *(*dst_p)++ = 0; break;
00328 default:
00329 ERROR ("EOL not accepted for conversion : %d", dst_eol)
00330 }
00331 }
00332
00337 uint ut_count_ext_char (UtText * text) {
00338 uint count = 0, i;
00339 for (i=0x80; i<0x100; i++)
00340 count += text->distribution[i];
00341 return count;
00342 }
00343
00344
00345
00363 int ut_size_difference (UtText * src_text, UtText * dst_text) {
00364
00365 ASSERT (src_text->charset != UT_UNSET)
00366 ASSERT (dst_text->charset != UT_UNSET)
00367 ASSERT (src_text->eol != UT_EOL_UNSET)
00368 ASSERT (dst_text->eol != UT_EOL_UNSET)
00369 ASSERT (src_text->eol_alt != UT_EOL_UNSET)
00370 ASSERT (dst_text->eol_alt != UT_EOL_UNSET)
00371
00372 long size;
00373
00374 DBG3("*********** size diff********")
00375
00376 UtCharset * src_cs = &(ut_session->charset [src_text->charset]);
00377 UtCharset * dst_cs = &(ut_session->charset [dst_text->charset]);
00378
00379 if (src_cs->type == UT_CST_ASCII ) {
00380 if (dst_cs->type == UT_CST_ASCII) {
00381 if (ut_session->nomapping_char && ut_session->nomapping_char <0x80) size = 0;
00382 else size = - ut_count_ext_char (src_text);
00383 } else if (dst_cs->type == UT_CST_ASCII_EXTENSION) {
00384 if (ut_session->nomapping_char <0x100) size = 0;
00385 else size = - ut_count_ext_char (src_text);
00386 } else if (dst_cs->type == UT_CST_UTF_8) {
00387 if (ut_session->nomapping_char != UT_UNICODE_NONCHAR)
00388 size = (ut_size_unicode (ut_session->nomapping_char)-1) * ut_count_ext_char (src_text);
00389 else size = - ut_count_ext_char (src_text);
00390 } else {
00391 ERROR ("charset type not managed : %d", dst_cs->type)
00392 }
00393
00394 } else if (src_cs->type == UT_CST_ASCII_EXTENSION) {
00395 if (dst_cs->type == UT_CST_ASCII) {
00396 if (ut_session->nomapping_char <0x80) size = 0;
00397 else size = - ut_count_ext_char (src_text);
00398
00399 } else if (dst_cs->type == UT_CST_ASCII_EXTENSION) {
00400 int count = 0;
00401 if (ut_session->nomapping_char>=0x100) {
00402 int i; for (i=0x80; i<0x100; i++) {
00403 if (src_text->distribution[i]) {
00404 ulong unicode = src_cs->unicode[i];
00405 int j; for (j=0x80; j<0x100; j++) if (unicode==dst_cs->unicode[j]) break;
00406 if (i==0x100) count -= src_text->distribution[i];
00407 }
00408 }
00409 }
00410 size = count;
00411
00412 } else if (dst_cs->type == UT_CST_UTF_8) {
00413 int count = 0;
00414 int i; for (i=0x80; i<0x100; i++) {
00415 if (src_text->distribution[i]) {
00416 ulong unicode = src_cs->unicode[i];
00417 if (unicode != UT_UNICODE_NONCHAR)
00418 count += (ut_size_unicode (unicode) - 1)*src_text->distribution[i];
00419 else if (ut_session->nomapping_char!=UT_UNICODE_NONCHAR)
00420 count += (ut_size_unicode (ut_session->nomapping_char) - 1)*src_text->distribution[i];
00421 else count -= src_text->distribution[i];
00422 }
00423 }
00424 size = count;
00425 } else {
00426 ERROR ("charset type not managed : %d", dst_cs->type)
00427 }
00428 } else if (src_cs->type == UT_CST_UTF_8 ) {
00429 if (dst_cs->type == UT_CST_ASCII) {
00430 if (ut_session->nomapping_char <0x80) size = 0;
00431 else size = - ut_count_ext_char (src_text);
00432
00433 } else if (dst_cs->type == UT_CST_ASCII_EXTENSION) {
00434 size = 0;
00435
00436 } else if (dst_cs->type == UT_CST_UTF_8) {
00437 if (ut_session->nomapping_char == UT_UNICODE_NONCHAR) size = 0;
00438 else size = - (ut_size_unicode (ut_session->nomapping_char) - 1) * ut_count_ext_char (src_text);
00439
00440 } else {
00441 ERROR ("charset type not managed : %d", dst_cs->type)
00442 }
00443 } else {
00444 ERROR ("charset type not managed : %d", dst_cs->type)
00445 }
00446
00447 DBG3( "** size diff chars : % ld", size);
00448
00449 switch (src_text->eol) {
00450 case UT_EOL_NONE:
00451 break;
00452 case UT_EOL_CRLF:
00453 case UT_EOL_LFCR:
00454 switch (dst_text->eol) {
00455 case UT_EOL_CRLF:
00456 case UT_EOL_LFCR:
00457 case UT_EOL_BSN:
00458
00459 break;
00460 case UT_EOL_CR:
00461 case UT_EOL_LF:
00462 size -= src_text->nb_lines; break;
00463 case UT_EOL_NONE:
00464 size -= 2*src_text->nb_lines; break;
00465 default:
00466 ERROR ("dst EOL type unsupported")
00467 } break;
00468
00469 case UT_EOL_NUL:
00470 case UT_EOL_CR:
00471 case UT_EOL_LF:
00472 case UT_EOL_MIX:
00473 switch (dst_text->eol) {
00474 case UT_EOL_CR:
00475 case UT_EOL_LF:
00476 case UT_EOL_NUL:
00477
00478 break;
00479 case UT_EOL_CRLF:
00480 case UT_EOL_LFCR:
00481 case UT_EOL_BSN:
00482 size += src_text->nb_lines; break;
00483 case UT_EOL_NONE:
00484 size -= src_text->nb_lines; break;
00485 default:
00486 ERROR ("dst EOL type unsupported")
00487 } break;
00488 default:
00489 ERROR ("src EOL type unsupported")
00490 }
00491
00492 DBG3( "** size diff chars+eol : % ld", size);
00493
00494 switch (src_text->eol_alt) {
00495 case UT_EOL_NONE:
00496 break;
00497 case UT_EOL_NUL:
00498 case UT_EOL_CR:
00499 case UT_EOL_LF:
00500 case UT_EOL_MIX:
00501 switch (dst_text->eol_alt) {
00502 case UT_EOL_CR:
00503 case UT_EOL_LF:
00504 case UT_EOL_NUL:
00505
00506 break;
00507 case UT_EOL_CRLF:
00508 case UT_EOL_LFCR:
00509 case UT_EOL_BSN:
00510 size += src_text->nb_lines_alt; break;
00511 case UT_EOL_NONE:
00512 size -= src_text->nb_lines_alt; break;
00513 default:
00514 ERROR ("dst EOL type unsupported")
00515 } break;
00516 default:
00517 ERROR ("src EOL type unsupported")
00518 }
00519
00520 DBG3( "** size diff chars+eol+alt : % ld", size);
00521
00522 return size;
00523 }
00524
00525
00539 UtCode ut_conversion_pass (UtText * src_text, UtText * dst_text) {
00540
00541 ASSERT (src_text)
00542 ASSERT (dst_text)
00543
00544 ASSERT (dst_text->data == NULL)
00545
00546
00547 if (dst_text->eol==UT_EOL_UNSET) dst_text->eol = src_text->eol;
00548 if (dst_text->eol_alt==UT_EOL_UNSET) dst_text->eol_alt = src_text->eol_alt;
00549 free (dst_text->data);
00550 dst_text->data = NULL;
00551
00552 long newsize = ut_size_difference (src_text, dst_text);
00553
00554 DBG3 ("size diff : %ld ext char : %d", newsize, ut_count_ext_char (src_text) )
00555 newsize += src_text->size;
00556 DBG3 ("old size: %lu new size: %lu", src_text->size, newsize)
00557
00558
00559 char *dst_beg = (char*) malloc (newsize+1);
00560 if (!dst_beg) return UT_MALLOC_ERROR;
00561
00562 char *src = src_text->data;
00563 char *src_end = src_text->data + src_text->size;
00564 char *dst = dst_beg;
00565 int cumul=1;
00566
00567 for (;;) {
00568 DBG3_S ("<%d>", *src);
00569 if (!is_ext (*src)) {
00570 if (*src) {
00571 if (*src==src_text->skip_char) {
00572 src++;
00573 } else if (*src==UT_EOL_ALT_CHAR) {
00574 ut_insert_eol (&dst, dst_text->eol_alt);
00575 src++;
00576 } else {
00577 *dst++ = *src++;
00578 }
00579 } else {
00580 if (src - src_text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
00581 if (!ut_update_progress (src_text, src - src_text->data, false)) break;
00582 cumul++;
00583 }
00584 if (src >= src_end) {
00585 ASSERT (src==src_end)
00586 *dst = 0;
00587 break;
00588 }
00589 ut_insert_eol (&dst, dst_text->eol);
00590 src++;
00591 DBG3_S ("!")
00592 }
00593 } else {
00594 ut_conv_char (&src, &dst, src_text->charset, dst_text->charset);
00595 }
00596 }
00597
00598 if (src < src_end) {
00599
00600 DBG3 ( "interrupted! : src:%d srcend: %d dst:%d", src - src_text->data, src_end - src_text->data, dst - dst_beg)
00601 free (dst_beg);
00602 return UT_INTERRUPTED_BY_USER;
00603 }
00604
00605
00606 ASSERT ( dst - dst_beg <= newsize )
00607 DBG3 ( "precalculated size: %ld actual size: %d", newsize, dst - dst_beg)
00608
00609
00610 dst_text->data = dst_beg;
00611 dst_text->size = dst - dst_beg;
00612
00613 DBG2 ("Conversion done!")
00614 return UT_OK;
00615 }