00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00032 #include <stdlib.h>
00033 #include <stdio.h>
00034 #include <string.h>
00035 #include "utrac.h"
00036
00037 #undef UT_DEBUG
00038 #define UT_DEBUG 1
00039 #include "debug.h"
00040
00041
/**
 * \brief Tell whether a decoded code point must be counted as invalid.
 *
 * A value is reported as invalid when one of the following holds:
 *  - it lies in the range U+FDD0..U+FDEF;
 *  - it is greater than or equal to 0x10FFFE (U+10FFFE, U+10FFFF and
 *    every value above the Unicode range);
 *  - its 16 low bits are FFFE or FFFF and it is below 0x100000
 *    (U+nFFFE / U+nFFFF for n in 0x0..0xF).
 *
 * Surrogate code points (U+D800..U+DFFF) are NOT rejected by this test.
 *
 * The function is intentionally not declared `inline`: with C99/C11
 * semantics a bare `inline` definition with external linkage does not emit
 * an external symbol, so any call that the compiler chooses not to inline
 * fails at link time. A plain external definition keeps the same interface
 * and can still be inlined by the optimizer inside this translation unit.
 *
 * \param unicode  code point to test.
 * \return true if the value is one of the rejected values listed above.
 */
bool ut_unicode_invalid (ulong unicode) {
	/* U+FDD0..U+FDEF */
	if (0x0000FDD0 <= unicode && unicode <= 0x0000FDEF)
		return 1;

	/* U+10FFFE, U+10FFFF and anything beyond the Unicode range */
	if (0x0010FFFE <= unicode)
		return 1;

	/* The mask keeps bits 31..20 and 15..1: they must read 0x0000FFFE,
	 * i.e. the low 16 bits are FFFE or FFFF, whatever bits 19..16 are. */
	return (0xFFF0FFFE & unicode) == 0x0000FFFE;
}
00050
00051
00062 UtCode ut_distrib_utf_pass (UtText * text) {
00063
00064 char * scan = text->data;
00065 char * scan_end;
00066
00067 ASSERT(text);
00068 ASSERT(text->data);
00069
00070
00071 if (text->size) scan_end = scan + text->size;
00072 else scan_end = NULL;
00073
00074 ulong unicode = 0;
00075 ushort multibyte = 0;
00076 ulong error_utf8 = 0;
00077 int cumul = 1;
00078 if (!text->distribution) text->distribution = (ulong*) malloc (sizeof(ulong)*256);
00079 int i; for (i=0; i<0x100; i++) text->distribution[i] = 0;
00080
00081 scan--;
00082 for (;;) {
00083 scan++;
00084
00085 switch (*scan) {
00086 case 0:
00087 if (scan>=scan_end) {
00088 ASSERT (!scan_end || scan==scan_end)
00089 goto out_for;
00090 } else if (!scan_end) goto out_for;
00091 case 0xA:
00092 case 0xD:
00093 if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
00094 if (!ut_update_progress (text, scan - text->data, false)) goto out_for;
00095 cumul++;
00096 }
00097 }
00098
00099 text->distribution [(u_char) *scan]++;
00100 if (multibyte) {
00101 if ((*scan & 0xC0) == 0x80) {
00102 unicode <<= 6;
00103 unicode |= *scan & 0x3F;
00104 if(!--multibyte) {
00105 if (ut_unicode_invalid (unicode)) error_utf8++;
00106 }
00107 } else {
00108 multibyte = 0;
00109 error_utf8++;
00110 }
00111 } else if (*scan & 0x80) {
00112 if ((*scan & 0xE0) == 0xC0) {
00113 multibyte = 1;
00114 unicode = *scan & 0x1F;
00115 } else if ((*scan & 0xF0) == 0xE0) {
00116 multibyte = 2;
00117 unicode = *scan & 0x0F;
00118 } else if ((*scan & 0xF8) == 0xF0) {
00119 multibyte = 3;
00120 unicode = *scan & 0x07;
00121 } else {
00122 error_utf8++;
00123 }
00124 }
00125 }
00126 out_for:
00127
00128
00129 if (scan<scan_end) {
00130 return UT_INTERRUPTED_BY_USER;
00131 }
00132
00133 if (multibyte) error_utf8++;
00134
00135 DBG2 ("Distribution and UTF-8 pass done! (%lu B)", text->size)
00136
00137 if (!text->size) text->size = scan - text->data;
00138 if (!text->size) return UT_EMPTY_DATA_ERROR;
00139
00140 ulong nb_ctrl_chars = 0;
00141
00142 for (i=0; i<0x20; i++) {
00143 if (i==0x9 || i==0xA || i==0xD) continue;
00144 nb_ctrl_chars += text->distribution[i];
00145 }
00146 nb_ctrl_chars += text->distribution[0x7F];
00147
00148
00149 if (text->size * UT_THRESHOLD_CONTROL_CHAR < nb_ctrl_chars) {
00150
00151 DBG3 ("Binary file detected! (%lu cc)", nb_ctrl_chars)
00152 return UT_BINARY_DATA_ERROR;
00153 }
00154
00155
00156 ulong nb_ext_chars = 0;
00157 for (i=0x80; i<0x100; i++) {
00158 nb_ext_chars += text->distribution[i];
00159 }
00160 DBG3 ("UTF-8 error : %lu, ext char number : %lu", error_utf8, nb_ext_chars)
00161
00162 if (text->flags & UT_F_IDENTIFY_CHARSET) {
00163 if (!nb_ext_chars) {
00164
00165 for (i=0; i<ut_session->nb_charsets; i++)
00166 if (ut_session->charset[i].type == UT_CST_ASCII) break;
00167 ASSERT_MSG (i!=ut_session->nb_charsets, "ASCII not defined")
00168 text->charset = i;
00169 DBG3 ("ASCII Encoding detected!")
00170 } else if (nb_ext_chars * UT_THRESHOLD_UTF8 > error_utf8) {
00171
00172
00173 for (i=0; i<ut_session->nb_charsets; i++)
00174 if (ut_session->charset[i].type == UT_CST_UTF_8) break;
00175 ASSERT_MSG (i!=ut_session->nb_charsets, "UTF-8 not defined")
00176 text->charset = i;
00177 DBG3 ("UTF-8 Encoding detected!")
00178 } else {
00179 text->charset = UT_UNSET;
00180 }
00181 }
00182
00183 return UT_OK;
00184 }
00185
00186
00187
00194 void ut_change_EOL1toEOL2 (char * beg, char * end) {
00195 ASSERT (beg<end)
00196 ASSERT (*end==UT_EOL_CHAR)
00197 char * scan = beg;
00198 for(;;) {
00199 if (*scan==UT_EOL_CHAR) {
00200 if (scan==end) return;
00201 *scan=UT_EOL_ALT_CHAR;
00202 }
00203 scan++;
00204 }
00205 }
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238
00239
00240
00241
00242
00243
00244
00245
00246
00247
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00277 UtCode ut_eol_pass (UtText * text) {
00278
00279 char * scan = text->data;
00280 char * scan_end = text->data+text->size;
00281 ASSERT ( *scan_end == 0 )
00282
00283 text->nb_lines = 0;
00284 text->nb_lines_alt = 0;
00285 ulong cumul=1;
00286
00287
00288
00289 UtEolType eol1 = UT_EOL_NONE;
00290 UtEolType eol2 = UT_EOL_NONE;
00291
00292
00293 for (;;) {
00294 DBG3_S ("<%d>", *scan);
00295
00296 if ((u_char)*scan<0x20) {
00297 if (!*scan) {
00298 if (scan>=scan_end) {
00299 ASSERT (scan==scan_end)
00300 break;
00301 } else if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
00302 if (!ut_update_progress (text, scan - text->data, false)) break;
00303 cumul++;
00304 }
00305 }
00306 if (*scan == 0xA) {
00307 DBG3_S ("*");
00308 if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
00309 ut_update_progress (text, scan - text->data, false);
00310 cumul++;
00311 }
00312
00313 if (*(scan+1) == 0xD) {
00314 switch (eol1) {
00315 case UT_EOL_LFCR:
00316 case UT_EOL_MIX:
00317 if (*(scan+2) == 0xA) goto LF_only;
00318 break;
00319 case UT_EOL_CRLF:
00320 if (*(scan+2) == 0xA) goto LF_only;
00321 eol1 = UT_EOL_MIX;
00322 if (eol2 != UT_EOL_NONE) {
00323 ERROR ("EOL2 todo...")
00324 }
00325 break;
00326 case UT_EOL_CR:
00327 case UT_EOL_LF:
00328 if (*(scan+2) == 0xA) goto LF_only;
00329 ASSERT (eol2 == UT_EOL_NONE)
00330 eol2 = eol1;
00331 text->nb_lines_alt = text->nb_lines;
00332 text->nb_lines = 0;
00333 *scan = UT_EOL_CHAR;
00334 ut_change_EOL1toEOL2 (text->data, scan);
00335 case UT_EOL_NONE:
00336 eol1 = UT_EOL_LFCR;
00337 break;
00338 default:
00339 ERROR ("Forbiden case!?!")
00340 }
00341 *scan++ = UT_EOL_CHAR;
00342 *scan++ = text->skip_char;
00343 text->nb_lines++;
00344 } else {
00345 LF_only:
00346 switch (eol1) {
00347 case UT_EOL_NONE:
00348 eol1 = UT_EOL_LF;
00349 case UT_EOL_LF:
00350 case UT_EOL_MIX:
00351 *scan++ = UT_EOL_CHAR;
00352 text->nb_lines++;
00353 break;
00354 case UT_EOL_CR:
00355 eol1 = UT_EOL_MIX;
00356 *scan++ = UT_EOL_CHAR;
00357 text->nb_lines++;
00358 break;
00359 case UT_EOL_CRLF:
00360 case UT_EOL_LFCR:
00361 switch (eol2) {
00362 case UT_EOL_NONE:
00363 eol2 = UT_EOL_LF;
00364 break;
00365 case UT_EOL_CR:
00366 eol2 = UT_EOL_MIX;
00367 case UT_EOL_LF:
00368 case UT_EOL_MIX:
00369 break;
00370 default:
00371 ERROR ("Forbiden case!?!")
00372 }
00373 *scan++ = UT_EOL_ALT_CHAR;
00374 text->nb_lines_alt++;
00375 break;
00376 default:
00377 ERROR ("Forbiden case!?!")
00378 }
00379 }
00380 } else if (*scan == 0xD) {
00381 DBG3_S ("*");
00382 if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
00383 ut_update_progress (text, scan - text->data, false);
00384 cumul++;
00385 }
00386
00387 if (*(scan+1) == 0xA) {
00388 switch (eol1) {
00389 case UT_EOL_CRLF:
00390 case UT_EOL_MIX:
00391 break;
00392 case UT_EOL_LFCR:
00393 eol1 = UT_EOL_MIX;
00394 if (eol2 != UT_EOL_NONE) {
00395 ERROR ("EOL2 todo...")
00396 }
00397 break;
00398 case UT_EOL_CR:
00399 case UT_EOL_LF:
00400 ASSERT (eol2 == UT_EOL_NONE)
00401 eol2 = eol1;
00402 text->nb_lines_alt = text->nb_lines;
00403 text->nb_lines = 0;
00404 *scan = UT_EOL_CHAR;
00405 ut_change_EOL1toEOL2 (text->data, scan);
00406 case UT_EOL_NONE:
00407 eol1 = UT_EOL_CRLF;
00408 break;
00409 default:
00410 ERROR ("Forbiden case!?!")
00411 }
00412 *scan++ = UT_EOL_CHAR;
00413 *scan++ = text->skip_char;
00414 text->nb_lines++;
00415 } else {
00416 switch (eol1) {
00417 case UT_EOL_NONE:
00418 eol1 = UT_EOL_CR;
00419 case UT_EOL_CR:
00420 case UT_EOL_MIX:
00421 *scan++ = UT_EOL_CHAR;
00422 text->nb_lines++;
00423 break;
00424 case UT_EOL_LF:
00425 eol1 = UT_EOL_MIX;
00426 *scan++ = UT_EOL_CHAR;
00427 text->nb_lines++;
00428 break;
00429 case UT_EOL_CRLF:
00430 case UT_EOL_LFCR:
00431 switch (eol2) {
00432 case UT_EOL_CR:
00433 case UT_EOL_MIX:
00434 break;
00435 case UT_EOL_NONE:
00436 eol2 = UT_EOL_CR;
00437 break;
00438 case UT_EOL_LF:
00439 eol2 = UT_EOL_MIX;
00440 break;
00441 default:
00442 ERROR ("Forbiden case!?!")
00443 }
00444 *scan++ = UT_EOL_ALT_CHAR;
00445 text->nb_lines_alt++;
00446 break;
00447 default:
00448 ERROR ("Forbiden case!?!")
00449 }
00450 }
00451 } else if (*scan == 0x9 ) {
00452 scan++;
00453 } else if (text->flags & UT_F_REMOVE_ILLEGAL_CHAR) {
00454 *scan++ = text->skip_char;
00455 }
00456
00457 } else {
00458 if (*scan == 0x7F && (text->flags & UT_F_REMOVE_ILLEGAL_CHAR) ) {
00459 *scan++ = text->skip_char;
00460 } else {
00461 scan++;
00462 }
00463 }
00464 }
00465
00466
00467 if (scan<scan_end) {
00468 return UT_INTERRUPTED_BY_USER;
00469 }
00470
00471 if (text->flags & UT_F_ADD_FINAL_EOL) {
00472
00473 if ( (*(scan-2) != UT_EOL_CHAR || *(scan-1) != text->skip_char)
00474 && *(scan-1) != UT_EOL_CHAR ) {
00475 if (text->flags & UT_F_TRANSFORM_EOL) {
00476 *scan = UT_EOL_CHAR;
00477 text->size++;
00478 }
00479
00480
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493 text->nb_lines++;
00494 }
00495 }
00496
00497 if (text->eol == UT_EOL_UNSET) {
00498 text->eol = eol1;
00499 text->eol_alt = eol2;
00500 } else {
00501 text->nb_lines = UT_UNSET;
00502 text->nb_lines_alt = UT_UNSET;
00503 }
00504
00505
00506 ASSERT (*scan == UT_EOF_CHAR)
00507
00508 DBG2 ("End Of Line pass done! (%lu B)", text->size)
00509
00510 return UT_OK;
00511 }
00512
00513
00514
00515
00516
00517
00518
00519
00520
00521
00522
00523
00524
00525
00526
00527
00528
00529
00530
00531
00532
00533
00534
00535
00536
00537
00538
00539
00540
00541
00542
00543
00544
00545
00546
00547
00548
00549
00550
00551
00552