00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00031 #define _UTRAC_C_
00032
00033 #include <stdlib.h>
00034 #include <stdio.h>
00035 #include "utrac.h"
00036
00037 #undef UT_DEBUG
00038 #define UT_DEBUG 3
00039 #include "debug.h"
00040
00041
00042
00056 UtCode ut_init () {
00057
00058 if (ut_session) return UT_ALREADY_INITIALISED_ERROR;
00059
00060 ut_session = (UtSession*) malloc (sizeof(UtSession));
00061 if (!ut_session) return UT_MALLOC_ERROR;
00062
00063 return ut_init_noalloc();
00064 }
00065
00069 UtCode ut_init_noalloc () {
00070
00071 ut_session->charset = NULL;
00072 ut_session->nb_charsets = 0;
00073 ut_session->language.name = NULL;
00074 ut_session->language.code = NULL;
00075 ut_session->language.n = 0;
00076 ut_session->language.n_max = 0;
00077 ut_session->system.name = NULL;
00078 ut_session->system.code = NULL;
00079 ut_session->system.n = 0;
00080 ut_session->system.n_max = 0;
00081
00082 ut_session->eol_default = UT_EOL_UNSET;
00083 ut_session->eol_alt_default = UT_EOL_UNSET;
00084
00085 ut_session->nomapping_char = '_';
00086 ut_session->progress_function = NULL;
00087 ut_session->error_string = NULL;
00088
00089 UT_TRY (ut_load_charsets ())
00090
00091
00092 #ifdef linux
00093
00094 int i;
00095 ut_session->language_default = 0;
00096 ut_session->system_default = 3;
00097 ut_session->eol_default = UT_EOL_LF;
00098 ut_session->eol_alt_default = UT_EOL_LF;
00099 ut_session->charset_default = ut_find_charset("ISO-8859-1");
00100
00101 char * def_enc = getenv ("LC_CTYPE");
00102 if (!def_enc) def_enc = getenv ("LC_ALL");
00103 if (!def_enc) def_enc = getenv ("LANG");
00104 if (def_enc) {
00105 if (def_enc[2]=='_' || def_enc[2]=='.' || def_enc[2]==0) {
00106 for (i=0; i<ut_session->language.n; i++) {
00107 if (def_enc[0]-'a'+'A'== ut_session->language.code[i*2+0]
00108 && def_enc[1]-'a'+'A'== ut_session->language.code[i*2+1] ) {
00109 ut_session->language_default = i;
00110 break;
00111 }
00112 }
00113 }
00114 if (def_enc[2]=='.') def_enc +=3;
00115 if (def_enc[2]=='_' && def_enc[5]=='.') def_enc +=6;
00116 for (i=0; i<ut_session->nb_charsets; i++)
00117 if (ut_str_fuzzy_cmp (def_enc, ut_session->charset[i].name,'@')) break;
00118 if (i!=ut_session->nb_charsets) ut_session->charset_default = i;
00119 }
00120
00121 if (ut_session->charset_default == UT_UNSET) {
00122 for (i=0; i<ut_session->nb_charsets; i++)
00123 if (ut_str_fuzzy_cmp (UT_DEFAULT_ENCODING_UNIX, ut_session->charset[i].name,0)) break;
00124 if (i==ut_session->nb_charsets) {
00125 DBG1 ("*** No default charset ***")
00126 }
00127 else ut_session->charset_default = i;
00128 }
00129 #else
00130 ERROR ("pas unix!")
00131 #endif
00132
00133 #if UT_DEBUG == 2
00134 if (ut_session->language_default != UT_UNSET)
00135 DBG2 ("lang: %s" , ut_session->language.name[ut_session->language_default])
00136 if (ut_session->charset_default != UT_UNSET)
00137 DBG2 ("charset: %s", ut_session->charset[ut_session->charset_default].name)
00138 if (ut_session->eol_default != UT_EOL_UNSET)
00139 DBG2 ("eol: %s", UT_EOL_NAME [ut_session->eol_default])
00140 #endif
00141
00142 return UT_OK;
00143 }
00144
00154 void ut_finish () {
00155
00156 ut_finish_nofree ();
00157 free(ut_session);
00158 ut_session = NULL;
00159
00160 return;
00161 }
00162
00166 void ut_finish_nofree () {
00167
00168 if (!ut_session) return;
00169
00170 int i; for(i=0; i<ut_session->nb_charsets; i++) {
00171 free(ut_session->charset[i].name);
00172 free(ut_session->charset[i].alias);
00173 free(ut_session->charset[i].common_name);
00174 free(ut_session->charset[i].comment);
00175 free(ut_session->charset[i].unicode);
00176 free(ut_session->charset[i].char_type);
00177 free(ut_session->charset[i].language);
00178 free(ut_session->charset[i].system);
00179 }
00180 free (ut_session->charset);
00181
00182 for (i=0; i<ut_session->language.n; i++)
00183 free (ut_session->language.name[i]);
00184 free (ut_session->language.name);
00185 free (ut_session->language.code);
00186
00187 for (i=0; i<ut_session->system.n; i++)
00188 free (ut_session->system.name[i]);
00189 free (ut_session->system.name);
00190 free (ut_session->system.code);
00191
00192 free (ut_session->error_string);
00193 return;
00194 };
00195
00196
00197
00198
00199
00206 UtText * ut_init_text_heap () {
00207 ASSERT (ut_session)
00208 UtText* new_text = (UtText*) malloc (sizeof(UtText));
00209 if (!new_text) return NULL;
00210
00211 ut_init_text (new_text);
00212
00213 return new_text;
00214 }
00215
00221 void ut_init_text (UtText * new_text) {
00222
00223 new_text->data = NULL;
00224 new_text->size = 0;
00225
00226 new_text->eol = UT_EOL_UNSET;
00227 new_text->eol_alt = UT_EOL_UNSET;
00228 new_text->charset = UT_UNSET;
00229
00230 new_text->nb_lines = UT_UNSET;
00231 new_text->nb_lines_alt = UT_UNSET;
00232 new_text->distribution = NULL;
00233
00234 new_text->ext_char = NULL;
00235 new_text->evaluation = NULL;
00236
00237 new_text->flags = UT_F_DEFAULT;
00238 new_text->pass_flags = UT_PF_UNSET;
00239 new_text->skip_char = UT_SKIP_CHAR;
00240
00241 new_text->progress_done = 0.0;
00242 new_text->progress_todo = 0;
00243 new_text->current_pass = UT_PF_UNSET;
00244
00245 new_text->user = NULL;
00246 }
00247
00253 void ut_free_text_heap (UtText *text) {
00254
00255 ut_free_text (text);
00256 free(text);
00257
00258 }
00259
00265 void ut_free_text (UtText *text) {
00266
00267
00268 free(text->data); text->data = NULL;
00269 free(text->distribution); text->distribution = NULL;
00270 while (text->ext_char) {
00271 UtExtCharLine * tmp = text->ext_char;
00272 text->ext_char = text->ext_char->next;
00273 free (tmp);
00274 } text->ext_char = NULL;
00275
00276 free(text->evaluation); text->evaluation = NULL;
00277
00278 }
00279
00280
00289 UtCode ut_init_progress (UtText *text) {
00290
00291 ASSERT (text);
00292
00293 text->progress_done = 0.0;
00294 text->progress_todo = 0;
00295 if (text->pass_flags == UT_PF_UNSET) text->pass_flags = UT_PF_RECOGNIZE;
00296
00297 if (text->pass_flags & UT_PF_LOAD ) text->progress_todo++;
00298
00299 if (text->pass_flags & UT_PF_RECOGNIZE ) {
00300 if ((text->flags & UT_F_IDENTIFY_CHARSET) || (text->pass_flags & UT_PF_CONVERT ) )
00301 text->pass_flags |= UT_PF_DISTRIB_PASS;
00302 else text->pass_flags &= ~UT_PF_DISTRIB_PASS;
00303 if (text->flags & (UT_F_TRANSFORM_EOL | UT_F_REMOVE_ILLEGAL_CHAR | UT_F_ADD_FINAL_EOL | UT_F_IDENTIFY_EOL ) )
00304 text->pass_flags |= UT_PF_EOL_PASS;
00305 else text->pass_flags &= ~UT_PF_EOL_PASS;
00306
00307 if (text->flags & (UT_F_IDENTIFY_CHARSET | UT_F_REFERENCE_EXT_CHAR ) )
00308 text->pass_flags |= UT_PF_XASCII_PASS;
00309 else text->pass_flags &= ~UT_PF_XASCII_PASS;
00310
00311 if (text->pass_flags & UT_PF_DISTRIB_PASS) text->progress_todo++;
00312 if (text->pass_flags & UT_PF_EOL_PASS) text->progress_todo++;
00313 if (text->pass_flags & UT_PF_XASCII_PASS) text->progress_todo++;
00314 } else {
00315 text->pass_flags &= ~(UT_PF_DISTRIB_PASS | UT_PF_EOL_PASS | UT_PF_XASCII_PASS);
00316 }
00317
00318 if (text->pass_flags & UT_PF_CONVERT ) text->progress_todo++;
00319
00320 return UT_OK;
00321 }
00322
00330 UtCode ut_load (UtText *text, const char * filename) {
00331
00332 ASSERT (text);
00333
00334 if (text->pass_flags==UT_PF_UNSET) {
00335 text->pass_flags |= UT_PF_LOAD | UT_PF_RECOGNIZE;
00336 ut_init_progress(text);
00337 }
00338
00339 if (ut_session->progress_function && text->progress_done == 0.0) ut_update_progress (text, 0, true);
00340
00341 text->current_pass = UT_PF_LOAD;
00342
00343 if (filename) {
00344 UT_TRY ( ut_load_file_pass (text, filename) )
00345 } else {
00346 UT_TRY ( ut_load_stdin_pass (text) )
00347 }
00348
00349 text->current_pass = UT_PF_NONE;
00350
00351 if (ut_session->progress_function) {
00352 text->progress_done+= (1-text->progress_done)/text->progress_todo;
00353 text->progress_todo--;
00354 }
00355
00356
00357 if (ut_session->progress_function && text->progress_todo == 0) ut_update_progress (text, 0, true);
00358
00359 return UT_OK;
00360 }
00361
00362
00387 UtCode ut_recognize (UtText *text) {
00388
00389 if (!text || !text->data) return UT_BAD_PARAMETER_ERROR;
00390
00391 if (text->pass_flags==UT_PF_UNSET) ut_init_progress(text);
00392
00393 if (ut_session->progress_function && text->progress_done == 0.0) ut_update_progress (text, 0, true);
00394
00395
00396 if (text->pass_flags & UT_PF_DISTRIB_PASS) {
00397 text->current_pass = UT_PF_DISTRIB_PASS | UT_PF_RECOGNIZE;
00398 int rcode = ut_distrib_utf_pass (text);
00399 text->current_pass = UT_PF_NONE;
00400
00401 if (rcode == UT_BINARY_DATA_ERROR) {
00402 if ( !(text->flags & UT_F_FORCE_BINARY)) return rcode;
00403 } else if ( rcode != UT_OK) return rcode;
00404
00405 if (text->charset != UT_UNSET && text->pass_flags & UT_PF_XASCII_PASS) {
00406 text->pass_flags &= ~UT_PF_XASCII_PASS | UT_PF_RECOGNIZE;
00407 text->progress_todo--;
00408 }
00409
00410 if (ut_session->progress_function) {
00411 text->progress_done+= (1-text->progress_done)/text->progress_todo;
00412 text->progress_todo--;
00413 }
00414 }
00415
00416
00417 if (text->flags & UT_F_REMOVE_ILLEGAL_CHAR ) {
00418 text->skip_char = UT_SKIP_CHAR;
00419 } else {
00420
00421 int i; for (i=1; i<0x20; i++) {
00422 if (i==UT_EOL_ALT_CHAR || i== 0x9|| i==0xA || i==0xD) continue;
00423 if (!text->distribution[i]) break;
00424 }
00425 if (i!=0x20) text->skip_char = i;
00426 else text->skip_char = UT_SKIP_CHAR;
00427 }
00428
00429
00430
00431
00432 if (text->pass_flags & UT_PF_EOL_PASS) {
00433 text->current_pass = UT_PF_EOL_PASS | UT_PF_RECOGNIZE;
00434 UT_TRY ( ut_eol_pass (text) )
00435 text->current_pass = UT_PF_NONE;
00436 if (ut_session->progress_function) {
00437 text->progress_done+= (1-text->progress_done)/text->progress_todo;
00438 text->progress_todo--;
00439 }
00440 }
00441
00442
00443 if ( text->pass_flags & UT_PF_XASCII_PASS ) {
00444 text->current_pass = UT_PF_XASCII_PASS | UT_PF_RECOGNIZE;
00445 UT_TRY ( ut_xascii_pass (text) )
00446 text->current_pass = UT_PF_NONE;
00447 if (ut_session->progress_function) {
00448 text->progress_done+= (1-text->progress_done)/text->progress_todo;
00449 text->progress_todo--;
00450 }
00451 }
00452
00453 if (ut_session->progress_function && text->progress_todo == 0) ut_update_progress (text, 0, true);
00454
00455 return UT_OK;
00456 }
00457
00458
00470 UtCode ut_convert (UtText *src_text, UtText *dst_text) {
00471
00472 if (!src_text || !src_text->data) return UT_BAD_PARAMETER_ERROR;
00473
00474 ASSERT (src_text->eol != UT_EOL_UNSET)
00475 ASSERT (src_text->charset != UT_UNSET)
00476 ASSERT (src_text->distribution)
00477
00478 bool same_text = false;
00479 if (!dst_text) {
00480 same_text = true;
00481 dst_text = ut_init_text_heap ();
00482 if (!dst_text) return UT_MALLOC_ERROR;
00483 }
00484
00485 ASSERT (dst_text)
00486
00487 if (src_text->pass_flags==UT_PF_UNSET) {
00488 src_text->pass_flags |= UT_PF_CONVERT;
00489 ut_init_progress(src_text);
00490 }
00491
00492
00493 if (ut_session->progress_function && src_text->progress_done == 0.0) ut_update_progress (src_text, 0, true);
00494
00495 if (dst_text->eol == UT_EOL_UNSET) dst_text->eol = ut_session->eol_default;
00496 if (dst_text->eol_alt == UT_EOL_UNSET) dst_text->eol_alt = ut_session->eol_alt_default;
00497 if (dst_text->charset == UT_UNSET) dst_text->charset = ut_session->charset_default;
00498
00499 src_text->current_pass = UT_PF_CONVERT;
00500 UT_TRY ( ut_conversion_pass (src_text, dst_text) )
00501 src_text->current_pass = UT_PF_NONE;
00502
00503 if (ut_session->progress_function) {
00504 src_text->progress_done+= (1-src_text->progress_done)/src_text->progress_todo;
00505 src_text->progress_todo--;
00506 }
00507
00508 if (ut_session->progress_function && src_text->progress_todo == 0) ut_update_progress (src_text, 0, true);
00509
00510 if (same_text) {
00511 free (src_text->data);
00512 src_text->data = dst_text->data;
00513 dst_text->data = NULL;
00514 src_text->size = dst_text->size ;
00515 src_text->eol = dst_text->eol ;
00516 src_text->eol_alt = dst_text->eol_alt ;
00517 src_text->charset = dst_text->charset ;
00518 free (src_text->distribution);
00519 src_text->distribution = NULL;
00520 while (src_text->ext_char) {
00521 UtExtCharLine * tmp = src_text->ext_char;
00522 src_text->ext_char = src_text->ext_char->next;
00523 free (tmp);
00524 } src_text->ext_char = NULL;
00525 free(src_text->evaluation);
00526 src_text->evaluation = NULL;
00527 ut_free_text_heap (dst_text);
00528 }
00529
00530 return UT_OK;
00531 }
00532
00533
00534
00535
00536
00537
00538
00539
00540
00541
00542
00543
00544
00545
00546
00547
00548
00549
00550
00551
00552
00553
00554
00555
00556
00557
00558
00559
00560
00561
00562
00563
00564
00565
00566
00567
00568
00569
00570
00571
00572
00573
00574
00575
00576
00577
00578