00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00032 #include <stdlib.h>
00033 #include <stdio.h>
00034 #include <string.h>
00035 #include <limits.h>
00036 #include <endian.h>
00037 #include <math.h>
00038
00039 #include "utrac.h"
00040
00041 #undef UT_DEBUG
00042 #define UT_DEBUG 3
00043 #include "debug.h"
00044
00045 #define ESC_RED "\e[31m"
00046 #define ESC_NONE "\e[0m"
00047 const char * NONE_STR = "none";
00048
00049
00050
/**
 * Draws a one-line textual progress bar on standard output.
 *
 * The bar is BAR_SIZE characters wide: '*' for the completed part, '-' for
 * the remainder. It is framed by carriage returns so that successive calls
 * overwrite each other on a terminal.
 *
 * @param rate  completion ratio. The drawn ratio is clamped to [0,1] (NaN is
 *              drawn as 0) so the bar never exceeds its width. The
 *              "Rate = 0" and " fin!" messages are only printed when rate is
 *              exactly 0 and exactly 1 respectively.
 * @return always true.
 */
int custom_pb_fct (float rate) {

	enum { BAR_SIZE = 100 };

	if (!rate) printf("Rate = 0\n");

	/* Clamp only the value used for drawing; !(x > 0) also catches NaN. */
	float drawn = rate;
	if (!(drawn > 0.0f)) drawn = 0.0f;
	else if (drawn > 1.0f) drawn = 1.0f;

	putchar('\r');
	int i;
	for (i = 0; i < BAR_SIZE * drawn; i++) putchar('*');
	for (; i < BAR_SIZE; i++) putchar('-');
	putchar('\r');

	if (rate == 1.0) printf ("\n fin!\n");

	/* The bar contains no newline, so a line-buffered stdout would
	   otherwise keep it in the buffer instead of displaying it. */
	fflush (stdout);

	return true;
}
00070
00071
00072
00073
00074 UtCode parse_charset_eol (char * charset_eol, int *charset_ip, UtEolType * eol_type) {
00075
00076 char * separator = charset_eol;
00077
00078 for (;;) {
00079 if (!*separator) {
00080 separator = NULL;
00081 break;
00082 } else if (*separator != '/') {
00083 separator++;
00084 } else {
00085 *separator = 0;
00086 break;
00087 }
00088 }
00089
00090 *charset_ip = ut_find_charset (charset_eol);
00091
00092 if (*charset_ip != UT_UNSET && !separator) {
00093 *eol_type = UT_EOL_UNSET;
00094 return UT_OK;
00095 }
00096
00097
00098 if (separator) {
00099 *separator = '/';
00100 *eol_type = ut_find_eol (separator+1);
00101 } else {
00102 *eol_type = ut_find_eol (charset_eol);
00103 }
00104
00105 if (*eol_type == UT_EOL_UNSET || (separator && *charset_ip == UT_UNSET))
00106 return UT_NOT_FOUND_ERROR;
00107
00108 return UT_OK;
00109 }
00110
00111
00112 void test_rcode (UtCode rcode, char * argv[]) {
00113 if (rcode!=UT_OK) {
00114 fprintf (stderr, "%s: %s (error %d)\n", argv[0], ut_error_message(rcode), rcode);
00115 if (ut_session->error_string) fprintf (stderr, "%s: %s\n", argv[0], ut_session->error_string);
00116 exit(rcode);
00117 }
00118 }
00119
00120
00121 void print_all_ratings (UtText *text) {
00122 ASSERT (text->evaluation)
00123
00124 ulong prev_checksum = 0;
00125 long prev_rating = LONG_MIN;
00126 long rating;
00127
00128 bool *done = malloc (sizeof(bool)*ut_session->nb_charsets);
00129 int i; for (i=0; i<ut_session->nb_charsets; i++) done[i] = false;
00130 printf ("With locale Brut Checksum Name(s)");
00131 for (;;) {
00132 long rating_max = LONG_MIN;
00133 short index = -1;
00134
00135 for (i=0; i<ut_session->nb_charsets; i++) {
00136 if (done[i] || ut_session->charset[i].type != UT_CST_ASCII_EXTENSION) continue;
00137
00138 rating = text->evaluation[i].rating * ut_get_charset_coef (i);
00139
00140
00141
00142
00143 if (rating == prev_rating ) {
00144 rating_max = rating;
00145 index = i;
00146
00147 if (text->evaluation[i].checksum == prev_checksum) break;
00148 } else if (rating > rating_max) {
00149 rating_max = rating;
00150 index = i;
00151
00152 }
00153 }
00154 if (index == -1) break;
00155 if ( i == ut_session->nb_charsets) {
00156 printf ("\n%8ld %8ld (%8lx) %s",
00157 (long) (text->evaluation [index].rating * ut_get_charset_coef (index)),
00158 text->evaluation [index].rating, text->evaluation [index].checksum,
00159 ut_session->charset [index].name );
00160 } else {
00161 printf (", %s", ut_session->charset [index].name);
00162 }
00163 prev_checksum = text->evaluation [index].checksum;
00164 prev_rating = rating_max;
00165 done[index] = true;
00166
00167 }
00168 free(done);
00169 putchar('\n');
00170 }
00171
00172
00173
00174
00175 void print_parameters () {
00176 printf ("Language: ");
00177 if ( ut_session->language_default == UT_UNSET) printf ("None\n");
00178 else printf ("%s\n", ut_session->language.name[ut_session->language_default]);
00179 printf ("System: ");
00180 if (ut_session->system_default == UT_UNSET) printf ("None\n");
00181 else printf ("%s\n", ut_session->system.name[ut_session->system_default]);
00182 printf ("Output charset: %s\n", ut_session->charset[ut_session->charset_default].name);
00183 printf ("Output EOL: %s\n", UT_EOL_NAME[ut_session->eol_default]);
00184 printf ("Error character: ");
00185 if (ut_session->nomapping_char < 0x80) {
00186 printf("'%c'\n", (char) ut_session->nomapping_char);
00187 } else {
00188 printf ("(not yet coded, sorry)\n");
00189
00190
00191 }
00192 }
00193
00194
00195
00196
00197 void print_list () {
00198 int j;
00199 printf("Charsets:\n");
00200 for (j=0; j<ut_session->nb_charsets; j++) {
00201 printf (" %s", ut_session->charset[j].name);
00202 if (ut_session->charset[j].alias)
00203 printf (", %s", ut_session->charset[j].alias);
00204 putchar('\n');
00205 }
00206 printf ("EOL: ");
00207 for (j=0; j<= UT_EOL_NONE; j++) {
00208 printf ("%s", UT_EOL_NAME[j]);
00209 if (j<UT_EOL_NONE) printf(", ");
00210 else putchar('\n');
00211 }
00212 printf ("Languages:\n");
00213 for (j=0; j < ut_session->language.n; j++) {
00214 printf (" %c%c", ut_session->language.code[j*2+0], ut_session->language.code[j*2+1] );
00215 printf (" (%s)\n", ut_session->language.name[j]);
00216
00217
00218 }
00219 printf ("Systems:\n");
00220 for (j=0; j < ut_session->system.n; j++) {
00221 printf (" %c%c", ut_session->system.code[j*2+0], ut_session->system.code[j*2+1] );
00222 printf (" (%s)\n", ut_session->system.name[j]);
00223
00224
00225 }
00226
00227
00228
00229 }
00230
00231
00232 void print_distribution (UtText *text) {
00233
00234 uint i, j, max=0;
00235 char scan;
00236 const int COLS_N = 8;
00237 UtCharset * cs = &(ut_session->charset[ut_session->charset_default]);
00238
00239 for (i=0; i<256; i++)
00240 if (text->distribution[i] > max) max = text->distribution[i];
00241 max = ((uint) floor (log10 ((double) max)) )+1;
00242
00243 int nb_lines = 256/COLS_N;
00244 if (256%COLS_N) nb_lines++;
00245 for (i=0; i<nb_lines; i++) {
00246 for (j=i; j<256; j+=nb_lines) {
00247
00248 if ( j < ' ' || j==0x7F
00249 || ( 0x80 <= j && (cs->type==UT_CST_ASCII || cs->type==UT_CST_UTF_8) )
00250 || ( cs->char_type && cs->char_type[j].categorie == UT_CTG_CONTROL) )
00251 scan = '.';
00252 else
00253 scan = j;
00254
00255 if (text->distribution[j]) printf("<%2x+%c> %*lu ", j, scan, max,text->distribution[j]);
00256 else printf("<%2x-%c> %.*s ", j, scan, max, " ");
00257 }
00258 printf("\n");
00259 }
00260
00261 }
00262
00263
00264
00265 void print_ext_chars (UtText *text, bool use_color_b) {
00266 UtExtCharLine *scan_exl = text->ext_char;
00267 char * scan_char;
00268 while (scan_exl) {
00269 printf ("%3lu [%lu]: ", scan_exl->line_i, scan_exl->nb_ext_chars);
00270 scan_char = scan_exl->line_p;
00271 while (*scan_char) {
00272 if ((u_char)*scan_char<0x80) putchar(*scan_char++);
00273 else {
00274 char dst_char[5];
00275 char *dst_char_p = dst_char;
00276 ut_conv_char (&scan_char, &dst_char_p, text->charset, ut_session->charset_default);
00277 *dst_char_p = 0;
00278 if (use_color_b) printf (ESC_RED);
00279 printf ("%s", dst_char);
00280 if (use_color_b) printf (ESC_NONE);
00281
00282
00283
00284 }
00285
00286 }
00287 putchar ('\n');
00288 scan_exl = scan_exl->next;
00289 }
00290 }
00291
00292
00293 void all_ext_chars (UtText *text, bool use_color_b) {
00294
00295 int i, j;
00296
00297 printf(" |");
00298 for (j=0x80; j<0x100; j++) {
00299 if (!text->distribution[j]) continue;
00300 printf (" %2X |", j);
00301 }
00302
00303 char in[1], out[7], *in_p, *out_p;
00304
00305 for (i=0; i<ut_session->nb_charsets; i++) {
00306 if (ut_session->charset[i].type != UT_CST_ASCII_EXTENSION) continue;
00307 printf("\n%13.13s |", ut_session->charset[i].name);
00308
00309 for (j=0x80; j<0x100; j++) {
00310 if (!text->distribution[j]) continue;
00311 in[0] = j; in_p = in;
00312 out_p = out;
00313 ut_conv_char (&in_p, &out_p, i, ut_session->charset_default);
00314 *out_p = 0;
00315 if (use_color_b) printf (ESC_RED);
00316 printf (" %s ", out);
00317 if (use_color_b) printf (ESC_NONE);
00318 putchar ('|');
00319 }
00320 }
00321
00322 putchar('\n');
00323
00324 }
00325
00326
00327 int callback (UtText * text, float progress) {
00328
00329 ut_print_binary (text->current_pass);
00330 printf (" : progress : %f\n", progress);
00331
00332 return true;
00333 }
00334
00335
00336 int main (int argc, char * argv[]) {
00337
00338 int i;
00339
00340 for (i=1; i<argc; i++) {
00341
00342
00343 if (!strcmp (argv[i], "-h") || !strcmp (argv[i], "--help")) {
00344 fprintf (stderr,
00345 "usage: %s [OPTION] [FILE]\n"
00346 "With no FILE, read standard input; with no OPTION, recognize and write converted text to standard output.\n"
00347 "LC_ALL, LC_TYPE, LANG are read to determine prefered language and output charset.\n"
00348 "\n"
00349 " -i --file-info Print file information\n"
00350 " -p --print-charset Print recognized charset\n"
00351 " -P --print-all-charset Print ranked list of charsets\n"
00352 "\n"
00353 " -f --from Force input charset (disable recognition)\n"
00354 " -t --to Select output charset\n"
00355 " -L --language Select language\n"
00356 " -S --system Select system\n"
00357 "\n"
00358 " -x --ext-chars Print lines with extended characters\n"
00359 " -c --colors (with -x) Use colors\n"
00360 " -z --distribution Print distribution\n"
00361 " -a --all-ext-chars Print all ext chars in each charset\n"
00362 "\n"
00363 " -b --bar Display a progress bar\n"
00364 " -l --list List charsets/eol/lanuages/systems\n"
00365 " -d --default-info Print default/chosen parameters\n"
00366 " -v --version Print version\n"
00367 " -h --help Print this help\n"
00368 "\n"
00369 "For more information, try: man utrac\n",
00370 argv[0]
00371 );
00372 exit(0);
00373 } else if (!strcmp (argv[i], "-v") || !strcmp (argv[i], "--version")) {
00374 fprintf (stderr, "Utrac Universal Text Recognizer And Converter (version " UT_VERSION ")\n"
00375 "Written by Antoine Calando - Alliance MCA (antoine@alliancemca.net)\n");
00376 exit(0);
00377 }
00378 }
00379
00380 UtCode rcode = ut_init ();
00381 test_rcode (rcode,argv);
00382
00383 const char * filename = NULL;
00384
00385 int src_charset = UT_UNSET, dst_charset = UT_UNSET;
00386 UtEolType src_eol = UT_EOL_UNSET, dst_eol = UT_EOL_UNSET;
00387 bool print_ext_char_b = false;
00388 bool use_color_b = false;
00389 bool print_charset_name_b = false;
00390 bool print_all_ratings_b = false;
00391 bool print_parameters_b = false;
00392 bool print_list_b = false;
00393 bool print_file_info_b = false;
00394 bool print_distribution_b = false;
00395 bool progress_bar_b = false;
00396 bool convert_b = false;
00397 bool all_ext_chars_b = false;
00398
00399 for (i=1; i<argc; i++) {
00400 if (!strcmp (argv[i], "-f") || !strcmp (argv[i], "--from")) {
00401
00402 if (++i==argc || parse_charset_eol (argv[i], &src_charset, &src_eol)!=UT_OK) {
00403 fprintf (stderr, "%s : error invalid charset or EOL %s\nTry `%s --help' for more information\n", argv[0], argv[i], argv[0]);
00404 ut_finish (); exit(-1);
00405 }
00406 if (src_eol!=UT_EOL_UNSET) {
00407 fprintf (stderr, "%s : warning input EOL type is ignored\n", argv[0]);
00408 }
00409 } else if (!strcmp (argv[i], "-t") || !strcmp (argv[i], "--to")) {
00410 if (++i==argc || parse_charset_eol (argv[i], &dst_charset, &dst_eol)!=UT_OK) {
00411 fprintf (stderr, "%s : error invalid charset %s\nTry `%s --help' for more information\n", argv[0], argv[i], argv[0]);
00412 ut_finish (); exit(-1);
00413 }
00414 } else if (!strcmp (argv[i], "-x") || !strcmp (argv[i], "--ext-chars")) {
00415 print_ext_char_b = true;
00416 } else if (!strcmp (argv[i], "-c") || !strcmp (argv[i], "--colors")) {
00417 use_color_b = true;
00418 } else if (!strcmp (argv[i], "-i") || !strcmp (argv[i], "--file-info")) {
00419 print_file_info_b = true;
00420 } else if (!strcmp (argv[i], "-p") || !strcmp (argv[i], "--print-charset")) {
00421 print_charset_name_b = true;
00422 } else if (!strcmp (argv[i], "-P") || !strcmp (argv[i], "--print-all-charsets")) {
00423 print_all_ratings_b = true;
00424 } else if (!strcmp (argv[i], "-L") || !strcmp (argv[i], "--language")) {
00425 int language_id;
00426 if (++i==argc ||
00427 ( ( language_id = ut_find_lang_sys (argv[i], &ut_session->language)) == UT_UNSET
00428 && !ut_str_fuzzy_cmp (argv[i], NONE_STR, 0) ) ) {
00429 fprintf (stderr, "%s : error invalid language %s\nTry `%s --help' for more information\n", argv[0], argv[i], argv[0]);
00430 ut_finish (); exit(-1);
00431 }
00432 ut_session->language_default = language_id;
00433 } else if (!strcmp (argv[i], "-S") || !strcmp (argv[i], "--system")) {
00434 int system_id = UT_UNSET;
00435 if (++i==argc ||
00436 ( ( system_id = ut_find_lang_sys (argv[i], &ut_session->system)) == UT_UNSET
00437 && !ut_str_fuzzy_cmp (argv[i], NONE_STR, 0) ) ) {
00438 fprintf (stderr, "%s: error invalid system %s\nTry `%s --help' for more information\n", argv[0], argv[i], argv[0]);
00439 ut_finish (); exit(-1);
00440 }
00441 ut_session->system_default = system_id;
00442 } else if (!strcmp (argv[i], "-d") || !strcmp (argv[i], "--default-info")) {
00443 print_parameters_b = true;
00444 } else if (!strcmp (argv[i], "-l") || !strcmp (argv[i], "--list")) {
00445 print_list_b = true;
00446 } else if (!strcmp (argv[i], "-z") || !strcmp (argv[i], "--distribution")) {
00447 print_distribution_b = true;
00448 } else if (!strcmp (argv[i], "-a") || !strcmp (argv[i], "--all-ext-chars")) {
00449 all_ext_chars_b = true;
00450 } else if (!strcmp (argv[i], "-b") || !strcmp (argv[i], "--bar")) {
00451 progress_bar_b = true;
00452 } else if (!filename) {
00453 filename = argv[i];
00454 } else {
00455 fprintf (stderr, "%s : error invalid option '%s'\nTry `%s --help' for more information\n", argv[0], argv[i], argv[0]);
00456 ut_finish (); exit(-1);
00457 }
00458 }
00459
00460 if (print_parameters_b) {
00461 print_parameters ();
00462 }
00463 if (print_list_b) {
00464 print_list ();
00465 }
00466
00467 if (print_parameters_b || print_list_b) {
00468 ut_finish ();
00469 exit (0);
00470 }
00471
00472 UtText src_text, dst_text;
00473 ut_init_text (&src_text);
00474 ut_init_text (&dst_text);
00475
00476 src_text.flags |= UT_F_IDENTIFY_EOL | UT_F_TRANSFORM_EOL;
00477
00478 ASSERT (src_text.flags == (UT_F_IDENTIFY_EOL | UT_F_TRANSFORM_EOL
00479 | UT_F_REMOVE_ILLEGAL_CHAR | UT_F_IDENTIFY_CHARSET) );
00480
00481 if (dst_charset!=UT_UNSET) ut_session->charset_default = dst_charset;
00482 if (dst_eol!=UT_EOL_UNSET) ut_session->eol_default = dst_eol;
00483
00484 if (src_charset!=UT_UNSET) {
00485 src_text.charset = src_charset;
00486 src_text.flags &= ~UT_F_IDENTIFY_CHARSET;
00487 }
00488
00489 if (src_eol!=UT_EOL_UNSET) {
00490 src_text.eol = src_eol;
00491 src_text.flags &= ~UT_F_IDENTIFY_EOL;
00492 }
00493
00494
00495 if (print_ext_char_b) src_text.flags |= UT_F_REFERENCE_EXT_CHAR;
00496
00497 if (progress_bar_b) ut_session->progress_function = &callback;
00498
00499 src_text.pass_flags = UT_PF_LOAD | UT_PF_RECOGNIZE;
00500 if (!print_distribution_b && !print_ext_char_b && !print_all_ratings_b && !print_charset_name_b && !all_ext_chars_b
00501 && !print_file_info_b) {
00502 convert_b = true;
00503 src_text.pass_flags |= UT_PF_CONVERT;
00504 }
00505
00506 ut_init_progress (&src_text);
00507
00508 rcode = ut_load (&src_text, filename);
00509 test_rcode (rcode,argv);
00510
00511 rcode = ut_recognize (&src_text);
00512 test_rcode (rcode,argv);
00513
00514 if (print_distribution_b) {
00515 print_distribution (&src_text);
00516 }
00517
00518 if (print_ext_char_b) {
00519 print_ext_chars (&src_text, use_color_b);
00520 }
00521
00522 if (print_charset_name_b) {
00523 printf ("%s\n", ut_session->charset [src_text.charset].name);
00524 }
00525
00526 if (print_file_info_b) {
00527 printf ("Filename: %s\n", filename?filename:"<stdin>");
00528
00529 printf ("Charset (%s): %s\n", src_text.evaluation?"unsure":"sure", ut_session->charset [src_text.charset].name);
00530 printf ("EOL: %s (%lu lines)\n", UT_EOL_NAME [src_text.eol], src_text.nb_lines);
00531 if (src_text.eol_alt != UT_EOL_NONE)
00532 printf ("EOL alt: %s (%lu alt lines)\n", UT_EOL_NAME [src_text.eol_alt], src_text.nb_lines_alt);
00533 printf ("Size: %lu\n", src_text.size);
00534 }
00535
00536 if (print_all_ratings_b) {
00537 if (src_text.evaluation) {
00538 print_all_ratings (&src_text);
00539 } else {
00540 printf ("%s\n", ut_session->charset [src_text.charset].name);
00541 }
00542 }
00543
00544 if (all_ext_chars_b) {
00545 all_ext_chars (&src_text, use_color_b);
00546 }
00547
00548 if (convert_b) {
00549 rcode = ut_convert (&src_text, &dst_text);
00550 test_rcode (rcode,argv);
00551
00552 printf ("%s", dst_text.data);
00553 }
00554
00555 ut_free_text (&src_text);
00556 ut_free_text (&dst_text);
00557
00558 ut_finish ();
00559 exit (0);
00560
00561 }
00562
00563
00564
00565
00566
00567
00568
00569
00570
00571
00572
00573
00574
00575
00576
00577
00578
00579
00580
00581
00582
00583
00584
00585
00586
00587
00588
00589
00590
00591
00592
00593
00594
00595
00596
00597
00598
00599
00600
00601
00602
00603
00604
00605
00606
00607
00608
00609
00610
00611
00612
00613
00614
00615
00616
00617