Main Page | Class List | File List | Class Members | File Members | Related Pages

utrac.c

Go to the documentation of this file.
00001 /***************************************************************************
00002  *            utrac.c
00003  *
00004  *  Tue Oct  5 11:29:59 2004
00005  *  Copyright  2004  Alliance MCA
00006  *  Written by : Antoine Calando (antoine@alliancemca.net)
00007  ****************************************************************************/
00008 
00009 /*
00010  *  This program is free software; you can redistribute it and/or modify
00011  *  it under the terms of the GNU General Public License as published by
00012  *  the Free Software Foundation; either version 2 of the License, or
00013  *  (at your option) any later version.
00014  *
00015  *  This program is distributed in the hope that it will be useful,
00016  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00017  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00018  *  GNU Library General Public License for more details.
00019  *
00020  *  You should have received a copy of the GNU General Public License
00021  *  along with this program; if not, write to the Free Software
00022  *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
00023  */
00024  
00031 #define _UTRAC_C_
00032 
00033 #include <stdlib.h>
00034 #include <stdio.h>
00035 #include "utrac.h"
00036 
00037 #undef UT_DEBUG
00038 #define UT_DEBUG 3
00039 #include "debug.h"
00040 
00041 
00042 /***************************************************************************/
00056 UtCode ut_init () {
00057         
00058         if (ut_session) return UT_ALREADY_INITIALISED_ERROR;
00059                 
00060         ut_session = (UtSession*) malloc (sizeof(UtSession));
00061         if (!ut_session) return UT_MALLOC_ERROR;
00062                 
00063         return ut_init_noalloc();
00064 }
00065 
00069 UtCode ut_init_noalloc () {
00070         //ut_session->flags = UT_F_UNSET; //flags_in
00071         ut_session->charset = NULL;
00072         ut_session->nb_charsets = 0;
00073         ut_session->language.name = NULL;
00074         ut_session->language.code = NULL;
00075         ut_session->language.n = 0;
00076         ut_session->language.n_max = 0;
00077         ut_session->system.name = NULL;
00078         ut_session->system.code = NULL;
00079         ut_session->system.n = 0;
00080         ut_session->system.n_max = 0;
00081         //ut_session->charset_default = UT_UNSET;
00082         ut_session->eol_default = UT_EOL_UNSET;
00083         ut_session->eol_alt_default = UT_EOL_UNSET;
00084         
00085         ut_session->nomapping_char = '_';
00086         ut_session->progress_function = NULL;
00087         ut_session->error_string = NULL;
00088         //load charsets data
00089         UT_TRY (ut_load_charsets ())
00090 
00091         //find default language, charset, eol type on the system
00092         #ifdef linux
00093         //should we use nl_langinfo()? (discovered later...) ->yes!
00094         int i;
00095         ut_session->language_default = 0; //language_default_in
00096         ut_session->system_default = 3;                 //3 (to check in file charsets.dat)
00097         ut_session->eol_default = UT_EOL_LF;
00098         ut_session->eol_alt_default = UT_EOL_LF;
00099         ut_session->charset_default = ut_find_charset("ISO-8859-1");
00100         
00101         char * def_enc = getenv ("LC_CTYPE");
00102         if (!def_enc) def_enc = getenv ("LC_ALL");
00103         if (!def_enc) def_enc = getenv ("LANG");
00104         if (def_enc) {
00105                 if (def_enc[2]=='_' || def_enc[2]=='.' || def_enc[2]==0) {
00106                         for (i=0; i<ut_session->language.n; i++) {
00107                                 if (def_enc[0]-'a'+'A'== ut_session->language.code[i*2+0]
00108                                         && def_enc[1]-'a'+'A'== ut_session->language.code[i*2+1] ) {
00109                                         ut_session->language_default = i;
00110                                         break;
00111                                 }
00112                         } //for
00113                 }
00114                 if (def_enc[2]=='.') def_enc +=3;
00115                 if (def_enc[2]=='_' && def_enc[5]=='.') def_enc +=6;
00116                 for (i=0; i<ut_session->nb_charsets; i++) 
00117                         if (ut_str_fuzzy_cmp (def_enc, ut_session->charset[i].name,'@')) break;
00118                 if (i!=ut_session->nb_charsets) ut_session->charset_default = i;
00119         }
00120         
00121         if (ut_session->charset_default == UT_UNSET) {
00122                 for (i=0; i<ut_session->nb_charsets; i++) 
00123                         if (ut_str_fuzzy_cmp (UT_DEFAULT_ENCODING_UNIX, ut_session->charset[i].name,0)) break;
00124                 if (i==ut_session->nb_charsets) {
00125                         DBG1 ("*** No default charset ***")
00126                 }
00127                 else ut_session->charset_default = i;
00128         }
00129         #else
00130         ERROR ("pas unix!")
00131         #endif
00132 
00133         #if UT_DEBUG == 2
00134         if (ut_session->language_default != UT_UNSET)
00135                 DBG2 ("lang: %s" , ut_session->language.name[ut_session->language_default])
00136         if (ut_session->charset_default != UT_UNSET)
00137                 DBG2 ("charset: %s", ut_session->charset[ut_session->charset_default].name)
00138         if (ut_session->eol_default != UT_EOL_UNSET)
00139                 DBG2 ("eol: %s", UT_EOL_NAME [ut_session->eol_default])
00140         #endif
00141         
00142         return UT_OK;
00143 }
00144 
00154 void ut_finish () {
00155         
00156         ut_finish_nofree ();
00157         free(ut_session);
00158         ut_session = NULL;
00159         
00160         return;
00161 }
00162 
00166 void ut_finish_nofree () {
00167         
00168         if (!ut_session) return;
00169         
00170         int i; for(i=0; i<ut_session->nb_charsets; i++) {
00171                 free(ut_session->charset[i].name);
00172                 free(ut_session->charset[i].alias);
00173                 free(ut_session->charset[i].common_name);
00174                 free(ut_session->charset[i].comment);
00175                 free(ut_session->charset[i].unicode);
00176                 free(ut_session->charset[i].char_type);
00177                 free(ut_session->charset[i].language);
00178                 free(ut_session->charset[i].system);
00179         }
00180         free (ut_session->charset);
00181         
00182         for (i=0; i<ut_session->language.n; i++) 
00183                 free (ut_session->language.name[i]);
00184         free (ut_session->language.name);
00185         free (ut_session->language.code);
00186 
00187         for (i=0; i<ut_session->system.n; i++) 
00188                 free (ut_session->system.name[i]);
00189         free (ut_session->system.name);
00190         free (ut_session->system.code);
00191         
00192         free (ut_session->error_string);
00193         return;
00194 };
00195 
00196 
00197 
00198 
00199 /***************************************************************************/
00206 UtText * ut_init_text_heap () {
00207         ASSERT (ut_session)
00208         UtText* new_text = (UtText*) malloc (sizeof(UtText));
00209         if (!new_text) return NULL;
00210         
00211         ut_init_text (new_text);
00212         
00213         return new_text;
00214 }
00215 
00221 void ut_init_text (UtText * new_text) {
00222                 
00223         new_text->data = NULL;
00224         new_text->size = 0;
00225 
00226         new_text->eol = UT_EOL_UNSET;
00227         new_text->eol_alt = UT_EOL_UNSET;
00228         new_text->charset = UT_UNSET;
00229 
00230         new_text->nb_lines = UT_UNSET;
00231         new_text->nb_lines_alt = UT_UNSET;
00232         new_text->distribution = NULL;
00233         //int i; for (i=0; i<0x100; i++) new_text->distribution [i] = 0;
00234         new_text->ext_char = NULL;
00235         new_text->evaluation = NULL;
00236 
00237         new_text->flags = UT_F_DEFAULT;
00238         new_text->pass_flags = UT_PF_UNSET;
00239         new_text->skip_char = UT_SKIP_CHAR;
00240         
00241         new_text->progress_done = 0.0;
00242         new_text->progress_todo = 0;
00243         new_text->current_pass = UT_PF_UNSET;
00244 
00245         new_text->user = NULL;
00246 }
00247 
00253 void ut_free_text_heap (UtText *text) {
00254         
00255         ut_free_text (text);
00256         free(text);     
00257         
00258 }
00259 
00265 void ut_free_text (UtText *text) {
00266         //free(text->filename);
00267         //filename is not freed because it is set by user. 
00268         free(text->data); text->data = NULL;
00269         free(text->distribution); text->distribution = NULL;
00270         while (text->ext_char) {
00271                 UtExtCharLine * tmp = text->ext_char;
00272                 text->ext_char = text->ext_char->next;
00273                 free (tmp);
00274         } text->ext_char = NULL;
00275 
00276         free(text->evaluation); text->evaluation = NULL;
00277         //text->user should be free by the user.
00278 }
00279 
00280 
00289 UtCode ut_init_progress (UtText *text) {
00290         
00291         ASSERT (text);
00292         
00293         text->progress_done = 0.0;
00294         text->progress_todo = 0;
00295         if (text->pass_flags == UT_PF_UNSET) text->pass_flags = UT_PF_RECOGNIZE;
00296 
00297         if (text->pass_flags & UT_PF_LOAD ) text->progress_todo++;
00298                 
00299         if (text->pass_flags & UT_PF_RECOGNIZE ) {
00300                 if ((text->flags & UT_F_IDENTIFY_CHARSET) || (text->pass_flags & UT_PF_CONVERT ) )
00301                         text->pass_flags |= UT_PF_DISTRIB_PASS;
00302                 else text->pass_flags &= ~UT_PF_DISTRIB_PASS;
00303                 if (text->flags & (UT_F_TRANSFORM_EOL | UT_F_REMOVE_ILLEGAL_CHAR | UT_F_ADD_FINAL_EOL | UT_F_IDENTIFY_EOL ) )
00304                         text->pass_flags |= UT_PF_EOL_PASS;
00305                 else text->pass_flags &= ~UT_PF_EOL_PASS;
00306 
00307                 if (text->flags & (UT_F_IDENTIFY_CHARSET | UT_F_REFERENCE_EXT_CHAR ) )
00308                         text->pass_flags |= UT_PF_XASCII_PASS;
00309                 else text->pass_flags &= ~UT_PF_XASCII_PASS;
00310                 
00311                 if (text->pass_flags & UT_PF_DISTRIB_PASS) text->progress_todo++;
00312                 if (text->pass_flags & UT_PF_EOL_PASS) text->progress_todo++;
00313                 if (text->pass_flags & UT_PF_XASCII_PASS) text->progress_todo++;
00314         } else {
00315                 text->pass_flags &= ~(UT_PF_DISTRIB_PASS | UT_PF_EOL_PASS | UT_PF_XASCII_PASS);
00316         }
00317                 
00318         if (text->pass_flags & UT_PF_CONVERT ) text->progress_todo++;
00319         
00320         return UT_OK;
00321 }
00322 
00330 UtCode ut_load (UtText *text, const char * filename) {
00331 
00332         ASSERT (text);
00333 
00334         if (text->pass_flags==UT_PF_UNSET) {
00335                 text->pass_flags |= UT_PF_LOAD | UT_PF_RECOGNIZE;
00336                 ut_init_progress(text);
00337         }
00338         
00339         if (ut_session->progress_function && text->progress_done == 0.0) ut_update_progress (text, 0, true);
00340 
00341         text->current_pass = UT_PF_LOAD;
00342 
00343         if (filename) {
00344                 UT_TRY ( ut_load_file_pass (text, filename) )
00345         } else {
00346                 UT_TRY ( ut_load_stdin_pass (text) )
00347         }
00348 
00349         text->current_pass = UT_PF_NONE;
00350         
00351         if (ut_session->progress_function) {
00352                 text->progress_done+= (1-text->progress_done)/text->progress_todo;
00353                 text->progress_todo--;
00354         }
00355         
00356         //if (ut_session->progress_function && text->progress_done == 0.0) ut_update_progress (text, 0, true);
00357         if (ut_session->progress_function && text->progress_todo == 0) ut_update_progress (text, 0, true);
00358                 
00359         return UT_OK;   
00360 }
00361 
00362 
00387 UtCode ut_recognize (UtText *text) {
00388         
00389         if (!text || !text->data) return UT_BAD_PARAMETER_ERROR;
00390 
00391         if (text->pass_flags==UT_PF_UNSET) ut_init_progress(text);
00392 
00393         if (ut_session->progress_function && text->progress_done == 0.0) ut_update_progress (text, 0, true);
00394         
00395         //FIRST PASS
00396         if (text->pass_flags & UT_PF_DISTRIB_PASS) {
00397                 text->current_pass = UT_PF_DISTRIB_PASS | UT_PF_RECOGNIZE;
00398                 int rcode = ut_distrib_utf_pass (text);
00399                 text->current_pass = UT_PF_NONE;
00400 
00401                 if (rcode == UT_BINARY_DATA_ERROR) {
00402                         if ( !(text->flags & UT_F_FORCE_BINARY)) return rcode;
00403                 } else if ( rcode != UT_OK) return rcode;
00404                 
00405                 if (text->charset != UT_UNSET && text->pass_flags & UT_PF_XASCII_PASS) {
00406                         text->pass_flags &= ~UT_PF_XASCII_PASS | UT_PF_RECOGNIZE;
00407                         text->progress_todo--;                  
00408                 }
00409         
00410                 if (ut_session->progress_function) {
00411                         text->progress_done+= (1-text->progress_done)/text->progress_todo;
00412                         text->progress_todo--;
00413                 }
00414         }
00415 
00416         // set text->skip_char
00417         if (text->flags & UT_F_REMOVE_ILLEGAL_CHAR ) {
00418                 text->skip_char = UT_SKIP_CHAR;
00419         } else {
00420                 //if control code accepted in file, try to find one not used
00421                 int i; for (i=1; i<0x20; i++) {
00422                         if (i==UT_EOL_ALT_CHAR || i== 0x9|| i==0xA || i==0xD) continue; //UT_EOL_CHAR and UT_EOF_CHAR = 0
00423                         if (!text->distribution[i]) break;
00424                 }
00425                 if (i!=0x20) text->skip_char = i; 
00426                 else text->skip_char = UT_SKIP_CHAR; //all control code used, nevermind, we use UT_SKIP_CHAR
00427         }
00428 
00429         //ASSERT (text->flags & UT_F_TRANSFORM_EOL)
00430         
00431         //SECOND PASS
00432         if (text->pass_flags & UT_PF_EOL_PASS) {
00433                 text->current_pass = UT_PF_EOL_PASS | UT_PF_RECOGNIZE;
00434                 UT_TRY ( ut_eol_pass (text) )
00435                 text->current_pass = UT_PF_NONE;
00436                 if (ut_session->progress_function) {
00437                         text->progress_done+= (1-text->progress_done)/text->progress_todo;
00438                         text->progress_todo--;
00439                 }
00440         }
00441 
00442         //THIRD PASS
00443         if ( text->pass_flags & UT_PF_XASCII_PASS ) {
00444                 text->current_pass = UT_PF_XASCII_PASS | UT_PF_RECOGNIZE;
00445                 UT_TRY ( ut_xascii_pass (text) )
00446                 text->current_pass = UT_PF_NONE;
00447                 if (ut_session->progress_function) {
00448                         text->progress_done+= (1-text->progress_done)/text->progress_todo;
00449                         text->progress_todo--;
00450                 }
00451         }
00452 
00453         if (ut_session->progress_function && text->progress_todo == 0) ut_update_progress (text, 0, true);
00454         
00455         return UT_OK;   
00456 }
00457 
00458 
00470 UtCode ut_convert (UtText *src_text, UtText *dst_text) {
00471         
00472         if (!src_text || !src_text->data) return UT_BAD_PARAMETER_ERROR;
00473 
00474         ASSERT (src_text->eol != UT_EOL_UNSET)
00475         ASSERT (src_text->charset != UT_UNSET)
00476         ASSERT (src_text->distribution)
00477 
00478         bool same_text = false;
00479         if (!dst_text) {
00480                 same_text = true;
00481                 dst_text = ut_init_text_heap ();
00482                 if (!dst_text) return UT_MALLOC_ERROR;
00483         }
00484         
00485         ASSERT (dst_text)
00486         
00487         if (src_text->pass_flags==UT_PF_UNSET) {
00488                 src_text->pass_flags |= UT_PF_CONVERT;
00489                 ut_init_progress(src_text);
00490         }
00491 
00492         
00493         if (ut_session->progress_function && src_text->progress_done == 0.0) ut_update_progress (src_text, 0, true);
00494         
00495         if (dst_text->eol         == UT_EOL_UNSET)  dst_text->eol               = ut_session->eol_default;
00496         if (dst_text->eol_alt == UT_EOL_UNSET)  dst_text->eol_alt       = ut_session->eol_alt_default;
00497         if (dst_text->charset == UT_UNSET)              dst_text->charset       = ut_session->charset_default;
00498 
00499         src_text->current_pass = UT_PF_CONVERT;
00500         UT_TRY  ( ut_conversion_pass (src_text, dst_text) )
00501         src_text->current_pass = UT_PF_NONE;
00502 
00503         if (ut_session->progress_function) {
00504                 src_text->progress_done+= (1-src_text->progress_done)/src_text->progress_todo;
00505                 src_text->progress_todo--;
00506         }
00507         
00508         if (ut_session->progress_function && src_text->progress_todo == 0) ut_update_progress (src_text, 0, true);
00509                 
00510         if (same_text) {
00511                 free (src_text->data);
00512                 src_text->data = dst_text->data;
00513                 dst_text->data = NULL;
00514                 src_text->size = dst_text->size ;
00515                 src_text->eol = dst_text->eol ;
00516                 src_text->eol_alt = dst_text->eol_alt ;
00517                 src_text->charset = dst_text->charset ;
00518                 free (src_text->distribution);
00519                 src_text->distribution = NULL;
00520                 while (src_text->ext_char) {
00521                         UtExtCharLine * tmp = src_text->ext_char;
00522                         src_text->ext_char = src_text->ext_char->next;
00523                         free (tmp);
00524                 } src_text->ext_char = NULL;
00525                 free(src_text->evaluation); 
00526                 src_text->evaluation = NULL;
00527                 ut_free_text_heap (dst_text);
00528         }
00529 
00530         return UT_OK;   
00531 }
00532 
00533 
00534 
00535 
00536 /***************************************************************************/
00537 /* OLD DOC!!!!
00538  * \brief Recognize charset and EOL type of a text, and optionally convert it.
00539  * 
00540  * This function takes a UtText structure as a parameter and performs several tasks:
00541  * -# it loads the file (or reads the standard input),
00542  * -# it calculates the frequency distribution of each byte in the file
00543  *    (UtText::distribution), checks if the file is binary data or text,
00544  *    checks if it is ASCII or UTF-8,
00545  * -# it recognizes the EOL type, and replaces each EOL by a null character to make
00546  *    further processing of the file easier (this feature can be disabled).
00547  * -# if the charset has not been determined earlier as ASCII or UTF-8, it tries
00548  *    to detect which known charset fits the text best.
00549  * -# it optionally converts the text, replacing EOLs and extended characters by
00550  *    those corresponding to the selection of the user and/or the result of the recognition.
00551  *
00552  * \param text Text to recognize and optionally convert. Some members must be set
00553  *        before calling this function, but some others are optional. Members that
00554  *        select the input text are:
00555  *            - UtText::data: Pointer to the text to process (which must be null terminated).
00556  *          If NULL, UtText::filename is used.
00557  *        - UtText::filename: Path to the file containing the text to process, which will
00558  *          be loaded if UtText::data is NULL. If NULL, standard input is read.
00559  *            - UtText::size: If UtText::data is set, this member can also be set to indicate
00560  *          the size of the text; if zero, the first null character will determine the
00561  *          end of the text.
00562  * 
00563  * Members that modify the recognition or the conversion are:
00564  *      - UtText::flags: Flags to customize the processing and the modification of the text.
00565  *    Set initially to UT_F_DEFAULT.
00566  *      - UtText::src_eol and UtText::src_charset: EOL type and charset of the text used as
00567  *    source for the conversion. If unset, the values taken are those recognized automatically.
00568  *      - UtText::dst_eol and UtText::dst_charset: EOL type and charset of the text resulting from
00569  *    the conversion. If unset, the values taken are the defaults found by ut_init().
00570  *      - UtText::nomapping_char: Character inserted during the conversion each time an error occurs.
00571  * 
00572  * Misc member:
00573  *      - UtText::progress_function: Custom function provided by the user to refresh a progress bar.
00574  *
00575  * \param convert If true, conversion is performed after recognition.
00576  *      
00577  * \return UT_OK on success, error code on failure (see UtCode).
00578  */

Generated on Fri Feb 25 18:30:15 2005 for Utrac by  doxygen 1.3.9