Utrac: ut_recognition1.c Source File

00001 /***************************************************************************
00002  *            ut_recognition1.c
00003  *
00004  *  Tue Oct  5 11:29:40 2004
00005  *  Copyright  2004  Alliance MCA
00006  *  Written by : Antoine Calando (antoine@alliancemca.net)
00007  ****************************************************************************/
00008 
00009 /*
00010  *  This program is free software; you can redistribute it and/or modify
00011  *  it under the terms of the GNU General Public License as published by
00012  *  the Free Software Foundation; either version 2 of the License, or
00013  *  (at your option) any later version.
00014  *
00015  *  This program is distributed in the hope that it will be useful,
00016  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00017  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00018  *  GNU Library General Public License for more details.
00019  *
00020  *  You should have received a copy of the GNU General Public License
00021  *  along with this program; if not, write to the Free Software
00022  *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
00023  */
00024  
00025 
00032 #include <stdlib.h>
00033 #include <stdio.h>
00034 #include <string.h>
00035 #include "utrac.h"
00036 
00037 #undef UT_DEBUG
00038 #define UT_DEBUG 1
00039 #include "debug.h"
00040 
00041 /***************************************************************************/
/**
 * \brief Tell whether a decoded code point must be counted as an error.
 *
 * A value is rejected when it is:
 *  - in the range U+FDD0..U+FDEF,
 *  - U+nFFFE or U+nFFFF in planes 0 to 15 (bits 16-19 are masked out),
 *  - greater than or equal to 0x10FFFE (last two values of plane 16 and
 *    everything beyond the Unicode range).
 *
 * Surrogates (U+D800..U+DFFF) and overlong encodings are NOT checked here.
 *
 * The function is deliberately not declared `inline`: in C99/C11 a file-scope
 * `inline` definition without `static` or `extern` emits no external symbol,
 * so a call that the compiler chooses not to inline would fail to link.
 *
 * \param unicode  code point assembled by the caller
 * \return true if the value is rejected, false otherwise
 */
bool ut_unicode_invalid (ulong unicode) {
	if (0x0000FDD0 <= unicode && unicode <= 0x0000FDEF) return true;
	if (0x0010FFFE <= unicode) return true;
	return (0xFFF0FFFE & unicode) == 0x0000FFFE;
}
00050 
00051 /***************************************************************************/
00062 UtCode ut_distrib_utf_pass (UtText * text) {
00063         
00064         char * scan = text->data;
00065         char * scan_end;
00066         
00067         ASSERT(text);
00068         ASSERT(text->data);
00069         
00070         //bug! (see assert l85)
00071         if (text->size) scan_end = scan + text->size;
00072         else scan_end = NULL;
00073         
00074         ulong unicode = 0;
00075         ushort multibyte = 0;
00076         ulong error_utf8 = 0;
00077         int cumul = 1;
00078         if (!text->distribution) text->distribution = (ulong*) malloc (sizeof(ulong)*256);
00079         int i; for (i=0; i<0x100; i++) text->distribution[i] = 0;
00080 
00081         scan--; //incrementation at the beginning of the loop is faster
00082         for (;;) {
00083                 scan++;
00084                 //EC: double test de !*scan !! AC ok
00085                 switch (*scan) {
00086                   case 0:
00087                         if (scan>=scan_end) {
00088                                 ASSERT (!scan_end || scan==scan_end)
00089                                 goto out_for;
00090                         } else if (!scan_end) goto out_for;
00091                   case 0xA:
00092                   case 0xD:
00093                         if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
00094                                 if (!ut_update_progress (text, scan - text->data, false)) goto out_for;
00095                                 cumul++;
00096                         }
00097                 }
00098                 
00099                 text->distribution [(u_char) *scan]++;
00100                 if (multibyte) {
00101                         if ((*scan & 0xC0) == 0x80) {   //==10xx xxxx
00102                                 unicode <<= 6;
00103                                 unicode |= *scan & 0x3F;
00104                                 if(!--multibyte) { //last multybyte byte? then test if noncharacter (66 cases)
00105                                         if (ut_unicode_invalid (unicode)) error_utf8++;
00106                                 }
00107                         } else {
00108                                 multibyte = 0;
00109                                 error_utf8++;
00110                         }
00111                 } else if (*scan & 0x80) { //1xxx xxx
00112                         if ((*scan & 0xE0) == 0xC0) { //110x xxxx
00113                                 multibyte = 1;
00114                                 unicode = *scan & 0x1F;
00115                         } else if ((*scan & 0xF0) == 0xE0) { //1110 xxxx
00116                                 multibyte = 2;
00117                                 unicode = *scan & 0x0F;
00118                         } else if ((*scan & 0xF8) == 0xF0) { //1111 0xxx
00119                                 multibyte = 3;
00120                                 unicode = *scan & 0x07;
00121                         } else { //error
00122                                 error_utf8++;
00123                         }
00124                 }
00125         } //for (;;)
00126         out_for:
00127         //interrupted?
00128         //EC: ou il y a déja un 0 dans le texte ! AC test déjà fait
00129         if (scan<scan_end) {
00130                 return UT_INTERRUPTED_BY_USER;
00131         }
00132         
00133         if (multibyte) error_utf8++;
00134         
00135         DBG2 ("Distribution and UTF-8 pass done! (%lu B)", text->size)
00136         
00137         if (!text->size) text->size = scan - text->data; //terminating 0 not counted
00138         if (!text->size) return UT_EMPTY_DATA_ERROR;
00139         
00140         ulong nb_ctrl_chars = 0;
00141         // count the number of control chars
00142         for (i=0; i<0x20; i++) {
00143                 if (i==0x9 || i==0xA || i==0xD) continue;
00144                 nb_ctrl_chars += text->distribution[i];
00145         }
00146         nb_ctrl_chars += text->distribution[0x7F];
00147         
00148         //test if text is actually binary data
00149         if (text->size * UT_THRESHOLD_CONTROL_CHAR < nb_ctrl_chars) {
00150                 //to do: detect if UTF16!?!?
00151                 DBG3 ("Binary file detected! (%lu cc)", nb_ctrl_chars)
00152                 return UT_BINARY_DATA_ERROR;
00153         }
00154         
00155         //count the number of extended char
00156         ulong nb_ext_chars = 0;
00157         for (i=0x80; i<0x100; i++) {
00158                 nb_ext_chars += text->distribution[i];
00159         }
00160         DBG3 ("UTF-8 error : %lu, ext char number : %lu", error_utf8, nb_ext_chars)
00161 
00162         if (text->flags & UT_F_IDENTIFY_CHARSET) {
00163                 if (!nb_ext_chars) {
00164                         //text is ASCII
00165                         for (i=0; i<ut_session->nb_charsets; i++) 
00166                                 if (ut_session->charset[i].type == UT_CST_ASCII) break;
00167                         ASSERT_MSG (i!=ut_session->nb_charsets, "ASCII not defined")
00168                         text->charset = i;
00169                         DBG3 ("ASCII Encoding detected!")
00170                 } else if (nb_ext_chars * UT_THRESHOLD_UTF8 > error_utf8) {
00171                         //text is UTF-8
00172                 
00173                         for (i=0; i<ut_session->nb_charsets; i++) 
00174                                 if (ut_session->charset[i].type == UT_CST_UTF_8) break;
00175                         ASSERT_MSG (i!=ut_session->nb_charsets, "UTF-8 not defined")
00176                         text->charset = i;
00177                         DBG3 ("UTF-8 Encoding detected!")
00178                 } else {
00179                         text->charset = UT_UNSET;
00180                 }
00181         }
00182 
00183         return UT_OK;
00184 }
00185 
00186 
00187 /***************************************************************************/
00194 void ut_change_EOL1toEOL2 (char * beg, char * end) {
00195         ASSERT (beg<end)
00196         ASSERT (*end==UT_EOL_CHAR)
00197         char * scan = beg;
00198         for(;;) {
00199                 if (*scan==UT_EOL_CHAR) {
00200                         if (scan==end) return;
00201                         *scan=UT_EOL_ALT_CHAR;
00202                 }
00203                 scan++;
00204         }               
00205 }
00206 
00207 /***************************************************************************/
00208 /*
00209  * \brief Change all UT_EOL_ALT_CHAR to UT_EOL_CHAR, from beg to end-1.
00210  *
00211  * \note pour faire de vraie optimisation, on utilise strchr() à la place de
00212  *       for(;;) {... scan++ }, strchr() est une macro assembleur.
00213  */
00214 /*
00215 void ut_change_lff2eoe (char * beg, char * end) {
00216         ASSERT (beg<end)
00217         ASSERT (*end==UT_EOL_ALT_CHAR)
00218         char * scan = beg;
00219         for(;;) {
00220                 if (*scan==UT_EOL_ALT_CHAR) {
00221                         if (scan==end) return;
00222                         *scan=UT_EOL_CHAR;
00223                 }
00224                 scan++;
00225         }               
00226 }
00227 */
00228 // \brief exemple de fonction de remplacement pour ut_change_lff_eoe()
00229 /*
00230 void ut_change_lff_eoe_maybe (char * beg, char * end) 
00231         {
00232         char * scan; //les variables locales en début de bloc, sinon c'est du C++
00233 
00234         ASSERT (beg!=NULL) //important à tester en debug
00235         ASSERT (end!=NULL) //important à tester en debug
00236         ASSERT (beg<end)  
00237         ASSERT (*end==UT_EOL_ALT_CHAR) //c'est sur que cela doit être en ASSERT ?
00238 
00239         *end = UT_EOL_CHAR; //c'est bien le 0 final ? non ?
00240 
00241         
00242          donc ici pas d'appel de fonction ! c'est une directive __asm {}
00243 
00244          il vaut mieux cependant utiliser memchr, c'est plus sûr (puisque
00245          l'on spécifie la taille du buffer), et plus rapide car il utilise
00246          REPNE SCASB
00247 
00248    movb AL,octet à rechercher
00249          movl EDX,adresse du buffer
00250          movl ECX,taille du buffer -1
00251          repne scasb
00252          je ...
00253          EDX contient l'adresse de l'octet trouvé
00254          
00255         for(scan=beg;
00256                         (scan=strchr(scan,UT_EOL_ALT_CHAR));
00257                         *scan=UT_EOL_CHAR)
00258                         ;
00259         
00260         Si il peut y avoir des 0 dans le texte avant l'appel de cette fonction, il faut faire
00261         une double boucle pour avancer d'un octet si scan!=end alors que strchr renvoi NULL
00262         }
00263         */
00264 
00265 
00266 /***************************************************************************/
00277 UtCode ut_eol_pass (UtText * text) {
00278 
00279         char * scan = text->data;
00280         char * scan_end = text->data+text->size;
00281         ASSERT ( *scan_end == 0 )
00282         //ASSERT ( text->flags & UT_F_TRANSFORM_EOL )
00283         text->nb_lines = 0;
00284         text->nb_lines_alt = 0;
00285         ulong cumul=1;
00286         
00287         //while (scan < scan_end) {
00288         
00289         UtEolType eol1 = UT_EOL_NONE;
00290         UtEolType eol2 = UT_EOL_NONE;
00291         
00292 
00293         for (;;) {
00294                 DBG3_S ("<%d>", *scan);
00295                 
00296                 if ((u_char)*scan<0x20) {                       //======== control code =============
00297                         if (!*scan) {                                           //--------null char
00298                                 if (scan>=scan_end) {   
00299                                         ASSERT (scan==scan_end)
00300                                         break;
00301                                 } else if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
00302                                         if (!ut_update_progress (text, scan - text->data, false)) break;
00303                                         cumul++;
00304                                 }                               
00305                         }
00306                         if (*scan == 0xA) {                             //-------- LF (+CR?)    -------------
00307                                 DBG3_S ("*");
00308                                 if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
00309                                         ut_update_progress (text, scan - text->data, false);
00310                                         cumul++;
00311                                 }
00312 
00313                                 if (*(scan+1) == 0xD) { //LFCR
00314                                         switch (eol1) {
00315                                           case UT_EOL_LFCR:
00316                                           case UT_EOL_MIX:
00317                                                 if (*(scan+2) == 0xA) goto LF_only;
00318                                                 break;
00319                                           case UT_EOL_CRLF:
00320                                                 if (*(scan+2) == 0xA) goto LF_only;
00321                                                 eol1 = UT_EOL_MIX;
00322                                                 if (eol2 != UT_EOL_NONE) {
00323                                                         ERROR ("EOL2 todo...")
00324                                                 }
00325                                                 break;
00326                                           case UT_EOL_CR:
00327                                           case UT_EOL_LF:
00328                                                 if (*(scan+2) == 0xA) goto LF_only;
00329                                                 ASSERT (eol2 == UT_EOL_NONE)
00330                                                 eol2 = eol1;
00331                                                 text->nb_lines_alt = text->nb_lines;
00332                                             text->nb_lines = 0;
00333                                                 *scan = UT_EOL_CHAR;
00334                                                 ut_change_EOL1toEOL2 (text->data, scan);
00335                                           case UT_EOL_NONE:
00336                                                 eol1 = UT_EOL_LFCR;
00337                                                 break;
00338                                           default:
00339                                                 ERROR ("Forbiden case!?!")
00340                                         }
00341                                         *scan++ = UT_EOL_CHAR;
00342                                         *scan++ = text->skip_char;
00343                                         text->nb_lines++;
00344                                 } else {     //LF only
00345                                         LF_only:
00346                                         switch (eol1) {
00347                                           case UT_EOL_NONE:
00348                                                 eol1 = UT_EOL_LF;
00349                                           case UT_EOL_LF:
00350                                           case UT_EOL_MIX:
00351                                                 *scan++ = UT_EOL_CHAR;
00352                                                 text->nb_lines++;
00353                                                 break;
00354                                           case UT_EOL_CR:
00355                                                 eol1 = UT_EOL_MIX;
00356                                                 *scan++ = UT_EOL_CHAR;
00357                                                 text->nb_lines++;
00358                                                 break;
00359                                           case UT_EOL_CRLF:
00360                                           case UT_EOL_LFCR:
00361                                                 switch (eol2) {
00362                                                   case UT_EOL_NONE:     
00363                                                         eol2 = UT_EOL_LF;
00364                                                         break;
00365                                                   case UT_EOL_CR:
00366                                                         eol2 = UT_EOL_MIX;
00367                                                   case UT_EOL_LF:
00368                                                   case UT_EOL_MIX:
00369                                                         break;  
00370                                                   default:
00371                                                         ERROR ("Forbiden case!?!")
00372                                                 }
00373                                                 *scan++ = UT_EOL_ALT_CHAR;
00374                                                 text->nb_lines_alt++;
00375                                                 break;
00376                                           default:
00377                                                         ERROR ("Forbiden case!?!")
00378                                         } //switch
00379                                 } // else LF
00380                         } else if (*scan == 0xD) {              //--------- CR (LF?)      ------------
00381                                 DBG3_S ("*");
00382                                 if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
00383                                         ut_update_progress (text, scan - text->data, false);
00384                                         cumul++;
00385                                 }
00386 
00387                                 if (*(scan+1) == 0xA) { //CRLF
00388                                         switch (eol1) {
00389                                           case UT_EOL_CRLF:
00390                                           case UT_EOL_MIX:
00391                                                 break;
00392                                           case UT_EOL_LFCR:
00393                                                 eol1 = UT_EOL_MIX;
00394                                                 if (eol2 != UT_EOL_NONE) {
00395                                                         ERROR ("EOL2 todo...")
00396                                                 }
00397                                                 break;
00398                                           case UT_EOL_CR:
00399                                           case UT_EOL_LF:
00400                                                 ASSERT (eol2 == UT_EOL_NONE)
00401                                                 eol2 = eol1;
00402                                                 text->nb_lines_alt = text->nb_lines;
00403                                             text->nb_lines = 0;
00404                                                 *scan = UT_EOL_CHAR;
00405                                                 ut_change_EOL1toEOL2 (text->data, scan);
00406                                           case UT_EOL_NONE:
00407                                                 eol1 = UT_EOL_CRLF;
00408                                             break;
00409                                           default:
00410                                                 ERROR ("Forbiden case!?!")
00411                                         }
00412                                         *scan++ = UT_EOL_CHAR;
00413                                         *scan++ = text->skip_char;
00414                                         text->nb_lines++;
00415                                 } else {     //CR only
00416                                         switch (eol1) {
00417                                           case UT_EOL_NONE:
00418                                                 eol1 = UT_EOL_CR;
00419                                           case UT_EOL_CR:
00420                                           case UT_EOL_MIX:
00421                                                 *scan++ = UT_EOL_CHAR;
00422                                                 text->nb_lines++;
00423                                                 break;
00424                                           case UT_EOL_LF:
00425                                                 eol1 = UT_EOL_MIX;
00426                                                 *scan++ = UT_EOL_CHAR;
00427                                                 text->nb_lines++;
00428                                                 break;
00429                                           case UT_EOL_CRLF:
00430                                           case UT_EOL_LFCR:
00431                                                 switch (eol2) {
00432                                                   case UT_EOL_CR:
00433                                                   case UT_EOL_MIX:
00434                                                         break;  
00435                                                   case UT_EOL_NONE:     
00436                                                         eol2 = UT_EOL_CR;
00437                                                         break;
00438                                                   case UT_EOL_LF:
00439                                                         eol2 = UT_EOL_MIX;
00440                                                         break;
00441                                                   default:
00442                                                         ERROR ("Forbiden case!?!")
00443                                                 }
00444                                                 *scan++ = UT_EOL_ALT_CHAR;
00445                                                 text->nb_lines_alt++;
00446                                                 break;
00447                                           default:
00448                                                         ERROR ("Forbiden case!?!")
00449                                         } //switch
00450                                 } // else CR
00451                         } else if (*scan == 0x9 ) {             //------------- tab ----------
00452                                 scan++;
00453                         } else if (text->flags & UT_F_REMOVE_ILLEGAL_CHAR) {
00454                                 *scan++ = text->skip_char;
00455                         } //else
00456                         
00457                 } else {                                        //======== non control code =============
00458                         if (*scan == 0x7F && (text->flags & UT_F_REMOVE_ILLEGAL_CHAR) ) {  //control char del
00459                                 *scan++ = text->skip_char;
00460                         } else {
00461                                 scan++;
00462                         } //else
00463                 } //else
00464         } //while
00465 
00466         //interrupted?
00467         if (scan<scan_end) {
00468                 return UT_INTERRUPTED_BY_USER;
00469         }
00470         
00471         if (text->flags & UT_F_ADD_FINAL_EOL) {
00472                 //add EOE if missinG
00473                 if (   (*(scan-2) != UT_EOL_CHAR || *(scan-1) != text->skip_char)
00474                         &&  *(scan-1) != UT_EOL_CHAR ) {
00475                         if (text->flags & UT_F_TRANSFORM_EOL) {
00476                                 *scan = UT_EOL_CHAR;
00477                                 text->size++;
00478                         } /* text->flags & UT_F_TRANSFORM_EOL should be true
00479                         else { switch (text->eol) {
00480                           case UT_EOL_CR:
00481                                 *scan = 0xD;
00482                                 text->size++;
00483                             break;
00484                           case UT_EOL_LF:
00485                                 *scan = 0xA;
00486                                 text->size++;
00487                           case UT_EOL_LF:
00488                                 *scan++ = 0xD;
00489                                 *scan   = 0xA;
00490                                 text->size+=2;
00491                         } } //else switch
00492                         */
00493                         text->nb_lines++;
00494                 } // if *scan
00495         } //if text->flags
00496         
00497         if (text->eol == UT_EOL_UNSET) {
00498                 text->eol = eol1;
00499                 text->eol_alt = eol2;
00500         } else {
00501                 text->nb_lines = UT_UNSET;
00502                 text->nb_lines_alt = UT_UNSET;
00503         }
00504         
00505         //verify EOF
00506         ASSERT (*scan == UT_EOF_CHAR)
00507 
00508         DBG2 ("End Of Line pass done! (%lu B)", text->size)
00509 
00510         return UT_OK;
00511 }
00512 
00513 // ************* Check for UTF16 - big endian & little endian *********
00514 /*
00515 {
00516         ulong error_utf16 = 0 ; //, error_utf16be = 0, error_utf16le = 0;
00517         ushort * scanw;
00518         ushort * scanw_end;
00519         
00520         if ( ifd->data_size%2) {
00521                 error_utf16 = -1U;
00522         } else {
00523                 scanw = (ushort *) ifd->data;
00524                 scanw_end = scanw+ifd->data_size/2;
00525                 for (;;) {
00526                         if (!*scanw && scanw==scanw_end) break;
00527                         if (0xD800 <=*scanw && *scanw < 0xDC00) { //surrogate?
00528                                 unicode = (*scanw & 0x3FF) + 0x400;
00529                                 scanw++;
00530                                 if (!(0xDC00 <= *scanw && *scanw < 0xE000 )) {
00531                                         error_utf16++;
00532                                         if (scanw==scanw_end) break;
00533                                 }
00534                                 unicode <<= 10;
00535                                 unicode |= *scanw & 0x3FF;
00536                         } else {
00537                                 unicode = *scanw;       
00538                         }
00539                         if (   ( 0xFDD0 <= unicode && unicode <= 0xFDEF )
00540                                 || ( (unicode & 0xFFF0FFFE) == 0x0000FFFE)
00541                                 || ( unicode >= 0x0010FFFE)
00542                                 || ( 0xD800 <=unicode && unicode < 0xE000) ) {
00543                                 error_utf16++;
00544                         }
00545                         scanw++;
00546                 } //for (;;)
00547                 printf ("UTF16 : %lu errors\n", error_utf16);
00548         } //else
00549         
00550 
00551         //ulong error_utf32be = 0, error_utf32le = 0;
00552 } */