Utrac: ut_recognition2.c Source File

00001 /***************************************************************************
00002  *            ut_recognition2.c
00003  *
00004  *  Tue Oct  5 11:29:47 2004
00005  *  Copyright  2004  Alliance MCA
00006  *  Written by : Antoine Calando (antoine@alliancemca.net)
00007  ****************************************************************************/
00008 
00009 /*
00010  *  This program is free software; you can redistribute it and/or modify
00011  *  it under the terms of the GNU General Public License as published by
00012  *  the Free Software Foundation; either version 2 of the License, or
00013  *  (at your option) any later version.
00014  *
00015  *  This program is distributed in the hope that it will be useful,
00016  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00017  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00018  *  GNU Library General Public License for more details.
00019  *
00020  *  You should have received a copy of the GNU General Public License
00021  *  along with this program; if not, write to the Free Software
00022  *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
00023  */
00024  
00031 #include <stdlib.h>
00032 #include <stdio.h>
00033 #include "ut_text.h"
00034 #include "ut_charset.h"
00035 #include "utrac.h"
00036 
00037 //#undef UT_DEBUG
00038 //#define UT_DEBUG 3
00039 #include "debug.h"
00040 
00041 /***************************************************************************/
00043 char inline ut_get_pre_char (char **scan_pre, UtText * text) {
00044         do { 
00045                 if (*scan_pre == text->data) return 0;
00046                 --(*scan_pre);  
00047         } while (**scan_pre == text->skip_char);
00048         return **scan_pre;
00049 }
00050 
00051 /***************************************************************************/
00053 char inline ut_get_post_char (char **scan_post, UtText * text, char *scan_end) {
00054         do { 
00055                 if (*scan_post == scan_end) return 0; 
00056                 ++(*scan_post);
00057         } while (**scan_post == text->skip_char);
00058         return **scan_post;
00059 }
00060 
00061 
00062 /***************************************************************************/
00082 UtCode ut_xascii_pass (UtText * text) {
00083         
00084 
00085         int i,j;
00086         char * scan = text->data;
00087         char * scan_end = text->data + text->size;
00088         
00089         char * line_beg = scan;
00090         ulong line_i = 0;
00091         ulong nb_ext_chars = 0; //number of ext char in current line
00092         bool ext_char[0x80]; for (i=0x0; i<0x80; i++) ext_char[i] = false;      //bit for each of the 128 ext char in current line
00093         bool ext_char_diff = false;             //ext char not previously found in current line?
00094         
00095         UtExtCharLine * scan_exl, * pre_exl, * new_exl;
00096         ulong  ponct_init[UT_CTG_PONCT_IF_N]; for (i=0; i<UT_CTG_PONCT_IF_N; i++) ponct_init[i] = 0;
00097         
00098         
00099         if (text->charset == UT_UNSET) {
00100                 if (!text->evaluation) 
00101                         text->evaluation = (UtCharsetEval*) malloc ( sizeof (UtCharsetEval) * ut_session->nb_charsets);
00102                 
00103                 for (i=0; i<ut_session->nb_charsets; i++) {
00104                         text->evaluation [i].rating = 0;
00105                         text->evaluation [i].checksum = 0;
00106                 }
00107         }
00108 
00109         int cumul = 1;
00110         scan--;
00111         for (;;) {
00112                 scan++;
00113                 if (!*scan) { //eol!!!
00114                         if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
00115                                 if (!ut_update_progress (text, scan - text->data, false)) break;
00116                                 cumul++;
00117                         }
00118                         if (scan >= scan_end) {
00119                                 ASSERT (scan==scan_end)
00120                                 break; //last line?
00121                         }
00122                         if (text->flags & UT_F_REFERENCE_EXT_CHAR ) {
00123                                 if (ext_char_diff) { //extended char in this line?
00124                                         //create new struct
00125                                         new_exl = (UtExtCharLine*) malloc (sizeof(UtExtCharLine));
00126                                         new_exl->line_p = line_beg;
00127                                         new_exl->line_i = line_i;
00128                                         new_exl->nb_ext_chars = nb_ext_chars;
00129                                         
00130                                         //the link is inserted in the list which is sorted by
00131                                         //line with biggest number of extended char first
00132                                         if (!text->ext_char                     //insert struct at first pos?
00133                                                 || text->ext_char->nb_ext_chars <= nb_ext_chars ) {
00134                                                 new_exl->next = text->ext_char;
00135                                                 text->ext_char = new_exl;
00136                                         } else {
00137                                                 pre_exl = scan_exl = text->ext_char;
00138                                                 while (scan_exl && scan_exl->nb_ext_chars > nb_ext_chars) {
00139                                                         pre_exl = scan_exl;
00140                                                         scan_exl = scan_exl->next;
00141                                                 }
00142                                                 pre_exl->next = new_exl;
00143                                                 new_exl->next = scan_exl;
00144                                         }
00145                                         ext_char_diff = false;
00146                                 } //if
00147                                 nb_ext_chars = 0;
00148                                 line_beg = scan+1;
00149                                 line_i++;
00150                         }
00151                         
00152                 } else if ((u_char)*scan>0x7F) { //char extended found
00153                         if (text->flags & UT_F_REFERENCE_EXT_CHAR ) {
00154                                 nb_ext_chars++;
00155                                 if (!ext_char[(u_char)*scan-0x80]) { //already found?
00156                                         ext_char[(u_char)*scan-0x80] = true;
00157                                         ext_char_diff = true;
00158                                 }
00159                         }
00160 
00161                         if (text->charset == UT_UNSET) {        
00162                         
00163                                 UtCharsetEval * cs_eval = &(text->evaluation[0]);
00164                 
00165                                 //rate each charset for this extended char
00166                                 for (i=0; i<ut_session->nb_charsets; i++, cs_eval++) {
00167                                         UtCharset * cs = &(ut_session->charset[i]);
00168                                         if (cs->type != UT_CST_ASCII_EXTENSION) continue;
00169         
00170                                         char tmp;
00171                                         UtCateg pre1_ctg, pre2_ctg, scan_ctg, post1_ctg, post2_ctg, post3_ctg;
00172                                         UtScript pre1_scr, scan_scr, post1_scr;
00173                                         char * scan_pre = scan, * scan_post = scan;
00174                                         
00175                                         //get category and alphabet type of chars at pos scan-1, scan and scan+1
00176                                         scan_ctg  = (cs->char_type[(u_char) *scan].categorie);
00177                                         scan_scr  = (cs->char_type[(u_char) *scan].script);
00178                                         tmp = ut_get_pre_char  (&scan_pre, text);
00179                                         pre1_ctg  = (cs->char_type[(u_char) tmp].categorie);
00180                                         pre1_scr  = (cs->char_type[(u_char) tmp].script);
00181                                         tmp = ut_get_post_char (&scan_post, text, scan_end);
00182                                         post1_ctg = (cs->char_type[(u_char) tmp].categorie);
00183                                         post1_scr  = (cs->char_type[(u_char) tmp].script);
00184                                         
00185                                         //compare to previous and following char(s)
00186                                         switch (scan_ctg) {
00187                                           case UT_CTG_UPPERCASE:
00188                                                 if     ( pre1_ctg==UT_CTG_DELIMITER && 
00189                                                                 (post1_ctg==UT_CTG_LOWERCASE || post1_ctg==UT_CTG_UPPERCASE))           cs_eval->rating++;
00190                                                 else
00191                                                         if ( pre1_ctg==UT_CTG_UPPERCASE)                                                                cs_eval->rating++;
00192                                                 else {
00193                                                         post2_ctg = (cs->char_type [(u_char) ut_get_post_char (&scan_post, text, scan_end)].categorie);
00194                                                         if (post1_ctg==UT_CTG_UPPERCASE && post2_ctg!=UT_CTG_LOWERCASE)                 cs_eval->rating++;
00195                                                         else {
00196                                                                 pre2_ctg  = (cs->char_type [(u_char) ut_get_pre_char  (&scan_pre, text)].categorie);
00197                                                                 if ( pre1_ctg==UT_CTG_DELIMITER && post1_ctg==UT_CTG_DELIMITER &&
00198                                                                   (( pre2_ctg==UT_CTG_UPPERCASE && post2_ctg==UT_CTG_UPPERCASE) ||
00199                                                                         (pre2_ctg==UT_CTG_NUMBER && post2_ctg==UT_CTG_NUMBER)))                         cs_eval->rating++;
00200                                                         } 
00201                                                 } break;
00202                                                 
00203                                           case UT_CTG_LOWERCASE:
00204                                                 if     ( pre1_ctg==UT_CTG_LOWERCASE)                                                            cs_eval->rating++;
00205                                                 else 
00206                                                         if (post1_ctg==UT_CTG_LOWERCASE)                                                                cs_eval->rating++;
00207                                                 else
00208                                                         if ( pre1_ctg==UT_CTG_UPPERCASE && post1_ctg!=UT_CTG_UPPERCASE)                 cs_eval->rating++;
00209                                                 else {
00210                                                         pre2_ctg  = (cs->char_type [(u_char) ut_get_pre_char  (&scan_pre , text)].categorie);
00211                                                         post2_ctg = (cs->char_type [(u_char) ut_get_post_char (&scan_post, text, scan_end)].categorie);
00212                                                         post3_ctg = (cs->char_type [(u_char) ut_get_post_char (&scan_post, text, scan_end)].categorie);
00213                                                         if ( pre1_ctg==UT_CTG_DELIMITER && post1_ctg==UT_CTG_DELIMITER &&
00214                                                           (( pre2_ctg==UT_CTG_LOWERCASE && (post2_ctg==UT_CTG_LOWERCASE || (post2_ctg==UT_CTG_UPPERCASE && post3_ctg==UT_CTG_LOWERCASE)) 
00215                                                           ) || (pre2_ctg==UT_CTG_NUMBER && post2_ctg==UT_CTG_NUMBER)))          cs_eval->rating++;
00216                                                 } break;
00217                                           case UT_CTG_OTHER_LETTER:
00218                                                         if (pre1_ctg==UT_CTG_OTHER_LETTER)                                                      cs_eval->rating++;
00219                                                         if (post1_ctg==UT_CTG_OTHER_LETTER)                                                     cs_eval->rating++;
00220                                                 break;
00221         
00222                                           case UT_CTG_MARK:
00223                                                         if (pre1_ctg>=UT_CTG_UPPERCASE && pre1_ctg<=UT_CTG_OTHER_LETTER)                cs_eval->rating++;
00224                                                         if (post1_ctg>=UT_CTG_UPPERCASE && post1_ctg<=UT_CTG_OTHER_LETTER)      cs_eval->rating++;
00225                                                 break;
00226         
00227                                           case UT_CTG_CONTROL:
00228                                           case UT_CTG_UNSET:
00229                                                 cs_eval->rating-=2;
00230                                                 break;
00231         
00232                                           case UT_CTG_CURRENCY:
00233                                                         if (pre1_ctg==UT_CTG_NUMBER || post1_ctg==UT_CTG_NUMBER) cs_eval->rating++;
00234                                                         else if (pre1_ctg==UT_CTG_DELIMITER) {
00235                                                                 pre2_ctg  = (cs->char_type [(u_char) ut_get_pre_char  (&scan_pre , text)].categorie);
00236                                                                 if (pre2_ctg==UT_CTG_NUMBER ) cs_eval->rating++;
00237                                                         }
00238                                                 break;
00239         
00240                                           case UT_CTG_SYMBOL:
00241                                                 switch (cs->unicode[(u_char)*scan]) {
00242                                                   case 0x00B0: /* ° */
00243                                                         pre2_ctg  = (cs->char_type [(u_char) ut_get_pre_char  (&scan_pre, text)].categorie);  
00244                                                         if (pre2_ctg>UT_CTG_OTHER_LETTER && (*(scan-1)=='N' || *(scan-1)=='n') 
00245                                                                 && post1_ctg>UT_CTG_OTHER_LETTER) cs_eval->rating+=3;
00246                                                 } break;
00247                                           case UT_CTG_DELIMITER:
00248                                                 if (pre1_ctg==post1_ctg || *scan==*(scan-1) || *scan==*(scan+1)) cs_eval->rating++;
00249                                                 break;
00250                                           case UT_CTG_NUMBER:
00251                                           case UT_CTG_PONCTUATION:
00252                                           case UT_CTG_OTHER:  break;
00253                                           default: 
00254                                                 for (j=0; j<UT_CTG_PONCT_IF_N; j++) {
00255                                                         if (scan_ctg==UT_CTG_PONCT_INIT_0+j) ponct_init[j]++;
00256                                                         else if (scan_ctg==UT_CTG_PONCT_FINAL_0+j && ponct_init[j]) {
00257                                                                 ponct_init[j]--;
00258                                                                 cs_eval->rating+=2;
00259                                                         }
00260                                                 } //for
00261                                         } //switch
00262                                         
00263                                         //rate according to the script
00264                                         if (scan_scr==1) {
00265                                                 if (scan_scr== pre1_scr)
00266                                                         cs_eval->rating++;
00267                                                 if (scan_scr == post1_scr)
00268                                                         cs_eval->rating++;
00269                                         } else if (scan_scr>1) {
00270                                                 if (scan_scr== pre1_scr)
00271                                                         cs_eval->rating+=2;
00272                                                 if (scan_scr == post1_scr)
00273                                                         cs_eval->rating+=2;
00274                                         }
00275 
00276                                 } //for nb_charsets
00277 
00278                         } //if (text->charset == UT_UNSET)
00279                 
00280                 } //if (*scan>0x7F)
00281 
00282         } //for (;;)
00283         
00284         
00285         //interrupted?
00286         if (scan<scan_end) {
00287                 return UT_INTERRUPTED_BY_USER;
00288         }
00289         
00290         if (text->flags & UT_F_REFERENCE_EXT_CHAR ) {
00291                 //filter the extended line linked list
00292                 for (i=0x0; i<0x80; i++) ext_char[i] = false;
00293                 pre_exl = scan_exl = text->ext_char;
00294                 
00295                 while (scan_exl) {  //scan each struct
00296                         ext_char_diff = false;
00297                         scan = scan_exl->line_p;
00298                         while (*scan) { //scan each char
00299                                 if ((u_char)*scan>0x7F) { //char extended found
00300                                         if (!ext_char[(u_char)*scan-0x80]) { //already found?
00301                                                 ext_char[(u_char)*scan-0x80] = true;
00302                                                 ext_char_diff = true;
00303                                         }
00304                                 }
00305                                 scan++;
00306                         }//while
00307                         
00308                         if (!ext_char_diff) { //remove the struct ext_char_line?
00309                                 pre_exl->next = scan_exl->next; //(first struct is never removed, so this code is ok)
00310                                 free (scan_exl);
00311                                 scan_exl = pre_exl->next;
00312                         } else {
00313                                 pre_exl = scan_exl;
00314                                 scan_exl = scan_exl->next;
00315                         }
00316                 } //while
00317 
00318                 //sort the extended line linked list with an insertion sort
00319                 UtExtCharLine * src_exl, *pre_src_exl;
00320                 UtExtCharLine * dst_exl, *pre_dst_exl;
00321                 
00322                 src_exl = pre_src_exl = text->ext_char;
00323                 while (src_exl) {
00324                         
00325                         pre_dst_exl = dst_exl = text->ext_char;
00326                         new_exl = src_exl->next;
00327                         
00328                         while (src_exl!=dst_exl) {
00329                                 if (src_exl->line_i < dst_exl->line_i) {
00330                                         //insert src before dst postion
00331                                         pre_src_exl->next = src_exl->next;
00332                                         src_exl->next = dst_exl;
00333         
00334                                         if (dst_exl == text->ext_char)  text->ext_char = src_exl; //fisrt pos?
00335                                         else pre_dst_exl->next = src_exl;       //second pos or after
00336                                         src_exl = pre_src_exl;
00337                                         break;
00338                                 } //if
00339                                 pre_dst_exl = dst_exl;
00340                                 dst_exl = dst_exl->next;
00341                         } //while
00342                         pre_src_exl = src_exl;
00343                         src_exl = new_exl;
00344                 } //while
00345         }
00346         
00347         if (text->charset == UT_UNSET) {        
00348                 //calculate checksum for each charset
00349                 for (i=0; i<ut_session->nb_charsets; i++) {
00350                         if (ut_session->charset[i].type != UT_CST_ASCII_EXTENSION) continue;
00351                         for (j=0x80; j<0x100; j++) {
00352                                 if ( text->distribution[j]) text->evaluation[i].checksum 
00353                                                 = ut_crc32 (ut_session->charset[i].unicode[(u_char)j], text->evaluation[i].checksum);
00354                         }
00355                 }
00356 
00357                 //choose the best charmap depending on the results of the estimation
00358                 //and on the selected language
00359                 double max_value = -1; //long could also be used
00360                 short max_index = -1;
00361                 double tmp;
00362                 
00363                 for (i=0; i<ut_session->nb_charsets; i++) {
00364                         tmp = text->evaluation[i].rating;
00365                         tmp *= ut_get_charset_coef (i);
00366 
00367                         if (tmp > max_value) {
00368                                 max_value = tmp;
00369                                 max_index = i;
00370                         }
00371                 }
00372                 text->charset = max_index;
00373         
00374                 if (max_index<0) {
00375                         DBG1 ("*** NO CHARSET SELECTED !!! ***")
00376                         //return UT_CHARSET_NOT_RECOGNIZED_ERROR;
00377                 } else {
00378                         DBG2 ("%s selected", ut_session->charset[max_index].name)
00379                 }
00380         }
00381         DBG2 ("Extended Ascii charset pass done! (%lu B)", text->size)
00382                 
00383         return UT_OK;
00384 }