00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00031 #include <stdlib.h>
00032 #include <stdio.h>
00033 #include "ut_text.h"
00034 #include "ut_charset.h"
00035 #include "utrac.h"
00036
00037
00038
00039 #include "debug.h"
00040
00041
00043 char inline ut_get_pre_char (char **scan_pre, UtText * text) {
00044 do {
00045 if (*scan_pre == text->data) return 0;
00046 --(*scan_pre);
00047 } while (**scan_pre == text->skip_char);
00048 return **scan_pre;
00049 }
00050
00051
00053 char inline ut_get_post_char (char **scan_post, UtText * text, char *scan_end) {
00054 do {
00055 if (*scan_post == scan_end) return 0;
00056 ++(*scan_post);
00057 } while (**scan_post == text->skip_char);
00058 return **scan_post;
00059 }
00060
00061
00062
00082 UtCode ut_xascii_pass (UtText * text) {
00083
00084
00085 int i,j;
00086 char * scan = text->data;
00087 char * scan_end = text->data + text->size;
00088
00089 char * line_beg = scan;
00090 ulong line_i = 0;
00091 ulong nb_ext_chars = 0;
00092 bool ext_char[0x80]; for (i=0x0; i<0x80; i++) ext_char[i] = false;
00093 bool ext_char_diff = false;
00094
00095 UtExtCharLine * scan_exl, * pre_exl, * new_exl;
00096 ulong ponct_init[UT_CTG_PONCT_IF_N]; for (i=0; i<UT_CTG_PONCT_IF_N; i++) ponct_init[i] = 0;
00097
00098
00099 if (text->charset == UT_UNSET) {
00100 if (!text->evaluation)
00101 text->evaluation = (UtCharsetEval*) malloc ( sizeof (UtCharsetEval) * ut_session->nb_charsets);
00102
00103 for (i=0; i<ut_session->nb_charsets; i++) {
00104 text->evaluation [i].rating = 0;
00105 text->evaluation [i].checksum = 0;
00106 }
00107 }
00108
00109 int cumul = 1;
00110 scan--;
00111 for (;;) {
00112 scan++;
00113 if (!*scan) {
00114 if (scan - text->data >= UT_PROCESS_STEP*cumul && ut_session->progress_function) {
00115 if (!ut_update_progress (text, scan - text->data, false)) break;
00116 cumul++;
00117 }
00118 if (scan >= scan_end) {
00119 ASSERT (scan==scan_end)
00120 break;
00121 }
00122 if (text->flags & UT_F_REFERENCE_EXT_CHAR ) {
00123 if (ext_char_diff) {
00124
00125 new_exl = (UtExtCharLine*) malloc (sizeof(UtExtCharLine));
00126 new_exl->line_p = line_beg;
00127 new_exl->line_i = line_i;
00128 new_exl->nb_ext_chars = nb_ext_chars;
00129
00130
00131
00132 if (!text->ext_char
00133 || text->ext_char->nb_ext_chars <= nb_ext_chars ) {
00134 new_exl->next = text->ext_char;
00135 text->ext_char = new_exl;
00136 } else {
00137 pre_exl = scan_exl = text->ext_char;
00138 while (scan_exl && scan_exl->nb_ext_chars > nb_ext_chars) {
00139 pre_exl = scan_exl;
00140 scan_exl = scan_exl->next;
00141 }
00142 pre_exl->next = new_exl;
00143 new_exl->next = scan_exl;
00144 }
00145 ext_char_diff = false;
00146 }
00147 nb_ext_chars = 0;
00148 line_beg = scan+1;
00149 line_i++;
00150 }
00151
00152 } else if ((u_char)*scan>0x7F) {
00153 if (text->flags & UT_F_REFERENCE_EXT_CHAR ) {
00154 nb_ext_chars++;
00155 if (!ext_char[(u_char)*scan-0x80]) {
00156 ext_char[(u_char)*scan-0x80] = true;
00157 ext_char_diff = true;
00158 }
00159 }
00160
00161 if (text->charset == UT_UNSET) {
00162
00163 UtCharsetEval * cs_eval = &(text->evaluation[0]);
00164
00165
00166 for (i=0; i<ut_session->nb_charsets; i++, cs_eval++) {
00167 UtCharset * cs = &(ut_session->charset[i]);
00168 if (cs->type != UT_CST_ASCII_EXTENSION) continue;
00169
00170 char tmp;
00171 UtCateg pre1_ctg, pre2_ctg, scan_ctg, post1_ctg, post2_ctg, post3_ctg;
00172 UtScript pre1_scr, scan_scr, post1_scr;
00173 char * scan_pre = scan, * scan_post = scan;
00174
00175
00176 scan_ctg = (cs->char_type[(u_char) *scan].categorie);
00177 scan_scr = (cs->char_type[(u_char) *scan].script);
00178 tmp = ut_get_pre_char (&scan_pre, text);
00179 pre1_ctg = (cs->char_type[(u_char) tmp].categorie);
00180 pre1_scr = (cs->char_type[(u_char) tmp].script);
00181 tmp = ut_get_post_char (&scan_post, text, scan_end);
00182 post1_ctg = (cs->char_type[(u_char) tmp].categorie);
00183 post1_scr = (cs->char_type[(u_char) tmp].script);
00184
00185
00186 switch (scan_ctg) {
00187 case UT_CTG_UPPERCASE:
00188 if ( pre1_ctg==UT_CTG_DELIMITER &&
00189 (post1_ctg==UT_CTG_LOWERCASE || post1_ctg==UT_CTG_UPPERCASE)) cs_eval->rating++;
00190 else
00191 if ( pre1_ctg==UT_CTG_UPPERCASE) cs_eval->rating++;
00192 else {
00193 post2_ctg = (cs->char_type [(u_char) ut_get_post_char (&scan_post, text, scan_end)].categorie);
00194 if (post1_ctg==UT_CTG_UPPERCASE && post2_ctg!=UT_CTG_LOWERCASE) cs_eval->rating++;
00195 else {
00196 pre2_ctg = (cs->char_type [(u_char) ut_get_pre_char (&scan_pre, text)].categorie);
00197 if ( pre1_ctg==UT_CTG_DELIMITER && post1_ctg==UT_CTG_DELIMITER &&
00198 (( pre2_ctg==UT_CTG_UPPERCASE && post2_ctg==UT_CTG_UPPERCASE) ||
00199 (pre2_ctg==UT_CTG_NUMBER && post2_ctg==UT_CTG_NUMBER))) cs_eval->rating++;
00200 }
00201 } break;
00202
00203 case UT_CTG_LOWERCASE:
00204 if ( pre1_ctg==UT_CTG_LOWERCASE) cs_eval->rating++;
00205 else
00206 if (post1_ctg==UT_CTG_LOWERCASE) cs_eval->rating++;
00207 else
00208 if ( pre1_ctg==UT_CTG_UPPERCASE && post1_ctg!=UT_CTG_UPPERCASE) cs_eval->rating++;
00209 else {
00210 pre2_ctg = (cs->char_type [(u_char) ut_get_pre_char (&scan_pre , text)].categorie);
00211 post2_ctg = (cs->char_type [(u_char) ut_get_post_char (&scan_post, text, scan_end)].categorie);
00212 post3_ctg = (cs->char_type [(u_char) ut_get_post_char (&scan_post, text, scan_end)].categorie);
00213 if ( pre1_ctg==UT_CTG_DELIMITER && post1_ctg==UT_CTG_DELIMITER &&
00214 (( pre2_ctg==UT_CTG_LOWERCASE && (post2_ctg==UT_CTG_LOWERCASE || (post2_ctg==UT_CTG_UPPERCASE && post3_ctg==UT_CTG_LOWERCASE))
00215 ) || (pre2_ctg==UT_CTG_NUMBER && post2_ctg==UT_CTG_NUMBER))) cs_eval->rating++;
00216 } break;
00217 case UT_CTG_OTHER_LETTER:
00218 if (pre1_ctg==UT_CTG_OTHER_LETTER) cs_eval->rating++;
00219 if (post1_ctg==UT_CTG_OTHER_LETTER) cs_eval->rating++;
00220 break;
00221
00222 case UT_CTG_MARK:
00223 if (pre1_ctg>=UT_CTG_UPPERCASE && pre1_ctg<=UT_CTG_OTHER_LETTER) cs_eval->rating++;
00224 if (post1_ctg>=UT_CTG_UPPERCASE && post1_ctg<=UT_CTG_OTHER_LETTER) cs_eval->rating++;
00225 break;
00226
00227 case UT_CTG_CONTROL:
00228 case UT_CTG_UNSET:
00229 cs_eval->rating-=2;
00230 break;
00231
00232 case UT_CTG_CURRENCY:
00233 if (pre1_ctg==UT_CTG_NUMBER || post1_ctg==UT_CTG_NUMBER) cs_eval->rating++;
00234 else if (pre1_ctg==UT_CTG_DELIMITER) {
00235 pre2_ctg = (cs->char_type [(u_char) ut_get_pre_char (&scan_pre , text)].categorie);
00236 if (pre2_ctg==UT_CTG_NUMBER ) cs_eval->rating++;
00237 }
00238 break;
00239
00240 case UT_CTG_SYMBOL:
00241 switch (cs->unicode[(u_char)*scan]) {
00242 case 0x00B0:
00243 pre2_ctg = (cs->char_type [(u_char) ut_get_pre_char (&scan_pre, text)].categorie);
00244 if (pre2_ctg>UT_CTG_OTHER_LETTER && (*(scan-1)=='N' || *(scan-1)=='n')
00245 && post1_ctg>UT_CTG_OTHER_LETTER) cs_eval->rating+=3;
00246 } break;
00247 case UT_CTG_DELIMITER:
00248 if (pre1_ctg==post1_ctg || *scan==*(scan-1) || *scan==*(scan+1)) cs_eval->rating++;
00249 break;
00250 case UT_CTG_NUMBER:
00251 case UT_CTG_PONCTUATION:
00252 case UT_CTG_OTHER: break;
00253 default:
00254 for (j=0; j<UT_CTG_PONCT_IF_N; j++) {
00255 if (scan_ctg==UT_CTG_PONCT_INIT_0+j) ponct_init[j]++;
00256 else if (scan_ctg==UT_CTG_PONCT_FINAL_0+j && ponct_init[j]) {
00257 ponct_init[j]--;
00258 cs_eval->rating+=2;
00259 }
00260 }
00261 }
00262
00263
00264 if (scan_scr==1) {
00265 if (scan_scr== pre1_scr)
00266 cs_eval->rating++;
00267 if (scan_scr == post1_scr)
00268 cs_eval->rating++;
00269 } else if (scan_scr>1) {
00270 if (scan_scr== pre1_scr)
00271 cs_eval->rating+=2;
00272 if (scan_scr == post1_scr)
00273 cs_eval->rating+=2;
00274 }
00275
00276 }
00277
00278 }
00279
00280 }
00281
00282 }
00283
00284
00285
00286 if (scan<scan_end) {
00287 return UT_INTERRUPTED_BY_USER;
00288 }
00289
00290 if (text->flags & UT_F_REFERENCE_EXT_CHAR ) {
00291
00292 for (i=0x0; i<0x80; i++) ext_char[i] = false;
00293 pre_exl = scan_exl = text->ext_char;
00294
00295 while (scan_exl) {
00296 ext_char_diff = false;
00297 scan = scan_exl->line_p;
00298 while (*scan) {
00299 if ((u_char)*scan>0x7F) {
00300 if (!ext_char[(u_char)*scan-0x80]) {
00301 ext_char[(u_char)*scan-0x80] = true;
00302 ext_char_diff = true;
00303 }
00304 }
00305 scan++;
00306 }
00307
00308 if (!ext_char_diff) {
00309 pre_exl->next = scan_exl->next;
00310 free (scan_exl);
00311 scan_exl = pre_exl->next;
00312 } else {
00313 pre_exl = scan_exl;
00314 scan_exl = scan_exl->next;
00315 }
00316 }
00317
00318
00319 UtExtCharLine * src_exl, *pre_src_exl;
00320 UtExtCharLine * dst_exl, *pre_dst_exl;
00321
00322 src_exl = pre_src_exl = text->ext_char;
00323 while (src_exl) {
00324
00325 pre_dst_exl = dst_exl = text->ext_char;
00326 new_exl = src_exl->next;
00327
00328 while (src_exl!=dst_exl) {
00329 if (src_exl->line_i < dst_exl->line_i) {
00330
00331 pre_src_exl->next = src_exl->next;
00332 src_exl->next = dst_exl;
00333
00334 if (dst_exl == text->ext_char) text->ext_char = src_exl;
00335 else pre_dst_exl->next = src_exl;
00336 src_exl = pre_src_exl;
00337 break;
00338 }
00339 pre_dst_exl = dst_exl;
00340 dst_exl = dst_exl->next;
00341 }
00342 pre_src_exl = src_exl;
00343 src_exl = new_exl;
00344 }
00345 }
00346
00347 if (text->charset == UT_UNSET) {
00348
00349 for (i=0; i<ut_session->nb_charsets; i++) {
00350 if (ut_session->charset[i].type != UT_CST_ASCII_EXTENSION) continue;
00351 for (j=0x80; j<0x100; j++) {
00352 if ( text->distribution[j]) text->evaluation[i].checksum
00353 = ut_crc32 (ut_session->charset[i].unicode[(u_char)j], text->evaluation[i].checksum);
00354 }
00355 }
00356
00357
00358
00359 double max_value = -1;
00360 short max_index = -1;
00361 double tmp;
00362
00363 for (i=0; i<ut_session->nb_charsets; i++) {
00364 tmp = text->evaluation[i].rating;
00365 tmp *= ut_get_charset_coef (i);
00366
00367 if (tmp > max_value) {
00368 max_value = tmp;
00369 max_index = i;
00370 }
00371 }
00372 text->charset = max_index;
00373
00374 if (max_index<0) {
00375 DBG1 ("*** NO CHARSET SELECTED !!! ***")
00376
00377 } else {
00378 DBG2 ("%s selected", ut_session->charset[max_index].name)
00379 }
00380 }
00381 DBG2 ("Extended Ascii charset pass done! (%lu B)", text->size)
00382
00383 return UT_OK;
00384 }