00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00031 #include <sys/stat.h>
00032 #include <unistd.h>
00033 #include <float.h>
00034 #include <fcntl.h>
00035 #include <stdio.h>
00036 #include <stdlib.h>
00037 #include <limits.h>
00038
00039 #include <endian.h>
00040 #include <byteswap.h>
00041
00042 #include "utrac.h"
00043 #include "ut_text.h"
00044 #include "ut_charset.h"
00045
00046
00047
00048 #include "debug.h"
00049
00050
00064 UtCode ut_load_charset_file (const char * filename, char ** buffer) {
00065
00066 DBG3 ("Loading file %s...", filename)
00067
00068 int fd = open (filename, O_RDONLY);
00069 if (fd==-1) return UT_OPEN_FILE_ERROR;
00070
00071 struct stat f_stat;
00072 if (fstat (fd, &f_stat)) return UT_FSTAT_FILE_ERROR;
00073
00074
00075 *buffer = (char*) malloc (f_stat.st_size + 2);
00076 if (!*buffer) return UT_MALLOC_ERROR;
00077
00078 int code=read (fd, *buffer, f_stat.st_size);
00079 if (code<=0) return UT_READ_FILE_ERROR;
00080 if (code!=f_stat.st_size) return UT_READ_FILE_ERROR2;
00081
00082 DBG3 ("File %s (%lu b) loaded!", filename, f_stat.st_size)
00083
00084 *(*buffer+f_stat.st_size) = '\n';
00085 *(*buffer+f_stat.st_size+1) = UT_EOF_CHAR;
00086
00087 if (close(fd)) return UT_CLOSE_FILE_ERROR;
00088
00089 return UT_OK;
00090 }
00091
00092
00093
00094
00102 void ut_print_binary (ulong src) {
00103
00104 int i; for (i=0; i<16; i++) {
00105 if (src&1<<15) putchar('x');
00106 else putchar ('-');
00107 src<<=1;
00108 if (!((i+1)%4)) putchar(' ');
00109 }
00110
00111 }
00112
00113
00117 UtCode ut_debug_text (UtText * text) {
00118
00119 ASSERT (text);
00120
00121 printf ("=====> Structure UtText :\n");
00122
00123 printf ("- size : %lu - %luk - %lum\n", text->size, text->size/1024, text->size/1024/1024);
00124 printf ("- lines1 : %lu - %luk\n", text->nb_lines, text->nb_lines/1024);
00125 printf ("- lines2 : %lu - %luk\n", text->nb_lines_alt, text->nb_lines_alt/1024);
00126 printf ("- skip char : <%c>\n", text->skip_char);
00127 printf ("- flags : "); ut_print_binary (text->flags); putchar('\n');
00128
00129
00130
00131 printf ("- eol1 : <%d>\n", text->eol);
00132 printf ("- eol2 : <%d>\n", text->eol_alt);
00133 printf ("- charset : <%hu>", text->charset);
00134 if (text->charset != UT_UNSET) printf (" (%s)",
00135 ut_session->charset[text->charset].name);
00136 putchar('\n');
00137
00138
00139 return UT_OK;
00140 }
00141
00142
00146 UtCode ut_debug_text_rating (UtText * text) {
00147
00148 ASSERT (text);
00149 if (!text->evaluation) return UT_OK;
00150
00151 int i; for (i=0; i<ut_session->nb_charsets; i++) {
00152 printf ("=> %2i: chk:%11lx rtg:%6ld %s\n", i, text->evaluation[i].checksum,
00153 text->evaluation[i].rating, ut_session->charset[i].name);
00154 }
00155
00156 return UT_OK;
00157 }
00158
00160
/** True when c is an ASCII upper-case letter ('A'..'Z'). */
static inline bool is_maj (char c) { return c >= 'A' && c <= 'Z'; }

/** True when c is an ASCII lower-case letter ('a'..'z'). */
static inline bool is_min (char c) { return c >= 'a' && c <= 'z'; }

/** True when c is an ASCII letter of either case. */
static inline bool is_letter (char c) { return is_maj (c) || is_min (c); }

/** True when c is an ASCII decimal digit ('0'..'9'). */
static inline bool is_num (char c) { return c >= '0' && c <= '9'; }
00166
00167
00168
00169
00174 UtCharsetIndex ut_find_charset (char * charset_name) {
00175
00176 ASSERT (charset_name)
00177
00178 UtCharsetIndex i;
00179 for (i=0; i<ut_session->nb_charsets; i++) {
00180 if ( ut_session->charset[i].name &&
00181 ut_str_fuzzy_cmp (charset_name, ut_session->charset[i].name, 0)) break;
00182 if ( ut_session->charset[i].alias &&
00183 ut_str_fuzzy_cmp (charset_name, ut_session->charset[i].alias, 0)) break;
00184 }
00185
00186 if (i==ut_session->nb_charsets) return UT_UNSET;
00187 else return i;
00188 }
00189
00190 UtEolType ut_find_eol (char * eol_name) {
00191
00192 ASSERT (eol_name)
00193
00194 UtEolType j;
00195 for (j= UT_EOL_CR; j<UT_EOL_NONE; j++)
00196 if ( UT_EOL_NAME[j] && ut_str_fuzzy_cmp (eol_name, UT_EOL_NAME[j], 0) ) break;
00197
00198 if (j==UT_EOL_NONE) return UT_EOL_UNSET;
00199 else return j;
00200 }
00201
00202 int ut_find_lang_sys (char * language_name, UtLangSys * lang_sys) {
00203
00204 int language_id;
00205 char ln[2];
00206
00207 ln[0] = language_name[0];
00208 ln[1] = language_name[1];
00209 if ('a'<= ln[0] && ln[0] <= 'z' ) ln[0] += 'A'-'a';
00210 if ('a'<= ln[1] && ln[1] <= 'z' ) ln[1] += 'A'-'a';
00211
00212 for (language_id=0; language_id < lang_sys->n; language_id++) {
00213 if ( ln[0] == lang_sys->code[language_id*2+0] &&
00214 ln[1] == lang_sys->code[language_id*2+1]) break;
00215 }
00216
00217 if (language_id == lang_sys->n) return UT_UNSET;
00218
00219 return language_id;
00220 }
00221
00222
00223
00224
00232 bool ut_str_fuzzy_cmp (const char *str1, const char *str2, char stop_char) {
00233
00234 ASSERT(str1)
00235 ASSERT(str2)
00236
00237
00238 const char SEP = '*';
00239 const char END = 0;
00240 char prec1, c1=0;
00241 char prec2, c2=0;
00242
00243 for (;;) {
00244 prec1 = c1;
00245 if (is_letter(*str1)) {
00246 if (is_maj(prec1) || prec1==SEP) c1 = *str1++ & ~0x20;
00247 else c1 = SEP;
00248 } else if (is_num (*str1)) {
00249 if (is_num (prec1) || prec1==SEP) c1 = *str1++;
00250 else c1 = SEP;
00251 } else if (!*str1 || *str1==stop_char) {
00252 if (prec1==SEP) c1 = END;
00253 else c1=SEP;
00254 } else {
00255 c1 = SEP;
00256 while (!is_letter(*str1) && !is_num(*str1) && *str1 && *str1!=stop_char) str1++;
00257 }
00258 prec2 = c2;
00259 if (is_letter(*str2)) {
00260 if (is_maj(prec2) || prec2==SEP) c2 = *str2++ & ~0x20;
00261 else c2 = SEP;
00262 } else if (is_num (*str2)) {
00263 if (is_num (prec2) || prec2==SEP) c2 = *str2++;
00264 else c2 = SEP;
00265 } else if (!*str2 || *str2==stop_char) {
00266 if (prec2==SEP) c2 = END;
00267 else c2=SEP;
00268 } else {
00269 c2 = SEP;
00270 while (!is_letter(*str2) && !is_num(*str2) && *str2 && *str2!=stop_char) str2++;
00271 }
00272 if (c1!=c2) {
00273
00274 return false; }
00275 if (c1==END) {
00276
00277 return true;
00278 }
00279 }
00280 }
00281
00282
00283
00284
00285 double ut_get_charset_coef (UtCharsetIndex i) {
00286
00287 float coef;
00288
00289 if (ut_session->language_default>=0)
00290 coef = UT_LANG_SYS_COEF [ut_session->charset[i].language[ut_session->language_default]];
00291 else
00292 coef = 1.0;
00293
00294 if (ut_session->system_default>=0)
00295 coef *= UT_LANG_SYS_COEF [ut_session->charset[i].system[ut_session->system_default]];
00296
00297 return coef;
00298 }
00299
00300
00301
00302
00303
00304
00316 bool ut_update_progress (UtText * text, ulong processed, bool start_stop) {
00317
00318 ASSERT (ut_session->progress_function)
00319
00320 float rate;
00321
00322 if (start_stop) {
00323 if (!text->progress_done) rate = 0;
00324 else if (!text->progress_todo) rate = 1.0;
00325 else {
00326 rate = 0;
00327 DBG1 ("ut_update_progress: done!=0 && todo!=0 !?!?")
00328 }
00329 } else {
00330 rate = text->progress_done + (1-text->progress_done)*( (float) processed/text->size)/text->progress_todo;
00331 if (rate==0.0) rate = FLT_MIN;
00332 else if (rate==1.0) rate = 1.0 - FLT_MIN;
00333 if (rate>1.0) {
00334 DBG1 ("ut_update_progress: rate = %f !!", rate)
00335 }
00336 }
00337
00338 return (*(ut_session->progress_function)) (text, rate);
00339 }
00340
00341
00343 ulong ut_crc32_table[256];
00345 const ulong UT_CRC32_POLY=0x04c11db7;
00346
00347
00366 ulong ut_crc32(ushort data, ulong crc_in) {
00367 ulong crc;
00368
00369 if (!ut_crc32_table[1]) {
00370 int i, j; ulong c;
00371 for (i = 0; i < 256; ++i) {
00372 for (c = i << 24, j = 8; j > 0; --j) c = c & 0x80000000 ? (c << 1) ^ UT_CRC32_POLY : (c << 1);
00373 ut_crc32_table[i] = c;
00374 }
00375 }
00376 crc_in = ~crc_in;
00377 crc = (crc_in << 8) ^ ut_crc32_table[((crc_in >> 16) ^ data )>>8];
00378 crc = (crc << 8) ^ ut_crc32_table[(crc >> 24) ^ (data&0xFF)];
00379 return ~crc;
00380 }