Utrac: ut_utils.c Source File

00001 /***************************************************************************
00002  *            ut_utils.c
00003  *
00004  *  Tue Oct  5 11:29:53 2004
00005  *  Copyright  2004  Alliance MCA
00006  *  Written by : Antoine Calando (antoine@alliancemca.net)
00007  ****************************************************************************/
00008 
00009 /*
00010  *  This program is free software; you can redistribute it and/or modify
00011  *  it under the terms of the GNU General Public License as published by
00012  *  the Free Software Foundation; either version 2 of the License, or
00013  *  (at your option) any later version.
00014  *
00015  *  This program is distributed in the hope that it will be useful,
00016  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00017  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00018  *  GNU Library General Public License for more details.
00019  *
00020  *  You should have received a copy of the GNU General Public License
00021  *  along with this program; if not, write to the Free Software
00022  *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
00023  */
00024  
00031 #include <sys/stat.h>
00032 #include <unistd.h>
00033 #include <float.h>
00034 #include <fcntl.h>
00035 #include <stdio.h>
00036 #include <stdlib.h>
00037 #include <limits.h> //for SSIZE_MAX
00038 
00039 #include <endian.h>
00040 #include <byteswap.h>
00041 
00042 #include "utrac.h"
00043 #include "ut_text.h"
00044 #include "ut_charset.h"
00045 
00046 //#undef UT_DEBUG
00047 //#define UT_DEBUG 3
00048 #include "debug.h"
00049 
00050 /***************************************************************************/
00064 UtCode ut_load_charset_file (const char * filename, char ** buffer) {
00065         
00066         DBG3 ("Loading file %s...", filename)
00067         
00068         int fd = open (filename, O_RDONLY);
00069         if (fd==-1) return UT_OPEN_FILE_ERROR;
00070 
00071         struct stat f_stat;
00072         if (fstat (fd, &f_stat)) return UT_FSTAT_FILE_ERROR;
00073         
00074         //some space is needed to add an EOL and an EOF
00075         *buffer = (char*) malloc (f_stat.st_size + 2);
00076         if (!*buffer) return UT_MALLOC_ERROR;
00077         
00078         int code=read (fd, *buffer, f_stat.st_size);
00079         if (code<=0) return UT_READ_FILE_ERROR;
00080         if (code!=f_stat.st_size) return UT_READ_FILE_ERROR2;
00081 
00082         DBG3 ("File %s (%lu b) loaded!", filename, f_stat.st_size)
00083         
00084         *(*buffer+f_stat.st_size) = '\n';
00085         *(*buffer+f_stat.st_size+1) = UT_EOF_CHAR;
00086 
00087         if (close(fd)) return UT_CLOSE_FILE_ERROR;
00088 
00089         return UT_OK;
00090 }
00091 
00092 
00093 
00094 /***************************************************************************/
00102 void ut_print_binary (ulong src) {
00103 
00104         int i; for (i=0; i<16; i++) {
00105                 if (src&1<<15) putchar('x');
00106                 else putchar ('-');
00107                 src<<=1;
00108                 if (!((i+1)%4)) putchar(' ');
00109         }
00110         
00111 }
00112 
00113 /***************************************************************************/
00117 UtCode ut_debug_text (UtText * text) {
00118         
00119         ASSERT (text);
00120         
00121         printf ("=====> Structure UtText :\n");
00122         //data  
00123         printf ("- size : %lu - %luk - %lum\n", text->size, text->size/1024, text->size/1024/1024);
00124         printf ("- lines1 : %lu - %luk\n", text->nb_lines, text->nb_lines/1024);
00125         printf ("- lines2 : %lu - %luk\n", text->nb_lines_alt, text->nb_lines_alt/1024);
00126         printf ("- skip char : <%c>\n", text->skip_char);
00127         printf ("- flags : "); ut_print_binary (text->flags); putchar('\n');            
00128         //distrib
00129         //ext_char
00130         //charmap
00131         printf ("- eol1 : <%d>\n", text->eol);
00132         printf ("- eol2 : <%d>\n", text->eol_alt);
00133         printf ("- charset : <%hu>", text->charset);
00134         if (text->charset != UT_UNSET) printf (" (%s)", 
00135                 ut_session->charset[text->charset].name);
00136         putchar('\n');
00137         //convert eol
00138         //convert charset
00139         return UT_OK;
00140 }
00141 
00142 /***************************************************************************/
00146 UtCode ut_debug_text_rating (UtText * text) {
00147         
00148         ASSERT (text);
00149         if (!text->evaluation) return UT_OK;
00150         
00151         int i; for (i=0; i<ut_session->nb_charsets; i++) {
00152                 printf ("=> %2i:   chk:%11lx   rtg:%6ld     %s\n", i, text->evaluation[i].checksum, 
00153                         text->evaluation[i].rating, ut_session->charset[i].name);       
00154         }
00155                 
00156         return UT_OK;   
00157 }
00158 
00160 
/* ASCII-only character class helpers (no locale involved). */

/* True for 'A'..'Z'. */
static inline bool is_maj (char c) {
        return !(c<'A' || c>'Z');
}

/* True for 'a'..'z'. */
static inline bool is_min (char c) {
        return !(c<'a' || c>'z');
}

/* True for an ASCII letter of either case. */
static inline bool is_letter (char c) {
        return is_maj(c) || is_min(c);
}

/* True for '0'..'9'. */
static inline bool is_num (char c) {
        return !(c<'0' || c>'9');
}
00166 // @}
00167 
00168 
00169 
00174 UtCharsetIndex ut_find_charset (char * charset_name) {
00175         
00176         ASSERT (charset_name)
00177         
00178         UtCharsetIndex i;
00179         for (i=0; i<ut_session->nb_charsets; i++) {
00180                 if ( ut_session->charset[i].name &&
00181                         ut_str_fuzzy_cmp (charset_name, ut_session->charset[i].name, 0)) break;
00182                 if ( ut_session->charset[i].alias &&
00183                         ut_str_fuzzy_cmp (charset_name, ut_session->charset[i].alias, 0)) break;        
00184         }
00185 
00186         if (i==ut_session->nb_charsets) return UT_UNSET;
00187         else return i;
00188 }
00189 
00190 UtEolType ut_find_eol (char * eol_name) {
00191         
00192         ASSERT (eol_name)
00193         
00194         UtEolType j;
00195         for (j= UT_EOL_CR; j<UT_EOL_NONE; j++) 
00196                 if ( UT_EOL_NAME[j] && ut_str_fuzzy_cmp (eol_name, UT_EOL_NAME[j], 0) ) break;
00197 
00198         if (j==UT_EOL_NONE) return UT_EOL_UNSET;
00199         else return j;
00200 }
00201 
00202 int ut_find_lang_sys (char * language_name, UtLangSys * lang_sys) {
00203         
00204         int language_id;
00205         char ln[2];
00206         
00207         ln[0] = language_name[0];
00208         ln[1] = language_name[1];
00209         if ('a'<= ln[0] && ln[0] <= 'z' ) ln[0] += 'A'-'a';
00210         if ('a'<= ln[1] && ln[1] <= 'z' ) ln[1] += 'A'-'a';
00211         
00212         for (language_id=0; language_id < lang_sys->n; language_id++) {
00213                 if ( ln[0] == lang_sys->code[language_id*2+0] &&
00214                          ln[1] == lang_sys->code[language_id*2+1]) break;
00215         }
00216         
00217         if (language_id == lang_sys->n) return UT_UNSET;
00218         
00219         return language_id;
00220 }
00221 
00222 
00223 
00224 /***************************************************************************/
00232 bool ut_str_fuzzy_cmp (const char *str1, const char *str2, char stop_char) {
00233         
00234         ASSERT(str1)
00235         ASSERT(str2)
00236         //DBG3 (" <%s> =? <%s> ", str1, str2);
00237 
00238         const char SEP = '*';
00239         const char END = 0;
00240         char prec1, c1=0;
00241         char prec2, c2=0;
00242         
00243         for (;;) {
00244                 prec1 = c1;
00245                 if (is_letter(*str1)) {
00246                         if (is_maj(prec1) || prec1==SEP) c1 = *str1++ & ~0x20;
00247                         else c1 = SEP;
00248                 } else if (is_num (*str1)) {
00249                         if (is_num (prec1) || prec1==SEP) c1 = *str1++;
00250                         else c1 = SEP;
00251                 } else if (!*str1 || *str1==stop_char) { 
00252                         if (prec1==SEP) c1 = END;
00253                         else c1=SEP;
00254                 } else {
00255                         c1 = SEP;
00256                         while (!is_letter(*str1) && !is_num(*str1) && *str1 && *str1!=stop_char) str1++;
00257                 }
00258                 prec2 = c2;
00259                 if (is_letter(*str2)) {
00260                         if (is_maj(prec2) || prec2==SEP) c2 = *str2++ & ~0x20;
00261                         else c2 = SEP;
00262                 } else if (is_num (*str2)) {
00263                         if (is_num (prec2) || prec2==SEP) c2 = *str2++;
00264                         else c2 = SEP;
00265                 } else if (!*str2 || *str2==stop_char) { 
00266                         if (prec2==SEP) c2 = END;
00267                         else c2=SEP;
00268                 } else {
00269                         c2 = SEP;
00270                         while (!is_letter(*str2) && !is_num(*str2) && *str2 && *str2!=stop_char) str2++;
00271                 }
00272                 if (c1!=c2) {
00273                         //DBG3 ("false");
00274                         return false; }
00275                 if (c1==END) {
00276                         //DBG3 ("true");
00277                         return true;
00278                 }
00279         }
00280 }
00281 
00282 
00283 
00284 
00285 double ut_get_charset_coef (UtCharsetIndex i) {
00286         
00287         float coef;
00288         
00289         if (ut_session->language_default>=0)
00290                 coef = UT_LANG_SYS_COEF [ut_session->charset[i].language[ut_session->language_default]];
00291         else
00292                 coef = 1.0;
00293 
00294         if (ut_session->system_default>=0)
00295                  coef *= UT_LANG_SYS_COEF [ut_session->charset[i].system[ut_session->system_default]];
00296         
00297         return coef;
00298 }
00299 
00300 
00301 
00302 
00303 
00304 /***************************************************************************/
00316 bool ut_update_progress (UtText * text, ulong processed, bool start_stop) {
00317         
00318         ASSERT (ut_session->progress_function)
00319         
00320         float rate;
00321         
00322         if (start_stop) {
00323                 if (!text->progress_done) rate = 0;
00324                 else if (!text->progress_todo) rate = 1.0;
00325                 else {
00326                         rate = 0;
00327                         DBG1 ("ut_update_progress: done!=0 && todo!=0 !?!?")
00328                 }
00329         } else {
00330                 rate = text->progress_done + (1-text->progress_done)*( (float) processed/text->size)/text->progress_todo;
00331                 if (rate==0.0) rate = FLT_MIN;
00332                 else if (rate==1.0) rate = 1.0 - FLT_MIN;
00333                 if (rate>1.0) {
00334                         DBG1 ("ut_update_progress: rate = %f !!", rate)
00335                 }
00336         }
00337         
00338         return (*(ut_session->progress_function)) (text, rate); 
00339 }
00340 
00341 /***************************************************************************/
00343 ulong ut_crc32_table[256];
00345 const ulong UT_CRC32_POLY=0x04c11db7;
00346 
00347 /***************************************************************************/
00366 ulong ut_crc32(ushort data, ulong crc_in) {
00367         ulong  crc;
00368 
00369         if (!ut_crc32_table[1]) {
00370                 int i, j; ulong c;
00371                 for (i = 0; i < 256; ++i) {
00372                         for (c = i << 24, j = 8; j > 0; --j) c = c & 0x80000000 ? (c << 1) ^ UT_CRC32_POLY : (c << 1);
00373                         ut_crc32_table[i] = c;
00374                 }
00375         }
00376         crc_in = ~crc_in;
00377         crc = (crc_in << 8) ^ ut_crc32_table[((crc_in >> 16) ^ data )>>8];  //crc for 8 MSB of data
00378         crc = (crc << 8) ^ ut_crc32_table[(crc >> 24) ^ (data&0xFF)];       //crc for 8 LSB of data
00379         return ~crc;
00380 }