Utrac: ut_charset.c Source File

00001 /***************************************************************************
00002  *            ut_charset.c
00003  *
00004  *  Fri Apr 23 15:24:30 2004
00005  *  Copyright  2004  Alliance MCA
00006  *  Written by : Antoine Calando (antoine@alliancemca.net)
00007  ****************************************************************************/
00008 /*
00009  *  This program is free software; you can redistribute it and/or modify
00010  *  it under the terms of the GNU General Public License as published by
00011  *  the Free Software Foundation; either version 2 of the License, or
00012  *  (at your option) any later version.
00013  *
00014  *  This program is distributed in the hope that it will be useful,
00015  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *  GNU Library General Public License for more details.
00018  *
00019  *  You should have received a copy of the GNU General Public License
00020  *  along with this program; if not, write to the Free Software
00021  *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
00022  */
00023  
00024  
00035 #define _UT_CHARSET_C_
00036 
00037 #include <stdlib.h>
00038 #include <stdio.h>
00039 #include <endian.h>
00040 #include <byteswap.h>
00041 #define __USE_GNU       //for strndup
00042 #include <string.h>
00043 
00044 #include "utrac.h"
00045 #include "ut_charset.h"
00046 
00047 //#undef UT_DEBUG
00048 //#define UT_DEBUG 3
00049 #include "debug.h"
00050 
00051 // ***************************************************************************************
00052 // const char * charmaps_filename = "/home/antoine/dev/libimport/charmaps_categ.txt";
00053 
/* Keywords that ut_load_charsets() matches against the text of the charmap file. */
static const char * charmap_keyword      = "Charmap:";
static const char * alias_keyword        = "Alias:";
static const char * common_name_keyword  = "CommonName:";
static const char * comment_keyword      = "Comment:";
static const char * language_keyword     = "Language:";
static const char * system_keyword       = "System:";
static const char * language_def_keyword = "DefineLanguage:";
static const char * system_def_keyword   = "DefineSystem:";

/* Script names looked for in the trailing comment of a charmap entry (NULL-terminated list). */
static const char * SCRIPT_NAME[] = { "LATIN", "CYRILLIC", "ARABIC", "GREEK", "HEBREW", "THAI", NULL};
00066 
00067 // ***************************************************************************************
/** Tells whether c is a space or a horizontal tab. */
static inline bool is_blank (char c) {
        switch (c) {
        case ' ':
        case '\t':
                return true;
        default:
                return false;
        }
}
00074 
/** Tells whether c is a line terminator ('\n' or '\r'); the NUL character is not one. */
static inline bool is_eol (char c) {
        switch (c) {
        case '\n':
        case '\r':
                return true;
        default:
                return false;
        }
}
00090 
/** Tells whether c ends the useful part of a line: a comment start ('#'), '\n' or '\r'.
 *  The NUL character is not reported as an end of line. */
static inline bool is_eol_c (char c) {
        switch (c) {
        case '#':
        case '\n':
        case '\r':
                return true;
        default:
                return false;
        }
}
00097 
00103 typedef struct UtCharmapLink {
00104         UtCharset * charset;
00105         struct UtCharmapLink * next;
00106 } UtCharmapLink;
00107 
00108 /**************************************************************************/
00116 static UtCode expend_lang_sys (UtLangSys *lang_sys) {
00117 
00118         lang_sys->n_max += UT_LANG_SYS_ALLOC_STEP;      
00119         lang_sys->name = (char**)  realloc (lang_sys->name, lang_sys->n_max*sizeof(char*));
00120         //lang_sys->code = (ushort*) realloc (lang_sys->code, lang_sys->n_max*sizeof(ushort));
00121         lang_sys->code = (char*) realloc (lang_sys->code, lang_sys->n_max*2);
00122         //lang_sys->code[0] = 0; lang_sys->code[1] = 0;
00123         
00124         if (!lang_sys->name     || !lang_sys->code) return UT_MALLOC_ERROR;
00125         //if (!lang_sys->name   ) return UT_MALLOC_ERROR;
00126         else return UT_OK;
00127                 
00128         DBG3 ("Lang/sys dynamic array (at %p) expended to %d elements", lang_sys, lang_sys->n_max)
00129 }
00130 
00131 /**************************************************************************/
00145 static UtCode parse_string_line (char** scan_in, char ** dst) {
00146 
00147         char *scan = *scan_in;
00148 
00149         while (is_blank(*scan)) scan++; //trim space before language name
00150 
00151         char * name_beg = scan;
00152         while (!is_eol_c(*scan)) scan++; //find eol or comment
00153         do scan--; while (is_blank(*scan));     //go back until first nonblank char
00154         
00155         if (scan-name_beg<0) return UT_STRING_MISSING_ERROR;
00156         
00157         *dst = strndup (name_beg, scan-name_beg+1);
00158 
00159         *scan_in = scan;
00160         return UT_OK;
00161 }
00162 
00163 /**************************************************************************/
00167 static UtCode parse_lang_sys_def_line (char** scan_in, UtLangSys * lang_sys) {
00168 
00169         char *scan = *scan_in;
00170         
00171         if (ut_session->nb_charsets) return UT_LANG_SYS_DEF_AFTER_CHARSET_ERROR;
00172         if (lang_sys->n == lang_sys->n_max) {
00173                 UT_TRY( expend_lang_sys (lang_sys) )
00174         }
00175 
00176         //printf (scan);
00177         while (is_blank(*scan)) 
00178                 scan++; //trim space before language id
00179         
00180         if (is_eol_c(*scan)) return     UT_LANG_SYS_CODE_MISSING_ERROR;
00181                 
00182         if (is_blank(*(scan+1)) || is_eol_c(*(scan+1))) return UT_PARTIAL_LANG_SYS_CODE_ERROR;
00183         
00184         //lang_sys->code [lang_sys->n] = *(((ushort*)(scan)))++;
00185         //#if BYTE_ORDER == LITTLE_ENDIAN
00186         //bswap_16(lang_sys->code [lang_sys->n]);
00187         //#endif
00188         lang_sys->code [lang_sys->n*2+0] = *scan++;
00189         lang_sys->code [lang_sys->n*2+1] = *scan++;
00190         
00191         //check if language exists
00192         int i; for (i=0; i<lang_sys->n; i++)
00193                 if (lang_sys->code [i*2+0] == lang_sys->code [lang_sys->n*2+0] && 
00194                         lang_sys->code [i*2+1] == lang_sys->code [lang_sys->n*2+1]) 
00195                         return UT_LANG_SYS_ALREADY_DEFINED_ERROR;
00196 
00197         UtCode rcode = parse_string_line (&scan, &lang_sys->name[lang_sys->n]);
00198         if (rcode!=UT_OK) return rcode;
00199         
00200         lang_sys->n++;
00201         
00202         DBG("Lang/sys (%p) added : %s (%c%c) at pos %d", 
00203                         lang_sys, lang_sys->name [lang_sys->n],
00204                         lang_sys->code [lang_sys->n*2+0],
00205                         lang_sys->code [lang_sys->n*2+1], lang_sys->n-1)
00206         
00207         *scan_in = scan;
00208         return UT_OK;
00209 }
00210 
00211 
00212 /**************************************************************************/
00216 static UtCode parse_charmap_line (char** scan_in, UtCharmapLink ** current_link) {
00217         
00218         char* scan = *scan_in;
00219         UtCharmapLink * old_link = *current_link;
00220         
00221         UtCharset * new_charset = (UtCharset*) malloc (sizeof(UtCharset));
00222         if (!new_charset) return UT_MALLOC_ERROR;
00223         new_charset->name = NULL;
00224         new_charset->alias = NULL;
00225         new_charset->common_name = NULL;
00226         new_charset->comment = NULL;
00227         new_charset->type = UT_CST_UNSET;
00228         new_charset->language = (u_char*) malloc (ut_session->language.n*(sizeof(u_char)));
00229         new_charset->system = (u_char*) malloc (ut_session->system.n*(sizeof(u_char)));
00230         new_charset->unicode = NULL;
00231         new_charset->char_type = NULL;
00232         
00233         int i; 
00234         for (i=0; i<ut_session->language.n; i++) new_charset->language[i] = 0;
00235         for (i=0; i<ut_session->system.n; i++) new_charset->system[i] = 0;
00236 
00237         UtCode rcode = parse_string_line (&scan, &new_charset->name);
00238         if (rcode!=UT_OK) return rcode;
00239 
00240         i = 0; while (UT_CHARSET_NAME[i]) {
00241                 if (strcmp (UT_CHARSET_NAME[i], new_charset->name)==0) break;
00242                 i++;
00243         }
00244         new_charset->type = (UtCharsetType) i;
00245         
00246         UtCharmapLink * new_link;
00247         if (old_link->charset ) {
00248                 new_link = (UtCharmapLink*) calloc (1, sizeof(UtCharmapLink));
00249                 old_link->next = new_link; 
00250         } else {
00251                 new_link = old_link;
00252         }
00253         new_link->charset = new_charset;
00254         new_link->next = NULL;
00255         ut_session->nb_charsets++;
00256 
00257         DBG3 (" - Charset %s added! - ", new_charset->name)
00258         *current_link = new_link;
00259         *scan_in = scan;
00260         return UT_OK;
00261 }
00262 
00263 /**************************************************************************/
00267 static UtCode parse_lang_sys_line (char** scan_in, UtLangSys * lang_sys, char * lang_sys_coef) {
00268         char *scan = *scan_in;
00269         
00270         u_char language_id, coef_id;
00271 
00272         for(;;) {
00273                 while (is_blank(*scan)) scan++;
00274                 if (is_eol_c(*scan)) break;
00275                 //ushort lang_sys_code = *(ushort*)scan;
00276                 #if BYTE_ORDER == LITTLE_ENDIAN
00277                 bswap_16 (*(ushort*)scan);
00278                 #endif
00279                 
00280                 for (language_id=0; language_id<lang_sys->n; language_id++) {
00281                         //if ( *(ushort*)scan == lang_sys->code[language_id]) break;
00282                         if (    *scan == lang_sys->code[language_id*2+0] &&
00283                                 *(scan+1) == lang_sys->code[language_id*2+1]) break;
00284                 }
00285                 
00286                 if (language_id==lang_sys->n) return UT_LANG_SYS_UNDEFINED_ERROR;
00287                         
00288                 scan+=2;
00289                 if (*scan==':') {
00290                         char * beg = ++scan;
00291                         coef_id = strtoul (beg, &scan, 0);
00292                         if (beg==scan) return UT_LANG_SYS_COEF_MISSING_ERROR;
00293                         if (!is_blank(*scan) && !is_eol_c(*scan)) return UT_LANG_SYS_INCORRECT_COEF_ERROR;
00294                         if (coef_id>UT_COEF_MAX) return UT_LANG_SYS_COEF_TOO_BIG_ERROR;
00295                 } else coef_id = 1;
00296 
00297                 lang_sys_coef[language_id] = coef_id;
00298         } // for(;;)
00299 
00300         *scan_in = scan-1;
00301         return UT_OK;
00302 }
00303 
00304 /**************************************************************************/
00308 static UtCode parse_charmap_entry (char** scan_in, UtCharset * charset) {
00309         
00310         if (charset->type!=UT_CST_ASCII && charset->type!=UT_CST_ASCII_EXTENSION)
00311                 return UT_CHARMAP_ENTRY_ILLEGAL_ERROR;
00312         
00313         char* scan = *scan_in;
00314         char * hex_beg = scan;
00315         
00316         ulong character = strtoul (hex_beg, &scan, 16);
00317         if (hex_beg==scan) return UT_INCORRECT_CHARMAP_ENTRY_ERROR;    //useless?
00318 
00319         if (character >= 0x80 && charset->type!=UT_CST_ASCII_EXTENSION)
00320                 return UT_CHARMAP_ENTRY_ILLEGAL_ERROR;
00321 
00322         hex_beg = scan;
00323         ulong unicode = strtoul (hex_beg, &scan, 16);
00324         if (hex_beg==scan) unicode = UT_UNICODE_NONCHAR;   //some unicode entries are empty!
00325         if (character>0xFF)             return UT_CHAR_TOO_BIG_ERROR;
00326         if (unicode > 0xFFFF) return UT_UNICODE_CHAR_TOO_BIG_ERROR;
00327                 
00328         if (!charset->unicode && !charset->char_type) {
00329                 charset->unicode = (ushort*) malloc (sizeof( ushort[0x100]));
00330                 charset->char_type = (UtCharType*) malloc (sizeof( UtCharType[0x100]));
00331                 if (!charset->unicode || !charset->char_type) 
00332                         return UT_MALLOC_ERROR;
00333                 int i; for (i=0; i<0x100; i++) {
00334                         charset->unicode[i] = UT_UNICODE_NONCHAR;
00335                         charset->char_type[i].categorie = UT_CTG_UNSET;
00336                         charset->char_type[i].script = 0;
00337                 }
00338         }
00339         
00340         charset->unicode[(u_char)character] = (ushort) unicode;
00341         
00342         while (is_blank(*scan)) scan++;
00343         
00344         if ('A'<=*scan && *scan <= 'Z') {
00345                 
00346                 if (character==0||character==0x9||character==0xA||character==0xD||character==0x20)
00347                         charset->char_type[(u_char) character].categorie = UT_CTG_DELIMITER;
00348                 else 
00349                   #if BYTE_ORDER==BIG_ENDIAN
00350                   switch (* (ushort*) scan ) {
00351                   #else
00352                   switch (bswap_16(* (ushort*) scan )) { //}
00353                   #endif
00354                         case 'Lu': charset->char_type[(u_char) character].categorie = UT_CTG_UPPERCASE; break;
00355                         case 'Ll': charset->char_type[(u_char) character].categorie = UT_CTG_LOWERCASE; break;
00356                         case 'Lt': 
00357                         case 'Lm': 
00358                         case 'Lo': charset->char_type[(u_char) character].categorie = UT_CTG_OTHER_LETTER; break;
00359 
00360                         case 'Mn': charset->char_type[(u_char) character].categorie = UT_CTG_MARK; break;
00361                         case 'Mc': 
00362                         case 'Me': charset->char_type[(u_char) character].categorie = UT_CTG_OTHER; break;
00363 
00364                         case 'Nd': 
00365                         case 'Nl': 
00366                         case 'No': charset->char_type[(u_char) character].categorie = UT_CTG_NUMBER; break;
00367 
00368                         case 'Pc': 
00369                         case 'Pd': 
00370                         case 'Po': charset->char_type[(u_char) character].categorie = UT_CTG_PONCTUATION; break;
00371                         case 'Ps': charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_INIT_OTHER ; break;
00372                         case 'Pe': charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_FINAL_OTHER ; break;
00373                         case 'Pi':
00374                                 switch (unicode) {
00375                                         case 0x00AB: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_INIT_0 ; break;
00376                                         case 0x2018: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_INIT_1 ; break;
00377                                         case 0x201C: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_INIT_2 ; break;
00378                                         case 0x2039: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_INIT_3 ; break;
00379                                         default: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_INIT_OTHER ; break;
00380                                 } break;
00381                 
00382                         case 'Pf': 
00383                                 switch (unicode) {
00384                                         case 0x00BB: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_FINAL_0 ; break;
00385                                         case 0x2019: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_FINAL_1 ; break;
00386                                         case 0x201D: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_FINAL_2 ; break;
00387                                         case 0x203A: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_FINAL_3 ; break;
00388                                         default: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_FINAL_OTHER ; break;
00389                                 } break;
00390 
00391                         case 'Sc': charset->char_type[(u_char) character].categorie = UT_CTG_CURRENCY; break;
00392                         case 'Sm': 
00393                         case 'Sk': 
00394                         case 'So': charset->char_type[(u_char) character].categorie = UT_CTG_SYMBOL; break;
00395 
00396                         case 'Zs': charset->char_type[(u_char) character].categorie = UT_CTG_DELIMITER; break;
00397                         case 'Zl': 
00398                         case 'Zp': charset->char_type[(u_char) character].categorie = UT_CTG_OTHER; break;
00399 
00400                         case 'Cc': charset->char_type[(u_char) character].categorie = UT_CTG_CONTROL; break;
00401                         case 'Cf': 
00402                         case 'Cs': 
00403                         case 'Co': 
00404                         case 'Cn': charset->char_type[(u_char) character].categorie = UT_CTG_OTHER; break;
00405                         default: return UT_UNDEFINED_CATEGORY_ERROR;
00406                 }
00407                 scan +=2;
00408                 while (is_blank(*scan)) scan++;
00409         }
00410         
00411         //look for an script type in the comment (latin, arabic, hebrew...)
00412         if (*scan == '#') {                                                     //is there a comment?
00413                 const char ** script = SCRIPT_NAME;
00414                 int index_script = 0;
00415                 char * first_eol, *first_script;
00416                 first_eol = strchr (scan, '\n');                //find the eol and replace it by \0
00417                 if (first_eol) *first_eol=0;                    //in order to use strstr
00418                 while (*script) {
00419                         index_script++;
00420                         first_script = strstr (scan, *script);          //locate substring
00421                         if (first_script && first_script < first_eol) {
00422                                 charset->char_type [(u_char) character].script = (char) index_script;//found
00423                                 if (first_eol) scan = first_eol; //speed up the parsing
00424                                 break;
00425                         } 
00426                         script++;
00427                 }
00428                 if (first_eol) *first_eol='\n';                 //replace the 0 by the initial eol
00429         }
00430         //while (*scan!='\n') scan++;
00431         *scan_in = scan-1;
00432         return UT_OK;
00433 }
00434 
/**
 * Tests whether the text at *cmp starts with src.
 *
 * @param src  prefix to look for.
 * @param cmp  in: text to test; out: moved just after the prefix when it
 *             matches, unchanged otherwise.
 * @return true if *cmp starts with src (always the case for an empty src).
 */
static bool streq (const char * src, char **cmp) {
        const size_t prefix_len = strlen (src);

        if (strncmp (*cmp, src, prefix_len) != 0)
                return false;

        *cmp += prefix_len;
        return true;
}
00445 
00446 
00447 UtCode ut_print_charsets () {
00448         
00449         int i; for (i=0; i < ut_session->nb_charsets; i++) {
00450                 printf ("%2d: %20s %2d [", i, ut_session->charset[i].name, ut_session->charset[i].type);
00451                 int j; for (j=0; j<ut_session->language.n_max;j++) printf("%d ",(int)ut_session->charset[i].language[j]);
00452                 printf("] [");  
00453                 for (j=0; j<ut_session->system.n_max;j++) printf("%d ",(int)ut_session->charset[i].system[j]);
00454                 printf("]\n");
00455         }
00456         
00457         
00458 }
00459 
00460 
00461 /*****************************************************************************/
00472 UtCode ut_load_charsets () {
00473         
00474         DBG3 ("Loading charsets...")
00475 
00476         int i;
00477         char * file_buffer;
00478         int rcode;
00479         const char * filename;
00480         {
00481                 #ifdef UT_CHARMAPS_FILENAME
00482                 filename = UT_CHARMAPS_FILENAME;
00483                 rcode = ut_load_charset_file (filename, &file_buffer);
00484         }
00485         if (rcode!=UT_OK) {
00486                 #endif
00487                 filename = UT_CHARMAPS_FILENAME2;
00488                 rcode = ut_load_charset_file (filename, &file_buffer);
00489         }
00490 
00491         if (rcode!=UT_OK) return rcode;
00492         
00493         char * scan = file_buffer;
00494         int line = 1;
00495         
00496         //each new charmap is added to a linked list
00497         UtCharmapLink * charmap_list = (UtCharmapLink*) calloc (1, sizeof(UtCharmapLink));
00498         UtCharmapLink * current_link = charmap_list;
00499         
00500         //parse file 
00501         while (*scan) {
00502                 if (*scan=='\r') {
00503                         if (*(scan+1)=='\n') scan++;
00504                         line++;
00505                 } else if (*scan=='\n') {
00506                         line++;
00507                 } else if (!is_blank(*scan)) {
00508                         if (*scan=='#') {
00509                                 while (!is_eol(*++scan));
00510                                 scan--;
00511                         } else if (*scan=='0' && *(scan+1)=='x') {
00512                                 rcode = parse_charmap_entry(&scan, current_link->charset);
00513 
00514                         } else if ( streq (charmap_keyword, &scan) ) {
00515                                 rcode = parse_charmap_line(&scan, &current_link);
00516                         } else if ( streq (alias_keyword, &scan) ) {
00517                                 rcode = parse_string_line(&scan, &current_link->charset->alias);
00518                         } else if ( streq (common_name_keyword, &scan) ) {
00519                                 rcode = parse_string_line(&scan, &current_link->charset->common_name);
00520                         } else if ( streq (comment_keyword, &scan) ) {
00521                                 rcode = parse_string_line(&scan, &current_link->charset->comment);
00522 
00523                         } else if ( streq (language_keyword, &scan) ) {
00524                                 rcode = parse_lang_sys_line(&scan, &ut_session->language, current_link->charset->language);
00525                         } else if ( streq (system_keyword, &scan) ) {
00526                                 rcode = parse_lang_sys_line(&scan, &ut_session->system, current_link->charset->system);
00527 
00528                         } else if ( streq (language_def_keyword, &scan) ) {
00529                                 rcode = parse_lang_sys_def_line(&scan, &ut_session->language);
00530                         } else if ( streq (system_def_keyword, &scan) ) {
00531                                 rcode = parse_lang_sys_def_line(&scan, &ut_session->system);
00532                         } else {
00533                                 //error
00534                                 //rcode = utSYNTAX_ERROR;
00535                                 if (!ut_session->error_string) ut_session->error_string = (char*) malloc (UT_ERROR_STRING_SIZE);
00536                                 snprintf (ut_session->error_string, UT_ERROR_STRING_SIZE,
00537                                                 "syntax error in %s at line %d:\n%s", filename, line, scan);
00538                                 return UT_SYNTAX_ERROR;
00539                         }
00540                         if (rcode!=UT_OK) {
00541                                 if (!ut_session->error_string) ut_session->error_string = (char*) malloc (UT_ERROR_STRING_SIZE);
00542                                 snprintf (ut_session->error_string, UT_ERROR_STRING_SIZE,
00543                                                 "error %d in %s at line %d", rcode, filename, line);
00544                                 //malloc'ed blocs (file_buffer & links) not free'ed
00545                                 return UT_CHARSET_FILE_ERROR;
00546                         }
00547                 } //else
00548                 scan++;
00549         } //while
00550         
00551         //put pointers from charmap linked list in an array
00552         ut_session->charset = (UtCharset*) calloc (ut_session->nb_charsets, sizeof (UtCharset));
00553         i=0;
00554         current_link = charmap_list;
00555         while (current_link) {
00556                 ut_session->charset[i].name                     = current_link->charset->name;
00557                 ut_session->charset[i].alias            = current_link->charset->alias;
00558                 ut_session->charset[i].common_name      = current_link->charset->common_name;
00559                 ut_session->charset[i].comment          = current_link->charset->comment;
00560                 ut_session->charset[i].type             = current_link->charset->type;
00561                 ut_session->charset[i].unicode          = current_link->charset->unicode;
00562                 ut_session->charset[i].char_type        = current_link->charset->char_type;
00563                 ut_session->charset[i].language         = current_link->charset->language;
00564                 ut_session->charset[i].system           = current_link->charset->system;
00565                 charmap_list = current_link->next;
00566                 free(current_link->charset);
00567                 free(current_link);
00568                 current_link = charmap_list;
00569                 i++;
00570         }
00571         free (file_buffer);
00572         
00573         DBG2 ("Charset file %s processed!", filename)
00574         //ut_print_charsets () ;
00575         return UT_OK;
00576         
00577 }