00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00035 #define _UT_CHARSET_C_
00036
00037 #include <stdlib.h>
00038 #include <stdio.h>
00039 #include <endian.h>
00040 #include <byteswap.h>
00041 #define __USE_GNU //for strndup
00042 #include <string.h>
00043
00044 #include "utrac.h"
00045 #include "ut_charset.h"
00046
00047
00048
00049 #include "debug.h"
00050
00051
00052
00053
/* Keywords recognised at the start of a line of the charmap description file. */
static const char * charmap_keyword      = "Charmap:";
static const char * alias_keyword        = "Alias:";
static const char * common_name_keyword  = "CommonName:";
static const char * comment_keyword      = "Comment:";
static const char * language_keyword     = "Language:";
static const char * system_keyword       = "System:";
static const char * language_def_keyword = "DefineLanguage:";
static const char * system_def_keyword   = "DefineSystem:";

/* Script names searched for in the trailing comment of a charmap entry
 * (NULL-terminated list). */
static const char * SCRIPT_NAME[] = {
    "LATIN",
    "CYRILLIC",
    "ARABIC",
    "GREEK",
    "HEBREW",
    "THAI",
    NULL
};
00066
00067
/* True when c is a horizontal blank: a space or a tab. */
static inline bool is_blank (char c) {
    switch (c) {
    case ' ':
    case '\t':
        return true;
    default:
        return false;
    }
}
00074
/* True when c is an end-of-line character: line feed or carriage return. */
static inline bool is_eol (char c) {
    switch (c) {
    case '\n':
    case '\r':
        return true;
    default:
        return false;
    }
}
00090
/* True when c ends the useful part of a line: the start of a '#' comment,
 * a line feed or a carriage return. */
static inline bool is_eol_c (char c) {
    switch (c) {
    case '#':
    case '\n':
    case '\r':
        return true;
    default:
        return false;
    }
}
00097
00103 typedef struct UtCharmapLink {
00104 UtCharset * charset;
00105 struct UtCharmapLink * next;
00106 } UtCharmapLink;
00107
00108
00116 static UtCode expend_lang_sys (UtLangSys *lang_sys) {
00117
00118 lang_sys->n_max += UT_LANG_SYS_ALLOC_STEP;
00119 lang_sys->name = (char**) realloc (lang_sys->name, lang_sys->n_max*sizeof(char*));
00120
00121 lang_sys->code = (char*) realloc (lang_sys->code, lang_sys->n_max*2);
00122
00123
00124 if (!lang_sys->name || !lang_sys->code) return UT_MALLOC_ERROR;
00125
00126 else return UT_OK;
00127
00128 DBG3 ("Lang/sys dynamic array (at %p) expended to %d elements", lang_sys, lang_sys->n_max)
00129 }
00130
00131
00145 static UtCode parse_string_line (char** scan_in, char ** dst) {
00146
00147 char *scan = *scan_in;
00148
00149 while (is_blank(*scan)) scan++;
00150
00151 char * name_beg = scan;
00152 while (!is_eol_c(*scan)) scan++;
00153 do scan--; while (is_blank(*scan));
00154
00155 if (scan-name_beg<0) return UT_STRING_MISSING_ERROR;
00156
00157 *dst = strndup (name_beg, scan-name_beg+1);
00158
00159 *scan_in = scan;
00160 return UT_OK;
00161 }
00162
00163
00167 static UtCode parse_lang_sys_def_line (char** scan_in, UtLangSys * lang_sys) {
00168
00169 char *scan = *scan_in;
00170
00171 if (ut_session->nb_charsets) return UT_LANG_SYS_DEF_AFTER_CHARSET_ERROR;
00172 if (lang_sys->n == lang_sys->n_max) {
00173 UT_TRY( expend_lang_sys (lang_sys) )
00174 }
00175
00176
00177 while (is_blank(*scan))
00178 scan++;
00179
00180 if (is_eol_c(*scan)) return UT_LANG_SYS_CODE_MISSING_ERROR;
00181
00182 if (is_blank(*(scan+1)) || is_eol_c(*(scan+1))) return UT_PARTIAL_LANG_SYS_CODE_ERROR;
00183
00184
00185
00186
00187
00188 lang_sys->code [lang_sys->n*2+0] = *scan++;
00189 lang_sys->code [lang_sys->n*2+1] = *scan++;
00190
00191
00192 int i; for (i=0; i<lang_sys->n; i++)
00193 if (lang_sys->code [i*2+0] == lang_sys->code [lang_sys->n*2+0] &&
00194 lang_sys->code [i*2+1] == lang_sys->code [lang_sys->n*2+1])
00195 return UT_LANG_SYS_ALREADY_DEFINED_ERROR;
00196
00197 UtCode rcode = parse_string_line (&scan, &lang_sys->name[lang_sys->n]);
00198 if (rcode!=UT_OK) return rcode;
00199
00200 lang_sys->n++;
00201
00202 DBG("Lang/sys (%p) added : %s (%c%c) at pos %d",
00203 lang_sys, lang_sys->name [lang_sys->n],
00204 lang_sys->code [lang_sys->n*2+0],
00205 lang_sys->code [lang_sys->n*2+1], lang_sys->n-1)
00206
00207 *scan_in = scan;
00208 return UT_OK;
00209 }
00210
00211
00212
00216 static UtCode parse_charmap_line (char** scan_in, UtCharmapLink ** current_link) {
00217
00218 char* scan = *scan_in;
00219 UtCharmapLink * old_link = *current_link;
00220
00221 UtCharset * new_charset = (UtCharset*) malloc (sizeof(UtCharset));
00222 if (!new_charset) return UT_MALLOC_ERROR;
00223 new_charset->name = NULL;
00224 new_charset->alias = NULL;
00225 new_charset->common_name = NULL;
00226 new_charset->comment = NULL;
00227 new_charset->type = UT_CST_UNSET;
00228 new_charset->language = (u_char*) malloc (ut_session->language.n*(sizeof(u_char)));
00229 new_charset->system = (u_char*) malloc (ut_session->system.n*(sizeof(u_char)));
00230 new_charset->unicode = NULL;
00231 new_charset->char_type = NULL;
00232
00233 int i;
00234 for (i=0; i<ut_session->language.n; i++) new_charset->language[i] = 0;
00235 for (i=0; i<ut_session->system.n; i++) new_charset->system[i] = 0;
00236
00237 UtCode rcode = parse_string_line (&scan, &new_charset->name);
00238 if (rcode!=UT_OK) return rcode;
00239
00240 i = 0; while (UT_CHARSET_NAME[i]) {
00241 if (strcmp (UT_CHARSET_NAME[i], new_charset->name)==0) break;
00242 i++;
00243 }
00244 new_charset->type = (UtCharsetType) i;
00245
00246 UtCharmapLink * new_link;
00247 if (old_link->charset ) {
00248 new_link = (UtCharmapLink*) calloc (1, sizeof(UtCharmapLink));
00249 old_link->next = new_link;
00250 } else {
00251 new_link = old_link;
00252 }
00253 new_link->charset = new_charset;
00254 new_link->next = NULL;
00255 ut_session->nb_charsets++;
00256
00257 DBG3 (" - Charset %s added! - ", new_charset->name)
00258 *current_link = new_link;
00259 *scan_in = scan;
00260 return UT_OK;
00261 }
00262
00263
00267 static UtCode parse_lang_sys_line (char** scan_in, UtLangSys * lang_sys, char * lang_sys_coef) {
00268 char *scan = *scan_in;
00269
00270 u_char language_id, coef_id;
00271
00272 for(;;) {
00273 while (is_blank(*scan)) scan++;
00274 if (is_eol_c(*scan)) break;
00275
00276 #if BYTE_ORDER == LITTLE_ENDIAN
00277 bswap_16 (*(ushort*)scan);
00278 #endif
00279
00280 for (language_id=0; language_id<lang_sys->n; language_id++) {
00281
00282 if ( *scan == lang_sys->code[language_id*2+0] &&
00283 *(scan+1) == lang_sys->code[language_id*2+1]) break;
00284 }
00285
00286 if (language_id==lang_sys->n) return UT_LANG_SYS_UNDEFINED_ERROR;
00287
00288 scan+=2;
00289 if (*scan==':') {
00290 char * beg = ++scan;
00291 coef_id = strtoul (beg, &scan, 0);
00292 if (beg==scan) return UT_LANG_SYS_COEF_MISSING_ERROR;
00293 if (!is_blank(*scan) && !is_eol_c(*scan)) return UT_LANG_SYS_INCORRECT_COEF_ERROR;
00294 if (coef_id>UT_COEF_MAX) return UT_LANG_SYS_COEF_TOO_BIG_ERROR;
00295 } else coef_id = 1;
00296
00297 lang_sys_coef[language_id] = coef_id;
00298 }
00299
00300 *scan_in = scan-1;
00301 return UT_OK;
00302 }
00303
00304
00308 static UtCode parse_charmap_entry (char** scan_in, UtCharset * charset) {
00309
00310 if (charset->type!=UT_CST_ASCII && charset->type!=UT_CST_ASCII_EXTENSION)
00311 return UT_CHARMAP_ENTRY_ILLEGAL_ERROR;
00312
00313 char* scan = *scan_in;
00314 char * hex_beg = scan;
00315
00316 ulong character = strtoul (hex_beg, &scan, 16);
00317 if (hex_beg==scan) return UT_INCORRECT_CHARMAP_ENTRY_ERROR;
00318
00319 if (character >= 0x80 && charset->type!=UT_CST_ASCII_EXTENSION)
00320 return UT_CHARMAP_ENTRY_ILLEGAL_ERROR;
00321
00322 hex_beg = scan;
00323 ulong unicode = strtoul (hex_beg, &scan, 16);
00324 if (hex_beg==scan) unicode = UT_UNICODE_NONCHAR;
00325 if (character>0xFF) return UT_CHAR_TOO_BIG_ERROR;
00326 if (unicode > 0xFFFF) return UT_UNICODE_CHAR_TOO_BIG_ERROR;
00327
00328 if (!charset->unicode && !charset->char_type) {
00329 charset->unicode = (ushort*) malloc (sizeof( ushort[0x100]));
00330 charset->char_type = (UtCharType*) malloc (sizeof( UtCharType[0x100]));
00331 if (!charset->unicode || !charset->char_type)
00332 return UT_MALLOC_ERROR;
00333 int i; for (i=0; i<0x100; i++) {
00334 charset->unicode[i] = UT_UNICODE_NONCHAR;
00335 charset->char_type[i].categorie = UT_CTG_UNSET;
00336 charset->char_type[i].script = 0;
00337 }
00338 }
00339
00340 charset->unicode[(u_char)character] = (ushort) unicode;
00341
00342 while (is_blank(*scan)) scan++;
00343
00344 if ('A'<=*scan && *scan <= 'Z') {
00345
00346 if (character==0||character==0x9||character==0xA||character==0xD||character==0x20)
00347 charset->char_type[(u_char) character].categorie = UT_CTG_DELIMITER;
00348 else
00349 #if BYTE_ORDER==BIG_ENDIAN
00350 switch (* (ushort*) scan ) {
00351 #else
00352 switch (bswap_16(* (ushort*) scan )) {
00353 #endif
00354 case 'Lu': charset->char_type[(u_char) character].categorie = UT_CTG_UPPERCASE; break;
00355 case 'Ll': charset->char_type[(u_char) character].categorie = UT_CTG_LOWERCASE; break;
00356 case 'Lt':
00357 case 'Lm':
00358 case 'Lo': charset->char_type[(u_char) character].categorie = UT_CTG_OTHER_LETTER; break;
00359
00360 case 'Mn': charset->char_type[(u_char) character].categorie = UT_CTG_MARK; break;
00361 case 'Mc':
00362 case 'Me': charset->char_type[(u_char) character].categorie = UT_CTG_OTHER; break;
00363
00364 case 'Nd':
00365 case 'Nl':
00366 case 'No': charset->char_type[(u_char) character].categorie = UT_CTG_NUMBER; break;
00367
00368 case 'Pc':
00369 case 'Pd':
00370 case 'Po': charset->char_type[(u_char) character].categorie = UT_CTG_PONCTUATION; break;
00371 case 'Ps': charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_INIT_OTHER ; break;
00372 case 'Pe': charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_FINAL_OTHER ; break;
00373 case 'Pi':
00374 switch (unicode) {
00375 case 0x00AB: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_INIT_0 ; break;
00376 case 0x2018: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_INIT_1 ; break;
00377 case 0x201C: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_INIT_2 ; break;
00378 case 0x2039: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_INIT_3 ; break;
00379 default: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_INIT_OTHER ; break;
00380 } break;
00381
00382 case 'Pf':
00383 switch (unicode) {
00384 case 0x00BB: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_FINAL_0 ; break;
00385 case 0x2019: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_FINAL_1 ; break;
00386 case 0x201D: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_FINAL_2 ; break;
00387 case 0x203A: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_FINAL_3 ; break;
00388 default: charset->char_type[(u_char) character].categorie = UT_CTG_PONCT_FINAL_OTHER ; break;
00389 } break;
00390
00391 case 'Sc': charset->char_type[(u_char) character].categorie = UT_CTG_CURRENCY; break;
00392 case 'Sm':
00393 case 'Sk':
00394 case 'So': charset->char_type[(u_char) character].categorie = UT_CTG_SYMBOL; break;
00395
00396 case 'Zs': charset->char_type[(u_char) character].categorie = UT_CTG_DELIMITER; break;
00397 case 'Zl':
00398 case 'Zp': charset->char_type[(u_char) character].categorie = UT_CTG_OTHER; break;
00399
00400 case 'Cc': charset->char_type[(u_char) character].categorie = UT_CTG_CONTROL; break;
00401 case 'Cf':
00402 case 'Cs':
00403 case 'Co':
00404 case 'Cn': charset->char_type[(u_char) character].categorie = UT_CTG_OTHER; break;
00405 default: return UT_UNDEFINED_CATEGORY_ERROR;
00406 }
00407 scan +=2;
00408 while (is_blank(*scan)) scan++;
00409 }
00410
00411
00412 if (*scan == '#') {
00413 const char ** script = SCRIPT_NAME;
00414 int index_script = 0;
00415 char * first_eol, *first_script;
00416 first_eol = strchr (scan, '\n');
00417 if (first_eol) *first_eol=0;
00418 while (*script) {
00419 index_script++;
00420 first_script = strstr (scan, *script);
00421 if (first_script && first_script < first_eol) {
00422 charset->char_type [(u_char) character].script = (char) index_script;
00423 if (first_eol) scan = first_eol;
00424 break;
00425 }
00426 script++;
00427 }
00428 if (first_eol) *first_eol='\n';
00429 }
00430
00431 *scan_in = scan-1;
00432 return UT_OK;
00433 }
00434
/*
 * Test whether the text at *cmp begins with the string src.
 * On a match, *cmp is advanced past the matched prefix and true is returned;
 * otherwise *cmp is left unchanged and false is returned.
 */
static bool streq (const char * src, char **cmp) {
    const size_t prefix_len = strlen (src);

    /* strncmp() stops at the first '\0' of either operand, so a text shorter
     * than src simply compares unequal. */
    if (strncmp (*cmp, src, prefix_len) != 0)
        return false;

    *cmp += prefix_len;
    return true;
}
00445
00446
00447 UtCode ut_print_charsets () {
00448
00449 int i; for (i=0; i < ut_session->nb_charsets; i++) {
00450 printf ("%2d: %20s %2d [", i, ut_session->charset[i].name, ut_session->charset[i].type);
00451 int j; for (j=0; j<ut_session->language.n_max;j++) printf("%d ",(int)ut_session->charset[i].language[j]);
00452 printf("] [");
00453 for (j=0; j<ut_session->system.n_max;j++) printf("%d ",(int)ut_session->charset[i].system[j]);
00454 printf("]\n");
00455 }
00456
00457
00458 }
00459
00460
00461
00472 UtCode ut_load_charsets () {
00473
00474 DBG3 ("Loading charsets...")
00475
00476 int i;
00477 char * file_buffer;
00478 int rcode;
00479 const char * filename;
00480 {
00481 #ifdef UT_CHARMAPS_FILENAME
00482 filename = UT_CHARMAPS_FILENAME;
00483 rcode = ut_load_charset_file (filename, &file_buffer);
00484 }
00485 if (rcode!=UT_OK) {
00486 #endif
00487 filename = UT_CHARMAPS_FILENAME2;
00488 rcode = ut_load_charset_file (filename, &file_buffer);
00489 }
00490
00491 if (rcode!=UT_OK) return rcode;
00492
00493 char * scan = file_buffer;
00494 int line = 1;
00495
00496
00497 UtCharmapLink * charmap_list = (UtCharmapLink*) calloc (1, sizeof(UtCharmapLink));
00498 UtCharmapLink * current_link = charmap_list;
00499
00500
00501 while (*scan) {
00502 if (*scan=='\r') {
00503 if (*(scan+1)=='\n') scan++;
00504 line++;
00505 } else if (*scan=='\n') {
00506 line++;
00507 } else if (!is_blank(*scan)) {
00508 if (*scan=='#') {
00509 while (!is_eol(*++scan));
00510 scan--;
00511 } else if (*scan=='0' && *(scan+1)=='x') {
00512 rcode = parse_charmap_entry(&scan, current_link->charset);
00513
00514 } else if ( streq (charmap_keyword, &scan) ) {
00515 rcode = parse_charmap_line(&scan, ¤t_link);
00516 } else if ( streq (alias_keyword, &scan) ) {
00517 rcode = parse_string_line(&scan, ¤t_link->charset->alias);
00518 } else if ( streq (common_name_keyword, &scan) ) {
00519 rcode = parse_string_line(&scan, ¤t_link->charset->common_name);
00520 } else if ( streq (comment_keyword, &scan) ) {
00521 rcode = parse_string_line(&scan, ¤t_link->charset->comment);
00522
00523 } else if ( streq (language_keyword, &scan) ) {
00524 rcode = parse_lang_sys_line(&scan, &ut_session->language, current_link->charset->language);
00525 } else if ( streq (system_keyword, &scan) ) {
00526 rcode = parse_lang_sys_line(&scan, &ut_session->system, current_link->charset->system);
00527
00528 } else if ( streq (language_def_keyword, &scan) ) {
00529 rcode = parse_lang_sys_def_line(&scan, &ut_session->language);
00530 } else if ( streq (system_def_keyword, &scan) ) {
00531 rcode = parse_lang_sys_def_line(&scan, &ut_session->system);
00532 } else {
00533
00534
00535 if (!ut_session->error_string) ut_session->error_string = (char*) malloc (UT_ERROR_STRING_SIZE);
00536 snprintf (ut_session->error_string, UT_ERROR_STRING_SIZE,
00537 "syntax error in %s at line %d:\n%s", filename, line, scan);
00538 return UT_SYNTAX_ERROR;
00539 }
00540 if (rcode!=UT_OK) {
00541 if (!ut_session->error_string) ut_session->error_string = (char*) malloc (UT_ERROR_STRING_SIZE);
00542 snprintf (ut_session->error_string, UT_ERROR_STRING_SIZE,
00543 "error %d in %s at line %d", rcode, filename, line);
00544
00545 return UT_CHARSET_FILE_ERROR;
00546 }
00547 }
00548 scan++;
00549 }
00550
00551
00552 ut_session->charset = (UtCharset*) calloc (ut_session->nb_charsets, sizeof (UtCharset));
00553 i=0;
00554 current_link = charmap_list;
00555 while (current_link) {
00556 ut_session->charset[i].name = current_link->charset->name;
00557 ut_session->charset[i].alias = current_link->charset->alias;
00558 ut_session->charset[i].common_name = current_link->charset->common_name;
00559 ut_session->charset[i].comment = current_link->charset->comment;
00560 ut_session->charset[i].type = current_link->charset->type;
00561 ut_session->charset[i].unicode = current_link->charset->unicode;
00562 ut_session->charset[i].char_type = current_link->charset->char_type;
00563 ut_session->charset[i].language = current_link->charset->language;
00564 ut_session->charset[i].system = current_link->charset->system;
00565 charmap_list = current_link->next;
00566 free(current_link->charset);
00567 free(current_link);
00568 current_link = charmap_list;
00569 i++;
00570 }
00571 free (file_buffer);
00572
00573 DBG2 ("Charset file %s processed!", filename)
00574
00575 return UT_OK;
00576
00577 }