#include <sys/types.h>
#include "ut_error.h"
#include "ut_text.h"
#include "ut_charset.h"
Include dependency graph for utrac.h:
This graph shows which files directly or indirectly include this file:
Go to the source code of this file.
Classes | |
struct | UtLangSys |
Structure containing different languages or systems used. More... | |
struct | UtSession |
Structure containing all required information for Utrac. More... | |
Defines | |
#define | true 1 |
#define | false 0 |
#define | UT_VERSION "0.3.0" |
#define | UT_EOL_CHAR 0x0 |
Character code for end of line. | |
#define | UT_EOL_ALT_CHAR 0xD |
Character code for end of line 2 (see UtEolType). | |
#define | UT_SKIP_CHAR 0x1 |
Character code for character to skip during conversion. | |
#define | UT_EOF_CHAR 0x0 |
Character code for end of file. | |
#define | UT_UNICODE_NONCHAR 0xFFFF |
Illegal character, also used to indicate "no character". | |
#define | UT_UNSET -1 |
Unset variable (often used for indexes). | |
#define | UT_THRESHOLD_CONTROL_CHAR 0.05 |
Maximum percentage of illegal control chars accepted in a file. | |
#define | UT_THRESHOLD_UTF8 0.01 |
Maximum percentage of utf-8 errors in an UTF-8 file. | |
#define | UT_LOAD_STEP 1*1024*1024 |
Step in bytes between two calls of the "progress bar" function during loading. | |
#define | UT_PROCESS_STEP 1*1024*1024 |
Step in bytes between two calls of the "progress bar" function during processing. | |
#define | UT_COEF_MAX 5 |
Maximum number of coefficients for languages and systems. | |
#define | UT_LANG_SYS_ALLOC_STEP 8 |
Initial size for dynamic array UtLangSys. | |
#define | UT_ERROR_STRING_SIZE 128 |
Size of UtSession::error_string. | |
#define | UT_STDIN_BUFFER_SIZE 65536 |
Initial size for dynamic buffer in ut_load_text_stdin(). | |
#define | UT_TRY(func) |
Typedefs | |
typedef unsigned short int | bool |
typedef UtLangSys | UtLangSys |
Structure containing different languages or systems used. | |
typedef UtSession | UtSession |
Structure containing all required information for Utrac. | |
Functions | |
UtCode | ut_init () |
Initialize the Utrac library. | |
UtCode | ut_init_noalloc () |
Initialize the Utrac library, without allocating memory for UtSession. Used internally. | |
void | ut_finish () |
Free resources allocated during initialization of Utrac. | |
void | ut_finish_nofree () |
Free resources allocated during initialization of Utrac, without freeing UtSession. Used internally. | |
UtText * | ut_init_text_heap () |
Allocates and initializes an UtText structure. | |
void | ut_init_text (UtText *new_text) |
Initializes an UtText structure. | |
void | ut_free_text_heap (UtText *text) |
Free an UtText structure. | |
void | ut_free_text (UtText *text) |
Free the contents of an UtText structure, without freeing the structure itself. | |
UtCode | ut_init_progress (UtText *text) |
Initialize an UtText structure before using the 'progress bar' callback feature. | |
UtCode | ut_load (UtText *text, const char *filename) |
Load a file in an UtText structure. | |
UtCode | ut_recognize (UtText *text) |
Recognize charset and EOL of a text. | |
UtCode | ut_convert (UtText *src_text, UtText *dst_text) |
Convert a text. | |
UtCode | ut_load_charsets () |
Loads and parses file charset.dat. | |
UtCode | ut_load_charset_file (const char *filename, char **buffer) |
Load a file in a buffer. | |
UtCharsetIndex | ut_find_charset (char *charset_name) |
Get charset index from a string. | |
UtEolType | ut_find_eol (char *eol_name) |
int | ut_find_lang_sys (char *language_name, UtLangSys *lang_sys) |
double | ut_get_charset_coef (UtCharsetIndex i) |
bool | ut_str_fuzzy_cmp (const char *str1, const char *str2, char stop_char) |
Approximate comparison between two strings. | |
bool | ut_update_progress (struct UtText *, ulong, bool) |
ulong | ut_crc32 (ushort, ulong) |
Function which calls the user-defined function UtText::progress_function. | |
void | ut_print_binary (ulong src) |
Print a number in binary form on stdout (debug). | |
UtCode | ut_debug_text (struct UtText *) |
UtCode | ut_debug_text_rating (struct UtText *) |
const char * | ut_error_message (UtCode code) |
UtCode | ut_load_file_pass (UtText *text, const char *filename) |
Load a text in a buffer. | |
UtCode | ut_load_stdin_pass (UtText *text) |
Load a text from stdin. | |
UtCode | ut_distrib_utf_pass (struct UtText *) |
UtCode | ut_eol_pass (struct UtText *) |
UtCode | ut_xascii_pass (struct UtText *) |
int | ut_size_char (char **src_p, UtCharsetIndex src_charset, UtCharsetIndex dst_charset) |
Return the size in bytes of a character after conversion. | |
void | ut_conv_char (char **src_p, char **dst_p, UtCharsetIndex src_charset, UtCharsetIndex dst_charset) |
Convert a character. | |
void | ut_insert_eol (char **dst_p, UtEolType dst_eol) |
uint | ut_count_ext_char (UtText *text) |
Count the number of extended characters in a text. | |
int | ut_size_difference (UtText *src_text, UtText *dst_text) |
Return the difference between the size of a text and its size after conversion. | |
UtCode | ut_conversion_pass (UtText *src_text, UtText *dst_text) |
Convert extended characters and EOL. | |
Variables | |
const float | UT_LANG_SYS_COEF [] |
Language and system coefficients applied to charset rating, depending on language or system selected. | |
const char * | UT_CHARMAPS_FILENAME |
Path 1 to file containing charset information. | |
const char * | UT_CHARMAPS_FILENAME2 |
Path 2 to file containing charset information. | |
const char * | UT_DEFAULT_ENCODING_UNIX |
Default encoding on Unix systems. | |
UtSession * | ut_session |
Definition in file utrac.h.
|
Value: {\ UtCode rcode = func;\ if (rcode != UT_OK) return rcode;\ } |
|
Structure containing different languages or systems used. This structure is a dynamic array containing the list of languages or systems defined in the charset data file. |
|
Structure containing all required information for Utrac. This structure contains all the required information for a Utrac session (charset data, language, system and charset default...). Its single instance can be accessed with the ut_session pointer, which is defined as a global variable. It is created with ut_init() and destroyed with ut_finish().
|
|
Convert a character.
Definition at line 259 of file ut_conversion.c. References UtSession::charset, UtSession::charset_default, UtSession::nomapping_char, UtCharset::type, UtCharset::unicode, ut_unicode_to_utf8c(), UT_UNSET, and ut_utf8c_to_unicode(). Referenced by ut_conversion_pass(). |
Here is the call graph for this function:
|
Convert extended characters and EOL. The conversion consists of:
Definition at line 539 of file ut_conversion.c. References UtText::charset, UtText::data, UtText::eol, UtText::eol_alt, is_ext(), UtSession::progress_function, UtText::size, ut_conv_char(), ut_count_ext_char(), UT_PROCESS_STEP, ut_size_difference(), and ut_update_progress(). Referenced by ut_convert(). |
Here is the call graph for this function:
|
Convert a text.
Definition at line 470 of file utrac.c. References UtText::charset, UtSession::charset_default, UtText::current_pass, UtText::data, UtText::distribution, UtText::eol, UtText::eol_alt, UtSession::eol_alt_default, UtSession::eol_default, UtText::ext_char, UtExtCharLine::next, UtText::pass_flags, UtText::progress_done, UtSession::progress_function, UtText::progress_todo, UtText::size, ut_conversion_pass(), ut_free_text_heap(), ut_init_progress(), ut_init_text_heap(), UT_UNSET, and ut_update_progress(). |
Here is the call graph for this function:
|
Function which calls the user-defined function UtText::progress_function.
Definition at line 366 of file ut_utils.c. References UT_CRC32_POLY, and ut_crc32_table. Referenced by ut_xascii_pass(). |
|
Free resources allocated during initialization of Utrac. This function frees the structure allocated in ut_session by ut_init(). It must be the last Utrac function called.
Definition at line 154 of file utrac.c. References ut_finish_nofree(). |
Here is the call graph for this function:
|
Free the contents of an UtText structure, without freeing the structure itself.
Definition at line 265 of file utrac.c. References UtText::data, UtText::distribution, UtText::evaluation, UtText::ext_char, and UtExtCharLine::next. Referenced by ut_free_text_heap(). |
|
Free an UtText structure.
Definition at line 253 of file utrac.c. References ut_free_text(). Referenced by ut_convert(). |
Here is the call graph for this function:
|
Initialize the Utrac library. This function must be called before any other Utrac function. It allocates an UtSession structure that is accessible through the ut_session pointer, initializes it, loads charset data, and sets the default language, charset and end-of-line type. The memory used is about 630 KB for 47 charsets loaded.
Definition at line 56 of file utrac.c. References ut_init_noalloc(). |
Here is the call graph for this function:
|
Initialize an UtText structure before using the 'progress bar' callback feature. Can be used internally or by the user. The UtText must have member UtText::pass_flags set, or at least UtText::flags (if UtText::pass_flags is unset, it will be set for just a recognition pass and subpasses will be selected based on the value of UtText::flags). Definition at line 289 of file utrac.c. References UtText::flags, UtText::pass_flags, UtText::progress_done, UtText::progress_todo, UT_F_ADD_FINAL_EOL, UT_F_REMOVE_ILLEGAL_CHAR, and UT_F_TRANSFORM_EOL. Referenced by ut_convert(), ut_load(), and ut_recognize(). |
|
Initializes an UtText structure.
Definition at line 221 of file utrac.c. References UtText::charset, UtText::current_pass, UtText::data, UtText::distribution, UtText::eol, UtText::eol_alt, UtText::evaluation, UtText::ext_char, UtText::flags, UtText::nb_lines, UtText::nb_lines_alt, UtText::pass_flags, UtText::progress_done, UtText::progress_todo, UtText::size, UtText::skip_char, and UtText::user. Referenced by ut_init_text_heap(). |
|
Allocates and initializes an UtText structure.
Definition at line 206 of file utrac.c. References ut_init_text(). Referenced by ut_convert(). |
Here is the call graph for this function:
|
Load a file in an UtText structure. If filename is null, it will read stdin. text->data and text->size will be set. If ut_session->progress_function is set, it will be called during loading and members of text dealing with this feature will be updated. Definition at line 330 of file utrac.c. References UtText::current_pass, UtText::pass_flags, UtText::progress_done, UtSession::progress_function, UtText::progress_todo, ut_init_progress(), and ut_update_progress(). |
Here is the call graph for this function:
|
Load a file in a buffer.
Definition at line 64 of file ut_utils.c. Referenced by ut_load_charsets(). |
|
Loads and parses file charset.dat. This function loads and parses the file charset.dat, which contains all information about charsets, into a UtCharset array in UtSession::charset.
Definition at line 472 of file ut_charset.c. References UtCharset::alias, UtCharset::char_type, charmap_keyword, UtSession::charset, UtCharmapLink::charset, UtCharset::comment, UtCharset::common_name, UtSession::error_string, is_blank(), is_eol(), UtCharset::language, UtSession::language, UtCharset::name, UtSession::nb_charsets, UtCharmapLink::next, parse_charmap_entry(), parse_charmap_line(), parse_lang_sys_def_line(), parse_lang_sys_line(), parse_string_line(), streq(), UtCharset::system, UtSession::system, UtCharset::type, UtCharset::unicode, UT_ERROR_STRING_SIZE, ut_load_charset_file(), and UtCharmapLink. Referenced by ut_init_noalloc(). |
Here is the call graph for this function:
|
Load a text in a buffer.
Definition at line 57 of file ut_loading.c. |
|
Load a text from stdin.
Definition at line 124 of file ut_loading.c. |
|
Print a number in binary form on stdout (debug).
Definition at line 102 of file ut_utils.c. Referenced by ut_debug_text(), and ut_utf8c_to_unicode(). |
|
Recognize charset and EOL of a text. text->data must be set. If text->size is zero, recognition will stop at the first null character. text->flags must also be set to select the processing to perform (see UtTextFlags). If ut_session->progress_function is set, it will be called during loading and members of text dealing with this feature will be updated. If UT_F_FORCE_BINARY is set, texts with characters between 0 and 0x19 (space is 0x20, and TAB, CR, LF are excluded from this range) won't produce an error. If UT_F_IDENTIFY_EOL is set, text->eol, text->eol_alt, text->nb_lines, text->nb_lines_alt will be updated. If conversion of EOL is planned, UT_F_TRANSFORM_EOL must be set. If UT_F_IDENTIFY_CHARSET is set, text->charset will be updated. text->evaluation will also be updated if the charset is 8-bit and ASCII-derived. text->distribution will always be set, text->ext_char also (but this is a bug!). If ut_session->progress_function is set, it will be called during loading and members of text dealing with this feature will be updated. Definition at line 387 of file utrac.c. References UtText::charset, UtText::current_pass, UtText::data, UtText::distribution, UtText::flags, UtText::pass_flags, UtText::progress_done, UtSession::progress_function, UtText::progress_todo, UtText::skip_char, ut_distrib_utf_pass(), UT_EOL_ALT_CHAR, ut_eol_pass(), ut_init_progress(), UT_UNSET, ut_update_progress(), and ut_xascii_pass(). |
Here is the call graph for this function:
|
Return the size in bytes of a character after conversion.
Definition at line 203 of file ut_conversion.c. References UtSession::charset, UtSession::charset_default, UtSession::nomapping_char, UtCharset::type, UtCharset::unicode, ut_size_unicode(), UT_UNSET, and ut_utf8c_to_unicode(). |
Here is the call graph for this function:
|
Return the difference between the size of a text and its size after conversion.
Definition at line 363 of file ut_conversion.c. References UtSession::charset, UtText::charset, UtText::distribution, UtText::eol, UtText::eol_alt, UtText::nb_lines, UtText::nb_lines_alt, UtSession::nomapping_char, UtCharset::type, UtCharset::unicode, ut_count_ext_char(), UT_EOL_BSN, UT_EOL_MIX, UT_EOL_NUL, ut_size_unicode(), and UT_UNSET. Referenced by ut_conversion_pass(). |
Here is the call graph for this function:
|
Approximate comparison between two strings. The comparison focuses only on substrings composed of digits or letters (case is not significant). For instance "iso8859 1"=="ISO-8859-1", but "Mac Roman"!="MacRoman". Definition at line 232 of file ut_utils.c. References is_letter(), is_maj(), and is_num(). Referenced by ut_find_charset(), and ut_init_noalloc(). |
Here is the call graph for this function: