FreeTDS API
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Macros Modules Pages
Macros | Typedefs | Enumerations | Functions | Variables
Charset conversion

Convert between different charsets. More...

Collaboration diagram for Charset conversion:

Macros

#define CHUNK_ALLOC   4
 

Typedefs

typedef TDS_UINT ICONV_CHAR
 
typedef int(* iconv_get_t )(const unsigned char *p, size_t len, ICONV_CHAR *out)
 
typedef int(* iconv_put_t )(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
 

Enumerations

enum  ICONV_CD_VALUE { Like_to_Like = 0x100 }
 

Functions

static void _iconv_close (iconv_t *cd)
 
static int collate2charset (int sql_collate, int lcid)
 
static int get_ascii (const unsigned char *p, size_t len, ICONV_CHAR *out)
 
static int get_err (const unsigned char *p, size_t len, ICONV_CHAR *out)
 
static int get_iso1 (const unsigned char *p, size_t len, ICONV_CHAR *out)
 
static int get_ucs4be (const unsigned char *p, size_t len, ICONV_CHAR *out)
 
static int get_ucs4le (const unsigned char *p, size_t len, ICONV_CHAR *out)
 
static int get_utf16be (const unsigned char *p, size_t len, ICONV_CHAR *out)
 
static int get_utf16le (const unsigned char *p, size_t len, ICONV_CHAR *out)
 
static int get_utf8 (const unsigned char *p, size_t len, ICONV_CHAR *out)
 
static int lookup_canonic (const CHARACTER_SET_ALIAS aliases[], const char *charset_name)
 
static int put_ascii (unsigned char *buf, size_t buf_len, ICONV_CHAR c)
 
static int put_err (unsigned char *buf, size_t buf_len, ICONV_CHAR c)
 
static int put_iso1 (unsigned char *buf, size_t buf_len, ICONV_CHAR c)
 
static int put_ucs4be (unsigned char *buf, size_t buf_len, ICONV_CHAR c)
 
static int put_ucs4le (unsigned char *buf, size_t buf_len, ICONV_CHAR c)
 
static int put_utf16be (unsigned char *buf, size_t buf_len, ICONV_CHAR c)
 
static int put_utf16le (unsigned char *buf, size_t buf_len, ICONV_CHAR c)
 
static int put_utf8 (unsigned char *buf, size_t buf_len, ICONV_CHAR c)
 
static size_t skip_one_input_sequence (iconv_t cd, const TDS_ENCODING *charset, const char **input, size_t *input_size)
 Move the input sequence pointer to the next valid position. More...
 
void tds7_srv_charset_changed (TDSCONNECTION *conn, int sql_collate, int lcid)
 
static int tds_canonical_charset (const char *charset_name)
 Determine canonical iconv character set. More...
 
const char * tds_canonical_charset_name (const char *charset_name)
 Determine canonical iconv character set name. More...
 
size_t tds_iconv (TDSSOCKET *tds, TDSICONV *conv, TDS_ICONV_DIRECTION io, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft)
 Wrapper around iconv(3). More...
 
void tds_iconv_close (TDSCONNECTION *conn)
 
static void tds_iconv_err (TDSSOCKET *tds, int err)
 
void tds_iconv_free (TDSCONNECTION *conn)
 
TDSICONV * tds_iconv_from_collate (TDSCONNECTION *conn, TDS_UCHAR collate[5])
 Get iconv information from an LCID (to support different column encoding under MSSQL2K)
 
TDSICONV * tds_iconv_get (TDSCONNECTION *conn, const char *client_charset, const char *server_charset)
 
static TDSICONV * tds_iconv_get_info (TDSCONNECTION *conn, int canonic_client, int canonic_server)
 Get an iconv info structure, allocate and initialize if needed.
 
static void tds_iconv_info_close (TDSICONV *char_conv)
 
static int tds_iconv_info_init (TDSICONV *char_conv, int client_canonical, int server_canonical)
 Open iconv descriptors to convert between character sets (both directions). More...
 
TDSRET tds_iconv_open (TDSCONNECTION *conn, const char *charset, int use_utf16)
 
void tds_srv_charset_changed (TDSCONNECTION *conn, const char *charset)
 
static void tds_srv_charset_changed_num (TDSCONNECTION *conn, int canonic_charset_num)
 
size_t tds_sys_iconv (iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft)
 
int tds_sys_iconv_close (iconv_t cd)
 
iconv_t tds_sys_iconv_open (const char *tocode, const char *fromcode)
 Inputs are FreeTDS canonical names, no other. More...
 

Variables

static const iconv_get_t iconv_gets [8]
 
static const iconv_put_t iconv_puts [8]
 
static const unsigned char utf8_lengths [256]
 
static const unsigned char utf8_masks [7]
 

Detailed Description

Convert between different charsets.

Set up the initial iconv conversion descriptors.

When the socket is allocated, three TDSICONV structures are attached to iconv. They have fixed meanings:

Other designs that use less data are possible, but these three conversions are needed very often. By reserving them, we avoid searching the array for our most common purposes.

To solve different iconv names and portability problems, FreeTDS maintains a list of aliases for each charset.

First we discover the names of our minimum required charsets (UTF-8, ISO8859-1 and UCS2). Later, as and when it's needed, we try to discover others.

There is one list of canonic names (GNU iconv names) and two sets of aliases (one for other iconv implementations and another for Sybase). For every canonic charset name we cache the iconv name found during discovery.

Function Documentation

static size_t skip_one_input_sequence ( iconv_t  cd,
const TDS_ENCODING *  charset,
const char **  input,
size_t *  input_size 
)
static

Move the input sequence pointer to the next valid position.

Used when an input character cannot be converted.

Returns
number of bytes to skip.

Here is the call graph for this function:

Here is the caller graph for this function:

static int tds_canonical_charset ( const char *  charset_name)
static

Determine canonical iconv character set.

Returns
canonical position, or -1 if lookup failed.
Remarks
Returned name can be used in bytes_per_char(), above.

Here is the caller graph for this function:

const char* tds_canonical_charset_name ( const char *  charset_name)

Determine canonical iconv character set name.

Returns
canonical name, or NULL if lookup failed.
Remarks
Returned name can be used in bytes_per_char(), above.

Here is the call graph for this function:

Here is the caller graph for this function:

size_t tds_iconv ( TDSSOCKET *  tds,
TDSICONV *  conv,
TDS_ICONV_DIRECTION  io,
const char **  inbuf,
size_t *  inbytesleft,
char **  outbuf,
size_t *  outbytesleft 
)

Wrapper around iconv(3).

Same parameters, with slightly different behavior.

Parameters
tds: state information for the socket and the TDS protocol
io: Enumerated value indicating whether the data are being sent to or received from the server.
conv: information about the encodings involved, including the iconv(3) conversion descriptors.
inbuf: address of pointer to the input buffer of data to be converted.
inbytesleft: address of count of bytes in inbuf.
outbuf: address of pointer to the output buffer.
outbytesleft: address of count of bytes in outbuf.
Return values
number of irreversible conversions performed. -1 on error, see iconv(3) documentation for a description of the possible values of errno.
Remarks
Unlike iconv(3), none of the arguments can be NULL or point to NULL. Like iconv(3), all pointers will be updated. Success is signified by a nonnegative return code and *inbytesleft == 0. If the conversion descriptor in iconv is -1 or NULL, inbuf is copied to outbuf, and all parameters updated accordingly.

If a character in inbuf cannot be converted because no such character exists in the outbuf character set, we emit messages similar to the ones Sybase emits when it fails such a conversion. The message varies depending on the direction of the data. On a read error, we emit Msg 2403, Severity 16 (EX_INFO): "WARNING! Some character(s) could not be converted into client's character set. Unconverted bytes were changed to question marks ('?')." On a write error we emit Msg 2402, Severity 16 (EX_USER): "Error converting client characters into server's character set. Some character(s) could not be converted." and return an error code. Client libraries relying on this routine should reflect an error back to the application.

Todo:

Check for variable multibyte non-UTF-8 input character set.

Use more robust error message generation.

For reads, cope with outbuf encodings that don't have the equivalent of an ASCII '?'.

Support alternative to '?' for the replacement character.

Here is the call graph for this function:

Here is the caller graph for this function:

static int tds_iconv_info_init ( TDSICONV *  char_conv,
int  client_canonical,
int  server_canonical 
)
static

Open iconv descriptors to convert between character sets (both directions).

  1. Look up the canonical names of the character sets.
  2. Look up their widths.
  3. Ask iconv to open a conversion descriptor.
  4. Fail if any of the above offer any resistance.
    Remarks
    The charset names written to iconv will be the canonical names, not necessarily the names passed in.

Here is the call graph for this function:

Here is the caller graph for this function:

iconv_t tds_sys_iconv_open ( const char *  tocode,
const char *  fromcode 
)

Inputs are FreeTDS canonical names, no other.

No alias list is consulted.

Here is the call graph for this function:

Here is the caller graph for this function:

Variable Documentation

const iconv_get_t iconv_gets[8]
static
Initial value:
= {
get_iso1, get_ascii, get_utf16le, get_utf16be, get_ucs4le, get_ucs4be, get_utf8, get_err
}
const iconv_put_t iconv_puts[8]
static
Initial value:
= {
put_iso1, put_ascii, put_utf16le, put_utf16be, put_ucs4le, put_ucs4be, put_utf8, put_err
}
const unsigned char utf8_lengths[256]
static
Initial value:
= {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0,
}
const unsigned char utf8_masks[7]
static
Initial value:
= {
0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01
}