diff options
Diffstat (limited to 'common/utf8conv.c')
-rw-r--r-- | common/utf8conv.c | 813 |
1 files changed, 813 insertions, 0 deletions
diff --git a/common/utf8conv.c b/common/utf8conv.c new file mode 100644 index 000000000..6cbe4e92c --- /dev/null +++ b/common/utf8conv.c @@ -0,0 +1,813 @@ +/* utf8conf.c - UTF8 character set conversion + * Copyright (C) 1994, 1998, 1999, 2000, 2001, 2003, 2006, + * 2008, 2010 Free Software Foundation, Inc. + * + * This file is part of JNLIB. + * + * JNLIB is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 3 of + * the License, or (at your option) any later version. + * + * JNLIB is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include <config.h> +#include <stdlib.h> +#include <string.h> +#include <stdarg.h> +#include <ctype.h> +#ifdef HAVE_LANGINFO_CODESET +#include <langinfo.h> +#endif +#include <errno.h> +#ifndef HAVE_W32_SYSTEM +# include <iconv.h> +#endif + +#include "libjnlib-config.h" +#include "stringhelp.h" +#include "dynload.h" +#include "utf8conv.h" + +#ifndef MB_LEN_MAX +#define MB_LEN_MAX 16 +#endif + +static const char *active_charset_name = "iso-8859-1"; +static int no_translation; /* Set to true if we let simply pass through. */ +static int use_iconv; /* iconv comversion fucntions required. */ + + +/* Under W32 we dlopen the iconv dll and don't require any iconv + related headers at all. However we need to define some stuff. */ +#ifdef HAVE_W32_SYSTEM +typedef void *iconv_t; +#ifndef ICONV_CONST +#define ICONV_CONST +#endif +static iconv_t (* __stdcall iconv_open) (const char *tocode, + const char *fromcode); +static size_t (* __stdcall iconv) (iconv_t cd, + char **inbuf, size_t *inbytesleft, + char **outbuf, size_t *outbytesleft); +static int (* __stdcall iconv_close) (iconv_t cd); + +static int +load_libiconv (void) +{ + static int done; + + if (!done) + { + void *handle; + + done = 1; /* Do it right now because we might get called recursivly + through gettext. */ + + handle = dlopen ("iconv.dll", RTLD_LAZY); + if (handle) + { + iconv_open = dlsym (handle, "libiconv_open"); + if (iconv_open) + iconv = dlsym (handle, "libiconv"); + if (iconv) + iconv_close = dlsym (handle, "libiconv_close"); + } + if (!handle || !iconv_close) + { + log_info (_("error loading `%s': %s\n"), + "iconv.dll", dlerror ()); + log_info (_("please see %s for more information\n"), + "http://www.gnupg.org/download/iconv.html"); + iconv_open = NULL; + iconv = NULL; + iconv_close = NULL; + if (handle) + dlclose (handle); + } + } + return iconv_open? 0: -1; +} +#endif /*HAVE_W32_SYSTEM*/ + + +/* Error handler for iconv failures. This is needed to not clutter the + output with repeated diagnostics about a missing conversion. */ +static void +handle_iconv_error (const char *to, const char *from, int use_fallback) +{ + if (errno == EINVAL) + { + static int shown1, shown2; + int x; + + if (to && !strcmp (to, "utf-8")) + { + x = shown1; + shown1 = 1; + } + else + { + x = shown2; + shown2 = 1; + } + + if (!x) + log_info (_("conversion from `%s' to `%s' not available\n"), + from, to); + } + else + { + static int shown; + + if (!shown) + log_info (_("iconv_open failed: %s\n"), strerror (errno)); + shown = 1; + } + + if (use_fallback) + { + /* To avoid further error messages we fallback to Latin-1 for the + native encoding. This is justified as one can expect that on a + utf-8 enabled system nl_langinfo() will work and thus we won't + never get to here. Thus Latin-1 seems to be a reasonable + default. */ + active_charset_name = "iso-8859-1"; + no_translation = 0; + use_iconv = 0; + } +} + + + +int +set_native_charset (const char *newset) +{ + const char *full_newset; + + if (!newset) + { +#ifdef HAVE_W32_SYSTEM + static char codepage[30]; + unsigned int cpno; + const char *aliases; + + /* We are a console program thus we need to use the + GetConsoleOutputCP function and not the the GetACP which + would give the codepage for a GUI program. Note this is not + a bulletproof detection because GetConsoleCP might return a + different one for console input. Not sure how to cope with + that. If the console Code page is not known we fall back to + the system code page. */ +#ifndef HAVE_W32CE_SYSTEM + cpno = GetConsoleOutputCP (); + if (!cpno) +#endif + cpno = GetACP (); + sprintf (codepage, "CP%u", cpno ); + /* Resolve alias. We use a long string string and not the usual + array to optimize if the code is taken to a DSO. Taken from + libiconv 1.9.2. */ + newset = codepage; + for (aliases = ("CP936" "\0" "GBK" "\0" + "CP1361" "\0" "JOHAB" "\0" + "CP20127" "\0" "ASCII" "\0" + "CP20866" "\0" "KOI8-R" "\0" + "CP21866" "\0" "KOI8-RU" "\0" + "CP28591" "\0" "ISO-8859-1" "\0" + "CP28592" "\0" "ISO-8859-2" "\0" + "CP28593" "\0" "ISO-8859-3" "\0" + "CP28594" "\0" "ISO-8859-4" "\0" + "CP28595" "\0" "ISO-8859-5" "\0" + "CP28596" "\0" "ISO-8859-6" "\0" + "CP28597" "\0" "ISO-8859-7" "\0" + "CP28598" "\0" "ISO-8859-8" "\0" + "CP28599" "\0" "ISO-8859-9" "\0" + "CP28605" "\0" "ISO-8859-15" "\0" + "CP65001" "\0" "UTF-8" "\0"); + *aliases; + aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) + { + if (!strcmp (codepage, aliases) ||(*aliases == '*' && !aliases[1])) + { + newset = aliases + strlen (aliases) + 1; + break; + } + } + +#else /*!HAVE_W32_SYSTEM*/ + +#ifdef HAVE_LANGINFO_CODESET + newset = nl_langinfo (CODESET); +#else /*!HAVE_LANGINFO_CODESET*/ + /* Try to get the used charset from environment variables. */ + static char codepage[30]; + const char *lc, *dot, *mod; + + strcpy (codepage, "iso-8859-1"); + lc = getenv ("LC_ALL"); + if (!lc || !*lc) + { + lc = getenv ("LC_CTYPE"); + if (!lc || !*lc) + lc = getenv ("LANG"); + } + if (lc && *lc) + { + dot = strchr (lc, '.'); + if (dot) + { + mod = strchr (++dot, '@'); + if (!mod) + mod = dot + strlen (dot); + if (mod - dot < sizeof codepage && dot != mod) + { + memcpy (codepage, dot, mod - dot); + codepage [mod - dot] = 0; + } + } + } + newset = codepage; +#endif /*!HAVE_LANGINFO_CODESET*/ +#endif /*!HAVE_W32_SYSTEM*/ + } + + full_newset = newset; + if (strlen (newset) > 3 && !ascii_memcasecmp (newset, "iso", 3)) + { + newset += 3; + if (*newset == '-' || *newset == '_') + newset++; + } + + /* Note that we silently assume that plain ASCII is actually meant + as Latin-1. This makes sense because many Unix system don't have + their locale set up properly and thus would get annoying error + messages and we have to handle all the "bug" reports. Latin-1 has + always been the character set used for 8 bit characters on Unix + systems. */ + if ( !*newset + || !ascii_strcasecmp (newset, "8859-1" ) + || !ascii_strcasecmp (newset, "646" ) + || !ascii_strcasecmp (newset, "ASCII" ) + || !ascii_strcasecmp (newset, "ANSI_X3.4-1968" ) + ) + { + active_charset_name = "iso-8859-1"; + no_translation = 0; + use_iconv = 0; + } + else if ( !ascii_strcasecmp (newset, "utf8" ) + || !ascii_strcasecmp(newset, "utf-8") ) + { + active_charset_name = "utf-8"; + no_translation = 1; + use_iconv = 0; + } + else + { + iconv_t cd; + +#ifdef HAVE_W32_SYSTEM + if (load_libiconv ()) + return -1; +#endif /*HAVE_W32_SYSTEM*/ + + cd = iconv_open (full_newset, "utf-8"); + if (cd == (iconv_t)-1) + { + handle_iconv_error (full_newset, "utf-8", 0); + return -1; + } + iconv_close (cd); + cd = iconv_open ("utf-8", full_newset); + if (cd == (iconv_t)-1) + { + handle_iconv_error ("utf-8", full_newset, 0); + return -1; + } + iconv_close (cd); + active_charset_name = full_newset; + no_translation = 0; + use_iconv = 1; + } + return 0; +} + +const char * +get_native_charset () +{ + return active_charset_name; +} + +/* Return true if the native charset is utf-8. */ +int +is_native_utf8 (void) +{ + return no_translation; +} + + +/* Convert string, which is in native encoding to UTF8 and return a + new allocated UTF-8 string. This function terminates the process + on memory shortage. */ +char * +native_to_utf8 (const char *orig_string) +{ + const unsigned char *string = (const unsigned char *)orig_string; + const unsigned char *s; + char *buffer; + unsigned char *p; + size_t length = 0; + + if (no_translation) + { + /* Already utf-8 encoded. */ + buffer = jnlib_xstrdup (orig_string); + } + else if (!use_iconv) + { + /* For Latin-1 we can avoid the iconv overhead. */ + for (s = string; *s; s++) + { + length++; + if (*s & 0x80) + length++; + } + buffer = jnlib_xmalloc (length + 1); + for (p = (unsigned char *)buffer, s = string; *s; s++) + { + if ( (*s & 0x80 )) + { + *p++ = 0xc0 | ((*s >> 6) & 3); + *p++ = 0x80 | (*s & 0x3f); + } + else + *p++ = *s; + } + *p = 0; + } + else + { + /* Need to use iconv. */ + iconv_t cd; + const char *inptr; + char *outptr; + size_t inbytes, outbytes; + + cd = iconv_open ("utf-8", active_charset_name); + if (cd == (iconv_t)-1) + { + handle_iconv_error ("utf-8", active_charset_name, 1); + return native_to_utf8 (string); + } + + for (s=string; *s; s++ ) + { + length++; + if ((*s & 0x80)) + length += 5; /* We may need up to 6 bytes for the utf8 output. */ + } + buffer = jnlib_xmalloc (length + 1); + + inptr = string; + inbytes = strlen (string); + outptr = buffer; + outbytes = length; + if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes, + &outptr, &outbytes) == (size_t)-1) + { + static int shown; + + if (!shown) + log_info (_("conversion from `%s' to `%s' failed: %s\n"), + active_charset_name, "utf-8", strerror (errno)); + shown = 1; + /* We don't do any conversion at all but use the strings as is. */ + strcpy (buffer, string); + } + else /* Success. */ + { + *outptr = 0; + /* We could realloc the buffer now but I doubt that it makes + much sense given that it will get freed anyway soon + after. */ + } + iconv_close (cd); + } + return buffer; +} + + + +static char * +do_utf8_to_native (const char *string, size_t length, int delim, + int with_iconv) +{ + int nleft; + int i; + unsigned char encbuf[8]; + int encidx; + const unsigned char *s; + size_t n; + char *buffer = NULL; + char *p = NULL; + unsigned long val = 0; + size_t slen; + int resync = 0; + + /* First pass (p==NULL): count the extended utf-8 characters. */ + /* Second pass (p!=NULL): create string. */ + for (;;) + { + for (slen = length, nleft = encidx = 0, n = 0, + s = (const unsigned char *)string; + slen; + s++, slen--) + { + if (resync) + { + if (!(*s < 128 || (*s >= 0xc0 && *s <= 0xfd))) + { + /* Still invalid. */ + if (p) + { + sprintf (p, "\\x%02x", *s); + p += 4; + } + n += 4; + continue; + } + resync = 0; + } + if (!nleft) + { + if (!(*s & 0x80)) + { + /* Plain ascii. */ + if ( delim != -1 + && (*s < 0x20 || *s == 0x7f || *s == delim + || (delim && *s == '\\'))) + { + n++; + if (p) + *p++ = '\\'; + switch (*s) + { + case '\n': n++; if ( p ) *p++ = 'n'; break; + case '\r': n++; if ( p ) *p++ = 'r'; break; + case '\f': n++; if ( p ) *p++ = 'f'; break; + case '\v': n++; if ( p ) *p++ = 'v'; break; + case '\b': n++; if ( p ) *p++ = 'b'; break; + case 0: n++; if ( p ) *p++ = '0'; break; + default: + n += 3; + if (p) + { + sprintf (p, "x%02x", *s); + p += 3; + } + break; + } + } + else + { + if (p) + *p++ = *s; + n++; + } + } + else if ((*s & 0xe0) == 0xc0) /* 110x xxxx */ + { + val = *s & 0x1f; + nleft = 1; + encidx = 0; + encbuf[encidx++] = *s; + } + else if ((*s & 0xf0) == 0xe0) /* 1110 xxxx */ + { + val = *s & 0x0f; + nleft = 2; + encidx = 0; + encbuf[encidx++] = *s; + } + else if ((*s & 0xf8) == 0xf0) /* 1111 0xxx */ + { + val = *s & 0x07; + nleft = 3; + encidx = 0; + encbuf[encidx++] = *s; + } + else if ((*s & 0xfc) == 0xf8) /* 1111 10xx */ + { + val = *s & 0x03; + nleft = 4; + encidx = 0; + encbuf[encidx++] = *s; + } + else if ((*s & 0xfe) == 0xfc) /* 1111 110x */ + { + val = *s & 0x01; + nleft = 5; + encidx = 0; + encbuf[encidx++] = *s; + } + else /* Invalid encoding: print as \xNN. */ + { + if (p) + { + sprintf (p, "\\x%02x", *s); + p += 4; + } + n += 4; + resync = 1; + } + } + else if (*s < 0x80 || *s >= 0xc0) /* Invalid utf-8 */ + { + if (p) + { + for (i = 0; i < encidx; i++) + { + sprintf (p, "\\x%02x", encbuf[i]); + p += 4; + } + sprintf (p, "\\x%02x", *s); + p += 4; + } + n += 4 + 4 * encidx; + nleft = 0; + encidx = 0; + resync = 1; + } + else + { + encbuf[encidx++] = *s; + val <<= 6; + val |= *s & 0x3f; + if (!--nleft) /* Ready. */ + { + if (no_translation) + { + if (p) + { + for (i = 0; i < encidx; i++) + *p++ = encbuf[i]; + } + n += encidx; + encidx = 0; + } + else if (with_iconv) + { + /* Our strategy for using iconv is a bit strange + but it better keeps compatibility with + previous versions in regard to how invalid + encodings are displayed. What we do is to + keep the utf-8 as is and have the real + translation step then at the end. Yes, I + know that this is ugly. However we are short + of the 1.4 release and for this branch we + should not mess too much around with iconv + things. One reason for this is that we don't + know enough about non-GNU iconv + implementation and want to minimize the risk + of breaking the code on too many platforms. */ + if ( p ) + { + for (i=0; i < encidx; i++ ) + *p++ = encbuf[i]; + } + n += encidx; + encidx = 0; + } + else /* Latin-1 case. */ + { + if (val >= 0x80 && val < 256) + { + /* We can simply print this character */ + n++; + if (p) + *p++ = val; + } + else + { + /* We do not have a translation: print utf8. */ + if (p) + { + for (i = 0; i < encidx; i++) + { + sprintf (p, "\\x%02x", encbuf[i]); + p += 4; + } + } + n += encidx * 4; + encidx = 0; + } + } + } + + } + } + if (!buffer) + { + /* Allocate the buffer after the first pass. */ + buffer = p = jnlib_xmalloc (n + 1); + } + else if (with_iconv) + { + /* Note: See above for comments. */ + iconv_t cd; + const char *inptr; + char *outbuf, *outptr; + size_t inbytes, outbytes; + + *p = 0; /* Terminate the buffer. */ + + cd = iconv_open (active_charset_name, "utf-8"); + if (cd == (iconv_t)-1) + { + handle_iconv_error (active_charset_name, "utf-8", 1); + jnlib_free (buffer); + return utf8_to_native (string, length, delim); + } + + /* Allocate a new buffer large enough to hold all possible + encodings. */ + n = p - buffer + 1; + inbytes = n - 1;; + inptr = buffer; + outbytes = n * MB_LEN_MAX; + if (outbytes / MB_LEN_MAX != n) + BUG (); /* Actually an overflow. */ + outbuf = outptr = jnlib_xmalloc (outbytes); + if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes, + &outptr, &outbytes) == (size_t)-1) + { + static int shown; + + if (!shown) + log_info (_("conversion from `%s' to `%s' failed: %s\n"), + "utf-8", active_charset_name, strerror (errno)); + shown = 1; + /* Didn't worked out. Try again but without iconv. */ + jnlib_free (buffer); + buffer = NULL; + jnlib_free (outbuf); + outbuf = do_utf8_to_native (string, length, delim, 0); + } + else /* Success. */ + { + *outptr = 0; /* Make sure it is a string. */ + /* We could realloc the buffer now but I doubt that it + makes much sense given that it will get freed + anyway soon after. */ + jnlib_free (buffer); + } + iconv_close (cd); + return outbuf; + } + else /* Not using iconv. */ + { + *p = 0; /* Make sure it is a string. */ + return buffer; + } + } +} + +/* Convert string, which is in UTF-8 to native encoding. Replace + illegal encodings by some "\xnn" and quote all control + characters. A character with value DELIM will always be quoted, it + must be a vanilla ASCII character. A DELIM value of -1 is special: + it disables all quoting of control characters. This function + terminates the process on memory shortage. */ +char * +utf8_to_native (const char *string, size_t length, int delim) +{ + return do_utf8_to_native (string, length, delim, use_iconv); +} + + + + +/* Wrapper function for iconv_open, required for W32 as we dlopen that + library on that system. */ +jnlib_iconv_t +jnlib_iconv_open (const char *tocode, const char *fromcode) +{ +#ifdef HAVE_W32_SYSTEM + if (load_libiconv ()) + return (jnlib_iconv_t)(-1); +#endif /*HAVE_W32_SYSTEM*/ + + return (jnlib_iconv_t)iconv_open (tocode, fromcode); +} + + +/* Wrapper function for iconv, required for W32 as we dlopen that + library on that system. */ +size_t +jnlib_iconv (jnlib_iconv_t cd, + const char **inbuf, size_t *inbytesleft, + char **outbuf, size_t *outbytesleft) +{ + +#ifdef HAVE_W32_SYSTEM + if (load_libiconv ()) + return 0; +#endif /*HAVE_W32_SYSTEM*/ + + return iconv ((iconv_t)cd, (char**)inbuf, inbytesleft, outbuf, outbytesleft); +} + +/* Wrapper function for iconv_close, required for W32 as we dlopen that + library on that system. */ +int +jnlib_iconv_close (jnlib_iconv_t cd) +{ +#ifdef HAVE_W32_SYSTEM + if (load_libiconv ()) + return 0; +#endif /*HAVE_W32_SYSTEM*/ + + return iconv_close ((iconv_t)cd); +} + + +#ifdef HAVE_W32_SYSTEM +/* Return a malloced string encoded in UTF-8 from the wide char input + string STRING. Caller must free this value. Returns NULL and sets + ERRNO on failure. Calling this function with STRING set to NULL is + not defined. */ +char * +wchar_to_utf8 (const wchar_t *string) +{ + int n; + char *result; + + n = WideCharToMultiByte (CP_UTF8, 0, string, -1, NULL, 0, NULL, NULL); + if (n < 0) + { + jnlib_set_errno (EINVAL); + return NULL; + } + + result = jnlib_malloc (n+1); + if (!result) + return NULL; + + n = WideCharToMultiByte (CP_UTF8, 0, string, -1, result, n, NULL, NULL); + if (n < 0) + { + jnlib_free (result); + jnlib_set_errno (EINVAL); + result = NULL; + } + return result; +} + + +/* Return a malloced wide char string from an UTF-8 encoded input + string STRING. Caller must free this value. Returns NULL and sets + ERRNO on failure. Calling this function with STRING set to NULL is + not defined. */ +wchar_t * +utf8_to_wchar (const char *string) +{ + int n; + size_t nbytes; + wchar_t *result; + + n = MultiByteToWideChar (CP_UTF8, 0, string, -1, NULL, 0); + if (n < 0) + { + jnlib_set_errno (EINVAL); + return NULL; + } + + nbytes = (size_t)(n+1) * sizeof(*result); + if (nbytes / sizeof(*result) != (n+1)) + { + jnlib_set_errno (ENOMEM); + return NULL; + } + result = malloc (nbytes); + if (!result) + return NULL; + + n = MultiByteToWideChar (CP_UTF8, 0, string, -1, result, n); + if (n < 0) + { + free (result); + jnlib_set_errno (EINVAL); + result = NULL; + } + return result; +} +#endif /*HAVE_W32_SYSTEM*/ |