454 lines
14 KiB
Plaintext
454 lines
14 KiB
Plaintext
|
Add code support for ICU.
|
||
|
|
||
|
diff --git a/third_party/libxml/encoding.c b/third_party/libxml/encoding.c
|
||
|
index b86a547..0f41df9 100644
|
||
|
--- a/third_party/libxml/encoding.c
|
||
|
+++ b/third_party/libxml/encoding.c
|
||
|
@@ -58,7 +58,7 @@ static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
|
||
|
static int xmlCharEncodingAliasesNb = 0;
|
||
|
static int xmlCharEncodingAliasesMax = 0;
|
||
|
|
||
|
-#ifdef LIBXML_ICONV_ENABLED
|
||
|
+#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED)
|
||
|
#if 0
|
||
|
#define DEBUG_ENCODING /* Define this to get encoding traces */
|
||
|
#endif
|
||
|
@@ -97,6 +97,54 @@ xmlEncodingErr(xmlParserErrors error, const char *msg, const char *val)
|
||
|
NULL, 0, val, NULL, NULL, 0, 0, msg, val);
|
||
|
}
|
||
|
|
||
|
+#ifdef LIBXML_ICU_ENABLED
|
||
|
+static uconv_t*
|
||
|
+openIcuConverter(const char* name, int toUnicode)
|
||
|
+{
|
||
|
+ UErrorCode status = U_ZERO_ERROR;
|
||
|
+ uconv_t *conv = (uconv_t *) xmlMalloc(sizeof(uconv_t));
|
||
|
+ if (conv == NULL)
|
||
|
+ return NULL;
|
||
|
+
|
||
|
+ conv->uconv = ucnv_open(name, &status);
|
||
|
+ if (U_FAILURE(status))
|
||
|
+ goto error;
|
||
|
+
|
||
|
+ status = U_ZERO_ERROR;
|
||
|
+ if (toUnicode) {
|
||
|
+ ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP,
|
||
|
+ NULL, NULL, NULL, &status);
|
||
|
+ }
|
||
|
+ else {
|
||
|
+ ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP,
|
||
|
+ NULL, NULL, NULL, &status);
|
||
|
+ }
|
||
|
+ if (U_FAILURE(status))
|
||
|
+ goto error;
|
||
|
+
|
||
|
+ status = U_ZERO_ERROR;
|
||
|
+ conv->utf8 = ucnv_open("UTF-8", &status);
|
||
|
+ if (U_SUCCESS(status))
|
||
|
+ return conv;
|
||
|
+
|
||
|
+error:
|
||
|
+ if (conv->uconv)
|
||
|
+ ucnv_close(conv->uconv);
|
||
|
+ xmlFree(conv);
|
||
|
+ return NULL;
|
||
|
+}
|
||
|
+
|
||
|
+static void
|
||
|
+closeIcuConverter(uconv_t *conv)
|
||
|
+{
|
||
|
+ if (conv != NULL) {
|
||
|
+ ucnv_close(conv->uconv);
|
||
|
+ ucnv_close(conv->utf8);
|
||
|
+ xmlFree(conv);
|
||
|
+ }
|
||
|
+}
|
||
|
+#endif /* LIBXML_ICU_ENABLED */
|
||
|
+
|
||
|
/************************************************************************
|
||
|
* *
|
||
|
* Conversions To/From UTF8 encoding *
|
||
|
@@ -1306,7 +1354,11 @@ xmlNewCharEncodingHandler(const char *name,
|
||
|
#ifdef LIBXML_ICONV_ENABLED
|
||
|
handler->iconv_in = NULL;
|
||
|
handler->iconv_out = NULL;
|
||
|
-#endif /* LIBXML_ICONV_ENABLED */
|
||
|
+#endif
|
||
|
+#ifdef LIBXML_ICU_ENABLED
|
||
|
+ handler->uconv_in = NULL;
|
||
|
+ handler->uconv_out = NULL;
|
||
|
+#endif
|
||
|
|
||
|
/*
|
||
|
* registers and returns the handler.
|
||
|
@@ -1371,7 +1423,7 @@ xmlInitCharEncodingHandlers(void) {
|
||
|
xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL);
|
||
|
xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL);
|
||
|
#endif /* LIBXML_OUTPUT_ENABLED */
|
||
|
-#ifndef LIBXML_ICONV_ENABLED
|
||
|
+#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
|
||
|
#ifdef LIBXML_ISO8859X_ENABLED
|
||
|
xmlRegisterCharEncodingHandlersISO8859x ();
|
||
|
#endif
|
||
|
@@ -1578,6 +1630,10 @@ xmlFindCharEncodingHandler(const char *name) {
|
||
|
xmlCharEncodingHandlerPtr enc;
|
||
|
iconv_t icv_in, icv_out;
|
||
|
#endif /* LIBXML_ICONV_ENABLED */
|
||
|
+#ifdef LIBXML_ICU_ENABLED
|
||
|
+ xmlCharEncodingHandlerPtr enc;
|
||
|
+ uconv_t *ucv_in, *ucv_out;
|
||
|
+#endif /* LIBXML_ICU_ENABLED */
|
||
|
char upper[100];
|
||
|
int i;
|
||
|
|
||
|
@@ -1647,6 +1703,35 @@ xmlFindCharEncodingHandler(const char *name) {
|
||
|
"iconv : problems with filters for '%s'\n", name);
|
||
|
}
|
||
|
#endif /* LIBXML_ICONV_ENABLED */
|
||
|
+#ifdef LIBXML_ICU_ENABLED
|
||
|
+ /* check whether icu can handle this */
|
||
|
+ ucv_in = openIcuConverter(name, 1);
|
||
|
+ ucv_out = openIcuConverter(name, 0);
|
||
|
+ if (ucv_in != NULL && ucv_out != NULL) {
|
||
|
+ enc = (xmlCharEncodingHandlerPtr)
|
||
|
+ xmlMalloc(sizeof(xmlCharEncodingHandler));
|
||
|
+ if (enc == NULL) {
|
||
|
+ closeIcuConverter(ucv_in);
|
||
|
+ closeIcuConverter(ucv_out);
|
||
|
+ return(NULL);
|
||
|
+ }
|
||
|
+ enc->name = xmlMemStrdup(name);
|
||
|
+ enc->input = NULL;
|
||
|
+ enc->output = NULL;
|
||
|
+ enc->uconv_in = ucv_in;
|
||
|
+ enc->uconv_out = ucv_out;
|
||
|
+#ifdef DEBUG_ENCODING
|
||
|
+ xmlGenericError(xmlGenericErrorContext,
|
||
|
+ "Found ICU converter handler for encoding %s\n", name);
|
||
|
+#endif
|
||
|
+ return enc;
|
||
|
+ } else if (ucv_in != NULL || ucv_out != NULL) {
|
||
|
+ closeIcuConverter(ucv_in);
|
||
|
+ closeIcuConverter(ucv_out);
|
||
|
+ xmlEncodingErr(XML_ERR_INTERNAL_ERROR,
|
||
|
+ "ICU converter : problems with filters for '%s'\n", name);
|
||
|
+ }
|
||
|
+#endif /* LIBXML_ICU_ENABLED */
|
||
|
|
||
|
#ifdef DEBUG_ENCODING
|
||
|
xmlGenericError(xmlGenericErrorContext,
|
||
|
@@ -1737,6 +1822,75 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
|
||
|
|
||
|
/************************************************************************
|
||
|
* *
|
||
|
+ * ICU based generic conversion functions *
|
||
|
+ * *
|
||
|
+ ************************************************************************/
|
||
|
+
|
||
|
+#ifdef LIBXML_ICU_ENABLED
|
||
|
+/**
|
||
|
+ * xmlUconvWrapper:
|
||
|
+ * @cd: ICU uconverter data structure
|
||
|
+ * @toUnicode: non-zero to convert from the encoding to UTF-8, 0 to convert from UTF-8 to the encoding
|
||
|
+ * @out: a pointer to an array of bytes to store the result
|
||
|
+ * @outlen: the length of @out
|
||
|
+ * @in: a pointer to an array of input bytes to convert
|
||
|
+ * @inlen: the length of @in
|
||
|
+ *
|
||
|
+ * Returns 0 if success, or
|
||
|
+ * -1 by lack of space, or
|
||
|
+ * -2 if the transcoding fails (for *in is not valid utf8 string or
|
||
|
+ * the result of transformation can't fit into the encoding we want), or
|
||
|
+ * -3 if the last byte can't form a single output char.
|
||
|
+ *
|
||
|
+ * The value of @inlen after return is the number of octets consumed
|
||
|
+ * as the return value is positive, else unpredictable.
|
||
|
+ * The value of @outlen after return is the number of octets produced.
|
||
|
+ */
|
||
|
+static int
|
||
|
+xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
|
||
|
+ const unsigned char *in, int *inlen) {
|
||
|
+ const char *ucv_in = (const char *) in;
|
||
|
+ char *ucv_out = (char *) out;
|
||
|
+ UErrorCode err = U_ZERO_ERROR;
|
||
|
+
|
||
|
+ if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
|
||
|
+ if (outlen != NULL) *outlen = 0;
|
||
|
+ return(-1);
|
||
|
+ }
|
||
|
+
|
||
|
+ /*
|
||
|
+ * TODO(jungshik)
|
||
|
+ * 1. is ucnv_convert(To|From)Algorithmic better?
|
||
|
+ * 2. had we better use an explicit pivot buffer?
|
||
|
+ * 3. error returned comes from 'fromUnicode' only even
|
||
|
+ * when toUnicode is true !
|
||
|
+ */
|
||
|
+ if (toUnicode) {
|
||
|
+ /* encoding => UTF-16 => UTF-8 */
|
||
|
+ ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen,
|
||
|
+ &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
|
||
|
+ 0, TRUE, &err);
|
||
|
+ } else {
|
||
|
+ /* UTF-8 => UTF-16 => encoding */
|
||
|
+ ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen,
|
||
|
+ &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
|
||
|
+ 0, TRUE, &err);
|
||
|
+ }
|
||
|
+ *inlen = ucv_in - (const char*) in;
|
||
|
+ *outlen = ucv_out - (char *) out;
|
||
|
+ if (U_SUCCESS(err))
|
||
|
+ return 0;
|
||
|
+ if (err == U_BUFFER_OVERFLOW_ERROR)
|
||
|
+ return -1;
|
||
|
+ if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND)
|
||
|
+ return -2;
|
||
|
+ /* if (err == U_TRUNCATED_CHAR_FOUND) */
|
||
|
+ return -3;
|
||
|
+}
|
||
|
+#endif /* LIBXML_ICU_ENABLED */
|
||
|
+
|
||
|
+/************************************************************************
|
||
|
+ * *
|
||
|
* The real API used by libxml for on-the-fly conversion *
|
||
|
* *
|
||
|
************************************************************************/
|
||
|
@@ -1810,6 +1964,16 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
|
||
|
if (ret == -1) ret = -3;
|
||
|
}
|
||
|
#endif /* LIBXML_ICONV_ENABLED */
|
||
|
+#ifdef LIBXML_ICU_ENABLED
|
||
|
+ else if (handler->uconv_in != NULL) {
|
||
|
+ ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use],
|
||
|
+ &written, in->content, &toconv);
|
||
|
+ xmlBufferShrink(in, toconv);
|
||
|
+ out->use += written;
|
||
|
+ out->content[out->use] = 0;
|
||
|
+ if (ret == -1) ret = -3;
|
||
|
+ }
|
||
|
+#endif /* LIBXML_ICU_ENABLED */
|
||
|
#ifdef DEBUG_ENCODING
|
||
|
switch (ret) {
|
||
|
case 0:
|
||
|
@@ -1915,6 +2079,17 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
|
||
|
ret = -3;
|
||
|
}
|
||
|
#endif /* LIBXML_ICONV_ENABLED */
|
||
|
+#ifdef LIBXML_ICU_ENABLED
|
||
|
+ else if (handler->uconv_in != NULL) {
|
||
|
+ ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use],
|
||
|
+ &written, in->content, &toconv);
|
||
|
+ xmlBufferShrink(in, toconv);
|
||
|
+ out->use += written;
|
||
|
+ out->content[out->use] = 0;
|
||
|
+ if (ret == -1)
|
||
|
+ ret = -3;
|
||
|
+ }
|
||
|
+#endif /* LIBXML_ICU_ENABLED */
|
||
|
switch (ret) {
|
||
|
case 0:
|
||
|
#ifdef DEBUG_ENCODING
|
||
|
@@ -2015,6 +2190,15 @@ retry:
|
||
|
out->content[out->use] = 0;
|
||
|
}
|
||
|
#endif /* LIBXML_ICONV_ENABLED */
|
||
|
+#ifdef LIBXML_ICU_ENABLED
|
||
|
+ else if (handler->uconv_out != NULL) {
|
||
|
+ ret = xmlUconvWrapper(handler->uconv_out, 0,
|
||
|
+ &out->content[out->use],
|
||
|
+ &written, NULL, &toconv);
|
||
|
+ out->use += written;
|
||
|
+ out->content[out->use] = 0;
|
||
|
+ }
|
||
|
+#endif /* LIBXML_ICU_ENABLED */
|
||
|
#ifdef DEBUG_ENCODING
|
||
|
xmlGenericError(xmlGenericErrorContext,
|
||
|
"initialized encoder\n");
|
||
|
@@ -2061,6 +2245,26 @@ retry:
|
||
|
}
|
||
|
}
|
||
|
#endif /* LIBXML_ICONV_ENABLED */
|
||
|
+#ifdef LIBXML_ICU_ENABLED
|
||
|
+ else if (handler->uconv_out != NULL) {
|
||
|
+ ret = xmlUconvWrapper(handler->uconv_out, 0,
|
||
|
+ &out->content[out->use],
|
||
|
+ &written, in->content, &toconv);
|
||
|
+ xmlBufferShrink(in, toconv);
|
||
|
+ out->use += written;
|
||
|
+ writtentot += written;
|
||
|
+ out->content[out->use] = 0;
|
||
|
+ if (ret == -1) {
|
||
|
+ if (written > 0) {
|
||
|
+ /*
|
||
|
+ * Can be a limitation of the converter
|
||
|
+ */
|
||
|
+ goto retry;
|
||
|
+ }
|
||
|
+ ret = -3;
|
||
|
+ }
|
||
|
+ }
|
||
|
+#endif /* LIBXML_ICU_ENABLED */
|
||
|
else {
|
||
|
xmlEncodingErr(XML_I18N_NO_OUTPUT,
|
||
|
"xmlCharEncOutFunc: no output function !\n", NULL);
|
||
|
@@ -2173,6 +2377,22 @@ xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
|
||
|
xmlFree(handler);
|
||
|
}
|
||
|
#endif /* LIBXML_ICONV_ENABLED */
|
||
|
+#ifdef LIBXML_ICU_ENABLED
|
||
|
+ if ((handler->uconv_out != NULL) || (handler->uconv_in != NULL)) {
|
||
|
+ if (handler->name != NULL)
|
||
|
+ xmlFree(handler->name);
|
||
|
+ handler->name = NULL;
|
||
|
+ if (handler->uconv_out != NULL) {
|
||
|
+ closeIcuConverter(handler->uconv_out);
|
||
|
+ handler->uconv_out = NULL;
|
||
|
+ }
|
||
|
+ if (handler->uconv_in != NULL) {
|
||
|
+ closeIcuConverter(handler->uconv_in);
|
||
|
+ handler->uconv_in = NULL;
|
||
|
+ }
|
||
|
+ xmlFree(handler);
|
||
|
+ }
|
||
|
+#endif
|
||
|
#ifdef DEBUG_ENCODING
|
||
|
if (ret)
|
||
|
xmlGenericError(xmlGenericErrorContext,
|
||
|
@@ -2248,6 +2468,23 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
|
||
|
cur += toconv;
|
||
|
} while (ret == -2);
|
||
|
#endif
|
||
|
+#ifdef LIBXML_ICU_ENABLED
|
||
|
+ } else if (handler->uconv_out != NULL) {
|
||
|
+ do {
|
||
|
+ toconv = in->end - cur;
|
||
|
+ written = 32000;
|
||
|
+ ret = xmlUconvWrapper(handler->uconv_out, 0, &convbuf[0],
|
||
|
+ &written, cur, &toconv);
|
||
|
+ if (ret < 0) {
|
||
|
+ if (written > 0)
|
||
|
+ ret = -2;
|
||
|
+ else
|
||
|
+ return(-1);
|
||
|
+ }
|
||
|
+ unused += written;
|
||
|
+ cur += toconv;
|
||
|
+ } while (ret == -2);
|
||
|
+#endif /* LIBXML_ICU_ENABLED */
|
||
|
} else {
|
||
|
/* could not find a converter */
|
||
|
return(-1);
|
||
|
@@ -2259,8 +2496,8 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) {
|
||
|
}
|
||
|
return(in->consumed + (in->cur - in->base));
|
||
|
}
|
||
|
|
||
|
-#ifndef LIBXML_ICONV_ENABLED
|
||
|
+#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED)
|
||
|
#ifdef LIBXML_ISO8859X_ENABLED
|
||
|
|
||
|
/**
|
||
|
diff --git a/third_party/libxml/include/libxml/encoding.h b/third_party/libxml/include/libxml/encoding.h
|
||
|
index c74b25f..b5f8b48 100644
|
||
|
--- a/third_party/libxml/include/libxml/encoding.h
|
||
|
+++ b/third_party/libxml/include/libxml/encoding.h
|
||
|
@@ -26,6 +26,24 @@
|
||
|
|
||
|
#ifdef LIBXML_ICONV_ENABLED
|
||
|
#include <iconv.h>
|
||
|
+#else
|
||
|
+#ifdef LIBXML_ICU_ENABLED
|
||
|
+#include <unicode/ucnv.h>
|
||
|
+#if 0
|
||
|
+/* Forward-declare UConverter here rather than pulling in <unicode/ucnv.h>
|
||
|
+ * to prevent unwanted ICU symbols being exposed to users of libxml2.
|
||
|
+ * One particular case is Qt4 conflicting on UChar32.
|
||
|
+ */
|
||
|
+#include <stdint.h>
|
||
|
+struct UConverter;
|
||
|
+typedef struct UConverter UConverter;
|
||
|
+#ifdef _MSC_VER
|
||
|
+typedef wchar_t UChar;
|
||
|
+#else
|
||
|
+typedef uint16_t UChar;
|
||
|
+#endif
|
||
|
+#endif
|
||
|
+#endif
|
||
|
#endif
|
||
|
#ifdef __cplusplus
|
||
|
extern "C" {
|
||
|
@@ -125,6 +143,13 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
|
||
|
* Block defining the handlers for non UTF-8 encodings.
|
||
|
* If iconv is supported, there are two extra fields.
|
||
|
*/
|
||
|
+#ifdef LIBXML_ICU_ENABLED
|
||
|
+struct _uconv_t {
|
||
|
+ UConverter *uconv; /* for conversion between an encoding and UTF-16 */
|
||
|
+ UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
|
||
|
+};
|
||
|
+typedef struct _uconv_t uconv_t;
|
||
|
+#endif
|
||
|
|
||
|
typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
|
||
|
typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr;
|
||
|
@@ -136,6 +161,10 @@ struct _xmlCharEncodingHandler {
|
||
|
iconv_t iconv_in;
|
||
|
iconv_t iconv_out;
|
||
|
#endif /* LIBXML_ICONV_ENABLED */
|
||
|
+#ifdef LIBXML_ICU_ENABLED
|
||
|
+ uconv_t *uconv_in;
|
||
|
+ uconv_t *uconv_out;
|
||
|
+#endif /* LIBXML_ICU_ENABLED */
|
||
|
};
|
||
|
|
||
|
#ifdef __cplusplus
|
||
|
diff --git a/third_party/libxml/include/libxml/parser.h b/third_party/libxml/include/libxml/parser.h
|
||
|
index dd79c42..3580b63 100644
|
||
|
--- a/third_party/libxml/include/libxml/parser.h
|
||
|
+++ b/third_party/libxml/include/libxml/parser.h
|
||
|
@@ -1222,6 +1222,7 @@ typedef enum {
|
||
|
XML_WITH_DEBUG_MEM = 29,
|
||
|
XML_WITH_DEBUG_RUN = 30,
|
||
|
XML_WITH_ZLIB = 31,
|
||
|
+ XML_WITH_ICU = 32,
|
||
|
XML_WITH_NONE = 99999 /* just to be sure of allocation size */
|
||
|
} xmlFeature;
|
||
|
|
||
|
diff --git a/third_party/libxml/include/libxml/xmlversion.h.in b/third_party/libxml/include/libxml/xmlversion.h.in
|
||
|
index 4739f3a..de310ab 100644
|
||
|
--- a/third_party/libxml/include/libxml/xmlversion.h.in
|
||
|
+++ b/third_party/libxml/include/libxml/xmlversion.h.in
|
||
|
@@ -269,6 +269,15 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version);
|
||
|
#endif
|
||
|
|
||
|
/**
|
||
|
+ * LIBXML_ICU_ENABLED:
|
||
|
+ *
|
||
|
+ * Whether icu support is available
|
||
|
+ */
|
||
|
+#if @WITH_ICU@
|
||
|
+#define LIBXML_ICU_ENABLED
|
||
|
+#endif
|
||
|
+
|
||
|
+/**
|
||
|
* LIBXML_ISO8859X_ENABLED:
|
||
|
*
|
||
|
* Whether ISO-8859-* support is made available in case iconv is not
|
||
|
diff --git a/third_party/libxml/parser.c b/third_party/libxml/parser.c
|
||
|
index 85e7599..3ba2a06 100644
|
||
|
--- a/third_party/libxml/parser.c
|
||
|
+++ b/third_party/libxml/parser.c
|
||
|
@@ -954,6 +954,12 @@ xmlHasFeature(xmlFeature feature)
|
||
|
#else
|
||
|
return(0);
|
||
|
#endif
|
||
|
+ case XML_WITH_ICU:
|
||
|
+#ifdef LIBXML_ICU_ENABLED
|
||
|
+ return(1);
|
||
|
+#else
|
||
|
+ return(0);
|
||
|
+#endif
|
||
|
default:
|
||
|
break;
|
||
|
}
|