Version in base suite: 3.8.1-3+lenny1 Version in overlay suite: (not present) Base version: icu_3.8.1-3+lenny1 Target version: icu_3.8.1-3+lenny2 Base file: /org/ftp.debian.org/ftp/pool/main/i/icu/icu_3.8.1-3+lenny1.dsc Target file: /org/ftp.debian.org/queue/p-u-new/icu_3.8.1-3+lenny2.dsc debian/patches/03-redhat.icu5797.patch | 752 +++++++++ debian/patches/04-redhat.icu6001.patch | 771 ++++++++++ debian/patches/05-redhat.icu6002.patch | 412 +++++ debian/patches/CVE-2009-0153.patch | 590 +++++++ icu-3.8.1/debian/changelog | 12 icu-3.8.1/debian/patches/00-cve-2007-4770-4771.patch | 53 icu-3.8.1/debian/patches/01-kfreebsd.patch | 18 icu-3.8.1/debian/patches/02-icu-3.8.1-sinhala-fix-matra-segmentation.patch | 60 icu-3.8.1/debian/patches/CVE-2008-1036.patch | 230 +- icu-3.8.1/debian/patches/icu-3.6-setBreakType.patch | 6 10 files changed, 2719 insertions(+), 185 deletions(-) diff -u icu-3.8.1/debian/changelog icu-3.8.1/debian/changelog --- icu-3.8.1/debian/changelog +++ icu-3.8.1/debian/changelog @@ -1,3 +1,15 @@ +icu (3.8.1-3+lenny2) stable-security; urgency=high + + * Apply patch CVE-2009-0153.patch to fix problem handling invalid byte + sequences during Unicode conversion. Thanks to Red Hat for + backporting the patch to ICU version 3.8.1. Applying this patch to + the debian package required pulling in three additional Red Hat + patches for tickets 5797, 6001, and 6002 in ICU's issue tracking + system as well as adjusting offsets in CVE-2008-1036.patch. (Closes: + #534590) + + -- Jay Berkenbilt Mon, 07 Sep 2009 20:00:39 -0400 + icu (3.8.1-3+lenny1) stable-security; urgency=high * Non-maintainer upload. 
diff -u icu-3.8.1/debian/patches/icu-3.6-setBreakType.patch icu-3.8.1/debian/patches/icu-3.6-setBreakType.patch --- icu-3.8.1/debian/patches/icu-3.6-setBreakType.patch +++ icu-3.8.1/debian/patches/icu-3.6-setBreakType.patch @@ -1,5 +1,7 @@ ---- icu/source/common/unicode/rbbi.h-old 2007-10-31 15:52:08.000000000 +0100 -+++ icu/source/common/unicode/rbbi.h 2007-10-31 15:52:47.000000000 +0100 +Index: icu/source/common/unicode/rbbi.h +=================================================================== +--- icu.orig/source/common/unicode/rbbi.h 2009-09-07 19:53:29.017855637 -0400 ++++ icu/source/common/unicode/rbbi.h 2009-09-07 19:53:57.165855852 -0400 @@ -611,12 +611,14 @@ virtual int32_t getBreakType() const; #endif diff -u icu-3.8.1/debian/patches/02-icu-3.8.1-sinhala-fix-matra-segmentation.patch icu-3.8.1/debian/patches/02-icu-3.8.1-sinhala-fix-matra-segmentation.patch --- icu-3.8.1/debian/patches/02-icu-3.8.1-sinhala-fix-matra-segmentation.patch +++ icu-3.8.1/debian/patches/02-icu-3.8.1-sinhala-fix-matra-segmentation.patch @@ -1,8 +1,8 @@ -diff --git icu.old/source/layout/IndicClassTables.cpp icu/source/layout/IndicClassTables.cpp -index 2d3e809..2d1272f 100644 ---- icu.old/source/layout/IndicClassTables.cpp -+++ icu/source/layout/IndicClassTables.cpp -@@ -48,6 +48,7 @@ U_NAMESPACE_BEGIN +Index: icu/source/layout/IndicClassTables.cpp +=================================================================== +--- icu.orig/source/layout/IndicClassTables.cpp 2009-09-07 19:53:29.829856228 -0400 ++++ icu/source/layout/IndicClassTables.cpp 2009-09-07 19:53:54.945856127 -0400 +@@ -48,6 +48,7 @@ #define _m2 (CC_SPLIT_VOWEL_PIECE_2 | CF_LENGTH_MARK) #define _m3 (CC_SPLIT_VOWEL_PIECE_3 | CF_LENGTH_MARK) #define _vr (CC_VIRAMA) @@ -10,7 +10,7 @@ // split matras #define _s1 (_dv | _x1) -@@ -206,7 +207,7 @@ static const IndicClassTable::CharClass sinhCharClasses[] = +@@ -206,7 +207,7 @@ _iv, _iv, _iv, _iv, _iv, _iv, _iv, _xx, _xx, _xx, _ct, _ct, _ct, _ct, _ct, _ct, // 0D90 - 0D9F _ct, 
_ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, // 0DA0 - 0DAF _ct, _ct, _xx, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _ct, _xx, _ct, _xx, _xx, // 0DB0 - 0DBF @@ -19,7 +19,7 @@ _dr, _dr, _da, _da, _db, _xx, _db, _xx, _dr, _dl, _s1, _dl, _s2, _s3, _s4, _dr, // 0DD0 - 0DDF _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0DE0 - 0DEF _xx, _xx, _dr, _dr, _xx // 0DF0 - 0DF4 -@@ -248,7 +249,7 @@ static const SplitMatra sinhSplitTable[] = {{0x0DD9, 0x0DCA}, {0x0DD9, 0x0DCF}, +@@ -248,7 +249,7 @@ #define TELU_SCRIPT_FLAGS (SF_MATRAS_AFTER_BASE | SF_FILTER_ZERO_WIDTH | 3) #define KNDA_SCRIPT_FLAGS (SF_MATRAS_AFTER_BASE | SF_FILTER_ZERO_WIDTH | 3) #define MLYM_SCRIPT_FLAGS (SF_MPRE_FIXUP | SF_NO_POST_BASE_LIMIT /*| SF_FILTER_ZERO_WIDTH*/) @@ -28,11 +28,11 @@ // // Indic Class Tables -diff --git icu.old/source/layout/IndicReordering.cpp icu/source/layout/IndicReordering.cpp -index 5ca4d0d..0b35a62 100644 ---- icu.old/source/layout/IndicReordering.cpp -+++ icu/source/layout/IndicReordering.cpp -@@ -73,8 +73,8 @@ private: +Index: icu/source/layout/IndicReordering.cpp +=================================================================== +--- icu.orig/source/layout/IndicReordering.cpp 2009-09-07 19:53:29.841856267 -0400 ++++ icu/source/layout/IndicReordering.cpp 2009-09-07 19:53:54.945856127 -0400 +@@ -73,8 +73,8 @@ LEUnicode fLengthMark; le_int32 fLengthMarkIndex; @@ -43,7 +43,7 @@ FeatureMask fMatraFeatures; -@@ -97,9 +97,9 @@ private: +@@ -97,9 +97,9 @@ if (IndicClassTable::isLengthMark(matraClass)) { fLengthMark = matra; fLengthMarkIndex = matraIndex; @@ -56,7 +56,7 @@ } else { switch (matraClass & CF_POS_MASK) { case CF_POS_BEFORE: -@@ -133,7 +133,7 @@ public: +@@ -133,7 +133,7 @@ IndicReorderingOutput(LEUnicode *outChars, LEGlyphStorage &glyphStorage, MPreFixups *mpreFixups) : fSyllableCount(0), fOutIndex(0), fOutChars(outChars), fGlyphStorage(glyphStorage), fMpre(0), fMpreIndex(0), fMbelow(0), fMbelowIndex(0), 
fMabove(0), fMaboveIndex(0), @@ -65,7 +65,7 @@ fMatraFeatures(0), fMPreOutIndex(-1), fMPreFixups(mpreFixups), fVMabove(0), fVMpost(0), fVMIndex(0), fVMFeatures(0), fSMabove(0), fSMbelow(0), fSMIndex(0), fSMFeatures(0) -@@ -150,7 +150,7 @@ public: +@@ -150,7 +150,7 @@ { fSyllableCount += 1; @@ -74,7 +74,7 @@ fMPreOutIndex = -1; fVMabove = fVMpost = 0; -@@ -255,11 +255,11 @@ public: +@@ -255,11 +255,11 @@ } } @@ -90,7 +90,7 @@ } } -@@ -370,20 +370,21 @@ static const le_int32 featureCount = LE_ARRAY_SIZE(featureMap); +@@ -370,20 +370,21 @@ static const le_int8 stateTable[][CC_COUNT] = { @@ -126,7 +126,7 @@ }; -@@ -466,6 +467,7 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le +@@ -466,6 +467,7 @@ break; @@ -134,7 +134,7 @@ case CC_NUKTA: case CC_VIRAMA: output.writeChar(C_DOTTED_CIRCLE, prev, tagArray1); -@@ -500,7 +502,7 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le +@@ -500,7 +502,7 @@ } output.writeLengthMark(); @@ -143,7 +143,7 @@ if ((classTable->scriptFlags & SF_REPH_AFTER_BELOW) == 0) { output.writeVMabove(); -@@ -632,7 +634,11 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le +@@ -632,7 +634,11 @@ bcSpan += 1; } @@ -156,7 +156,7 @@ bcSpan += 1; if (bcSpan < markStart && chars[bcSpan] == C_SIGN_ZWNJ) { -@@ -708,7 +714,7 @@ le_int32 IndicReordering::reorder(const LEUnicode *chars, le_int32 charCount, le +@@ -708,7 +714,7 @@ } output.writeLengthMark(); @@ -165,11 +165,11 @@ // write reph if ((classTable->scriptFlags & SF_REPH_AFTER_BELOW) == 0) { -diff --git icu.old/source/layout/IndicReordering.h icu/source/layout/IndicReordering.h -index 5a1938e..fc1d429 100644 ---- icu.old/source/layout/IndicReordering.h -+++ icu/source/layout/IndicReordering.h -@@ -37,7 +37,8 @@ U_NAMESPACE_BEGIN +Index: icu/source/layout/IndicReordering.h +=================================================================== +--- icu.orig/source/layout/IndicReordering.h 2009-09-07 
19:53:29.853855608 -0400 ++++ icu/source/layout/IndicReordering.h 2009-09-07 19:53:54.949856132 -0400 +@@ -37,7 +37,8 @@ #define CC_SPLIT_VOWEL_PIECE_3 12U #define CC_VIRAMA 13U #define CC_ZERO_WIDTH_MARK 14U @@ -179,7 +179,7 @@ // Character class flags #define CF_CLASS_MASK 0x0000FFFFU -@@ -98,6 +99,7 @@ struct IndicClassTable +@@ -98,6 +99,7 @@ inline le_bool isConsonant(LEUnicode ch) const; inline le_bool isReph(LEUnicode ch) const; inline le_bool isVirama(LEUnicode ch) const; @@ -187,7 +187,7 @@ inline le_bool isNukta(LEUnicode ch) const; inline le_bool isVattu(LEUnicode ch) const; inline le_bool isMatra(LEUnicode ch) const; -@@ -112,6 +114,7 @@ struct IndicClassTable +@@ -112,6 +114,7 @@ inline static le_bool isConsonant(CharClass charClass); inline static le_bool isReph(CharClass charClass); inline static le_bool isVirama(CharClass charClass); @@ -195,7 +195,7 @@ inline static le_bool isNukta(CharClass charClass); inline static le_bool isVattu(CharClass charClass); inline static le_bool isMatra(CharClass charClass); -@@ -193,6 +196,11 @@ inline le_bool IndicClassTable::isVirama(CharClass charClass) +@@ -193,6 +196,11 @@ return (charClass & CF_CLASS_MASK) == CC_VIRAMA; } @@ -207,7 +207,7 @@ inline le_bool IndicClassTable::isVattu(CharClass charClass) { return (charClass & CF_VATTU) != 0; -@@ -255,6 +263,11 @@ inline le_bool IndicClassTable::isVirama(LEUnicode ch) const +@@ -255,6 +263,11 @@ return isVirama(getCharClass(ch)); } diff -u icu-3.8.1/debian/patches/00-cve-2007-4770-4771.patch icu-3.8.1/debian/patches/00-cve-2007-4770-4771.patch --- icu-3.8.1/debian/patches/00-cve-2007-4770-4771.patch +++ icu-3.8.1/debian/patches/00-cve-2007-4770-4771.patch @@ -1,7 +1,7 @@ -Index: source/i18n/regexcmp.cpp +Index: icu/source/i18n/regexcmp.cpp =================================================================== ---- source/i18n/regexcmp.cpp (revision 23291) -+++ source/i18n/regexcmp.cpp (revision 23292) +--- icu.orig/source/i18n/regexcmp.cpp 2009-09-07 
19:53:29.889855981 -0400 ++++ icu/source/i18n/regexcmp.cpp 2009-09-07 19:53:54.606356313 -0400 @@ -2,7 +2,7 @@ // // file: regexcmp.cpp @@ -35,10 +35,10 @@ } break; -Index: source/i18n/rematch.cpp +Index: icu/source/i18n/rematch.cpp =================================================================== ---- source/i18n/rematch.cpp (revision 23291) -+++ source/i18n/rematch.cpp (revision 23292) +--- icu.orig/source/i18n/rematch.cpp 2009-09-07 19:53:29.897856217 -0400 ++++ icu/source/i18n/rematch.cpp 2009-09-07 19:53:54.606356313 -0400 @@ -1,6 +1,6 @@ /* ************************************************************************** @@ -126,10 +126,10 @@ break; } } -Index: source/test/intltest/regextst.h +Index: icu/source/test/intltest/regextst.h =================================================================== ---- source/test/intltest/regextst.h (revision 23291) -+++ source/test/intltest/regextst.h (revision 23292) +--- icu.orig/source/test/intltest/regextst.h 2009-09-07 19:53:29.905856118 -0400 ++++ icu/source/test/intltest/regextst.h 2009-09-07 19:53:54.606356313 -0400 @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: @@ -146,10 +146,10 @@ // The following functions are internal to the regexp tests. 
virtual UBool doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line); -Index: source/test/intltest/regextst.cpp +Index: icu/source/test/intltest/regextst.cpp =================================================================== ---- source/test/intltest/regextst.cpp (revision 23291) -+++ source/test/intltest/regextst.cpp (revision 23292) +--- icu.orig/source/test/intltest/regextst.cpp 2009-09-07 19:53:29.917855927 -0400 ++++ icu/source/test/intltest/regextst.cpp 2009-09-07 19:53:54.614354890 -0400 @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: @@ -209,10 +209,10 @@ #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ -Index: source/common/uvectr32.cpp +Index: icu/source/common/uvectr32.cpp =================================================================== ---- source/common/uvectr32.cpp (revision 23291) -+++ source/common/uvectr32.cpp (revision 23292) +--- icu.orig/source/common/uvectr32.cpp 2009-09-07 19:53:29.925855723 -0400 ++++ icu/source/common/uvectr32.cpp 2009-09-07 19:53:54.622355001 -0400 @@ -1,6 +1,6 @@ /* ****************************************************************************** @@ -247,7 +247,7 @@ elements = (int32_t *)uprv_malloc(sizeof(int32_t)*initialCapacity); if (elements == 0) { status = U_MEMORY_ALLOCATION_ERROR; -@@ -189,24 +194,38 @@ +@@ -189,21 +194,35 @@ UBool UVector32::expandCapacity(int32_t minimumCapacity, UErrorCode &status) { if (capacity >= minimumCapacity) { return TRUE; @@ -266,7 +266,7 @@ - elements = newElems; - capacity = newCap; - return TRUE; - } ++ } + if (maxCapacity>0 && minimumCapacity>maxCapacity) { + status = U_BUFFER_OVERFLOW_ERROR; + return FALSE; @@ -288,23 +288,20 @@ + elements = newElems; + capacity = newCap; + return TRUE; - } - ++} ++ +void UVector32::setMaxCapacity(int32_t limit) { + U_ASSERT(limit >= 0); + maxCapacity = limit; + if (maxCapacity < 0) { + maxCapacity = 0; -+ } -+} -+ - /** - * Change the size of this vector as 
follows: If newSize is smaller, - * then truncate the array, possibly deleting held elements for i >= -Index: source/common/uvectr32.h + } + } + +Index: icu/source/common/uvectr32.h =================================================================== ---- source/common/uvectr32.h (revision 23291) -+++ source/common/uvectr32.h (revision 23292) +--- icu.orig/source/common/uvectr32.h 2009-09-07 19:53:29.937856159 -0400 ++++ icu/source/common/uvectr32.h 2009-09-07 19:53:54.630354955 -0400 @@ -1,6 +1,6 @@ /* ********************************************************************** diff -u icu-3.8.1/debian/patches/01-kfreebsd.patch icu-3.8.1/debian/patches/01-kfreebsd.patch --- icu-3.8.1/debian/patches/01-kfreebsd.patch +++ icu-3.8.1/debian/patches/01-kfreebsd.patch @@ -1,5 +1,7 @@ ---- icu.orig/source/common/putil.c 2007-12-12 19:57:26 +0100 -+++ icu/source/common/putil.c 2008-01-20 21:08:57 +0100 +Index: icu/source/common/putil.c +=================================================================== +--- icu.orig/source/common/putil.c 2009-09-07 19:53:29.053856156 -0400 ++++ icu/source/common/putil.c 2009-09-07 19:53:56.910355591 -0400 @@ -105,7 +105,7 @@ # define ICU_NO_USER_DATA_OVERRIDE 1 #elif defined(OS390) @@ -31,8 +33,10 @@ if (locale != NULL && uprv_strcmp(name, "euc") == 0) { /* Linux underspecifies the "EUC" name. 
*/ if (uprv_strcmp(locale, "korean") == 0) { ---- icu.orig/source/configure 2007-12-12 19:58:06 +0100 -+++ icu/source/configure 2008-01-20 21:07:42 +0100 +Index: icu/source/configure +=================================================================== +--- icu.orig/source/configure 2009-09-07 19:53:29.061855656 -0400 ++++ icu/source/configure 2009-09-07 19:53:56.914356065 -0400 @@ -8291,6 +8291,7 @@ case "${host}" in *-*-solaris*) platform=U_SOLARIS ;; @@ -41,8 +45,10 @@ *-pc-gnu) platform=U_HURD ;; *-*-*bsd*|*-*-dragonfly*) platform=U_BSD ;; *-*-aix*) platform=U_AIX ;; ---- icu.orig/source/configure.in 2007-12-12 19:58:06 +0100 -+++ icu/source/configure.in 2008-01-20 21:07:42 +0100 +Index: icu/source/configure.in +=================================================================== +--- icu.orig/source/configure.in 2009-09-07 19:53:29.073855791 -0400 ++++ icu/source/configure.in 2009-09-07 19:53:56.922355024 -0400 @@ -996,6 +996,7 @@ case "${host}" in *-*-solaris*) platform=U_SOLARIS ;; diff -u icu-3.8.1/debian/patches/CVE-2008-1036.patch icu-3.8.1/debian/patches/CVE-2008-1036.patch --- icu-3.8.1/debian/patches/CVE-2008-1036.patch +++ icu-3.8.1/debian/patches/CVE-2008-1036.patch @@ -1,6 +1,7 @@ -diff -Nru icu.old/source/common/ucnv2022.c icu/source/common/ucnv2022.c ---- icu.old/source/common/ucnv2022.c 2007-12-12 19:57:26.000000000 +0100 -+++ icu/source/common/ucnv2022.c 2009-03-16 23:26:45.018254230 +0100 +Index: icu/source/common/ucnv2022.c +=================================================================== +--- icu.orig/source/common/ucnv2022.c 2009-09-07 19:53:55.514356046 -0400 ++++ icu/source/common/ucnv2022.c 2009-09-07 19:53:56.270356296 -0400 @@ -201,6 +201,7 @@ #ifdef U_ENABLE_GENERIC_ISO_2022 UBool isFirstBuffer; @@ -9,7 +10,7 @@ char name[30]; char locale[3]; }UConverterDataISO2022; -@@ -610,6 +611,7 @@ +@@ -609,6 +610,7 @@ if(choice<=UCNV_RESET_TO_UNICODE) { uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); myConverterData->key = 
0; @@ -17,7 +18,7 @@ } if(choice!=UCNV_RESET_TO_UNICODE) { uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); -@@ -815,6 +817,7 @@ +@@ -814,6 +816,7 @@ if(chosenConverterName == NULL) { /* SS2 or SS3 */ *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; @@ -25,7 +26,7 @@ return; } -@@ -936,6 +939,8 @@ +@@ -935,6 +938,8 @@ } if(U_SUCCESS(*err)) { _this->toULength = 0; @@ -34,7 +35,7 @@ } } -@@ -1789,6 +1794,7 @@ +@@ -1986,6 +1991,7 @@ continue; } else { /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ @@ -42,11 +43,11 @@ break; } -@@ -1800,21 +1806,39 @@ +@@ -1997,21 +2003,39 @@ continue; } else { /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ -+ myData->isEmptySegment = FALSE; /* reset this, we have a different error */ ++ myData->isEmptySegment = FALSE; /* reset this, we have a different error */ break; } @@ -74,7 +75,7 @@ if(U_FAILURE(*err)){ args->target = myTarget; args->source = mySource; -+ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ ++ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ return; } + /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ @@ -84,7 +85,7 @@ continue; /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ -@@ -1831,6 +1855,7 @@ +@@ -2028,6 +2052,7 @@ /* falls through */ default: /* convert one or two bytes */ @@ -92,12 +93,12 @@ cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 && !IS_JP_DBCS(cs) -@@ -2325,15 +2350,27 @@ +@@ -2524,15 +2549,27 @@ if(mySourceChar==UCNV_SI){ myData->toU2022State.g = 0; + if (myData->isEmptySegment) { -+ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ ++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ + *err = U_ILLEGAL_ESCAPE_SEQUENCE; + args->converter->toUCallbackReason = UCNV_IRREGULAR; + args->converter->toUBytes[0] = 
mySourceChar; @@ -110,17 +111,17 @@ continue; }else if(mySourceChar==UCNV_SO){ myData->toU2022State.g = 1; -+ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ ++ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ /*consume the source */ continue; }else if(mySourceChar==ESC_2022){ mySource--; escape: -+ myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ ++ myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ changeState_2022(args->converter,&(mySource), mySourceLimit, ISO_2022_KR, err); if(U_FAILURE(*err)){ -@@ -2344,6 +2381,7 @@ +@@ -2543,6 +2580,7 @@ continue; } @@ -128,12 +129,12 @@ if(myData->toU2022State.g == 1) { if(mySource < mySourceLimit) { char trailByte; -@@ -2876,27 +2914,52 @@ +@@ -3075,27 +3113,52 @@ switch(mySourceChar){ case UCNV_SI: pToU2022State->g=0; + if (myData->isEmptySegment) { -+ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ ++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ + *err = U_ILLEGAL_ESCAPE_SEQUENCE; + args->converter->toUCallbackReason = UCNV_IRREGULAR; + args->converter->toUBytes[0] = mySourceChar; @@ -147,11 +148,11 @@ case UCNV_SO: if(pToU2022State->cs[1] != 0) { pToU2022State->g=1; -+ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ ++ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ continue; } else { /* illegal to have SO before a matching designator */ -+ myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ ++ myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */ break; } @@ -179,11 +180,11 @@ if(U_FAILURE(*err)){ args->target = myTarget; args->source = mySource; -+ myData->isEmptySegment = FALSE; /* Reset to avoid future 
spurious errors */ ++ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ return; } continue; -@@ -2910,6 +2973,7 @@ +@@ -3109,6 +3172,7 @@ /* falls through */ default: /* convert one or two bytes */ @@ -191,9 +192,10 @@ if(pToU2022State->g != 0) { if(mySource < mySourceLimit) { UConverterSharedData *cnv; -diff -Nru icu.old/source/common/ucnv_bld.c icu/source/common/ucnv_bld.c ---- icu.old/source/common/ucnv_bld.c 2007-12-12 19:57:26.000000000 +0100 -+++ icu/source/common/ucnv_bld.c 2009-03-16 22:36:28.013608421 +0100 +Index: icu/source/common/ucnv_bld.c +=================================================================== +--- icu.orig/source/common/ucnv_bld.c 2009-09-07 19:53:29.257856247 -0400 ++++ icu/source/common/ucnv_bld.c 2009-09-07 19:53:56.274356179 -0400 @@ -932,6 +932,7 @@ myUConverter->subCharLen = mySharedConverterData->staticData->subCharLen; myUConverter->subChars = (uint8_t *)myUConverter->subUChars; @@ -202,9 +204,18 @@ if(mySharedConverterData->impl->open != NULL) { mySharedConverterData->impl->open(myUConverter, realName, locale, options, err); -diff -Nru icu.old/source/common/ucnv_bld.h icu/source/common/ucnv_bld.h ---- icu.old/source/common/ucnv_bld.h 2007-12-12 19:57:26.000000000 +0100 -+++ icu/source/common/ucnv_bld.h 2009-03-16 22:36:28.017606051 +0100 +Index: icu/source/common/ucnv_bld.h +=================================================================== +--- icu.orig/source/common/ucnv_bld.h 2009-09-07 19:53:29.265855477 -0400 ++++ icu/source/common/ucnv_bld.h 2009-09-07 19:53:56.278355900 -0400 +@@ -1,6 +1,6 @@ + /* + ********************************************************************** +-* Copyright (C) 1999-2006, International Business Machines ++* Copyright (C) 1999-2006,2008 International Business Machines + * Corporation and others. All Rights Reserved. 
+ ********************************************************************** + * @@ -226,6 +226,9 @@ char preToU[UCNV_EXT_MAX_BYTES]; int8_t preFromULength, preToULength; /* negative: replay */ @@ -215,9 +226,10 @@ }; U_CDECL_END /* end of UConverter */ -diff -Nru icu.old/source/common/ucnv.c icu/source/common/ucnv.c ---- icu.old/source/common/ucnv.c 2007-12-12 19:57:24.000000000 +0100 -+++ icu/source/common/ucnv.c 2009-03-16 22:36:27.989604759 +0100 +Index: icu/source/common/ucnv.c +=================================================================== +--- icu.orig/source/common/ucnv.c 2009-09-07 19:53:29.273855764 -0400 ++++ icu/source/common/ucnv.c 2009-09-07 19:53:56.282355941 -0400 @@ -1528,11 +1528,14 @@ cnv->toULength=0; @@ -235,9 +247,10 @@ /* * loop back to the offset handling -diff -Nru icu.old/source/common/ucnvhz.c icu/source/common/ucnvhz.c ---- icu.old/source/common/ucnvhz.c 2007-12-12 19:57:26.000000000 +0100 -+++ icu/source/common/ucnvhz.c 2009-03-16 22:36:28.013608421 +0100 +Index: icu/source/common/ucnvhz.c +=================================================================== +--- icu.orig/source/common/ucnvhz.c 2009-09-07 19:53:55.949856455 -0400 ++++ icu/source/common/ucnvhz.c 2009-09-07 19:53:56.282355941 -0400 @@ -59,6 +59,7 @@ UBool isEscapeAppended; UBool isStateDBCS; @@ -265,93 +278,64 @@ */ -@@ -163,11 +169,13 @@ - +@@ -168,12 +174,23 @@ + args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2); } *(myTarget++)=(UChar)mySourceChar; + myData->isEmptySegment = FALSE; continue; - - case UCNV_TILDE: - if(args->converter->mode ==UCNV_TILDE){ - *(myTarget++)=(UChar)mySourceChar; -+ myData->isEmptySegment = FALSE; - args->converter->mode=0; - continue; - -@@ -183,20 +191,22 @@ - - case UCNV_OPEN_BRACE: -- if(args->converter->mode == UCNV_TILDE){ -- args->converter->mode=0; -- myData->isStateDBCS = TRUE; -- continue; -- } -- else{ -- break; -- } -- -- +- myData->isStateDBCS = TRUE; +- continue; case UCNV_CLOSE_BRACE: - 
if(args->converter->mode == UCNV_TILDE){ - args->converter->mode=0; -- myData->isStateDBCS = FALSE; -+ myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE); -+ if (myData->isEmptySegment) { -+ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ -+ *err = U_ILLEGAL_ESCAPE_SEQUENCE; -+ args->converter->toUCallbackReason = UCNV_IRREGULAR; -+ args->converter->toUBytes[0] = UCNV_TILDE; -+ args->converter->toUBytes[1] = mySourceChar; -+ args->converter->toULength = 2; -+ args->target = myTarget; -+ args->source = mySource; -+ return; -+ } -+ myData->isEmptySegment = TRUE; - continue; - } - else{ -@@ -210,6 +220,7 @@ - if(args->converter->mode == UCNV_TILDE){ - args->converter->mode=0; - mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80)); -+ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */ - goto SAVE_STATE; - } - -@@ -220,12 +231,14 @@ - if(myData->isStateDBCS){ - if(args->converter->toUnicodeStatus == 0x00){ - args->converter->toUnicodeStatus = (UChar) mySourceChar; -+ myData->isEmptySegment = FALSE; +- myData->isStateDBCS = FALSE; ++ myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE); ++ if (myData->isEmptySegment) { ++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE; ++ args->converter->toUCallbackReason = UCNV_IRREGULAR; ++ args->converter->toUBytes[0] = UCNV_TILDE; ++ args->converter->toUBytes[1] = mySourceChar; ++ args->converter->toULength = 2; ++ args->target = myTarget; ++ args->source = mySource; ++ return; ++ } ++ myData->isEmptySegment = TRUE; continue; + default: + /* if the first byte is equal to TILDE and the trail byte +@@ -181,6 +198,7 @@ + */ + mySourceChar = 0x7e00 | mySourceChar; + targetUniChar = 0xffff; ++ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */ + break; } - else{ - 
tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ; - tempBuf[1] = (char) (mySourceChar+0x80); - mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80)); -+ myData->isEmptySegment = FALSE; - args->converter->toUnicodeStatus =0x00; - targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, - tempBuf, 2, args->converter->useFallback); -@@ -233,10 +246,12 @@ - } - else{ - if(args->converter->fromUnicodeStatus == 0x00){ -+ myData->isEmptySegment = FALSE; - targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, - mySource - 1, 1, args->converter->useFallback); + } else if(myData->isStateDBCS) { +@@ -191,6 +209,7 @@ + } else { + /* add another bit to distinguish a 0 byte from not having seen a lead byte */ + args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100); ++ myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */ + } + continue; } - else{ -+ myData->isEmptySegment = FALSE; - goto SAVE_STATE; +@@ -218,8 +237,10 @@ + continue; + } else if(mySourceChar <= 0x7f) { + targetUniChar = (UChar)mySourceChar; /* ASCII */ ++ myData->isEmptySegment = FALSE; /* the segment has something valid */ + } else { + targetUniChar = 0xffff; ++ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */ } - -diff -Nru icu.old/source/test/cintltst/nucnvtst.c icu/source/test/cintltst/nucnvtst.c ---- icu.old/source/test/cintltst/nucnvtst.c 2007-12-12 19:57:08.000000000 +0100 -+++ icu/source/test/cintltst/nucnvtst.c 2009-03-16 22:58:25.345605093 +0100 + } + if(targetUniChar < 0xfffe){ +Index: icu/source/test/cintltst/nucnvtst.c +=================================================================== +--- icu.orig/source/test/cintltst/nucnvtst.c 2009-09-07 19:53:55.213856047 -0400 ++++ icu/source/test/cintltst/nucnvtst.c 2009-09-07 19:53:56.286355944 -0400 @@ -81,6 +81,7 @@ 
static void TestJitterbug2411(void); static void TestJB5275(void); @@ -439,10 +423,19 @@ static void TestEBCDIC_STATEFUL() { /* test input */ -diff -Nru icu.old/source/test/testdata/conversion.txt icu/source/test/testdata/conversion.txt ---- icu.old/source/test/testdata/conversion.txt 2007-12-12 19:57:18.000000000 +0100 -+++ icu/source/test/testdata/conversion.txt 2009-03-16 22:36:27.989604759 +0100 -@@ -182,6 +182,21 @@ +Index: icu/source/test/testdata/conversion.txt +=================================================================== +--- icu.orig/source/test/testdata/conversion.txt 2009-09-07 19:53:55.965855898 -0400 ++++ icu/source/test/testdata/conversion.txt 2009-09-07 19:53:56.290355936 -0400 +@@ -1,6 +1,6 @@ + //******************************************************************************* + // +-// Copyright (C) 2003-2007, International Business Machines ++// Copyright (C) 2003-2008, International Business Machines + // Corporation and others. All Rights Reserved. + // + // file name: conversion.txt +@@ -199,6 +199,21 @@ :intvector{ 0, 5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 12 }, :int{1}, :int{1}, "", "&", :bin{""} } @@ -464,7 +457,7 @@ // ISO-2022-JP -@@ -232,6 +247,21 @@ +@@ -249,6 +264,21 @@ :bin{ 41c15c1b284a5cc242 }, "A\uff81\\\xa5\uff82B", :intvector{ 0, 1, 2, 6, 7, 8 }, :int{1}, :int{1}, "", ".", :bin{""} } @@ -486,7 +479,7 @@ // ISO-2022-CN -@@ -302,6 +332,36 @@ +@@ -319,6 +349,36 @@ :bin{ 411b242b491b4f2121 }, "\x41", :intvector{ 0 }, :int{1}, :int{1}, "unsuppesc", ".", :bin{ 1b242b49 } } @@ -523,7 +516,7 @@ // ISO-2022 SBCS // [U_ENABLE_GENERIC_ISO_2022] -@@ -316,6 +376,39 @@ +@@ -333,6 +393,39 @@ // :int{1}, :int{1}, "", ".", :bin{""} //} @@ -566 +558,0 @@ - only in patch2: unchanged: --- icu-3.8.1.orig/debian/patches/CVE-2009-0153.patch +++ icu-3.8.1/debian/patches/CVE-2009-0153.patch @@ -0,0 +1,590 @@ +Index: icu/source/common/ucnv2022.c +=================================================================== +--- icu.orig/source/common/ucnv2022.c 
2009-09-07 19:53:56.270356296 -0400 ++++ icu/source/common/ucnv2022.c 2009-09-07 19:53:56.601856654 -0400 +@@ -1973,6 +1973,7 @@ + mySourceChar = args->converter->toUBytes[0]; + args->converter->toULength = 0; + cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; ++ targetUniChar = missingCharMarker; + goto getTrailByte; + } + +@@ -2102,18 +2103,45 @@ + default: + /* G0 DBCS */ + if(mySource < mySourceLimit) { ++ int leadIsOk, trailIsOk; + char trailByte; + getTrailByte: +- trailByte = *mySource++; +- if(cs == JISX208) { +- _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf); +- } else { +- tempBuf[0] = (char)mySourceChar; +- tempBuf[1] = trailByte; +- } +- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); +- } else { ++ trailByte = *mySource; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * ++ * In ISO-2022 DBCS, if both bytes are valid or both bytes are outside ++ * the 21..7e range, then we treat them as a pair. ++ * Otherwise (valid lead byte + illegal trail byte, or vice versa) ++ * we report only the first byte as the illegal sequence. ++ */ ++ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); ++ if (leadIsOk == trailIsOk) { ++ ++mySource; ++ uint32_t tmpSourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); ++ if (leadIsOk) { ++ if(cs == JISX208) { ++ _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf); ++ mySourceChar = tmpSourceChar; ++ } else { ++ /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. 
*/ ++ mySourceChar = tmpSourceChar; ++ if (cs == KSC5601) { ++ tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ ++ } ++ tempBuf[0] = (char)(tmpSourceChar >> 8); ++ tempBuf[1] = (char)(tmpSourceChar); ++ } ++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); ++ } else { ++ mySourceChar = tmpSourceChar; ++ } ++ } ++ } else { + args->converter->toUBytes[0] = (uint8_t)mySourceChar; + args->converter->toULength = 1; + goto endloop; +@@ -2254,7 +2282,12 @@ + } + /* only DBCS or SBCS characters are expected*/ + /* DB characters with high bit set to 1 are expected */ +- if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){ ++ if( length > 2 || length==0 || ++ (length == 1 && targetByteUnit > 0x7f) || ++ (length == 2 && ++ ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || ++ (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) ++ ) { + targetByteUnit=missingCharMarker; + } + if (targetByteUnit != missingCharMarker){ +@@ -2583,17 +2616,36 @@ + myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */ + if(myData->toU2022State.g == 1) { + if(mySource < mySourceLimit) { ++ int leadIsOk, trailIsOk; + char trailByte; + getTrailByte: +- trailByte = *mySource++; +- tempBuf[0] = (char)(mySourceChar + 0x80); +- tempBuf[1] = (char)(trailByte + 0x80); +- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); +- if((mySourceChar & 0x8080) == 0) { +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); ++ targetUniChar = missingCharMarker; ++ trailByte = *mySource; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. 
++ * ++ * In ISO-2022 DBCS, if both bytes are valid or both bytes are outside ++ * the 21..7e range, then we treat them as a pair. ++ * Otherwise (valid lead byte + illegal trail byte, or vice versa) ++ * we report only the first byte as the illegal sequence. ++ */ ++ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); ++ if (leadIsOk == trailIsOk) { ++ ++mySource; ++ if (leadIsOk) { ++ tempBuf[0] = (char)(mySourceChar + 0x80); ++ tempBuf[1] = (char)(trailByte + 0x80); ++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); ++ } else { ++ leadIsOk = TRUE; /* TODO: remove */ ++ } ++ mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); + } else { +- /* illegal bytes > 0x7f */ +- targetUniChar = missingCharMarker; ++ trailIsOk = TRUE; /* TODO: remove */ + } + } else { + args->converter->toUBytes[0] = (uint8_t)mySourceChar; +@@ -2601,8 +2653,10 @@ + break; + } + } +- else{ ++ else if(mySourceChar <= 0x7f) { + targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); ++ } else { ++ targetUniChar = 0xffff; + } + if(targetUniChar < 0xfffe){ + if(args->offsets) { +@@ -3099,6 +3153,7 @@ + /* continue with a partial double-byte character */ + mySourceChar = args->converter->toUBytes[0]; + args->converter->toULength = 0; ++ targetUniChar = missingCharMarker; + goto getTrailByte; + } + +@@ -3178,29 +3233,48 @@ + UConverterSharedData *cnv; + StateEnum tempState; + int32_t tempBufLen; ++ int leadIsOk, trailIsOk; + char trailByte; + getTrailByte: +- trailByte = *mySource++; +- tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; +- if(tempState > CNS_11643_0) { +- cnv = myData->myConverterArray[CNS_11643]; +- tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); +- tempBuf[1] = (char) (mySourceChar); +- tempBuf[2] = trailByte; +- tempBufLen = 3; +- +- }else{ +- cnv = myData->myConverterArray[tempState]; +- tempBuf[0] = (char) (mySourceChar); +- 
tempBuf[1] = trailByte; +- tempBufLen = 2; ++ trailByte = *mySource; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * ++ * In ISO-2022 DBCS, if both bytes are valid or both bytes are outside ++ * the 21..7e range, then we treat them as a pair. ++ * Otherwise (valid lead byte + illegal trail byte, or vice versa) ++ * we report only the first byte as the illegal sequence. ++ */ ++ leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++ trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); ++ if (leadIsOk == trailIsOk) { ++ ++mySource; ++ if (leadIsOk) { ++ tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; ++ if(tempState >= CNS_11643_0) { ++ cnv = myData->myConverterArray[CNS_11643]; ++ tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); ++ tempBuf[1] = (char) (mySourceChar); ++ tempBuf[2] = trailByte; ++ tempBufLen = 3; ++ ++ }else{ ++ cnv = myData->myConverterArray[tempState]; ++ tempBuf[0] = (char) (mySourceChar); ++ tempBuf[1] = trailByte; ++ tempBufLen = 2; ++ } ++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); ++ } ++ mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); + } +- mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); + if(pToU2022State->g>=2) { + /* return from a single-shift state to the previous one */ + pToU2022State->g=pToU2022State->prevG; + } +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE); + } else { + args->converter->toUBytes[0] = (uint8_t)mySourceChar; + args->converter->toULength = 1; +Index: icu/source/common/ucnvhz.c +=================================================================== +--- icu.orig/source/common/ucnvhz.c 2009-09-07 19:53:56.282355941 -0400 ++++ icu/source/common/ucnvhz.c 2009-09-07 19:53:56.601856654 -0400 +@@ 
-215,19 +215,35 @@ + } + else{ + /* trail byte */ ++ int leadIsOk, trailIsOk; + uint32_t leadByte = args->converter->toUnicodeStatus & 0xff; +- if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) && +- (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21) +- ) { +- tempBuf[0] = (char) (leadByte+0x80) ; +- tempBuf[1] = (char) (mySourceChar+0x80); +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, +- tempBuf, 2, args->converter->useFallback); ++ targetUniChar = 0xffff; ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. ++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ * ++ * In HZ DBCS, if both bytes are valid or both bytes are outside ++ * the 21..7d/7e range, then we treat them as a pair. ++ * Otherwise (valid lead byte + illegal trail byte, or vice versa) ++ * we report only the first byte as the illegal sequence. 
++ */ ++ leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21); ++ trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); ++ if (leadIsOk == trailIsOk) { ++ if (leadIsOk) { ++ tempBuf[0] = (char) (leadByte+0x80) ; ++ tempBuf[1] = (char) (mySourceChar+0x80); ++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, ++ tempBuf, 2, args->converter->useFallback); ++ } ++ /* add another bit so that the code below writes 2 bytes in case of error */ ++ mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar; + } else { +- targetUniChar = 0xffff; ++ --mySource; ++ mySourceChar = (int32_t)leadByte; + } +- /* add another bit so that the code below writes 2 bytes in case of error */ +- mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar; + args->converter->toUnicodeStatus =0x00; + } + } +Index: icu/source/common/ucnvmbcs.c +=================================================================== +--- icu.orig/source/common/ucnvmbcs.c 2009-09-07 19:53:55.953856288 -0400 ++++ icu/source/common/ucnvmbcs.c 2009-09-07 19:53:56.605856524 -0400 +@@ -1,7 +1,7 @@ + /* + ****************************************************************************** + * +-* Copyright (C) 2000-2007, International Business Machines ++* Copyright (C) 2000-2008, International Business Machines + * Corporation and others. All Rights Reserved. + * + ****************************************************************************** +@@ -1791,6 +1791,65 @@ + pArgs->offsets=offsets; + } + ++static UBool ++hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) { ++ const int32_t *row=stateTable[state]; ++ int32_t b, entry; ++ /* First test for final entries in this state for some commonly valid byte values. 
*/ ++ entry=row[0xa1]; ++ if( !MBCS_ENTRY_IS_TRANSITION(entry) && ++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL ++ ) { ++ return TRUE; ++ } ++ entry=row[0x41]; ++ if( !MBCS_ENTRY_IS_TRANSITION(entry) && ++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL ++ ) { ++ return TRUE; ++ } ++ /* Then test for final entries in this state. */ ++ for(b=0; b<=0xff; ++b) { ++ entry=row[b]; ++ if( !MBCS_ENTRY_IS_TRANSITION(entry) && ++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL ++ ) { ++ return TRUE; ++ } ++ } ++ /* Then recurse for transition entries. */ ++ for(b=0; b<=0xff; ++b) { ++ entry=row[b]; ++ if( MBCS_ENTRY_IS_TRANSITION(entry) && ++ hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)) ++ ) { ++ return TRUE; ++ } ++ } ++ return FALSE; ++} ++ ++/* ++ * Is byte b a single/lead byte in this state? ++ * Recurse for transition states, because here we don't want to say that ++ * b is a lead byte if all byte sequences that start with b are illegal. ++ */ ++static UBool ++isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) { ++ const int32_t *row=stateTable[state]; ++ int32_t entry=row[b]; ++ if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */ ++ return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)); ++ } else { ++ uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); ++ if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) { ++ return FALSE; /* SI/SO are illegal for DBCS-only conversion */ ++ } else { ++ return action!=MBCS_STATE_ILLEGAL; ++ } ++ } ++} ++ + U_CFUNC void + ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { +@@ -2146,6 +2205,34 @@ + sourceIndex=nextSourceIndex; + } else if(U_FAILURE(*pErrorCode)) { + /* callback(illegal) */ ++ if(byteIndex>1) { ++ /* ++ * Ticket 5691: consistent illegal sequences: ++ * - We include at least the first byte in the illegal sequence. 
++ * - If any of the non-initial bytes could be the start of a character, ++ * we stop the illegal sequence before the first one of those. ++ */ ++ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); ++ int8_t i; ++ for(i=1; ++ isource); ++ byteIndex=i; /* length of reported illegal byte sequence */ ++ if(backOutDistance<=bytesFromThisBuffer) { ++ source-=backOutDistance; ++ } else { ++ /* Back out bytes from the previous buffer: Need to replay them. */ ++ cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); ++ /* preToULength is negative! */ ++ uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength); ++ source=(const uint8_t *)pArgs->source; ++ } ++ } ++ } + break; + } else /* unassigned sequences indicated with byteIndex>0 */ { + /* try an extension mapping */ +@@ -2156,6 +2243,7 @@ + &offsets, sourceIndex, + pArgs->flush, + pErrorCode); ++ /* TODO: nextSourceIndex+=diff instead of nextSourceIndex+diff ?? */ + sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source); + + if(U_FAILURE(*pErrorCode)) { +@@ -2447,15 +2535,37 @@ + + if(c<0) { + if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSourcetoUBytes; + cnv->toULength=(int8_t)(source-lastSource); + do { + *bytes++=*lastSource++; + } while(lastSourcesharedData->mbcs.dbcsOnlyState!=0); ++ uint8_t *bytes=cnv->toUBytes; ++ *bytes++=*lastSource++; /* first byte */ ++ if(lastSource==source) { ++ cnv->toULength=1; ++ } else /* lastSourcetoULength=i; ++ source=lastSource; ++ } + } else { + /* no output because of empty input or only state changes */ + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; +Index: icu/source/test/cintltst/nccbtst.c +=================================================================== +--- icu.orig/source/test/cintltst/nccbtst.c 2009-09-07 19:53:29.137855929 -0400 ++++ icu/source/test/cintltst/nccbtst.c 2009-09-07 19:53:56.609855790 -0400 +@@ -2497,13 +2497,13 @@ + + + static const uint8_t text943[] = { +- 0x82, 0xa9, 0x82, 0x20, /*0xc8,*/ 0x61, 0x8a, 
0xbf, 0x8e, 0x9a }; +- static const UChar toUnicode943sub[] = { 0x304b, 0xfffd, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57}; +- static const UChar toUnicode943skip[]= { 0x304b, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57}; ++ 0x82, 0xa9, 0x82, 0x20, 0x61, 0x8a, 0xbf, 0x8e, 0x9a }; ++ static const UChar toUnicode943sub[] = { 0x304b, 0x1a, 0x20, 0x0061, 0x6f22, 0x5b57 }; ++ static const UChar toUnicode943skip[]= { 0x304b, 0x20, 0x0061, 0x6f22, 0x5b57 }; + static const UChar toUnicode943stop[]= { 0x304b}; + +- static const int32_t fromIBM943Offssub[] = {0, 2, 4, 5, 7}; +- static const int32_t fromIBM943Offsskip[] = { 0, 4, 5, 7}; ++ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 7 }; ++ static const int32_t fromIBM943Offsskip[] = { 0, 3, 4, 5, 7 }; + static const int32_t fromIBM943Offsstop[] = { 0}; + + gInBufferSize = inputsize; +@@ -2537,9 +2537,9 @@ + { + static const uint8_t sampleText[] = { + 0x82, 0xa9, 0x61, 0x62, 0x63 , 0x82, +- 0xff, /*0x82, 0xa9,*/ 0x32, 0x33}; +- static const UChar toUnicode943sub[] = {0x304b, 0x0061, 0x0062, 0x0063, 0xfffd,/*0x304b,*/ 0x0032, 0x0033}; +- static const int32_t fromIBM943Offssub[] = {0, 2, 3, 4, 5, 7, 8}; ++ 0xff, 0x32, 0x33}; ++ static const UChar toUnicode943sub[] = { 0x304b, 0x0061, 0x0062, 0x0063, 0x1a, 0x1a, 0x0032, 0x0033 }; ++ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 6, 7, 8 }; + /*checking illegal value for ibm-943 with substitute*/ + gInBufferSize = inputsize; + gOutBufferSize = outputsize; +Index: icu/source/test/cintltst/nucnvtst.c +=================================================================== +--- icu.orig/source/test/cintltst/nucnvtst.c 2009-09-07 19:53:56.286355944 -0400 ++++ icu/source/test/cintltst/nucnvtst.c 2009-09-07 19:53:56.613855915 -0400 +@@ -2608,7 +2608,7 @@ + TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source"); + /*Test for the condition where there is an invalid character*/ + { +- static const uint8_t source2[]={0xa1, 0x01}; ++ static 
const uint8_t source2[]={0xa1, 0x80}; + TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character"); + } + /*Test for the condition where we have a truncated char*/ +@@ -3901,11 +3901,11 @@ + TestISO_2022_KR() { + /* test input */ + static const uint16_t in[]={ +- 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D +- ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04 ++ 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D ++ ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04 + ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029 + ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB +- ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2 ++ ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2 + ,0x53E3,0x53E4,0x000A,0x000D}; + const UChar* uSource; + const UChar* uSourceLimit; +Index: icu/source/test/testdata/conversion.txt +=================================================================== +--- icu.orig/source/test/testdata/conversion.txt 2009-09-07 19:53:56.290355936 -0400 ++++ icu/source/test/testdata/conversion.txt 2009-09-07 19:53:56.617855361 -0400 +@@ -48,12 +48,83 @@ + toUnicode { + Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } + Cases { ++ // Test ticket 5691: consistent illegal sequences ++ // Unfortunately, we cannot use the Shift-JIS examples from the ticket ++ // comments because our Shift-JIS table is Windows-compatible and ++ // therefore has no illegal single bytes. Same for GBK. ++ // Instead, we use the stricter GB 18030 also for 2-byte examples. ++ // The byte sequences are generally slightly different from the ticket ++ // comment, simply using assigned characters rather than just ++ // theoretically valid sequences. 
++ { ++ "gb18030", ++ :bin{ 618140813c81ff7a }, ++ "a\u4e02\\x81<\\x81\\xFFz", ++ :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "EUC-JP", ++ :bin{ 618fb0a98fb03c8f3cb0a97a }, ++ "a\u4e28\\x8F\\xB0<\\x8F<\u9022z", ++ :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "gb18030", ++ :bin{ 618130fc318130fc8181303c3e813cfc817a }, ++ "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z", ++ :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "UTF-8", ++ :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a }, ++ "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z", ++ :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-JP-2", ++ :bin{ 1b24424141af4142affe41431b2842 }, ++ "\u758f\\xAF\u758e\\xAF\\xFE\u790e", ++ :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ibm-25546", ++ :bin{ 411b242943420e4141af4142affe41430f5a }, ++ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ", ++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-KR", ++ :bin{ 411b242943420e4141af4142affe41430f5a }, ++ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ", ++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "ISO-2022-CN", ++ :bin{ 411b242941420e4141af4142affe41430f5a }, ++ "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z", ++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, ++ :int{1}, :int{0}, "", "&C", :bin{""} ++ } ++ { ++ "HZ", ++ :bin{ 417e7b4141af4142affe41437e7d5a }, ++ "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z", ++ :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 }, ++ :int{1}, 
:int{0}, "", "&C", :bin{""} ++ } + // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e + { + "HZ", + :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b }, +- "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+", +- :intvector{ 2,4,6,8,10,12,14,18,19,21,24 }, ++ "\u3000\ufffd\ufffd\u3013\ufffd\ufffd\u9ccc\ufffd\ufffd ~\ufffd+", ++ :intvector{ 2,4,5,6,8,9,10,12,14,18,19,21,24 }, + :int{1}, :int{1}, "", "?", :bin{""} + } + // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and +@@ -61,8 +132,8 @@ + { + "ISO-2022-JP", + :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 }, +- "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e", +- :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 }, ++ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\ufffd\ufffd\u25b2\ufffd\ufffd\u6f3e", ++ :intvector{ 3,4,5,9,11,12,13,14,16,17,19,20,21,22,23,25,26,27 }, + :int{1}, :int{1}, "", "?", :bin{""} + } + // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets() only in patch2: unchanged: --- icu-3.8.1.orig/debian/patches/03-redhat.icu5797.patch +++ icu-3.8.1/debian/patches/03-redhat.icu5797.patch @@ -0,0 +1,752 @@ +Index: icu/source/common/ucnv2022.c +=================================================================== +--- icu.orig/source/common/ucnv2022.c 2009-09-07 19:53:29.749856264 -0400 ++++ icu/source/common/ucnv2022.c 2009-09-07 19:53:55.205856630 -0400 +@@ -472,8 +472,7 @@ + if(jpCharsetMasks[version]&CSM(ISO8859_7)) { + myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode); + } +- myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode); +- myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode); ++ myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("Shift-JIS", NULL, errorCode); + 
if(jpCharsetMasks[version]&CSM(JISX212)) { + myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode); + } +@@ -1040,14 +1039,6 @@ + length=3; + } + } +- /* +- * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space. +- * Pass in parameter for type of output bytes, for validation and shifting: +- * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20? +- * (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.) +- * - A1-FE: Subtract 80 after range check. +- * - SJIS: Shift DBCS result to 21-7E x 21-7E. +- */ + /* is this code point assigned, or do we use fallbacks? */ + if((stage2Entry&(1<<(16+(c&0xf))))!=0) { + /* assigned */ +@@ -1105,6 +1096,23 @@ + } + } + ++/* ++ * Check that the result is a 2-byte value with each byte in the range A1..FE ++ * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte ++ * to move it to the ISO 2022 range 21..7E. ++ * Return 0 if out of range. ++ */ ++static U_INLINE uint32_t ++_2022FromGR94DBCS(uint32_t value) { ++ if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && ++ (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) ++ ) { ++ return value - 0x8080; /* shift down to 21..7e byte range */ ++ } else { ++ return 0; /* not valid for ISO 2022 */ ++ } ++} ++ + #ifdef U_ENABLE_GENERIC_ISO_2022 + + /********************************************************************************** +@@ -1233,7 +1241,7 @@ + } + else{ + cnv->toUBytes[0] =(char) sourceChar; +- cnv->toULength = 2; ++ cnv->toULength = 1; + } + + if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ +@@ -1344,6 +1352,181 @@ + * TODO: Implement a priority technique where the users are allowed to set the priority of code pages + */ + ++/* Map 00..7F to Unicode according to JIS X 0201. 
*/ ++static U_INLINE uint32_t ++jisx201ToU(uint32_t value) { ++ if(value < 0x5c) { ++ return value; ++ } else if(value == 0x5c) { ++ return 0xa5; ++ } else if(value == 0x7e) { ++ return 0x203e; ++ } else /* value <= 0x7f */ { ++ return value; ++ } ++} ++ ++/* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */ ++static U_INLINE uint32_t ++jisx201FromU(uint32_t value) { ++ if(value<=0x7f) { ++ if(value!=0x5c && value!=0x7e) { ++ return value; ++ } ++ } else if(value==0xa5) { ++ return 0x5c; ++ } else if(value==0x203e) { ++ return 0x7e; ++ } ++ return 0xfffe; ++} ++ ++/* ++ * Take a valid Shift-JIS byte pair, check that it is in the range corresponding ++ * to JIS X 0208, and convert it to a pair of 21..7E bytes. ++ * Return 0 if the byte pair is out of range. ++ */ ++static U_INLINE uint32_t ++_2022FromSJIS(uint32_t value) { ++ uint8_t trail; ++ ++ if(value > 0xEFFC) { ++ return 0; /* beyond JIS X 0208 */ ++ } ++ ++ trail = (uint8_t)value; ++ ++ value &= 0xff00; /* lead byte */ ++ if(value <= 0x9f00) { ++ value -= 0x7000; ++ } else /* 0xe000 <= value <= 0xef00 */ { ++ value -= 0xb000; ++ } ++ value <<= 1; ++ ++ if(trail <= 0x9e) { ++ value -= 0x100; ++ if(trail <= 0x7e) { ++ value |= trail - 0x1f; ++ } else { ++ value |= trail - 0x20; ++ } ++ } else /* trail <= 0xfc */ { ++ value |= trail - 0x7e; ++ } ++ return value; ++} ++ ++/* ++ * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS. ++ * If either byte is outside 21..7E make sure that the result is not valid ++ * for Shift-JIS so that the converter catches it. ++ * Some invalid byte values already turn into equally invalid Shift-JIS ++ * byte values and need not be tested explicitly. 
++ */ ++static U_INLINE void ++_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) { ++ if(c1&1) { ++ ++c1; ++ if(c2 <= 0x5f) { ++ c2 += 0x1f; ++ } else if(c2 <= 0x7e) { ++ c2 += 0x20; ++ } else { ++ c2 = 0; /* invalid */ ++ } ++ } else { ++ if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) { ++ c2 += 0x7e; ++ } else { ++ c2 = 0; /* invalid */ ++ } ++ } ++ c1 >>= 1; ++ if(c1 <= 0x2f) { ++ c1 += 0x70; ++ } else if(c1 <= 0x3f) { ++ c1 += 0xb0; ++ } else { ++ c1 = 0; /* invalid */ ++ } ++ bytes[0] = (char)c1; ++ bytes[1] = (char)c2; ++} ++ ++/* ++ * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) ++ * Katakana. ++ * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks ++ * because Shift-JIS roundtrips half-width Katakana to single bytes. ++ * These were the only fallbacks in ICU's jisx-208.ucm file. ++ */ ++static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { ++ 0x2123, /* U+FF61 */ ++ 0x2156, ++ 0x2157, ++ 0x2122, ++ 0x2126, ++ 0x2572, ++ 0x2521, ++ 0x2523, ++ 0x2525, ++ 0x2527, ++ 0x2529, ++ 0x2563, ++ 0x2565, ++ 0x2567, ++ 0x2543, ++ 0x213C, /* U+FF70 */ ++ 0x2522, ++ 0x2524, ++ 0x2526, ++ 0x2528, ++ 0x252A, ++ 0x252B, ++ 0x252D, ++ 0x252F, ++ 0x2531, ++ 0x2533, ++ 0x2535, ++ 0x2537, ++ 0x2539, ++ 0x253B, ++ 0x253D, ++ 0x253F, /* U+FF80 */ ++ 0x2541, ++ 0x2544, ++ 0x2546, ++ 0x2548, ++ 0x254A, ++ 0x254B, ++ 0x254C, ++ 0x254D, ++ 0x254E, ++ 0x254F, ++ 0x2552, ++ 0x2555, ++ 0x2558, ++ 0x255B, ++ 0x255E, ++ 0x255F, /* U+FF90 */ ++ 0x2560, ++ 0x2561, ++ 0x2562, ++ 0x2564, ++ 0x2566, ++ 0x2568, ++ 0x2569, ++ 0x256A, ++ 0x256B, ++ 0x256C, ++ 0x256D, ++ 0x256F, ++ 0x2573, ++ 0x212B, ++ 0x212C /* U+FF9F */ ++}; ++ + static void + UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { + UConverter *cnv = args->converter; +@@ -1499,7 +1682,7 @@ + } + break; + case HWKANA_7BIT: +- if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) { ++ 
if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { + if(converterData->version==3) { + /* JIS7: use G1 (SO) */ + /* Shift U+FF61..U+FF9F to bytes 21..5F. */ +@@ -1526,13 +1709,34 @@ + break; + case JISX201: + /* G0 SBCS */ +- len2 = MBCS_SINGLE_FROM_UCHAR32( ++ value = jisx201FromU(sourceChar); ++ if(value <= 0x7f) { ++ targetValue = value; ++ len = 1; ++ cs = cs0; ++ g = 0; ++ useFallback = FALSE; ++ } ++ break; ++ case JISX208: ++ /* G0 DBCS from Shift-JIS table */ ++ len2 = MBCS_FROM_UCHAR32_ISO2022( + converterData->myConverterArray[cs0], + sourceChar, &value, +- useFallback); +- if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) { +- targetValue = value; +- len = len2; ++ useFallback, MBCS_OUTPUT_2); ++ if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ ++ value = _2022FromSJIS(value); ++ if(value != 0) { ++ targetValue = value; ++ len = len2; ++ cs = cs0; ++ g = 0; ++ useFallback = FALSE; ++ } ++ } else if(len == 0 && useFallback && ++ (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { ++ targetValue = hwkana_fb[sourceChar - HWKANA_START]; ++ len = -2; + cs = cs0; + g = 0; + useFallback = FALSE; +@@ -1564,17 +1768,10 @@ + * Check for valid bytes for the encoding scheme. + * This is necessary because the sub-converter (windows-949) + * has a broader encoding scheme than is valid for 2022. +- * +- * Check that the result is a 2-byte value with each byte in the range A1..FE +- * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte +- * to move it to the ISO 2022 range 21..7E. 
+ */ +- if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && +- (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) +- ) { +- value -= 0x8080; /* shift down to 21..7e byte range */ +- } else { +- break; /* not valid for ISO 2022 */ ++ value = _2022FromGR94DBCS(value); ++ if(value == 0) { ++ break; + } + } + targetValue = value; +@@ -1750,7 +1947,7 @@ + static void + UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, + UErrorCode* err){ +- char tempBuf[3]; ++ char tempBuf[2]; + const char *mySource = (char *) args->source; + UChar *myTarget = args->target; + const char *mySourceLimit = args->sourceLimit; +@@ -1868,10 +2065,7 @@ + break; + case JISX201: + if(mySourceChar <= 0x7f) { +- targetUniChar = +- _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( +- myData->myConverterArray[cs], +- mySourceChar); ++ targetUniChar = jisx201ToU(mySourceChar); + } + break; + case HWKANA_7BIT: +@@ -1885,8 +2079,13 @@ + if(mySource < mySourceLimit) { + char trailByte; + getTrailByte: +- tempBuf[0] = (char) (mySourceChar); +- tempBuf[1] = trailByte = *mySource++; ++ trailByte = *mySource++; ++ if(cs == JISX208) { ++ _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf); ++ } else { ++ tempBuf[0] = (char)mySourceChar; ++ tempBuf[1] = trailByte; ++ } + mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); + targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); + } else { +@@ -3190,6 +3389,9 @@ + /* open a set and initialize it with code points that are algorithmically round-tripped */ + switch(cnvData->locale[0]){ + case 'j': ++ /* include JIS X 0201 which is hardcoded */ ++ sa->add(sa->set, 0xa5); ++ sa->add(sa->set, 0x203e); + if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { + /* include Latin-1 for some variants of JP */ + sa->addRange(sa->set, 0, 0xff); +@@ -3198,6 +3400,11 @@ + sa->addRange(sa->set, 0, 0x7f); + } + if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) { ++ /* ++ * TODO(markus): If and when 
ucnv_getUnicodeSet() supports fallbacks, ++ * we need to include half-width Katakana for all JP variants because ++ * JIS X 0208 has hardcoded fallbacks for them. ++ */ + /* include half-width Katakana for JP */ + sa->addRange(sa->set, HWKANA_START, HWKANA_END); + } +@@ -3217,15 +3424,7 @@ + break; + } + +- /* +- * Version-specific for CN: +- * CN version 0 does not map CNS planes 3..7 although +- * they are all available in the CNS conversion table; +- * CN version 1 does map them all. +- * The two versions create different Unicode sets. +- */ +- for (i=0; imyConverterArray[i]!=NULL) { ++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ + if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && + cnvData->version==0 && i==CNS_11643 + ) { +@@ -3235,9 +3434,33 @@ + sa, UCNV_ROUNDTRIP_SET, + 0, 0x81, 0x82, + pErrorCode); ++ } ++#endif ++ ++ for (i=0; imyConverterArray[i]!=NULL) { ++ if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && ++ cnvData->version==0 && i==CNS_11643 ++ ) { ++ /* ++ * Version-specific for CN: ++ * CN version 0 does not map CNS planes 3..7 although ++ * they are all available in the CNS conversion table; ++ * CN version 1 (-EXT) does map them all. ++ * The two versions create different Unicode sets. ++ */ ++ filter=UCNV_SET_FILTER_2022_CN; ++ } else if(cnvData->locale[0]=='j' && i==JISX208) { ++ /* ++ * Only add code points that map to Shift-JIS codes ++ * corresponding to JIS X 0208. 
++ */ ++ filter=UCNV_SET_FILTER_SJIS; + } else { +- ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode); ++ filter=UCNV_SET_FILTER_NONE; + } ++ ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); + } + } + +Index: icu/source/common/ucnvmbcs.c +=================================================================== +--- icu.orig/source/common/ucnvmbcs.c 2009-09-07 19:53:29.757855891 -0400 ++++ icu/source/common/ucnvmbcs.c 2009-09-07 19:53:55.205856630 -0400 +@@ -362,6 +362,8 @@ + + /* Miscellaneous ------------------------------------------------------------ */ + ++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ ++ + /* similar to ucnv_MBCSGetNextUChar() but recursive */ + static void + _getUnicodeSetForBytes(const UConverterSharedData *sharedData, +@@ -454,11 +456,14 @@ + pErrorCode); + } + ++#endif ++ + U_CFUNC void +-ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, +- const USetAdder *sa, +- UConverterUnicodeSet which, +- UErrorCode *pErrorCode) { ++ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, ++ const USetAdder *sa, ++ UConverterUnicodeSet which, ++ UConverterSetFilter filter, ++ UErrorCode *pErrorCode) { + const UConverterMBCSTable *mbcsTable; + const uint16_t *table; + +@@ -512,50 +517,26 @@ + c+=1024; /* empty stage 2 block */ + } + } +- } else if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY) { +- /* ignore single-byte results */ ++ } else { + const uint32_t *stage2; +- const uint16_t *stage3, *results; ++ const uint8_t *stage3, *bytes; ++ uint32_t st3Multiplier; ++ uint32_t value; + +- results=(const uint16_t *)mbcsTable->fromUnicodeBytes; +- +- for(st1=0; st1(maxStage1>>1)) { +- stage2=(const uint32_t *)table+st2; +- for(st2=0; st2<64; ++st2) { +- if((st3=stage2[st2])!=0) { +- /* read the stage 3 block */ +- 
stage3=results+16*(uint32_t)(uint16_t)st3; +- +- /* get the roundtrip flags for the stage 3 block */ +- st3>>=16; ++ bytes=mbcsTable->fromUnicodeBytes; + +- /* +- * Add code points for which the roundtrip flag is set. +- * Once we get a set for fallback mappings, we have to check +- * non-roundtrip stage 3 results for whether they are 0. +- * See ucnv_MBCSFromUnicodeWithOffsets() for details. +- * +- * Ignore single-byte results (<0x100). +- */ +- do { +- if((st3&1)!=0 && *stage3>=0x100) { +- sa->add(sa->set, c); +- } +- st3>>=1; +- ++stage3; +- } while((++c&0xf)!=0); +- } else { +- c+=16; /* empty stage 3 block */ +- } +- } +- } else { +- c+=1024; /* empty stage 2 block */ +- } ++ switch(mbcsTable->outputType) { ++ case MBCS_OUTPUT_3: ++ case MBCS_OUTPUT_4_EUC: ++ st3Multiplier=3; ++ break; ++ case MBCS_OUTPUT_4: ++ st3Multiplier=4; ++ break; ++ default: ++ st3Multiplier=2; ++ break; + } +- } else { +- const uint32_t *stage2; + + for(st1=0; st1>=16; + +@@ -572,12 +556,49 @@ + * non-roundtrip stage 3 results for whether they are 0. + * See ucnv_MBCSFromUnicodeWithOffsets() for details. + */ +- do { +- if(st3&1) { +- sa->add(sa->set, c); +- } +- st3>>=1; +- } while((++c&0xf)!=0); ++ switch(filter) { ++ case UCNV_SET_FILTER_NONE: ++ do { ++ if(st3&1) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ } while((++c&0xf)!=0); ++ break; ++ case UCNV_SET_FILTER_DBCS_ONLY: ++ /* Ignore single-byte results (<0x100). */ ++ do { ++ if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=2; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; ++ case UCNV_SET_FILTER_2022_CN: ++ /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. 
*/ ++ do { ++ if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=3; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; ++ case UCNV_SET_FILTER_SJIS: ++ /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */ ++ do { ++ if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=2; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; ++ default: ++ *pErrorCode=U_INTERNAL_PROGRAM_ERROR; ++ return; ++ } + } else { + c+=16; /* empty stage 3 block */ + } +@@ -591,6 +612,19 @@ + ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode); + } + ++U_CFUNC void ++ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, ++ const USetAdder *sa, ++ UConverterUnicodeSet which, ++ UErrorCode *pErrorCode) { ++ ucnv_MBCSGetFilteredUnicodeSetForUnicode( ++ sharedData, sa, which, ++ sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? ++ UCNV_SET_FILTER_DBCS_ONLY : ++ UCNV_SET_FILTER_NONE, ++ pErrorCode); ++} ++ + static void + ucnv_MBCSGetUnicodeSet(const UConverter *cnv, + const USetAdder *sa, +Index: icu/source/common/ucnvmbcs.h +=================================================================== +--- icu.orig/source/common/ucnvmbcs.h 2009-09-07 19:53:29.765856299 -0400 ++++ icu/source/common/ucnvmbcs.h 2009-09-07 19:53:55.209856712 -0400 +@@ -456,6 +456,7 @@ + ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode); + ++#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ + /* + * Internal function returning a UnicodeSet for toUnicode() conversion. + * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. 
+@@ -470,6 +471,7 @@ + UConverterUnicodeSet which, + uint8_t state, int32_t lowByte, int32_t highByte, + UErrorCode *pErrorCode); ++#endif + + /* + * Internal function returning a UnicodeSet for toUnicode() conversion. +@@ -481,9 +483,30 @@ + */ + U_CFUNC void + ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, +- const USetAdder *sa, +- UConverterUnicodeSet which, +- UErrorCode *pErrorCode); ++ const USetAdder *sa, ++ UConverterUnicodeSet which, ++ UErrorCode *pErrorCode); ++ ++typedef enum UConverterSetFilter { ++ UCNV_SET_FILTER_NONE, ++ UCNV_SET_FILTER_DBCS_ONLY, ++ UCNV_SET_FILTER_2022_CN, ++ UCNV_SET_FILTER_SJIS, ++ UCNV_SET_FILTER_COUNT ++} UConverterSetFilter; ++ ++/* ++ * Same as ucnv_MBCSGetUnicodeSetForUnicode() but ++ * the set can be filtered by encoding scheme. ++ * Used by stateful converters which share regular conversion tables ++ * but only use a subset of their mappings. ++ */ ++U_CFUNC void ++ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, ++ const USetAdder *sa, ++ UConverterUnicodeSet which, ++ UConverterSetFilter filter, ++ UErrorCode *pErrorCode); + + #endif + +Index: icu/source/test/cintltst/nucnvtst.c +=================================================================== +--- icu.orig/source/test/cintltst/nucnvtst.c 2009-09-07 19:53:29.773856113 -0400 ++++ icu/source/test/cintltst/nucnvtst.c 2009-09-07 19:53:55.213856047 -0400 +@@ -3202,7 +3202,7 @@ + 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A, 0x000D, 0x000A, + 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A, + 0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A, +- 0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, ++ 0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, + 0x201D, 0x3014, 0x000D, 0x000A, + 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, + 0x0053, 0x0054, 0x0055, 0x0056, 
0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, +@@ -3730,7 +3730,7 @@ + 0x52C8, 0x52CC, 0x52CF, 0x52D1, 0x52D4, 0x52D6, 0x52DB, 0x52DC, 0x000D, 0x000A, + 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x000D, 0x000A, + 0x3005, 0x3006, 0x3007, 0x30FC, 0x2015, 0x2010, 0xFF0F, 0x005C, 0x000D, 0x000A, +- 0x301C, 0x2016, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, ++ 0x3013, 0x2018, 0x2026, 0x2025, 0x2018, 0x2019, 0x201C, 0x000D, 0x000A, + 0x201D, 0x000D, 0x000A, + 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x000D, 0x000A, + 0x4F94, 0x4F97, 0x52BA, 0x52BB, 0x52BD, 0x52C0, 0x52C4, 0x52C6, 0x000D, 0x000A, +Index: icu/source/test/cintltst/udatatst.c +=================================================================== +--- icu.orig/source/test/cintltst/udatatst.c 2009-09-07 19:53:29.785855751 -0400 ++++ icu/source/test/cintltst/udatatst.c 2009-09-07 19:53:55.217856144 -0400 +@@ -1281,7 +1281,7 @@ + * MBCS conversion table file without extension, + * to test swapping and preflighting of UTF-8-friendly mbcsIndex[]. 
+ */ +- {"jisx-208", "cnv", ucnv_swap}, ++ {"jisx-212", "cnv", ucnv_swap}, + #endif + + #if !UCONFIG_NO_CONVERSION +Index: icu/source/test/testdata/conversion.txt +=================================================================== +--- icu.orig/source/test/testdata/conversion.txt 2009-09-07 19:53:29.797855779 -0400 ++++ icu/source/test/testdata/conversion.txt 2009-09-07 19:53:55.221856140 -0400 +@@ -48,6 +48,15 @@ + toUnicode { + Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } + Cases { ++ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and ++ // using the Shift-JIS table for JIS X 0208 (ticket #5797) ++ { ++ "ISO-2022-JP", ++ :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 }, ++ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e", ++ :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } + // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets() + { + "ISO-8859-3", +@@ -495,6 +504,15 @@ + fromUnicode { + Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" } + Cases { ++ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and ++ // using the Shift-JIS table for JIS X 0208 (ticket #5797) ++ { ++ "ISO-2022-JP", ++ "\u203e\xa5\u4e00\ufa10\u6f3e\u0391", ++ :bin{ 1b284a7e5c1b2442306c222e5f2126211b2842 }, ++ :intvector{ 0,0,0,0,1,2,2,2,2,2,3,3,4,4,5,5,5,5,5 }, ++ :int{1}, :int{0}, "", "?=\u3013", "" // U+3013 Geta Mark converts to 222e ++ } + // Verify that mappings that would result in byte values outside 20..7F (for SBCS) + // or 21..7E (for DBCS) are not used. 
+ // ibm-9005_X110-2007.ucm (ISO 8859-7, .F=1b2e46): +@@ -1293,13 +1311,13 @@ + // versions of ISO-2022-JP + { + "ISO-2022-JP", +- "[\x00-\x0d\x10-\x1a\x1c-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]", +- "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]", ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]", ++ "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]", + :int{0} + } + { + "ISO-2022-JP-2", +- "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]", ++ "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]", + "[\x0e\x0f\x1b\uffe7-\U0010ffff]", + :int{0} + } only in patch2: unchanged: --- icu-3.8.1.orig/debian/patches/04-redhat.icu6001.patch +++ icu-3.8.1/debian/patches/04-redhat.icu6001.patch @@ -0,0 +1,771 @@ +Index: icu/source/common/ucnv2022.c +=================================================================== +--- icu.orig/source/common/ucnv2022.c 2009-09-07 19:53:55.205856630 -0400 ++++ icu/source/common/ucnv2022.c 2009-09-07 19:53:55.514356046 -0400 +@@ -3399,11 +3399,19 @@ + /* include ASCII for JP */ + sa->addRange(sa->set, 0, 0x7f); + } +- if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) { ++ if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { + /* +- * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks, +- * we need to include half-width Katakana for all JP variants because +- * JIS X 0208 has hardcoded fallbacks for them. ++ * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 ++ * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) ++ * use half-width Katakana. ++ * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) ++ * half-width Katakana via the ESC ( I sequence. 
++ * However, we only emit (fromUnicode) half-width Katakana according to the ++ * definition of each variant. ++ * ++ * When including fallbacks, ++ * we need to include half-width Katakana Unicode code points for all JP variants because ++ * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). + */ + /* include half-width Katakana for JP */ + sa->addRange(sa->set, HWKANA_START, HWKANA_END); +@@ -3457,6 +3465,12 @@ + * corresponding to JIS X 0208. + */ + filter=UCNV_SET_FILTER_SJIS; ++ } else if(i==KSC5601) { ++ /* ++ * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) ++ * are broader than GR94. ++ */ ++ filter=UCNV_SET_FILTER_GR94DBCS; + } else { + filter=UCNV_SET_FILTER_NONE; + } +@@ -3472,6 +3486,9 @@ + sa->remove(sa->set, 0x0e); + sa->remove(sa->set, 0x0f); + sa->remove(sa->set, 0x1b); ++ ++ /* ISO 2022 converters do not convert C1 controls either */ ++ sa->removeRange(sa->set, 0x80, 0x9f); + } + + static const UConverterImpl _ISO2022Impl={ +Index: icu/source/common/ucnv_ext.c +=================================================================== +--- icu.orig/source/common/ucnv_ext.c 2009-09-07 19:53:29.561855436 -0400 ++++ icu/source/common/ucnv_ext.c 2009-09-07 19:53:55.518355882 -0400 +@@ -946,7 +946,7 @@ + ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData, + const int32_t *cx, + const USetAdder *sa, +- UConverterUnicodeSet which, ++ UBool useFallback, + int32_t minLength, + UChar32 c, + UChar s[UCNV_EXT_MAX_UCHARS], int32_t length, +@@ -966,7 +966,7 @@ + value=*fromUSectionValues++; + + if( value!=0 && +- UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) && ++ (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) && + UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength + ) { + if(c>=0) { +@@ -987,12 +987,14 @@ + /* no mapping, do nothing */ + } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { + ucnv_extGetUnicodeSetString( +- sharedData, cx, sa, which, minLength, ++ sharedData, cx, sa, useFallback, minLength, 
+ U_SENTINEL, s, length+1, + (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), + pErrorCode); +- } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== +- UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) && ++ } else if((useFallback ? ++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 : ++ ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== ++ UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) && + UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength + ) { + sa->addString(sa->set, s, length+1); +@@ -1004,6 +1006,7 @@ + ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, + const USetAdder *sa, + UConverterUnicodeSet which, ++ UConverterSetFilter filter, + UErrorCode *pErrorCode) { + const int32_t *cx; + const uint16_t *stage12, *stage3, *ps2, *ps3; +@@ -1011,6 +1014,7 @@ + + uint32_t value; + int32_t st1, stage1Length, st2, st3, minLength; ++ UBool useFallback; + + UChar s[UCNV_EXT_MAX_UCHARS]; + UChar32 c; +@@ -1027,12 +1031,20 @@ + + stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]; + ++ useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); ++ + /* enumerate the from-Unicode trie table */ + c=0; /* keep track of the current code point while enumerating */ + +- if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) { ++ if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || ++ filter==UCNV_SET_FILTER_DBCS_ONLY || ++ filter==UCNV_SET_FILTER_SJIS || ++ filter==UCNV_SET_FILTER_GR94DBCS ++ ) { + /* DBCS-only, ignore single-byte results */ + minLength=2; ++ } else if(filter==UCNV_SET_FILTER_2022_CN) { ++ minLength=3; + } else { + minLength=1; + } +@@ -1064,14 +1076,41 @@ + length=0; + U16_APPEND_UNSAFE(s, length, c); + ucnv_extGetUnicodeSetString( +- sharedData, cx, sa, which, minLength, ++ sharedData, cx, sa, useFallback, minLength, + c, s, length, + (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), + pErrorCode); +- } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== +- UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) && ++ } else 
if((useFallback ? ++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 : ++ ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))== ++ UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) && + UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength + ) { ++ switch(filter) { ++ case UCNV_SET_FILTER_2022_CN: ++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) { ++ continue; ++ } ++ break; ++ case UCNV_SET_FILTER_SJIS: ++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) { ++ continue; ++ } ++ break; ++ case UCNV_SET_FILTER_GR94DBCS: ++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && ++ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe-0xa1a1) && ++ (uint8_t)(value-0xa1)<=(0xfe-0xa1))) { ++ continue; ++ } ++ break; ++ default: ++ /* ++ * UCNV_SET_FILTER_NONE, ++ * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength ++ */ ++ break; ++ } + sa->add(sa->set, c); + } + } while((++c&0xf)!=0); +Index: icu/source/common/ucnv_ext.h +=================================================================== +--- icu.orig/source/common/ucnv_ext.h 2009-09-07 19:53:29.569855630 -0400 ++++ icu/source/common/ucnv_ext.h 2009-09-07 19:53:55.526354928 -0400 +@@ -382,10 +382,20 @@ + UConverterFromUnicodeArgs *pArgs, int32_t srcIndex, + UErrorCode *pErrorCode); + ++/* ++ * Add code points and strings to the set according to the extension mappings. ++ * Limitation on the UConverterSetFilter: ++ * The filters currently assume that they are used with 1:1 mappings. ++ * They only apply to single input code points, and then they pass through ++ * only mappings with single-charset-code results. ++ * For example, the Shift-JIS filter only works for 2-byte results and tests ++ * that those 2 bytes are in the JIS X 0208 range of Shift-JIS. 
++ */ + U_CFUNC void + ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, + const USetAdder *sa, + UConverterUnicodeSet which, ++ UConverterSetFilter filter, + UErrorCode *pErrorCode); + + /* toUnicode helpers -------------------------------------------------------- */ +Index: icu/source/common/ucnvhz.c +=================================================================== +--- icu.orig/source/common/ucnvhz.c 2009-09-07 19:53:29.577856139 -0400 ++++ icu/source/common/ucnvhz.c 2009-09-07 19:53:55.534354874 -0400 +@@ -1,6 +1,6 @@ + /* + ********************************************************************** +-* Copyright (C) 2000-2006, International Business Machines ++* Copyright (C) 2000-2007, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + * file name: ucnvhz.c +@@ -528,6 +528,7 @@ + sa->add(sa->set, 0x7e); + + /* add all of the code points that the sub-converter handles */ ++ /* ucnv_MBCSGetFilteredUnicodeSetForUnicode(((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, sa, which, UCNV_SET_FILTER_GR94DBCS, pErrorCode); */ + ((UConverterDataHZ*)cnv->extraInfo)-> + gbConverter->sharedData->impl-> + getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, +Index: icu/source/common/ucnv_lmb.c +=================================================================== +--- icu.orig/source/common/ucnv_lmb.c 2009-09-07 19:53:29.585855984 -0400 ++++ icu/source/common/ucnv_lmb.c 2009-09-07 19:53:55.541855559 -0400 +@@ -1,6 +1,6 @@ + /* + ********************************************************************** +-* Copyright (C) 2000-2006, International Business Machines ++* Copyright (C) 2000-2007, International Business Machines + * Corporation and others. All Rights Reserved. 
+ ********************************************************************** + * file name: ucnv_lmb.cpp +@@ -536,7 +536,7 @@ + NULL,\ + NULL,\ + _LMBCSSafeClone,\ +- _LMBCSGetUnicodeSet\ ++ ucnv_getCompleteUnicodeSet\ + };\ + static const UConverterStaticData _LMBCSStaticData##n={\ + sizeof(UConverterStaticData),\ +@@ -662,15 +662,14 @@ + return &newLMBCS->cnv; + } + +-static void +-_LMBCSGetUnicodeSet(const UConverter *cnv, +- const USetAdder *sa, +- UConverterUnicodeSet which, +- UErrorCode *pErrorCode) { +- /* all but U+F6xx, see LMBCS explanation above (search for F6xx) */ +- sa->addRange(sa->set, 0, 0xf5ff); +- sa->addRange(sa->set, 0xf700, 0x10ffff); +-} ++/* ++ * There used to be a _LMBCSGetUnicodeSet() function here (up to svn revision 20117) ++ * which added all code points except for U+F6xx ++ * because those cannot be represented in the Unicode group. ++ * However, it turns out that windows-950 has roundtrips for all of U+F6xx ++ * which means that LMBCS can convert all Unicode code points after all. ++ * We now simply use ucnv_getCompleteUnicodeSet(). ++ */ + + /* + Here's the basic helper function that we use when converting from +Index: icu/source/common/ucnvmbcs.c +=================================================================== +--- icu.orig/source/common/ucnvmbcs.c 2009-09-07 19:53:55.205856630 -0400 ++++ icu/source/common/ucnvmbcs.c 2009-09-07 19:53:55.545856037 -0400 +@@ -485,9 +485,23 @@ + + if(mbcsTable->outputType==MBCS_OUTPUT_1) { + const uint16_t *stage2, *stage3, *results; ++ uint16_t minValue; + + results=(const uint16_t *)mbcsTable->fromUnicodeBytes; + ++ /* ++ * Set a threshold variable for selecting which mappings to use. ++ * See ucnv_MBCSSingleFromBMPWithOffsets() and ++ * MBCS_SINGLE_RESULT_FROM_U() for details. 
++ */ ++ if(which==UCNV_ROUNDTRIP_SET) { ++ /* use only roundtrips */ ++ minValue=0xf00; ++ } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { ++ /* use all roundtrip and fallback results */ ++ minValue=0x800; ++ } ++ + for(st1=0; st1maxStage1) { +@@ -497,15 +511,8 @@ + /* read the stage 3 block */ + stage3=results+st3; + +- /* +- * Add code points for which the roundtrip flag is set. +- * Once we get a set for fallback mappings, we have to use +- * a threshold variable with a value of 0x800. +- * See ucnv_MBCSSingleFromBMPWithOffsets() and +- * MBCS_SINGLE_RESULT_FROM_U() for details. +- */ + do { +- if(*stage3++>=0xf00) { ++ if(*stage3++>=minValue) { + sa->add(sa->set, c); + } + } while((++c&0xf)!=0); +@@ -522,9 +529,12 @@ + const uint8_t *stage3, *bytes; + uint32_t st3Multiplier; + uint32_t value; ++ UBool useFallback; + + bytes=mbcsTable->fromUnicodeBytes; + ++ useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); ++ + switch(mbcsTable->outputType) { + case MBCS_OUTPUT_3: + case MBCS_OUTPUT_4_EUC: +@@ -551,9 +561,8 @@ + st3>>=16; + + /* +- * Add code points for which the roundtrip flag is set. +- * Once we get a set for fallback mappings, we have to check +- * non-roundtrip stage 3 results for whether they are 0. ++ * Add code points for which the roundtrip flag is set, ++ * or which map to non-zero bytes if we use fallbacks. + * See ucnv_MBCSFromUnicodeWithOffsets() for details. + */ + switch(filter) { +@@ -561,6 +570,23 @@ + do { + if(st3&1) { + sa->add(sa->set, c); ++ stage3+=st3Multiplier; ++ } else if(useFallback) { ++ uint8_t b=0; ++ switch(st3Multiplier) { ++ case 4: ++ b|=*stage3++; ++ case 3: ++ b|=*stage3++; ++ case 2: ++ b|=stage3[0]|stage3[1]; ++ stage3+=2; ++ default: ++ break; ++ } ++ if(b!=0) { ++ sa->add(sa->set, c); ++ } + } + st3>>=1; + } while((++c&0xf)!=0); +@@ -568,7 +594,7 @@ + case UCNV_SET_FILTER_DBCS_ONLY: + /* Ignore single-byte results (<0x100). 
*/ + do { +- if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) { ++ if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) { + sa->add(sa->set, c); + } + st3>>=1; +@@ -578,7 +604,7 @@ + case UCNV_SET_FILTER_2022_CN: + /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */ + do { +- if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) { ++ if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) { + sa->add(sa->set, c); + } + st3>>=1; +@@ -588,7 +614,20 @@ + case UCNV_SET_FILTER_SJIS: + /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */ + do { +- if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { ++ if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=2; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; ++ case UCNV_SET_FILTER_GR94DBCS: ++ /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). 
*/ ++ do { ++ if( ((st3&1)!=0 || useFallback) && ++ (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfefe-0xa1a1) && ++ (uint8_t)(value-0xa1)<=(0xfe-0xa1) ++ ) { + sa->add(sa->set, c); + } + st3>>=1; +@@ -609,7 +648,7 @@ + } + } + +- ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode); ++ ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode); + } + + U_CFUNC void +Index: icu/source/common/ucnvmbcs.h +=================================================================== +--- icu.orig/source/common/ucnvmbcs.h 2009-09-07 19:53:55.209856712 -0400 ++++ icu/source/common/ucnvmbcs.h 2009-09-07 19:53:55.549855663 -0400 +@@ -492,6 +492,7 @@ + UCNV_SET_FILTER_DBCS_ONLY, + UCNV_SET_FILTER_2022_CN, + UCNV_SET_FILTER_SJIS, ++ UCNV_SET_FILTER_GR94DBCS, + UCNV_SET_FILTER_COUNT + } UConverterSetFilter; + +Index: icu/source/common/ucnv_set.c +=================================================================== +--- icu.orig/source/common/ucnv_set.c 2009-09-07 19:53:29.613855682 -0400 ++++ icu/source/common/ucnv_set.c 2009-09-07 19:53:55.553856118 -0400 +@@ -1,7 +1,7 @@ + /* + ******************************************************************************* + * +-* Copyright (C) 2003-2005, International Business Machines ++* Copyright (C) 2003-2007, International Business Machines + * Corporation and others. All Rights Reserved. + * + ******************************************************************************* +@@ -52,7 +52,8 @@ + uset_add, + uset_addRange, + uset_addString, +- uset_remove ++ uset_remove, ++ uset_removeRange + }; + sa.set=setFillIn; + +Index: icu/source/common/unicode/ucnv.h +=================================================================== +--- icu.orig/source/common/unicode/ucnv.h 2009-09-07 19:53:29.621856492 -0400 ++++ icu/source/common/unicode/ucnv.h 2009-09-07 19:53:55.557855943 -0400 +@@ -870,6 +870,8 @@ + typedef enum UConverterUnicodeSet { + /** Select the set of roundtrippable Unicode code points. 
@stable ICU 2.6 */ + UCNV_ROUNDTRIP_SET, ++ /** Select the set of Unicode code points with roundtrip or fallback mappings. @draft ICU 4.0 */ ++ UCNV_ROUNDTRIP_AND_FALLBACK_SET, + /** Number of UConverterUnicodeSet selectors. @stable ICU 2.6 */ + UCNV_SET_COUNT + } UConverterUnicodeSet; +@@ -878,11 +880,16 @@ + /** + * Returns the set of Unicode code points that can be converted by an ICU converter. + * +- * The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): ++ * Returns one of several kinds of set: ++ * ++ * 1. UCNV_ROUNDTRIP_SET ++ * + * The set of all Unicode code points that can be roundtrip-converted +- * (converted without any data loss) with the converter. ++ * (converted without any data loss) with the converter (ucnv_fromUnicode()). + * This set will not include code points that have fallback mappings + * or are only the result of reverse fallback mappings. ++ * This set will also not include PUA code points with fallbacks, although ++ * ucnv_fromUnicode() always uses those mappings despite ucnv_setFallback(). + * See UTR #22 "Character Mapping Markup Language" + * at http://www.unicode.org/reports/tr22/ + * +@@ -893,6 +900,12 @@ + * by comparing its roundtrip set with the set of ExemplarCharacters from + * ICU's locale data or other sources + * ++ * 2. UCNV_ROUNDTRIP_AND_FALLBACK_SET ++ * ++ * The set of all Unicode code points that can be converted with the converter (ucnv_fromUnicode()) ++ * when fallbacks are turned on (see ucnv_setFallback()). ++ * This set includes all code points with roundtrips and fallbacks (but not reverse fallbacks). ++ * + * In the future, there may be more UConverterUnicodeSet choices to select + * sets with different properties.
+ * +Index: icu/source/common/uset_imp.h +=================================================================== +--- icu.orig/source/common/uset_imp.h 2009-09-07 19:53:29.633855953 -0400 ++++ icu/source/common/uset_imp.h 2009-09-07 19:53:55.561855997 -0400 +@@ -36,6 +36,9 @@ + typedef void U_CALLCONV + USetRemove(USet *set, UChar32 c); + ++typedef void U_CALLCONV ++USetRemoveRange(USet *set, UChar32 start, UChar32 end); ++ + /** + * Interface for adding items to a USet, to keep low-level code from + * statically depending on the USet implementation. +@@ -47,6 +50,7 @@ + USetAddRange *addRange; + USetAddString *addString; + USetRemove *remove; ++ USetRemoveRange *removeRange; + }; + typedef struct USetAdder USetAdder; + +Index: icu/source/test/intltest/convtest.cpp +=================================================================== +--- icu.orig/source/test/intltest/convtest.cpp 2009-09-07 19:53:29.641855974 -0400 ++++ icu/source/test/intltest/convtest.cpp 2009-09-07 19:53:55.565855916 -0400 +@@ -70,6 +70,7 @@ + case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break; + case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break; + case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break; ++ case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break; + default: name=""; break; //needed to end loop + } + } +@@ -465,6 +466,183 @@ + } + } + ++U_CDECL_BEGIN ++static void U_CALLCONV ++getUnicodeSetCallback(const void *context, ++ UConverterFromUnicodeArgs *fromUArgs, ++ const UChar* codeUnits, ++ int32_t length, ++ UChar32 codePoint, ++ UConverterCallbackReason reason, ++ UErrorCode *pErrorCode) { ++ if(reason<=UCNV_IRREGULAR) { ++ ((UnicodeSet *)context)->remove(codePoint); // the converter cannot convert this code point ++ *pErrorCode=U_ZERO_ERROR; // skip ++ } // else ignore the reset, close and clone calls. ++} ++U_CDECL_END ++ ++// Compare ucnv_getUnicodeSet() with the set of characters that can be converted. 
++void ++ConversionTest::TestGetUnicodeSet2() { ++ // Build a string with all code points. ++ UChar32 cpLimit; ++ int32_t s0Length; ++ if(quick) { ++ cpLimit=s0Length=0x10000; // BMP only ++ } else { ++ cpLimit=0x110000; ++ s0Length=0x10000+0x200000; // BMP + surrogate pairs ++ } ++ UChar *s0=new UChar[s0Length]; ++ if(s0==NULL) { ++ return; ++ } ++ UChar *s=s0; ++ UChar32 c; ++ UChar c2; ++ // low BMP ++ for(c=0; c<=0xd7ff; ++c) { ++ *s++=(UChar)c; ++ } ++ // trail surrogates ++ for(c=0xdc00; c<=0xdfff; ++c) { ++ *s++=(UChar)c; ++ } ++ // lead surrogates ++ // (after trails so that there is not even one surrogate pair in between) ++ for(c=0xd800; c<=0xdbff; ++c) { ++ *s++=(UChar)c; ++ } ++ // high BMP ++ for(c=0xe000; c<=0xffff; ++c) { ++ *s++=(UChar)c; ++ } ++ // supplementary code points = surrogate pairs ++ if(cpLimit==0x110000) { ++ for(c=0xd800; c<=0xdbff; ++c) { ++ for(c2=0xdc00; c2<=0xdfff; ++c2) { ++ *s++=(UChar)c; ++ *s++=c2; ++ } ++ } ++ } ++ ++ static const char *const cnvNames[]={ ++ "UTF-8", ++ "UTF-7", ++ "UTF-16", ++ "US-ASCII", ++ "ISO-8859-1", ++ "windows-1252", ++ "Shift-JIS", ++ "ibm-1390", // EBCDIC_STATEFUL table ++ "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table ++ // "HZ", TODO(markus): known bug, the set incorrectly contains [\u02CA\u02CB\u02D9\u2010\u2013\u2015...] ++ "ISO-2022-JP", ++ "JIS7", ++ "ISO-2022-CN", ++ "ISO-2022-CN-EXT", ++ "LMBCS" ++ }; ++ char buffer[1024]; ++ int32_t i; ++ for(i=0; i100) { ++ out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis)); ++ } ++ errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d", ++ cnvNames[i], which); ++ errln(out); ++ } ++ ++ // are there items that must not be in the set but are? 
++ (diffSet=set).removeAll(expected); ++ if(!diffSet.isEmpty()) { ++ diffSet.toPattern(out, TRUE); ++ if(out.length()>100) { ++ out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis)); ++ } ++ errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d", ++ cnvNames[i], which); ++ errln(out); ++ } ++ } ++ } ++ } ++ ++ delete [] s0; ++} ++ + // open testdata or ICU data converter ------------------------------------- *** + + UConverter * +Index: icu/source/test/intltest/convtest.h +=================================================================== +--- icu.orig/source/test/intltest/convtest.h 2009-09-07 19:53:29.653855668 -0400 ++++ icu/source/test/intltest/convtest.h 2009-09-07 19:53:55.565855916 -0400 +@@ -72,6 +72,7 @@ + void TestToUnicode(); + void TestFromUnicode(); + void TestGetUnicodeSet(); ++ void TestGetUnicodeSet2(); + + private: + UBool +Index: icu/source/test/testdata/conversion.txt +=================================================================== +--- icu.orig/source/test/testdata/conversion.txt 2009-09-07 19:53:55.221856140 -0400 ++++ icu/source/test/testdata/conversion.txt 2009-09-07 19:53:55.569856004 -0400 +@@ -1311,16 +1311,29 @@ + // versions of ISO-2022-JP + { + "ISO-2022-JP", +- "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]", +- "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]", ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2015\u203e\u4e00\u4e01\uffe5]", ++ "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u2014\u301c\u4e02\u4e27-\u4e29\u4fe0\u663b\u9eb5\ufa0e-\ufa2d\uff61-\uff9f\uffe4\uffe6-\U0010ffff]", + :int{0} +- } ++ } + { + "ISO-2022-JP-2", +- "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]", +- "[\x0e\x0f\x1b\uffe7-\U0010ffff]", ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uffe6]", ++ 
"[\x0e\x0f\x1b\uff61-\uff9f\uffe4\uffe7-\U0010ffff]", ++ :int{0} ++ } ++ { ++ "JIS7", ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uff61-\uff9f\uffe6]", ++ "[\x0e\x0f\x1b\uffe4\uffe7-\U0010ffff]", + :int{0} + } ++ // with fallbacks ++ { ++ "ISO-2022-JP", ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2014\u2015\u203e\u301c\u4e00\u4e01\u4fe0\u9eb5\uff61-\uff9f\uffe5]", ++ "[\x0e\x0f\x1b\xa6\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\u663b\ufa0e-\ufa2d\uffe4\uffe6-\U0010ffff]", ++ :int{1} ++ } + + // versions of ISO-2022-CN + { +@@ -1352,6 +1365,14 @@ + :int{0} + } + ++ // LMBCS ++ { ++ "LMBCS", ++ "[\x00-\U0010ffff]", ++ "[]", ++ :int{0} ++ } ++ + // extensions + { + "ibm-1390", only in patch2: unchanged: --- icu-3.8.1.orig/debian/patches/05-redhat.icu6002.patch +++ icu-3.8.1/debian/patches/05-redhat.icu6002.patch @@ -0,0 +1,412 @@ +Index: icu/source/common/ucnv_ext.c +=================================================================== +--- icu.orig/source/common/ucnv_ext.c 2009-09-07 19:53:55.518355882 -0400 ++++ icu/source/common/ucnv_ext.c 2009-09-07 19:53:55.949856455 -0400 +@@ -1036,15 +1036,13 @@ + /* enumerate the from-Unicode trie table */ + c=0; /* keep track of the current code point while enumerating */ + +- if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || +- filter==UCNV_SET_FILTER_DBCS_ONLY || +- filter==UCNV_SET_FILTER_SJIS || +- filter==UCNV_SET_FILTER_GR94DBCS ++ if(filter==UCNV_SET_FILTER_2022_CN) { ++ minLength=3; ++ } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || ++ filter!=UCNV_SET_FILTER_NONE + ) { + /* DBCS-only, ignore single-byte results */ + minLength=2; +- } else if(filter==UCNV_SET_FILTER_2022_CN) { +- minLength=3; + } else { + minLength=1; + } +@@ -1099,8 +1097,15 @@ + break; + case UCNV_SET_FILTER_GR94DBCS: + if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && +- (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe-0xa1a1) && +- 
(uint8_t)(value-0xa1)<=(0xfe-0xa1))) { ++ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe - 0xa1a1) && ++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) { ++ continue; ++ } ++ break; ++ case UCNV_SET_FILTER_HZ: ++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && ++ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) && ++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) { + continue; + } + break; +Index: icu/source/common/ucnvhz.c +=================================================================== +--- icu.orig/source/common/ucnvhz.c 2009-09-07 19:53:55.534354874 -0400 ++++ icu/source/common/ucnvhz.c 2009-09-07 19:53:55.949856455 -0400 +@@ -72,7 +72,7 @@ + cnv->extraInfo = uprv_malloc(sizeof(UConverterDataHZ)); + if(cnv->extraInfo != NULL){ + uprv_memset(cnv->extraInfo, 0, sizeof(UConverterDataHZ)); +- ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode); ++ ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("GBK",errorCode); + } + else { + *errorCode = U_MEMORY_ALLOCATION_ERROR; +@@ -141,7 +141,7 @@ + UChar *myTarget = args->target; + const char *mySourceLimit = args->sourceLimit; + UChar32 targetUniChar = 0x0000; +- UChar mySourceChar = 0x0000; ++ int32_t mySourceChar = 0x0000; + UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo); + tempBuf[0]=0; + tempBuf[1]=0; +@@ -156,90 +156,71 @@ + + mySourceChar= (unsigned char) *mySource++; + +- switch(mySourceChar){ ++ if(args->converter->mode == UCNV_TILDE) { ++ /* second byte after ~ */ ++ args->converter->mode=0; ++ switch(mySourceChar) { + case 0x0A: +- if(args->converter->mode ==UCNV_TILDE){ +- args->converter->mode=0; +- +- } +- *(myTarget++)=(UChar)mySourceChar; ++ /* no output for ~\n (line-continuation marker) */ + continue; +- + case UCNV_TILDE: +- if(args->converter->mode ==UCNV_TILDE){ +- *(myTarget++)=(UChar)mySourceChar; +- args->converter->mode=0; +- continue; +- ++ if(args->offsets) { ++ args->offsets[myTarget - 
args->target]=(int32_t)(mySource - args->source - 2); + } +- else if(args->converter->toUnicodeStatus !=0){ +- args->converter->mode=0; +- break; +- } +- else{ +- args->converter->mode = UCNV_TILDE; +- continue; +- } +- +- ++ *(myTarget++)=(UChar)mySourceChar; ++ continue; + case UCNV_OPEN_BRACE: +- if(args->converter->mode == UCNV_TILDE){ +- args->converter->mode=0; +- myData->isStateDBCS = TRUE; +- continue; +- } +- else{ +- break; +- } +- +- ++ myData->isStateDBCS = TRUE; ++ continue; + case UCNV_CLOSE_BRACE: +- if(args->converter->mode == UCNV_TILDE){ +- args->converter->mode=0; +- myData->isStateDBCS = FALSE; +- continue; +- } +- else{ +- break; +- } +- ++ myData->isStateDBCS = FALSE; ++ continue; + default: + /* if the first byte is equal to TILDE and the trail byte + * is not a valid byte then it is an error condition + */ +- if(args->converter->mode == UCNV_TILDE){ +- args->converter->mode=0; +- mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80)); +- goto SAVE_STATE; +- } +- ++ mySourceChar = 0x7e00 | mySourceChar; ++ targetUniChar = 0xffff; + break; +- +- } +- +- if(myData->isStateDBCS){ ++ } ++ } else if(myData->isStateDBCS) { + if(args->converter->toUnicodeStatus == 0x00){ +- args->converter->toUnicodeStatus = (UChar) mySourceChar; ++ /* lead byte */ ++ if(mySourceChar == UCNV_TILDE) { ++ args->converter->mode = UCNV_TILDE; ++ } else { ++ /* add another bit to distinguish a 0 byte from not having seen a lead byte */ ++ args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100); ++ } + continue; + } + else{ +- tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ; +- tempBuf[1] = (char) (mySourceChar+0x80); +- mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80)); ++ /* trail byte */ ++ uint32_t leadByte = args->converter->toUnicodeStatus & 0xff; ++ if( (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21) && ++ (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21) ++ ) { ++ 
tempBuf[0] = (char) (leadByte+0x80) ; ++ tempBuf[1] = (char) (mySourceChar+0x80); ++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, ++ tempBuf, 2, args->converter->useFallback); ++ } else { ++ targetUniChar = 0xffff; ++ } ++ /* add another bit so that the code below writes 2 bytes in case of error */ ++ mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar; + args->converter->toUnicodeStatus =0x00; +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, +- tempBuf, 2, args->converter->useFallback); + } + } + else{ +- if(args->converter->fromUnicodeStatus == 0x00){ +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData, +- mySource - 1, 1, args->converter->useFallback); +- } +- else{ +- goto SAVE_STATE; ++ if(mySourceChar == UCNV_TILDE) { ++ args->converter->mode = UCNV_TILDE; ++ continue; ++ } else if(mySourceChar <= 0x7f) { ++ targetUniChar = (UChar)mySourceChar; /* ASCII */ ++ } else { ++ targetUniChar = 0xffff; + } +- + } + if(targetUniChar < 0xfffe){ + if(args->offsets) { +@@ -248,26 +229,17 @@ + + *(myTarget++)=(UChar)targetUniChar; + } +- else if(targetUniChar>=0xfffe){ +-SAVE_STATE: ++ else /* targetUniChar>=0xfffe */ { + if(targetUniChar == 0xfffe){ + *err = U_INVALID_CHAR_FOUND; + } + else{ + *err = U_ILLEGAL_CHAR_FOUND; + } +- if(myData->isStateDBCS){ +- /* this should never occur since isStateDBCS is set to true +- * only after tempBuf[0] and tempBuf[1] +- * are set to the input .. 
just to please BEAM +- */ +- if(tempBuf[0]==0 || tempBuf[1]==0){ +- *err = U_INTERNAL_PROGRAM_ERROR; +- }else{ +- args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80); +- args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80); +- args->converter->toULength=2; +- } ++ if(mySourceChar > 0xff){ ++ args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8); ++ args->converter->toUBytes[1] = (uint8_t)mySourceChar; ++ args->converter->toULength=2; + } + else{ + args->converter->toUBytes[0] = (uint8_t)mySourceChar; +@@ -328,16 +300,21 @@ + escSeq = TILDE_ESCAPE; + CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex); + continue; +- } +- else{ ++ } else if(mySourceChar <= 0x7f) { ++ length = 1; ++ targetUniChar = mySourceChar; ++ } else { + length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData, + mySourceChar,&targetUniChar,args->converter->useFallback); +- +- } +- /* only DBCS or SBCS characters are expected*/ +- /* DB haracters with high bit set to 1 are expected */ +- if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)&& length==2)){ +- targetUniChar= missingCharMarker; ++ /* we can only use lead bytes 21..7D and trail bytes 21..7E */ ++ if( length == 2 && ++ (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) && ++ (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1) ++ ) { ++ targetUniChar -= 0x8080; ++ } else { ++ targetUniChar = missingCharMarker; ++ } + } + if (targetUniChar != missingCharMarker){ + myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF); +@@ -360,22 +337,22 @@ + + if(isTargetUCharDBCS){ + if( myTargetIndex > 8) -0x80); ++ myTarget[myTargetIndex++] =(char) (targetUniChar >> 8); + if(offsets){ + *(offsets++) = mySourceIndex-1; + } + if(myTargetIndex < targetLength){ +- myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80); ++ myTarget[myTargetIndex++] =(char) targetUniChar; + if(offsets){ + *(offsets++) = mySourceIndex-1; + } + }else{ 
+- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80); ++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar; + *err = U_BUFFER_OVERFLOW_ERROR; + } + }else{ +- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80); +- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80); ++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) (targetUniChar >> 8); ++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar; + *err = U_BUFFER_OVERFLOW_ERROR; + } + +@@ -524,15 +501,14 @@ + const USetAdder *sa, + UConverterUnicodeSet which, + UErrorCode *pErrorCode) { +- /* the tilde '~' is hardcoded in the converter */ +- sa->add(sa->set, 0x7e); ++ /* HZ converts all of ASCII */ ++ sa->addRange(sa->set, 0, 0x7f); + + /* add all of the code points that the sub-converter handles */ +- /* ucnv_MBCSGetFilteredUnicodeSetForUnicode(((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, sa, which, UCNV_SET_FILTER_GR94DBCS, pErrorCode); */ +- ((UConverterDataHZ*)cnv->extraInfo)-> +- gbConverter->sharedData->impl-> +- getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, +- sa, which, pErrorCode); ++ ucnv_MBCSGetFilteredUnicodeSetForUnicode( ++ ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, ++ sa, which, UCNV_SET_FILTER_HZ, ++ pErrorCode); + } + + static const UConverterImpl _HZImpl={ +Index: icu/source/common/ucnvmbcs.c +=================================================================== +--- icu.orig/source/common/ucnvmbcs.c 2009-09-07 19:53:55.545856037 -0400 ++++ icu/source/common/ucnvmbcs.c 2009-09-07 19:53:55.953856288 -0400 +@@ -625,8 +625,21 @@ + /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). 
*/ + do { + if( ((st3&1)!=0 || useFallback) && +- (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfefe-0xa1a1) && +- (uint8_t)(value-0xa1)<=(0xfe-0xa1) ++ (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) && ++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1) ++ ) { ++ sa->add(sa->set, c); ++ } ++ st3>>=1; ++ stage3+=2; /* +=st3Multiplier */ ++ } while((++c&0xf)!=0); ++ break; ++ case UCNV_SET_FILTER_HZ: ++ /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */ ++ do { ++ if( ((st3&1)!=0 || useFallback) && ++ (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) && ++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1) + ) { + sa->add(sa->set, c); + } +Index: icu/source/common/ucnvmbcs.h +=================================================================== +--- icu.orig/source/common/ucnvmbcs.h 2009-09-07 19:53:55.549855663 -0400 ++++ icu/source/common/ucnvmbcs.h 2009-09-07 19:53:55.957856597 -0400 +@@ -493,6 +493,7 @@ + UCNV_SET_FILTER_2022_CN, + UCNV_SET_FILTER_SJIS, + UCNV_SET_FILTER_GR94DBCS, ++ UCNV_SET_FILTER_HZ, + UCNV_SET_FILTER_COUNT + } UConverterSetFilter; + +Index: icu/source/test/cintltst/ncnvtst.c +=================================================================== +--- icu.orig/source/test/cintltst/ncnvtst.c 2009-09-07 19:53:29.421855943 -0400 ++++ icu/source/test/cintltst/ncnvtst.c 2009-09-07 19:53:55.957856597 -0400 +@@ -1928,7 +1928,7 @@ + #if !UCONFIG_NO_LEGACY_CONVERSION + { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff }, + { "windows-1251", 0, 0x7f, 0x410, 0x44f, 0x3000, 0xd7ff }, +- { "HZ", 0x410, 0x44f, 0x4e00, 0x4eff, 0xac00, 0xd7ff }, ++ /* HZ test case fixed and moved to intltest's conversion.txt, ticket #6002 */ + { "shift-jis", 0x3041, 0x3093, 0x30a1, 0x30f3, 0x900, 0x1cff } + #else + { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff } +Index: icu/source/test/intltest/convtest.cpp +=================================================================== +--- 
icu.orig/source/test/intltest/convtest.cpp 2009-09-07 19:53:55.565855916 -0400 ++++ icu/source/test/intltest/convtest.cpp 2009-09-07 19:53:55.961856043 -0400 +@@ -538,7 +538,7 @@ + "Shift-JIS", + "ibm-1390", // EBCDIC_STATEFUL table + "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table +- // "HZ", TODO(markus): known bug, the set incorrectly contains [\u02CA\u02CB\u02D9\u2010\u2013\u2015...] ++ "HZ", + "ISO-2022-JP", + "JIS7", + "ISO-2022-CN", +Index: icu/source/test/testdata/conversion.txt +=================================================================== +--- icu.orig/source/test/testdata/conversion.txt 2009-09-07 19:53:55.569856004 -0400 ++++ icu/source/test/testdata/conversion.txt 2009-09-07 19:53:55.965855898 -0400 +@@ -48,6 +48,14 @@ + toUnicode { + Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } + Cases { ++ // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e ++ { ++ "HZ", ++ :bin{ 7e7b21212120217e217f772100007e217e7d207e7e807e0a2b }, ++ "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd ~\ufffd+", ++ :intvector{ 2,4,6,8,10,12,14,18,19,21,24 }, ++ :int{1}, :int{1}, "", "?", :bin{""} ++ } + // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and + // using the Shift-JIS table for JIS X 0208 (ticket #5797) + { +@@ -1349,6 +1357,14 @@ + :int{0} + } + ++ // HZ ++ { ++ "HZ", ++ "[\u0410-\u044f\u4e00\u4e01\u4e03]", ++ "[\u4e02\u4e04-\u4e06\uac00-\ud7ff]", ++ :int{0} ++ } ++ + // DBCS-only + { + "ibm-971",