C++中各种string的相互转化 - c++编程基础

　　ch -= halfBase;

　　*target++ = （UTF16）（（ch 》 halfShift） + UNI_SUR_HIGH_START）；

　　*target++ = （UTF16）（（ch & halfMask） + UNI_SUR_LOW_START）；

　　}

　　*sourceStart = source;

　　*targetStart = target;

　　return result;

　　}

　　/* --------------------------------------------------------------------- */

　　ConversionResult ConvertUTF16toUTF32 （

　　const UTF16** sourceStart, const UTF16* sourceEnd,

　　UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags） {

　　ConversionResult result = conversionOK;

　　const UTF16* source = *sourceStart;

　　UTF32* target = *targetStart;

　　UTF32 ch, ch2;

　　while （source < sourceEnd） {

　　const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */

　　ch = *source++;

　　/* If we have a surrogate pair, convert to UTF32 first. */

　　if （ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END） {

　　/* If the 16 bits following the high surrogate are in the source buffer */

　　if （source < sourceEnd） {

　　ch2 = *source;

　　/* If it's a low surrogate, convert to UTF32. */

　　if （ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END） {

　　ch = （（ch - UNI_SUR_HIGH_START）《 halfShift）

　　+ （ch2 - UNI_SUR_LOW_START） + halfBase;

　　++source;

　　} else if （flags == strictConversion） { /* it's an unpaired high surrogate */

　　--source; /* return to the illegal value itself */

　　result = sourceIllegal;

　　break;

　　}

　　} else { /* We don't have the 16 bits following the high surrogate. */

　　--source; /* return to the high surrogate */

　　result = sourceExhausted;

　　break;

　　}

　　} else if （flags == strictConversion） {

　　/* UTF-16 surrogate values are illegal in UTF-32 */

　　if （ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END） {

　　--source; /* return to the illegal value itself */

　　result = sourceIllegal;

　　break;

　　}

　　if （target >= targetEnd） {

　　source = oldSource; /* Back up source pointer! */

　　result = targetExhausted; break;

　　}

　　*target++ = ch;

　　}

　　*sourceStart = source;

　　*targetStart = target;

　　#ifdef CVTUTF_DEBUG

　　if （result == sourceIllegal） {

　　fprintf（stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2）；

　　fflush（stderr）；

　　}

　　#endif

　　return result;

　　}

　　/* --------------------------------------------------------------------- */

　　* Index into the table below with the first byte of a UTF-8 sequence to

　　* get the number of trailing bytes that are supposed to follow it.

　　* Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is

　　* left as-is for anyone who may want to do such conversion, which was

　　* allowed in earlier algorithms.

　　static const char trailingBytesForUTF8[256] = {

　　0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,

　　1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

　　2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5

　　};

　　* Magic values subtracted from a buffer value during UTF8 conversion.

　　* This table contains as many values as there might be trailing bytes

　　* in a UTF-8 sequence.

　　static const UTF32 offsetsFromUTF8 = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,

　　0x03C82080UL, 0xFA082080UL, 0x82082080UL };

　　* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed

　　* into the first byte, depending on how many bytes follow. There are

　　* as many entries in this table as there are UTF-8 sequence types.

　　* （I.e., one byte sequence, two byte etc.）。 Remember that sequencs

　　* for *legal* UTF-8 will be 4 or fewer bytes total.

　　static const UTF8 firstByteMark = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

　　/* --------------------------------------------------------------------- */

  /* The interface converts a whole buffer to avoid function-call overhead.
   * Constants have been gathered. Loops & conditionals have been removed as
   * much as possible for efficiency, in favor of drop-through switches.
   * (See "Note A" at the bottom of the file for equivalent code.)
   * If your compiler supports it, the "isLegalUTF8" call can be turned
   * into an inline function.
   */

  /* --------------------------------------------------------------------- */

　　ConversionResult ConvertUTF16toUTF8 （

　　const UTF16** sourceStart, const UTF16* sourceEnd,

　　UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags） {

　　ConversionResult result = conversionOK;

　　const UTF16* source = *sourceStart;

　　UTF8* target = *targetStart;

　　while （source < sourceEnd） {

　　UTF32 ch;

　　unsigned short bytesToWrite = 0;

　　const UTF32 byteMask = 0xBF;

　　const UTF32 byteMark = 0x80;

　　const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */

　　ch = *source++;

　　/* If we have a surrogate pair, convert to UTF32 first. */

　　if （ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END） {

　　/* If the 16 bits following the high surrogate are in the source buffer */

　　if （source < sourceEnd） {

　　UTF32 ch2 = *source;

　　/* If it's a low surrogate, convert to UTF32. */

　　if （ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END） {

　　ch = （（ch - UNI_SUR_HIGH_START）《 halfShift）

　　+ （ch2 - UNI_SUR_LOW_START） + halfBase;

　　++source;

　　} else if （flags == strictConversion） { /* it's an unpaired high surrogate */

　　--source; /* return to the illegal value itself */

　　result = sourceIllegal;

　　break;

　　}

　　} else { /* We don't have the 16 bits following the high surrogate. */

　　--source; /* return to the high surrogate */

　　result = sourceExhausted;

　　break;

　　}

　　} else if （flags == strictConversion） {

　　/* UTF-16 surrogate values are illegal in UTF-32 */

　　if （ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END） {

　　--source; /* return to the illegal value itself */

　　result = sourceIllegal;

　　break;

　　}

　　/* Figure out how many bytes the result will require */

　　if （ch < （UTF32）0x80） { bytesToWrite = 1;

　　} else if （ch < （UTF32）0x800） { bytesToWrite = 2;

　　} else if （ch < （UTF32）0x10000） { bytesToWrite = 3;

　　} else if （ch < （UTF32）0x110000） { bytesToWrite = 4;

　　} else { bytesToWrite = 3;

　　ch = UNI_REPLACEMENT_CHAR;

　　}

　　target += bytesToWrite;

　　if （target > targetEnd） {

　　source = oldSource; /* Back up source pointer! */

　　target -= bytesToWrite; result = targetExhausted; break;

　　}

　　switch （bytesToWrite） { /* note: everything falls through. */

　　case 4: *--target = （UTF8）（（ch | byteMark） & byteMask）； ch 》= 6;

　　case 3: *--target = （UTF8）（（ch | byteMark） & byteMask）； ch 》= 6;

　　case 2: *--target = （UTF8）（（ch | byteMark） & byteMask）； ch 》= 6;

　　case 1: *--target = （UTF8）（ch | firstByteMark[bytesToWrite]）；

　　}

　　target += bytesToWrite;

　　}

　　*sourceStart = source;

　　*targetStart = target;

　　return result;

　　}

　　/* --------------------------------------------------------------------- */

　　* Utility routine to tell whether a sequence of bytes is legal UTF-8.

　　* This must be called with the length pre-determined by the first byte.

　　* If not calling this from ConvertUTF8to*, then the length can be set by:

　　* length = trailingBytesForUTF8[*source]+1;

　　* and the sequence is illegal right away if there aren't that many bytes

　　* available.

　　* If presented with a length > 4, this returns false. The Unicode

　　* definition of UTF-8 goes up to 4-byte sequences.

　　static Boolean isLegalUTF8（const UTF8 *source, int length） {

　　UTF8 a;

　　const UTF8 *srcptr = source+length;

　　switch （length） {

　　default: return false;

　　/* Everything else falls through when "true" */

　　case 4: if （（a = （*--srcptr）） < 0x80 || a > 0xBF） return false;

　　case 3: if （（a = （*--srcptr）） < 0x80 || a > 0xBF） return false;

　　case 2: if （（a = （*--srcptr）） > 0xBF） return false;

　　switch （*source） {

　　/* no fall-through in this inner switch */

　　case 0xE0: if （a < 0xA0） return false; break;

　　case 0xED: if （a > 0x9F） return false; break;

　　case 0xF0: if （a < 0x90） return false; break;

　　case 0xF4: if （a > 0x8F） return false; break;

　　default: if （a < 0x80） return false;

　　}

　　case 1: if （*source >= 0x80 && *source < 0xC2） return false;

　　}

　　if （*source > 0xF4） return false;

　　return true;

　　}

　　/* --------------------------------------------------------------------- */

　　* Exported function to return whether a UTF-8 sequence is legal or not.

　　* This is not used here; it's just exported.

　　Boolean isLegalUTF8Sequence（const UTF8 *source, const UTF8 *sourceEnd） {

　　int length = trailingBytesForUTF8[*source]+1;

　　if （source+length > sourceEnd） {

　　return false;

　　}

　　return isLegalUTF8（source, length）；

　　}

　　/* --------------------------------------------------------------------- */

C++中各种string的相互转化(三)