From 328027bf269bb0c108bd8533908ccb36ba11e9f0 Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Wed, 30 Jan 2013 12:05:05 +0000 Subject: Move UTF conversion routines from clang/lib/Basic to llvm/lib/Support This is required to use them in TableGen. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@173923 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Support/ConvertUTF.h | 230 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 include/llvm/Support/ConvertUTF.h (limited to 'include/llvm/Support/ConvertUTF.h') diff --git a/include/llvm/Support/ConvertUTF.h b/include/llvm/Support/ConvertUTF.h new file mode 100644 index 0000000000..f2dc7ea33d --- /dev/null +++ b/include/llvm/Support/ConvertUTF.h @@ -0,0 +1,230 @@ +/*===--- ConvertUTF.h - Universal Character Names conversions ---------------=== + * + * The LLVM Compiler Infrastructure + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + * + *==------------------------------------------------------------------------==*/ +/* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + +/* --------------------------------------------------------------------- + + Conversions between UTF32, UTF-16, and UTF-8. Header file. + + Several funtions are included here, forming a complete set of + conversions between the three formats. UTF-7 is not included + here, but is handled in a separate source file. + + Each of these routines takes pointers to input buffers and output + buffers. The input buffers are const. + + Each routine converts the text between *sourceStart and sourceEnd, + putting the result into the buffer between *targetStart and + targetEnd. Note: the end pointers are *after* the last item: e.g. + *(sourceEnd - 1) is the last item. + + The return result indicates whether the conversion was successful, + and if not, whether the problem was in the source or target buffers. + (Only the first encountered problem is indicated.) + + After the conversion, *sourceStart and *targetStart are both + updated to point to the end of last text successfully converted in + the respective buffers. + + Input parameters: + sourceStart - pointer to a pointer to the source buffer. + The contents of this are modified on return so that + it points at the next thing to be converted. + targetStart - similarly, pointer to pointer to the target buffer. + sourceEnd, targetEnd - respectively pointers to the ends of the + two buffers, for overflow checking only. + + These conversion functions take a ConversionFlags argument. When this + flag is set to strict, both irregular sequences and isolated surrogates + will cause an error. When the flag is set to lenient, both irregular + sequences and isolated surrogates are converted. + + Whether the flag is strict or lenient, all illegal sequences will cause + an error return. This includes sequences such as: , , + or in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code + must check for illegal sequences. + + When the flag is set to lenient, characters over 0x10FFFF are converted + to the replacement character; otherwise (when the flag is set to strict) + they constitute an error. + + Output parameters: + The value "sourceIllegal" is returned from some routines if the input + sequence is malformed. When "sourceIllegal" is returned, the source + value will point to the illegal value that caused the problem. E.g., + in UTF-8 when a sequence is malformed, it points to the start of the + malformed sequence. + + Author: Mark E. Davis, 1994. + Rev History: Rick McGowan, fixes & updates May 2001. + Fixes & updates, Sept 2001. + +------------------------------------------------------------------------ */ + +#ifndef CLANG_BASIC_CONVERTUTF_H +#define CLANG_BASIC_CONVERTUTF_H + +/* --------------------------------------------------------------------- + The following 4 definitions are compiler-specific. + The C standard does not guarantee that wchar_t has at least + 16 bits, so wchar_t is no less portable than unsigned short! + All should be unsigned values to avoid sign extension during + bit mask & shift operations. +------------------------------------------------------------------------ */ + +typedef unsigned int UTF32; /* at least 32 bits */ +typedef unsigned short UTF16; /* at least 16 bits */ +typedef unsigned char UTF8; /* typically 8 bits */ +typedef unsigned char Boolean; /* 0 or 1 */ + +/* Some fundamental constants */ +#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD +#define UNI_MAX_BMP (UTF32)0x0000FFFF +#define UNI_MAX_UTF16 (UTF32)0x0010FFFF +#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF +#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF + +#define UNI_MAX_UTF8_BYTES_PER_CODE_POINT 4 + +typedef enum { + conversionOK, /* conversion successful */ + sourceExhausted, /* partial character in source, but hit end */ + targetExhausted, /* insuff. room in target for conversion */ + sourceIllegal /* source sequence is illegal/malformed */ +} ConversionResult; + +typedef enum { + strictConversion = 0, + lenientConversion +} ConversionFlags; + +/* This is for C++ and does no harm in C */ +#ifdef __cplusplus +extern "C" { +#endif + +ConversionResult ConvertUTF8toUTF16 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF8toUTF32 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); + +#ifdef CLANG_NEEDS_THESE_ONE_DAY +ConversionResult ConvertUTF16toUTF8 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); +#endif + +ConversionResult ConvertUTF32toUTF8 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF16toUTF32 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF32toUTF16 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); + +Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd); + +Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd); + +unsigned getNumBytesForUTF8(UTF8 firstByte); + +#ifdef __cplusplus +} + +/*************************************************************************/ +/* Below are LLVM-specific wrappers of the functions above. */ + +#include "llvm/ADT/StringRef.h" + +namespace llvm { + +/** + * Convert an UTF8 StringRef to UTF8, UTF16, or UTF32 depending on + * WideCharWidth. The converted data is written to ResultPtr, which needs to + * point to at least WideCharWidth * (Source.Size() + 1) bytes. On success, + * ResultPtr will point one after the end of the copied string. On failure, + * ResultPtr will not be changed, and ErrorPtr will be set to the location of + * the first character which could not be converted. + * \return true on success. + */ +bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source, + char *&ResultPtr, const UTF8 *&ErrorPtr); + +/** + * Convert an Unicode code point to UTF8 sequence. + * + * \param Source a Unicode code point. + * \param [in,out] ResultPtr pointer to the output buffer, needs to be at least + * \c UNI_MAX_UTF8_BYTES_PER_CODE_POINT bytes. On success \c ResultPtr is + * updated one past end of the converted sequence. + * + * \returns true on success. + */ +bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr); + +/** + * Convert the first UTF8 sequence in the given source buffer to a UTF32 + * code point. + * + * \param [in,out] source A pointer to the source buffer. If the conversion + * succeeds, this pointer will be updated to point to the byte just past the + * end of the converted sequence. + * \param sourceEnd A pointer just past the end of the source buffer. + * \param [out] target The converted code + * \param flags Whether the conversion is strict or lenient. + * + * \returns conversionOK on success + * + * \sa ConvertUTF8toUTF32 + */ +static inline ConversionResult convertUTF8Sequence(const UTF8 **source, + const UTF8 *sourceEnd, + UTF32 *target, + ConversionFlags flags) { + if (*source == sourceEnd) + return sourceExhausted; + unsigned size = getNumBytesForUTF8(**source); + if ((ptrdiff_t)size > sourceEnd - *source) + return sourceExhausted; + return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags); +} +} // end namespace llvm + +#endif + +/* --------------------------------------------------------------------- */ + +#endif -- cgit v1.2.3