Unicode code point string class

Preliminary proposal for a C++ Unicode code point string class.

Goals

    • One single C++ class for storing and processing Unicode strings.

    • Simplest possible API: Logically UTF-32 (fixed-width sequence of code points) with random access via index.

      • Trivial iteration, no dealing with variable-width code units or ill-formed unit sequences.

      • Similar to Python 3.3 and Dart strings.

      • However, this proposed string class is designed for C++ which does not have garbage collection (that is, polymorphic, allocated/cloned objects are inconvenient and error-prone).

    • Not templatized, so that it can be used in non-templatized libraries.

    • Does not throw exceptions. Builder has isBogus().

    • Supports library implementation code with fastpaths for ASCII, BMP, etc., including "max code point" API and access to the internal storage array.

    • Optimized storage: Builder class determines whether each code point fits into a byte, char16_t or char32_t, and builds a string using an array of the smallest possible type.

      • The contents of short strings are stored directly in the object (15 Latin-1 characters, 7 BMP characters, or 4 arbitrary code points).

        • For short byte and char16_t strings, one unit at the end is used to store the string's maximum code point.

      • length() is inline, but code point access goes through a function-pointer (pseudo-vtable) indirection.

    • Every instance owns the storage for its contents.

    • The builder is a subclass, so that it can be passed directly into APIs that take a string reference.

    • Once built, strings can be cleared, copied and assigned, but not otherwise modified.

    • Only the builder subclass has append/insert/remove etc. methods.

Omissions

    • We should probably add UniString methods for conversion to UTF-8/16. The question is, to what destination type? char[]/string/u16string/ByteSink/...?

    • We should probably add UniStringBuilder methods for appending UTF-8/16.

Non-goals

    • Conversion to/from non-Unicode encodings requires conversion tables and should be done externally.

      • There could be functions that convert to/from the current system encoding. Such functions should not be string class methods.

    • Higher-level string operations — for example to-lowercase, fold-case, iteration over "user characters" and words, regex, etc. — depend on Unicode or language data and should be implemented outside the string class itself.

unistring.h

// Proposed C++ Unicode code point string class.

// Author: Markus W. Scherer (Google)

// Started: 2012-oct-17

// unistring.h

#ifndef __EXPERIMENTAL_UNISTRING_H__
#define __EXPERIMENTAL_UNISTRING_H__

#include <cstdint>

namespace experimental {

class UniString;

class UniStringBuilder;

namespace internal {

/**
 * This is UniString's pseudo-vtable.
 * It does not contain any data.
 *
 * UniString and UniStringBuilder (both friends) call the private virtual
 * functions below through UniString::impl_.
 *
 * NOTE(review): UniString::impl_ is a `const UniStringImpl *`, yet
 * releaseArray(), copyTo(), moveTo() and clear() are non-const. Because this
 * class is stateless they should probably all be const-qualified, together
 * with every override and the out-of-line definition of clear().
 *
 * NOTE(review): codePointAt(), maxCodePoint() and the array accessors receive
 * no UniString although this class holds no data; they presumably need a
 * `const UniString &` parameter to reach the storage -- confirm.
 *
 * @internal
 */
class UniStringImpl {
protected:
    // Protected (not private) so that the implementation subclasses can be
    // constructed and destroyed; with private special members their implicit
    // constructors and destructors would be unusable.
    UniStringImpl() {}

    // Virtual because this is a polymorphic base class.
    virtual ~UniStringImpl();

    // parallel to UniString::ArrayType
    // Protected so that subclasses can name the type when overriding arrayType().
    enum ArrayType { CHAR8_ARRAY, CHAR16_ARRAY, CHAR32_ARRAY };

private:
    friend class ::experimental::UniString;
    friend class ::experimental::UniStringBuilder;

    virtual void releaseArray(UniString &dest) = 0;

    virtual void copyTo(UniString &dest) = 0;

    virtual void moveTo(UniString &dest) = 0;

    // Default (non-builder) behavior: release storage and revert dest to an
    // empty string. Defined out of line in unistring.cpp, because UniString is
    // still an incomplete type at this point and its members cannot be
    // accessed from an inline body here.
    virtual void clear(UniString &dest);

    virtual char32_t codePointAt(int32_t index) const = 0;

    virtual char32_t maxCodePoint() const = 0;

    virtual ArrayType arrayType() const = 0;

    virtual const uint8_t *char8Array() const = 0;

    virtual const char16_t *char16Array() const = 0;

    virtual const char32_t *char32Array() const = 0;

    // Implementation object used by empty (default-constructed or cleared) strings.
    static const UniStringImpl *getEmpty();

    // Implementation object used by UniStringBuilder instances.
    static const UniStringImpl *getBuilder();
};

} // namespace internal

/**

* Unicode code point string class.

* This class is pseudo-polymorphic,

* with several implementations optimized for the string contents,

* but allowing stack (auto) allocation, assignment and copying.

* All instances of UniString and its builder subclass have the same size.

*

* Use the UniStringBuilder subclass to create a UniString.

* UniString itself has convenience factory methods.

*/

class UniString {

public:

enum ArrayType { CHAR8_ARRAY, CHAR16_ARRAY, CHAR32_ARRAY };

UniString() : impl_(internal::UniStringImpl::getEmpty()), length_(0) {}

~UniString() { impl_->releaseArray(*this); }

UniString(const UniString &other) {

other.impl_->copyTo(*this);

}

UniString &operator=(const UniString &other) {

impl_->releaseArray(*this);

other.impl_->copyTo(*this);

return *this;

}

// TODO: move operators

void swap(UniString &other) {

UniString temp;

impl_->moveTo(temp);

other.impl_->moveTo(*this);

temp.impl_->moveTo(other);

temp.impl_ = internal::UniStringImpl::getEmpty();

}

UniString &clear() {

impl_->clear(*this);

}

/**

* Creates a new string from ASCII characters (U+0000..U+007F).

* If the length is not given (-1) then s must be NUL-terminated.

*/

static UniString fromASCII(const char *s, int32_t length = -1) {

UniStringBuilder builder;

return builder.appendASCII(s, length).build();

}

/**

* Creates a new string from Latin-1 characters (U+0000..U+00FF).

* If the length is not given (-1) then s must be NUL-terminated.

*/

static UniString fromLatin1(const uint8_t *s, int32_t length = -1) {

UniStringBuilder builder;

return builder.appendLatin1(s, length).build();

}

/**

* Creates a new string from BMP characters (U+0000..U+FFFF).

* If the length is not given (-1) then s must be NUL-terminated.

*/

static UniString fromBMP(const char16_t *s, int32_t length = -1) {

UniStringBuilder builder;

return builder.appendBMP(s, length).build();

}

/**

* Creates a new string from Unicode characters (U+0000..U+10FFFF).

* If the length is not given (-1) then s must be NUL-terminated.

*/

static UniString fromUnicode(const char32_t *s, int32_t length = -1) {

UniStringBuilder builder;

return builder.appendUnicode(s, length).build();

}

/**

* Creates a new string with the contents of this one beginning at the start index.

*/

UniString subString(int32_t start);

/**

* Creates a new string with the [start..limit[ slice of this string.

*/

UniString slice(int32_t start, int32_t limit);

int32_t length() const { return length_; }

bool isEmpty() const { return length_ == 0; }

char32_t operator[](int32_t index) const { return impl_->codePointAt(index); }

// TODO: indexOf(), lastIndexOf(), compare(), compareSlice(), ...

/**

* Returns the maximum code point in this string.

* The returned value might be higher than the actual maximum.

* <ul>

* <li>If all code points are in ASCII (at most U+007F) then maxCodePoint()

* must be at most 0x7f.

* <li>If all code points are in Latin-1 (at most U+00FF) then maxCodePoint()

* must be at most 0xff.

* <li>If all code points are on the BMP (at most U+FFFF) then maxCodePoint()

* must be at most 0xffff.

* <li>The highest returned value is 0x10ffff.

* <li>Callers might use optimized fastpaths for one or more of these boundaries,

* and/or for one or more of U+017F, U+07FF, U+0FFF, U+33FF.

* </ul>

*/

char32_t maxCodePoint() const { return impl_->maxCodePoint(); }

/**

* @return the unit type of the internal code point array

*/

ArrayType arrayType() const { return (ArrayType)impl_->arrayType(); }

/**

* @return the array of bytes if arrayType()==CHAR8_ARRAY, otherwise NULL

*/

const char *charArray() const { return reinterpret_cast<const char *>(impl_->char8Array()); }

/**

* @return the array of bytes if arrayType()==CHAR8_ARRAY, otherwise NULL

*/

const uint8_t *char8Array() const { impl_->char8Array(); }

/**

* @return the array of char16_t if arrayType()==CHAR16_ARRAY, otherwise NULL

*/

const uint8_t *char16Array() const { impl_->char16Array(); }

/**

* @return the array of char32_t if arrayType()==CHAR32_ARRAY, otherwise NULL

*/

const uint8_t *char32Array() const { impl_->char32Array(); }

protected:

friend class internal::UniStringImpl;

friend class internal::Char8UniStringImpl;

friend class internal::HeapChar8UniStringImpl;

friend class internal::Char16UniStringImpl;

friend class internal::HeapChar16UniStringImpl;

friend class internal::Char32UniStringImpl;

friend class internal::HeapChar32UniStringImpl;

friend class internal::UniStringBuilderImpl;

UniString &clearBuilder() {

// Keep this as a builder and keep the array.

length_ = 0;

u_.f.maxCP = 0;

return *this;

}

// This class layout is intended to yield an object size of 32 bytes

// on a 64-bit machine.

// We assume that the compiler lays out the fields in the declaration order.

// offset 0

const internal::UniStringImpl *impl_;

// offset 8

int32_t length_;

char misc_[4];

// offset 16

union {

uint8_t bytes[16];

struct {

// offset 16

void *p;

// offset 24

/** Number of bytes available at p. */

int32_t capacity;

char32_t maxCp;

} f;

} u_;

// size 32

};

class UniStringBuilder : public UniString {

public:

UniStringBuilder() : impl_(internal::UniStringImpl::getBuilder()), length_(0) {

u_.f.capacity = 0;

u_.f.maxCP = 0;

setArrayType(CHAR8_ARRAY);

}

UniString &clear() { return clearBuilder(); }

/**

* Builds and returns a UniString with the current builder contents.

* Returns an empty string if isBogus() is true.

*/

UniString build() const;

ArrayType arrayType() const { return (ArrayType)misc_[3]; }

/**

* @return true if an error occurred while building the string;

* in particular, attempting to add out-of-range code points

*/

bool isBogus() const { return (bool)misc[0]; }

// TODO: Allow surrogate code points (because they are valid code points),

// or forbid them (because they are not allowed in UTF-8/16/32)?

/**

* Appends one Unicode code point (U+0000..U+10FFFF).

*/

UniStringBuilder &appendCodePoint(char32_t c);

/**

* Appends ASCII characters (U+0000..U+007F).

* If the length is not given (-1) then s must be NUL-terminated.

* Sets isBogus() if s contains a non-ASCII character.

*/

UniStringBuilder &appendASCII(const char *s, int32_t length = -1);

/**

* Appends Latin-1 characters (U+0000..U+00FF).

* If the length is not given (-1) then s must be NUL-terminated.

*/

UniStringBuilder &appendLatin1(const uint8_t *s, int32_t length = -1);

/**

* Appends BMP characters (U+0000..U+FFFF).

* If the length is not given (-1) then s must be NUL-terminated.

*/

UniStringBuilder &appendBMP(const char16_t *s, int32_t length = -1);

/**

* Appends Unicode code points (U+0000..U+10FFFF).

* If the length is not given (-1) then s must be NUL-terminated.

* Sets isBogus() if s contains an out-of-range value.

*/

UniStringBuilder &appendUnicode(const char32_t *s, int32_t length = -1);

/**

* Appends the src string's contents.

* If the start index is given, then only the substring from there is appended.

*/

UniStringBuilder &append(const UniString &src, int32_t start = 0) {

return appendSlice(src, start, src.length();

}

/**

* Appends the [start..limit[ slice of the src string.

*/

UniStringBuilder &appendSlice(const UniString &src, int32_t start, int32_t limit);

UniStringBuilder &insert(int32_t pos, const UniString &src, int32_t start = 0) {

return insertSlice(pos, src, start, src.length();

}

UniStringBuilder &insertSlice(int32_t pos, const UniString &src, int32_t start, int32_t limit);

UniStringBuilder &removeSlice(int32_t start, int32_t limit);

private:

void setArrayType(ArrayType t) { misc_[3] = t; }

void setToBogus() {

length_ = 0;

misc_[0] = 1;

}

/**

* maxCp gets new code points ORed in,

* but when a slice is removed, we delay computing maxCp until

* maxCodePoint() or build() is called.

*/

bool maxCpMayBeTooHigh() const { return (bool)misc_[1]; }

void setMaxCpMayBeTooHigh(bool value) {

misc_[1] = (uint8_t)value;

}

};

} // namespace experimental

#endif // __EXPERIMENTAL_UNISTRING_H__

unistring.cpp

// Proposed C++ Unicode code point string class.

// Author: Markus W. Scherer (Google)

// Started: 2012-oct-17

// unistring.cpp

#include "unistring.h"

namespace experimental {

namespace internal {

// Nothing to clean up: the pseudo-vtable class does not contain any data.
UniStringImpl::~UniStringImpl() = default;

/**
 * Default clear() for non-builder implementations:
 * releases dest's storage and reverts dest to an empty string.
 *
 * The `virtual` specifier belongs only on the declaration inside the class;
 * repeating it on an out-of-class definition is ill-formed.
 */
void UniStringImpl::clear(UniString &dest) {
    // Non-builder implementation: Release storage, revert to an empty string.
    releaseArray(dest);
    dest.impl_ = getEmpty();
    dest.length_ = 0;
}

// Stub: nothing is implemented yet, so the pure virtual functions declared by
// UniStringImpl are still missing and the class cannot be instantiated.
// Judging by the name, presumably the implementation for strings whose 8-bit
// units are stored directly inside the UniString object -- TODO confirm.
class Char8UniStringImpl : public UniStringImpl {

// TODO

};

// Stub: nothing is implemented yet beyond deriving from Char8UniStringImpl.
// Judging by the name, presumably the variant for 8-bit-unit strings whose
// array is allocated on the heap -- TODO confirm.
class HeapChar8UniStringImpl : public Char8UniStringImpl {

// TODO

};

// Stub: nothing is implemented yet, so the pure virtual functions declared by
// UniStringImpl are still missing and the class cannot be instantiated.
// Judging by the name, presumably the implementation for strings whose
// char16_t units are stored directly inside the UniString object -- TODO confirm.
class Char16UniStringImpl : public UniStringImpl {

// TODO

};

// Stub: nothing is implemented yet beyond deriving from Char16UniStringImpl.
// Judging by the name, presumably the variant for char16_t-unit strings whose
// array is allocated on the heap -- TODO confirm.
class HeapChar16UniStringImpl : public Char16UniStringImpl {

// TODO

};

// Stub: nothing is implemented yet, so the pure virtual functions declared by
// UniStringImpl are still missing and the class cannot be instantiated.
// Judging by the name, presumably the implementation for strings whose
// char32_t units are stored directly inside the UniString object -- TODO confirm.
class Char32UniStringImpl : public UniStringImpl {

// TODO

};

// Stub: nothing is implemented yet beyond deriving from Char32UniStringImpl.
// Judging by the name, presumably the variant for char32_t-unit strings whose
// array is allocated on the heap -- TODO confirm.
class HeapChar32UniStringImpl : public Char32UniStringImpl {

// TODO

};

/**
 * Pseudo-vtable implementation for builder objects.
 * Only clear() is provided so far; the other UniStringImpl functions are
 * still to be written.
 */
class UniStringBuilderImpl : public UniStringImpl {
    // TODO

    // Clearing a builder delegates to UniString::clearBuilder(), which keeps
    // the object a builder and keeps its array, unlike the base-class clear()
    // that releases storage and reverts to an empty string.
    virtual void clear(UniString &dest) {
        dest.clearBuilder();
    }
};

namespace {

// Each getter returns a pointer to one shared implementation object.
// The objects are function-local statics, so they are created on first use
// and stay alive for the rest of the program. Returning the address of an
// ordinary local variable would hand out a dangling pointer, because the
// object is destroyed as soon as the getter returns.

const UniStringImpl *getChar8() {
    static Char8UniStringImpl impl;
    return &impl;
}

const UniStringImpl *getHeapChar8() {
    static HeapChar8UniStringImpl impl;
    return &impl;
}

const UniStringImpl *getChar16() {
    static Char16UniStringImpl impl;
    return &impl;
}

const UniStringImpl *getHeapChar16() {
    static HeapChar16UniStringImpl impl;
    return &impl;
}

const UniStringImpl *getChar32() {
    static Char32UniStringImpl impl;
    return &impl;
}

const UniStringImpl *getHeapChar32() {
    static HeapChar32UniStringImpl impl;
    return &impl;
}

}  // namespace

// Empty strings use the same implementation object that getChar8() returns.
const UniStringImpl *UniStringImpl::getEmpty() {
    const UniStringImpl *const emptyImpl = getChar8();
    return emptyImpl;
}

// Returns the shared implementation object for builders.
// The object is a function-local static so that the returned pointer stays
// valid after the call; the address of an ordinary local would dangle.
const UniStringImpl *UniStringImpl::getBuilder() {
    static UniStringBuilderImpl impl;
    return &impl;
}

} // namespace internal

} // namespace experimental