Unicode code point string class
Preliminary proposal for a C++ Unicode code point string class.
Goals
One single C++ class for storing and processing Unicode strings.
Simplest possible API: Logically UTF-32 (fixed-width sequence of code points) with random access via index.
Trivial iteration, no dealing with variable-width code units or ill-formed unit sequences.
Similar to Python 3.3 and Dart strings.
However, this proposed string class is designed for C++ which does not have garbage collection (that is, polymorphic, allocated/cloned objects are inconvenient and error-prone).
Not templatized, so that it can be used in non-templatized libraries.
Does not throw exceptions; instead, the builder reports errors via isBogus().
Supports library implementation code with fastpaths for ASCII, BMP, etc., including "max code point" API and access to the internal storage array.
Optimized storage: Builder class determines whether each code point fits into a byte, char16_t or char32_t, and builds a string using an array of the smallest possible type.
The contents of short strings are stored directly in the object (15 Latin-1 characters, 7 BMP characters, or 4 arbitrary code points).
For short byte and char16_t strings, one unit at the end is used to store the maximum string code point.
length() is inline, but code point access goes through a function-pointer indirection.
Every instance owns the storage for its contents.
The builder is a subclass, so that it can be passed directly into APIs that take a string reference.
Once built, strings can be cleared, copied and assigned, but not otherwise modified.
Only the builder subclass has append/insert/remove etc. methods.
Omissions
We should probably add UniString methods for conversion to UTF-8/16. The question is, to what destination type? char[]/string/u16string/ByteSink/...?
We should probably add UniStringBuilder methods for appending UTF-8/16.
Non-goals
Conversion to/from non-Unicode encodings requires conversion tables and should be done externally.
There could be functions that convert to/from the current system encoding. Such functions should not be string class methods.
Higher-level string operations — for example to-lowercase, fold-case, iteration over "user characters" and words, regex, etc. — depend on Unicode or language data and should be implemented outside the string class itself.
unistring.h
// Proposed C++ Unicode code point string class.
// Author: Markus W. Scherer (Google)
// Started: 2012-oct-17
// unistring.h
#ifndef __EXPERIMENTAL_UNISTRING_H__
#define __EXPERIMENTAL_UNISTRING_H__
#include <stdint.h>  // int32_t, uint8_t
namespace experimental {
class UniString;
class UniStringBuilder;
namespace internal {
/**
 * This is UniString's pseudo-vtable.
 * It does not contain any data.
 *
 * UniString and UniStringBuilder (both friends) dispatch through a pointer to
 * one of the concrete subclasses, which are defined in unistring.cpp.
 * The constructor, the destructor, ArrayType and the virtual methods are
 * protected rather than private so that those subclasses can be constructed
 * and destroyed, and can name ArrayType in their overrides.
 *
 * NOTE(review): UniString stores a `const UniStringImpl *`, but
 * releaseArray(), copyTo(), moveTo() and clear() are not const-qualified, so
 * they cannot be called through that pointer as declared. They probably need
 * to become const together with every override and out-of-line definition
 * -- confirm.
 * NOTE(review): the read accessors (codePointAt() through char32Array()) take
 * no UniString argument even though this class holds no data; presumably they
 * need a `const UniString &` parameter -- confirm when implementing.
 * @internal
 */
class UniStringImpl {
protected:
  // parallel to UniString::ArrayType
  enum ArrayType { CHAR8_ARRAY, CHAR16_ARRAY, CHAR32_ARRAY };

  UniStringImpl() {}
  // Virtual because this is a polymorphic base class; defined in unistring.cpp.
  virtual ~UniStringImpl();

  /** Releases the storage array owned by dest. */
  virtual void releaseArray(UniString &dest) = 0;
  /** Copies a string into dest. */
  virtual void copyTo(UniString &dest) = 0;
  /** Moves a string into dest. */
  virtual void moveTo(UniString &dest) = 0;
  /**
   * Clears dest.
   * Only declared here: the default implementation touches UniString members,
   * and UniString is still an incomplete type at this point, so the body
   * lives in unistring.cpp.
   */
  virtual void clear(UniString &dest);

  /** @return the code point at the given index */
  virtual char32_t codePointAt(int32_t index) const = 0;
  /** @return the maximum code point; see UniString::maxCodePoint() */
  virtual char32_t maxCodePoint() const = 0;
  /** @return the unit type of the internal code point array */
  virtual ArrayType arrayType() const = 0;
  virtual const uint8_t *char8Array() const = 0;
  virtual const char16_t *char16Array() const = 0;
  virtual const char32_t *char32Array() const = 0;

  /** @return the implementation that UniString uses for an empty string */
  static const UniStringImpl *getEmpty();
  /** @return the implementation that UniStringBuilder uses */
  static const UniStringImpl *getBuilder();

private:
  friend class ::experimental::UniString;
  friend class ::experimental::UniStringBuilder;
};
} // namespace internal
/**
* Unicode code point string class.
* This class is pseudo-polymorphic,
* with several implementations optimized for the string contents,
* but allowing stack (auto) allocation, assignment and copying.
* All instances of UniString and its builder subclass have the same size.
*
* Use the UniStringBuilder subclass to create a UniString.
* UniString itself has convenience factory methods.
*/
class UniString {
public:
enum ArrayType { CHAR8_ARRAY, CHAR16_ARRAY, CHAR32_ARRAY };
UniString() : impl_(internal::UniStringImpl::getEmpty()), length_(0) {}
~UniString() { impl_->releaseArray(*this); }
UniString(const UniString &other) {
other.impl_->copyTo(*this);
}
UniString &operator=(const UniString &other) {
impl_->releaseArray(*this);
other.impl_->copyTo(*this);
return *this;
}
// TODO: move operators
void swap(UniString &other) {
UniString temp;
impl_->moveTo(temp);
other.impl_->moveTo(*this);
temp.impl_->moveTo(other);
temp.impl_ = internal::UniStringImpl::getEmpty();
}
UniString &clear() {
impl_->clear(*this);
}
/**
* Creates a new string from ASCII characters (U+0000..U+007F).
* If the length is not given (-1) then s must be NUL-terminated.
*/
static UniString fromASCII(const char *s, int32_t length = -1) {
UniStringBuilder builder;
return builder.appendASCII(s, length).build();
}
/**
* Creates a new string from Latin-1 characters (U+0000..U+00FF).
* If the length is not given (-1) then s must be NUL-terminated.
*/
static UniString fromLatin1(const uint8_t *s, int32_t length = -1) {
UniStringBuilder builder;
return builder.appendLatin1(s, length).build();
}
/**
* Creates a new string from BMP characters (U+0000..U+FFFF).
* If the length is not given (-1) then s must be NUL-terminated.
*/
static UniString fromBMP(const char16_t *s, int32_t length = -1) {
UniStringBuilder builder;
return builder.appendBMP(s, length).build();
}
/**
* Creates a new string from Unicode characters (U+0000..U+10FFFF).
* If the length is not given (-1) then s must be NUL-terminated.
*/
static UniString fromUnicode(const char32_t *s, int32_t length = -1) {
UniStringBuilder builder;
return builder.appendUnicode(s, length).build();
}
/**
* Creates a new string with the contents of this one beginning at the start index.
*/
UniString subString(int32_t start);
/**
* Creates a new string with the [start..limit[ slice of this string.
*/
UniString slice(int32_t start, int32_t limit);
int32_t length() const { return length_; }
bool isEmpty() const { return length_ == 0; }
char32_t operator[](int32_t index) const { return impl_->codePointAt(index); }
// TODO: indexOf(), lastIndexOf(), compare(), compareSlice(), ...
/**
* Returns the maximum code point in this string.
* The returned value might be higher than the actual maximum.
* <ul>
* <li>If all code points are in ASCII (at most U+007F) then maxCodePoint()
* must be at most 0x7f.
* <li>If all code points are in Latin-1 (at most U+00FF) then maxCodePoint()
* must be at most 0xff.
* <li>If all code points are on the BMP (at most U+FFFF) then maxCodePoint()
* must be at most 0xffff.
* <li>The highest returned value is 0x10ffff.
* <li>Callers might use optimized fastpaths for one or more of these boundaries,
* and/or for one or more of U+017F, U+07FF, U+0FFF, U+33FF.
* </ul>
*/
char32_t maxCodePoint() const { return impl_->maxCodePoint(); }
/**
* @return the unit type of the internal code point array
*/
ArrayType arrayType() const { return (ArrayType)impl_->arrayType(); }
/**
* @return the array of bytes if arrayType()==CHAR8_ARRAY, otherwise NULL
*/
const char *charArray() const { return reinterpret_cast<const char *>(impl_->char8Array()); }
/**
* @return the array of bytes if arrayType()==CHAR8_ARRAY, otherwise NULL
*/
const uint8_t *char8Array() const { impl_->char8Array(); }
/**
* @return the array of char16_t if arrayType()==CHAR16_ARRAY, otherwise NULL
*/
const uint8_t *char16Array() const { impl_->char16Array(); }
/**
* @return the array of char32_t if arrayType()==CHAR32_ARRAY, otherwise NULL
*/
const uint8_t *char32Array() const { impl_->char32Array(); }
protected:
friend class internal::UniStringImpl;
friend class internal::Char8UniStringImpl;
friend class internal::HeapChar8UniStringImpl;
friend class internal::Char16UniStringImpl;
friend class internal::HeapChar16UniStringImpl;
friend class internal::Char32UniStringImpl;
friend class internal::HeapChar32UniStringImpl;
friend class internal::UniStringBuilderImpl;
UniString &clearBuilder() {
// Keep this as a builder and keep the array.
length_ = 0;
u_.f.maxCP = 0;
return *this;
}
// This class layout is intended to yield an object size of 32 bytes
// on a 64-bit machine.
// We assume that the compiler lays out the fields in the declaration order.
// offset 0
const internal::UniStringImpl *impl_;
// offset 8
int32_t length_;
char misc_[4];
// offset 16
union {
uint8_t bytes[16];
struct {
// offset 16
void *p;
// offset 24
/** Number of bytes available at p. */
int32_t capacity;
char32_t maxCp;
} f;
} u_;
// size 32
};
class UniStringBuilder : public UniString {
public:
UniStringBuilder() : impl_(internal::UniStringImpl::getBuilder()), length_(0) {
u_.f.capacity = 0;
u_.f.maxCP = 0;
setArrayType(CHAR8_ARRAY);
}
UniString &clear() { return clearBuilder(); }
/**
* Builds and returns a UniString with the current builder contents.
* Returns an empty string if isBogus() is true.
*/
UniString build() const;
ArrayType arrayType() const { return (ArrayType)misc_[3]; }
/**
* @return true if an error occurred while building the string;
* in particular, attempting to add out-of-range code points
*/
bool isBogus() const { return (bool)misc[0]; }
// TODO: Allow surrogate code points (because they are valid code points),
// or forbid them (because they are not allowed in UTF-8/16/32)?
/**
* Appends one Unicode code point (U+0000..U+10FFFF).
*/
UniStringBuilder &appendCodePoint(char32_t c);
/**
* Appends ASCII characters (U+0000..U+007F).
* If the length is not given (-1) then s must be NUL-terminated.
* Sets isBogus() if s contains a non-ASCII character.
*/
UniStringBuilder &appendASCII(const char *s, int32_t length = -1);
/**
* Appends Latin-1 characters (U+0000..U+00FF).
* If the length is not given (-1) then s must be NUL-terminated.
*/
UniStringBuilder &appendLatin1(const uint8_t *s, int32_t length = -1);
/**
* Appends BMP characters (U+0000..U+FFFF).
* If the length is not given (-1) then s must be NUL-terminated.
*/
UniStringBuilder &appendBMP(const char16_t *s, int32_t length = -1);
/**
* Appends Unicode code points (U+0000..U+10FFFF).
* If the length is not given (-1) then s must be NUL-terminated.
* Sets isBogus() if s contains an out-of-range value.
*/
UniStringBuilder &appendUnicode(const char32_t *s, int32_t length = -1);
/**
* Appends the src string's contents.
* If the start index is given, then only the substring from there is appended.
*/
UniStringBuilder &append(const UniString &src, int32_t start = 0) {
return appendSlice(src, start, src.length();
}
/**
* Appends the [start..limit[ slice of the src string.
*/
UniStringBuilder &appendSlice(const UniString &src, int32_t start, int32_t limit);
UniStringBuilder &insert(int32_t pos, const UniString &src, int32_t start = 0) {
return insertSlice(pos, src, start, src.length();
}
UniStringBuilder &insertSlice(int32_t pos, const UniString &src, int32_t start, int32_t limit);
UniStringBuilder &removeSlice(int32_t start, int32_t limit);
private:
void setArrayType(ArrayType t) { misc_[3] = t; }
void setToBogus() {
length_ = 0;
misc_[0] = 1;
}
/**
* maxCp gets new code points ORed in,
* but when a slice is removed, we delay computing maxCp until
* maxCodePoint() or build() is called.
*/
bool maxCpMayBeTooHigh() const { return (bool)misc_[1]; }
void setMaxCpMayBeTooHigh(bool value) {
misc_[1] = (uint8_t)value;
}
};
} // namespace experimental
#endif // __EXPERIMENTAL_UNISTRING_H__
unistring.cpp
// Proposed C++ Unicode code point string class.
// Author: Markus W. Scherer (Google)
// Started: 2012-oct-17
// unistring.cpp
#include "unistring.h"
namespace experimental {
namespace internal {
// Out-of-line destructor; the class holds no data, so there is nothing to release.
UniStringImpl::~UniStringImpl() = default;
// Non-builder implementation: Release storage, revert to an empty string.
// (`virtual` may only appear on the declaration inside the class, not on an
// out-of-class definition.)
void UniStringImpl::clear(UniString &dest) {
  releaseArray(dest);
  dest.impl_ = getEmpty();
  dest.length_ = 0;
}
// Pseudo-vtable implementation stub.
// NOTE(review): judging by the name, this is meant for strings stored as an
// 8-bit unit array inside the UniString object -- confirm when implementing.
// TODO: Override the pure virtual UniStringImpl methods; until then this
// class is abstract.
class Char8UniStringImpl : public UniStringImpl {
// TODO
};
// Pseudo-vtable implementation stub derived from Char8UniStringImpl.
// NOTE(review): judging by the name, this is meant for 8-bit unit arrays that
// live in separately allocated storage -- confirm when implementing.
// TODO: Override the pure virtual UniStringImpl methods; until then this
// class is abstract.
class HeapChar8UniStringImpl : public Char8UniStringImpl {
// TODO
};
// Pseudo-vtable implementation stub.
// NOTE(review): judging by the name, this is meant for strings stored as a
// char16_t array inside the UniString object -- confirm when implementing.
// TODO: Override the pure virtual UniStringImpl methods; until then this
// class is abstract.
class Char16UniStringImpl : public UniStringImpl {
// TODO
};
// Pseudo-vtable implementation stub derived from Char16UniStringImpl.
// NOTE(review): judging by the name, this is meant for char16_t arrays that
// live in separately allocated storage -- confirm when implementing.
// TODO: Override the pure virtual UniStringImpl methods; until then this
// class is abstract.
class HeapChar16UniStringImpl : public Char16UniStringImpl {
// TODO
};
// Pseudo-vtable implementation stub.
// NOTE(review): judging by the name, this is meant for strings stored as a
// char32_t array inside the UniString object -- confirm when implementing.
// TODO: Override the pure virtual UniStringImpl methods; until then this
// class is abstract.
class Char32UniStringImpl : public UniStringImpl {
// TODO
};
// Pseudo-vtable implementation stub derived from Char32UniStringImpl.
// NOTE(review): judging by the name, this is meant for char32_t arrays that
// live in separately allocated storage -- confirm when implementing.
// TODO: Override the pure virtual UniStringImpl methods; until then this
// class is abstract.
class HeapChar32UniStringImpl : public Char32UniStringImpl {
// TODO
};
class UniStringBuilderImpl : public UniStringImpl {
// TODO
virtual void clear(UniString &dest) { dest.clearBuilder(); }
};
namespace {

// Each getter returns a pointer to a single shared implementation object.
// The objects are function-local statics: they are created on first use and
// live until program exit, so the returned pointers stay valid.
// (Returning the address of a plain local variable would leave every caller
// with a dangling pointer.)

const UniStringImpl *getChar8() {
  static const Char8UniStringImpl impl;
  return &impl;
}

const UniStringImpl *getHeapChar8() {
  static const HeapChar8UniStringImpl impl;
  return &impl;
}

const UniStringImpl *getChar16() {
  static const Char16UniStringImpl impl;
  return &impl;
}

const UniStringImpl *getHeapChar16() {
  static const HeapChar16UniStringImpl impl;
  return &impl;
}

const UniStringImpl *getChar32() {
  static const Char32UniStringImpl impl;
  return &impl;
}

const UniStringImpl *getHeapChar32() {
  static const HeapChar32UniStringImpl impl;
  return &impl;
}

}  // namespace
// Returns the implementation used for an empty string: the one from getChar8().
const UniStringImpl *UniStringImpl::getEmpty() {
  const UniStringImpl *const emptyImpl = getChar8();
  return emptyImpl;
}
// Returns the shared implementation object used by UniStringBuilder.
// The object is a function-local static, so the returned pointer stays valid
// until program exit. (The address of a plain local variable would dangle as
// soon as this function returns.)
const UniStringImpl *UniStringImpl::getBuilder() {
  static const UniStringBuilderImpl impl;
  return &impl;
}
} // namespace internal
} // namespace experimental