/** Decode one UTF-8 encoded character from a byte string.
 *
 * Reads the sequence starting at text[p] and returns the Unicode scalar value
 * it encodes. Only well-formed sequences are accepted (Unicode standard,
 * section 3.9, table "Well-Formed UTF-8 Byte Sequences"); rejected are:
 *  - a start position at or beyond the end of text,
 *  - continuation bytes (0x80..0xBF) or 0xC0, 0xC1, 0xF5..0xFF as first byte,
 *  - sequences that are cut off by the end of text,
 *  - trailing bytes outside 0x80..0xBF,
 *  - overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and values above
 *    U+10FFFF.
 *
 * @param text  the UTF-8 encoded bytes
 * @param p     in: index of the first byte of the sequence;
 *              out: index of the first byte after the decoded sequence.
 *              If an exception is thrown, p is left just behind the bytes that
 *              were accepted so far: the offending byte itself is not
 *              consumed (it may start the next character), but the first byte
 *              always is, so a caller that catches and continues makes
 *              progress.
 * @return the decoded code point
 * @throws std::runtime_error if the sequence is malformed or truncated
 */
uint32_t decode_utf8(const std::string& text, size_t& p)
{
    const char* const malformed = "Malformed utf-8 sequence";

    // Reading text[p] with p > size() would be undefined behaviour and
    // p == size() would silently yield the string's NUL terminator.
    if(p >= text.size())
        throw std::runtime_error(malformed);

    // 1 byte sequence
    const uint32_t lead = static_cast<unsigned char>(text[p++]);
    if(lead <= 0x7F) {
        return lead;
    }

    // The first byte determines the number of trailing bytes, the payload
    // bits it contributes and - for a few special values - a narrowed range
    // for the second byte.
    size_t trailing = 0;
    uint32_t code_point = 0;
    uint32_t second_min = 0x80;
    uint32_t second_max = 0xBF;
    if(lead >= 0xC2 && lead <= 0xDF) {
        // 2 byte sequence (0xC0 and 0xC1 could only encode overlong forms)
        trailing = 1;
        code_point = lead & 0x1F;
    } else if(lead >= 0xE0 && lead <= 0xEF) {
        // 3 byte sequence
        trailing = 2;
        code_point = lead & 0x0F;
        if(lead == 0xE0)
            second_min = 0xA0; // below would be an overlong encoding
        if(lead == 0xED)
            second_max = 0x9F; // above would be a surrogate U+D800..U+DFFF
    } else if(lead >= 0xF0 && lead <= 0xF4) {
        // 4 byte sequence
        trailing = 3;
        code_point = lead & 0x07;
        if(lead == 0xF0)
            second_min = 0x90; // below would be an overlong encoding
        if(lead == 0xF4)
            second_max = 0x8F; // above would exceed U+10FFFF
    } else {
        // stray continuation byte (0x80..0xBF) or 0xC0, 0xC1, 0xF5..0xFF
        throw std::runtime_error(malformed);
    }

    for(size_t i = 0; i < trailing; ++i) {
        if(p >= text.size())
            throw std::runtime_error(malformed);

        const uint32_t byte = static_cast<unsigned char>(text[p]);
        const uint32_t min = (i == 0) ? second_min : 0x80;
        const uint32_t max = (i == 0) ? second_max : 0xBF;
        if(byte < min || byte > max)
            throw std::runtime_error(malformed);

        // Only consume the byte once it is known to belong to this sequence.
        ++p;
        code_point = (code_point << 6) | (byte & 0x3F);
    }

    return code_point;
}
+