TextMateLib 1.0
Modern C++ implementation of the TextMate syntax highlighting engine
Loading...
Searching...
No Matches
utf16_utils.h
1#ifndef TEXTMATELIB_UTF16_UTILS_H
2#define TEXTMATELIB_UTF16_UTILS_H
3
4#include <cstddef>
5#include <cstdint>
6#include <vector>
7
8namespace tml {
9
/// Builds a lookup table mapping UTF-8 byte offsets to UTF-16 code unit indices.
///
/// For ASCII text, map[i] == i (identity).
/// For well-formed multi-byte UTF-8 sequences:
///   - 2-byte (U+0080..U+07FF)    -> 1 UTF-16 code unit
///   - 3-byte (U+0800..U+FFFF)    -> 1 UTF-16 code unit
///   - 4-byte (U+10000..U+10FFFF) -> 2 UTF-16 code units (surrogate pair)
///
/// Every byte of a sequence (lead and continuation bytes) maps to the UTF-16
/// index at which that sequence starts.
///
/// Malformed input is handled defensively: a lead byte is only allowed to
/// consume bytes that really are continuation bytes (10xxxxxx). A sequence
/// that is cut short -- by the end of the buffer or by a non-continuation
/// byte -- is counted as a single UTF-16 unit, and scanning resumes at the
/// first byte that did not belong to it, so a broken sequence never swallows
/// the valid characters that follow. Stray continuation bytes and other
/// invalid lead bytes count as 1 byte / 1 unit.
///
/// @param utf8    Pointer to the UTF-8 bytes; only dereferenced for indices
///                below byteLen, so it is never read when byteLen == 0.
/// @param byteLen Number of bytes available at utf8.
/// @return A vector with byteLen+1 entries; map[byteLen] is the total UTF-16
///         length (useful for end-of-string indices).
inline std::vector<int32_t> buildByteToUtf16Map(const char* utf8, size_t byteLen) {
    std::vector<int32_t> map(byteLen + 1);
    int32_t utf16Index = 0;
    size_t i = 0;

    while (i < byteLen) {
        const unsigned char lead = static_cast<unsigned char>(utf8[i]);

        // Length and UTF-16 width announced by the lead byte. ASCII, stray
        // continuation bytes and invalid lead bytes all fall back to 1/1.
        size_t expectedLen = 1;
        int32_t utf16Units = 1;
        if ((lead & 0xE0) == 0xC0) {
            expectedLen = 2;
        } else if ((lead & 0xF0) == 0xE0) {
            expectedLen = 3;
        } else if ((lead & 0xF8) == 0xF0) {
            expectedLen = 4;
            utf16Units = 2;  // surrogate pair in UTF-16
        }

        // Consume only the continuation bytes that are actually present.
        size_t seqLen = 1;
        while (seqLen < expectedLen && (i + seqLen) < byteLen &&
               (static_cast<unsigned char>(utf8[i + seqLen]) & 0xC0) == 0x80) {
            ++seqLen;
        }

        // Incomplete sequence: count it as one unit rather than the width
        // the lead byte promised.
        if (seqLen < expectedLen) {
            utf16Units = 1;
        }

        // All bytes of the sequence share the index of its first code unit.
        for (size_t j = 0; j < seqLen; ++j) {
            map[i + j] = utf16Index;
        }

        i += seqLen;
        utf16Index += utf16Units;
    }

    // Sentinel: map[byteLen] = total UTF-16 length
    map[byteLen] = utf16Index;

    return map;
}
64
/// Safe lookup of a byte offset in a byte-to-UTF-16 map.
///
/// Negative offsets return 0. Offsets at or beyond the end of the map are
/// clamped to the last entry. The C++ tokenizer may internally append '\n',
/// producing token indices up to byteLen+1 which would be out of range;
/// clamping to the last entry (the sentinel written by buildByteToUtf16Map)
/// returns the total UTF-16 length in that case.
///
/// An empty map has no entry to clamp to, so 0 is returned instead of
/// indexing out of bounds.
///
/// @param map        Lookup table, one entry per byte offset plus a sentinel.
/// @param byteOffset UTF-8 byte offset to translate.
/// @return The UTF-16 code unit index stored for the (clamped) offset, or 0
///         for a negative offset or an empty map.
inline int32_t mapByteToUtf16(const std::vector<int32_t>& map, int32_t byteOffset) {
    // map.size() - 1 would wrap around for an empty map, so bail out first.
    if (byteOffset < 0 || map.empty()) {
        return 0;
    }

    const size_t lastIdx = map.size() - 1;
    const size_t idx = static_cast<size_t>(byteOffset);
    return map[idx < lastIdx ? idx : lastIdx];
}
75
76} // namespace tml
77
78#endif // TEXTMATELIB_UTF16_UTILS_H