Ring Daemon 16.0.0
Loading...
Searching...
No Matches
utf8_utils.cpp
Go to the documentation of this file.
1/*
2 * Copyright (C) 1999 Tom Tromey
3 * Copyright (C) 2000 Red Hat, Inc.
4 * Copyright (C) 2004-2025 Savoir-faire Linux Inc.
5 *
6 * Author: Pascal Potvin <pascal.potvin@extenway.com>
7 *
8 * This program is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program. If not, see <https://www.gnu.org/licenses/>.
20 */
21
22#include <cstring>
23#include <cassert>
25
26#if defined(_MSC_VER)
27#include <BaseTsd.h>
28using ssize_t = SSIZE_T;
29#endif
30
31/*
32 * The LIKELY and UNLIKELY macros let the programmer give hints to
33 * the compiler about the expected result of an expression. Some compilers
34 * can use this information for optimizations.
35 */
/*
 * On GCC (major version > 2) and Clang both macros wrap __builtin_expect,
 * telling the compiler that `expr` is expected to evaluate to 1 (LIKELY)
 * or to 0 (UNLIKELY). `expr` is forwarded as-is, so callers should pass a
 * boolean-valued expression.
 */
36#if defined(__GNUC__) && (__GNUC__ > 2) || defined(__clang__)
37#define LIKELY(expr) (__builtin_expect(expr, 1))
38#define UNLIKELY(expr) (__builtin_expect(expr, 0))
39#else
/* Other compilers: no hint is emitted, the macros expand to the bare expression. */
40#define LIKELY(expr) (expr)
41#define UNLIKELY(expr) (expr)
42#endif
43
44/*
45 * Check whether a Unicode (5.2) char is in a valid range.
46 *
47 * The first check comes from the Unicode guarantee to never encode
48 * a point above 0x0010ffff, since UTF-16 is unable to represent it.
49 *
50 * The second check covers surrogate pairs (category Cs).
51 *
52 * @param Char the character
53 */
/*
 * Evaluates to true when Char is below 0x110000 and is not in
 * 0xD800..0xDFFF (the mask 0xFFFFF800 compares everything above the low
 * 11 bits against 0xD800). Char is expanded twice, so the argument must
 * not have side effects.
 */
54#define UNICODE_VALID(Char) ((Char) < 0x110000 && (((Char) &0xFFFFF800) != 0xD800))
55
/*
 * Consumes the byte at `p` as a UTF-8 continuation byte (10xxxxxx):
 * jumps to the label `error` when the top two bits are not 10, otherwise
 * shifts `val` left by 6 and ORs in the low 6 bits of the byte.
 *
 * The macro relies on names from the expanding scope: a pointer `p`, an
 * integer accumulator `val` and a label `error`. It expands to several
 * statements and is not wrapped in do { } while (0), so it must not be
 * used as the unbraced body of an if/else/loop.
 */
56#define CONTINUATION_CHAR \
57 if ((*(unsigned char*) p & 0xc0) != 0x80) /* 10xxxxxx */ \
58 goto error; \
59 val <<= 6; \
60 val |= (*(unsigned char*) p) & 0x3f;
61
namespace jami {

bool utf8_validate_c_str(const char* str, ssize_t max_len, const char** end);

/*
 * Check whether a decoded code point is a Unicode scalar value:
 * at most U+10FFFF and outside the surrogate range U+D800..U+DFFF.
 */
static bool
is_unicode_scalar(char32_t code_point)
{
    return code_point < 0x110000 && (code_point & 0xFFFFF800) != 0xD800;
}

/*
 * Check the multi-byte UTF-8 sequence whose lead byte is at @p.
 *
 * @param p      pointer to a lead byte (value >= 0x80)
 * @param avail  number of readable bytes starting at @p, or a negative
 *               value when the buffer is NUL-terminated instead
 * @return the sequence length (2, 3 or 4) when the sequence is well
 *         formed, 0 otherwise
 *
 * A sequence is rejected when the lead byte is not 110xxxxx / 1110xxxx /
 * 11110xxx, when it would run past @avail, when a trailing byte is not
 * 10xxxxxx, when the encoding is overlong, or when the decoded value is
 * not a Unicode scalar value.
 *
 * Trailing bytes are read strictly in order and the scan stops at the
 * first bad one. A NUL byte is never a continuation byte, so in the
 * NUL-terminated case nothing past the terminator is ever read.
 */
static int
multibyte_length(const char* p, ssize_t avail)
{
    const unsigned char lead = static_cast<unsigned char>(*p);
    int length;
    char32_t value;
    char32_t minimum; /* smallest code point that needs `length` bytes */

    if ((lead & 0xe0) == 0xc0) { /* 110xxxxx */
        length = 2;
        value = lead & 0x1f;
        minimum = 1 << 7;
    } else if ((lead & 0xf0) == 0xe0) { /* 1110xxxx */
        length = 3;
        value = lead & 0x0f;
        minimum = 1 << 11;
    } else if ((lead & 0xf8) == 0xf0) { /* 11110xxx */
        length = 4;
        value = lead & 0x07;
        minimum = 1 << 16;
    } else {
        return 0; /* stray continuation byte or invalid lead byte */
    }

    if (avail >= 0 && avail < length)
        return 0; /* truncated sequence */

    for (int i = 1; i < length; ++i) {
        const unsigned char trail = static_cast<unsigned char>(p[i]);
        if ((trail & 0xc0) != 0x80) /* 10xxxxxx */
            return 0;
        value = (value << 6) | (trail & 0x3f);
    }

    if (value < minimum) /* overlong encoding */
        return 0;

    return is_unicode_scalar(value) ? length : 0;
}

/*
 * Scan the NUL-terminated string @str.
 *
 * @return a pointer to the terminating NUL when the whole string is valid
 *         UTF-8, otherwise a pointer to the first byte of the first
 *         invalid sequence
 */
static const char*
fast_validate(const char* str)
{
    const char* p = str;

    while (*p) {
        if (static_cast<unsigned char>(*p) < 128) {
            ++p; /* plain ASCII */
            continue;
        }

        const int length = multibyte_length(p, -1);
        if (length == 0)
            return p;

        p += length;
    }

    return p;
}

/*
 * Scan at most @max_len bytes of @str, stopping early at a NUL byte.
 *
 * @param str      buffer to scan (may be null only if @max_len is 0)
 * @param max_len  number of readable bytes; must be >= 0
 * @return a pointer one past the last byte of the valid prefix: either
 *         @str + @max_len, the first NUL byte, or the first byte of the
 *         first invalid (or truncated) sequence
 */
static const char*
fast_validate_len(const char* str, ssize_t max_len)
{
    assert(max_len >= 0);

    const char* p = str;

    for (;;) {
        const ssize_t left = max_len - (p - str);
        if (left <= 0 || *p == '\0')
            break;

        if (static_cast<unsigned char>(*p) < 128) {
            ++p; /* plain ASCII */
            continue;
        }

        const int length = multibyte_length(p, left);
        if (length == 0)
            return p;

        p += length;
    }

    return p;
}

/*
 * Validate UTF-8 character data.
 *
 * @param str      pointer to character data
 * @param max_len  number of bytes to validate, or a negative value when
 *                 @str is NUL-terminated
 * @param end      when non-null, receives a pointer to the end of the
 *                 valid prefix (the first invalid byte, or the end of
 *                 the data when everything is valid)
 * @return true when all of the data is valid UTF-8
 *
 * With @max_len >= 0 the scan also stops at a NUL byte, so the function
 * returns false if any of the first @max_len bytes is NUL.
 */
bool
utf8_validate_c_str(const char* str, ssize_t max_len, const char** end)
{
    const bool nul_terminated = max_len < 0;
    const char* const stop = nul_terminated ? fast_validate(str) : fast_validate_len(str, max_len);

    if (end)
        *end = stop;

    return nul_terminated ? (*stop == '\0') : (stop == str + max_len);
}

/*
 * Validate a string view.
 *
 * @return true when @str holds valid UTF-8
 *
 * The view does not have to be NUL-terminated and may be empty with a
 * null data pointer: the byte at data() + size() is never read. As
 * before, a NUL byte inside the view ends the scan and the bytes in front
 * of it decide the result.
 */
bool
utf8_validate(std::string_view str)
{
    const char* const last = str.data() + str.size();
    const char* const stop = fast_validate_len(str.data(), static_cast<ssize_t>(str.size()));

    /* stop != last means stop is inside the view, so reading it is safe. */
    return stop == last || *stop == '\0';
}

/*
 * Build a valid UTF-8 copy of @name.
 *
 * Every byte that cannot start a valid sequence (including NUL bytes) is
 * replaced by U+FFFD REPLACEMENT CHARACTER; valid runs are copied as is.
 *
 * @return @name unchanged when it is already valid, the repaired copy
 *         otherwise
 */
std::string
utf8_make_valid(std::string_view name)
{
    static const char replacement[] = "\357\277\275"; /* U+FFFD in UTF-8 */

    const char* remainder = name.data();
    ssize_t remaining_bytes = static_cast<ssize_t>(name.size());
    const char* invalid = nullptr;
    std::string answer;
    bool repaired = false;

    while (remaining_bytes != 0) {
        if (utf8_validate_c_str(remainder, remaining_bytes, &invalid))
            break;

        if (!repaired) {
            /* The result is never shorter than the input. */
            answer.reserve(name.size());
            repaired = true;
        }

        /* Keep the valid run, then stand in for the offending byte. */
        const ssize_t valid_bytes = invalid - remainder;
        answer.append(remainder, static_cast<std::string::size_type>(valid_bytes));
        answer.append(replacement, 3);

        remaining_bytes -= valid_bytes + 1;
        remainder = invalid + 1;
    }

    if (!repaired)
        return std::string(name);

    /* Whatever is left is a valid tail (possibly empty). */
    answer.append(remainder, static_cast<std::string::size_type>(remaining_bytes));
    assert(utf8_validate(answer));

    return answer;
}

} // namespace jami
static const char * fast_validate(const char *str)
bool utf8_validate(std::string_view str)
utf8_validate:
void emitSignal(Args... args)
Definition ring_signal.h:64
bool utf8_validate_c_str(const char *str, ssize_t max_len, const char **end)
utf8_validate_c_str: @str: a pointer to character data @max_len: max bytes to validate,...
std::string utf8_make_valid(std::string_view name)
static const char * fast_validate_len(const char *str, ssize_t max_len)
#define UNICODE_VALID(Char)
#define CONTINUATION_CHAR
#define UNLIKELY(expr)