Skip to content

Commit

Permalink
utf8: fix encoding of codepoints larger than 0xdfff
Browse files Browse the repository at this point in the history
Closes #66
  • Loading branch information
dchest committed Dec 11, 2023
1 parent a89a438 commit 976d152
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 26 deletions.
3 changes: 2 additions & 1 deletion packages/utf8/utf8.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ describe("utf8", () => {
"abcгдеjzy123",
"こんにちは世界",
"test 测试 тест",
"𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡"
"𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡",
"❤️"
];
const encoded = tests.map(encode);
const decoded = encoded.map(decode);
Expand Down
56 changes: 31 additions & 25 deletions packages/utf8/utf8.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,32 +14,30 @@ const INVALID_UTF8 = "utf8: invalid source encoding";
*/
export function encode(s: string): Uint8Array {
// Calculate result length and allocate output array.
// encodedLength() also validates string and throws errors,
// encodedLength() validates string and throws errors,
// so we don't need to repeat validation here.
const arr = new Uint8Array(encodedLength(s));

let pos = 0;
for (let i = 0; i < s.length; i++) {
let c = s.charCodeAt(i);
if (c >= 0xd800 && c <= 0xdbff) {
c = ((c - 0xd800) << 10) + (s.charCodeAt(++i) - 0xdc00) + 0x10000;
}
if (c < 0x80) {
arr[pos++] = c;
} else if (c < 0x800) {
arr[pos++] = 0xc0 | c >> 6;
arr[pos++] = 0x80 | c & 0x3f;
} else if (c < 0xd800) {
arr[pos++] = 0xe0 | c >> 12;
arr[pos++] = 0x80 | (c >> 6) & 0x3f;
arr[pos++] = 0x80 | c & 0x3f;
arr[pos++] = 0xc0 | (c >> 6);
arr[pos++] = 0x80 | (c & 0x3f);
} else if (c < 0x10000) {
arr[pos++] = 0xe0 | (c >> 12);
arr[pos++] = 0x80 | ((c >> 6) & 0x3f);
arr[pos++] = 0x80 | (c & 0x3f);
} else {
i++; // get one more character
c = (c & 0x3ff) << 10;
c |= s.charCodeAt(i) & 0x3ff;
c += 0x10000;

arr[pos++] = 0xf0 | c >> 18;
arr[pos++] = 0x80 | (c >> 12) & 0x3f;
arr[pos++] = 0x80 | (c >> 6) & 0x3f;
arr[pos++] = 0x80 | c & 0x3f;
arr[pos++] = 0xf0 | (c >> 18);
arr[pos++] = 0x80 | ((c >> 12) & 0x3f);
arr[pos++] = 0x80 | ((c >> 6) & 0x3f);
arr[pos++] = 0x80 | (c & 0x3f);
}
}
return arr;
Expand All @@ -52,21 +50,29 @@ export function encode(s: string): Uint8Array {
export function encodedLength(s: string): number {
let result = 0;
for (let i = 0; i < s.length; i++) {
const c = s.charCodeAt(i);
let c = s.charCodeAt(i);

if (c >= 0xd800 && c <= 0xdbff) {
// surrogate pair
if (i === s.length - 1) {
throw new Error(INVALID_UTF16);
}
i++;
const c2 = s.charCodeAt(i);
if (c2 < 0xdc00 || c2 > 0xdfff) {
throw new Error(INVALID_UTF16);
}
c = ((c - 0xd800) << 10) + (c2 - 0xdc00) + 0x10000;
}

if (c < 0x80) {
result += 1;
} else if (c < 0x800) {
result += 2;
} else if (c < 0xd800) {
} else if (c < 0x10000) {
result += 3;
} else if (c <= 0xdfff) {
if (i >= s.length - 1) {
throw new Error(INVALID_UTF16);
}
i++; // "eat" next character
result += 4;
} else {
throw new Error(INVALID_UTF16);
result += 4;
}
}
return result;
Expand Down

0 comments on commit 976d152

Please sign in to comment.