Add utf-8 conversion from string to byte array

This commit is contained in:
Konstantin Nazarov 2024-07-28 16:40:52 +01:00
parent cdcd111fb2
commit 220adaace9
Signed by: knazarov
GPG key ID: 4CFE0A42FA409C22
5 changed files with 75 additions and 1 deletions

View file

@ -15,6 +15,7 @@ target_sources(vm_lib
src/common.cpp
src/arena.cpp
src/reader.cpp
src/utf8.cpp
PUBLIC
FILE_SET HEADERS
@ -28,6 +29,7 @@ target_sources(vm_lib
src/vm.hpp
src/sourcerange.hpp
src/reader.hpp
src/utf8.hpp
)
add_executable(vli src/vli.cpp)

View file

@ -3,6 +3,7 @@
#include "arena.hpp"
#include "error.hpp"
#include "pod.hpp"
#include "utf8.hpp"
// Constructs a Syntax object. The body is currently empty: filename,
// modulename and expression are accepted but not stored or otherwise used.
// NOTE(review): looks like a placeholder implementation — confirm.
Syntax::Syntax(String filename, String modulename, Value expression) {}
@ -45,6 +46,24 @@ Result<Symbol> Symbol::create(Arena& arena, String& rhs) {
return Symbol(TRY(MkGcRoot(pod, arena)));
}
// Creates a ByteArray containing the UTF-8 encoding of `str`.
//
// The string is walked twice: once to compute the total number of encoded
// bytes (so the pod can be allocated with the exact size), and once to write
// the encoded bytes into the freshly allocated storage.
//
// Any error produced by indexing the string, by the arena allocation or by
// MkGcRoot is propagated to the caller through TRY.
Result<ByteArray> ByteArray::create(Arena& arena, String& str) {
  // Pass 1: total encoded length in bytes.
  uint64_t size = 0;
  for (uint64_t i = 0; i < str.size(); i++) {
    size += utf8_codepoint_size(TRY(str[i]));
  }

  auto pod = TRY(arena.alloc<PodByteArray>(size * sizeof(char)));

  // Pass 2: encode every code point, advancing the write cursor.
  // `size` must not be modified here: it is stored as the array length below.
  char* res = pod->data;
  for (uint64_t i = 0; i < str.size(); i++) {
    char32_t codepoint = TRY(str[i]);
    res = utf8_write_codepoint(res, codepoint);
  }

  pod->size = size;

  return ByteArray(TRY(MkGcRoot(pod, arena)));
}
Result<Value> syntax_unwrap(Arena& arena, Value& val) {
Syntax* syntax = val.to<Syntax>();
if (syntax == 0) return val.copy(arena);

View file

@ -7,10 +7,12 @@
#include "arena.hpp"
#include "pod.hpp"
#include "utf8.hpp"
// Forward declarations
class Value;
class String;
class Object {
public:
@ -83,10 +85,12 @@ class ByteArray : public Object {
return ByteArray(TRY(MkGcRoot(pod, arena)));
}
static Result<ByteArray> create(Arena& arena, String& str);
uint64_t size() { return _value->size; }
virtual Result<Value> copy(Arena& arena) final;
Result<char32_t> operator[](uint64_t idx) {
Result<char> operator[](uint64_t idx) {
if (idx >= _value->size) return ErrorCode::IndexOutOfRange;
return _value->data[idx];
}

43
src/utf8.cpp Normal file
View file

@ -0,0 +1,43 @@
#include "utf8.hpp"
/* Returns how many bytes utf8_write_codepoint() emits for `codepoint`:
 *   below 0x80     -> 1
 *   below 0x800    -> 2
 *   below 0x10000  -> 3
 *   anything else  -> 4
 * No range validation is performed; every value of 0x10000 and above
 * reports 4. */
size_t utf8_codepoint_size(char32_t codepoint) {
  if (codepoint < 0x80) return 1;
  if (codepoint < 0x800) return 2;
  if (codepoint < 0x10000) return 3;
  return 4;
}
/* Encodes `codepoint` as UTF-8 into the buffer starting at `str` and returns
 * a pointer just past the last byte written (1 to 4 bytes, the same count
 * utf8_codepoint_size() reports).
 *
 * The caller must provide enough room; no terminator is written and the
 * code point is not validated. In the 4-byte form only the low 21 bits are
 * encoded, any higher bits are discarded. */
char *utf8_write_codepoint(char *str, char32_t codepoint) {
  char *out = str;

  if (codepoint < 0x80) {
    /* 7 bits: 0xxxxxxx */
    *out++ = static_cast<char>(codepoint);
  } else if (codepoint < 0x800) {
    /* 11 bits: 110xxxxx 10xxxxxx */
    *out++ = static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f));
    *out++ = static_cast<char>(0x80 | (codepoint & 0x3f));
  } else if (codepoint < 0x10000) {
    /* 16 bits: 1110xxxx 10xxxxxx 10xxxxxx */
    *out++ = static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f));
    *out++ = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
    *out++ = static_cast<char>(0x80 | (codepoint & 0x3f));
  } else {
    /* 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    *out++ = static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07));
    *out++ = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f));
    *out++ = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
    *out++ = static_cast<char>(0x80 | (codepoint & 0x3f));
  }

  return out;
}

6
src/utf8.hpp Normal file
View file

@ -0,0 +1,6 @@
#pragma once
#include <cstddef>
size_t utf8_codepoint_size(char32_t codepoint);
char *utf8_write_codepoint(char *str, char32_t codepoint);