diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6382957..e631623 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,6 +15,7 @@ target_sources(vm_lib
     src/common.cpp
     src/arena.cpp
     src/reader.cpp
+    src/utf8.cpp
 
   PUBLIC
     FILE_SET HEADERS
@@ -28,6 +29,7 @@ target_sources(vm_lib
     src/vm.hpp
     src/sourcerange.hpp
     src/reader.hpp
+    src/utf8.hpp
 )
 
 add_executable(vli src/vli.cpp)
diff --git a/src/common.cpp b/src/common.cpp
index 79cfc51..2cd7772 100644
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -3,6 +3,7 @@
 #include "arena.hpp"
 #include "error.hpp"
 #include "pod.hpp"
+#include "utf8.hpp"
 
 Syntax::Syntax(String filename, String modulename, Value expression) {}
 
@@ -45,6 +46,31 @@ Result Symbol::create(Arena& arena, String& rhs) {
   return Symbol(TRY(MkGcRoot(pod, arena)));
 }
 
+// Creates a ByteArray containing the UTF-8 encoding of `str`.
+//
+// Pass 1 sums utf8_codepoint_size() over every code point so the pod is
+// allocated once at its final length; pass 2 writes the bytes in order.
+// Each TRY-wrapped step (indexing `str`, arena.alloc, MkGcRoot) can fail;
+// TRY is defined elsewhere -- presumably it returns the error to the caller.
+Result ByteArray::create(Arena& arena, String& str) {
+  uint64_t size = 0;
+  for (uint64_t i = 0; i < str.size(); i++) {
+    size += utf8_codepoint_size(TRY(str[i]));
+  }
+  auto pod = TRY(arena.alloc(size * sizeof(char)));
+
+  // `size` holds the total encoded length and is stored in pod->size below,
+  // so it must not be reassigned while the bytes are being written.
+  char* res = pod->data;
+  for (uint64_t i = 0; i < str.size(); i++) {
+    char32_t codepoint = TRY(str[i]);
+    res = utf8_write_codepoint(res, codepoint);
+  }
+  pod->size = size;
+
+  return ByteArray(TRY(MkGcRoot(pod, arena)));
+}
+
 Result syntax_unwrap(Arena& arena, Value& val) {
   Syntax* syntax = val.to();
   if (syntax == 0) return val.copy(arena);
diff --git a/src/common.hpp b/src/common.hpp
index 0733e39..41aaa98 100644
--- a/src/common.hpp
+++ b/src/common.hpp
@@ -7,10 +7,12 @@
 
 #include "arena.hpp"
 #include "pod.hpp"
+#include "utf8.hpp"
 
 // Forward declarations
 
 class Value;
+class String;
 
 class Object {
  public:
@@ -83,10 +85,12 @@ class ByteArray : public Object {
     return ByteArray(TRY(MkGcRoot(pod, arena)));
   }
 
+  static Result create(Arena& arena, String& str);
+
   uint64_t size() { return _value->size; }
   virtual Result copy(Arena& arena) final;
 
-  Result operator[](uint64_t idx) {
+  Result operator[](uint64_t idx) {
     if (idx >= _value->size) return ErrorCode::IndexOutOfRange;
     return _value->data[idx];
   }
diff --git a/src/utf8.cpp b/src/utf8.cpp
new file mode 100644
index 0000000..e4f7824
--- /dev/null
+++ b/src/utf8.cpp
@@ -0,0 +1,50 @@
+#include "utf8.hpp"
+
+// Returns how many bytes utf8_write_codepoint() emits for `codepoint`:
+// 1 for values below 0x80, 2 below 0x800, 3 below 0x10000, otherwise 4.
+size_t utf8_codepoint_size(char32_t codepoint) {
+  if (0 == ((char32_t)0xffffff80 & codepoint)) {
+    return 1;
+  } else if (0 == ((char32_t)0xfffff800 & codepoint)) {
+    return 2;
+  } else if (0 == ((char32_t)0xffff0000 & codepoint)) {
+    return 3;
+  } else {
+    return 4;
+  }
+}
+
+// Writes the UTF-8 encoding of `codepoint` starting at `str` and returns the
+// pointer just past the last byte written (no terminator is added). The
+// caller must provide at least utf8_codepoint_size(codepoint) bytes of room.
+// No range validation is done: anything at or above 0x10000 is written as
+// four bytes, keeping only the low 21 bits.
+char *utf8_write_codepoint(char *str, char32_t codepoint) {
+  if (0 == ((char32_t)0xffffff80 & codepoint)) {
+    /* 1-byte/7-bit ascii
+     * (0b0xxxxxxx) */
+    str[0] = (char)codepoint;
+    str += 1;
+  } else if (0 == ((char32_t)0xfffff800 & codepoint)) {
+    /* 2-byte/11-bit utf8 code point
+     * (0b110xxxxx 0b10xxxxxx) */
+    str[0] = (char)(0xc0 | (char)((codepoint >> 6) & 0x1f));
+    str[1] = (char)(0x80 | (char)(codepoint & 0x3f));
+    str += 2;
+  } else if (0 == ((char32_t)0xffff0000 & codepoint)) {
+    /* 3-byte/16-bit utf8 code point
+     * (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) */
+    str[0] = (char)(0xe0 | (char)((codepoint >> 12) & 0x0f));
+    str[1] = (char)(0x80 | (char)((codepoint >> 6) & 0x3f));
+    str[2] = (char)(0x80 | (char)(codepoint & 0x3f));
+    str += 3;
+  } else { /* if (0 == ((int)0xffe00000 & chr)) { */
+    str[0] = (char)(0xf0 | (char)((codepoint >> 18) & 0x07));
+    str[1] = (char)(0x80 | (char)((codepoint >> 12) & 0x3f));
+    str[2] = (char)(0x80 | (char)((codepoint >> 6) & 0x3f));
+    str[3] = (char)(0x80 | (char)(codepoint & 0x3f));
+    str += 4;
+  }
+
+  return str;
+}
diff --git a/src/utf8.hpp b/src/utf8.hpp
new file mode 100644
index 0000000..fdba560
--- /dev/null
+++ b/src/utf8.hpp
@@ -0,0 +1,6 @@
+#pragma once
+
+#include <cstddef>
+
+size_t utf8_codepoint_size(char32_t codepoint);
+char *utf8_write_codepoint(char *str, char32_t codepoint);