Add utf-8 conversion from string to byte array
This commit is contained in:
parent
cdcd111fb2
commit
220adaace9
5 changed files with 75 additions and 1 deletions
|
@ -15,6 +15,7 @@ target_sources(vm_lib
|
|||
src/common.cpp
|
||||
src/arena.cpp
|
||||
src/reader.cpp
|
||||
src/utf8.cpp
|
||||
|
||||
PUBLIC
|
||||
FILE_SET HEADERS
|
||||
|
@ -28,6 +29,7 @@ target_sources(vm_lib
|
|||
src/vm.hpp
|
||||
src/sourcerange.hpp
|
||||
src/reader.hpp
|
||||
src/utf8.hpp
|
||||
)
|
||||
|
||||
add_executable(vli src/vli.cpp)
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
#include "arena.hpp"
|
||||
#include "error.hpp"
|
||||
#include "pod.hpp"
|
||||
#include "utf8.hpp"
|
||||
|
||||
// NOTE(review): the constructor body is empty, so `filename`, `modulename`
// and `expression` are not used here — presumably a placeholder; confirm.
Syntax::Syntax(String filename, String modulename, Value expression) {}
|
||||
|
||||
|
@ -45,6 +46,24 @@ Result<Symbol> Symbol::create(Arena& arena, String& rhs) {
|
|||
return Symbol(TRY(MkGcRoot(pod, arena)));
|
||||
}
|
||||
|
||||
/// Builds a ByteArray holding the UTF-8 encoding of `str`.
///
/// Works in two passes: the first sums the encoded width of every code point
/// so the backing PodByteArray can be allocated once; the second writes the
/// encoded bytes into that storage.
///
/// @param arena  Arena the backing PodByteArray is allocated from.
/// @param str    Source string, read one code point at a time via `str[i]`.
/// @return The new ByteArray; failures from `str[i]`, `arena.alloc` or
///         `MkGcRoot` are forwarded through `TRY`.
Result<ByteArray> ByteArray::create(Arena& arena, String& str) {
  // Pass 1: total number of bytes the encoded string occupies.
  uint64_t size = 0;
  for (uint64_t i = 0; i < str.size(); i++) {
    size += utf8_codepoint_size(TRY(str[i]));
  }

  auto pod = TRY(arena.alloc<PodByteArray>(size * sizeof(char)));

  // Pass 2: encode each code point; `res` always points at the next free byte.
  char* res = pod->data;
  for (uint64_t i = 0; i < str.size(); i++) {
    char32_t codepoint = TRY(str[i]);
    res = utf8_write_codepoint(res, codepoint);
  }

  // `size` must remain the total from pass 1. Reassigning it per code point
  // inside the loop would leave only the width of the last code point here.
  pod->size = size;

  return ByteArray(TRY(MkGcRoot(pod, arena)));
}
|
||||
|
||||
Result<Value> syntax_unwrap(Arena& arena, Value& val) {
|
||||
Syntax* syntax = val.to<Syntax>();
|
||||
if (syntax == 0) return val.copy(arena);
|
||||
|
|
|
@ -7,10 +7,12 @@
|
|||
|
||||
#include "arena.hpp"
|
||||
#include "pod.hpp"
|
||||
#include "utf8.hpp"
|
||||
|
||||
// Forward declarations
|
||||
|
||||
class Value;
|
||||
class String;
|
||||
|
||||
class Object {
|
||||
public:
|
||||
|
@ -83,10 +85,12 @@ class ByteArray : public Object {
|
|||
return ByteArray(TRY(MkGcRoot(pod, arena)));
|
||||
}
|
||||
|
||||
// Creates a ByteArray containing the UTF-8 encoding of `str`, allocated
// from `arena`.
static Result<ByteArray> create(Arena& arena, String& str);
|
||||
|
||||
// Returns the `size` field of the underlying value (`_value->size`).
uint64_t size() { return _value->size; }
|
||||
virtual Result<Value> copy(Arena& arena) final;
|
||||
|
||||
Result<char32_t> operator[](uint64_t idx) {
|
||||
// Bounds-checked element access: yields the char stored at `idx`, or
// ErrorCode::IndexOutOfRange when `idx` is not below the stored size.
Result<char> operator[](uint64_t idx) {
  const bool in_range = idx < _value->size;
  if (!in_range) {
    return ErrorCode::IndexOutOfRange;
  }
  return _value->data[idx];
}
|
||||
|
|
43
src/utf8.cpp
Normal file
43
src/utf8.cpp
Normal file
|
@ -0,0 +1,43 @@
|
|||
#include "utf8.hpp"
|
||||
|
||||
/// Number of bytes needed to encode `codepoint` in UTF-8.
///
/// Values below 0x80 take 1 byte, below 0x800 take 2, below 0x10000 take 3,
/// and everything else is reported as 4 (no range validation is performed).
size_t utf8_codepoint_size(char32_t codepoint) {
  if (codepoint < 0x80u) return 1;     // fits in 7 bits
  if (codepoint < 0x800u) return 2;    // fits in 11 bits
  if (codepoint < 0x10000u) return 3;  // fits in 16 bits
  return 4;
}
|
||||
|
||||
/// Encodes `codepoint` as UTF-8 into the buffer starting at `str`.
///
/// Writes 1 to 4 bytes depending on the magnitude of `codepoint` and returns
/// a pointer to the byte just past the last one written. The caller must
/// supply enough room; no bounds or validity checks are performed, and any
/// value of 0x10000 or above is emitted with the 4-byte layout.
char *utf8_write_codepoint(char *str, char32_t codepoint) {
  // Continuation byte (0b10xxxxxx) carrying the low six bits of `bits`.
  const auto continuation = [](char32_t bits) {
    return static_cast<char>(0x80u | (bits & 0x3fu));
  };

  if (codepoint < 0x80u) {
    // 7-bit ASCII: 0b0xxxxxxx
    *str++ = static_cast<char>(codepoint);
    return str;
  }

  if (codepoint < 0x800u) {
    // 11 bits: 0b110xxxxx 0b10xxxxxx
    *str++ = static_cast<char>(0xc0u | ((codepoint >> 6) & 0x1fu));
    *str++ = continuation(codepoint);
    return str;
  }

  if (codepoint < 0x10000u) {
    // 16 bits: 0b1110xxxx 0b10xxxxxx 0b10xxxxxx
    *str++ = static_cast<char>(0xe0u | ((codepoint >> 12) & 0x0fu));
    *str++ = continuation(codepoint >> 6);
    *str++ = continuation(codepoint);
    return str;
  }

  // Remaining values: 0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx
  *str++ = static_cast<char>(0xf0u | ((codepoint >> 18) & 0x07u));
  *str++ = continuation(codepoint >> 12);
  *str++ = continuation(codepoint >> 6);
  *str++ = continuation(codepoint);
  return str;
}
|
6
src/utf8.hpp
Normal file
6
src/utf8.hpp
Normal file
|
@ -0,0 +1,6 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
// Returns how many bytes (1 to 4) the UTF-8 encoding of `codepoint` occupies.
size_t utf8_codepoint_size(char32_t codepoint);
|
||||
// Writes the UTF-8 encoding of `codepoint` starting at `str` and returns a
// pointer just past the last byte written. The buffer must have room for
// utf8_codepoint_size(codepoint) bytes.
char *utf8_write_codepoint(char *str, char32_t codepoint);
|
Loading…
Reference in a new issue