From 351a895d695cb2a709e7d6c9768d9ee8fe6be9fc Mon Sep 17 00:00:00 2001
From: Konstantin Nazarov <mail@knazarov.com>
Date: Sat, 14 Dec 2024 22:56:28 +0000
Subject: [PATCH] Add support for printf, UART, and iron out a few bugs

---
 example/Makefile  |   6 +-
 example/boot.s    |   2 +-
 example/example.c |   7 +-
 example/linker.ld |   6 +-
 example/printf.c  | 914 ++++++++++++++++++++++++++++++++++++++++++++++
 example/printf.h  | 117 ++++++
 example/putchar.c |   4 +
 src/debug.cpp     |   9 +-
 src/elf.cpp       |   6 +-
 src/rve.cpp       |   6 +-
 src/vm.cpp        | 229 ++++++++----
 src/vm.hpp        |  33 ++
 12 files changed, 1257 insertions(+), 82 deletions(-)
 create mode 100644 example/printf.c
 create mode 100644 example/printf.h
 create mode 100644 example/putchar.c

diff --git a/example/Makefile b/example/Makefile
index 76531bc..001e78d 100644
--- a/example/Makefile
+++ b/example/Makefile
@@ -1,4 +1,6 @@
-example: example.c Makefile boot.s linker.ld
+example: example.c Makefile boot.s linker.ld printf.h printf.c putchar.c
 	riscv32-none-elf-as -march=rv32i -mabi=ilp32 boot.s -o boot.o
+	riscv32-none-elf-gcc -fno-builtin -fvisibility=hidden -nostdlib -nostartfiles -march=rv32im -mabi=ilp32 -D PRINTF_DISABLE_SUPPORT_FLOAT=1 -D PRINTF_DISABLE_SUPPORT_LONG_LONG=1 -c printf.c -o printf.o -g
+	riscv32-none-elf-gcc -fno-builtin -fvisibility=hidden -nostdlib -nostartfiles -march=rv32im -mabi=ilp32 -c putchar.c -o putchar.o -g
 	riscv32-none-elf-gcc -fno-builtin -fvisibility=hidden -nostdlib -nostartfiles -march=rv32im -mabi=ilp32 -c example.c -o example.o -g
-	riscv32-none-elf-ld boot.o example.o -T linker.ld -o example -g
+	riscv32-none-elf-ld boot.o example.o printf.o putchar.o -T linker.ld -o example -g
diff --git a/example/boot.s b/example/boot.s
index af4f484..edaff0d 100644
--- a/example/boot.s
+++ b/example/boot.s
@@ -1,6 +1,6 @@
 .globl _boot
 _boot:
-    li x2, 0x8000
+    li x2, 0x80000
     call main
     sbreak
     j .
diff --git a/example/example.c b/example/example.c
index 7830293..0de0bb8 100644
--- a/example/example.c
+++ b/example/example.c
@@ -1,4 +1,4 @@
-static int mem = 1;
+#include "printf.h"
 
 int fact(int n) {
   if (n == 0)
@@ -8,7 +8,10 @@ int fact(int n) {
 }
 
 int main() {
-  mem = fact(8);
+  int n = 8;  // operand whose factorial is computed and reported
+  int res = fact(n);  // derived from n (not a second literal) so operand and result cannot drift apart
+
+  printf("%d! = %d\n", n, res);
 
   return 0;
 }
diff --git a/example/linker.ld b/example/linker.ld
index aac741b..3cf558e 100644
--- a/example/linker.ld
+++ b/example/linker.ld
@@ -7,19 +7,19 @@ SECTIONS {
         *(.text)   /* Place all .text sections (code) here */
     }
 
-    . = 0x1000;
+    . = 0x10000;
 
     .data : {
         *(.data)   /* Place all .data sections (initialized data) here */
     }
 
-    . = 0x2000;
+    . = 0x20000;
 
     .bss : {
         *(.bss)    /* Place all .bss sections (uninitialized data) here */
     }
 
-    . = 0x3000;
+    . = 0x30000;
 
     .stack : {
         *(.stack)
diff --git a/example/printf.c b/example/printf.c
new file mode 100644
index 0000000..8a700ad
--- /dev/null
+++ b/example/printf.c
@@ -0,0 +1,914 @@
+///////////////////////////////////////////////////////////////////////////////
+// \author (c) Marco Paland (info@paland.com)
+//             2014-2019, PALANDesign Hannover, Germany
+//
+// \license The MIT License (MIT)
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+// \brief Tiny printf, sprintf and (v)snprintf implementation, optimized for speed on
+//        embedded systems with very limited resources. These routines are thread
+//        safe and reentrant!
+//        Use this instead of the bloated standard/newlib printf, because those use
+//        malloc for printf (and may not be thread safe).
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "printf.h"
+
+
+// define this globally (e.g. gcc -DPRINTF_INCLUDE_CONFIG_H ...) to include the
+// printf_config.h header file
+// default: undefined
+#ifdef PRINTF_INCLUDE_CONFIG_H
+#include "printf_config.h"
+#endif
+
+
+// 'ntoa' conversion buffer size, this must be big enough to hold one converted
+// numeric number including padded zeros (dynamically created on stack)
+// default: 32 byte
+#ifndef PRINTF_NTOA_BUFFER_SIZE
+#define PRINTF_NTOA_BUFFER_SIZE    32U
+#endif
+
+// 'ftoa' conversion buffer size, this must be big enough to hold one converted
+// float number including padded zeros (dynamically created on stack)
+// default: 32 byte
+#ifndef PRINTF_FTOA_BUFFER_SIZE
+#define PRINTF_FTOA_BUFFER_SIZE    32U
+#endif
+
+// support for the floating point type (%f)
+// default: activated
+#ifndef PRINTF_DISABLE_SUPPORT_FLOAT
+#define PRINTF_SUPPORT_FLOAT
+#endif
+
+// support for exponential floating point notation (%e/%g)
+// default: activated
+#ifndef PRINTF_DISABLE_SUPPORT_EXPONENTIAL
+#define PRINTF_SUPPORT_EXPONENTIAL
+#endif
+
+// define the default floating point precision
+// default: 6 digits
+#ifndef PRINTF_DEFAULT_FLOAT_PRECISION
+#define PRINTF_DEFAULT_FLOAT_PRECISION  6U
+#endif
+
+// define the largest float suitable to print with %f
+// default: 1e9
+#ifndef PRINTF_MAX_FLOAT
+#define PRINTF_MAX_FLOAT  1e9
+#endif
+
+// support for the long long types (%llu or %p)
+// default: activated
+#ifndef PRINTF_DISABLE_SUPPORT_LONG_LONG
+#define PRINTF_SUPPORT_LONG_LONG
+#endif
+
+// support for the ptrdiff_t type (%t)
+// ptrdiff_t is normally defined in <stddef.h> as long or long long type
+// default: activated
+#ifndef PRINTF_DISABLE_SUPPORT_PTRDIFF_T
+#define PRINTF_SUPPORT_PTRDIFF_T
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+// internal flag definitions
+#define FLAGS_ZEROPAD   (1U <<  0U)
+#define FLAGS_LEFT      (1U <<  1U)
+#define FLAGS_PLUS      (1U <<  2U)
+#define FLAGS_SPACE     (1U <<  3U)
+#define FLAGS_HASH      (1U <<  4U)
+#define FLAGS_UPPERCASE (1U <<  5U)
+#define FLAGS_CHAR      (1U <<  6U)
+#define FLAGS_SHORT     (1U <<  7U)
+#define FLAGS_LONG      (1U <<  8U)
+#define FLAGS_LONG_LONG (1U <<  9U)
+#define FLAGS_PRECISION (1U << 10U)
+#define FLAGS_ADAPT_EXP (1U << 11U)
+
+
+// import float.h for DBL_MAX
+#if defined(PRINTF_SUPPORT_FLOAT)
+#include <float.h>
+#endif
+
+
+// output function type: one character, an opaque buffer/context pointer, the output index and the output limit
+typedef void (*out_fct_type)(char character, void* buffer, size_t idx, size_t maxlen);
+
+
+// callback/argument pair; fctprintf() passes a pointer to one in place of the buffer and _out_fct() unpacks it
+typedef struct {
+  void  (*fct)(char character, void* arg);  // per-character output callback supplied by the caller
+  void* arg;                                // opaque argument handed to fct on every call
+} out_fct_wrap_type;
+
+
+// internal buffer output: stores 'character' at index 'idx' of the caller's buffer
+static inline void _out_buffer(char character, void* buffer, size_t idx, size_t maxlen)
+{
+  if (idx < maxlen) {  // characters at or beyond maxlen are dropped instead of written
+    ((char*)buffer)[idx] = character;
+  }
+}
+
+
+// internal null output: discards the character; _vsnprintf selects it when the buffer pointer is null
+static inline void _out_null(char character, void* buffer, size_t idx, size_t maxlen)
+{
+  (void)character; (void)buffer; (void)idx; (void)maxlen;  // every parameter is intentionally unused
+}
+
+
+// internal _putchar wrapper: hands each character to _putchar(); buffer, idx and maxlen are ignored
+static inline void _out_char(char character, void* buffer, size_t idx, size_t maxlen)
+{
+  (void)buffer; (void)idx; (void)maxlen;
+  if (character) {  // the terminating 0 that _vsnprintf emits last is not forwarded
+    _putchar(character);
+  }
+}
+
+
+// internal output function wrapper: 'buffer' really points to an out_fct_wrap_type (set up by fctprintf)
+static inline void _out_fct(char character, void* buffer, size_t idx, size_t maxlen)
+{
+  (void)idx; (void)maxlen;
+  if (character) {  // zero characters (the final terminator) are not forwarded
+    // call the wrapped callback, passing along the user argument stored beside it
+    ((out_fct_wrap_type*)buffer)->fct(character, ((out_fct_wrap_type*)buffer)->arg);
+  }
+}
+
+
+// internal secure strlen: counts the characters of 'str' before the terminating 0, inspecting at most 'maxsize' of them
+// \return The length of the string (excluding the terminating 0) limited by 'maxsize'
+static inline unsigned int _strnlen_s(const char* str, size_t maxsize)
+{
+  const char* s;
+  for (s = str; maxsize-- && *s; ++s);  // limit is tested BEFORE the dereference, so str[maxsize] is never read
+  return (unsigned int)(s - str);
+}
+
+
+// internal test if char is a decimal digit ('0'..'9'), done as a plain range comparison
+// \return true if 'ch' lies between '0' and '9' inclusive, false otherwise
+static inline bool _is_digit(char ch)
+{
+  return (ch >= '0') && (ch <= '9');
+}
+
+
+// internal ASCII string to unsigned int conversion; '*str' is advanced past every digit that is consumed
+static unsigned int _atoi(const char** str)
+{
+  unsigned int i = 0U;
+  while (_is_digit(**str)) {  // stops at the first non-digit: no sign or whitespace is accepted
+    i = i * 10U + (unsigned int)(*((*str)++) - '0');  // base-10 accumulation, no overflow check
+  }
+  return i;  // stays 0 when the text does not start with a digit
+}
+
+
+// output the 'len' characters of 'buf' in reverse order, space-padded to 'width'; returns the next output index
+static size_t _out_rev(out_fct_type out, char* buffer, size_t idx, size_t maxlen, const char* buf, size_t len, unsigned int width, unsigned int flags)
+{
+  const size_t start_idx = idx;  // remembered so the right-hand padding can measure what was emitted
+
+  // right-aligned output: leading spaces, unless zero padding was requested (zeros are already in 'buf' then)
+  if (!(flags & FLAGS_LEFT) && !(flags & FLAGS_ZEROPAD)) {
+    for (size_t i = len; i < width; i++) {
+      out(' ', buffer, idx++, maxlen);
+    }
+  }
+
+  // 'buf' holds the text last-character-first, so emit it from the end down to index 0
+  while (len) {
+    out(buf[--len], buffer, idx++, maxlen);
+  }
+
+  // left-aligned output: trailing spaces until 'width' characters have been emitted
+  if (flags & FLAGS_LEFT) {
+    while (idx - start_idx < width) {
+      out(' ', buffer, idx++, maxlen);
+    }
+  }
+
+  return idx;
+}
+
+
+// internal itoa format: adds zero padding, the '#' prefix and the sign to the reversed digits in 'buf', then emits them via _out_rev
+static size_t _ntoa_format(out_fct_type out, char* buffer, size_t idx, size_t maxlen, char* buf, size_t len, bool negative, unsigned int base, unsigned int prec, unsigned int width, unsigned int flags)
+{
+  // pad leading zeros (appended to 'buf' because the digits are stored in reverse); skipped for left-aligned output
+  if (!(flags & FLAGS_LEFT)) {
+    if (width && (flags & FLAGS_ZEROPAD) && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) {
+      width--;  // keep one column free for the sign character added further down
+    }
+    while ((len < prec) && (len < PRINTF_NTOA_BUFFER_SIZE)) {  // precision = minimum number of digits
+      buf[len++] = '0';
+    }
+    while ((flags & FLAGS_ZEROPAD) && (len < width) && (len < PRINTF_NTOA_BUFFER_SIZE)) {  // '0' flag: fill the field
+      buf[len++] = '0';
+    }
+  }
+
+  // handle hash: prefix "0x"/"0X" for base 16, "0b" for base 2, a single "0" for any other base
+  if (flags & FLAGS_HASH) {
+    if (!(flags & FLAGS_PRECISION) && len && ((len == prec) || (len == width))) {
+      len--;  // the padding filled the field exactly: give back one zero to make room for the prefix
+      if (len && (base == 16U)) {
+        len--;  // the hex prefix is two characters long, so give back a second one
+      }
+    }
+    if ((base == 16U) && !(flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
+      buf[len++] = 'x';
+    }
+    else if ((base == 16U) && (flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
+      buf[len++] = 'X';
+    }
+    else if ((base == 2U) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
+      buf[len++] = 'b';
+    }
+    if (len < PRINTF_NTOA_BUFFER_SIZE) {
+      buf[len++] = '0';  // stored after the letter, hence printed before it once reversed
+    }
+  }
+
+  if (len < PRINTF_NTOA_BUFFER_SIZE) {  // the sign is dropped when the conversion buffer is already full
+    if (negative) {
+      buf[len++] = '-';
+    }
+    else if (flags & FLAGS_PLUS) {
+      buf[len++] = '+';  // ignore the space if the '+' exists
+    }
+    else if (flags & FLAGS_SPACE) {
+      buf[len++] = ' ';
+    }
+  }
+
+  return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags);
+}
+
+
+// internal itoa for 'long' type: writes the digits of 'value' in 'base' least-significant first, then hands over to _ntoa_format
+static size_t _ntoa_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long value, bool negative, unsigned long base, unsigned int prec, unsigned int width, unsigned int flags)
+{
+  char buf[PRINTF_NTOA_BUFFER_SIZE];  // reversed text of the number
+  size_t len = 0U;
+
+  // no hash for 0 values
+  if (!value) {
+    flags &= ~FLAGS_HASH;
+  }
+
+  // digits are produced unless a precision was given AND the value is 0 (then no digit is written here)
+  if (!(flags & FLAGS_PRECISION) || value) {
+    do {
+      const char digit = (char)(value % base);
+      buf[len++] = digit < 10 ? '0' + digit : (flags & FLAGS_UPPERCASE ? 'A' : 'a') + digit - 10;  // 0-9, then letters
+      value /= base;
+    } while (value && (len < PRINTF_NTOA_BUFFER_SIZE));  // remaining higher digits are cut off once the buffer is full
+  }
+
+  return _ntoa_format(out, buffer, idx, maxlen, buf, len, negative, (unsigned int)base, prec, width, flags);
+}
+
+
+// internal itoa for 'long long' type: same steps as _ntoa_long with wider operands; compiled only with PRINTF_SUPPORT_LONG_LONG
+#if defined(PRINTF_SUPPORT_LONG_LONG)
+static size_t _ntoa_long_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long long value, bool negative, unsigned long long base, unsigned int prec, unsigned int width, unsigned int flags)
+{
+  char buf[PRINTF_NTOA_BUFFER_SIZE];  // reversed text of the number
+  size_t len = 0U;
+
+  // no hash for 0 values
+  if (!value) {
+    flags &= ~FLAGS_HASH;
+  }
+
+  // digits are produced unless a precision was given AND the value is 0 (then no digit is written here)
+  if (!(flags & FLAGS_PRECISION) || value) {
+    do {
+      const char digit = (char)(value % base);
+      buf[len++] = digit < 10 ? '0' + digit : (flags & FLAGS_UPPERCASE ? 'A' : 'a') + digit - 10;  // 0-9, then letters
+      value /= base;
+    } while (value && (len < PRINTF_NTOA_BUFFER_SIZE));  // remaining higher digits are cut off once the buffer is full
+  }
+
+  return _ntoa_format(out, buffer, idx, maxlen, buf, len, negative, (unsigned int)base, prec, width, flags);
+}
+#endif  // PRINTF_SUPPORT_LONG_LONG
+
+
+#if defined(PRINTF_SUPPORT_FLOAT)
+
+#if defined(PRINTF_SUPPORT_EXPONENTIAL)
+// forward declaration so that _ftoa can switch to exp notation for values > PRINTF_MAX_FLOAT
+static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags);
+#endif
+
+
+// internal ftoa for fixed decimal floating point: builds the reversed text of 'value' in a local buffer and emits it via _out_rev
+static size_t _ftoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags)
+{
+  char buf[PRINTF_FTOA_BUFFER_SIZE];
+  size_t len  = 0U;
+  double diff = 0.0;
+
+  // powers of 10, indexed by the (capped) precision 0..9
+  static const double pow10[] = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000 };
+
+  // test for special values; the literals are spelled backwards because _out_rev reverses them
+  if (value != value)
+    return _out_rev(out, buffer, idx, maxlen, "nan", 3, width, flags);
+  if (value < -DBL_MAX)
+    return _out_rev(out, buffer, idx, maxlen, "fni-", 4, width, flags);
+  if (value > DBL_MAX)
+    return _out_rev(out, buffer, idx, maxlen, (flags & FLAGS_PLUS) ? "fni+" : "fni", (flags & FLAGS_PLUS) ? 4U : 3U, width, flags);
+
+  // test for very large values
+  // standard printf behavior is to print EVERY whole number digit -- which could be 100s of characters overflowing your buffers == bad
+  if ((value > PRINTF_MAX_FLOAT) || (value < -PRINTF_MAX_FLOAT)) {
+#if defined(PRINTF_SUPPORT_EXPONENTIAL)
+    return _etoa(out, buffer, idx, maxlen, value, prec, width, flags);
+#else
+    return 0U;
+#endif
+  }
+
+  // test for negative; from here on 'value' holds the magnitude
+  bool negative = false;
+  if (value < 0) {
+    negative = true;
+    value = 0 - value;
+  }
+
+  // set default precision, if not set explicitly
+  if (!(flags & FLAGS_PRECISION)) {
+    prec = PRINTF_DEFAULT_FLOAT_PRECISION;
+  }
+  // limit precision to 9, because a prec >= 10 can lead to overflow errors; the surplus places become trailing '0's
+  while ((len < PRINTF_FTOA_BUFFER_SIZE) && (prec > 9U)) {
+    buf[len++] = '0';
+    prec--;
+  }
+
+  int whole = (int)value;                      // integer part
+  double tmp = (value - whole) * pow10[prec];  // fractional part scaled to 'prec' decimal places
+  unsigned long frac = (unsigned long)tmp;
+  diff = tmp - frac;                           // remainder beyond the last printed place, used for rounding
+
+  if (diff > 0.5) {
+    ++frac;
+    // handle rollover, e.g. case 0.99 with prec 1 is 1.0
+    if (frac >= pow10[prec]) {
+      frac = 0;
+      ++whole;
+    }
+  }
+  else if (diff < 0.5) {
+  }  // below the halfway point: frac is kept as truncated
+  else if ((frac == 0U) || (frac & 1U)) {
+    // if halfway, round up if odd OR if last digit is 0
+    ++frac;
+  }
+
+  if (prec == 0U) {
+    diff = value - (double)whole;  // no fractional digits are printed; only 'whole' may still need adjusting
+    if ((!(diff < 0.5) || (diff > 0.5)) && (whole & 1)) {
+      // exactly 0.5 and ODD, then round up
+      // 1.5 -> 2, but 2.5 -> 2
+      ++whole;
+    }
+  }
+  else {
+    unsigned int count = prec;  // fractional places still to be written
+    // now do fractional part, as an unsigned number
+    while (len < PRINTF_FTOA_BUFFER_SIZE) {
+      --count;
+      buf[len++] = (char)(48U + (frac % 10U));  // 48 is the character code of '0'
+      if (!(frac /= 10U)) {
+        break;
+      }
+    }
+    // add extra 0s
+    while ((len < PRINTF_FTOA_BUFFER_SIZE) && (count-- > 0U)) {
+      buf[len++] = '0';
+    }
+    if (len < PRINTF_FTOA_BUFFER_SIZE) {
+      // add decimal
+      buf[len++] = '.';
+    }
+  }
+
+  // do whole part, number is reversed
+  while (len < PRINTF_FTOA_BUFFER_SIZE) {
+    buf[len++] = (char)(48 + (whole % 10));
+    if (!(whole /= 10)) {
+      break;
+    }
+  }
+
+  // pad leading zeros
+  if (!(flags & FLAGS_LEFT) && (flags & FLAGS_ZEROPAD)) {
+    if (width && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) {
+      width--;  // keep one column free for the sign character added below
+    }
+    while ((len < width) && (len < PRINTF_FTOA_BUFFER_SIZE)) {
+      buf[len++] = '0';
+    }
+  }
+
+  if (len < PRINTF_FTOA_BUFFER_SIZE) {  // the sign is dropped when the conversion buffer is already full
+    if (negative) {
+      buf[len++] = '-';
+    }
+    else if (flags & FLAGS_PLUS) {
+      buf[len++] = '+';  // ignore the space if the '+' exists
+    }
+    else if (flags & FLAGS_SPACE) {
+      buf[len++] = ' ';
+    }
+  }
+
+  return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags);
+}
+
+
+#if defined(PRINTF_SUPPORT_EXPONENTIAL)
+// internal ftoa variant for exponential floating-point type (%e/%E and %g/%G), contributed by Martijn Jasperse <m.jasperse@gmail.com>
+static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags)
+{
+  // check for NaN and special values; _ftoa prints them
+  if ((value != value) || (value > DBL_MAX) || (value < -DBL_MAX)) {
+    return _ftoa(out, buffer, idx, maxlen, value, prec, width, flags);
+  }
+
+  // determine the sign; from here on 'value' holds the magnitude
+  const bool negative = value < 0;
+  if (negative) {
+    value = -value;
+  }
+
+  // default precision
+  if (!(flags & FLAGS_PRECISION)) {
+    prec = PRINTF_DEFAULT_FLOAT_PRECISION;
+  }
+
+  // determine the decimal exponent
+  // based on the algorithm by David Gay (https://www.ampl.com/netlib/fp/dtoa.c)
+  union {
+    uint64_t U;
+    double   F;
+  } conv;  // gives access to the bit pattern of a double
+
+  conv.F = value;
+  int exp2 = (int)((conv.U >> 52U) & 0x07FFU) - 1023;           // effectively log2
+  conv.U = (conv.U & ((1ULL << 52U) - 1U)) | (1023ULL << 52U);  // drop the exponent so conv.F is now in [1,2)
+  // now approximate log10 from the log2 integer part and an expansion of ln around 1.5
+  int expval = (int)(0.1760912590558 + exp2 * 0.301029995663981 + (conv.F - 1.5) * 0.289529654602168);
+  // now we want to compute 10^expval but we want to be sure it won't overflow
+  exp2 = (int)(expval * 3.321928094887362 + 0.5);
+  const double z  = expval * 2.302585092994046 - exp2 * 0.6931471805599453;
+  const double z2 = z * z;
+  conv.U = (uint64_t)(exp2 + 1023) << 52U;
+  // compute exp(z) using continued fractions, see https://en.wikipedia.org/wiki/Exponential_function#Continued_fractions_for_ex
+  conv.F *= 1 + 2 * z / (2 - z + (z2 / (6 + (z2 / (10 + z2 / 14)))));
+  // correct for rounding errors
+  if (value < conv.F) {
+    expval--;
+    conv.F /= 10;
+  }
+
+  // the exponent format is "%+03d" and largest value is "307", so set aside 4-5 characters
+  unsigned int minwidth = ((expval < 100) && (expval > -100)) ? 4U : 5U;
+
+  // in "%g" mode, "prec" is the number of *significant figures* not decimals
+  if (flags & FLAGS_ADAPT_EXP) {
+    // do we want to fall-back to "%f" mode?
+    if ((value >= 1e-4) && (value < 1e6)) {
+      if ((int)prec > expval) {
+        prec = (unsigned)((int)prec - expval - 1);
+      }
+      else {
+        prec = 0;
+      }
+      flags |= FLAGS_PRECISION;   // make sure _ftoa respects precision
+      // no characters in exponent
+      minwidth = 0U;
+      expval   = 0;
+    }
+    else {
+      // we use one sigfig for the whole part
+      if ((prec > 0) && (flags & FLAGS_PRECISION)) {
+        --prec;
+      }
+    }
+  }
+
+  // will everything fit?
+  unsigned int fwidth = width;  // field width left over for the mantissa
+  if (width > minwidth) {
+    // we didn't fall-back so subtract the characters required for the exponent
+    fwidth -= minwidth;
+  } else {
+    // not enough characters, so go back to default sizing
+    fwidth = 0U;
+  }
+  if ((flags & FLAGS_LEFT) && minwidth) {
+    // if we're padding on the right, DON'T pad the floating part
+    fwidth = 0U;
+  }
+
+  // rescale the float value
+  if (expval) {
+    value /= conv.F;
+  }
+
+  // output the floating part
+  const size_t start_idx = idx;  // remembered so the right-hand padding can measure what was emitted
+  idx = _ftoa(out, buffer, idx, maxlen, negative ? -value : value, prec, fwidth, flags & ~FLAGS_ADAPT_EXP);
+
+  // output the exponent part (skipped after the "%f" fall-back, where minwidth is 0)
+  if (minwidth) {
+    // output the exponential symbol
+    out((flags & FLAGS_UPPERCASE) ? 'E' : 'e', buffer, idx++, maxlen);
+    // output the exponent value
+    idx = _ntoa_long(out, buffer, idx, maxlen, (expval < 0) ? -expval : expval, expval < 0, 10, 0, minwidth-1, FLAGS_ZEROPAD | FLAGS_PLUS);
+    // might need to right-pad spaces
+    if (flags & FLAGS_LEFT) {
+      while (idx - start_idx < width) out(' ', buffer, idx++, maxlen);
+    }
+  }
+  return idx;
+}
+#endif  // PRINTF_SUPPORT_EXPONENTIAL
+#endif  // PRINTF_SUPPORT_FLOAT
+
+
+// internal vsnprintf: formatting engine behind all public entry points; emits every character through 'out' and returns the count produced (terminator excluded)
+static int _vsnprintf(out_fct_type out, char* buffer, const size_t maxlen, const char* format, va_list va)
+{
+  unsigned int flags, width, precision, n;
+  size_t idx = 0U;  // index of the next output character; keeps counting even beyond maxlen
+
+  if (!buffer) {
+    // use null output function
+    out = _out_null;
+  }
+
+  while (*format)
+  {
+    // format specifier?  %[flags][width][.precision][length]
+    if (*format != '%') {
+      // no
+      out(*format, buffer, idx++, maxlen);
+      format++;
+      continue;
+    }
+    else {
+      // yes, evaluate it
+      format++;
+    }
+
+    // evaluate flags; 'n' stays 1 for as long as another flag character was consumed
+    flags = 0U;
+    do {
+      switch (*format) {
+        case '0': flags |= FLAGS_ZEROPAD; format++; n = 1U; break;
+        case '-': flags |= FLAGS_LEFT;    format++; n = 1U; break;
+        case '+': flags |= FLAGS_PLUS;    format++; n = 1U; break;
+        case ' ': flags |= FLAGS_SPACE;   format++; n = 1U; break;
+        case '#': flags |= FLAGS_HASH;    format++; n = 1U; break;
+        default :                                   n = 0U; break;
+      }
+    } while (n);
+
+    // evaluate width field
+    width = 0U;
+    if (_is_digit(*format)) {
+      width = _atoi(&format);
+    }
+    else if (*format == '*') {
+      const int w = va_arg(va, int);
+      if (w < 0) {
+        flags |= FLAGS_LEFT;    // reverse padding
+        width = 0U - (unsigned int)w;  // magnitude in unsigned arithmetic: negating INT_MIN as an int would overflow
+      }
+      else {
+        width = (unsigned int)w;
+      }
+      format++;
+    }
+
+    // evaluate precision field
+    precision = 0U;
+    if (*format == '.') {
+      flags |= FLAGS_PRECISION;
+      format++;
+      if (_is_digit(*format)) {
+        precision = _atoi(&format);
+      }
+      else if (*format == '*') {
+        const int prec = (int)va_arg(va, int);
+        precision = prec > 0 ? (unsigned int)prec : 0U;  // a negative '*' precision is treated as 0
+        format++;
+      }
+    }
+
+    // evaluate length field
+    switch (*format) {
+      case 'l' :
+        flags |= FLAGS_LONG;
+        format++;
+        if (*format == 'l') {
+          flags |= FLAGS_LONG_LONG;
+          format++;
+        }
+        break;
+      case 'h' :
+        flags |= FLAGS_SHORT;
+        format++;
+        if (*format == 'h') {
+          flags |= FLAGS_CHAR;
+          format++;
+        }
+        break;
+#if defined(PRINTF_SUPPORT_PTRDIFF_T)
+      case 't' :
+        flags |= (sizeof(ptrdiff_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
+        format++;
+        break;
+#endif
+      case 'j' :
+        flags |= (sizeof(intmax_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
+        format++;
+        break;
+      case 'z' :
+        flags |= (sizeof(size_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
+        format++;
+        break;
+      default :
+        break;
+    }
+
+    // evaluate specifier
+    switch (*format) {
+      case 'd' :
+      case 'i' :
+      case 'u' :
+      case 'x' :
+      case 'X' :
+      case 'o' :
+      case 'b' : {
+        // set the base
+        unsigned int base;
+        if (*format == 'x' || *format == 'X') {
+          base = 16U;
+        }
+        else if (*format == 'o') {
+          base =  8U;
+        }
+        else if (*format == 'b') {
+          base =  2U;
+        }
+        else {
+          base = 10U;
+          flags &= ~FLAGS_HASH;   // no hash for dec format
+        }
+        // uppercase
+        if (*format == 'X') {
+          flags |= FLAGS_UPPERCASE;
+        }
+
+        // no plus or space flag for u, x, X, o, b
+        if ((*format != 'i') && (*format != 'd')) {
+          flags &= ~(FLAGS_PLUS | FLAGS_SPACE);
+        }
+
+        // ignore '0' flag when precision is given
+        if (flags & FLAGS_PRECISION) {
+          flags &= ~FLAGS_ZEROPAD;
+        }
+
+        // convert the integer
+        if ((*format == 'i') || (*format == 'd')) {
+          // signed: the magnitude is computed in unsigned arithmetic so the most negative value does not overflow
+          if (flags & FLAGS_LONG_LONG) {  // without PRINTF_SUPPORT_LONG_LONG nothing is printed and no argument is consumed
+#if defined(PRINTF_SUPPORT_LONG_LONG)
+            const long long value = va_arg(va, long long);
+            idx = _ntoa_long_long(out, buffer, idx, maxlen, (value < 0) ? (0ULL - (unsigned long long)value) : (unsigned long long)value, value < 0, base, precision, width, flags);
+#endif
+          }
+          else if (flags & FLAGS_LONG) {
+            const long value = va_arg(va, long);
+            idx = _ntoa_long(out, buffer, idx, maxlen, (value < 0) ? (0UL - (unsigned long)value) : (unsigned long)value, value < 0, base, precision, width, flags);
+          }
+          else {
+            const int value = (flags & FLAGS_CHAR) ? (char)va_arg(va, int) : (flags & FLAGS_SHORT) ? (short int)va_arg(va, int) : va_arg(va, int);
+            idx = _ntoa_long(out, buffer, idx, maxlen, (value < 0) ? (0U - (unsigned int)value) : (unsigned int)value, value < 0, base, precision, width, flags);
+          }
+        }
+        else {
+          // unsigned
+          if (flags & FLAGS_LONG_LONG) {  // without PRINTF_SUPPORT_LONG_LONG nothing is printed and no argument is consumed
+#if defined(PRINTF_SUPPORT_LONG_LONG)
+            idx = _ntoa_long_long(out, buffer, idx, maxlen, va_arg(va, unsigned long long), false, base, precision, width, flags);
+#endif
+          }
+          else if (flags & FLAGS_LONG) {
+            idx = _ntoa_long(out, buffer, idx, maxlen, va_arg(va, unsigned long), false, base, precision, width, flags);
+          }
+          else {
+            const unsigned int value = (flags & FLAGS_CHAR) ? (unsigned char)va_arg(va, unsigned int) : (flags & FLAGS_SHORT) ? (unsigned short int)va_arg(va, unsigned int) : va_arg(va, unsigned int);
+            idx = _ntoa_long(out, buffer, idx, maxlen, value, false, base, precision, width, flags);
+          }
+        }
+        format++;
+        break;
+      }
+#if defined(PRINTF_SUPPORT_FLOAT)
+      case 'f' :
+      case 'F' :
+        if (*format == 'F') flags |= FLAGS_UPPERCASE;
+        idx = _ftoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags);
+        format++;
+        break;
+#if defined(PRINTF_SUPPORT_EXPONENTIAL)
+      case 'e':
+      case 'E':
+      case 'g':
+      case 'G':
+        if ((*format == 'g')||(*format == 'G')) flags |= FLAGS_ADAPT_EXP;
+        if ((*format == 'E')||(*format == 'G')) flags |= FLAGS_UPPERCASE;
+        idx = _etoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags);
+        format++;
+        break;
+#endif  // PRINTF_SUPPORT_EXPONENTIAL
+#endif  // PRINTF_SUPPORT_FLOAT
+      case 'c' : {
+        unsigned int l = 1U;  // columns used so far: the character itself
+        // pre padding
+        if (!(flags & FLAGS_LEFT)) {
+          while (l++ < width) {
+            out(' ', buffer, idx++, maxlen);
+          }
+        }
+        // char output
+        out((char)va_arg(va, int), buffer, idx++, maxlen);
+        // post padding
+        if (flags & FLAGS_LEFT) {
+          while (l++ < width) {
+            out(' ', buffer, idx++, maxlen);
+          }
+        }
+        format++;
+        break;
+      }
+
+      case 's' : {
+        const char* p = va_arg(va, char*);
+        unsigned int l = _strnlen_s(p, (flags & FLAGS_PRECISION) ? precision : (size_t)-1);  // "%.0s" must not scan the string at all
+        // pre padding
+        if (flags & FLAGS_PRECISION) {
+          l = (l < precision ? l : precision);
+        }
+        if (!(flags & FLAGS_LEFT)) {
+          while (l++ < width) {
+            out(' ', buffer, idx++, maxlen);
+          }
+        }
+        // string output; the precision budget is checked before *p is read, so p[precision] is never touched
+        while ((!(flags & FLAGS_PRECISION) || precision--) && (*p != 0)) {
+          out(*(p++), buffer, idx++, maxlen);
+        }
+        // post padding
+        if (flags & FLAGS_LEFT) {
+          while (l++ < width) {
+            out(' ', buffer, idx++, maxlen);
+          }
+        }
+        format++;
+        break;
+      }
+
+      case 'p' : {
+        width = sizeof(void*) * 2U;  // two hex digits per byte of a pointer
+        flags |= FLAGS_ZEROPAD | FLAGS_UPPERCASE;
+#if defined(PRINTF_SUPPORT_LONG_LONG)
+        const bool is_ll = sizeof(uintptr_t) == sizeof(long long);
+        if (is_ll) {
+          idx = _ntoa_long_long(out, buffer, idx, maxlen, (uintptr_t)va_arg(va, void*), false, 16U, precision, width, flags);
+        }
+        else {
+#endif
+          idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)((uintptr_t)va_arg(va, void*)), false, 16U, precision, width, flags);
+#if defined(PRINTF_SUPPORT_LONG_LONG)
+        }
+#endif
+        format++;
+        break;
+      }
+
+      case '%' :
+        out('%', buffer, idx++, maxlen);
+        format++;
+        break;
+
+      default :  // unknown specifier: the character is emitted as is
+        out(*format, buffer, idx++, maxlen);
+        format++;
+        break;
+    }
+  }
+
+  // termination: placed at idx, or at the last slot (maxlen - 1) when the output was cut short
+  out((char)0, buffer, idx < maxlen ? idx : maxlen - 1U, maxlen);
+
+  // return the number of characters produced (including any dropped beyond maxlen), without the terminating \0
+  return (int)idx;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+
+int printf_(const char* format, ...)  // formats to the _putchar() sink; returns the character count reported by _vsnprintf
+{
+  va_list va;
+  va_start(va, format);
+  char buffer[1];  // never written by _out_char; only needs to be non-null so _vsnprintf keeps the given sink
+  const int ret = _vsnprintf(_out_char, buffer, (size_t)-1, format, va);  // (size_t)-1: no length limit
+  va_end(va);
+  return ret;
+}
+
+
+int sprintf_(char* buffer, const char* format, ...)  // formats into 'buffer' without any size limit; returns the character count
+{
+  va_list va;
+  va_start(va, format);
+  const int ret = _vsnprintf(_out_buffer, buffer, (size_t)-1, format, va);  // (size_t)-1: the caller guarantees enough room
+  va_end(va);
+  return ret;
+}
+
+
+int snprintf_(char* buffer, size_t count, const char* format, ...)  // formats into 'buffer', storing nothing at index 'count' or beyond
+{
+  va_list va;
+  va_start(va, format);
+  const int ret = _vsnprintf(_out_buffer, buffer, count, format, va);  // the result also counts characters that did not fit
+  va_end(va);
+  return ret;
+}
+
+
+int vprintf_(const char* format, va_list va)  // va_list variant of printf_: formats to the _putchar() sink
+{
+  char buffer[1];  // never written by _out_char; only needs to be non-null so _vsnprintf keeps the given sink
+  return _vsnprintf(_out_char, buffer, (size_t)-1, format, va);
+}
+
+
+int vsnprintf_(char* buffer, size_t count, const char* format, va_list va)  // va_list variant of snprintf_
+{
+  return _vsnprintf(_out_buffer, buffer, count, format, va);  // thin forwarder to the internal engine
+}
+
+
+int fctprintf(void (*out)(char character, void* arg), void* arg, const char* format, ...)  // formats through the callback 'out', passing 'arg' along
+{
+  va_list va;
+  va_start(va, format);
+  const out_fct_wrap_type out_fct_wrap = { out, arg };  // bundle callback and argument for _out_fct
+  const int ret = _vsnprintf(_out_fct, (char*)(uintptr_t)&out_fct_wrap, (size_t)-1, format, va);  // the wrapper travels in the buffer parameter
+  va_end(va);
+  return ret;
+}
diff --git a/example/printf.h b/example/printf.h
new file mode 100644
index 0000000..6104ccf
--- /dev/null
+++ b/example/printf.h
@@ -0,0 +1,117 @@
+///////////////////////////////////////////////////////////////////////////////
+// \author (c) Marco Paland (info@paland.com)
+//             2014-2019, PALANDesign Hannover, Germany
+//
+// \license The MIT License (MIT)
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// 
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+// \brief Tiny printf, sprintf and snprintf implementation, optimized for speed on
+//        embedded systems with very limited resources.
+//        Use this instead of bloated standard/newlib printf.
+//        These routines are thread safe and reentrant.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef _PRINTF_H_
+#define _PRINTF_H_
+
+#include <stdarg.h>
+#include <stddef.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/**
+ * Output a character to a custom device like UART, used by the printf() function
+ * This function is declared here only. You have to write your custom implementation somewhere
+ * \param character Character to output
+ */
+void _putchar(char character);
+
+
+/**
+ * Tiny printf implementation
+ * You have to implement _putchar if you use printf()
+ * To avoid conflicts with the regular printf() API it is overridden by macro defines
+ * and internal underscore-appended functions like printf_() are used
+ * \param format A string that specifies the format of the output
+ * \return The number of characters that are written into the array, not counting the terminating null character
+ */
+#define printf printf_
+int printf_(const char* format, ...);
+
+
+/**
+ * Tiny sprintf implementation
+ * Due to security reasons (buffer overflow) YOU SHOULD CONSIDER USING (V)SNPRINTF INSTEAD!
+ * \param buffer A pointer to the buffer where to store the formatted string. MUST be big enough to store the output!
+ * \param format A string that specifies the format of the output
+ * \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character
+ */
+#define sprintf sprintf_
+int sprintf_(char* buffer, const char* format, ...);
+
+
+/**
+ * Tiny snprintf/vsnprintf implementation
+ * \param buffer A pointer to the buffer where to store the formatted string
+ * \param count The maximum number of characters to store in the buffer, including a terminating null character
+ * \param format A string that specifies the format of the output
+ * \param va A value identifying a variable arguments list
+ * \return The number of characters that COULD have been written into the buffer, not counting the terminating
+ *         null character. A value equal or larger than count indicates truncation. Only when the returned value
+ *         is non-negative and less than count, the string has been completely written.
+ */
+#define snprintf  snprintf_
+#define vsnprintf vsnprintf_
+int  snprintf_(char* buffer, size_t count, const char* format, ...);
+int vsnprintf_(char* buffer, size_t count, const char* format, va_list va);
+
+
+/**
+ * Tiny vprintf implementation
+ * \param format A string that specifies the format of the output
+ * \param va A value identifying a variable arguments list
+ * \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character
+ */
+#define vprintf vprintf_
+int vprintf_(const char* format, va_list va);
+
+
+/**
+ * printf with output function
+ * You may use this as dynamic alternative to printf() with its fixed _putchar() output
+ * \param out An output function which takes one character and an argument pointer
+ * \param arg An argument pointer for user data passed to output function
+ * \param format A string that specifies the format of the output
+ * \return The number of characters that are sent to the output function, not counting the terminating null character
+ */
+int fctprintf(void (*out)(char character, void* arg), void* arg, const char* format, ...);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif  // _PRINTF_H_
diff --git a/example/putchar.c b/example/putchar.c
new file mode 100644
index 0000000..7753ccb
--- /dev/null
+++ b/example/putchar.c
@@ -0,0 +1,4 @@
+void _putchar(char character) {
+  // UART transmit register (THR) at 0x10000000. volatile: this is an MMIO
+  // store, so the compiler must not drop, merge or reorder it.
+  *(volatile char*)0x10000000 = character;
+}
diff --git a/src/debug.cpp b/src/debug.cpp
index 47cb85b..8fda93c 100644
--- a/src/debug.cpp
+++ b/src/debug.cpp
@@ -190,7 +190,10 @@ void GDBStub::handle_packet(const std::string &packet) {
       break;
 
     case 's':
-      vm.step();
+      try {
+        vm.step();
+      } catch (const EbreakException &ex) {
+      }
       send_packet("S05");  // TODO: step execution
       break;
 
@@ -222,7 +225,7 @@ void GDBStub::handle_packet(const std::string &packet) {
       // Insert breakpoint
       if (packet[1] == '0') {
         uint32_t addr =
-            std::stoul(packet.substr(3, packet.find(',')), nullptr, 16);
+            std::stoul(packet.substr(3, packet.find(',', 3)), nullptr, 16);
 
         if (breakpoints.count(addr) == 0) {
           uint32_t original_instr = vm.read_memory_word(addr);
@@ -237,7 +240,7 @@ void GDBStub::handle_packet(const std::string &packet) {
       // Delete breakpoint
       if (packet[1] == '0') {
         uint32_t addr =
-            std::stoul(packet.substr(3, packet.find(',')), nullptr, 16);
+            std::stoul(packet.substr(3, packet.find(',', 3)), nullptr, 16);
 
         if (breakpoints.count(addr) > 0) {
           // Restore the original instruction
diff --git a/src/elf.cpp b/src/elf.cpp
index ea65438..3062f41 100644
--- a/src/elf.cpp
+++ b/src/elf.cpp
@@ -95,7 +95,8 @@ std::vector<uint8_t> load_elf(const std::string& filename, size_t memory_size) {
   for (const Elf32Section& shdr : sectionHeaders) {
     const char* sectionName = &sectionStrTable[shdr.sh_name];
     if (std::strcmp(sectionName, ".text") == 0 ||
-        std::strcmp(sectionName, ".sdata") == 0) {
+        std::strcmp(sectionName, ".sdata") == 0 ||
+        std::strcmp(sectionName, ".rodata") == 0) {
       memoryEnd = std::max(memoryEnd, shdr.sh_addr + shdr.sh_size);
     }
   }
@@ -110,7 +111,8 @@ std::vector<uint8_t> load_elf(const std::string& filename, size_t memory_size) {
   for (const Elf32Section& shdr : sectionHeaders) {
     const char* sectionName = &sectionStrTable[shdr.sh_name];
     if (std::strcmp(sectionName, ".text") == 0 ||
-        std::strcmp(sectionName, ".sdata") == 0) {
+        std::strcmp(sectionName, ".sdata") == 0 ||
+        std::strcmp(sectionName, ".rodata") == 0) {
       std::vector<uint8_t> sectionData(shdr.sh_size);
       file.seekg(shdr.sh_offset);
       file.read(reinterpret_cast<char*>(&loadedData[shdr.sh_addr]),
diff --git a/src/rve.cpp b/src/rve.cpp
index 677218c..a8c1abe 100644
--- a/src/rve.cpp
+++ b/src/rve.cpp
@@ -10,7 +10,7 @@
 #include "vm.hpp"
 
 int main(int argc, char *argv[]) {
-  const size_t MEMORY_SIZE = 128 * 1024;
+  const size_t MEMORY_SIZE = 512 * 1024;
 
   bool debug = false;
   std::string program_filename = "";
@@ -46,10 +46,6 @@ int main(int argc, char *argv[]) {
       std::cerr << "Emulator error: " << e.what() << std::endl;
       return 1;
     }
-    std::vector res_mem = vm.read_memory(0x1000, 4);
-    uint32_t *res = (uint32_t *)&res_mem[0];
-
-    std::cout << "result: " << *res << std::endl;
   } else {
     // to debug, do: "set debug remote 1" in gdb
     // and then "target remote :1234"
diff --git a/src/vm.cpp b/src/vm.cpp
index 8e34c17..b5f86f4 100644
--- a/src/vm.cpp
+++ b/src/vm.cpp
@@ -12,9 +12,39 @@ inline int32_t sign_extend(int32_t value, int bits) {
   return (value ^ mask) - mask;
 }
 
+uint8_t UART::read_register(uint32_t address) {
+  // LSR always reports the transmitter-empty bit (always ready to
+  // transmit); every other register offset reads back as zero.
+  if (address == UART_LSR) {
+    return LSR_TRANSMITTER_EMPTY;
+  }
+
+  return 0;
+}
+
+void UART::write_register(uint32_t address, uint8_t value) {
+  // Only THR has an effect: the byte is written to std::cout. Writes to
+  // any other register offset are ignored.
+  if (address == UART_THR) {
+    std::cout.put(static_cast<char>(value));
+  }
+}
+
+bool UART::is_transmitter_ready() {
+  return (read_register(UART_LSR) & LSR_TRANSMITTER_EMPTY) != 0;
+}
+
 VM::VM(const std::vector<uint8_t>& memory, const std::string& file_path)
     : memory_(memory), file_path(file_path) {}
 
+void VM::setreg(int regnum, uint32_t value) {
+  // Register 0 is never written (x0 is hard-wired to zero in RISC-V);
+  // every other register simply takes the new value.
+  if (regnum != 0) {
+    registers[regnum] = value;
+  }
+}
+
 std::vector<uint8_t> VM::read_memory(size_t start, size_t size) {
   if (start + size > memory_.size()) {
     return std::vector<uint8_t>(size, 0);
@@ -23,12 +53,64 @@ std::vector<uint8_t> VM::read_memory(size_t start, size_t size) {
                               memory_.begin() + start + size);
 }
 
-uint32_t VM::read_memory_word(size_t pos) { return *(uint32_t*)&memory_[pos]; }
+uint32_t VM::read_memory_word(size_t pos) {
+  // Subtract instead of `pos + 3` so a huge pos cannot wrap past the check.
+  if (pos >= memory_.size() || memory_.size() - pos < sizeof(uint32_t))
+    throw std::runtime_error("Memory access out of bounds");
+  return *(uint32_t*)&memory_[pos];
+}
+
+uint16_t VM::read_memory_half_word(size_t pos) {
+  // Subtract instead of `pos + 1` so a huge pos cannot wrap past the check.
+  if (pos >= memory_.size() || memory_.size() - pos < sizeof(uint16_t)) {
+    throw std::runtime_error("Memory access out of bounds");
+  }
+  return *(uint16_t*)&memory_[pos];
+}
+
+uint8_t VM::read_memory_byte(size_t pos) {
+  // Single-byte load from RAM; positions past the end are rejected.
+  if (!(pos < memory_.size())) {
+    throw std::runtime_error("Memory access out of bounds");
+  }
+  return memory_[pos];
+}
 
 void VM::write_memory_word(size_t pos, uint32_t value) {
+  // A 32-bit store touches pos..pos+3; all four bytes must be in range.
+  if (pos >= memory_.size() || memory_.size() - pos < sizeof(uint32_t))
+    throw std::runtime_error("Memory access out of bounds");
   *(uint32_t*)&memory_[pos] = value;
 }
 
+void VM::write_memory_half_word(size_t pos, uint16_t value) {
+  // A 16-bit store touches pos and pos+1; both bytes must be in range.
+  if (pos >= memory_.size() || memory_.size() - pos < sizeof(uint16_t))
+    throw std::runtime_error("Memory access out of bounds");
+  *(uint16_t*)&memory_[pos] = value;
+}
+
+void VM::write_memory_byte(size_t pos, uint8_t value) {
+  // Stores that hit the memory-mapped window never reach RAM: they are
+  // forwarded to the UART when inside its 8 registers, else dropped.
+  if (is_mmap(pos, 1)) {
+    const bool hits_uart = pos >= UART_ADDR && pos < UART_ADDR + 8;
+    if (hits_uart) uart.write_register(pos - UART_ADDR, value);
+    return;
+  }
+  if (pos >= memory_.size()) {
+    throw std::runtime_error("Memory access out of bounds");
+  }
+  memory_[pos] = value;
+}
+
+bool VM::is_mmap(size_t pos, size_t size) {
+  // True iff [pos, pos + size) overlaps the UART window
+  // [UART_ADDR, UART_ADDR + 8). A range that ends exactly at UART_ADDR
+  // does not touch the window and is therefore not memory-mapped.
+  return pos < UART_ADDR + 8 && pos + size > UART_ADDR;
+}
+
 uint32_t VM::read_register(size_t regnum) {
   if (regnum == 32) return pc;
 
@@ -42,12 +124,9 @@ uint32_t VM::read_register(size_t regnum) {
 const std::string& VM::get_file_path() { return file_path; }
 
 void VM::step() {
-  size_t memory_size = memory_.size();
-  uint8_t* memory = &memory_[0];
-
-  uint32_t instr = *(uint32_t*)&memory[pc];
+  uint32_t instr = read_memory_word(pc);  // bounds-checked fetch: throws if pc is outside RAM
   // std::cout << "pc: " << std::hex << pc << std::dec << "\n";
-  //  std::cout << "instr: " << std::hex << instr << "\n";
+  // std::cout << "instr: " << std::hex << instr << "\n";
   pc += 4;
 
   // Decode instruction
@@ -63,26 +142,26 @@ void VM::step() {
     case 0x33: {  // R-type
       if (funct7 == 0x00) {
         if (funct3 == 0x0) {  // ADD
-          registers[rd] = registers[rs1] + registers[rs2];
+          setreg(rd, registers[rs1] + registers[rs2]);
         } else if (funct3 == 0x04) {  // XOR
-          registers[rd] = registers[rs1] ^ registers[rs2];
+          setreg(rd, registers[rs1] ^ registers[rs2]);
         } else if (funct3 == 0x06) {  // OR
-          registers[rd] = registers[rs1] | registers[rs2];
+          setreg(rd, registers[rs1] | registers[rs2]);
         } else if (funct3 == 0x07) {  // AND
-          registers[rd] = registers[rs1] & registers[rs2];
+          setreg(rd, registers[rs1] & registers[rs2]);
         } else if (funct3 == 0x01) {  // SLL
-          registers[rd] = registers[rs1] << registers[rs2];
+          setreg(rd, registers[rs1] << (registers[rs2] & 0x1F));
         } else if (funct3 == 0x05) {  // SRL
           uint32_t value = registers[rs1];
           uint32_t shift_amount = registers[rs2] & 0x1F;
-          registers[rd] = value << shift_amount;
+          setreg(rd, value >> shift_amount);
         } else if (funct3 == 0x02) {  // SLT
           registers[rd] = (static_cast<int32_t>(registers[rs1]) <
                            static_cast<int32_t>(registers[rs2]))
                               ? 0
                               : 1;
         } else if (funct3 == 0x03) {  // SLTU
-          registers[rd] = (registers[rs1] < registers[rs2]) ? 1 : 0;
+          setreg(rd, (registers[rs1] < registers[rs2]) ? 1 : 0);
         } else {
           throw std::runtime_error("Unknown R-type instruction");
         }
@@ -93,7 +172,7 @@ void VM::step() {
           // Only the lower 5 bits are used for shift
           int32_t value = static_cast<int32_t>(registers[rs1]);
           int32_t shift_amount = registers[rs2] & 0x1F;
-          registers[rd] = value >> shift_amount;
+          setreg(rd, value >> shift_amount);
         } else {
           throw std::runtime_error("Unknown R-type instruction");
         }
@@ -102,50 +181,50 @@ void VM::step() {
           int64_t result =
               static_cast<int64_t>(static_cast<int32_t>(registers[rs1])) *
               static_cast<int64_t>(static_cast<int32_t>(registers[rs2]));
-          registers[rd] = static_cast<uint32_t>(result);
+          setreg(rd, static_cast<uint32_t>(result));
         } else if (funct3 == 0x1) {  // MULH
           int64_t result =
               static_cast<int64_t>(static_cast<int32_t>(registers[rs1])) *
               static_cast<int64_t>(static_cast<int32_t>(registers[rs2]));
-          registers[rd] = static_cast<uint32_t>(result >> 32);
+          setreg(rd, static_cast<uint32_t>(result >> 32));
         } else if (funct3 == 0x2) {  // MULSU
           int64_t result =
               static_cast<int64_t>(static_cast<int32_t>(registers[rs1])) *
               static_cast<uint64_t>(registers[rs2]);
-          registers[rd] = static_cast<uint32_t>(result >> 32);
+          setreg(rd, static_cast<uint32_t>(result >> 32));
         } else if (funct3 == 0x3) {  // MULU
           uint64_t result = static_cast<uint64_t>(registers[rs1]) *
                             static_cast<uint64_t>(registers[rs2]);
-          registers[rd] = static_cast<uint32_t>(result >> 32);  // Upper 32 bits
-        } else if (funct3 == 0x4) {                             // DIV
+          setreg(rd, static_cast<uint32_t>(result >> 32));  // Upper 32 bits
+        } else if (funct3 == 0x4) {                         // DIV
           int32_t dividend = static_cast<int32_t>(registers[rs1]);
           int32_t divisor = static_cast<int32_t>(registers[rs2]);
           if (divisor == 0) {
-            registers[rd] = -1;  // Division by zero result
+            setreg(rd, -1);  // Division by zero result
           } else if (dividend == INT32_MIN && divisor == -1) {
-            registers[rd] = dividend;  // Overflow case
+            setreg(rd, dividend);  // Overflow case
           } else {
-            registers[rd] = dividend / divisor;
+            setreg(rd, dividend / divisor);
           }
         } else if (funct3 == 0x5) {  // DIVU
           uint32_t dividend = registers[rs1];
           uint32_t divisor = registers[rs2];
-          registers[rd] = (divisor == 0) ? UINT32_MAX : dividend / divisor;
+          setreg(rd, (divisor == 0) ? UINT32_MAX : dividend / divisor);
         } else if (funct3 == 0x6) {  // REM
           int32_t dividend = static_cast<int32_t>(registers[rs1]);
           int32_t divisor = static_cast<int32_t>(registers[rs2]);
           if (divisor == 0) {
-            registers[rd] =
-                dividend;  // Remainder with zero divisor is the dividend
+            setreg(rd,
+                   dividend);  // Remainder with zero divisor is the dividend
           } else if (dividend == INT32_MIN && divisor == -1) {
-            registers[rd] = 0;  // Overflow case
+            setreg(rd, 0);  // Overflow case
           } else {
-            registers[rd] = dividend % divisor;
+            setreg(rd, dividend % divisor);
           }
         } else if (funct3 == 0x7) {  // REMU
           uint32_t dividend = registers[rs1];
           uint32_t divisor = registers[rs2];
-          registers[rd] = (divisor == 0) ? dividend : dividend % divisor;
+          setreg(rd, (divisor == 0) ? dividend : dividend % divisor);
         } else {
           throw std::runtime_error("Unknown R-type instruction");
         }
@@ -154,20 +233,48 @@ void VM::step() {
       }
       break;
     }
-    case 0x13: {                           // I-type (ADDI)
+    case 0x13: {                           // I-type (ADDI and friends)
       imm = sign_extend(instr >> 20, 12);  // Extract 12-bit immediate
       if (funct3 == 0x0) {                 // ADDI
-        registers[rd] = registers[rs1] + imm;
+        setreg(rd, registers[rs1] + imm);
+      } else if (funct3 == 0x4) {  // XORI
+        setreg(rd, registers[rs1] ^ imm);
+      } else if (funct3 == 0x6) {  // ORI
+        setreg(rd, registers[rs1] | imm);
+      } else if (funct3 == 0x07) {  // ANDI
+        setreg(rd, registers[rs1] & imm);
+      } else if (funct3 == 0x01) {
+        if (((imm >> 5) & 0x7f) == 0x0) {  // SLLI
+          uint32_t value = registers[rs1];
+          uint32_t shift_amount = imm & 0x1F;
+          setreg(rd, value << shift_amount);
+        } else {
+          throw std::runtime_error("Unknown I-type instruction");
+        }
+      } else if (funct3 == 0x05) {
+        if (((imm >> 5) & 0x7f) == 0x20) {  // SRAI
+          int32_t value = static_cast<int32_t>(registers[rs1]);
+          int32_t shift_amount = imm & 0x1F;
+          setreg(rd, value >> shift_amount);
+        } else if (((imm >> 5) & 0x7f) == 0x0) {  // SRLI
+          uint32_t value = registers[rs1];
+          uint32_t shift_amount = imm & 0x1F;
+          setreg(rd, value >> shift_amount);
+        } else {
+          throw std::runtime_error("Unknown I-type instruction");
+        }
       } else {
         throw std::runtime_error("Unknown I-type instruction");
       }
       break;
     }
     case 0x63: {  // B-type (branches)
-      imm = ((instr >> 7) & 0x1E) | ((instr >> 20) & 0x7E0) |
-            ((instr >> 19) & 0x800) | ((instr >> 31) << 12);
-      imm = sign_extend(imm, 13);  // Sign-extend 13-bit immediate
-      if (funct3 == 0x0) {         // BEQ
+      imm = ((int64_t)(int32_t)(instr & 0x80000000) >> 19) |
+            ((instr & 0x80) << 4)      // imm[11]
+            | ((instr >> 20) & 0x7e0)  // imm[10:5]
+            | ((instr >> 7) & 0x1e);
+
+      if (funct3 == 0x0) {  // BEQ
         if (registers[rs1] == registers[rs2]) {
           pc += imm - 4;  // Offset PC (adjust for pre-increment)
         }
@@ -198,24 +305,21 @@ void VM::step() {
       imm = sign_extend(instr >> 20, 12);  // Extract 12-bit immediate
       if (funct3 == 0x00) {                // LB
         uint32_t addr = registers[rs1] + imm;
-        if (addr + 1 > memory_size) {
-          throw std::runtime_error("Memory access out of bounds");
-        }
-        registers[rd] = 0;
-        std::memcpy(&registers[rd], memory + addr, sizeof(uint8_t));
+        // LB sign-extends the loaded byte (LBU below zero-extends).
+        setreg(rd, sign_extend(read_memory_byte(addr), 8));
       } else if (funct3 == 0x01) {  // LH
         uint32_t addr = registers[rs1] + imm;
-        if (addr + 2 > memory_size) {
-          throw std::runtime_error("Memory access out of bounds");
-        }
-        registers[rd] = 0;
-        std::memcpy(&registers[rd], memory + addr, sizeof(uint16_t));
+        // LH sign-extends the loaded half-word (LHU below zero-extends).
+        setreg(rd, sign_extend(read_memory_half_word(addr), 16));
       } else if (funct3 == 0x2) {  // LW
         uint32_t addr = registers[rs1] + imm;
-        if (addr + 4 > memory_size) {
-          throw std::runtime_error("Memory access out of bounds");
-        }
-        std::memcpy(&registers[rd], memory + addr, sizeof(uint32_t));
+        setreg(rd, read_memory_word(addr));
+      } else if (funct3 == 0x4) {  // LBU
+        uint32_t addr = registers[rs1] + imm;
+        setreg(rd, read_memory_byte(addr));
+      } else if (funct3 == 0x5) {  // LHU
+        uint32_t addr = registers[rs1] + imm;
+        setreg(rd, read_memory_half_word(addr));
       } else {
         throw std::runtime_error("Unknown load instruction");
       }
@@ -226,22 +330,19 @@ void VM::step() {
       imm = sign_extend(imm, 12);  // Sign-extend 12-bit immediate
       if (funct3 == 0x0) {         // SB
         uint32_t addr = registers[rs1] + imm;
-        if (addr + 1 > memory_size) {
-          throw std::runtime_error("Memory access out of bounds");
-        }
-        std::memcpy(memory + addr, &registers[rs2], sizeof(uint8_t));
+        write_memory_byte(addr, registers[rs2]);
       } else if (funct3 == 0x1) {  // SH
         uint32_t addr = registers[rs1] + imm;
-        if (addr + 2 > memory_size) {
+        if (addr >= memory_.size() || memory_.size() - addr < 2) {
           throw std::runtime_error("Memory access out of bounds");
         }
-        std::memcpy(memory + addr, &registers[rs2], sizeof(uint16_t));
+        std::memcpy(&memory_[addr], &registers[rs2], sizeof(uint16_t));
       } else if (funct3 == 0x2) {  // SW
         uint32_t addr = registers[rs1] + imm;
-        if (addr + 4 > memory_size) {
+        if (addr >= memory_.size() || memory_.size() - addr < 4) {
           throw std::runtime_error("Memory access out of bounds");
         }
-        std::memcpy(memory + addr, &registers[rs2], sizeof(uint32_t));
+        std::memcpy(&memory_[addr], &registers[rs2], sizeof(uint32_t));
       } else {
         throw std::runtime_error("Unknown store instruction");
       }
@@ -255,7 +356,7 @@ void VM::step() {
           ((instr >> 20) & 0x1) << 11 |   // imm[11]
           ((instr & 0xFF000));            // imm[19:12]
 
-      registers[rd] = pc;  // Save return address
+      setreg(rd, pc);  // Save return address
       pc += offset - 4;
       break;
     }
@@ -264,22 +365,22 @@ void VM::step() {
       uint32_t target =
           (registers[rs1] + offset) & ~1;  // Target address (LSB cleared)
 
-      registers[rd] = pc;  // Save return address
+      setreg(rd, pc);  // Save return address
       pc = target;
       break;
     }
     case 0x37: {                               // LUI
       uint32_t imm = (instr >> 12) & 0xFFFFF;  // Extract 20-bit immediate
-      registers[rd] =
-          imm
-          << 12;  // Shift the immediate to the upper 20 bits of the register
+      setreg(rd,
+             imm << 12);  // Shift the immediate to the upper 20 bits of the
+                          // register
       break;
     }
     case 0x17: {                               // AUIPC
       uint32_t imm = (instr >> 12) & 0xFFFFF;  // Extract 20-bit immediate
-      registers[rd] =
-          pc +
-          (imm << 12);  // Add the immediate (shifted left) to the current PC
+      setreg(rd,
+             pc - 4 + (imm << 12));  // Add the immediate (shifted left) to
+                                     // the current PC
       break;
     }
     case 0x73: {  // EBREAK
diff --git a/src/vm.hpp b/src/vm.hpp
index b81652d..75c62cd 100644
--- a/src/vm.hpp
+++ b/src/vm.hpp
@@ -8,6 +8,27 @@
 class EbreakException : std::exception {};
 
 const int NUM_REGISTERS = 32;  // Standard RISC-V has 32 registers
+const int UART_ADDR = 0x10000000;
+
+class UART {  // Minimal UART model: THR writes go to std::cout, LSR always reads "transmitter empty"
+ public:
+  uint8_t read_register(uint32_t address);  // address: register offset (compared against Registers)
+  void write_register(uint32_t address, uint8_t value);
+  // Tests the LSR_TRANSMITTER_EMPTY bit of the value read from UART_LSR.
+  bool is_transmitter_ready();
+
+ private:
+  enum Registers {
+    UART_RBR = 0x00,  // Receiver Buffer Register
+    UART_THR = 0x00,  // Transmitter Holding Register
+    UART_LSR = 0x05   // Line Status Register
+  };
+
+  // Line Status Register bits
+  enum LSRBits { LSR_TRANSMITTER_EMPTY = 0x20 };
+
+  uint8_t registers[8] = {0};  // NOTE(review): not referenced by the UART methods defined in vm.cpp
+};
 
 class VM {
  public:
@@ -17,17 +38,29 @@ class VM {
   void eval();
 
   std::vector<uint8_t> read_memory(size_t start, size_t size);
+
   uint32_t read_memory_word(size_t pos);
+  uint16_t read_memory_half_word(size_t pos);
+  uint8_t read_memory_byte(size_t pos);
+
   void write_memory_word(size_t pos, uint32_t value);
+  void write_memory_half_word(size_t pos, uint16_t value);
+  void write_memory_byte(size_t pos, uint8_t value);
+
+  bool is_mmap(size_t pos, size_t size);
 
   uint32_t read_register(size_t regnum);
 
   const std::string &get_file_path();
 
+  void setreg(int regnum, uint32_t value);
+
  private:
   std::vector<uint8_t> memory_;
 
   uint32_t registers[NUM_REGISTERS] = {0};
   uint32_t pc = 0;
   std::string file_path;
+
+  UART uart;
 };