diff --git a/markdown.awk b/markdown.awk
index 7a85d20..79c7399 100644
--- a/markdown.awk
+++ b/markdown.awk
@@ -1,235 +1,212 @@
-# markdown implementation in awk
-# references:
-# - https://gist.github.com/xdanger/116153
-# - https://github.com/nuex/zodiac/blob/master/lib/markdown.awk
-# - https://dataswamp.org/~solene/2019-08-26-minimal-markdown.html
-
BEGIN {
- si = 0;
- stack[si] = "body";
- val[si] = 0
- res[si] = ""
- i = 0;
+ # body buffers the lines of the block currently being read; the rules
+ # below append to it and hand it to parse_body() once the block ends.
+ # in_code starts at 0; NOTE(review): no use of it is visible in this
+ # hunk - confirm it is still needed.
+ body = ""
+ in_code = 0
}
-function peek(num) {
- return substr($0, i, num);
+function parse_header(str) {
+ match($0, /#+/);
+ hnum = RLENGTH;
+
+ content = parse_block(substr(str, hnum + 1, length(str) - hnum ));
+ return "" res[si] "
")
- return;
+# startswith(str, s): skip leading space characters in str and test whether
+# s begins at the first non-space position.
+# Returns the 1-based index at which s begins, or 0 when a different
+# character is reached first or str is exhausted. Only " " is skipped;
+# any other character ends the scan.
+# Callers in this file compare the result with 1, i.e. they require s at
+# the very start of str.
+# sl and j are extra parameters used as local variables.
+function startswith(str, s, sl, j) {
+ sl = length(s);
+ for (j = 1; j <= length(str); j++) {
+ if (substr(str, j, sl) == s)
+ return j;
+ if (substr(str, j, 1) != " ")
+ return 0;
}
- if ($0 == "") {
- pop("`" res[si])
- return;
- }
- if (i > length($0)) {
- next;
- }
-
- if (i == 1 && length(res[si]) > 0)
- res[si] = res[si] " ";
-
- res[si] = res[si] peek(1)
- i++;
+ return 0;
}
+#function parse_list_item(str,
-function handle_code_long() {
- if (peek(3) == "```") {
- i = i + 3;
- pop("" res[si] "
")
- return;
- }
- if (i > length($0)) {
- res[si] = res[si] "\n";
- next;
- }
-
- res[si] = res[si] peek(1)
- i++;
-}
-
-function handle_inline_code() {
- if (peek(1) == "`") {
- i = i + 1;
- pop("" res[si] "
")
- return;
- }
- if (i > length($0)) {
- pop("`" res[si]);
- return;
- }
-
- res[si] = res[si] peek(1)
- i++;
-}
-
-function handle_inline_code_long() {
- if (peek(3) == "```") {
- i = i + 3;
- pop("" res[si] "
")
- return;
- }
- if (i > length($0)) {
- pop("```" res[si]);
- return;
- }
-
- res[si] = res[si] peek(1)
- i++;
-}
-
-function handle_inline_strong() {
- if (peek(2) == "**") {
- i = i + 2;
- pop("" res[si] "")
- return;
- }
- if (peek(3) == "```") {
- i = i + 3;
- push("inline_code_long");
- return;
- }
- if (peek(1) == "`") {
- i = i + 1;
- push("inline_code");
- return;
- }
- if (i > length($0)) {
- pop("**" res[si]);
- return;
- }
-
- res[si] = res[si] peek(1)
- i++;
-}
-
-function handle_inline() {
- if (peek(2) == "**") {
- i = i + 2;
- push("inline_strong");
- return;
- }
- if (peek(3) == "```") {
- i = i + 3;
- push("inline_code_long");
- return;
- }
-
- res[si] = res[si] peek(1);
- i++;
-}
-
-function handle_block() {
- if (peek(3) == "```") {
- i = i + 3;
- push("code_long");
- return;
- }
- if (peek(1) == "`") {
- i = i + 1;
- push("code");
- return;
- }
- res[si] = res[si] peek(1);
- i++;
-}
-
-function handle_paragraph() {
- if (i == 1 && ($0 == "" || peek(1) == "#")) {
- if (length(res[si]) > 0)
- pop("
" res[si] "
\n"); +function fold_lines(arr, i, result) { + for (i in arr) { + if (result != "") + result = result " " arr[i]; else - pop(""); - - if (peek(1) == "#") - return; - - next; + result = arr[i]; } - - if (i == 1 && length(res[si]) > 0) - res[si] = res[si] " "; - - handle_block(); - - if (i > length($0)) - next; } -function handle_header() { - if (i == 1) { - match($0, /#+/); - val[si] = RLENGTH; - i = RLENGTH + 1; - return; +function parse_list(str, buf, result, i, ind, line, lines, indent) { + result = "" substr(str, i+3, end - i - 3) "
";
+ i = end+1;
+ }
+ else {
+ result = result "```";
+ i=i+2;
+ }
+ }
+ else if (substr(str, i, 1) == "`") {
+ end = find(str, "`", i+1);
+
+ }
+ else {
+ if (substr(str, i, 1) == "\n") {
+ if (length(result) > 0)
+ result = result " ";
+ }
+ else {
+ result = result substr(str, i, 1);
+ }
+ }
+ }
+ #print "block result '" result "'"
+ return result;
+}
+
+function parse_paragraph(str) {
+ if (substr(str, 1, 2 ) == "* ") {
+ return parse_block(str);
+ }
+ else {
+ return "" parse_block(str) "
"; + } +} + +function parse_body(str) { + if (substr(str, 1, 1) == "#") { + print(parse_header(str)); } else { - push("paragraph"); - return; + print(parse_paragraph(str)); + } +} + +/^#/ { + if (body != "") { + parse_body(body); + } + parse_body($0); + body = ""; + next; +} + +/^$/ { + if (body == "") + next; + + if (startswith(body, "```") == 1) { + body = body "\n"; + next; + } + + parse_body(body); + body = ""; + next; +} + +/```/ { + if (startswith(body, "```") == 1) { + if (body != "") + body = body "\n"; + + print "" substr(body, 4, length(body)-3) "
";
+ body = "";
+ next;
}
}
// {
- i = 1;
+ # Catch-all rule: the empty pattern matches every record that reaches
+ # it. Append the current line to body, separated from earlier lines by
+ # a newline, then move on to the next record.
+ if (body != "")
+ body = body "\n" $0;
+ else
+ body = $0;
- while (1) {
- if (stack[si] == "body")
- handle_body();
- else if (stack[si] == "paragraph")
- handle_paragraph();
- else if (stack[si] == "header")
- handle_header();
- else if (stack[si] == "inline_strong")
- handle_inline_strong();
- else if (stack[si] == "inline_code_long")
- handle_inline_code_long();
- else if (stack[si] == "inline_code")
- handle_inline_code();
- else if (stack[si] == "code_long")
- handle_code_long();
- else if (stack[si] == "code")
- handle_code();
- }
+ next;
}
-
END {
- #print res[si];
- #newblock();
+ # Input is exhausted: hand any text still buffered in body to
+ # parse_body() so the final block is not lost.
+ if (body != "") {
+ parse_body(body);
+ }
}
diff --git a/test.sh b/test.sh
new file mode 100755
index 0000000..183e88e
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,158 @@
+#!/bin/bash
+
+set -e
+
+# check: run one markdown.awk test case read from standard input.
+#
+# Standard input holds the markdown source, a line consisting solely of
+# "---", and then the expected output. The source is rendered with
+# `awk -f markdown.awk` and compared byte for byte against the expectation.
+# Prints SUCCESS, or FAIL followed by the input, the expected output and
+# the actual output. On a mismatch the whole script exits with status 1.
+check() {
+    # Keep the helper's working variables out of the global scope.
+    local input expected_output output current line result
+
+    input="$(mktemp)"
+    expected_output="$(mktemp)"
+    output="$(mktemp)"
+
+    current="input"
+    # An empty IFS makes read return every line verbatim. The former value
+    # '$\n' was the three literal characters '$', '\' and 'n', which read
+    # then treated as field delimiters.
+    while IFS= read -r line; do
+        if [[ "$line" == "---" ]]; then
+            current="output"
+        elif [[ "$current" == "input" ]]; then
+            # printf instead of echo so lines such as "-n" are written as-is.
+            printf '%s\n' "$line" >> "$input"
+        else
+            # Quoted so the expected text is neither word-split nor
+            # glob-expanded (an unquoted "* foo" would be expanded against
+            # the working directory).
+            printf '%s\n' "$line" >> "$expected_output"
+        fi
+    done
+
+    awk -f markdown.awk "$input" >> "$output"
+
+    result="success"
+
+    if ! cmp -s "$output" "$expected_output"; then
+        echo "FAIL"
+        echo "--- input"
+        cat "$input"
+        echo "--- expected"
+        cat "$expected_output"
+        echo "--- got"
+        cat "$output"
+        echo "---"
+        result="fail"
+    else
+        echo "SUCCESS"
+    fi
+
+    rm -- "$input" "$expected_output" "$output"
+
+    # Abort the whole run on the first failing case.
+    if [[ "$result" == "fail" ]]; then
+        exit 1
+    fi
+}
+
+check <<-EOF
+This is a simple sentence.
+---
+This is a simple sentence.
+EOF + + +check <<-EOF +This is a +simple sentence. +--- +This is a simple sentence.
+EOF + +check <<-EOF +First paragraph. + +Second paragraph. +--- +First paragraph.
+Second paragraph.
+EOF + +check <<-EOF +# Header +body +--- +body
+EOF + +check <<-EOF +# Header1 +## Header2 +### Header3 +--- +bold
+EOF + +check <<-EOF +**bold +multiline** +--- +bold multiline
+EOF + +check <<-EOF +**bold +--- +**bold
+EOF + +check <<-"EOF" +``` +first line of code + +second line of code +``` +--- +
+first line of code
+
+second line of code
+
+EOF
+
+check <<-"EOF"
+```
+first line of code
+
+second line of code
+---
+``` first line of code second line of code
+EOF + +check <<-"EOF" +asdf + +* foo +* bar +--- +asdf
+asdf
+