Second generation of the parser

This commit is contained in:
Konstantin Nazarov 2021-07-10 13:29:16 +00:00
parent 9dc13bc2bc
commit 632c454116
Signed by: knazarov
GPG key ID: 4CFE0A42FA409C22
2 changed files with 337 additions and 202 deletions

View file

@ -1,235 +1,212 @@
# markdown implementation in awk
# references:
# - https://gist.github.com/xdanger/116153
# - https://github.com/nuex/zodiac/blob/master/lib/markdown.awk
# - https://dataswamp.org/~solene/2019-08-26-minimal-markdown.html
BEGIN { BEGIN {
si = 0; body = ""
stack[si] = "body"; in_code = 0
val[si] = 0
res[si] = ""
i = 0;
} }
function peek(num) { function parse_header(str) {
return substr($0, i, num); match($0, /#+/);
hnum = RLENGTH;
content = parse_block(substr(str, hnum + 1, length(str) - hnum ));
return "<h" hnum ">" content "</h" hnum ">";
} }
function push(block) { function read_line(str, pos, res, i) {
stack[++si] = block; res = "";
res[si] = ""; for (i=pos; i<=length(str); i++) {
val[si] = 0; if (substr(str, i, 1) == "\n")
return res;
res = res substr(str, i, 1);
}
return res;
} }
function pop(str) { function find(str, s, i, sl, j) {
res[si-1] = res[si-1] str sl = length(s);
res[si] = "" for (j = i; j <= length(str); j++) {
val[si] = 0 if (substr(str, j, sl) == s)
si-- return j;
if (stack[si] == "body") {
printf(res[si]);
res[si] = ""
} }
return 0;
} }
function handle_code() { function startswith(str, s, sl, j) {
if (peek(1) == "`") { sl = length(s);
i = i + 1; for (j = 1; j <= length(str); j++) {
pop("<code>" res[si] "</code>") if (substr(str, j, sl) == s)
return; return j;
if (substr(str, j, 1) != " ")
return 0;
} }
if ($0 == "") { return 0;
pop("`" res[si])
return;
}
if (i > length($0)) {
next;
}
if (i == 1 && length(res[si]) > 0)
res[si] = res[si] " ";
res[si] = res[si] peek(1)
i++;
} }
#function parse_list_item(str,
function handle_code_long() { function fold_lines(arr, i, result) {
if (peek(3) == "```") { for (i in arr) {
i = i + 3; if (result != "")
pop("<code>" res[si] "</code>") result = result " " arr[i];
return;
}
if (i > length($0)) {
res[si] = res[si] "\n";
next;
}
res[si] = res[si] peek(1)
i++;
}
function handle_inline_code() {
if (peek(1) == "`") {
i = i + 1;
pop("<code>" res[si] "</code>")
return;
}
if (i > length($0)) {
pop("`" res[si]);
return;
}
res[si] = res[si] peek(1)
i++;
}
function handle_inline_code_long() {
if (peek(3) == "```") {
i = i + 3;
pop("<code>" res[si] "</code>")
return;
}
if (i > length($0)) {
pop("```" res[si]);
return;
}
res[si] = res[si] peek(1)
i++;
}
function handle_inline_strong() {
if (peek(2) == "**") {
i = i + 2;
pop("<strong>" res[si] "</strong>")
return;
}
if (peek(3) == "```") {
i = i + 3;
push("inline_code_long");
return;
}
if (peek(1) == "`") {
i = i + 1;
push("inline_code");
return;
}
if (i > length($0)) {
pop("**" res[si]);
return;
}
res[si] = res[si] peek(1)
i++;
}
function handle_inline() {
if (peek(2) == "**") {
i = i + 2;
push("inline_strong");
return;
}
if (peek(3) == "```") {
i = i + 3;
push("inline_code_long");
return;
}
res[si] = res[si] peek(1);
i++;
}
function handle_block() {
if (peek(3) == "```") {
i = i + 3;
push("code_long");
return;
}
if (peek(1) == "`") {
i = i + 1;
push("code");
return;
}
res[si] = res[si] peek(1);
i++;
}
function handle_paragraph() {
if (i == 1 && ($0 == "" || peek(1) == "#")) {
if (length(res[si]) > 0)
pop("<p>" res[si] "</p>\n");
else else
pop(""); result = arr[i];
if (peek(1) == "#")
return;
next;
} }
if (i == 1 && length(res[si]) > 0)
res[si] = res[si] " ";
handle_block();
if (i > length($0))
next;
} }
function handle_header() { function parse_list(str, buf, result, i, ind, line, lines, indent) {
if (i == 1) { result = "<ul>\n";
match($0, /#+/); buf = "";
val[si] = RLENGTH;
i = RLENGTH + 1; print "parse: " str ">" startswith(str, "* ")
return; split(str, lines, "\n");
for (i in lines) {
line = lines[i];
} }
if (i>length($0)) { indent = 0;
pop("<h" val[si] ">" res[si] "</h" val[si] ">\n"); for (i in lines) {
next; line = lines[i];
} print "line: " line " " startswith(line, "* ") " " indent
ind = startswith(line, "* ");
if (indent == 0 && ind > 0) {
indent = ind;
}
else if (indent > 0 && ind > 0 && ind <= indent) {
if (length(buf) > 0) {
result = result "<li>" parse_list(buf) "</li>\n";
buf = "";
}
}
if (length(buf) > 0)
buf = buf "\n";
handle_inline(); if (ind > 0 && ind <= indent) {
buf = buf substr(line, ind+2, length(line) - 2);
}
else
buf = buf line;
}
if (length(buf) > 0) {
result = result "<li>" parse_list(buf) "</li>\n";
}
result = result "</ul>";
return result;
} }
function handle_body() { function parse_block(str, result, end, i) {
if (peek(1) == "#") { #print "block '" str "'"
push("header"); result = ""
return;
if (substr(str, 1, 2) == "* ") {
return parse_list(str);
}
for (i=1; i<=length(str); i++) {
if (substr(str, i, 2) == "**") {
end = find(str, "**", i+2);
if (end != 0) {
result = result "<strong>" parse_block(substr(str, i+2, end - i - 2)) "</strong>";
i = end+1;
}
else {
result = result "**";
i++;
}
}
else if (substr(str, i, 3) == "```") {
end = find(str, "```", i+3);
if (end != 0) {
result = result "<code>" substr(str, i+3, end - i - 3) "</code>";
i = end+1;
}
else {
result = result "```";
i=i+2;
}
}
else if (substr(str, i, 1) == "`") {
end = find(str, "`", i+1);
}
else {
if (substr(str, i, 1) == "\n") {
if (length(result) > 0)
result = result " ";
}
else {
result = result substr(str, i, 1);
}
}
}
#print "block result '" result "'"
return result;
}
function parse_paragraph(str) {
if (substr(str, 1, 2 ) == "* ") {
return parse_block(str);
}
else {
return "<p>" parse_block(str) "</p>";
}
}
function parse_body(str) {
if (substr(str, 1, 1) == "#") {
print(parse_header(str));
} }
else { else {
push("paragraph"); print(parse_paragraph(str));
return; }
}
/^#/ {
if (body != "") {
parse_body(body);
}
parse_body($0);
body = "";
next;
}
/^$/ {
if (body == "")
next;
if (startswith(body, "```") == 1) {
body = body "\n";
next;
}
parse_body(body);
body = "";
next;
}
/```/ {
if (startswith(body, "```") == 1) {
if (body != "")
body = body "\n";
print "<code>" substr(body, 4, length(body)-3) "</code>";
body = "";
next;
} }
} }
// { // {
i = 1; if (body != "")
body = body "\n" $0;
else
body = $0;
while (1) { next;
if (stack[si] == "body")
handle_body();
else if (stack[si] == "paragraph")
handle_paragraph();
else if (stack[si] == "header")
handle_header();
else if (stack[si] == "inline_strong")
handle_inline_strong();
else if (stack[si] == "inline_code_long")
handle_inline_code_long();
else if (stack[si] == "inline_code")
handle_inline_code();
else if (stack[si] == "code_long")
handle_code_long();
else if (stack[si] == "code")
handle_code();
}
} }
END { END {
#print res[si]; if (body != "") {
#newblock(); parse_body(body);
}
} }

158
test.sh Executable file
View file

@ -0,0 +1,158 @@
#!/bin/bash
set -e
# Run a single markdown-to-HTML test case supplied on stdin.
#
# Stdin format: the markdown input lines, then a line containing exactly
# "---", then the expected HTML lines. The input is run through markdown.awk
# (resolved relative to the current directory) and the result is compared
# byte-for-byte with the expected text.
#
# Prints "SUCCESS" on a match. On a mismatch prints "FAIL" followed by the
# input, expected and actual text, then exits the script with status 1.
# If awk itself fails, the script exits with awk's status.
# Temporary files are removed on every path.
check() {
    local input expected_output output current line result status
    input="$(mktemp)"
    expected_output="$(mktemp)"
    output="$(mktemp)"
    current="input"
    # "IFS=" makes read return each line verbatim (the previous IFS='$\n' was
    # the three literal characters $, \ and n, not a newline). The "|| -n"
    # clause keeps a final line that lacks a trailing newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
        if [[ "$line" == "---" ]]; then
            current="output"
        elif [[ "$current" == "input" ]]; then
            printf '%s\n' "$line" >> "$input"
        else
            # Quoted printf: no word splitting or glob expansion of the
            # expected text, and no misparsing of lines such as "-n".
            printf '%s\n' "$line" >> "$expected_output"
        fi
    done
    # Handle an awk failure explicitly so the temp files are not leaked when
    # the script runs under "set -e".
    status=0
    awk -f markdown.awk "$input" >> "$output" || status=$?
    if [[ "$status" -ne 0 ]]; then
        rm -f "$input" "$expected_output" "$output"
        exit "$status"
    fi
    result="success"
    if ! cmp -s "$output" "$expected_output"; then
        echo "FAIL"
        echo "--- input"
        cat "$input"
        echo "--- expected"
        cat "$expected_output"
        echo "--- got"
        cat "$output"
        echo "---"
        result="fail"
    else
        echo "SUCCESS"
    fi
    rm -f "$input" "$expected_output" "$output"
    if [[ "$result" == "fail" ]]; then
        exit 1
    fi
}
# Each case below feeds check a heredoc: markdown input, a "---" separator
# line, then the expected HTML output.

# A single line of text becomes one paragraph.
check <<-EOF
This is a simple sentence.
---
<p>This is a simple sentence.</p>
EOF
# Consecutive lines are joined with a space into one paragraph.
check <<-EOF
This is a
simple sentence.
---
<p>This is a simple sentence.</p>
EOF
# Two paragraphs are expected as two separate <p> elements.
# NOTE(review): no blank separator line is visible between the two input
# lines here - confirm the heredoc contains one.
check <<-EOF
First paragraph.
Second paragraph.
---
<p>First paragraph.</p>
<p>Second paragraph.</p>
EOF
# A header line followed by body text; the expected header keeps the space
# that follows the "#" marker.
check <<-EOF
# Header
body
---
<h1> Header</h1>
<p>body</p>
EOF
# Header level follows the number of leading "#" characters.
check <<-EOF
# Header1
## Header2
### Header3
---
<h1> Header1</h1>
<h2> Header2</h2>
<h3> Header3</h3>
EOF
# Text wrapped in "**" becomes <strong>.
check <<-EOF
**bold**
---
<p><strong>bold</strong></p>
EOF
# A bold span may continue across a line break.
check <<-EOF
**bold
multiline**
---
<p><strong>bold multiline</strong></p>
EOF
# An unterminated "**" is emitted literally.
check <<-EOF
**bold
---
<p>**bold</p>
EOF
# Fenced code block. The quoted "EOF" delimiter stops the shell from
# expanding the backticks inside the heredoc.
check <<-"EOF"
```
first line of code
second line of code
```
---
<code>
first line of code
second line of code
</code>
EOF
# An unterminated code fence is expected to fall back to paragraph text.
check <<-"EOF"
```
first line of code
second line of code
---
<p>``` first line of code second line of code</p>
EOF
# A paragraph followed by a bullet list.
check <<-"EOF"
asdf
* foo
* bar
---
<p>asdf</p>
<ul>
<li>foo</li>
<li>bar</li>
</ul>
EOF
# A line following a list item is folded into that item.
check <<-"EOF"
asdf
* foo
* bar
qux
---
<p>asdf</p>
<ul>
<li>foo</li>
<li>bar qux</li>
</ul>
EOF
# Reached only when every check above succeeded, since check exits the
# script on the first failure.
echo
echo "All tests passed"