aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore4
-rw-r--r--COPYING13
-rw-r--r--Makefile3
-rw-r--r--README.md5
-rwxr-xr-xconfigure4
-rw-r--r--package/info3
-rw-r--r--package/targets.mk3
-rw-r--r--src/triple9
-rw-r--r--src/triple-rdfa32
-rw-r--r--src/triple-text116
-rw-r--r--src/triple-turtle215
11 files changed, 407 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ace3b6e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.idea
+.policy
+build
+*.tar.gz
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..ab60150
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,13 @@
+Copyright (c) 2022 D. Olsson <hi@senzilla.io>
+
+Permission to use, copy, modify, and distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..65491dc
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: ISC
+
+.include <sz.script.mk>
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5219e45
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+triple - rdfa parser written in awk
+===================================
+
+An RDFa parser written in awk, capable of outputting the results in
+different formats such as Turtle - Terse RDF Triple Language.
diff --git a/configure b/configure
new file mode 100755
index 0000000..7c3a6b9
--- /dev/null
+++ b/configure
@@ -0,0 +1,4 @@
+#!/bin/sh
+# SPDX-License-Identifier: ISC
+# Hand over to sz-configure-script, forwarding every argument unchanged.
+exec sz-configure-script "$@"
diff --git a/package/info b/package/info
new file mode 100644
index 0000000..c05c0be
--- /dev/null
+++ b/package/info
@@ -0,0 +1,3 @@
+package=triple
+version=0.0.1
+category=textproc
diff --git a/package/targets.mk b/package/targets.mk
new file mode 100644
index 0000000..27a6d88
--- /dev/null
+++ b/package/targets.mk
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: ISC
+
+BIN_TARGETS= triple triple-text triple-turtle triple-rdfa
diff --git a/src/triple b/src/triple
new file mode 100644
index 0000000..288bb3e
--- /dev/null
+++ b/src/triple
@@ -0,0 +1,9 @@
+#!@SHEBANGDIR@/execlineb -S1
+# SPDX-License-Identifier: ISC
+# Export the input path as IRI and stat's output (presumably the mtime, BSD stat syntax) as DATEMODIFIED, then pipe lowdown's HTML into triple-rdfa.
+export IRI $1
+backtick DATEMODIFIED { stat -f %m $1 }
+pipeline {
+ lowdown --html-no-head-ids --html-no-escapehtml --html-no-num-ent --html-no-skiphtml $1
+}
+triple-rdfa
diff --git a/src/triple-rdfa b/src/triple-rdfa
new file mode 100644
index 0000000..e57423d
--- /dev/null
+++ b/src/triple-rdfa
@@ -0,0 +1,32 @@
+#!@SHEBANGDIR@/awk -f
+# SPDX-License-Identifier: ISC
+# vi: syntax=awk
+
+function path(s) { sub(/\/?index\.[\.a-z]+$/, "", s); return s } # Strip a trailing "[/]index.<ext...>" from s.
+function prop(t, v) { sub(/ $/, "", t); return t " property=\"schema:" v "\"" } # Tag text t plus property="schema:<v>", minus one trailing space of t.
+
+function lang(s, e) { # Language code for IRI s: from "index.<lang>.<ext>", else the first two characters of LANG, else "en".
+ if (match(s, /index\.([a-z]+)\.[a-z]+$/)) return substr(s, RSTART + 6, 2)
+ if (e = substr(ENVIRON["LANG"], 1, 2)) return e # Strings are 1-based; a start of 0 yields only one character in POSIX awks.
+ return "en"
+}
+
+BEGIN { # Print the opening <article> wrapper and the modification date, then switch to tag-wise records.
+ f = "%Y-%m-%dT%H:%M:%S%Z" # NOTE(review): %Z is strftime's zone abbreviation, not a numeric offset — confirm consumers accept it.
+ m = ENVIRON["DATEMODIFIED"] ? strftime(f, ENVIRON["DATEMODIFIED"]) : strftime(f) # Without DATEMODIFIED, use the current time.
+ i = ENVIRON["IRI"] ? "/" path(ENVIRON["IRI"]) : "#" # Resource: "/" plus the trimmed IRI, or "#" when IRI is unset.
+ l = ENVIRON["IRI"] ? lang(ENVIRON["IRI"]) : "en"
+ t = ENVIRON["TYPEOF"] ? ENVIRON["TYPEOF"] : "blogposting" # Type name placed after "schema:"; TYPEOF overrides the default.
+ print "<article typeof=\"schema:" t "\" resource=\"" i "\" lang=\"" l "\">"
+ print "<meta property=\"schema:datemodified\" content=\"" m "\"/>"
+ RS = "<"; ORS = "<"; FS = "(>|/>)"; OFS = ">" # Each record is one tag: $1 is the tag text, $2 the text that follows it.
+}
+
+END { ORS = "\n"; print "/article>" } # The "<" written as ORS after the last record completes "</article>".
+/^h1/ && h1++ == 0 { $1 = prop($1, "headline") } # Each counter lets only the first matching tag be annotated.
+/^p>[A-Z]/ && p++ == 0 { $1 = prop($1, "abstract") } # First attribute-less <p> whose text starts with A-Z.
+/^img/ && img++ == 0 { $1 = prop($1, "image") }
+/^address/ && address++ == 0 { $1 = prop($1, "author") }
+/^time/ && time++ == 0 { $1 = prop($1, "datepublished") }
+/^(br|hr|img|link|meta)/ { sub(/[^\/]$/, "&/", $1) } # XXX: Close void tags
+{ print }
diff --git a/src/triple-text b/src/triple-text
new file mode 100644
index 0000000..071a419
--- /dev/null
+++ b/src/triple-text
@@ -0,0 +1,116 @@
+#!@SHEBANGDIR@/awk -f
+# SPDX-License-Identifier: ISC
+# vi: syntax=awk
+
+function begobj(s, p, o) { OPEN = o; OBJC[s, p]++ } # Open an object of kind o and start a new object slot for (s, p).
+function endall() { endbase(); endsubj() } # Reset directive, subject, predicate and object state.
+function endbase() { BASE = 0 } # Leave @base/@prefix directive mode.
+function endobj() { OPEN = 0 } # Close the current object.
+function endpred() { PRED = 0; endobj() } # Forget the current predicate and any open object.
+function endsubj() { SUBJ = 0; endpred() } # Forget the current subject and everything below it.
+
+function addobj(s, p, o) { # Append text o to the latest object of subject s, predicate p.
+ if (!OBJC[s, p]) OBJC[s, p] = 1 # No object opened via begobj() yet: use slot 1.
+ OBJS[s, p, OBJC[s, p]] = OBJS[s, p, OBJC[s, p]] o
+}
+
+function begpred(f) { # Take field f as the current predicate, then blank the field.
+ $f = tolower($f)
+ sub(":", ".", $f) # Only the first ":" becomes ".".
+ PRED = PREDS[$f] = $f
+ $f = ""
+}
+
+function begsubj(f, a) { # Register field f as a new subject; a is its first character ("<" or "_").
+ if (a == "<") s = substr($f, 2, length($f) - 2) # <iri>: drop the enclosing angle brackets.
+ else if (a == "_") s = substr($f, 3, length($f) - 2) # _:label: drop the "_:" prefix.
+ SUBJC++; SUBJ = SUBJS[SUBJC] = s; $f = "" # NOTE: s is not a declared local, so it is a global.
+}
+
+BEGIN { BR = "---"; HEAD = 1; endall() } # BR is the line that ends the header; start in header mode with clean state.
+
+END { # Print every collected triple as a tab-separated line between two BR markers, then the body.
+ print BR
+ for (i = 1; i <= SUBJC; i++) for (p in PREDS) if (OBJC[SUBJS[i], p]) # Subjects in input order; predicates in awk's for-in order.
+ for (ii = 1; ii <= OBJC[SUBJS[i], p]; ii++)
+ print SUBJS[i] "\t" p "\t" OBJS[SUBJS[i], p, ii]
+ print BR
+ ORS = ""; if (B && !ENVIRON["LIST"]) print B # Body is omitted when LIST is set; B already carries its record separators.
+}
+
+BR == $0 { HEAD = 0; next } # A line equal to BR ends header mode; the marker line itself is never buffered.
+!HEAD { B = B $0 RS } # After the header, every line is buffered as the body.
+
+HEAD { # Header mode: walk the fields of each line and collect subjects, predicates and objects.
+ for (f = 1; f <= NF; f++) {
+ # Process terminals.
+ if (!OPEN && $f == ";") {
+ endpred(); continue
+ }
+ if (!OPEN && $f == ".") {
+ endall(); continue
+ }
+ # Process subjects and predicates.
+ if ($f == "@base" || $f == "@prefix") BASE = 1
+ if (BASE) continue
+ if (!SUBJ) {
+ a = substr($f, 1, 1) # First character; strings are 1-based and a start of 0 yields "" in POSIX awks.
+ if (a == "<" || a == "_") {
+ begsubj(f, a); continue
+ }
+ }
+ if (SUBJ && !PRED) {
+ begpred(f); continue
+ }
+ # Process objects.
+ if (!SUBJ || !PRED) continue
+ if (PRED == "a") {
+ addobj(SUBJ, PRED, $f); continue
+ }
+ # XXX: Open object here means quoted string. Return the space.
+ if (OPEN) addobj(SUBJ, PRED, NEWLINE ? "\\n" : " ")
+ # TODO: Support blankNodePropertyList and collections
+ # TODO: Support language tags
+ split($f, OBJ, "") # Scan the field one character at a time.
+ for (o = 1; o <= length(OBJ); o++) {
+ if (!OPEN && OBJ[o] == ";") {
+ endpred(); break
+ }
+ if (!OPEN && OBJ[o] == ".") {
+ endall(); break
+ }
+ if (!OPEN && OBJ[o] == "\"") {
+ begobj(SUBJ, PRED, "str"); continue
+ }
+ else if (!OPEN && OBJ[o] == "<") {
+ begobj(SUBJ, PRED, "iri"); continue
+ }
+ else if (!OPEN && OBJ[o] == "_") {
+ begobj(SUBJ, PRED, "blk"); continue
+ }
+ # TODO: Comprehensive escape sequencing. Look forward.
+ if (!ESCAPE && OBJ[o] == "\\") {
+ ESCAPE = 1; continue
+ }
+ if (OPEN == "blk" && OBJ[o] == ":") continue
+ if (OPEN == "blk" && OBJ[o] == ",") {
+ endobj(); break
+ }
+ if (OPEN == "blk" && OBJ[o] == ".") {
+ endall(); break
+ }
+ if (!OPEN && OBJ[o] == ",") break
+ if (OPEN == "str" && OBJ[o] == "\"" && !ESCAPE) {
+ endobj(); break
+ }
+ if (OPEN == "iri" && OBJ[o] == ">" && !ESCAPE) {
+ endobj(); break
+ }
+ if (OPEN) addobj(SUBJ, PRED, OBJ[o])
+ ESCAPE = 0
+ }
+ NEWLINE = (OPEN == "str" && f == NF) ? 1 : 0 # A string still open at the last field continues on the next line.
+ # XXX: Blank nodes end at the field boundary. No delimiter.
+ if (OPEN == "blk") endobj()
+ }
+}
diff --git a/src/triple-turtle b/src/triple-turtle
new file mode 100644
index 0000000..bb6db72
--- /dev/null
+++ b/src/triple-turtle
@@ -0,0 +1,215 @@
+#!@SHEBANGDIR@/awk -f
+# SPDX-License-Identifier: ISC
+# vi: syntax=awk
+
+function inarray(v, a, k) { for (k in a) if (a[k] == v) return k } # A key of a whose element equals v; empty when none does.
+function node() { return NSTACK[length(NSTACK)] } # Top of the node stack. NOTE(review): reading a missing element creates it — confirm empty-stack calls are harmless.
+function parentnode() { return NSTACK[length(NSTACK)-1] } # Element just below the top of the node stack.
+function pred() { return PSTACK[length(PSTACK)] } # Top of the predicate stack.
+function stackpred(p) { PSTACK[length(PSTACK)+1] = HASH[node()] = p } # Push p and record it against the current node.
+function subj() { return SSTACK[length(SSTACK)] } # Top of the subject stack.
+function trim(s) { gsub(/(^[[:space:]]+|[[:space:]]+$)/, "", s); return s } # Strip leading and trailing whitespace.
+function unstacknode() { delete NSTACK[length(NSTACK)] } # Pop the node stack.
+function unstackpred() { delete PSTACK[length(PSTACK)] } # Pop the predicate stack.
+function unstacksubj() { delete SSTACK[length(SSTACK)]; WRAPPER = 1 } # Pop the subject stack and set WRAPPER.
+
+function addobj(s, p, o, t, d, a, x, k) { # Store object o (term kind t, datatype d) for subject s / predicate p; with a set, append to slot 1 using separator x.
+ PRED[p] = 1; TERM[s, p] = t; DATA[s, p] = d
+ a ? INDEX[s, p] = 1 : ++INDEX[s, p] # Appending always targets slot 1; otherwise allocate the next slot.
+ k = s SUBSEP p SUBSEP INDEX[s, p]
+ # XXX: Let authoritative files overwrite string objects.
+ if (HASH[s, p, 1] && t == "str") {
+ if (inarray(s, AUTH) && AUTH[FILE] != s) return # s is recorded in AUTH, but not for the current file: keep the existing value.
+ INDEX[s, p] = 1
+ k = s SUBSEP p SUBSEP 1
+ }
+ o = trim(o)
+ if (a) HASH[k] = HASH[k] ? HASH[k] x o : o
+ else HASH[k] = o
+}
+
+function att(s, a, skip) { # Value of attribute a="..." found in tag text s; empty when absent or valueless.
+ if (!match(s, a "=\"[^\"]+")) return
+ skip = length(a) + 2; return substr(s, RSTART+skip, RLENGTH-skip) # Skip the name, the "=" and the opening quote.
+}
+
+function escape(s) { # Prepare s for use inside a double-quoted literal.
+ gsub(/"/, "\\\"", s) # Backslash-escape double quotes.
+ gsub(/ *\\n */, "\\\\n", s) # Rewrite each literal "\n" sequence together with the spaces around it.
+ return s
+}
+
+function fmtliteral(d, o, s, x, n) { # Markup for TAG in datatype d (o: 1 opening, 0 closing) followed by text s.
+ if (!o && HASH[node()] != 1) return # XXX: Only close what's open.
+ x = D[d, TAG, o]
+ n = substr(parentnode(), 1, index(parentnode(), SUBSEP) - 1) # Parent tag name: node ids are "<tag> SUBSEP <count>".
+ if (d == "html" && $1 && !x) x = "<" $1 ">"
+ # XXX: There be dragons!
+ if (d == "man") {
+ if (o && (TAG == "h1" || TAG == "h2" || TAG == "q"))
+ s = toupper(s)
+ if (o && TAG == "cite" && match(s, /\.[1-9]$/))
+ s = substr(s, 1, RSTART-1) " " substr(s, RSTART+1, 1) # "name.N" becomes "name N"; a start of 0 would drop a character in POSIX awks.
+ # XXX: Unwrap nested inline tags into its parent.
+ if (n && !match(n, BLOCKTAG) && !match(TAG, BLOCKTAG))
+ x = o ? substr(x, index(x, ".")+1) : ""
+ # XXX: Unwrap content into immediately empty lists.
+ if (n && match(n, LISTTAG) && HASH[parentnode(), "imm"])
+ x = o ? substr(x, index(x, ".")+1) : ""
+ gsub(/[\.']/, "\\\\\\\\\\&&", s) # XXX: Escape mdoc delimiters.
+ }
+ else
+ if (o && TAG == "cite" && match(s, /\.[1-9]$/))
+ s = substr(s, 1, RSTART-1) "(" substr(s, RSTART+1, 1) ")" # "name.N" becomes "name(N)".
+ return x s
+}
+
+function fmtterm(s, t, d, i) { # Render term s of kind t ("iri", "blk", anything else is a string) with optional datatype IRI d.
+ if (t == "iri") return "<" s ">"
+ if (t == "blk") return "_:" s
+ i = d ? "^^<" d ">" : ""
+ gsub(/^(\\n)*/, "", s) # Strip leading literal "\n" sequences.
+ return "\"" escape(unescape(trim(s))) "\"" i
+}
+
+function opentag(s, i) { # Set TAG from the first word of s; return 1 when that word contains no "/", else 0.
+ split(s, a, " ") # a is a global scratch array; a[1] is the tag word without attributes.
+ i = index(a[1], "/")
+ TAG = substr(a[1], i+1) # NOTE(review): a word such as "br/" yields an empty TAG — confirm such input cannot occur.
+ return !i
+}
+
+function predobjlist(s, o, i, ol, pl) { # Predicate-object list for subject s, led by its "a <type>" line.
+ for (p in PRED) # p is a global; predicates come in awk's for-in order.
+ if (INDEX[s, p]) {
+ for (i = 1; i <= INDEX[s, p]; i++) {
+ o = fmtterm(HASH[s, p, i], TERM[s, p], DATA[s, p])
+ ol = (i == 1) ? o : ol ", " o # Several objects of one predicate are comma-separated.
+ }
+ pl = pl " ;\n\t" p " " ol
+ }
+ return "\ta " TYPEOF[s] pl " ."
+}
+
+function stacknode(t) { # Push a node id "<tag> SUBSEP <count>" for tag t; n is a global.
+ HASH[t]++; n = t SUBSEP HASH[t]; HASH[n] = 1
+ NSTACK[length(NSTACK)+1] = n
+ HASH[n, "imm"] = $2 ? 0 : 1 # XXX: Record immediate emptiness.
+}
+
+function stacksubj(n, s, to, t, i) { # Push subject s (type to, term kind t) opened by node n.
+ # XXX: Find the right index to update, if any
+ for (i = 1; i <= length(SUBJ); i++) if (SUBJ[i] == s) break # When s is new, i ends one past the last index.
+ TYPEOF[s] = to; TERM[s] = t
+ if (!AUTH[FILE]) AUTH[FILE] = s # The first subject seen in a file is recorded in AUTH for that file.
+ SSTACK[length(SSTACK)+1] = SUBJ[i] = HASH[n] = s
+ WRAPPER = 1
+}
+
+function unescape(s) { # Turn &lt; &gt; &quot; back into characters, but only when DATATYPE is "man".
+ # Every other datatype gets the text back untouched.
+ if (ENVIRON["DATATYPE"] != "man") return s
+ gsub(/&lt;/, "<", s)
+ gsub(/&gt;/, ">", s)
+ gsub(/&quot;/, "\"", s)
+ return s
+}
+
+function unstack() { # Pop the current node, first popping the subject or predicate recorded against it, if any.
+ if (subj() == HASH[node()]) unstacksubj()
+ else if (pred() == HASH[node()]) unstackpred()
+ unstacknode()
+}
+
+BEGIN { # Set up tag-wise records and the markup table D: D[d] is the datatype IRI, D[d, tag, 1] / D[d, tag, 0] the opening / closing markup.
+ RS = "<"; FS = ">"; FILE = 0 # Each record is one tag: $1 is the tag text, $2 the text that follows it.
+ BLOCKTAG = "^(address|aside|blockquote|d[dlt]|div|h[1-6]|li|[ou]l|p|pre|section)"
+ LISTTAG = "^(dt|li)"
+ VOIDTAG = "^(br|hr|img|link|meta)"
+ D["html"] = "https://www.w3.org/tr/html5/#"
+ D["html","b",1] = "<b>-"; D["html","b",0] = "</b>"
+
+ D["md"] = "https://spec.commonmark.org#"
+ D["md","b",1] = "**"; D["md","b",0] = "**"
+ D["md","cite",1] = "`"; D["md","cite",0] = "`"
+ D["md","code",1] = "`"; D["md","code",0] = "`"
+ D["md","em",1] = "*"; D["md","em",0] = "*"
+ D["md","i",1] = "*"; D["md","i",0] = "*"
+ D["md","mark",1] = "**"; D["md","mark",0] = "**"
+ D["md","q",1] = "“"; D["md","q",0] = "”"
+ D["md","strong",1] = "**"; D["md","strong",0] = "**"
+ D["md","var",1] = "`"; D["md","var",0] = "`"
+ D["md","h1",1] = "\\n\\n# "; D["md","h2",1] = "\\n\\n## "
+ D["md","h3",1] = "\\n\\n### "; D["md","h4",1] = "\\n\\n#### "
+ D["md","h5",1] = "\\n\\n##### "; D["md","h6",1] = "\\n\\n###### "
+ D["md","dd",1] = "\\n: "; D["md","dt",1] = "\\n\\n"
+ D["md","ol",1] = "\\n"; D["md","ul",1] = "\\n"
+ D["md","li",1] = "\\n- "; D["md","p",1] = "\\n\\n"
+ D["md","pre",1] = "\\n\\n```"; D["md","pre",0] = "```"
+
+ D["man"] = "https://man.openbsd.org/mdoc.7#"
+ D["man","b",1] = "\\n.Fl "; D["man","b",0] = "\\n"
+ D["man","cite",1] = "\\n.Xr "; D["man","cite",0] = "\\n"
+ D["man","code",1] = "\\n.Va "; D["man","code",0] = "\\n"
+ D["man","em",1] = "\\n.Em "; D["man","em",0] = "\\n"
+ D["man","i",1] = "\\n.Ar "; D["man","i",0] = "\\n"
+ D["man","mark",1] = "\\n.Nm "; D["man","mark",0] = "\\n"
+ D["man","q",1] = "\\n.Sx "; D["man","q",0] = "\\n"
+ D["man","strong",1] = "\\n.Sy "; D["man","strong",0] = "\\n"
+ D["man","var",1] = "\\n.Ev "; D["man","var",0] = "\\n"
+ D["man","h1",1] = "\\n.Sh "; D["man","h2",1] = "\\n.Sh "
+ D["man","h3",1] = "\\n.Ss "; D["man","h4",1] = "\\n.Ss "
+ D["man","h5",1] = "\\n.Ss "; D["man","h6",1] = "\\n.Ss "
+ D["man","dl",1] = "\\n.Bl -tag -width indent"; D["man","dl",0] = "\\n.El"
+ D["man","dt",1] = "\\n.It "; D["man","dt",0] = "\\n"
+ D["man","ol",1] = "\\n.Bl -enum"; D["man","ul",1] = "\\n.Bl -dash"
+ D["man","li",1] = "\\n.It\\n"; D["man","p",1] = "\\n.Pp\\n"
+ D["man","pre",1] = "\\n.Bd -literal -offset indent\\n"
+ D["man","pre",0] = "\\n.Ed"
+}
+
+END { # Print the prefix and optional base, one block per subject, a "---" line, then the buffered body.
+ print "@prefix schema: <https://schema.org/> ."
+ if (ENVIRON["BASE"]) print "@base <" ENVIRON["BASE"] "> ."
+ for (i = 1; i <= length(SUBJ); i++)
+ print "\n" fmtterm(SUBJ[i], TERM[SUBJ[i]]) "\n" predobjlist(SUBJ[i])
+ print "---"
+ ORS = ""; if (B && !ENVIRON["LIST"]) print B # Body is omitted when LIST is set.
+}
+
+/^!--/ { next } # Skip records that start an HTML comment.
+FNR == 1 { FILE++ } # Count input files; FILE is the index into AUTH.
+opentag($1) { WRAPPER = 0; stacknode(TAG) } # Every opening tag pushes a node.
+
+pred() { # While a predicate is stacked: append this record's text to that predicate's literal.
+ d = ENVIRON["DATATYPE"]; o = opentag($1)
+ x = (!d || o && !match(TAG, BLOCKTAG)) ? " " : "" # Space-join when no datatype is set, or at a non-block opening tag.
+ addobj(subj(), pred(), fmtliteral(d, o, $2), "str", D[d], 1, x)
+}
+
+match($1, /property="[^"]+/) { # Tag carries a property: take the object from an attribute, else from the text that follows.
+ p = substr($0, RSTART+10, RLENGTH-10); s = subj(); o = "" # $1 is a prefix of $0, so RSTART applies to both.
+ if (o = att($0, "content")) addobj(s, p, o, "str")
+ else if (o = att($0, "resource")) addobj(s, p, o, "iri")
+ else if (o = att($0, "href")) addobj(s, p, o, "iri")
+ else if (o = att($0, "src")) addobj(s, p, o, "iri")
+ else if (o = att($0, "datetime")) addobj(s, p, o, "iri") # NOTE(review): datetime values are emitted as IRIs — confirm this is intended.
+ else if (match($0, /typeof=/)) addobj(s, p, node(), "blk") # A nested typeof makes the current node the object.
+ else {
+ if ($2) addobj(s, p, $2, "str")
+ stackpred(p) # Text in following records is collected by the pred() rule.
+ }
+}
+
+match($1, /typeof="[^"]+/) { # Tag opens a new subject: an IRI from resource/href, else a blank node named after the current node.
+ to = substr($0, RSTART+8, RLENGTH-8); n = node()
+ if (s = att($0, "resource")) stacksubj(n, s, to, "iri")
+ else if (s = att($0, "href")) stacksubj(n, s, to, "iri")
+ else stacksubj(n, n, to, "blk")
+ if (o = att($0, "lang")) addobj(subj(), "schema:inlanguage", o, "str") # subj() is the subject just pushed; s is empty in the blank-node case.
+}
+
+!opentag($1) { unstack() } # Closing tag: pop its node.
+match(TAG, VOIDTAG) { unstack() } # Tags matching VOIDTAG are popped in the same record that pushed them.
+WRAPPER || /^meta/ || /^link/ { next } # XXX: No meta data in the body
+subj() && $0 { B = B RS $0 } # Inside a subject: buffer the record, restoring the "<" consumed as RS.