2025-12-01

This commit is contained in:
2026-03-17 14:58:51 -06:00
parent 183e865f8b
commit 4b82b57113
6846 changed files with 954887 additions and 162606 deletions
@@ -0,0 +1,3 @@
"""
Various commandline tools for working with RDFLib
"""
@@ -0,0 +1,135 @@
"""
This file provides a single function `serialize_in_chunks()` which can serialize a
Graph into a number of NT files with a maximum number of triples or maximum file size.
There is an option to preserve any prefixes declared for the original graph in the first
file, which will be a Turtle file.
"""
from __future__ import annotations
from contextlib import ExitStack, contextmanager
from pathlib import Path
from typing import TYPE_CHECKING, BinaryIO, Generator, Optional, Tuple
from rdflib.graph import Graph
from rdflib.plugins.serializers.nt import _nt_row
# from rdflib.term import Literal
# if TYPE_CHECKING:
# from rdflib.graph import _TriplePatternType
__all__ = ["serialize_in_chunks"]
def serialize_in_chunks(
    g: Graph,
    max_triples: int = 10000,
    max_file_size_kb: Optional[int] = None,
    file_name_stem: str = "chunk",
    output_dir: Optional[Path] = None,
    write_prefixes: bool = False,
) -> None:
    """
    Serializes a given Graph into a series of n-triples with a given length.

    :param g:
        The graph to serialize.

    :param max_file_size_kb:
        Maximum size per NT file in kB (1,000 bytes).

    :param max_triples:
        Maximum size per NT file in triples
        Equivalent to lines in file.

        If both this parameter and max_file_size_kb are set, max_file_size_kb will be used.
        If the graph holds no more than max_triples triples, it is written to a
        single ``<file_name_stem>_all.nt`` file instead of numbered chunks.

    :param file_name_stem:
        Prefix of each file name.
        e.g. "chunk" = chunk_000001.nt, chunk_000002.nt...

    :param output_dir:
        The directory you want the files to be written to.
        Defaults to the current working directory.

    :param write_prefixes:
        The first file created is a Turtle file containing original graph prefixes.

    :raises ValueError:
        If output_dir is not a directory, or if a single serialized triple is
        larger than max_file_size_kb.

    See ``../test/test_tools/test_chunk_serializer.py`` for examples of this in use.
    """
    if output_dir is None:
        output_dir = Path.cwd()

    if not output_dir.is_dir():
        raise ValueError(
            "If you specify an output_dir, it must actually be a directory!"
        )

    @contextmanager
    def _start_new_file(file_no: int) -> Generator[Tuple[Path, BinaryIO], None, None]:
        if TYPE_CHECKING:
            # this is here because mypy gets a bit confused
            assert output_dir is not None
        fp = Path(output_dir) / f"{file_name_stem}_{str(file_no).zfill(6)}.nt"
        with open(fp, "wb") as fh:
            yield fp, fh

    def _serialize_prefixes(g: Graph) -> str:
        pres = [f"PREFIX {k}: <{v}>" for k, v in g.namespace_manager.namespaces()]
        return "\n".join(sorted(pres)) + "\n"

    if write_prefixes:
        with open(
            Path(output_dir) / f"{file_name_stem}_000000.ttl", "w", encoding="utf-8"
        ) as fh:
            fh.write(_serialize_prefixes(g))

    bytes_written = 0
    with ExitStack() as xstack:
        if max_file_size_kb is not None:
            max_file_size = max_file_size_kb * 1000
            file_no = 1 if write_prefixes else 0
            for i, t in enumerate(g.triples((None, None, None))):
                row_bytes = _nt_row(t).encode("utf-8")
                if len(row_bytes) > max_file_size:
                    # Report the size of the row; dividing the bytes object
                    # itself would raise TypeError and hide this error.
                    raise ValueError(
                        f"cannot write triple {t!r} as its serialized size of {len(row_bytes) / 1000} exceeds max_file_size_kb = {max_file_size_kb}"
                    )
                if i == 0:
                    fp, fhb = xstack.enter_context(_start_new_file(file_no))
                    bytes_written = 0
                elif (bytes_written + len(row_bytes)) >= max_file_size:
                    # Close the finished chunk before opening the next one so
                    # that only one chunk file is open at any time.
                    xstack.close()
                    file_no += 1
                    fp, fhb = xstack.enter_context(_start_new_file(file_no))
                    bytes_written = 0

                bytes_written += fhb.write(row_bytes)

        else:
            # count the triples in the graph
            graph_length = len(g)

            if graph_length <= max_triples:
                # the graph is less than max so just NT serialize the whole thing
                g.serialize(
                    destination=Path(output_dir) / f"{file_name_stem}_all.nt",
                    format="nt",
                )
            else:
                # graph_length is > max_triples, make enough files for all graph
                file_no = 1 if write_prefixes else 0
                for i, t in enumerate(g.triples((None, None, None))):
                    if i % max_triples == 0:
                        # Close the previous chunk (if any) before starting a new one.
                        xstack.close()
                        fp, fhb = xstack.enter_context(_start_new_file(file_no))
                        file_no += 1
                    fhb.write(_nt_row(t).encode("utf-8"))
    return
@@ -0,0 +1,570 @@
"""
A commandline tool for semi-automatically converting CSV to RDF.
See also https://github.com/RDFLib/pyTARQL in the RDFlib family of tools
try: ``csv2rdf --help``
"""
from __future__ import annotations
import codecs
import configparser
import csv
import datetime
import fileinput
import getopt
import re
import sys
import time
import warnings
from typing import Any, Dict, List, Optional, Tuple, Union
from urllib.parse import quote
import rdflib
from rdflib.namespace import RDF, RDFS, split_uri
from rdflib.term import URIRef
__all__ = ["CSV2RDF"]
# Usage text printed by main() for -h/--help.
HELP = """
csv2rdf.py \
-b <instance-base> \
-p <property-base> \
[-D <default>] \
[-c <classname>] \
[-i <identity column(s)>] \
[-l <label columns>] \
[-s <N>] [-o <output>] \
[-f configfile] \
[--col<N> <colspec>] \
[--prop<N> <property>] \
[-d <delim>] \
[-C] [files...]
Reads csv files from stdin or given files
if -d is given, use this delimiter
if -s is given, skips N lines at the start
Creates a URI from the columns given to -i, or automatically by numbering if
none is given
Outputs RDFS labels from the columns given to -l
if -c is given adds a type triple with the given classname
if -C is given, the class is defined as rdfs:Class
Outputs one RDF triple per column in each row.
Output is in n3 format.
Output is stdout, unless -o is specified
Long options also supported: \
--base, \
--propbase, \
--ident, \
--class, \
--label, \
--out, \
--defineclass
Long options --col0, --col1, ...
can be used to specify conversion for columns.
Conversions can be:
ignore, float(), int(), split(sep, [more]), uri(base, [class]), date(format)
Long options --prop0, --prop1, ...
can be used to use specific properties, rather than ones auto-generated
from the headers
-D sets the default conversion for columns not listed
-f says to read config from a .ini/config file - the file must contain one
section called csv2rdf, with keys like the long options, i.e.:
[csv2rdf]
out=output.n3
base=http://example.org/
col0=split(";")
col1=split(";", uri("http://example.org/things/",
"http://xmlns.com/foaf/0.1/Person"))
col2=float()
col3=int()
col4=date("%Y-%b-%d %H:%M:%S")
"""
# bah - ugly global
# Registry filled by prefixuri(): maps each raw value that was turned into a URI
# to a (URIRef, optional class URIRef) pair. CSV2RDF.convert() reads it at the
# end of a run to emit a label (and a type, when a class is set) per generated URI.
# NOTE(review): nothing in the visible code ever clears it between conversions.
uris: Dict[Any, Tuple[URIRef, Optional[URIRef]]] = {}
def toProperty(label: str):  # noqa: N802
    """
    Turn a header label into a lower-camel-case property name.

    Non-word characters become word breaks, and a break is also inserted at
    every lowercase-to-uppercase boundary. The first word is lowercased, each
    later word is capitalized, and the words are joined without separators.

    first name => firstName
    firstNm => firstNm
    FIRST_NM => first_nm (the underscore is a word character, so no break)
    """
    spaced = re.sub(r"[^\w]", " ", label)
    spaced = re.sub("([a-z])([A-Z])", "\\1 \\2", spaced)
    # str.split(" ") always yields at least one item, so the unpacking is safe.
    first_word, *other_words = spaced.split(" ")
    return first_word.lower() + "".join(word.capitalize() for word in other_words)
def toPropertyLabel(label):  # noqa: N802
    """
    Lowercase the first character of *label* unless its second character is
    uppercase (e.g. an acronym such as "URL"), in which case it is returned
    unchanged.
    """
    if label[1:2].isupper():
        return label
    return label[:1].lower() + label[1:]
def index(l_: List[int], i: Tuple[int, ...]) -> Tuple[int, ...]:
    """Pick the items of ``l_`` found at the positions listed in ``i``.

    The result is a tuple in the order of ``i``.

    >>> index([1,2,3],(0,2))
    (1, 3)
    """
    return tuple(l_[position] for position in i)
def csv_reader(csv_data, dialect=csv.excel, **kwargs):
    """Lazily yield the rows that ``csv.reader`` parses from ``csv_data``.

    ``dialect`` and any extra keyword arguments are handed to ``csv.reader``.
    """
    yield from csv.reader(csv_data, dialect=dialect, **kwargs)
def prefixuri(x, prefix, class_: Optional[URIRef] = None):
    """
    Build a URIRef for the cell value ``x`` and record it in the ``uris`` registry.

    With a ``prefix``, spaces in ``x`` are replaced by underscores, the result is
    percent-encoded (no characters kept safe) and appended to the prefix.
    Without a prefix, ``x`` itself is used as the URI.

    :param x: the raw string value
    :param prefix: base URI to prepend, or a false value to use ``x`` as is
    :param class_: optional class stored alongside the URI in ``uris``
    :return: the resulting URIRef
    """
    if prefix:
        # quote() encodes str input as UTF-8 itself; encoding to bytes first
        # made the following str-based replace() raise TypeError.
        r = rdflib.URIRef(prefix + quote(x.replace(" ", "_"), safe=""))
    else:
        r = rdflib.URIRef(x)
    uris[x] = (r, class_)
    return r
# meta-language for config
class NodeMaker:
    """Base column converter: turns a cell value into a plain literal."""

    def range(self):
        # Value reported as the rdfs:range of a column using this converter.
        return rdflib.RDFS.Literal

    def __call__(self, x: Any):
        # Wrap the raw cell value in a Literal without further conversion.
        return rdflib.Literal(x)
class NodeUri(NodeMaker):
    """Column converter that turns cell values into URIs via ``prefixuri``."""

    def __init__(self, prefix, class_):
        # ``class_`` is optional; when given it is stored as a URIRef.
        self.prefix = prefix
        self.class_: Optional[URIRef] = rdflib.URIRef(class_) if class_ else None

    def __call__(self, x):
        return prefixuri(x, self.prefix, self.class_)

    def range(self):
        # Fall back to rdfs:Resource; the RDF namespace defines no "Resource" term.
        return self.class_ or rdflib.RDFS.Resource
class NodeLiteral(NodeMaker):
    """Base for literal converters that carry one optional argument ``f``."""

    def __init__(self, f=None):
        # Subclasses decide what ``f`` means: a pre-processing callable for the
        # numeric/boolean converters, a strptime format for NodeDate.
        self.f = f
class NodeFloat(NodeLiteral):
    """Converter producing float literals, optionally pre-processing with ``f``."""

    def __call__(self, x):
        preprocess = self.f
        if preprocess:
            if not callable(preprocess):
                raise Exception("Function passed to float is not callable")
            x = preprocess(x)
        return rdflib.Literal(float(x))

    def range(self):
        return rdflib.XSD.double
class NodeInt(NodeLiteral):
    """Converter producing integer literals, optionally pre-processing with ``f``."""

    def __call__(self, x):
        preprocess = self.f
        if preprocess:
            if not callable(preprocess):
                raise Exception("Function passed to int is not callable")
            x = preprocess(x)
        return rdflib.Literal(int(x))

    def range(self):
        return rdflib.XSD.int
class NodeBool(NodeLiteral):
    """Converter producing boolean literals, optionally pre-processing with ``f``."""

    def __call__(self, x):
        # NOTE(review): without ``f`` this is bool() of the raw cell string, so
        # any non-empty text (including "false" or "0") becomes True. Pass a
        # function via bool(f) to interpret such spellings.
        if not self.f:
            return rdflib.Literal(bool(x))
        if callable(self.f):
            return rdflib.Literal(bool(self.f(x)))
        raise Exception("Function passed to bool is not callable")

    def range(self):
        # The XML Schema datatype is named "boolean"; there is no xsd:bool.
        return rdflib.XSD.boolean
class NodeReplace(NodeMaker):
    """Converter that replaces every occurrence of ``a`` with ``b`` in a value."""

    def __init__(self, a, b):
        self.a = a
        self.b = b

    def __call__(self, x):
        # Return a Literal rather than a bare str: CSV2RDF.triple() calls .n3()
        # on every object, which a plain str does not provide. Literal is still
        # a str, so nesting such as float(replace(",", ".")) keeps working.
        return rdflib.Literal(x.replace(self.a, self.b))
class NodeDate(NodeLiteral):
    """Converter parsing values with ``strptime``; ``f`` is the format string."""

    def __call__(self, x):
        parsed = datetime.datetime.strptime(x, self.f)
        return rdflib.Literal(parsed)

    def range(self):
        return rdflib.XSD.dateTime
class NodeSplit(NodeMaker):
    """Converter splitting a value on ``sep`` and converting each piece with ``f``."""

    def __init__(self, sep, f):
        self.sep = sep
        self.f = f

    def __call__(self, x):
        # A missing per-piece converter is replaced (and remembered) as Literal.
        if not self.f:
            self.f = rdflib.Literal
        convert = self.f
        if not callable(convert):
            raise Exception("Function passed to split is not callable!")
        # Pieces that are empty after stripping whitespace are dropped.
        stripped = (piece.strip() for piece in x.split(self.sep))
        return [convert(piece) for piece in stripped if piece != ""]

    def range(self):
        inner = self.f
        if not (inner and isinstance(inner, NodeMaker)):
            return NodeMaker.range(self)
        return inner.range()
# Plain-literal node maker; CSV2RDF.convert() uses it to report the range of
# columns that have no entry in COLUMNS.
default_node_make = NodeMaker()


# The _config_* functions below are the callables made available to column
# specifications (see column()); each one builds the converter for one name.
def _config_ignore(*args, **kwargs):
    # Marker value: CSV2RDF.convert() compares against "ignore" to skip a column.
    return "ignore"


def _config_uri(prefix=None, class_=None):
    return NodeUri(prefix, class_)


def _config_literal():
    return NodeLiteral()


def _config_float(f=None):
    return NodeFloat(f)


def _config_replace(a, b):
    return NodeReplace(a, b)


def _config_int(f=None):
    return NodeInt(f)


def _config_bool(f=None):
    return NodeBool(f)


def _config_date(format_):
    return NodeDate(format_)


def _config_split(sep=None, f=None):
    return NodeSplit(sep, f)


# Names usable inside a column specification, mapped to their factories.
config_functions = {
    "ignore": _config_ignore,
    "uri": _config_uri,
    "literal": _config_literal,
    "float": _config_float,
    "int": _config_int,
    "date": _config_date,
    "split": _config_split,
    "replace": _config_replace,
    "bool": _config_bool,
}
def column(v):
    """Return a function for column mapping

    ``v`` is a column specification such as ``split(";")``; it is evaluated as a
    Python expression with ``config_functions`` as its global namespace.
    """
    # SECURITY NOTE(review): eval() runs arbitrary Python. ``v`` comes from the
    # command line or a config file (see main()), so only use trusted input.
    return eval(v, config_functions)
class CSV2RDF:
    """
    Converts rows from a CSV reader into triples written to ``OUT``.

    Configure the instance through its upper-case attributes, then call
    ``convert()`` with an iterator of rows (lists of strings).
    """

    def __init__(self):
        self.CLASS = None  # class URI given to every row, if any
        self.BASE = None  # namespace for row (instance) URIs
        self.PROPBASE = None  # namespace for properties derived from headers
        # "auto" numbers the rows; otherwise the column index(es) forming the URI
        self.IDENT: Union[Tuple[str, ...], str] = "auto"
        self.LABEL = None  # column index(es) joined into an rdfs:label
        self.DEFINECLASS = False  # also emit class/property definitions
        self.SKIP = 0  # number of leading lines to skip before the header
        self.DELIM = ","
        self.DEFAULT = None  # conversion for columns without an entry in COLUMNS
        self.COLUMNS = {}  # column index -> converter (or "ignore")
        self.PROPS = {}  # column index -> explicit property URI
        self.OUT = sys.stdout
        self.triples = 0  # number of triples written so far

    def triple(self, s, p, o):
        """Write one triple to ``OUT`` in N3 syntax and count it."""
        self.OUT.write("%s %s %s .\n" % (s.n3(), p.n3(), o.n3()))
        self.triples += 1

    def convert(self, csvreader):
        """
        Read the header and all rows from ``csvreader`` and write the triples.

        Progress and a final summary go to stderr. Values that cannot be
        converted are reported with a warning and skipped; any other error is
        re-raised after the offending row number was written to stderr.
        """
        start = time.time()

        if self.OUT:
            # Not every writable object has a ``name`` attribute.
            sys.stderr.write("Output to %s\n" % getattr(self.OUT, "name", self.OUT))

        if self.IDENT != "auto" and not isinstance(self.IDENT, tuple):
            self.IDENT = (self.IDENT,)
        # A single label column may be given as a bare int; treat it like IDENT.
        if isinstance(self.LABEL, int):
            self.LABEL = (self.LABEL,)

        if not self.BASE:
            warnings.warn("No base given, using http://example.org/instances/")
            self.BASE = rdflib.Namespace("http://example.org/instances/")

        if not self.PROPBASE:
            # The message names the namespace that is actually used below.
            warnings.warn("No property base given, using http://example.org/props/")
            self.PROPBASE = rdflib.Namespace("http://example.org/props/")

        # skip lines at the start
        for _ in range(self.SKIP):
            next(csvreader)

        # read header line
        header_labels = list(next(csvreader))
        headers = dict(enumerate([self.PROPBASE[toProperty(x)] for x in header_labels]))
        # override header properties if some are given
        for k, v in self.PROPS.items():
            headers[k] = v
            header_labels[k] = split_uri(v)[1]

        if self.DEFINECLASS:
            # output class/property definitions
            self.triple(self.CLASS, RDF.type, RDFS.Class)
            for i in range(len(headers)):
                h, header_label = headers[i], header_labels[i]
                if h == "" or header_label == "":
                    continue
                # explicit per-column conversion first, then the -D default
                maker = self.COLUMNS.get(i, self.DEFAULT)
                if maker == "ignore":
                    continue
                if maker is None:
                    maker = default_node_make
                self.triple(h, RDF.type, RDF.Property)
                self.triple(
                    h, RDFS.label, rdflib.Literal(toPropertyLabel(header_label))
                )
                self.triple(h, RDFS.domain, self.CLASS)
                self.triple(h, RDFS.range, maker.range())

        rows = 0
        for row in csvreader:
            try:
                if self.IDENT == "auto":
                    uri = self.BASE["%d" % rows]
                else:
                    # quote() encodes str as UTF-8 itself; encoding to bytes
                    # first made the str-based replace() raise TypeError.
                    uri = self.BASE[
                        "_".join(
                            quote(part.replace(" ", "_"), safe="")
                            for part in index(row, self.IDENT)
                        )
                    ]

                if self.LABEL:
                    self.triple(
                        uri,
                        RDFS.label,
                        rdflib.Literal(" ".join(index(row, self.LABEL))),
                    )

                if self.CLASS:
                    # type triple
                    self.triple(uri, RDF.type, self.CLASS)

                for i, cell in enumerate(row):
                    cell = cell.strip()
                    if cell == "":
                        continue
                    # explicit per-column conversion first, then the -D default
                    conv = self.COLUMNS.get(i, self.DEFAULT)
                    if conv == "ignore":
                        continue
                    if conv is None:
                        conv = rdflib.Literal
                    try:
                        o = conv(cell)
                        if isinstance(o, list):
                            for _o in o:
                                self.triple(uri, headers[i], _o)
                        else:
                            self.triple(uri, headers[i], o)
                    except Exception as e:
                        # Exceptions have no ``.message`` on Python 3; format
                        # the exception itself. headers.get() keeps the warning
                        # working for cells beyond the header width.
                        warnings.warn(
                            "Could not process value for column "
                            "%d:%s in row %d, ignoring: %s "
                            % (i, headers.get(i), rows, e)
                        )

                rows += 1
                if rows % 100000 == 0:
                    sys.stderr.write(
                        "%d rows, %d triples, elapsed %.2fs.\n"
                        % (rows, self.triples, time.time() - start)
                    )
            except Exception:
                sys.stderr.write("Error processing line: %d\n" % rows)
                raise

        # output types/labels for generated URIs
        classes = set()
        for raw_value, (u, c) in uris.items():
            self.triple(u, RDFS.label, rdflib.Literal(raw_value))
            if c:
                c = rdflib.URIRef(c)
                classes.add(c)
                self.triple(u, RDF.type, c)

        for c in classes:
            self.triple(c, RDF.type, RDFS.Class)

        # Never close the process-wide stdout; just flush it.
        if self.OUT is sys.stdout or self.OUT is sys.__stdout__:
            self.OUT.flush()
        else:
            self.OUT.close()
        sys.stderr.write("Converted %d rows into %d triples.\n" % (rows, self.triples))
        sys.stderr.write("Took %.2f seconds.\n" % (time.time() - start))
def main():
    """
    Command line entry point: configure a CSV2RDF from an optional config file
    (-f) and the command line options, then convert the given files (or stdin).

    Command line options override values read from the config file.
    """
    csv2rdf = CSV2RDF()

    argv = sys.argv[1:]
    # getopt rejects long options it was not told about, so register the
    # numbered --col<N> / --prop<N> options that actually occur in argv.
    numbered_matches = (re.match(r"--((?:col|prop)\d+)(?:=|$)", arg) for arg in argv)
    numbered_options = sorted({m.group(1) + "=" for m in numbered_matches if m})

    opts: Union[Dict[str, str], List[Tuple[str, str]]]
    opts, files = getopt.getopt(
        argv,
        "hc:b:p:i:o:Cf:l:s:d:D:",
        [
            "out=",
            "base=",
            "delim=",
            "propbase=",
            "class=",
            "default=",
            "ident=",
            "label=",
            "skip=",
            "defineclass",
            "help",
        ]
        + numbered_options,
    )
    opts = dict(opts)

    if "-h" in opts or "--help" in opts:
        print(HELP)
        sys.exit(-1)

    # SECURITY NOTE(review): the ident/label values below (and every column
    # spec, via column()) are passed to eval(); only use trusted config files
    # and command lines.
    if "-f" in opts:
        config = configparser.ConfigParser()
        with open(opts["-f"]) as config_file:
            config.read_file(config_file)
        for k, v in config.items("csv2rdf"):
            if k == "out":
                csv2rdf.OUT = codecs.open(v, "w", "utf-8")
            elif k == "base":
                csv2rdf.BASE = rdflib.Namespace(v)
            elif k == "propbase":
                csv2rdf.PROPBASE = rdflib.Namespace(v)
            elif k == "class":
                csv2rdf.CLASS = rdflib.URIRef(v)
            elif k == "defineclass":
                # bool("false") would be True; treat the usual "off" spellings
                # as False and anything else as True.
                csv2rdf.DEFINECLASS = v.strip().lower() not in (
                    "",
                    "0",
                    "false",
                    "no",
                    "off",
                )
            elif k == "ident":
                csv2rdf.IDENT = eval(v)
            elif k == "label":
                csv2rdf.LABEL = eval(v)
            elif k == "delim":
                csv2rdf.DELIM = v
            elif k == "skip":
                csv2rdf.SKIP = int(v)
            elif k == "default":
                csv2rdf.DEFAULT = column(v)
            elif k.startswith("col"):
                csv2rdf.COLUMNS[int(k[3:])] = column(v)
            elif k.startswith("prop"):
                csv2rdf.PROPS[int(k[4:])] = rdflib.URIRef(v)

    if "-o" in opts:
        csv2rdf.OUT = codecs.open(opts["-o"], "w", "utf-8")
    if "--out" in opts:
        csv2rdf.OUT = codecs.open(opts["--out"], "w", "utf-8")

    if "-b" in opts:
        csv2rdf.BASE = rdflib.Namespace(opts["-b"])
    if "--base" in opts:
        csv2rdf.BASE = rdflib.Namespace(opts["--base"])

    if "-d" in opts:
        csv2rdf.DELIM = opts["-d"]
    if "--delim" in opts:
        csv2rdf.DELIM = opts["--delim"]

    if "-D" in opts:
        csv2rdf.DEFAULT = column(opts["-D"])
    if "--default" in opts:
        csv2rdf.DEFAULT = column(opts["--default"])

    if "-p" in opts:
        csv2rdf.PROPBASE = rdflib.Namespace(opts["-p"])
    if "--propbase" in opts:
        csv2rdf.PROPBASE = rdflib.Namespace(opts["--propbase"])

    if "-l" in opts:
        csv2rdf.LABEL = eval(opts["-l"])
    if "--label" in opts:
        csv2rdf.LABEL = eval(opts["--label"])

    if "-i" in opts:
        csv2rdf.IDENT = eval(opts["-i"])
    if "--ident" in opts:
        csv2rdf.IDENT = eval(opts["--ident"])

    if "-s" in opts:
        csv2rdf.SKIP = int(opts["-s"])
    if "--skip" in opts:
        csv2rdf.SKIP = int(opts["--skip"])

    if "-c" in opts:
        csv2rdf.CLASS = rdflib.URIRef(opts["-c"])
    if "--class" in opts:
        csv2rdf.CLASS = rdflib.URIRef(opts["--class"])

    for k, v in opts.items():
        # Require a numeric suffix so that --propbase is not mistaken for a
        # --prop<N> option.
        if k.startswith("--col") and k[5:].isdigit():
            csv2rdf.COLUMNS[int(k[5:])] = column(v)
        elif k.startswith("--prop") and k[6:].isdigit():
            csv2rdf.PROPS[int(k[6:])] = rdflib.URIRef(v)

    if csv2rdf.CLASS and ("-C" in opts or "--defineclass" in opts):
        csv2rdf.DEFINECLASS = True

    csv2rdf.convert(csv_reader(fileinput.input(files), delimiter=csv2rdf.DELIM))


if __name__ == "__main__":
    main()
@@ -0,0 +1,222 @@
"""
This rdflib Python script creates a DefinedNamespace Python file from a given RDF file
It is a very simple script: it finds all things defined in the RDF file within a given
namespace:
<thing> a ?x
where ?x is anything and <thing> starts with the given namespace
Nicholas J. Car, Dec, 2021
"""
from __future__ import annotations
import argparse
import datetime
import keyword
from pathlib import Path
from typing import TYPE_CHECKING, Iterable, List, Tuple
from rdflib.graph import Graph
from rdflib.namespace import DCTERMS, OWL, RDFS, SKOS
from rdflib.util import guess_format
if TYPE_CHECKING:
from rdflib.query import ResultRow
def validate_namespace(namespace: str) -> None:
    """Raise ValueError unless ``namespace`` ends with ``/`` or ``#``."""
    if namespace.endswith("/") or namespace.endswith("#"):
        return
    raise ValueError("The supplied namespace must end with '/' or '#'")
def validate_object_id(object_id: str) -> None:
    """Raise ValueError unless every character of ``object_id`` is uppercase.

    Digits and punctuation are not uppercase and are therefore rejected; an
    empty string has no characters to reject and passes.
    """
    if not all(char.isupper() for char in object_id):
        raise ValueError("The supplied object_id must be an all-capitals string")
# This function is not used: it was originally written to get classes and to be used
# alongside a method to get properties, but then it was decided that a single function
# to get everything in the namespace, get_target_namespace_elements(), was both simpler
# and better covered all namespace elements, so that function is used instead.
#
# def get_classes(g, target_namespace):
# namespaces = {"dcterms": DCTERMS, "owl": OWL, "rdfs": RDFS, "skos": SKOS}
# q = """
# SELECT DISTINCT ?x ?def
# WHERE {
# # anything that is an instance of owl:Class or rdfs:Class
# # or any subclass of them
# VALUES ?c { owl:Class rdfs:Class }
# ?x rdfs:subClassOf*/a ?c .
#
# # get any definitions, if they have one
# OPTIONAL {
# ?x rdfs:comment|dcterms:description|skos:definition ?def
# }
#
# # only get results for the targetted namespace (supplied by user)
# FILTER STRSTARTS(STR(?x), "xxx")
# }
# """.replace("xxx", target_namespace)
# classes = []
# for r in g.query(q, initNs=namespaces):
# classes.append((str(r[0]), str(r[1])))
#
# classes.sort(key=lambda tup: tup[1])
#
# return classes
def get_target_namespace_elements(
    g: Graph, target_namespace: str
) -> Tuple[List[Tuple[str, str]], List[str], List[str]]:
    """
    Collect every typed subject of ``g`` whose IRI starts with ``target_namespace``.

    :param g: the graph to query
    :param target_namespace: the namespace IRI; the namespace itself is excluded
    :return: a 3-tuple of
        the ``(iri, definitions)`` pairs sorted by IRI,
        the class-body lines for names that are valid Python identifiers, and
        the quoted list entries for names that are not (or are keywords).
    """
    namespaces = {"dcterms": DCTERMS, "owl": OWL, "rdfs": RDFS, "skos": SKOS}
    q = """
SELECT ?s (GROUP_CONCAT(DISTINCT STR(?def)) AS ?defs)
WHERE {
# all things in the RDF data (anything RDF.type...)
?s a ?o .
# get any definitions, if they have one
OPTIONAL {
?s dcterms:description|rdfs:comment|skos:definition ?def
}
# only get results for the target namespace (supplied by user)
FILTER STRSTARTS(STR(?s), "xxx")
FILTER (STR(?s) != "xxx")
}
GROUP BY ?s
""".replace(
        "xxx", target_namespace
    )
    elements: List[Tuple[str, str]] = []
    for r in g.query(q, initNs=namespaces):
        if TYPE_CHECKING:
            assert isinstance(r, ResultRow)
        elements.append((str(r[0]), str(r[1])))

    elements.sort(key=lambda tup: tup[0])

    elements_strs: List[str] = []
    non_python_elements_strs: List[str] = []
    for iri, definition in elements:
        # The query only returns IRIs that start with the namespace, so cut off
        # exactly that prefix; str.replace() would also remove later repeats.
        name = iri[len(target_namespace) :]
        # Keep the description on one line: it ends up in a "#" comment, and a
        # bare carriage return would end that comment just like a newline.
        desc = definition.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
        if name.isidentifier() and not keyword.iskeyword(name):
            elements_strs.append(f" {name}: URIRef # {desc}\n")
        else:
            non_python_elements_strs.append(f""" "{name}", # {desc}\n""")

    return elements, elements_strs, non_python_elements_strs
def make_dn_file(
    output_file_name: Path,
    target_namespace: str,
    elements_strs: Iterable[str],
    non_python_elements_strs: List[str],
    object_id: str,
    fail: bool,
) -> None:
    """
    Write the DefinedNamespace Python source file.

    :param output_file_name: path of the file to create (overwritten if present)
    :param target_namespace: namespace IRI written as ``_NS``
    :param elements_strs: ready-made lines for the Python-identifier elements
    :param non_python_elements_strs: ready-made entries for the ``_extras`` list;
        the list is only written when there is at least one entry
    :param object_id: name of the generated class
    :param fail: when true, ``_fail = True`` is written into the class
    """
    # utcnow() is deprecated; an aware UTC timestamp carries the same instant.
    header = f'''from rdflib.namespace import DefinedNamespace, Namespace
from rdflib.term import URIRef
class {object_id}(DefinedNamespace):
"""
DESCRIPTION_EDIT_ME_!
Generated from: SOURCE_RDF_FILE_EDIT_ME_!
Date: {datetime.datetime.now(datetime.timezone.utc)}
"""
'''
    # Descriptions come from arbitrary ontologies, so do not depend on the
    # platform's default encoding.
    with open(output_file_name, "w", encoding="utf-8") as f:
        f.write(header)
        f.write("\n")
        f.write(f' _NS = Namespace("{target_namespace}")')
        f.write("\n\n")
        if fail:
            f.write(" _fail = True")
            f.write("\n\n")
        f.writelines(elements_strs)
        if len(non_python_elements_strs) > 0:
            f.write("\n")
            f.write(" # Valid non-python identifiers")
            f.write("\n")
            f.write(" _extras = [")
            f.write("\n")
            f.writelines(non_python_elements_strs)
            f.write(" ]")
            f.write("\n")
# Script entry point: parse the arguments, load the ontology and write
# "_<object_id>.py" into the current working directory.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "ontology_file",
        type=str,
        help="Path to the RDF ontology to extract a DefinedNamespace from.",
    )
    parser.add_argument(
        "target_namespace",
        type=str,
        help="The namespace within the ontology that you want to create a "
        "DefinedNamespace for.",
    )
    parser.add_argument(
        "object_id",
        type=str,
        help="The RDFlib object ID of the DefinedNamespace, e.g. GEO for GeoSPARQL.",
    )
    parser.add_argument(
        "-f",
        "--fail",
        dest="fail",
        action="store_true",
        help="Whether (true) or not (false) to mimic ClosedNamespace and fail on "
        "non-element use",
    )
    parser.add_argument("--no-fail", dest="fail", action="store_false")
    # The default belongs to the "fail" destination shared by --fail/--no-fail.
    parser.set_defaults(fail=False)
    args = parser.parse_args()

    # Check the cheap arguments before spending time on parsing the ontology.
    validate_namespace(args.target_namespace)
    validate_object_id(args.object_id)

    fmt = guess_format(args.ontology_file)
    if fmt is None:
        print("The format of the file you've supplied is unknown.")
        # exit() is only provided by the site module; SystemExit always works.
        raise SystemExit(1)
    g = Graph().parse(args.ontology_file, format=fmt)

    print(
        f"Creating DefinedNamespace file {args.object_id} "
        f"for {args.target_namespace}..."
    )
    print(f"Ontology with {len(g)} triples loaded...")

    print("Getting all namespace elements...")
    elements = get_target_namespace_elements(g, args.target_namespace)

    output_file_name = Path.cwd() / f"_{args.object_id}.py"
    print(f"Creating DefinedNamespace Python file {output_file_name}")
    make_dn_file(
        output_file_name,
        args.target_namespace,
        elements[1],
        elements[2],
        args.object_id,
        args.fail,
    )
@@ -0,0 +1,113 @@
"""
A commandline tool for testing if RDF graphs are isomorphic, i.e. equal
if BNode labels are ignored.
"""
from itertools import combinations
from rdflib import BNode, Graph
class IsomorphicTestableGraph(Graph):
    """
    Graph whose ``==`` tests isomorphism, i.e. equality ignoring BNode labels.

    Ported from:
    http://www.w3.org/2001/sw/DataAccess/proto-tests/tools/rdfdiff.py
    (Sean B Palmer's RDF Graph Isomorphism Tester)
    """

    def __init__(self, **kargs):
        super().__init__(**kargs)
        self.hash = None

    def internal_hash(self):
        """
        This is defined instead of __hash__ to avoid a circular recursion
        scenario with the Memory store for rdflib which requires a hash
        lookup in order to return a generator of triples
        """
        return hash(tuple(sorted(self.hashtriples())))

    def hashtriples(self):
        """Yield one hash per triple, with BNodes replaced by their vhash."""
        for triple in self:
            g = ((isinstance(t, BNode) and self.vhash(t)) or t for t in triple)
            yield hash(tuple(g))

    def vhash(self, term, done=False):
        """Label-independent signature of ``term``: its sorted vhashtriples."""
        return tuple(sorted(self.vhashtriples(term, done)))

    def vhashtriples(self, term, done):
        """Yield the signature of every triple that contains ``term``."""
        for t in self:
            if term in t:
                yield tuple(self.vhashtriple(t, term, done))

    def vhashtriple(self, triple, term, done):
        """
        Yield the signature of one triple: non-BNodes as they are; a BNode as
        its position when it is ``term`` (or when ``done``), otherwise as its
        own vhash computed with ``done=True`` to stop after one level.
        """
        for p in range(3):
            if not isinstance(triple[p], BNode):
                yield triple[p]
            elif done or (triple[p] == term):
                yield p
            else:
                yield self.vhash(triple[p], done=True)

    def __eq__(self, G):  # noqa: N803
        """Graph isomorphism testing."""
        if not isinstance(G, IsomorphicTestableGraph):
            return False
        elif len(self) != len(G):
            return False
        elif list(self) == list(G):
            return True  # @@
        return self.internal_hash() == G.internal_hash()

    def __ne__(self, G):  # noqa: N803
        """Negative graph isomorphism testing."""
        return not self.__eq__(G)

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None and makes instances
        # unhashable; delegate to the parent class so graphs stay usable as
        # dict keys and set members. internal_hash() remains the content hash.
        return super().__hash__()
def main():
    """
    Command line entry point: parse every given file (and STDIN with -s) and
    check all pairs of graphs for isomorphism.

    :raises AssertionError: naming the first pair of inputs that differ
    """
    import sys
    from optparse import OptionParser

    usage = """usage: %prog [options] file1 file2 ... fileN"""
    op = OptionParser(usage=usage)
    op.add_option(
        "-s",
        "--stdin",
        action="store_true",
        default=False,
        help="Load from STDIN as well",
    )
    op.add_option(
        "--format",
        default="xml",
        dest="inputFormat",
        metavar="RDF_FORMAT",
        choices=["xml", "trix", "n3", "nt", "rdfa"],
        help="The format of the RDF document(s) to compare. "
        + "One of 'xml','n3','trix', 'nt', "
        + "or 'rdfa'. The default is %default",
    )

    (options, args) = op.parse_args()

    # Keep each graph next to its display name; no hashing of graphs needed.
    named_graphs = []
    if options.stdin:
        graph = IsomorphicTestableGraph().parse(sys.stdin, format=options.inputFormat)
        named_graphs.append(("(STDIN)", graph))
    for fn in args:
        graph = IsomorphicTestableGraph().parse(fn, format=options.inputFormat)
        named_graphs.append((fn, graph))

    # combinations() already yields every unordered pair exactly once.
    for (name1, graph1), (name2, graph2) in combinations(named_graphs, 2):
        if not graph1 == graph2:
            # Raised explicitly so the check also runs under "python -O".
            raise AssertionError("%s != %s" % (name1, name2))


if __name__ == "__main__":
    main()
@@ -0,0 +1,186 @@
"""
A commandline tool for drawing RDF graphs in Graphviz DOT format
You can draw the graph of an RDF file directly:
.. code-block:: bash
rdf2dot my_rdf_file.rdf | dot -Tpng | display
"""
from __future__ import annotations
import collections
import html
import sys
from typing import Any, Dict, TextIO
import rdflib
import rdflib.extras.cmdlineutils
from rdflib import XSD
from rdflib.graph import Graph
from rdflib.term import Literal, Node, URIRef
# Properties tried, in this order, by rdf2dot() when looking for a display
# label of a node; the first one that has a value wins.
LABEL_PROPERTIES = [
    rdflib.RDFS.label,
    rdflib.URIRef("http://purl.org/dc/elements/1.1/title"),
    rdflib.URIRef("http://xmlns.com/foaf/0.1/name"),
    rdflib.URIRef("http://www.w3.org/2006/vcard/ns#fn"),
    rdflib.URIRef("http://www.w3.org/2006/vcard/ns#org"),
]
# The listed datatype names resolved in the XSD namespace.
# NOTE(review): not referenced by the code visible in this module.
XSDTERMS = [
    XSD[x]
    for x in (
        "anyURI",
        "base64Binary",
        "boolean",
        "byte",
        "date",
        "dateTime",
        "decimal",
        "double",
        "duration",
        "float",
        "gDay",
        "gMonth",
        "gMonthDay",
        "gYear",
        "gYearMonth",
        "hexBinary",
        "ID",
        "IDREF",
        "IDREFS",
        "int",
        "integer",
        "language",
        "long",
        "Name",
        "NCName",
        "negativeInteger",
        "NMTOKEN",
        "NMTOKENS",
        "nonNegativeInteger",
        "nonPositiveInteger",
        "normalizedString",
        "positiveInteger",
        "QName",
        "short",
        "string",
        "time",
        "token",
        "unsignedByte",
        "unsignedInt",
        "unsignedLong",
        "unsignedShort",
    )
]
# Colour names for the DOT output. rdf2dot() uses NODECOLOR for node tables;
# NOTE(review): EDGECOLOR and ISACOLOR are not referenced by the visible code.
EDGECOLOR = "blue"
NODECOLOR = "black"
ISACOLOR = "black"
def rdf2dot(g: Graph, stream: TextIO, opts: Dict[str, Any] = {}):
    """
    Convert the RDF graph to DOT
    writes the dot output to the stream

    Every subject, and every URI or blank node object, becomes a table-shaped
    node; literal-valued properties are listed as rows inside the subject's
    table and all other triples become labelled edges. ``rdfs:label`` triples
    are not drawn.

    :param g: the graph to draw
    :param stream: text stream that receives the DOT source
    :param opts: accepted for interface compatibility; not used (and never
        modified) by this function
    """
    fields = collections.defaultdict(set)
    nodes: Dict[Node, str] = {}

    def node(x: Node) -> str:
        # Assign DOT node ids (node0, node1, ...) in order of first appearance.
        if x not in nodes:
            nodes[x] = "node%d" % len(nodes)
        return nodes[x]

    def label(x: Node, g: Graph):
        # First available label property, else the local name, else the node.
        for labelProp in LABEL_PROPERTIES:  # noqa: N806
            l_ = g.value(x, labelProp)
            if l_:
                return l_
        try:
            # type error: Argument 1 to "compute_qname" of "NamespaceManager" has incompatible type "Node"; expected "str"
            return g.namespace_manager.compute_qname(x)[2]  # type: ignore[arg-type]
        except Exception:
            return x

    def formatliteral(l: Literal, g):  # noqa: E741
        v = html.escape(l)
        if l.datatype:
            return "&quot;%s&quot;^^%s" % (v, qname(l.datatype, g))
        elif l.language:
            return "&quot;%s&quot;@%s" % (v, l.language)
        return "&quot;%s&quot;" % v

    def qname(x: URIRef, g: Graph) -> str:
        # The result is placed inside HTML-like labels, so escape it; this only
        # matters for the full-URI fallback (e.g. URIs containing "&").
        try:
            q = g.compute_qname(x)
            return html.escape(q[0] + ":" + q[2])
        except Exception:
            return html.escape(x)

    def color(p):
        return "BLACK"

    stream.write('digraph { \n node [ fontname="DejaVu Sans" ] ; \n')

    for s, p, o in g:
        sn = node(s)
        if p == rdflib.RDFS.label:
            continue
        if isinstance(o, (rdflib.URIRef, rdflib.BNode)):
            on = node(o)
            opstr = (
                "\t%s -> %s [ color=%s, label=< <font point-size='10' "
                + "color='#336633'>%s</font> > ] ;\n"
            )
            # type error: Argument 1 to "qname" has incompatible type "Node"; expected "URIRef"
            stream.write(opstr % (sn, on, color(p), qname(p, g)))  # type: ignore[arg-type]
        else:
            # type error: Argument 1 to "qname" has incompatible type "Node"; expected "URIRef"
            fields[sn].add((qname(p, g), formatliteral(o, g)))  # type: ignore[arg-type]

    for u, n in nodes.items():
        stream.write("# %s %s\n" % (u, n))
        f = [
            "<tr><td align='left'>%s</td><td align='left'>%s</td></tr>" % x
            for x in sorted(fields[n])
        ]
        opstr = (
            "%s [ shape=none, color=%s label=< <table color='#666666'"
            + " cellborder='0' cellspacing='0' border='1'><tr>"
            + "<td colspan='2' bgcolor='grey'><B>%s</B></td></tr><tr>"
            + "<td href='%s' bgcolor='#eeeeee' colspan='2'>"
            + "<font point-size='10' color='#6666ff'>%s</font></td>"
            + "</tr>%s</table> > ] \n"
        )
        # The URI is escaped for the href attribute as well as for the text,
        # so quotes or ampersands in it cannot break the label markup.
        escaped_uri = html.escape(u)  # type: ignore[type-var]
        stream.write(
            opstr
            # type error: Value of type variable "AnyStr" of "escape" cannot be "Node"
            % (n, NODECOLOR, html.escape(label(u, g)), escaped_uri, escaped_uri, "".join(f))  # type: ignore[type-var]
        )

    stream.write("}\n")
def _help():
    """Write the usage text of the rdf2dot command line tool to stderr."""
    sys.stderr.write(
        """
rdf2dot.py [-f <format>] files...
Read RDF files given on the command line, writes a graph of the RDF data in DOT
language to stdout
-f specifies parser to use
"""
    )
def main():
    """Command line entry point: hand rdf2dot and its help printer to the
    shared command line driver in ``rdflib.extras.cmdlineutils``."""
    rdflib.extras.cmdlineutils.main(rdf2dot, _help)


if __name__ == "__main__":
    main()
@@ -0,0 +1,205 @@
#!/usr/bin/env python
"""
A commandline tool for parsing RDF in different formats and serializing the
resulting graph to a chosen format.
"""
from __future__ import annotations
import logging
import sys
from optparse import OptionParser
from typing import BinaryIO, Optional
import rdflib
from rdflib import plugin
from rdflib.graph import ConjunctiveGraph
from rdflib.parser import Parser
from rdflib.serializer import Serializer
from rdflib.store import Store
from rdflib.util import guess_format
# Parser format used by parse_and_serialize() when guessing from the file name
# yields nothing.
DEFAULT_INPUT_FORMAT = "xml"
# Default of the -o/--output-format command line option.
DEFAULT_OUTPUT_FORMAT = "n3"
def parse_and_serialize(
    input_files,
    input_format,
    guess,
    outfile,
    output_format,
    ns_bindings,
    store_conn="",
    store_type=None,
):
    """
    Parse all input files into one graph and, if requested, serialize it.

    :param input_files: paths/URLs to parse; ``"-"`` means stdin
    :param input_format: ``FORMAT[:KW,...]`` spec for the parser, or a false
        value to guess per file (when ``guess`` is true)
    :param guess: whether to guess the format from the file name when no
        input format was given
    :param outfile: destination for the serialization; nothing is written
        when this is false
    :param output_format: ``FORMAT[:KW,...]`` spec for the serializer
    :param ns_bindings: mapping of prefix to namespace URI bound on the graph
        without overriding existing bindings
    :param store_conn: configuration passed to ``store.open()``
    :param store_type: name of a Store plugin; the default graph store is used
        when not given
    """
    if store_type:
        store = plugin.get(store_type, Store)()
        store.open(store_conn)
        graph = ConjunctiveGraph(store)
    else:
        store = None
        graph = ConjunctiveGraph()

    for prefix, uri in ns_bindings.items():
        graph.namespace_manager.bind(prefix, uri, override=False)

    # The input format spec is the same for every file, so parse it once.
    given_format, parser_kws = _format_and_kws(input_format)
    for fpath in input_files:
        use_format = given_format
        source = fpath
        if fpath == "-":
            source = sys.stdin
        elif not input_format and guess:
            use_format = guess_format(fpath) or DEFAULT_INPUT_FORMAT
        graph.parse(source, format=use_format, **parser_kws)

    if outfile:
        output_format, kws = _format_and_kws(output_format)
        kws.setdefault("base", None)
        graph.serialize(destination=outfile, format=output_format, **kws)

    # Compare against None: the truth value of a store object is not a
    # reliable "was a store created" test.
    if store is not None:
        store.rollback()
def _format_and_kws(fmt):
    """
    Split a ``FORMAT[:KW,...]`` spec into the format name and a keyword dict.

    Keywords are comma separated: ``KEY=VALUE`` stores the string value (only
    the first ``=`` separates key and value), ``-KEY`` stores False, and
    ``+KEY`` or a bare ``KEY`` stores True. Empty keywords are ignored.

    >>> _format_and_kws("fmt")
    ('fmt', {})
    >>> _format_and_kws("fmt:+a")
    ('fmt', {'a': True})
    >>> _format_and_kws("fmt:a")
    ('fmt', {'a': True})
    >>> _format_and_kws("fmt:+a,-b") #doctest: +SKIP
    ('fmt', {'a': True, 'b': False})
    >>> _format_and_kws("fmt:c=d")
    ('fmt', {'c': 'd'})
    >>> _format_and_kws("fmt:a=b:c")
    ('fmt', {'a': 'b:c'})
    >>> _format_and_kws("fmt:a=b=c")
    ('fmt', {'a': 'b=c'})
    """
    kws = {}
    if fmt and ":" in fmt:
        fmt, kwrepr = fmt.split(":", 1)
        for kw in kwrepr.split(","):
            if not kw:
                # e.g. "fmt:" or a trailing comma; "" is not a usable keyword
                continue
            if "=" in kw:
                # split once so that values may themselves contain "="
                k, v = kw.split("=", 1)
                kws[k] = v
            elif kw.startswith("-"):
                kws[kw[1:]] = False
            elif kw.startswith("+"):
                kws[kw[1:]] = True
            else:  # same as "+"
                kws[kw] = True
    return fmt, kws
def make_option_parser():
    """
    Build the ``optparse.OptionParser`` describing the command line interface.

    The help texts list the names of the currently registered parser and
    serializer plugins, and the description starts with this module's
    docstring.

    :return: the configured ``OptionParser``.
    """
    parser_names = _get_plugin_names(Parser)
    serializer_names = _get_plugin_names(Serializer)
    kw_example = "FORMAT:(+)KW1,-KW2,KW3=VALUE"

    usage = (
        "%prog [-h] [-i INPUT_FORMAT] [-o OUTPUT_FORMAT] "
        "[--ns=PFX=NS ...] [-] [FILE ...]"
    )
    description = (
        __doc__.strip()
        + " Reads file system paths, URLs or from stdin if '-' is given."
        + " The result is serialized to stdout."
    )
    oparser = OptionParser(
        usage,
        description=description,
        version=f"%prog (using rdflib {rdflib.__version__})",
    )

    input_help = (
        "Format of the input document(s)."
        f" Available input formats are: {parser_names}."
        " If no format is given, it will be guessed from the file name extension."
        f" Keywords to parser can be given after format like: {kw_example}."
    )
    # '%default' is left for optparse to expand when it renders the help.
    output_help = (
        "Format of the graph serialization."
        f" Available output formats are: {serializer_names}."
        " Default format is: '%default'."
        f" Keywords to serializer can be given after format like: {kw_example}."
    )

    # (flags, keyword arguments) in the order they appear in the help output.
    option_specs = (
        (
            ("-i", "--input-format"),
            # No default here: an unset value lets the format be guessed.
            {"type": str, "help": input_help, "metavar": "INPUT_FORMAT"},
        ),
        (
            ("-o", "--output-format"),
            {
                "type": str,
                "default": DEFAULT_OUTPUT_FORMAT,
                "help": output_help,
                "metavar": "OUTPUT_FORMAT",
            },
        ),
        (
            ("--ns",),
            {
                "action": "append",
                "type": str,
                "help": "Register a namespace binding (QName prefix to a base URI). "
                "This can be used more than once.",
                "metavar": "PREFIX=NAMESPACE",
            },
        ),
        (
            ("--no-guess",),
            {
                "dest": "guess",
                "action": "store_false",
                "default": True,
                "help": "Don't guess format based on file suffix.",
            },
        ),
        (
            ("--no-out",),
            {
                "action": "store_true",
                "default": False,
                "help": "Don't output the resulting graph "
                "(useful for checking validity of input).",
            },
        ),
        (
            ("-w", "--warn"),
            {
                "action": "store_true",
                "default": False,
                "help": "Output warnings to stderr (by default only critical errors).",
            },
        ),
    )
    for flags, settings in option_specs:
        oparser.add_option(*flags, **settings)
    return oparser
def _get_plugin_names(kind):
    """Return the names of all registered plugins of ``kind``, joined by ", "."""
    names = [registered.name for registered in plugin.plugins(kind=kind)]
    return ", ".join(names)
def main():
    """
    Command line entry point: parse the options and run the conversion.

    Prints the usage line and exits when no input is given. Each ``--ns``
    value must have the form ``PREFIX=NAMESPACE``; only the first ``=``
    separates the two, so the namespace URI itself may contain ``=``.

    :raises ValueError: if a ``--ns`` value contains no ``=``.
    """
    oparser = make_option_parser()
    opts, args = oparser.parse_args()
    if not args:
        oparser.print_usage()
        oparser.exit()

    # Stay quiet unless warnings were explicitly requested with -w/--warn.
    loglevel = logging.WARNING if opts.warn else logging.CRITICAL
    logging.basicConfig(level=loglevel)

    ns_bindings = {}
    for ns_kw in opts.ns or ():
        # Split once: namespace URIs may legitimately contain "=".
        pfx, uri = ns_kw.split("=", 1)
        ns_bindings[pfx] = uri

    # Serialized output goes to the binary stdout stream unless suppressed.
    outfile: Optional[BinaryIO] = None if opts.no_out else sys.stdout.buffer

    parse_and_serialize(
        args, opts.input_format, opts.guess, outfile, opts.output_format, ns_bindings
    )


if __name__ == "__main__":
    main()
@@ -0,0 +1,158 @@
"""
A commandline tool for drawing RDFS Class diagrams in Graphviz DOT
format
You can draw the graph of an RDFS file directly:

.. code-block:: bash

    rdfs2dot my_rdfs_file.rdf | dot -Tpng | display
"""
from __future__ import annotations
import collections
import itertools
import sys
from typing import Dict
import rdflib.extras.cmdlineutils
from rdflib import RDF, RDFS, XSD
from rdflib.term import Identifier
# XSD datatypes; a property whose range is one of these is drawn as a field
# row inside the domain's node instead of as an edge to a separate node.
XSDTERMS = [
    XSD[local_name]
    for local_name in (
        "anyURI base64Binary boolean byte date dateTime decimal double "
        "duration float gDay gMonth gMonthDay gYear gYearMonth hexBinary "
        "ID IDREF IDREFS int integer language long Name NCName "
        "negativeInteger NMTOKEN NMTOKENS nonNegativeInteger "
        "nonPositiveInteger normalizedString positiveInteger QName short "
        "string time token unsignedByte unsignedInt unsignedLong "
        "unsignedShort"
    ).split()
]

# Colour of edges drawn for properties linking a domain to a range.
EDGECOLOR = "blue"
# Colour attribute given to every node.
NODECOLOR = "black"
# Colour of edges drawn for rdfs:subClassOf statements.
ISACOLOR = "black"
def rdfs2dot(g, stream, opts=None):
    """
    Convert the RDFS schema in a graph to Graphviz DOT, written to ``stream``.

    Every resource typed ``rdfs:Class``, and every resource that takes part
    in an ``rdfs:subClassOf``, ``rdfs:domain`` or non-datatype ``rdfs:range``
    statement that is drawn, becomes a table-shaped node. ``rdfs:subClassOf``
    statements become edges from the superclass to the subclass. For each
    ``rdf:Property``, every (domain, range) pair is drawn as a field row in
    the domain's table when the range is ``rdfs:Literal`` or one of
    ``XSDTERMS``, and as a labelled edge from domain to range otherwise.

    Labels come from ``rdfs:label``, then from the local part of the qname,
    and finally from the term itself, so a label is never ``None``. They are
    escaped for the context they are written into.

    :param g: graph holding the schema.
    :param stream: object with a ``write`` method receiving the DOT text.
    :param opts: accepted for callers that pass an options argument; unused.
    """
    from html import escape

    fields = collections.defaultdict(set)
    nodes: Dict[Identifier, str] = {}

    def node(nd):
        # Hand out sequential DOT identifiers the first time a term is seen.
        if nd not in nodes:
            nodes[nd] = "node%d" % len(nodes)
        return nodes[nd]

    def label(xx, grf):
        lbl = grf.value(xx, RDFS.label)
        if lbl is None:
            try:
                lbl = grf.namespace_manager.compute_qname(xx)[2]
            except Exception:
                # bnodes and some weird URIs cannot be split; use the term
                # itself rather than emitting the text "None".
                lbl = xx
        # Plain strings keep the later sort well-defined.
        return str(lbl)

    def quoted(text):
        # Escape for use inside a double-quoted DOT string.
        return text.replace("\\", "\\\\").replace('"', '\\"')

    stream.write('digraph { \n node [ fontname="DejaVu Sans" ] ; \n')

    # Register classes so they get a node even without any edges.
    for cls in g.subjects(RDF.type, RDFS.Class):
        node(cls)

    for sub, sup in g.subject_objects(RDFS.subClassOf):
        sub_id = node(sub)
        sup_id = node(sup)
        stream.write("\t%s -> %s [ color=%s ] ;\n" % (sup_id, sub_id, ISACOLOR))

    for prop in g.subjects(RDF.type, RDF.Property):
        for dom, rng in itertools.product(
            g.objects(prop, RDFS.domain), g.objects(prop, RDFS.range)
        ):
            if rng in XSDTERMS or rng == RDFS.Literal:
                type_label = "literal" if rng == RDFS.Literal else label(rng, g)
                fields[node(dom)].add((label(prop, g), type_label))
            else:
                stream.write(
                    '\t%s -> %s [ color=%s, label="%s" ];\n'
                    % (node(dom), node(rng), EDGECOLOR, quoted(label(prop, g)))
                )

    opstr = (
        "%s [ shape=none, color=%s label=< <table color='#666666'"
        + " cellborder='0' cellspacing='0' border='1'><tr>"
        + "<td colspan='2' bgcolor='grey'><B>%s</B></td>"
        + "</tr>%s</table> > ] \n"
    )
    for term, dot_id in nodes.items():
        stream.write("# %s %s\n" % (term, dot_id))
        # HTML-like labels need &, <, > and quotes escaped in literal text.
        rows = [
            "<tr><td align='left'>%s</td><td>%s</td></tr>"
            % (escape(field_name), escape(field_type))
            for field_name, field_type in sorted(fields[dot_id])
        ]
        stream.write(
            opstr % (dot_id, NODECOLOR, escape(label(term, g)), "".join(rows))
        )

    stream.write("}\n")
def _help():
sys.stderr.write(
"""
rdfs2dot.py [-f <format>] files...
Read RDF files given on STDOUT, writes a graph of the RDFS schema in
DOT language to stdout
-f specifies parser to use, if not given,
"""
)
def main():
    """
    Command line entry point for the ``rdfs2dot`` tool.

    Passes ``rdfs2dot`` and ``_help`` to ``rdflib.extras.cmdlineutils.main``,
    which does the actual work.
    """
    run = rdflib.extras.cmdlineutils.main
    run(rdfs2dot, _help)


if __name__ == "__main__":
    main()