2025-12-01
This commit is contained in:
@@ -0,0 +1,3 @@
|
||||
"""
|
||||
Various commandline tools for working with RDFLib
|
||||
"""
|
||||
@@ -0,0 +1,135 @@
|
||||
"""
|
||||
This file provides a single function `serialize_in_chunks()` which can serialize a
|
||||
Graph into a number of NT files with a maximum number of triples or maximum file size.
|
||||
|
||||
There is an option to preserve any prefixes declared for the original graph in the first
|
||||
file, which will be a Turtle file.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from contextlib import ExitStack, contextmanager
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, BinaryIO, Generator, Optional, Tuple
|
||||
|
||||
from rdflib.graph import Graph
|
||||
from rdflib.plugins.serializers.nt import _nt_row
|
||||
|
||||
# from rdflib.term import Literal
|
||||
|
||||
# if TYPE_CHECKING:
|
||||
# from rdflib.graph import _TriplePatternType
|
||||
|
||||
__all__ = ["serialize_in_chunks"]
|
||||
|
||||
|
||||
def serialize_in_chunks(
    g: Graph,
    max_triples: int = 10000,
    max_file_size_kb: Optional[int] = None,
    file_name_stem: str = "chunk",
    output_dir: Optional[Path] = None,
    write_prefixes: bool = False,
) -> None:
    """
    Serializes a given Graph into a series of N-Triples files of bounded size.

    :param g:
        The graph to serialize.

    :param max_file_size_kb:
        Maximum size per NT file in kB (1,000 bytes).

        If both this parameter and max_triples are set, max_file_size_kb is used.

    :param max_triples:
        Maximum size per NT file in triples
        Equivalent to lines in file.

        If the graph holds no more than max_triples triples, a single file
        named ``<file_name_stem>_all.nt`` is written instead of numbered chunks.

    :param file_name_stem:
        Prefix of each file name.
        e.g. "chunk" = chunk_000001.nt, chunk_000002.nt...

    :param output_dir:
        The directory you want the files to be written to.
        Defaults to the current working directory.

    :param write_prefixes:
        The first file created is a Turtle file containing original graph prefixes.
        When set, the NT chunks are numbered from 1 rather than 0.

    :raises ValueError:
        If output_dir is not a directory, or if a single serialized triple is
        larger than max_file_size_kb.

    See ``../test/test_tools/test_chunk_serializer.py`` for examples of this in use.
    """

    if output_dir is None:
        output_dir = Path.cwd()

    if not output_dir.is_dir():
        raise ValueError(
            "If you specify an output_dir, it must actually be a directory!"
        )

    @contextmanager
    def _start_new_file(file_no: int) -> Generator[Tuple[Path, BinaryIO], None, None]:
        if TYPE_CHECKING:
            # this is here because mypy gets a bit confused
            assert output_dir is not None
        fp = Path(output_dir) / f"{file_name_stem}_{str(file_no).zfill(6)}.nt"
        with open(fp, "wb") as fh:
            yield fp, fh

    def _serialize_prefixes(g: Graph) -> str:
        pres = [f"PREFIX {k}: <{v}>" for k, v in g.namespace_manager.namespaces()]
        return "\n".join(sorted(pres)) + "\n"

    if write_prefixes:
        with open(
            Path(output_dir) / f"{file_name_stem}_000000.ttl", "w", encoding="utf-8"
        ) as fh:
            fh.write(_serialize_prefixes(g))

    # The stack only ever holds the chunk currently being written: it is
    # unwound (closing that file) before each new chunk is opened, so the number
    # of open handles stays constant however many chunks are produced.
    with ExitStack() as xstack:
        if max_file_size_kb is not None:
            max_file_size = max_file_size_kb * 1000
            file_no = 1 if write_prefixes else 0
            bytes_written = 0
            for i, t in enumerate(g.triples((None, None, None))):
                row_bytes = _nt_row(t).encode("utf-8")
                if len(row_bytes) > max_file_size:
                    # report the size of the row, not the row itself
                    raise ValueError(
                        f"cannot write triple {t!r} as it's serialized size of {len(row_bytes) / 1000} exceeds max_file_size_kb = {max_file_size_kb}"
                    )
                if i == 0:
                    _, fhb = xstack.enter_context(_start_new_file(file_no))
                    bytes_written = 0
                elif (bytes_written + len(row_bytes)) >= max_file_size:
                    file_no += 1
                    xstack.close()  # finish the previous chunk first
                    _, fhb = xstack.enter_context(_start_new_file(file_no))
                    bytes_written = 0

                bytes_written += fhb.write(row_bytes)

        else:
            # count the triples in the graph
            graph_length = len(g)

            if graph_length <= max_triples:
                # the graph is less than max so just NT serialize the whole thing
                g.serialize(
                    destination=Path(output_dir) / f"{file_name_stem}_all.nt",
                    format="nt",
                )
            else:
                # graph_length is > max_triples, make enough files for all graph
                file_no = 1 if write_prefixes else 0
                for i, t in enumerate(g.triples((None, None, None))):
                    if i % max_triples == 0:
                        xstack.close()  # finish the previous chunk first
                        _, fhb = xstack.enter_context(_start_new_file(file_no))
                        file_no += 1
                    fhb.write(_nt_row(t).encode("utf-8"))
        return
|
||||
@@ -0,0 +1,570 @@
|
||||
"""
|
||||
A commandline tool for semi-automatically converting CSV to RDF.
|
||||
|
||||
See also https://github.com/RDFLib/pyTARQL in the RDFlib family of tools
|
||||
|
||||
try: ``csv2rdf --help``
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import codecs
|
||||
import configparser
|
||||
import csv
|
||||
import datetime
|
||||
import fileinput
|
||||
import getopt
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import warnings
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
from urllib.parse import quote
|
||||
|
||||
import rdflib
|
||||
from rdflib.namespace import RDF, RDFS, split_uri
|
||||
from rdflib.term import URIRef
|
||||
|
||||
__all__ = ["CSV2RDF"]
|
||||
|
||||
HELP = """
|
||||
csv2rdf.py \
|
||||
-b <instance-base> \
|
||||
-p <property-base> \
|
||||
[-D <default>] \
|
||||
[-c <classname>] \
|
||||
[-i <identity column(s)>] \
|
||||
[-l <label columns>] \
|
||||
[-s <N>] [-o <output>] \
|
||||
[-f configfile] \
|
||||
[--col<N> <colspec>] \
|
||||
[--prop<N> <property>] \
|
||||
<[-d <delim>] \
|
||||
[-C] [files...]"
|
||||
|
||||
Reads csv files from stdin or given files
|
||||
if -d is given, use this delimiter
|
||||
if -s is given, skips N lines at the start
|
||||
Creates a URI from the columns given to -i, or automatically by numbering if
|
||||
none is given
|
||||
Outputs RDFS labels from the columns given to -l
|
||||
if -c is given adds a type triple with the given classname
|
||||
if -C is given, the class is defined as rdfs:Class
|
||||
Outputs one RDF triple per column in each row.
|
||||
Output is in n3 format.
|
||||
Output is stdout, unless -o is specified
|
||||
|
||||
Long options also supported: \
|
||||
--base, \
|
||||
--propbase, \
|
||||
--ident, \
|
||||
--class, \
|
||||
--label, \
|
||||
--out, \
|
||||
--defineclass
|
||||
|
||||
Long options --col0, --col1, ...
|
||||
can be used to specify conversion for columns.
|
||||
Conversions can be:
|
||||
ignore, float(), int(), split(sep, [more]), uri(base, [class]), date(format)
|
||||
|
||||
Long options --prop0, --prop1, ...
|
||||
can be used to use specific properties, rather than ones auto-generated
|
||||
from the headers
|
||||
|
||||
-D sets the default conversion for columns not listed
|
||||
|
||||
-f says to read config from a .ini/config file - the file must contain one
|
||||
section called csv2rdf, with keys like the long options, i.e.:
|
||||
|
||||
[csv2rdf]
|
||||
out=output.n3
|
||||
base=http://example.org/
|
||||
col0=split(";")
|
||||
col1=split(";", uri("http://example.org/things/",
|
||||
"http://xmlns.com/foaf/0.1/Person"))
|
||||
col2=float()
|
||||
col3=int()
|
||||
col4=date("%Y-%b-%d %H:%M:%S")
|
||||
|
||||
"""
|
||||
|
||||
# bah - ugly global
|
||||
uris: Dict[Any, Tuple[URIRef, Optional[URIRef]]] = {}
|
||||
|
||||
|
||||
def toProperty(label: str):  # noqa: N802
    """
    Turn a column header into a lower-camel-case property name.

    Non-word characters become word breaks, and a break is also inserted
    between a lowercase letter and a following uppercase letter. The first
    word is lowercased and every later word is capitalized. Underscores count
    as word characters and are therefore kept.

    FIRST NM => firstNm

    firstNm => firstNm

    FIRST_NM => first_nm
    """
    spaced = re.sub(r"[^\w]", " ", label)
    spaced = re.sub("([a-z])([A-Z])", "\\1 \\2", spaced)
    # str.split always yields at least one element, so ``head`` is always bound
    head, *tail = spaced.split(" ")
    return head.lower() + "".join(word.capitalize() for word in tail)
|
||||
|
||||
|
||||
def toPropertyLabel(label):  # noqa: N802
    """
    Lowercase the first character of ``label`` unless its second character
    is uppercase, in which case ``label`` is returned unchanged.
    """
    if label[1:2].isupper():
        return label
    return label[0:1].lower() + label[1:]
|
||||
|
||||
|
||||
def index(l_: List[int], i: Tuple[int, ...]) -> Tuple[int, ...]:
    """Pick the items of ``l_`` found at the positions listed in ``i``.

    >>> index([1,2,3],(0,2))
    (1, 3)
    """
    return tuple(l_[position] for position in i)
|
||||
|
||||
|
||||
def csv_reader(csv_data, dialect=csv.excel, **kwargs):
    """
    Lazily yield the rows that ``csv.reader`` parses from ``csv_data``.

    ``dialect`` and any extra keyword arguments are passed straight through
    to ``csv.reader``.
    """
    yield from csv.reader(csv_data, dialect=dialect, **kwargs)
|
||||
|
||||
|
||||
def prefixuri(x, prefix, class_: Optional[URIRef] = None):
    """
    Build a URIRef for the cell value ``x`` and remember it in ``uris``.

    With a ``prefix`` the value has its spaces replaced by underscores, is
    percent-encoded (nothing is treated as safe) and appended to the prefix;
    without one, ``x`` itself is used as the URI. The ``(uri, class_)`` pair is
    stored in the module-level ``uris`` mapping under ``x``.

    :return: the generated URIRef
    """
    if prefix:
        # replace on the str, not on encoded bytes: bytes.replace() rejects str
        # arguments on Python 3. quote() encodes str input as UTF-8 itself.
        r = rdflib.URIRef(prefix + quote(x.replace(" ", "_"), safe=""))
    else:
        r = rdflib.URIRef(x)
    uris[x] = (r, class_)
    return r
|
||||
|
||||
|
||||
# meta-language for config
|
||||
|
||||
|
||||
class NodeMaker:
    """Base column converter: calling an instance turns a cell value into an RDF node."""

    def range(self):
        # value reported as the rdfs:range of properties using this converter
        return rdflib.RDFS.Literal

    def __call__(self, x: Any):
        # default conversion: wrap the value in a Literal
        return rdflib.Literal(x)
|
||||
|
||||
|
||||
class NodeUri(NodeMaker):
    """Column converter that turns cell values into URIs via ``prefixuri``."""

    def __init__(self, prefix, class_):
        # prefix: string prepended to the encoded cell value (may be None)
        # class_: optional class URI recorded alongside each generated URI
        self.prefix = prefix
        self.class_: Optional[URIRef] = rdflib.URIRef(class_) if class_ else None

    def __call__(self, x):
        return prefixuri(x, self.prefix, self.class_)

    def range(self):
        # Fall back to rdfs:Resource when no class was configured; the RDF
        # namespace has no "Resource" term.
        return self.class_ or rdflib.RDFS.Resource
|
||||
|
||||
|
||||
class NodeLiteral(NodeMaker):
    """Literal-producing converter that stores an optional argument ``f`` for subclasses to use."""

    def __init__(self, f=None):
        # f: subclass-specific; a pre-processing callable or, for dates, a format string
        self.f = f
|
||||
|
||||
|
||||
class NodeFloat(NodeLiteral):
    """Converter producing float Literals, optionally pre-processing the cell with ``self.f``."""

    def __call__(self, x):
        transform = self.f
        if transform and not callable(transform):
            raise Exception("Function passed to float is not callable")
        raw = transform(x) if transform else x
        return rdflib.Literal(float(raw))

    def range(self):
        return rdflib.XSD.double
|
||||
|
||||
|
||||
class NodeInt(NodeLiteral):
    """Converter producing integer Literals, optionally pre-processing the cell with ``self.f``."""

    def __call__(self, x):
        transform = self.f
        if transform and not callable(transform):
            raise Exception("Function passed to int is not callable")
        raw = transform(x) if transform else x
        return rdflib.Literal(int(raw))

    def range(self):
        return rdflib.XSD.int
|
||||
|
||||
|
||||
class NodeBool(NodeLiteral):
    """Converter producing boolean Literals, optionally pre-processing the cell with ``self.f``."""

    def __call__(self, x):
        # NOTE(review): bool() of any non-empty string is True, so without a
        # pre-processing function text such as "false" converts to true.
        if not self.f:
            return rdflib.Literal(bool(x))
        if callable(self.f):
            return rdflib.Literal(bool(self.f(x)))
        raise Exception("Function passed to bool is not callable")

    def range(self):
        # the XSD datatype is named "boolean"; "bool" is not an XSD term
        return rdflib.XSD.boolean
|
||||
|
||||
|
||||
class NodeReplace(NodeMaker):
    """Converter that returns the cell value with every ``a`` replaced by ``b``."""

    def __init__(self, a, b):
        # a: substring to look for; b: its replacement
        self.a = a
        self.b = b

    def __call__(self, x):
        # returns the result of x.replace() as-is; it is not wrapped in a Literal here
        return x.replace(self.a, self.b)
|
||||
|
||||
|
||||
class NodeDate(NodeLiteral):
    """Converter that parses the cell with ``datetime.strptime`` using the format held in ``self.f``."""

    def __call__(self, x):
        # self.f is used as the strptime format string
        return rdflib.Literal(datetime.datetime.strptime(x, self.f))

    def range(self):
        return rdflib.XSD.dateTime
|
||||
|
||||
|
||||
class NodeSplit(NodeMaker):
    """Converter that splits a cell on ``sep`` and converts every non-empty piece with ``f``."""

    def __init__(self, sep, f):
        self.sep = sep
        self.f = f

    def __call__(self, x):
        # default to plain Literals when no per-item converter was supplied
        if not self.f:
            self.f = rdflib.Literal
        if not callable(self.f):
            raise Exception("Function passed to split is not callable!")
        pieces = (part.strip() for part in x.split(self.sep))
        return [self.f(piece) for piece in pieces if piece != ""]

    def range(self):
        # delegate to the per-item converter when it is itself a NodeMaker
        if self.f and isinstance(self.f, NodeMaker):
            return self.f.range()
        return NodeMaker.range(self)
|
||||
|
||||
|
||||
default_node_make = NodeMaker()
|
||||
|
||||
|
||||
# The functions below are the vocabulary available inside a column spec
# (e.g. ``split(";", uri("http://example.org/"))``); each returns the converter
# for that spec.


def _config_ignore(*args, **kwargs):
    # marker value: convert() skips columns whose spec equals "ignore"
    return "ignore"


def _config_uri(prefix=None, class_=None):
    return NodeUri(prefix, class_)


def _config_literal():
    return NodeLiteral()


def _config_float(f=None):
    return NodeFloat(f)


def _config_replace(a, b):
    return NodeReplace(a, b)


def _config_int(f=None):
    return NodeInt(f)


def _config_bool(f=None):
    return NodeBool(f)


def _config_date(format_):
    return NodeDate(format_)


def _config_split(sep=None, f=None):
    return NodeSplit(sep, f)


# names usable inside a column spec, mapped to the factory that implements them
config_functions = {
    "ignore": _config_ignore,
    "uri": _config_uri,
    "literal": _config_literal,
    "float": _config_float,
    "int": _config_int,
    "date": _config_date,
    "split": _config_split,
    "replace": _config_replace,
    "bool": _config_bool,
}


def column(v):
    """Return a function for column mapping"""

    # SECURITY: ``v`` is evaluated as a Python expression with config_functions
    # as its globals. Only pass column specs from trusted sources.
    return eval(v, config_functions)
|
||||
|
||||
|
||||
class CSV2RDF:
    """
    Converts rows from a CSV reader into RDF triples written to ``OUT``.

    Configure the upper-case attributes, then call :meth:`convert`.
    """

    def __init__(self):
        self.CLASS = None  # class URI given to every row resource, if any
        self.BASE = None  # namespace for row URIs
        self.PROPBASE = None  # namespace for properties derived from headers
        # "auto" numbers the rows; otherwise column index(es) forming the URI
        self.IDENT: Union[Tuple[str, ...], str] = "auto"
        self.LABEL = None  # column index(es) joined into an rdfs:label
        self.DEFINECLASS = False  # also emit class/property definitions
        self.SKIP = 0  # lines to skip before the header line
        self.DELIM = ","
        self.DEFAULT = None  # default column spec; only checked for "ignore"

        self.COLUMNS = {}  # column index -> converter (or "ignore")
        self.PROPS = {}  # column index -> property URI overriding the header

        self.OUT = sys.stdout

        self.triples = 0  # number of triples written so far

    def triple(self, s, p, o):
        """Write one triple to ``OUT`` using each term's ``n3()`` form and count it."""
        self.OUT.write("%s %s %s .\n" % (s.n3(), p.n3(), o.n3()))
        self.triples += 1

    def convert(self, csvreader):
        """
        Read a header line and data rows from ``csvreader`` and write triples.

        Progress and summary lines go to stderr. ``OUT`` is closed at the end.
        Any exception raised while handling a row is re-raised after the row
        number has been reported; a value that fails conversion only produces
        a warning.
        """
        start = time.time()

        if self.OUT:
            sys.stderr.write("Output to %s\n" % self.OUT.name)

        if self.IDENT != "auto" and not isinstance(self.IDENT, tuple):
            self.IDENT = (self.IDENT,)

        # Mirror the IDENT handling for a single label column: a bare int
        # cannot be iterated by index(), and column 0 would test as false.
        if isinstance(self.LABEL, int):
            self.LABEL = (self.LABEL,)

        if not self.BASE:
            warnings.warn("No base given, using http://example.org/instances/")
            self.BASE = rdflib.Namespace("http://example.org/instances/")

        if not self.PROPBASE:
            # the message names the namespace that is actually used below
            warnings.warn("No property base given, using http://example.org/props/")
            self.PROPBASE = rdflib.Namespace("http://example.org/props/")

        # skip lines at the start
        for _ in range(self.SKIP):
            next(csvreader)

        # read header line
        header_labels = list(next(csvreader))
        headers = dict(enumerate([self.PROPBASE[toProperty(x)] for x in header_labels]))
        # override header properties if some are given
        for k, v in self.PROPS.items():
            headers[k] = v
            header_labels[k] = split_uri(v)[1]

        if self.DEFINECLASS:
            # output class/property definitions
            self.triple(self.CLASS, RDF.type, RDFS.Class)
            for i in range(len(headers)):
                h, l_ = headers[i], header_labels[i]
                if h == "" or l_ == "":
                    continue
                if self.COLUMNS.get(i, self.DEFAULT) == "ignore":
                    continue
                self.triple(h, RDF.type, RDF.Property)
                self.triple(h, RDFS.label, rdflib.Literal(toPropertyLabel(l_)))
                self.triple(h, RDFS.domain, self.CLASS)
                self.triple(
                    h, RDFS.range, self.COLUMNS.get(i, default_node_make).range()
                )

        rows = 0
        for l_ in csvreader:
            try:
                if self.IDENT == "auto":
                    uri = self.BASE["%d" % rows]
                else:
                    uri = self.BASE[
                        "_".join(
                            [
                                # replace on the str: bytes.replace() rejects
                                # str arguments; quote() encodes as UTF-8 itself
                                quote(x.replace(" ", "_"), safe="")  # type: ignore[attr-defined]
                                # type error: Argument 2 to "index" has incompatible type "Union[Tuple[str, ...], str]"; expected "Tuple[int, ...]"
                                for x in index(l_, self.IDENT)  # type: ignore[arg-type]
                            ]
                        )
                    ]

                if self.LABEL:
                    self.triple(
                        # type error: Argument 1 to "join" of "str" has incompatible type "Tuple[int, ...]"; expected "Iterable[str]"
                        uri,
                        RDFS.label,
                        rdflib.Literal(" ".join(index(l_, self.LABEL))),  # type: ignore[arg-type]
                    )

                if self.CLASS:
                    # type triple
                    self.triple(uri, RDF.type, self.CLASS)

                for i, x in enumerate(l_):
                    # type error: "int" has no attribute "strip"
                    x = x.strip()  # type: ignore[attr-defined]
                    if x != "":
                        if self.COLUMNS.get(i, self.DEFAULT) == "ignore":
                            continue
                        try:
                            o = self.COLUMNS.get(i, rdflib.Literal)(x)
                            if isinstance(o, list):
                                for _o in o:
                                    self.triple(uri, headers[i], _o)
                            else:
                                self.triple(uri, headers[i], o)

                        except Exception as e:
                            # Python 3 exceptions have no ``.message``; format
                            # the exception itself so the warning is emitted
                            warnings.warn(
                                "Could not process value for column "
                                + "%d:%s in row %d, ignoring: %s "
                                % (i, headers[i], rows, e)
                            )

                rows += 1
                if rows % 100000 == 0:
                    sys.stderr.write(
                        "%d rows, %d triples, elapsed %.2fs.\n"
                        % (rows, self.triples, time.time() - start)
                    )
            except Exception:
                sys.stderr.write("Error processing line: %d\n" % rows)
                raise

        # output types/labels for generated URIs
        classes = set()
        for value, (u, c) in uris.items():
            self.triple(u, RDFS.label, rdflib.Literal(value))
            if c:
                c = rdflib.URIRef(c)
                classes.add(c)
                self.triple(u, RDF.type, c)

        for c in classes:
            self.triple(c, RDF.type, RDFS.Class)

        # NOTE(review): this also closes sys.stdout when no output file was set
        self.OUT.close()
        sys.stderr.write("Converted %d rows into %d triples.\n" % (rows, self.triples))
        sys.stderr.write("Took %.2f seconds.\n" % (time.time() - start))
|
||||
|
||||
|
||||
def main():
    """
    Command line entry point: read options (and an optional config file),
    configure a CSV2RDF instance and convert the given files or stdin.

    Command line options override values read from the ``-f`` config file.
    """
    csv2rdf = CSV2RDF()

    argv = sys.argv[1:]
    long_options = [
        "out=",
        "base=",
        "delim=",
        "propbase=",
        "class=",
        "default=",
        "ident=",
        "label=",
        "skip=",
        "defineclass",
        "help",
    ]
    # --col<N> and --prop<N> are open-ended, so getopt cannot know them in
    # advance: register the ones that actually appear on the command line.
    for arg in argv:
        numbered = re.match(r"--((?:col|prop)\d+)(?:=|$)", arg)
        if numbered and numbered.group(1) + "=" not in long_options:
            long_options.append(numbered.group(1) + "=")

    opts: Union[Dict[str, str], List[Tuple[str, str]]]
    opts, files = getopt.getopt(argv, "hc:b:p:i:o:Cf:l:s:d:D:", long_options)
    opts = dict(opts)

    if "-h" in opts or "--help" in opts:
        print(HELP)
        sys.exit(-1)

    # SECURITY: ident, label and column specs below are passed to eval();
    # only use config files and arguments from trusted sources.
    if "-f" in opts:
        config = configparser.ConfigParser()
        with open(opts["-f"]) as config_file:
            config.read_file(config_file)
        for k, v in config.items("csv2rdf"):
            if k == "out":
                csv2rdf.OUT = codecs.open(v, "w", "utf-8")
            elif k == "base":
                csv2rdf.BASE = rdflib.Namespace(v)
            elif k == "propbase":
                csv2rdf.PROPBASE = rdflib.Namespace(v)
            elif k == "class":
                csv2rdf.CLASS = rdflib.URIRef(v)
            elif k == "defineclass":
                # NOTE(review): any non-empty value, including "false", is true here
                csv2rdf.DEFINECLASS = bool(v)
            elif k == "ident":
                csv2rdf.IDENT = eval(v)
            elif k == "label":
                csv2rdf.LABEL = eval(v)
            elif k == "delim":
                csv2rdf.DELIM = v
            elif k == "skip":
                csv2rdf.SKIP = int(v)
            elif k == "default":
                csv2rdf.DEFAULT = column(v)
            elif k.startswith("col"):
                csv2rdf.COLUMNS[int(k[3:])] = column(v)
            elif k.startswith("prop"):
                csv2rdf.PROPS[int(k[4:])] = rdflib.URIRef(v)

    if "-o" in opts:
        csv2rdf.OUT = codecs.open(opts["-o"], "w", "utf-8")
    if "--out" in opts:
        csv2rdf.OUT = codecs.open(opts["--out"], "w", "utf-8")

    if "-b" in opts:
        csv2rdf.BASE = rdflib.Namespace(opts["-b"])
    if "--base" in opts:
        csv2rdf.BASE = rdflib.Namespace(opts["--base"])

    if "-d" in opts:
        csv2rdf.DELIM = opts["-d"]
    if "--delim" in opts:
        csv2rdf.DELIM = opts["--delim"]

    if "-D" in opts:
        csv2rdf.DEFAULT = column(opts["-D"])
    if "--default" in opts:
        csv2rdf.DEFAULT = column(opts["--default"])

    if "-p" in opts:
        csv2rdf.PROPBASE = rdflib.Namespace(opts["-p"])
    if "--propbase" in opts:
        csv2rdf.PROPBASE = rdflib.Namespace(opts["--propbase"])

    if "-l" in opts:
        csv2rdf.LABEL = eval(opts["-l"])
    if "--label" in opts:
        csv2rdf.LABEL = eval(opts["--label"])

    if "-i" in opts:
        csv2rdf.IDENT = eval(opts["-i"])
    if "--ident" in opts:
        csv2rdf.IDENT = eval(opts["--ident"])

    if "-s" in opts:
        csv2rdf.SKIP = int(opts["-s"])
    if "--skip" in opts:
        csv2rdf.SKIP = int(opts["--skip"])

    if "-c" in opts:
        csv2rdf.CLASS = rdflib.URIRef(opts["-c"])
    if "--class" in opts:
        csv2rdf.CLASS = rdflib.URIRef(opts["--class"])

    for k, v in opts.items():
        if k.startswith("--col"):
            csv2rdf.COLUMNS[int(k[5:])] = column(v)
        elif k.startswith("--prop"):
            csv2rdf.PROPS[int(k[6:])] = rdflib.URIRef(v)

    if csv2rdf.CLASS and ("-C" in opts or "--defineclass" in opts):
        csv2rdf.DEFINECLASS = True

    csv2rdf.convert(csv_reader(fileinput.input(files), delimiter=csv2rdf.DELIM))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+222
@@ -0,0 +1,222 @@
|
||||
"""
|
||||
This rdflib Python script creates a DefinedNamespace Python file from a given RDF file
|
||||
|
||||
It is a very simple script: it finds all things defined in the RDF file within a given
|
||||
namespace:
|
||||
|
||||
<thing> a ?x
|
||||
|
||||
where ?x is anything and <thing> starts with the given namespace
|
||||
|
||||
Nicholas J. Car, Dec, 2021
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import datetime
|
||||
import keyword
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Iterable, List, Tuple
|
||||
|
||||
from rdflib.graph import Graph
|
||||
from rdflib.namespace import DCTERMS, OWL, RDFS, SKOS
|
||||
from rdflib.util import guess_format
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from rdflib.query import ResultRow
|
||||
|
||||
|
||||
def validate_namespace(namespace: str) -> None:
    """Raise ValueError unless ``namespace`` ends with '/' or '#'."""
    if namespace.endswith("/") or namespace.endswith("#"):
        return
    raise ValueError("The supplied namespace must end with '/' or '#'")
|
||||
|
||||
|
||||
def validate_object_id(object_id: str) -> None:
    """
    Raise ValueError if any character of ``object_id`` is not uppercase.

    Characters without case, such as digits, are rejected as well. An empty
    string passes.
    """
    if not all(char.isupper() for char in object_id):
        raise ValueError("The supplied object_id must be an all-capitals string")
|
||||
|
||||
|
||||
# This function is not used: it was originally written to get classes and to be used
|
||||
# alongside a method to get properties, but then it was decided that a single function
|
||||
# to get everything in the namespace, get_target_namespace_elements(), was both simpler
|
||||
# and better covered all namespace elements, so that function is used instead.
|
||||
#
|
||||
# def get_classes(g, target_namespace):
|
||||
# namespaces = {"dcterms": DCTERMS, "owl": OWL, "rdfs": RDFS, "skos": SKOS}
|
||||
# q = """
|
||||
# SELECT DISTINCT ?x ?def
|
||||
# WHERE {
|
||||
# # anything that is an instance of owl:Class or rdfs:Class
|
||||
# # or any subclass of them
|
||||
# VALUES ?c { owl:Class rdfs:Class }
|
||||
# ?x rdfs:subClassOf*/a ?c .
|
||||
#
|
||||
# # get any definitions, if they have one
|
||||
# OPTIONAL {
|
||||
# ?x rdfs:comment|dcterms:description|skos:definition ?def
|
||||
# }
|
||||
#
|
||||
# # only get results for the targetted namespace (supplied by user)
|
||||
# FILTER STRSTARTS(STR(?x), "xxx")
|
||||
# }
|
||||
# """.replace("xxx", target_namespace)
|
||||
# classes = []
|
||||
# for r in g.query(q, initNs=namespaces):
|
||||
# classes.append((str(r[0]), str(r[1])))
|
||||
#
|
||||
# classes.sort(key=lambda tup: tup[1])
|
||||
#
|
||||
# return classes
|
||||
|
||||
|
||||
def get_target_namespace_elements(
    g: Graph, target_namespace: str
) -> Tuple[List[Tuple[str, str]], List[str], List[str]]:
    """
    Collect every typed subject of ``g`` whose IRI starts with ``target_namespace``.

    :param g: the graph to query
    :param target_namespace: namespace IRI whose elements are wanted
    :return: a 3-tuple of

        * ``(iri, definitions)`` pairs sorted by IRI,
        * class-body lines (``name: URIRef  # description``) for local names
          that are valid, non-keyword Python identifiers,
        * ``_extras`` list entries for all other local names.
    """
    namespaces = {"dcterms": DCTERMS, "owl": OWL, "rdfs": RDFS, "skos": SKOS}
    # NOTE(review): the namespace is spliced into the query text, so a value
    # containing a double quote would break the query.
    q = """
        SELECT ?s (GROUP_CONCAT(DISTINCT STR(?def)) AS ?defs)
        WHERE {
            # all things in the RDF data (anything RDF.type...)
            ?s a ?o .

            # get any definitions, if they have one
            OPTIONAL {
                ?s dcterms:description|rdfs:comment|skos:definition ?def
            }

            # only get results for the target namespace (supplied by user)
            FILTER STRSTARTS(STR(?s), "xxx")
            FILTER (STR(?s) != "xxx")
        }
        GROUP BY ?s
        """.replace(
        "xxx", target_namespace
    )
    elements: List[Tuple[str, str]] = []
    for r in g.query(q, initNs=namespaces):
        if TYPE_CHECKING:
            assert isinstance(r, ResultRow)
        elements.append((str(r[0]), str(r[1])))

    elements.sort(key=lambda tup: tup[0])

    elements_strs: List[str] = []
    non_python_elements_strs: List[str] = []
    for iri, definition in elements:
        # The query only returns IRIs that start with the namespace, so drop
        # just that leading prefix; str.replace would also remove any later
        # occurrence of the namespace text inside the local name.
        name = iri[len(target_namespace) :]
        desc = definition.replace("\n", " ")
        # Indentation matches where make_dn_file places these lines: four
        # spaces inside the class body, eight inside the _extras list.
        if name.isidentifier() and not keyword.iskeyword(name):
            elements_strs.append(f"    {name}: URIRef  # {desc}\n")
        else:
            non_python_elements_strs.append(f"""        "{name}",  # {desc}\n""")

    return elements, elements_strs, non_python_elements_strs
|
||||
|
||||
|
||||
def make_dn_file(
    output_file_name: Path,
    target_namespace: str,
    elements_strs: Iterable[str],
    non_python_elements_strs: List[str],
    object_id: str,
    fail: bool,
) -> None:
    """
    Write a Python module defining a DefinedNamespace subclass.

    :param output_file_name: path of the file to create (overwritten if present)
    :param target_namespace: namespace IRI assigned to ``_NS``
    :param elements_strs: ready-made class-body lines, written verbatim
    :param non_python_elements_strs: ready-made ``_extras`` entries, written
        verbatim; the ``_extras`` list is omitted when this is empty
    :param object_id: name of the generated class
    :param fail: when true, ``_fail = True`` is added to the class
    """
    # Same naive-UTC value utcnow() produced, without the deprecated call.
    generated = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    # The class body is indented by four spaces so the generated file is
    # valid Python.
    header = f'''from rdflib.namespace import DefinedNamespace, Namespace
from rdflib.term import URIRef


class {object_id}(DefinedNamespace):
    """
    DESCRIPTION_EDIT_ME_!

    Generated from: SOURCE_RDF_FILE_EDIT_ME_!
    Date: {generated}
    """
'''
    # Explicit encoding: descriptions copied from the ontology may hold
    # characters the platform default encoding cannot represent.
    with open(output_file_name, "w", encoding="utf-8") as f:
        f.write(header)
        f.write("\n")
        f.write(f'    _NS = Namespace("{target_namespace}")')
        f.write("\n\n")
        if fail:
            f.write("    _fail = True")
            f.write("\n\n")
        f.writelines(elements_strs)

        if len(non_python_elements_strs) > 0:
            f.write("\n")
            f.write("    # Valid non-python identifiers")
            f.write("\n")
            f.write("    _extras = [")
            f.write("\n")
            f.writelines(non_python_elements_strs)
            f.write("    ]")
            f.write("\n")
|
||||
|
||||
|
||||
# Script entry point: parse the arguments, load the ontology, then write
# ``_<object_id>.py`` into the current working directory.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "ontology_file",
        type=str,
        help="Path to the RDF ontology to extract a DefinedNamespace from.",
    )

    parser.add_argument(
        "target_namespace",
        type=str,
        help="The namespace within the ontology that you want to create a "
        "DefinedNamespace for.",
    )

    parser.add_argument(
        "object_id",
        type=str,
        help="The RDFlib object ID of the DefinedNamespace, e.g. GEO for GeoSPARQL.",
    )

    parser.add_argument(
        "-f",
        "--fail",
        dest="fail",
        action="store_true",
        help="Whether (true) or not (false) to mimic ClosedNamespace and fail on "
        "non-element use",
    )
    parser.add_argument("--no-fail", dest="fail", action="store_false")
    # the shared destination of --fail/--no-fail is "fail"
    parser.set_defaults(fail=False)

    args = parser.parse_args()

    fmt = guess_format(args.ontology_file)
    if fmt is None:
        print("The format of the file you've supplied is unknown.")
        # SystemExit works even where the site module's exit() is unavailable
        raise SystemExit(1)
    g = Graph().parse(args.ontology_file, format=fmt)

    validate_namespace(args.target_namespace)

    validate_object_id(args.object_id)

    print(
        f"Creating DefinedNamespace file {args.object_id} "
        f"for {args.target_namespace}..."
    )
    print(f"Ontology with {len(g)} triples loaded...")

    print("Getting all namespace elements...")
    elements = get_target_namespace_elements(g, args.target_namespace)

    output_file_name = Path.cwd() / f"_{args.object_id}.py"
    print(f"Creating DefinedNamespace Python file {output_file_name}")
    make_dn_file(
        output_file_name,
        args.target_namespace,
        elements[1],  # class-body lines
        elements[2],  # _extras entries
        args.object_id,
        args.fail,
    )
|
||||
@@ -0,0 +1,113 @@
|
||||
"""
|
||||
A commandline tool for testing if RDF graphs are isomorphic, i.e. equal
|
||||
if BNode labels are ignored.
|
||||
"""
|
||||
|
||||
from itertools import combinations
|
||||
|
||||
from rdflib import BNode, Graph
|
||||
|
||||
|
||||
class IsomorphicTestableGraph(Graph):
    """
    Graph whose ``==`` compares structure while ignoring blank node labels.

    Ported from:
    http://www.w3.org/2001/sw/DataAccess/proto-tests/tools/rdfdiff.py
    (Sean B Palmer's RDF Graph Isomorphism Tester)
    """

    # NOTE: this class defines __eq__ without __hash__, so Python makes its
    # instances unhashable.

    def __init__(self, **kargs):
        super().__init__(**kargs)
        self.hash = None

    def internal_hash(self):
        """
        This is defined instead of __hash__ to avoid a circular recursion
        scenario with the Memory store for rdflib which requires a hash
        lookup in order to return a generator of triples
        """
        ordered_hashes = sorted(self.hashtriples())
        return hash(tuple(ordered_hashes))

    def hashtriples(self):
        """Yield one hash per triple, with blank nodes replaced by their ``vhash``."""
        for triple in self:
            signature = tuple(
                (isinstance(node, BNode) and self.vhash(node)) or node
                for node in triple
            )
            yield hash(signature)

    def vhash(self, term, done=False):
        """Return a sorted tuple describing every triple that mentions ``term``."""
        descriptions = sorted(self.vhashtriples(term, done))
        return tuple(descriptions)

    def vhashtriples(self, term, done):
        """Yield a description tuple for each triple containing ``term``."""
        for candidate in self:
            if term in candidate:
                yield tuple(self.vhashtriple(candidate, term, done))

    def vhashtriple(self, triple, term, done):
        """
        Yield a label-free stand-in for each of the three positions of ``triple``.

        Non-blank nodes stand for themselves. A blank node is replaced by its
        position when it is ``term`` or when ``done`` is set; otherwise by its
        own ``vhash`` computed with ``done=True``, which stops the recursion
        after one step.
        """
        for position, node in enumerate(triple):
            if not isinstance(node, BNode):
                yield node
            elif done or (node == term):
                yield position
            else:
                yield self.vhash(node, done=True)

    def __eq__(self, G):  # noqa: N803
        """Graph isomorphism testing."""
        if not isinstance(G, IsomorphicTestableGraph):
            return False
        if len(self) != len(G):
            return False
        if list(self) == list(G):
            return True  # @@
        return self.internal_hash() == G.internal_hash()

    def __ne__(self, G):  # noqa: N803
        """Negative graph isomorphism testing."""
        return not self.__eq__(G)
|
||||
|
||||
|
||||
def main():
    """
    Parse the given RDF documents (and optionally STDIN) and check that every
    pair of them is isomorphic.

    :raises AssertionError: naming the first pair of inputs found to differ
    """
    import sys
    from optparse import OptionParser

    usage = """usage: %prog [options] file1 file2 ... fileN"""
    op = OptionParser(usage=usage)
    op.add_option(
        "-s",
        "--stdin",
        action="store_true",
        default=False,
        help="Load from STDIN as well",
    )
    op.add_option(
        "--format",
        default="xml",
        dest="inputFormat",
        metavar="RDF_FORMAT",
        choices=["xml", "trix", "n3", "nt", "rdfa"],
        help="The format of the RDF document(s) to compare. "
        + "One of 'xml','n3','trix', 'nt', "
        + "or 'rdfa'. The default is %default",
    )

    (options, args) = op.parse_args()

    # Keep (name, graph) pairs instead of a graph-keyed dict:
    # IsomorphicTestableGraph defines __eq__ without __hash__, so its instances
    # are unhashable and cannot be dict keys or set members.
    named_graphs = []
    if options.stdin:
        graph = IsomorphicTestableGraph().parse(sys.stdin, format=options.inputFormat)
        named_graphs.append(("(STDIN)", graph))
    for fn in args:
        graph = IsomorphicTestableGraph().parse(fn, format=options.inputFormat)
        named_graphs.append((fn, graph))

    # combinations() already yields each unordered pair exactly once
    for (name1, graph1), (name2, graph2) in combinations(named_graphs, 2):
        # raised explicitly so the check still runs under ``python -O``
        if not graph1 == graph2:
            raise AssertionError("%s != %s" % (name1, name2))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,186 @@
|
||||
"""
|
||||
A commandline tool for drawing RDF graphs in Graphviz DOT format
|
||||
|
||||
You can draw the graph of an RDF file directly:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
rdf2dot my_rdf_file.rdf | dot -Tpng | display
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import collections
|
||||
import html
|
||||
import sys
|
||||
from typing import Any, Dict, TextIO
|
||||
|
||||
import rdflib
|
||||
import rdflib.extras.cmdlineutils
|
||||
from rdflib import XSD
|
||||
from rdflib.graph import Graph
|
||||
from rdflib.term import Literal, Node, URIRef
|
||||
|
||||
# Predicates tried, in this order, by rdf2dot() when looking for a node label.
LABEL_PROPERTIES = [rdflib.RDFS.label] + [
    rdflib.URIRef(iri)
    for iri in (
        "http://purl.org/dc/elements/1.1/title",
        "http://xmlns.com/foaf/0.1/name",
        "http://www.w3.org/2006/vcard/ns#fn",
        "http://www.w3.org/2006/vcard/ns#org",
    )
]

# XSD datatype terms, built from their local names (order preserved).
XSDTERMS = [
    XSD[local_name]
    for local_name in (
        "anyURI base64Binary boolean byte date dateTime decimal double "
        "duration float gDay gMonth gMonthDay gYear gYearMonth hexBinary "
        "ID IDREF IDREFS int integer language long Name NCName "
        "negativeInteger NMTOKEN NMTOKENS nonNegativeInteger "
        "nonPositiveInteger normalizedString positiveInteger QName short "
        "string time token unsignedByte unsignedInt unsignedLong "
        "unsignedShort"
    ).split()
]

# Colour names for the DOT output; rdf2dot() below only reads NODECOLOR.
EDGECOLOR = "blue"
NODECOLOR = "black"
ISACOLOR = "black"
|
||||
|
||||
|
||||
def rdf2dot(g: Graph, stream: TextIO, opts: Dict[str, Any] = {}):
    """
    Convert the RDF graph to DOT and write the DOT output to the stream.

    Every subject, and every URIRef/BNode object, becomes a DOT node drawn
    as an HTML-like table; literal-valued triples are listed as rows inside
    the subject's table, all other triples become labelled edges.
    ``rdfs:label`` triples are not drawn (labels are looked up separately
    through ``LABEL_PROPERTIES``).

    :param g: the graph to draw.
    :param stream: text stream the DOT source is written to.
    :param opts: currently not read by this function (and never mutated,
        so the shared default is harmless).
    """

    fields = collections.defaultdict(set)
    nodes: Dict[Node, str] = {}

    def node(x: Node) -> str:
        # Allocate DOT identifiers (node0, node1, ...) in first-seen order.
        if x not in nodes:
            nodes[x] = "node%d" % len(nodes)
        return nodes[x]

    def label(x: Node, g: Graph):
        # First non-empty label property wins; otherwise fall back to the
        # local part of the qname, and finally to the term itself.
        for labelProp in LABEL_PROPERTIES:  # noqa: N806
            l_ = g.value(x, labelProp)
            if l_:
                return l_
        try:
            # type error: Argument 1 to "compute_qname" of "NamespaceManager" has incompatible type "Node"; expected "str"
            return g.namespace_manager.compute_qname(x)[2]  # type: ignore[arg-type]
        except Exception:
            return x

    def qname(x: URIRef, g: Graph) -> str:
        # "prefix:local" when the URI can be split, else the URI unchanged.
        try:
            q = g.compute_qname(x)
            return q[0] + ":" + q[2]
        except Exception:
            return x

    def formatliteral(lit: Literal, g):
        # The surrounding quotes must be written as &quot; entities because
        # the result is embedded in a DOT HTML-like label.
        v = html.escape(lit)
        if lit.datatype:
            return "&quot;%s&quot;^^%s" % (v, html.escape(qname(lit.datatype, g)))
        elif lit.language:
            return "&quot;%s&quot;@%s" % (v, lit.language)
        return "&quot;%s&quot;" % v

    def color(p):
        return "BLACK"

    stream.write('digraph { \n node [ fontname="DejaVu Sans" ] ; \n')

    edge_template = (
        "\t%s -> %s [ color=%s, label=< <font point-size='10' "
        + "color='#336633'>%s</font> > ] ;\n"
    )
    for s, p, o in g:
        # The subject gets a node even if all its triples are skipped below.
        sn = node(s)
        if p == rdflib.RDFS.label:
            continue
        # Predicate names are escaped too: when no qname can be computed the
        # full URI is used, and that may contain "&" or quotes.
        # type error: Argument 1 to "qname" has incompatible type "Node"; expected "URIRef"
        predicate = html.escape(qname(p, g))  # type: ignore[arg-type]
        if isinstance(o, (rdflib.URIRef, rdflib.BNode)):
            on = node(o)
            stream.write(edge_template % (sn, on, color(p), predicate))
        else:
            fields[sn].add((predicate, formatliteral(o, g)))

    node_template = (
        "%s [ shape=none, color=%s label=< <table color='#666666'"
        + " cellborder='0' cellspacing='0' border='1'><tr>"
        + "<td colspan='2' bgcolor='grey'><B>%s</B></td></tr><tr>"
        + "<td href='%s' bgcolor='#eeeeee' colspan='2'>"
        + "<font point-size='10' color='#6666ff'>%s</font></td>"
        + "</tr>%s</table> > ] \n"
    )
    for u, n in nodes.items():
        stream.write("# %s %s\n" % (u, n))
        rows = [
            "<tr><td align='left'>%s</td><td align='left'>%s</td></tr>" % x
            for x in sorted(fields[n])
        ]
        # The href attribute value is escaped as well as the visible text;
        # a raw "'" or "&" in the URI would otherwise corrupt the label.
        escaped_uri = html.escape(u)  # type: ignore[type-var]
        stream.write(
            node_template
            # type error: Value of type variable "AnyStr" of "escape" cannot be "Node"
            % (n, NODECOLOR, html.escape(label(u, g)), escaped_uri, escaped_uri, "".join(rows))  # type: ignore[type-var]
        )

    stream.write("}\n")
|
||||
|
||||
|
||||
def _help():
    """Write the rdf2dot command-line usage summary to stderr."""
    sys.stderr.write(
        """
rdf2dot.py [-f <format>] files...
Read RDF files given on the command line, writes a graph of the RDF data in
DOT language to stdout
-f specifies parser to use, if not given, it is guessed from the file extension

"""
    )
|
||||
|
||||
|
||||
def main():
    """Run rdf2dot as a command-line tool.

    Delegates to ``rdflib.extras.cmdlineutils.main``, handing it the
    ``rdf2dot`` converter and the ``_help`` usage printer.
    """
    rdflib.extras.cmdlineutils.main(rdf2dot, _help)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,205 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
"""
|
||||
A commandline tool for parsing RDF in different formats and serializing the
|
||||
resulting graph to a chosen format.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from optparse import OptionParser
|
||||
from typing import BinaryIO, Optional
|
||||
|
||||
import rdflib
|
||||
from rdflib import plugin
|
||||
from rdflib.graph import ConjunctiveGraph
|
||||
from rdflib.parser import Parser
|
||||
from rdflib.serializer import Serializer
|
||||
from rdflib.store import Store
|
||||
from rdflib.util import guess_format
|
||||
|
||||
# Parser format used when guessing from the file name yields nothing.
DEFAULT_INPUT_FORMAT = "xml"
# Serializer format used as the default of the -o/--output-format option.
DEFAULT_OUTPUT_FORMAT = "n3"
|
||||
|
||||
|
||||
def parse_and_serialize(
    input_files,
    input_format,
    guess,
    outfile,
    output_format,
    ns_bindings,
    store_conn="",
    store_type=None,
):
    """Parse all inputs into one ``ConjunctiveGraph`` and serialize it.

    :param input_files: paths/URLs to parse; ``"-"`` means read ``sys.stdin``.
    :param input_format: ``FORMAT[:KW,...]`` spec for the parser, or a false
        value to allow guessing.
    :param guess: when true and no ``input_format`` is given, the format is
        guessed per file, falling back to ``DEFAULT_INPUT_FORMAT``.
    :param outfile: destination passed to ``graph.serialize``; nothing is
        written when it is false.
    :param output_format: ``FORMAT[:KW,...]`` spec for the serializer.
    :param ns_bindings: mapping of prefix to namespace URI, bound without
        overriding existing bindings.
    :param store_conn: value passed to ``store.open`` when a store is used.
    :param store_type: name of a ``Store`` plugin; when false the graph's
        default store is used.
    """
    if store_type:
        store = plugin.get(store_type, Store)()
        store.open(store_conn)
        graph = ConjunctiveGraph(store)
    else:
        store = None
        graph = ConjunctiveGraph()

    try:
        for prefix, uri in ns_bindings.items():
            graph.namespace_manager.bind(prefix, uri, override=False)

        # The format spec is identical for every input, so split it once.
        base_format, parse_kws = _format_and_kws(input_format)
        for fpath in input_files:
            use_format = base_format
            if fpath == "-":
                fpath = sys.stdin
            elif not input_format and guess:
                use_format = guess_format(fpath) or DEFAULT_INPUT_FORMAT
            graph.parse(fpath, format=use_format, **parse_kws)

        if outfile:
            output_format, kws = _format_and_kws(output_format)
            kws.setdefault("base", None)
            graph.serialize(destination=outfile, format=output_format, **kws)
    finally:
        # Roll back even when parsing or serializing failed. The identity
        # check (rather than truthiness) ensures an explicitly opened store
        # is rolled back even if the store object itself evaluates as false.
        if store is not None:
            store.rollback()
|
||||
|
||||
|
||||
def _format_and_kws(fmt):
    """
    Split a ``FORMAT[:KW,...]`` specification into the format name and a
    dict of keyword arguments.

    Keywords are comma separated. ``KW=VALUE`` stores the string value
    (split on the first ``=`` only, so values may contain ``=``), ``-KW``
    stores ``False``, and ``+KW`` or a bare ``KW`` stores ``True``. Empty
    keyword tokens are ignored. A false ``fmt`` (e.g. ``None``) is returned
    unchanged with an empty dict.

    >>> _format_and_kws("fmt")
    ('fmt', {})
    >>> _format_and_kws("fmt:+a")
    ('fmt', {'a': True})
    >>> _format_and_kws("fmt:a")
    ('fmt', {'a': True})
    >>> _format_and_kws("fmt:+a,-b") #doctest: +SKIP
    ('fmt', {'a': True, 'b': False})
    >>> _format_and_kws("fmt:c=d")
    ('fmt', {'c': 'd'})
    >>> _format_and_kws("fmt:a=b:c")
    ('fmt', {'a': 'b:c'})
    >>> _format_and_kws("fmt:base=http://example.org/?x=y")
    ('fmt', {'base': 'http://example.org/?x=y'})
    >>> _format_and_kws("fmt:")
    ('fmt', {})
    """
    kws = {}
    if not fmt or ":" not in fmt:
        return fmt, kws

    fmt, kwrepr = fmt.split(":", 1)
    for kw in kwrepr.split(","):
        if not kw:
            # "fmt:" or a stray comma: no keyword to record.
            continue
        if "=" in kw:
            # Split once only, so values such as URLs with query strings
            # keep their own "=" characters.
            k, v = kw.split("=", 1)
            kws[k] = v
        elif kw.startswith("-"):
            kws[kw[1:]] = False
        elif kw.startswith("+"):
            kws[kw[1:]] = True
        else:  # same as "+"
            kws[kw] = True
    return fmt, kws
|
||||
|
||||
|
||||
def make_option_parser():
    """Build the ``optparse.OptionParser`` for the rdfpipe command line.

    The parser knows ``-i/--input-format``, ``-o/--output-format``,
    ``--ns`` (repeatable), ``--no-guess``, ``--no-out`` and ``-w/--warn``;
    the help texts list the parser and serializer plugin names.
    """
    parser_names = _get_plugin_names(Parser)
    serializer_names = _get_plugin_names(Serializer)
    kw_example = "FORMAT:(+)KW1,-KW2,KW3=VALUE"

    usage = (
        "%prog [-h] [-i INPUT_FORMAT] [-o OUTPUT_FORMAT] "
        "[--ns=PFX=NS ...] [-] [FILE ...]"
    )
    description = (
        __doc__.strip()
        + " Reads file system paths, URLs or from stdin if '-' is given."
        + " The result is serialized to stdout."
    )
    version = f"%prog (using rdflib {rdflib.__version__})"

    input_help = (
        "Format of the input document(s)."
        f" Available input formats are: {parser_names}."
        " If no format is given, it will be guessed from the file name extension."
        f" Keywords to parser can be given after format like: {kw_example}."
    )
    # '%default' is left for optparse to expand when rendering the help.
    output_help = (
        "Format of the graph serialization."
        f" Available output formats are: {serializer_names}."
        " Default format is: '%default'."
        f" Keywords to serializer can be given after format like: {kw_example}."
    )

    parser = OptionParser(usage, description=description, version=version)

    # No default for the input format: an unset value enables guessing.
    parser.add_option(
        "-i",
        "--input-format",
        type=str,
        help=input_help,
        metavar="INPUT_FORMAT",
    )
    parser.add_option(
        "-o",
        "--output-format",
        type=str,
        default=DEFAULT_OUTPUT_FORMAT,
        help=output_help,
        metavar="OUTPUT_FORMAT",
    )
    parser.add_option(
        "--ns",
        action="append",
        type=str,
        help=(
            "Register a namespace binding (QName prefix to a base URI). "
            "This can be used more than once."
        ),
        metavar="PREFIX=NAMESPACE",
    )
    parser.add_option(
        "--no-guess",
        dest="guess",
        action="store_false",
        default=True,
        help="Don't guess format based on file suffix.",
    )
    parser.add_option(
        "--no-out",
        action="store_true",
        default=False,
        help="Don't output the resulting graph (useful for checking validity of input).",
    )
    parser.add_option(
        "-w",
        "--warn",
        action="store_true",
        default=False,
        help="Output warnings to stderr (by default only critical errors).",
    )

    return parser
|
||||
|
||||
|
||||
def _get_plugin_names(kind):
    """Return the names of all registered plugins of ``kind``, joined by ", "."""
    names = [entry.name for entry in plugin.plugins(kind=kind)]
    return ", ".join(names)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for rdfpipe.

    Parses the command line, configures logging (warnings with ``-w``,
    otherwise only critical errors), collects ``--ns PREFIX=NAMESPACE``
    bindings and hands everything to ``parse_and_serialize``. Output goes to
    ``sys.stdout.buffer`` unless ``--no-out`` is given. With no positional
    arguments the usage line is printed and the program exits.
    """
    oparser = make_option_parser()
    opts, args = oparser.parse_args()
    if not args:
        oparser.print_usage()
        oparser.exit()

    loglevel = logging.WARNING if opts.warn else logging.CRITICAL
    logging.basicConfig(level=loglevel)

    ns_bindings = {}
    for ns_kw in opts.ns or ():
        # Split on the first "=" only: namespace URIs may contain "=".
        pfx, sep, uri = ns_kw.partition("=")
        if not sep:
            # Report a malformed binding as a usage error instead of
            # crashing with an unpacking traceback.
            oparser.error("invalid --ns value %r, expected PREFIX=NAMESPACE" % ns_kw)
        ns_bindings[pfx] = uri

    outfile: Optional[BinaryIO] = None if opts.no_out else sys.stdout.buffer

    parse_and_serialize(
        args, opts.input_format, opts.guess, outfile, opts.output_format, ns_bindings
    )


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,158 @@
|
||||
"""
|
||||
A commandline tool for drawing RDFS Class diagrams in Graphviz DOT
|
||||
format
|
||||
|
||||
You can draw the graph of an RDFS file directly:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
rdfs2dot my_rdfs_file.rdf | dot -Tpng | display
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import collections
|
||||
import itertools
|
||||
import sys
|
||||
from typing import Dict
|
||||
|
||||
import rdflib.extras.cmdlineutils
|
||||
from rdflib import RDF, RDFS, XSD
|
||||
from rdflib.term import Identifier
|
||||
|
||||
# XSD datatype terms, built from their local names (order preserved).
# rdfs2dot() lists properties whose range is one of these as table rows.
XSDTERMS = [
    XSD[local_name]
    for local_name in (
        "anyURI base64Binary boolean byte date dateTime decimal double "
        "duration float gDay gMonth gMonthDay gYear gYearMonth hexBinary "
        "ID IDREF IDREFS int integer language long Name NCName "
        "negativeInteger NMTOKEN NMTOKENS nonNegativeInteger "
        "nonPositiveInteger normalizedString positiveInteger QName short "
        "string time token unsignedByte unsignedInt unsignedLong "
        "unsignedShort"
    ).split()
]

# Colours of property edges, node outlines and subclass edges respectively.
EDGECOLOR = "blue"
NODECOLOR = "black"
ISACOLOR = "black"
|
||||
|
||||
|
||||
def rdfs2dot(g, stream, opts={}):
    """
    Convert the RDFS schema in a graph to DOT and write it to the stream.

    Instances of ``rdfs:Class`` become nodes, ``rdfs:subClassOf`` triples
    become edges from the superclass to the subclass, and each
    ``rdf:Property`` is drawn once per (domain, range) pair: as a row in the
    domain node's table when the range is an XSD datatype or
    ``rdfs:Literal``, otherwise as a labelled edge from domain to range.

    :param g: the graph holding the schema.
    :param stream: text stream the DOT source is written to.
    :param opts: currently not read by this function (and never mutated,
        so the shared default is harmless).
    """
    # Local import so this block does not depend on the module import list.
    import html

    fields = collections.defaultdict(set)
    nodes: Dict[Identifier, str] = {}

    def node(nd):
        # Allocate DOT identifiers (node0, node1, ...) in first-seen order.
        if nd not in nodes:
            nodes[nd] = "node%d" % len(nodes)
        return nodes[nd]

    def label(xx, grf):
        lbl = grf.value(xx, RDFS.label)
        if lbl is None:
            try:
                lbl = grf.namespace_manager.compute_qname(xx)[2]
            except Exception:
                # bnodes and some weird URIs cannot be split; fall back to
                # the term itself rather than returning None, which would
                # print as "None" and can break sorting of the field rows.
                lbl = xx
        return lbl

    def dot_quoted(text):
        # Escape for use inside a double-quoted DOT string.
        return str(text).replace("\\", "\\\\").replace('"', '\\"')

    stream.write('digraph { \n node [ fontname="DejaVu Sans" ] ; \n')

    # Register every declared class so it is drawn even without any edges.
    for x in g.subjects(RDF.type, RDFS.Class):
        node(x)

    for x, y in g.subject_objects(RDFS.subClassOf):
        sub = node(x)
        sup = node(y)
        stream.write("\t%s -> %s [ color=%s ] ;\n" % (sup, sub, ISACOLOR))

    for x in g.subjects(RDF.type, RDF.Property):
        for a, b in itertools.product(
            g.objects(x, RDFS.domain), g.objects(x, RDFS.range)
        ):
            if b in XSDTERMS or b == RDFS.Literal:
                range_label = "literal" if b == RDFS.Literal else label(b, g)
                # Rows end up inside an HTML-like label, so escape both cells.
                fields[node(a)].add(
                    (html.escape(label(x, g)), html.escape(range_label))
                )
            else:
                stream.write(
                    '\t%s -> %s [ color=%s, label="%s" ];\n'
                    % (node(a), node(b), EDGECOLOR, dot_quoted(label(x, g)))
                )

    node_template = (
        "%s [ shape=none, color=%s label=< <table color='#666666'"
        + " cellborder='0' cellspacing='0' border='1'><tr>"
        + "<td colspan='2' bgcolor='grey'><B>%s</B></td>"
        + "</tr>%s</table> > ] \n"
    )
    for u, n in nodes.items():
        stream.write("# %s %s\n" % (u, n))
        rows = [
            "<tr><td align='left'>%s</td><td>%s</td></tr>" % x
            for x in sorted(fields[n])
        ]
        stream.write(
            node_template % (n, NODECOLOR, html.escape(label(u, g)), "".join(rows))
        )

    stream.write("}\n")
|
||||
|
||||
|
||||
def _help():
    """Write the rdfs2dot command-line usage summary to stderr."""
    sys.stderr.write(
        """
rdfs2dot.py [-f <format>] files...
Read RDF files given on the command line, writes a graph of the RDFS schema in
DOT language to stdout
-f specifies parser to use, if not given, it is guessed from the file extension

"""
    )
|
||||
|
||||
|
||||
def main():
    """Run rdfs2dot as a command-line tool.

    Delegates to ``rdflib.extras.cmdlineutils.main``, handing it the
    ``rdfs2dot`` converter and the ``_help`` usage printer.
    """
    rdflib.extras.cmdlineutils.main(rdfs2dot, _help)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user