2025-12-01

This commit is contained in:
2026-03-17 14:58:51 -06:00
parent 183e865f8b
commit 4b82b57113
6846 changed files with 954887 additions and 162606 deletions
@@ -0,0 +1,207 @@
"""
HextuplesSerializer RDF graph serializer for RDFLib.
See <https://github.com/ontola/hextuples> for details about the format.
"""
from __future__ import annotations
import json
import warnings
from typing import IO, Any, Callable, List, Optional, Type, Union, cast
from rdflib.graph import DATASET_DEFAULT_GRAPH_ID, ConjunctiveGraph, Dataset, Graph
from rdflib.namespace import RDF, XSD
from rdflib.serializer import Serializer
from rdflib.term import BNode, IdentifiedNode, Literal, URIRef
try:
import orjson
_HAS_ORJSON = True
except ImportError:
orjson = None # type: ignore[assignment, unused-ignore]
_HAS_ORJSON = False
__all__ = ["HextuplesSerializer"]
class HextuplesSerializer(Serializer):
    """
    Serializes RDF graphs to Hextuples format.

    Every statement is written as one JSON array per line holding six
    values: subject, predicate, value, datatype, language and graph name.
    When ``orjson`` is importable it is used to produce the bytes directly,
    otherwise the standard ``json`` module is used.
    """

    contexts: List[Union[Graph, IdentifiedNode]]
    dumps: Callable
    # Pre-built constant column values; their concrete type depends on
    # whether orjson is available (``orjson.Fragment`` vs plain ``str``).
    str_local_id: Union[str, Any]
    str_global_id: Union[str, Any]
    empty: Union[str, Any]
    lang_str: Union[str, Any]
    xsd_string: Union[str, Any]

    def __new__(cls, store: Union[Graph, Dataset, ConjunctiveGraph]):
        """
        Create the instance and (re)install the constant column values on
        the class.

        :param store: accepted only so the signature matches ``__init__``.
        """
        if _HAS_ORJSON:
            # Fragments are emitted verbatim by orjson, so the constants are
            # stored as already-quoted JSON byte strings.
            cls.str_local_id = orjson.Fragment(b'"localId"')
            cls.str_global_id = orjson.Fragment(b'"globalId"')
            cls.empty = orjson.Fragment(b'""')
            cls.lang_str = orjson.Fragment(
                b'"' + RDF.langString.encode("utf-8") + b'"'
            )
            cls.xsd_string = orjson.Fragment(
                b'"' + XSD.string.encode("utf-8") + b'"'
            )
        else:
            cls.str_local_id = "localId"
            cls.str_global_id = "globalId"
            cls.empty = ""
            cls.lang_str = f"{RDF.langString}"
            cls.xsd_string = f"{XSD.string}"
        # Zero-argument super() always resolves relative to this class.
        # ``super(cls, cls)`` would, for a subclass, dispatch back into this
        # very ``__new__`` without the required ``store`` argument.
        return super().__new__(cls)

    def __init__(self, store: Union[Graph, Dataset, ConjunctiveGraph]):
        """
        Record which contexts have to be written for ``store``.

        :param store: the graph, dataset or conjunctive graph to serialize.
        """
        self.default_context: Optional[Union[Graph, IdentifiedNode]]
        self.graph_type: Union[Type[Graph], Type[Dataset], Type[ConjunctiveGraph]]
        if isinstance(store, (Dataset, ConjunctiveGraph)):
            self.graph_type = (
                Dataset if isinstance(store, Dataset) else ConjunctiveGraph
            )
            self.contexts = list(store.contexts())
            if store.default_context:
                self.default_context = store.default_context
                self.contexts.append(store.default_context)
            else:
                self.default_context = None
        else:
            self.graph_type = Graph
            self.contexts = [store]
            self.default_context = None

        Serializer.__init__(self, store)

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = "utf-8",
        **kwargs: Any,
    ) -> None:
        """
        Write every triple of every recorded context to ``stream``.

        :param stream: binary stream receiving the UTF-8 encoded lines.
        :param base: ignored; a warning is issued when it is given.
        :param encoding: ignored unless it is ``None`` or ``"utf-8"``, in
            which case no warning is issued; output is always UTF-8.
        :raises Exception: if the underlying store is formula-aware.
        """
        if base is not None:
            warnings.warn(
                "base has no meaning for Hextuples serialization. "
                "I will ignore this value"
            )

        if encoding not in [None, "utf-8"]:
            warnings.warn(
                f"Hextuples files are always utf-8 encoded. "
                f"I was passed: {encoding}, "
                "but I'm still going to use utf-8 anyway!"
            )

        if self.store.formula_aware is True:
            raise Exception(
                "Hextuple serialization can't (yet) handle formula-aware stores"
            )

        context: Union[Graph, IdentifiedNode]
        context_str: Union[bytes, str]
        for context in self.contexts:
            # The graph-name column is the same for every triple of a
            # context, so it is built once per context, not once per triple.
            context_str = cast(
                Union[str, bytes],
                (
                    self.empty
                    if self.graph_type is Graph
                    else (
                        orjson.Fragment('"' + self._context_str(context) + '"')
                        if _HAS_ORJSON
                        else self._context_str(context)
                    )
                ),
            )
            for triple in context:
                hl = self._hex_line(triple, context_str)
                if hl is not None:
                    # orjson already yields bytes; json yields str.
                    stream.write(hl if _HAS_ORJSON else hl.encode())

    def _hex_line(self, triple, context_str: Union[bytes, str]):
        """
        Build the output line for one triple.

        :param triple: ``(subject, predicate, object)``.
        :param context_str: pre-built graph-name column value.
        :return: the encoded line (bytes with orjson, str otherwise), or
            ``None`` when the subject is not a URIRef/BNode or the object is
            not a URIRef, BNode or Literal.
        """
        if isinstance(
            triple[0], (URIRef, BNode)
        ):  # exclude QuotedGraph and other objects
            # value
            value = (
                triple[2]
                if isinstance(triple[2], Literal)
                else self._iri_or_bn(triple[2])
            )

            # datatype
            if isinstance(triple[2], URIRef):
                # datatype = "http://www.w3.org/1999/02/22-rdf-syntax-ns#namedNode"
                datatype = self.str_global_id
            elif isinstance(triple[2], BNode):
                # datatype = "http://www.w3.org/1999/02/22-rdf-syntax-ns#blankNode"
                datatype = self.str_local_id
            elif isinstance(triple[2], Literal):
                if triple[2].datatype is not None:
                    datatype = f"{triple[2].datatype}"
                else:
                    if triple[2].language is not None:  # language
                        datatype = self.lang_str
                    else:
                        datatype = self.xsd_string
            else:
                return None  # can't handle non URI, BN or Literal Object (QuotedGraph)

            # language
            if isinstance(triple[2], Literal):
                if triple[2].language is not None:
                    language = f"{triple[2].language}"
                else:
                    language = self.empty
            else:
                language = self.empty
            line_list = [
                self._iri_or_bn(triple[0]),
                triple[1],
                value,
                datatype,
                language,
                context_str,
            ]
            outline: Union[str, bytes]
            if _HAS_ORJSON:
                outline = orjson.dumps(line_list, option=orjson.OPT_APPEND_NEWLINE)
            else:
                outline = json.dumps(line_list) + "\n"
            return outline
        else:  # do not return anything for non-IRIs or BNs, e.g. QuotedGraph, Subjects
            return None

    def _iri_or_bn(self, i_):
        """
        Return the string form of an IRI or blank node.

        :return: the IRI text for a URIRef, the N3 form for a BNode and
            ``None`` for anything else.
        """
        if isinstance(i_, URIRef):
            return f"{i_}"
        elif isinstance(i_, BNode):
            return f"{i_.n3()}"
        else:
            return None

    def _context_str(self, context: Union[Graph, IdentifiedNode]) -> str:
        """
        Return the graph name to emit for ``context``.

        The empty string is returned for the dataset default graph, for the
        store's own default context and whenever a plain ``Graph`` is being
        serialized; otherwise the identifier's IRI text (URIRef) or N3 form
        is returned.
        """
        context_identifier: IdentifiedNode = (
            context.identifier if isinstance(context, Graph) else context
        )
        if context_identifier == DATASET_DEFAULT_GRAPH_ID:
            return ""
        if self.default_context is not None:
            if (
                isinstance(self.default_context, IdentifiedNode)
                and context_identifier == self.default_context
            ):
                return ""
            elif (
                isinstance(self.default_context, Graph)
                and context_identifier == self.default_context.identifier
            ):
                return ""
        if self.graph_type is Graph:
            # Only emit a context name when serializing a Dataset or ConjunctiveGraph
            return ""
        return (
            f"{context_identifier}"
            if isinstance(context_identifier, URIRef)
            else context_identifier.n3()
        )
@@ -0,0 +1,433 @@
"""
This serialiser will output an RDF Graph as a JSON-LD formatted document. See:
http://json-ld.org/
Example usage::
>>> from rdflib import Graph
>>> testrdf = '''
... @prefix dc: <http://purl.org/dc/terms/> .
... <http://example.org/about>
... dc:title "Someone's Homepage"@en .
... '''
>>> g = Graph().parse(data=testrdf, format='n3')
>>> print(g.serialize(format='json-ld', indent=2))
[
{
"@id": "http://example.org/about",
"http://purl.org/dc/terms/title": [
{
"@language": "en",
"@value": "Someone's Homepage"
}
]
}
]
"""
# From: https://github.com/RDFLib/rdflib-jsonld/blob/feature/json-ld-1.1/rdflib_jsonld/serializer.py
# NOTE: This code writes the entire JSON object into memory before serialising,
# but we should consider streaming the output to deal with arbitrarily large
# graphs.
from __future__ import annotations
import warnings
from typing import IO, Any, Dict, List, Optional
from rdflib.graph import DATASET_DEFAULT_GRAPH_ID, Graph, _ObjectType
from rdflib.namespace import RDF, XSD
from rdflib.serializer import Serializer
from rdflib.term import BNode, IdentifiedNode, Identifier, Literal, URIRef
from ..shared.jsonld.context import UNDEF, Context
from ..shared.jsonld.keys import CONTEXT, GRAPH, ID, LANG, LIST, SET, VOCAB
from ..shared.jsonld.util import _HAS_ORJSON, json, orjson
__all__ = ["JsonLDSerializer", "from_rdf"]
# Datatypes whose literals are converted with ``toPython()`` instead of
# ``str()`` when native types are enabled (see ``Converter.to_raw_value``).
PLAIN_LITERAL_TYPES = {XSD.boolean, XSD.integer, XSD.double, XSD.string}
class JsonLDSerializer(Serializer):
    """Serializer plugin that writes a graph as a JSON-LD document."""

    def __init__(self, store: Graph):
        super(JsonLDSerializer, self).__init__(store)

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """
        Convert the store with :func:`from_rdf` and write it to ``stream``.

        :param stream: binary stream receiving the JSON document.
        :param base: base IRI handed to :func:`from_rdf`.
        :param encoding: output encoding, ``"utf-8"`` when omitted; a
            warning is issued for anything but ``"utf-8"``/``"utf-16"``.
            Only the ``json`` code path applies it; orjson output is
            written as produced.
        :param kwargs: ``context``, ``use_native_types``, ``use_rdf_type``
            and ``auto_compact`` are forwarded to :func:`from_rdf`;
            ``indent``, ``separators``, ``sort_keys`` and ``ensure_ascii``
            control the JSON output.
        """
        # TODO: docstring w. args and return value
        encoding = encoding or "utf-8"
        if encoding not in ("utf-8", "utf-16"):
            warnings.warn(
                "JSON should be encoded as unicode. " f"Given encoding was: {encoding}"
            )

        context_data = kwargs.get("context")
        # Must be the plain option value: wrapping it in a one-element tuple
        # (a stray trailing comma) is always truthy and would make it
        # impossible to switch native types off.
        use_native_types = kwargs.get("use_native_types", False)
        use_rdf_type = kwargs.get("use_rdf_type", False)
        auto_compact = kwargs.get("auto_compact", False)

        indent = kwargs.get("indent", 2)
        separators = kwargs.get("separators", (",", ": "))
        sort_keys = kwargs.get("sort_keys", True)
        ensure_ascii = kwargs.get("ensure_ascii", False)

        obj = from_rdf(
            self.store,
            context_data,
            base,
            use_native_types,
            use_rdf_type,
            auto_compact=auto_compact,
        )
        if _HAS_ORJSON:
            option: int = orjson.OPT_NON_STR_KEYS
            if indent is not None:
                # orjson only offers a fixed two-space indent.
                option |= orjson.OPT_INDENT_2
            if sort_keys:
                option |= orjson.OPT_SORT_KEYS
            if ensure_ascii:
                warnings.warn("Cannot use ensure_ascii with orjson")
            data_bytes = orjson.dumps(obj, option=option)
            stream.write(data_bytes)
        else:
            data = json.dumps(
                obj,
                indent=indent,
                separators=separators,
                sort_keys=sort_keys,
                ensure_ascii=ensure_ascii,
            )
            stream.write(data.encode(encoding, "replace"))
def from_rdf(
    graph,
    context_data=None,
    base=None,
    use_native_types=False,
    use_rdf_type=False,
    auto_compact=False,
    startnode=None,
    index=False,
):
    """
    Convert ``graph`` into a JSON-LD structure of plain dicts and lists.

    When no context is supplied and ``auto_compact`` is set, a context is
    derived from the graph's prefixed namespace bindings (the XML namespace
    is left out). ``context_data`` may also be a ready-made ``Context``.
    If the resulting context is active, the converted data is returned as a
    dict carrying the context under the ``CONTEXT`` key.

    ``startnode`` and ``index`` are accepted but currently unused.
    """
    # TODO: docstring w. args and return value
    # TODO: support for index and startnode
    if auto_compact and not context_data:
        context_data = {
            pfx: str(ns)
            for pfx, ns in graph.namespaces()
            if pfx and str(ns) != "http://www.w3.org/XML/1998/namespace"
        }

    if isinstance(context_data, Context):
        context = context_data
        context_data = context.to_dict()
    else:
        context = Context(context_data, base=base)

    converter = Converter(context, use_native_types, use_rdf_type)
    result = converter.convert(graph)

    if not converter.context.active:
        return result

    # A bare list cannot carry a context, so wrap it in a graph object first.
    if isinstance(result, list):
        result = {context.get_key(GRAPH): result}
    result[CONTEXT] = context_data
    return result
class Converter:
    """
    Turns an RDF graph (or context-aware graph) into JSON-LD-shaped Python
    data, using a ``Context`` to shorten IRIs and choose term keys.
    """

    def __init__(self, context: Context, use_native_types: bool, use_rdf_type: bool):
        self.context = context
        # An active context always enables native types.
        self.use_native_types = context.active or use_native_types
        self.use_rdf_type = use_rdf_type

    def convert(self, graph: Graph):
        """
        Convert ``graph`` and return the resulting object or list of objects.

        For a context-aware graph every context whose identifier is a URIRef
        becomes its own entry; all other contexts are merged into the
        default graph.
        """
        # TODO: bug in rdflib dataset parsing (nquads et al):
        # plain triples end up in separate unnamed graphs (rdflib issue #436)
        if graph.context_aware:
            # type error: "Graph" has no attribute "contexts"
            all_contexts = list(graph.contexts())  # type: ignore[attr-defined]
            has_dataset_default_id = any(
                c.identifier == DATASET_DEFAULT_GRAPH_ID for c in all_contexts
            )
            if (
                has_dataset_default_id
                # # type error: "Graph" has no attribute "contexts"
                and graph.default_context.identifier == DATASET_DEFAULT_GRAPH_ID  # type: ignore[attr-defined]
            ):
                default_graph = graph.default_context  # type: ignore[attr-defined]
            else:
                default_graph = Graph()
            graphs = [default_graph]
            default_graph_id = default_graph.identifier

            for g in all_contexts:
                if g in graphs:
                    continue
                if isinstance(g.identifier, URIRef):
                    graphs.append(g)
                else:
                    default_graph += g
        else:
            graphs = [graph]
            default_graph_id = graph.identifier

        context = self.context

        objs: List[Any] = []
        for g in graphs:
            obj = {}
            graphname = None

            if isinstance(g.identifier, URIRef):
                if g.identifier != default_graph_id:
                    graphname = context.shrink_iri(g.identifier)
                    obj[context.id_key] = graphname

            nodes = self.from_graph(g)

            if not graphname and len(nodes) == 1:
                obj.update(nodes[0])
            else:
                if not nodes:
                    continue
                obj[context.graph_key] = nodes

            if objs and objs[0].get(context.get_key(ID)) == graphname:
                objs[0].update(obj)
            else:
                objs.append(obj)

        if len(graphs) == 1 and len(objs) == 1 and not self.context.active:
            default = objs[0]
            items = default.get(context.graph_key)
            if len(default) == 1 and items:
                objs = items
        elif len(objs) == 1 and self.context.active:
            objs = objs[0]

        return objs

    def from_graph(self, graph: Graph):
        """Return the list of node objects built from ``graph``'s subjects."""
        nodemap: Dict[Any, Any] = {}

        for s in set(graph.subjects()):
            ## only iri:s and unreferenced (rest will be promoted to top if needed)
            if isinstance(s, URIRef) or (
                isinstance(s, BNode) and not any(graph.subjects(None, s))
            ):
                self.process_subject(graph, s, nodemap)

        return list(nodemap.values())

    def process_subject(self, graph: Graph, s: IdentifiedNode, nodemap):
        """
        Build the node object for subject ``s`` and register it in
        ``nodemap``.

        :return: the new node, or ``None`` if ``s`` was already processed.
        """
        if isinstance(s, URIRef):
            node_id = self.context.shrink_iri(s)
        elif isinstance(s, BNode):
            node_id = s.n3()
        else:
            # This does not seem right, this probably should be an error.
            node_id = None

        # used_as_object = any(graph.subjects(None, s))
        if node_id in nodemap:
            return None

        node = {}
        node[self.context.id_key] = node_id
        nodemap[node_id] = node

        for p, o in graph.predicate_objects(s):
            # type error: Argument 3 to "add_to_node" of "Converter" has incompatible type "Node"; expected "IdentifiedNode"
            # type error: Argument 4 to "add_to_node" of "Converter" has incompatible type "Node"; expected "Identifier"
            self.add_to_node(graph, s, p, o, node, nodemap)  # type: ignore[arg-type]

        return node

    def add_to_node(
        self,
        graph: Graph,
        s: IdentifiedNode,
        p: IdentifiedNode,
        o: Identifier,
        s_node: Dict[str, Any],
        nodemap,
    ):
        """
        Add the value of the statement ``s p o`` to the node object
        ``s_node``, under the key selected by the context for ``p``.
        """
        context = self.context

        # Only literals carry a language. Binding the name up front keeps the
        # language-container branch below from reading an unbound local when
        # ``o`` is an IRI or blank node.
        language = None
        if isinstance(o, Literal):
            datatype = str(o.datatype) if o.datatype else None
            language = o.language
            term = context.find_term(str(p), datatype, language=language)
        else:
            containers = [LIST, None] if graph.value(o, RDF.first) else [None]
            for container in containers:
                for coercion in (ID, VOCAB, UNDEF):
                    # type error: Argument 2 to "find_term" of "Context" has incompatible type "object"; expected "Union[str, Defined, None]"
                    # type error: Argument 3 to "find_term" of "Context" has incompatible type "Optional[str]"; expected "Union[Defined, str]"
                    term = context.find_term(str(p), coercion, container)  # type: ignore[arg-type]
                    if term:
                        break
                if term:
                    break

        node = None
        use_set = not context.active

        if term:
            p_key = term.name

            if term.type:
                node = self.type_coerce(o, term.type)
            # type error: "Identifier" has no attribute "language"
            elif term.language and o.language == term.language:  # type: ignore[attr-defined]
                node = str(o)
            # type error: Right operand of "and" is never evaluated
            elif context.language and (term.language is None and o.language is None):  # type: ignore[unreachable]
                node = str(o)  # type: ignore[unreachable]

            if LIST in term.container:
                node = [
                    self.type_coerce(v, term.type)
                    or self.to_raw_value(graph, s, v, nodemap)
                    for v in self.to_collection(graph, o)
                ]
            elif LANG in term.container and language:
                # Language map: values are grouped under their language tag
                # and stored directly, bypassing the generic handling below.
                value = s_node.setdefault(p_key, {})
                values = value.get(language)
                node = str(o)
                if values or SET in term.container:
                    if not isinstance(values, list):
                        value[language] = values = [values]
                    values.append(node)
                else:
                    value[language] = node
                return
            elif SET in term.container:
                use_set = True

        else:
            p_key = context.to_symbol(p)
            # TODO: for coercing curies - quite clumsy; unify to_symbol and find_term?
            key_term = context.terms.get(p_key)
            if key_term and (key_term.type or key_term.container):
                p_key = p
            if not term and p == RDF.type and not self.use_rdf_type:
                if isinstance(o, URIRef):
                    node = context.to_symbol(o)
                p_key = context.type_key

        if node is None:
            node = self.to_raw_value(graph, s, o, nodemap)

        value = s_node.get(p_key)
        if value:
            if not isinstance(value, list):
                value = [value]
            value.append(node)
        elif use_set:
            value = [node]
        else:
            value = node
        s_node[p_key] = value

    def type_coerce(self, o: Identifier, coerce_type: str):
        """
        Return the compact form of ``o`` for a term coerced to
        ``coerce_type``, or ``None`` when the coercion does not apply.
        """
        if coerce_type == ID:
            if isinstance(o, URIRef):
                return self.context.shrink_iri(o)
            elif isinstance(o, BNode):
                return o.n3()
            else:
                return o
        elif coerce_type == VOCAB and isinstance(o, URIRef):
            return self.context.to_symbol(o)
        elif isinstance(o, Literal) and str(o.datatype) == coerce_type:
            return o
        else:
            return None

    def to_raw_value(
        self, graph: Graph, s: IdentifiedNode, o: Identifier, nodemap: Dict[str, Any]
    ):
        """
        Return the expanded JSON-LD value for object ``o``: a list object
        for an RDF collection, a node reference for an IRI or blank node,
        or a value (object) for a literal.
        """
        context = self.context
        coll = self.to_collection(graph, o)
        if coll is not None:
            # Reuse the members found above rather than walking the list a
            # second time.
            coll = [self.to_raw_value(graph, s, lo, nodemap) for lo in coll]
            return {context.list_key: coll}
        elif isinstance(o, BNode):
            embed = (
                False  # TODO: self.context.active or using startnode and only one ref
            )
            onode = self.process_subject(graph, o, nodemap)
            if onode:
                if embed and not any(s2 for s2 in graph.subjects(None, o) if s2 != s):
                    return onode
                else:
                    nodemap[onode[context.id_key]] = onode
            return {context.id_key: o.n3()}
        elif isinstance(o, URIRef):
            # TODO: embed if o != startnode (else reverse)
            return {context.id_key: context.shrink_iri(o)}
        elif isinstance(o, Literal):
            # TODO: if compact
            native = self.use_native_types and o.datatype in PLAIN_LITERAL_TYPES
            if native:
                v = o.toPython()
            else:
                v = str(o)
            if o.datatype:
                if native and self.context.active:
                    return v
                return {
                    context.type_key: context.to_symbol(o.datatype),
                    context.value_key: v,
                }
            elif o.language and o.language != context.language:
                return {context.lang_key: o.language, context.value_key: v}
            # type error: Right operand of "and" is never evaluated
            elif not context.active or context.language and not o.language:  # type: ignore[unreachable]
                return {context.value_key: v}
            else:
                return v

    def to_collection(self, graph: Graph, l_: Identifier):
        """
        Return the members of the RDF list starting at ``l_``.

        :return: the list of ``rdf:first`` values, or ``None`` when ``l_``
            is not a well-formed list (an IRI node other than ``rdf:nil``,
            extra properties on a list node, a cycle, or a missing
            ``rdf:rest``).
        """
        if l_ != RDF.nil and not graph.value(l_, RDF.first):
            return None
        list_nodes: List[Optional[_ObjectType]] = []
        # Nodes already visited, used to detect cyclic lists.
        chain = {l_}
        while l_:
            if l_ == RDF.nil:
                return list_nodes
            if isinstance(l_, URIRef):
                return None
            first, rest = None, None
            for p, o in graph.predicate_objects(l_):
                if not first and p == RDF.first:
                    first = o
                elif not rest and p == RDF.rest:
                    rest = o
                elif p != RDF.type or o != RDF.List:
                    return None
            list_nodes.append(first)
            # type error: Incompatible types in assignment (expression has type "Optional[Node]", variable has type "Identifier")
            l_ = rest  # type: ignore[assignment]
            if l_ in chain:
                return None
            chain.add(l_)
@@ -0,0 +1,326 @@
"""
LongTurtle RDF graph serializer for RDFLib.
See <http://www.w3.org/TeamSubmission/turtle/> for syntax specification.
This variant, longturtle as opposed to just turtle, makes some small format changes
to turtle - the original turtle serializer. It:
* uses PREFIX instead of @prefix
* uses BASE instead of @base
* adds a new line at RDF.type, or 'a'
* adds a newline and an indent for all triples with more than one object (object list)
* adds a new line and ';' for the last triple in a set with '.'
on the start of the next line
* uses the default encoding (encode()) instead of "latin-1"
- Nicholas Car, 2023
"""
from __future__ import annotations
from typing import IO, Any, Optional
from rdflib.compare import to_canonical_graph
from rdflib.exceptions import Error
from rdflib.graph import Graph
from rdflib.namespace import RDF
from rdflib.term import BNode, Literal, URIRef
from .turtle import RecursiveSerializer
__all__ = ["LongTurtleSerializer"]
# Position of a node within a triple; passed as the ``position`` argument of
# ``path``/``label`` and compared against the index in ``preprocessTriple``.
SUBJECT = 0
VERB = 1
OBJECT = 2
# Passed as ``gen_prefix`` when a QName is looked up for a literal's datatype.
_GEN_QNAME_FOR_DT = False
# Default for the serializer's ``_spacious`` flag, which adds a blank line
# after the prefix block and at the end of the document.
_SPACIOUS_OUTPUT = False
class LongTurtleSerializer(RecursiveSerializer):
    """
    Turtle serializer variant that writes ``PREFIX``/``BASE`` directives and
    spreads predicates and object lists over separate lines.
    """

    short_name = "longturtle"
    indentString = "    "

    def __init__(self, store):
        """
        Prepare a canonical, line-sorted copy of ``store`` for serialization.

        The graph is canonicalised, dumped as N-Triples, its lines sorted,
        and re-parsed (skolemised, then de-skolemised) before being handed to
        the base class.
        """
        self._ns_rewrite = {}
        store = to_canonical_graph(store)
        content = store.serialize(format="application/n-triples")
        lines = content.split("\n")
        lines.sort()
        graph = Graph()
        graph.parse(
            data="\n".join(lines), format="application/n-triples", skolemize=True
        )
        graph = graph.de_skolemize()
        super(LongTurtleSerializer, self).__init__(graph)
        self.keywords = {RDF.type: "a"}
        self.reset()
        self.stream = None
        self._spacious: bool = _SPACIOUS_OUTPUT

    def addNamespace(self, prefix, namespace):
        """
        Register ``prefix`` for ``namespace`` and return the prefix actually
        used, which may be a rewritten one.
        """
        # Turtle does not support prefixes that start with _
        # if they occur in the graph, rewrite to p_blah
        # this is more complicated since we need to make sure p_blah
        # does not already exist. And we register namespaces as we go, i.e.
        # we may first see a triple with prefix _9 - rewrite it to p_9
        # and then later find a triple with a "real" p_9 prefix

        # so we need to keep track of ns rewrites we made so far.

        if (prefix > "" and prefix[0] == "_") or self.namespaces.get(
            prefix, namespace
        ) != namespace:
            if prefix not in self._ns_rewrite:
                p = "p" + prefix
                while p in self.namespaces:
                    p = "p" + p
                self._ns_rewrite[prefix] = p

            prefix = self._ns_rewrite.get(prefix, prefix)

        super(LongTurtleSerializer, self).addNamespace(prefix, namespace)
        return prefix

    def reset(self):
        """Clear all per-run state, including recorded prefix rewrites."""
        super(LongTurtleSerializer, self).reset()
        self._shortNames = {}
        self._started = False
        self._ns_rewrite = {}

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = None,
        spacious: Optional[bool] = None,
        **kwargs: Any,
    ) -> None:
        """
        Write the graph to ``stream``.

        :param stream: binary output stream.
        :param base: base IRI; falls back to the store's own base when
            omitted.
        :param encoding: not used by this method.
        :param spacious: when given, overrides the ``_spacious`` flag.
        """
        self.reset()
        self.stream = stream
        # if base is given here, use, if not and a base is set for the graph use that
        if base is not None:
            self.base = base
        elif self.store.base is not None:
            self.base = self.store.base

        if spacious is not None:
            self._spacious = spacious

        self.preprocess()
        subjects_list = self.orderSubjects()

        self.startDocument()

        for subject in subjects_list:
            if self.isDone(subject):
                continue
            # Every written statement is followed by a newline.
            if self.statement(subject):
                self.write("\n")

        self.endDocument()

        self.base = None

    def preprocessTriple(self, triple):
        """Register the namespaces needed by the nodes of ``triple``."""
        super(LongTurtleSerializer, self).preprocessTriple(triple)
        for i, node in enumerate(triple):
            if node in self.keywords:
                continue
            # Don't use generated prefixes for subjects and objects
            self.getQName(node, gen_prefix=(i == VERB))
            if isinstance(node, Literal) and node.datatype:
                self.getQName(node.datatype, gen_prefix=_GEN_QNAME_FOR_DT)
        p = triple[1]
        if isinstance(p, BNode):  # hmm - when is P ever a bnode?
            self._references[p] += 1

    def getQName(self, uri, gen_prefix=True):
        """
        Return ``prefix:local`` for ``uri`` and register the namespace, or
        ``None`` when ``uri`` is not a URIRef or no usable QName exists.
        """
        if not isinstance(uri, URIRef):
            return None

        try:
            parts = self.store.compute_qname(uri, generate=gen_prefix)
        except Exception:
            # is the uri a namespace in itself?
            pfx = self.store.store.prefix(uri)

            if pfx is not None:
                parts = (pfx, uri, "")
            else:
                # nothing worked
                return None

        prefix, namespace, local = parts

        # QName cannot end with .
        if local.endswith("."):
            return None

        prefix = self.addNamespace(prefix, namespace)

        return "%s:%s" % (prefix, local)

    def startDocument(self):
        """Write the ``BASE`` line (if any) and the sorted ``PREFIX`` lines."""
        self._started = True
        ns_list = sorted(self.namespaces.items())

        if self.base:
            self.write(self.indent() + "BASE <%s>\n" % self.base)
        for prefix, uri in ns_list:
            self.write(self.indent() + "PREFIX %s: <%s>\n" % (prefix, uri))
        if ns_list and self._spacious:
            self.write("\n")

    def endDocument(self):
        """Write a closing blank line in spacious mode."""
        if self._spacious:
            self.write("\n")

    def statement(self, subject):
        """Write all statements about ``subject``; return True if written."""
        self.subjectDone(subject)
        return self.s_squared(subject) or self.s_default(subject)

    def s_default(self, subject):
        """Write ``subject`` followed by its predicate list and a final dot."""
        self.write("\n" + self.indent())
        self.path(subject, SUBJECT)
        self.write("\n" + self.indent())
        self.predicateList(subject)
        self.write("\n.")
        return True

    def s_squared(self, subject):
        """
        Write an unreferenced blank-node subject as ``[]``; return False for
        any other subject.
        """
        if (self._references[subject] > 0) or not isinstance(subject, BNode):
            return False
        self.write("\n" + self.indent() + "[]")
        self.predicateList(subject, newline=False)
        self.write("\n.")
        return True

    def path(self, node, position, newline=False):
        """
        Write ``node`` at ``position``.

        :raises Error: if neither writer accepts the node.
        """
        if not (
            self.p_squared(node, position) or self.p_default(node, position, newline)
        ):
            raise Error("Cannot serialize node '%s'" % (node,))

    def p_default(self, node, position, newline=False):
        """Write the label of ``node``, preceded by a space where needed."""
        if position != SUBJECT and not newline:
            self.write(" ")
        self.write(self.label(node, position))
        return True

    def label(self, node, position):
        """Return the Turtle text for ``node`` at ``position``."""
        if node == RDF.nil:
            return "()"
        # Compare by value: positions are plain integers.
        if position == VERB and node in self.keywords:
            return self.keywords[node]
        if isinstance(node, Literal):
            return node._literal_n3(
                use_plain=True,
                qname_callback=lambda dt: self.getQName(dt, _GEN_QNAME_FOR_DT),
            )
        else:
            node = self.relativize(node)

            return self.getQName(node, position == VERB) or node.n3()

    def p_squared(
        self,
        node,
        position,
    ):
        """
        Write a blank node inline, as ``( ... )`` for a valid list or as
        ``[ ... ]`` otherwise.

        Returns False without writing when ``node`` is not a blank node, was
        already serialized, is referenced more than once, or stands in
        subject position.
        """
        if (
            not isinstance(node, BNode)
            or node in self._serialized
            or self._references[node] > 1
            or position == SUBJECT
        ):
            return False

        if self.isValidList(node):
            # this is a list
            self.depth += 2
            self.write(" (\n")
            self.depth -= 2
            self.doList(node)
            self.write("\n" + self.indent() + ")")
        else:
            # this is a Blank Node
            self.subjectDone(node)
            self.write("\n" + self.indent(1) + "[\n")
            self.depth += 1
            self.predicateList(node)
            self.depth -= 1
            self.write("\n" + self.indent(1) + "]")

        return True

    def isValidList(self, l_):
        """
        Checks if l is a valid RDF list, i.e. no nodes have other properties.
        """
        try:
            if self.store.value(l_, RDF.first) is None:
                return False
        except Exception:
            return False
        while l_:
            if l_ != RDF.nil and len(list(self.store.predicate_objects(l_))) != 2:
                return False
            l_ = self.store.value(l_, RDF.rest)
        return True

    def doList(self, l_):
        """Write the members of the RDF list starting at ``l_``, one per line."""
        i = 0
        while l_:
            item = self.store.value(l_, RDF.first)
            if item is not None:
                if i == 0:
                    self.write(self.indent(1))
                else:
                    self.write("\n" + self.indent(1))
                self.path(item, OBJECT, newline=True)
                self.subjectDone(l_)
            l_ = self.store.value(l_, RDF.rest)
            i += 1

    def predicateList(self, subject, newline=False):
        """
        Write every predicate of ``subject`` with its objects, each group
        terminated by `` ;``. ``newline`` is accepted but not used here.
        """
        properties = self.buildPredicateHash(subject)
        propList = self.sortProperties(properties)
        if len(propList) == 0:
            return
        self.write(self.indent(1))
        self.verb(propList[0], newline=True)
        self.objectList(properties[propList[0]])
        for predicate in propList[1:]:
            self.write(" ;\n" + self.indent(1))
            self.verb(predicate, newline=True)
            self.objectList(properties[predicate])
        self.write(" ;")

    def verb(self, node, newline=False):
        """Write ``node`` in predicate position."""
        self.path(node, VERB, newline)

    def objectList(self, objects):
        """Write the objects of one predicate, separated by `` ,``."""
        count = len(objects)
        if count == 0:
            return
        # The depth is raised by one for every non-empty object list. The
        # former expression ``(count == 1) and 0 or 1`` always evaluated to 1
        # because ``0`` is falsy; the constant keeps the layout unchanged.
        depthmod = 1
        self.depth += depthmod
        first_nl = False
        if count > 1:
            if not isinstance(objects[0], BNode):
                self.write("\n" + self.indent(1))
            else:
                self.write(" ")
            first_nl = True
        self.path(objects[0], OBJECT, newline=first_nl)
        for obj in objects[1:]:
            self.write(" ,")
            if not isinstance(obj, BNode):
                self.write("\n" + self.indent(1))
            self.path(obj, OBJECT, newline=True)
        self.depth -= depthmod
@@ -0,0 +1,91 @@
"""
Notation 3 (N3) RDF graph serializer for RDFLib.
"""
from rdflib.graph import Graph
from rdflib.namespace import OWL, Namespace
from rdflib.plugins.serializers.turtle import OBJECT, SUBJECT, TurtleSerializer
__all__ = ["N3Serializer"]
# SWAP "log" vocabulary; its ``implies`` term is written as ``=>`` by
# N3Serializer.
SWAP_LOG = Namespace("http://www.w3.org/2000/10/swap/log#")
class N3Serializer(TurtleSerializer):
    """
    Turtle serializer extended with the N3 ``=`` / ``=>`` keywords and with
    nested graphs written as ``{ ... }`` clauses.
    """

    short_name = "n3"

    def __init__(self, store: Graph, parent=None):
        """
        :param store: the graph to serialize.
        :param parent: the enclosing serializer when this instance writes a
            nested graph, otherwise ``None``.
        """
        super(N3Serializer, self).__init__(store)
        self.keywords.update({OWL.sameAs: "=", SWAP_LOG.implies: "=>"})
        self.parent = parent

    def reset(self):
        super(N3Serializer, self).reset()
        self._stores = {}

    def endDocument(self):  # noqa: N802
        """End the document; nested serializers leave that to the parent."""
        if not self.parent:
            super(N3Serializer, self).endDocument()

    def indent(self, modifier=0):
        """Return the indentation string, extended by the parent's indent."""
        indent = super(N3Serializer, self).indent(modifier)
        if self.parent is not None:
            indent += self.parent.indent()  # modifier)
        return indent

    def preprocessTriple(self, triple):  # noqa: N802
        """Preprocess ``triple`` and, recursively, any graph it contains."""
        super(N3Serializer, self).preprocessTriple(triple)
        # Subject, predicate and object are handled alike, in that order.
        for node in (triple[0], triple[1], triple[2]):
            if isinstance(node, Graph):
                for t in node:
                    self.preprocessTriple(t)

    def getQName(self, uri, gen_prefix=True):  # noqa: N802
        """Resolve a QName via the parent serializer first, then locally."""
        qname = None
        if self.parent is not None:
            qname = self.parent.getQName(uri, gen_prefix)
        if qname is None:
            qname = super(N3Serializer, self).getQName(uri, gen_prefix)
        return qname

    def statement(self, subject):
        """
        Write the statements about ``subject``.

        :return: False when the subject has no properties, otherwise the
            result of the clause writer or of the base implementation.
        """
        self.subjectDone(subject)
        properties = self.buildPredicateHash(subject)
        if len(properties) == 0:
            return False
        return self.s_clause(subject) or super(N3Serializer, self).statement(subject)

    def path(self, node, position, newline=False):
        """Write ``node``, as a clause if it is a graph."""
        if not self.p_clause(node, position):
            super(N3Serializer, self).path(node, position, newline)

    def s_clause(self, subject):
        """Write a graph-valued subject with its predicate list."""
        if isinstance(subject, Graph):
            self.write("\n" + self.indent())
            self.p_clause(subject, SUBJECT)
            self.predicateList(subject)
            self.write(" .")
            return True
        else:
            return False

    def p_clause(self, node, position):
        """
        Write a graph node as ``{ ... }`` using a nested serializer.

        :return: True if ``node`` is a graph and was written, else False.
        """
        if isinstance(node, Graph):
            self.subjectDone(node)
            # Compare by value rather than identity: positions are the
            # integer constants imported from the turtle module.
            if position == OBJECT:
                self.write(" ")
            self.write("{")
            self.depth += 1
            serializer = N3Serializer(node, parent=self)
            # type error: Argument 1 to "serialize" of "TurtleSerializer" has incompatible type "Optional[IO[bytes]]"; expected "IO[bytes]"
            serializer.serialize(self.stream)  # type: ignore[arg-type]
            self.depth -= 1
            self.write(self.indent() + "}")
            return True
        else:
            return False
@@ -0,0 +1,61 @@
from __future__ import annotations
import warnings
from typing import IO, Any, Optional
from rdflib.graph import ConjunctiveGraph, Graph
from rdflib.plugins.serializers.nt import _quoteLiteral
from rdflib.serializer import Serializer
from rdflib.term import Literal
__all__ = ["NQuadsSerializer"]
class NQuadsSerializer(Serializer):
    """Writes every triple of every context of a store as an N-Quads row."""

    def __init__(self, store: Graph):
        """
        :param store: a context-aware graph.
        :raises Exception: if ``store`` is not context-aware.
        """
        if not store.context_aware:
            raise Exception(
                "NQuads serialization only makes " "sense for context-aware stores!"
            )

        super(NQuadsSerializer, self).__init__(store)
        self.store: ConjunctiveGraph

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """
        Write all quads to ``stream`` followed by one extra newline.

        ``base`` and a differing ``encoding`` only trigger warnings; the
        serializer's own encoding is always the one applied.
        """
        if base is not None:
            warnings.warn("NQuadsSerializer does not support base.")

        own_encoding = self.encoding
        if encoding is not None and encoding.lower() != own_encoding.lower():
            warnings.warn(
                "NQuadsSerializer does not use custom encoding. "
                f"Given encoding was: {encoding}"
            )

        for context in self.store.contexts():
            graph_id = context.identifier
            for triple in context:
                row = _nq_row(triple, graph_id)
                stream.write(row.encode(own_encoding, "replace"))

        stream.write("\n".encode("latin-1"))
def _nq_row(triple, context):
    """
    Format ``triple`` plus the graph name ``context`` as one N-Quads line.

    Literal objects are rendered with ``_quoteLiteral``; every other term
    uses its own ``n3()`` form.
    """
    subject, predicate, obj = triple[0], triple[1], triple[2]
    if isinstance(obj, Literal):
        return "%s %s %s %s .\n" % (
            subject.n3(),
            predicate.n3(),
            _quoteLiteral(obj),
            context.n3(),
        )
    return "%s %s %s %s .\n" % (
        subject.n3(),
        predicate.n3(),
        obj.n3(),
        context.n3(),
    )
@@ -0,0 +1,115 @@
from __future__ import annotations
import codecs
import warnings
from typing import IO, TYPE_CHECKING, Any, Optional, Tuple, Union
from rdflib.graph import Graph
from rdflib.serializer import Serializer
from rdflib.term import Literal
if TYPE_CHECKING:
from rdflib.graph import _TripleType
"""
N-Triples RDF graph serializer for RDFLib.
See <http://www.w3.org/TR/rdf-testcases/#ntriples> for details about the
format.
"""
__all__ = ["NTSerializer"]
class NTSerializer(Serializer):
    """
    Serializes RDF graphs to NTriples format.
    """

    def __init__(self, store: Graph):
        Serializer.__init__(self, store)

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = "utf-8",
        **kwargs: Any,
    ) -> None:
        """
        Write one N-Triples line per triple of the store to ``stream``.

        :param stream: binary output stream.
        :param base: unsupported; a warning is issued when it is given.
        :param encoding: output is always UTF-8; a warning is issued when a
            different encoding is explicitly requested.
        """
        if base is not None:
            warnings.warn("NTSerializer does not support base.")
        # ``None`` means "no preference" and the name is matched
        # case-insensitively, so neither ``None`` nor "UTF-8" should warn.
        if encoding is not None and encoding.lower() != "utf-8":
            warnings.warn(
                "NTSerializer always uses UTF-8 encoding. "
                f"Given encoding was: {encoding}"
            )

        for triple in self.store:
            stream.write(_nt_row(triple).encode())
class NT11Serializer(NTSerializer):
    """
    Serializer for the RDF 1.1 flavour of NTriples.

    Behaves exactly like the ``nt`` serializer; the output is UTF-8 encoded.
    """

    def __init__(self, store: Graph):
        # The parent initialiser does nothing beyond the base Serializer
        # set-up, which defaults to utf-8.
        super().__init__(store)
def _nt_row(triple: _TripleType) -> str:
    """
    Format ``triple`` as one N-Triples line, newline included.

    A literal object is rendered with ``_quoteLiteral``; any other object
    uses its own ``n3()`` form.
    """
    subject, predicate, obj = triple[0], triple[1], triple[2]
    if isinstance(obj, Literal):
        object_text = _quoteLiteral(obj)
        return "%s %s %s .\n" % (subject.n3(), predicate.n3(), object_text)
    return "%s %s %s .\n" % (subject.n3(), predicate.n3(), obj.n3())
def _quoteLiteral(l_: Literal) -> str:  # noqa: N802
    """
    Render a literal in N-Triples syntax (a reduced term.Literal.n3()).

    The quoted, escaped lexical form is followed by ``@lang`` for
    language-tagged literals or ``^^<datatype>`` for typed ones.

    Raises Exception if the literal carries both a language and a datatype.
    """
    quoted = _quote_encode(l_)
    if l_.language:
        if l_.datatype:
            raise Exception("Literal has datatype AND language!")
        return "%s@%s" % (quoted, l_.language)
    if l_.datatype:
        return "%s^^<%s>" % (quoted, l_.datatype)
    return "%s" % quoted
def _quote_encode(l_: str) -> str:
return '"%s"' % l_.replace("\\", "\\\\").replace("\n", "\\n").replace(
'"', '\\"'
).replace("\r", "\\r")
def _nt_unicode_error_resolver(
err: UnicodeError,
) -> Tuple[Union[str, bytes], int]:
"""
Do unicode char replaces as defined in https://www.w3.org/TR/2004/REC-rdf-testcases-20040210/#ntrip_strings
"""
def _replace_single(c):
c = ord(c)
fmt = "\\u%04X" if c <= 0xFFFF else "\\U%08X"
return fmt % c
# type error: "UnicodeError" has no attribute "object"
# type error: "UnicodeError" has no attribute "start"
# type error: "UnicodeError" has no attribute "end"
string = err.object[err.start : err.end] # type: ignore[attr-defined]
# type error: "UnicodeError" has no attribute "end"
return "".join(_replace_single(c) for c in string), err.end # type: ignore[attr-defined]
codecs.register_error("_rdflib_nt_escape", _nt_unicode_error_resolver)
@@ -0,0 +1,108 @@
from __future__ import annotations
import warnings
from typing import IO, Any, Optional
from uuid import uuid4
from rdflib import Dataset
from rdflib.plugins.serializers.nquads import _nq_row
from rdflib.plugins.serializers.nt import _nt_row
from rdflib.serializer import Serializer
# Maps the ``operation`` keyword accepted by PatchSerializer.serialize to the
# one-letter operation code that is passed on when the rows are written.
add_remove_methods = {"add": "A", "remove": "D"}
class PatchSerializer(Serializer):
    """
    Creates an RDF patch file to add and remove triples/quads.
    Can either:
    - Create an add or delete patch for a single Dataset.
    - Create a patch to represent the difference between two Datasets.
    """

    def __init__(
        self,
        store: Dataset,
    ):
        self.store: Dataset = store
        super().__init__(store)

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """
        Serialize the store to the given stream.

        :param stream: The stream to serialize to.
        :param base: The base URI to use for the serialization.
            Not supported; a warning is issued if given.
        :param encoding: The encoding to use for the serialization.
            Output always uses ``self.encoding``; a warning is issued if a
            different encoding is requested.
        :param kwargs: Additional keyword arguments.
        :raises AssertionError: if ``operation`` is given but is neither
            'add' nor 'remove'.

        Supported keyword arguments:
        - operation: The operation to perform. Either 'add' or 'remove'.
        - target: The target Dataset to compare against.
        NB: Only one of 'operation' or 'target' should be provided.
        - header_id: The header ID to use.
        - header_prev: The previous header ID to use.
        """
        operation = kwargs.get("operation")
        target = kwargs.get("target")
        header_id = kwargs.get("header_id")
        header_prev = kwargs.get("header_prev")
        if not header_id:
            header_id = f"uuid:{uuid4()}"

        # Remember what the caller asked for *before* switching to the
        # serializer's own encoding, otherwise the mismatch warning below
        # could never be triggered.
        requested_encoding = encoding
        encoding = self.encoding
        if base is not None:
            warnings.warn("PatchSerializer does not support base.")
        if (
            requested_encoding is not None
            and requested_encoding.lower() != encoding.lower()
        ):
            warnings.warn(
                "PatchSerializer does not use custom encoding. "
                f"Given encoding was: {requested_encoding}"
            )

        def emit(text: str) -> None:
            """Encode one chunk of patch text and write it to the stream."""
            stream.write(text.encode(encoding, "replace"))

        def write_header() -> None:
            """Write the ``H`` header rows followed by the transaction start."""
            emit(f"H id <{header_id}> .\n")
            if header_prev:
                # Terminated with " ." like the ``H id`` row above.
                emit(f"H prev <{header_prev}> .\n")
            emit("TX .\n")

        def write_triples(contexts, op_code, use_passed_contexts=False) -> None:
            """Write one patch row per triple of every context in ``contexts``."""
            for context in contexts:
                if not use_passed_contexts:
                    # Re-fetch the context from our own store by identifier.
                    context = self.store.get_context(context.identifier)
                for triple in context:
                    emit(self._patch_row(triple, context.identifier, op_code))

        if operation:
            if operation not in add_remove_methods:
                # Raised explicitly rather than through ``assert`` so the check
                # is not stripped under ``python -O``; the exception type is
                # kept for backward compatibility.
                raise AssertionError(f"Invalid operation: {operation}")
        elif not target:
            # No operation specified and no target specified
            # Fall back to default operation of "add" to prevent a no-op
            operation = "add"

        write_header()
        if operation:
            operation_code = add_remove_methods.get(operation)
            write_triples(self.store.contexts(), operation_code)
        elif target:
            to_add, to_remove = self._diff(target)
            write_triples(to_add.contexts(), "A", use_passed_contexts=True)
            write_triples(to_remove.contexts(), "D", use_passed_contexts=True)
        emit("TC .\n")

    def _diff(self, target):
        """
        Return ``(rows_to_add, rows_to_remove)`` computed as
        ``target - store`` and ``store - target`` respectively.
        """
        rows_to_add = target - self.store
        rows_to_remove = self.store - target
        return rows_to_add, rows_to_remove

    def _patch_row(self, triple, context_id, operation):
        """
        Format one patch row: the operation code followed by the statement.

        Statements in the store's default context are written with
        ``_nt_row``; all others with ``_nq_row`` and their context identifier.
        """
        if context_id == self.store.default_context.identifier:
            return f"{operation} {_nt_row(triple)}"
        return f"{operation} {_nq_row(triple, context_id)}"
@@ -0,0 +1,391 @@
from __future__ import annotations
import xml.dom.minidom
from typing import IO, Any, Dict, Generator, Optional, Set, Tuple
from xml.sax.saxutils import escape, quoteattr
from rdflib.collection import Collection
from rdflib.graph import Graph
from rdflib.namespace import RDF, RDFS, Namespace # , split_uri
from rdflib.plugins.parsers.RDFVOC import RDFVOC
from rdflib.plugins.serializers.xmlwriter import XMLWriter
from rdflib.serializer import Serializer
from rdflib.term import BNode, IdentifiedNode, Identifier, Literal, Node, URIRef
from rdflib.util import first, more_than
from .xmlwriter import ESCAPE_ENTITIES
__all__ = ["fix", "XMLSerializer", "PrettyXMLSerializer"]
class XMLSerializer(Serializer):
    """
    Serialize a graph as flat RDF/XML: one ``rdf:Description`` element per
    subject with one child element per predicate/object pair.
    """

    def __init__(self, store: Graph):
        super(XMLSerializer, self).__init__(store)

    def __bindings(self) -> Generator[Tuple[str, URIRef], None, None]:
        """
        Yield the ``(prefix, namespace)`` pairs needed to write every
        predicate in the store as a QName, always including ``rdf``.
        """
        store = self.store
        nm = store.namespace_manager
        bindings: Dict[str, URIRef] = {}
        for predicate in set(store.predicates()):
            # type error: Argument 1 to "compute_qname_strict" of "NamespaceManager" has incompatible type "Node"; expected "str"
            prefix, namespace, _ = nm.compute_qname_strict(predicate)  # type: ignore[arg-type]
            bindings[prefix] = URIRef(namespace)
        RDFNS = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#")  # noqa: N806
        if "rdf" in bindings:
            assert bindings["rdf"] == RDFNS
        else:
            bindings["rdf"] = RDFNS
        yield from bindings.items()

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """
        Write the store to ``stream`` as RDF/XML.

        :param stream: binary stream receiving the encoded document.
        :param base: base IRI; falls back to the store's base when omitted.
        :param encoding: accepted for interface compatibility; the output is
            always encoded with ``self.encoding``.
        :param kwargs: ``xml_base`` overrides the value written to the
            ``xml:base`` attribute.
        """
        # if base is given here, use that, if not and a base is set for the graph use that
        if base is not None:
            self.base = base
        elif self.store.base is not None:
            self.base = self.store.base
        self.__stream = stream
        self.__serialized: Dict[Identifier, int] = {}
        encoding = self.encoding
        self.write = write = lambda uni: stream.write(uni.encode(encoding, "replace"))
        # startDocument
        write('<?xml version="1.0" encoding="%s"?>\n' % self.encoding)
        # startRDF
        write("<rdf:RDF\n")
        # If provided, write xml:base attribute for the RDF.
        # IRIs are passed through quoteattr() so characters such as "&" do
        # not produce malformed XML (same treatment as rdf:about below).
        if "xml_base" in kwargs:
            write(" xml:base=%s\n" % quoteattr(str(kwargs["xml_base"])))
        elif self.base:
            write(" xml:base=%s\n" % quoteattr(str(self.base)))
        # TODO:
        # assert(
        #    namespaces["http://www.w3.org/1999/02/22-rdf-syntax-ns#"]=='rdf')
        bindings = list(self.__bindings())
        bindings.sort()
        for prefix, namespace in bindings:
            if prefix:
                write(" xmlns:%s=%s\n" % (prefix, quoteattr(namespace)))
            else:
                write(" xmlns=%s\n" % quoteattr(namespace))
        write(">\n")
        # write out triples by subject
        for subject in self.store.subjects():
            # type error: Argument 1 to "subject" of "XMLSerializer" has incompatible type "Node"; expected "Identifier"
            self.subject(subject, 1)  # type: ignore[arg-type]
        # endRDF
        write("</rdf:RDF>\n")
        # Drop the bookkeeping dict so the memory can get garbage collected.
        del self.__serialized

    def subject(self, subject: Identifier, depth: int = 1) -> None:
        """
        Write the ``rdf:Description`` element for ``subject`` unless it was
        already written during this serialization.
        """
        if subject in self.__serialized:
            return
        self.__serialized[subject] = 1
        if not isinstance(subject, (BNode, URIRef)):
            return
        write = self.write
        indent = " " * depth
        element_name = "rdf:Description"
        if isinstance(subject, BNode):
            write('%s<%s rdf:nodeID="%s"' % (indent, element_name, subject))
        else:
            uri = quoteattr(self.relativize(subject))
            write("%s<%s rdf:about=%s" % (indent, element_name, uri))
        if (subject, None, None) in self.store:
            write(">\n")
            for predicate, object in self.store.predicate_objects(subject):
                # type error: Argument 1 to "predicate" of "XMLSerializer" has incompatible type "Node"; expected "Identifier"
                # type error: Argument 2 to "predicate" of "XMLSerializer" has incompatible type "Node"; expected "Identifier"
                self.predicate(predicate, object, depth + 1)  # type: ignore[arg-type]
            write("%s</%s>\n" % (indent, element_name))
        else:
            write("/>\n")

    def predicate(
        self, predicate: Identifier, object: Identifier, depth: int = 1
    ) -> None:
        """
        Write one property element for ``predicate`` with ``object`` as its
        value: literal text, ``rdf:nodeID`` reference or ``rdf:resource``.
        """
        write = self.write
        indent = " " * depth
        qname = self.store.namespace_manager.qname_strict(predicate)
        if isinstance(object, Literal):
            attributes = ""
            if object.language:
                attributes += ' xml:lang="%s"' % object.language
            if object.datatype:
                # Datatype IRIs may contain "&" etc.; escape for attribute use.
                attributes += " rdf:datatype=%s" % quoteattr(object.datatype)
            write(
                "%s<%s%s>%s</%s>\n"
                % (indent, qname, attributes, escape(object, ESCAPE_ENTITIES), qname)
            )
        elif isinstance(object, BNode):
            write('%s<%s rdf:nodeID="%s"/>\n' % (indent, qname, object))
        else:
            write(
                "%s<%s rdf:resource=%s/>\n"
                % (indent, qname, quoteattr(self.relativize(object)))
            )
# The XML namespace IRI with the local names "lang" / "base" appended.
# Passed as attribute names to XMLWriter.attribute() in this module
# (presumably resolved there to xml:lang / xml:base -- confirm in XMLWriter).
XMLLANG = "http://www.w3.org/XML/1998/namespacelang"
XMLBASE = "http://www.w3.org/XML/1998/namespacebase"
# OWL vocabulary namespace; used below to test for owl:Class typed resources.
OWL_NS = Namespace("http://www.w3.org/2002/07/owl#")
# TODO:
def fix(val: str) -> str:
    """Strip a leading ``_:`` from a nodeID, as it is not valid in an NCName."""
    return val[2:] if val.startswith("_:") else val
class PrettyXMLSerializer(Serializer):
    """
    Serialize a graph as nested ("pretty") RDF/XML, inlining resources
    below their referring property up to ``max_depth`` levels.
    """

    def __init__(self, store: Graph, max_depth=3):
        # NOTE(review): ``max_depth`` is accepted but not stored here; the
        # effective depth is read from the ``max_depth`` keyword argument of
        # serialize().
        super(PrettyXMLSerializer, self).__init__(store)
        # URIRef collection members that must be emitted as a bare
        # rdf:Description/rdf:about element (filled in predicate()).
        self.forceRDFAbout: Set[URIRef] = set()

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """
        Write the store to ``stream`` as RDF/XML.

        :param stream: binary stream receiving the document.
        :param base: base IRI; falls back to the store's base when omitted.
        :param encoding: passed on to the underlying ``XMLWriter``.
        :param kwargs: ``max_depth`` (default 3, must be > 0) bounds the
            nesting depth; ``xml_base`` overrides the ``xml:base`` value.
        """
        self.__serialized: Dict[Identifier, int] = {}
        store = self.store
        # if base is given here, use that, if not and a base is set for the graph use that
        if base is not None:
            self.base = base
        elif store.base is not None:
            self.base = store.base
        self.max_depth = kwargs.get("max_depth", 3)
        assert self.max_depth > 0, "max_depth must be greater than 0"
        self.nm = nm = store.namespace_manager
        self.writer = writer = XMLWriter(stream, nm, encoding)

        # Collect the namespaces of everything that is written as an element
        # name: predicates and the objects of rdf:type.
        namespaces = {}
        possible: Set[Node] = set(store.predicates()).union(
            store.objects(None, RDF.type)
        )
        for predicate in possible:
            # type error: Argument 1 to "compute_qname_strict" of "NamespaceManager" has incompatible type "Node"; expected "str"
            prefix, namespace, _ = nm.compute_qname_strict(predicate)  # type: ignore[arg-type]
            namespaces[prefix] = namespace
        namespaces["rdf"] = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"

        writer.push(RDFVOC.RDF)
        if "xml_base" in kwargs:
            writer.attribute(XMLBASE, kwargs["xml_base"])
        elif self.base:
            writer.attribute(XMLBASE, self.base)
        writer.namespaces(namespaces.items())

        subject: IdentifiedNode
        # Write out subjects that can not be inline
        # type error: Incompatible types in assignment (expression has type "Node", variable has type "IdentifiedNode")
        for subject in store.subjects():  # type: ignore[assignment]
            if (None, None, subject) in store:
                if (subject, None, subject) in store:
                    self.subject(subject, 1)
            else:
                self.subject(subject, 1)

        # write out anything that has not yet been reached
        # write out BNodes last (to ensure they can be inlined where possible)
        bnodes = set()
        # type error: Incompatible types in assignment (expression has type "Node", variable has type "IdentifiedNode")
        for subject in store.subjects():  # type: ignore[assignment]
            if isinstance(subject, BNode):
                bnodes.add(subject)
                continue
            self.subject(subject, 1)

        # now serialize only those BNodes that have not been serialized yet
        for bnode in bnodes:
            if bnode not in self.__serialized:
                # Serialize the blank node itself, not whatever ``subject``
                # happened to be left over from the loop above.
                self.subject(bnode, 1)

        writer.pop(RDFVOC.RDF)
        stream.write("\n".encode("latin-1"))
        # Set to None so that the memory can get garbage collected.
        self.__serialized = None  # type: ignore[assignment]

    def subject(self, subject: Identifier, depth: int = 1):
        """
        Write the node element for ``subject`` with all of its properties,
        unless it has already been written.

        Subjects queued in ``forceRDFAbout`` are written as an empty
        ``rdf:Description`` carrying only ``rdf:about``.
        """
        store = self.store
        writer = self.writer
        if subject in self.forceRDFAbout:
            writer.push(RDFVOC.Description)
            writer.attribute(RDFVOC.about, self.relativize(subject))
            writer.pop(RDFVOC.Description)
            self.forceRDFAbout.remove(subject)  # type: ignore[arg-type]
        elif subject not in self.__serialized:
            self.__serialized[subject] = 1
            # Use the first rdf:type as the element name when it can be
            # expressed as a QName; otherwise fall back to rdf:Description.
            rdf_type = first(store.objects(subject, RDF.type))
            try:
                # type error: Argument 1 to "qname" of "NamespaceManager" has incompatible type "Optional[Node]"; expected "str"
                self.nm.qname(rdf_type)  # type: ignore[arg-type]
            except Exception:
                # Deliberately broad: any failure just means "no usable type".
                rdf_type = None
            element = rdf_type or RDFVOC.Description
            # type error: Argument 1 to "push" of "XMLWriter" has incompatible type "Node"; expected "str"
            writer.push(element)  # type: ignore[arg-type]
            if isinstance(subject, BNode):
                # The node ID is currently always written. The original intent
                # was to emit it only for blank nodes referenced more than
                # once, to reduce redundant identifiers.
                writer.attribute(RDFVOC.nodeID, fix(subject))
            else:
                writer.attribute(RDFVOC.about, self.relativize(subject))
            if (subject, None, None) in store:
                for predicate, object in store.predicate_objects(subject):
                    # The rdf:type used as element name is not repeated.
                    if not (predicate == RDF.type and object == rdf_type):
                        # type error: Argument 1 to "predicate" of "PrettyXMLSerializer" has incompatible type "Node"; expected "Identifier"
                        # type error: Argument 2 to "predicate" of "PrettyXMLSerializer" has incompatible type "Node"; expected "Identifier"
                        self.predicate(predicate, object, depth + 1)  # type: ignore[arg-type]
            # type error: Argument 1 to "pop" of "XMLWriter" has incompatible type "Node"; expected "Optional[str]"
            writer.pop(element)  # type: ignore[arg-type]

    def predicate(
        self, predicate: Identifier, object: Identifier, depth: int = 1
    ) -> None:
        """
        Write the property element for ``predicate``; ``object`` is rendered
        as literal content, a reference attribute, an RDF collection or a
        nested node element depending on its kind and on ``depth``.
        """
        writer = self.writer
        store = self.store
        writer.push(predicate)
        if isinstance(object, Literal):
            if object.language:
                writer.attribute(XMLLANG, object.language)
            if object.datatype == RDF.XMLLiteral and isinstance(
                object.value, xml.dom.minidom.Document
            ):
                writer.attribute(RDFVOC.parseType, "Literal")
                writer.text("")
                # XML literals are written to the stream verbatim (unescaped).
                writer.stream.write(object)
            else:
                if object.datatype:
                    writer.attribute(RDFVOC.datatype, object.datatype)
                writer.text(object)
        elif (
            object in self.__serialized
            or not (object, None, None) in store  # noqa: E713
        ):
            # Already written elsewhere, or nothing to nest: reference only.
            if isinstance(object, BNode):
                if more_than(store.triples((None, None, object)), 0):
                    writer.attribute(RDFVOC.nodeID, fix(object))
            else:
                writer.attribute(RDFVOC.resource, self.relativize(object))
        else:
            if first(store.objects(object, RDF.first)):  # may not have type
                # RDF.List
                self.__serialized[object] = 1
                # Warn that any assertions on object other than
                # RDF.first and RDF.rest are ignored... including RDF.List
                import warnings

                warnings.warn(
                    "Assertions on %s other than RDF.first " % repr(object)
                    + "and RDF.rest are ignored ... including RDF.List",
                    UserWarning,
                    stacklevel=2,
                )
                writer.attribute(RDFVOC.parseType, "Collection")
                col = Collection(store, object)
                for item in col:
                    if isinstance(item, URIRef):
                        self.forceRDFAbout.add(item)
                    # type error: Argument 1 to "subject" of "PrettyXMLSerializer" has incompatible type "Node"; expected "Identifier"
                    self.subject(item)  # type: ignore[arg-type]
                    if not isinstance(item, URIRef):
                        # type error: Invalid index type "Node" for "Dict[Identifier, int]"; expected type "Identifier"
                        self.__serialized[item] = 1  # type: ignore[index]
            else:
                if first(
                    store.triples_choices(
                        # type error: Argument 1 to "triples_choices" of "Graph" has incompatible type "Tuple[Identifier, URIRef, List[URIRef]]"; expected "Union[Tuple[List[Node], Node, Node], Tuple[Node, List[Node], Node], Tuple[Node, Node, List[Node]]]"
                        (object, RDF.type, [OWL_NS.Class, RDFS.Class])  # type: ignore[arg-type]
                    )
                ) and isinstance(object, URIRef):
                    # Classes are referenced rather than nested.
                    writer.attribute(RDFVOC.resource, self.relativize(object))
                elif depth <= self.max_depth:
                    self.subject(object, depth + 1)
                elif isinstance(object, BNode):
                    if (
                        object not in self.__serialized
                        and (object, None, None) in store
                        and len(list(store.subjects(object=object))) == 1
                    ):
                        # inline blank nodes if they haven't been serialized yet
                        # and are only referenced once (regardless of depth)
                        self.subject(object, depth + 1)
                    else:
                        writer.attribute(RDFVOC.nodeID, fix(object))
                else:
                    writer.attribute(RDFVOC.resource, self.relativize(object))
        writer.pop(predicate)
@@ -0,0 +1,121 @@
"""
Trig RDF graph serializer for RDFLib.
See <http://www.w3.org/TR/trig/> for syntax specification.
"""
from __future__ import annotations
from typing import IO, TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
from rdflib.graph import ConjunctiveGraph, Graph
from rdflib.plugins.serializers.turtle import TurtleSerializer
from rdflib.term import BNode, Node
if TYPE_CHECKING:
from rdflib.graph import _ContextType, _SubjectType
__all__ = ["TrigSerializer"]
class TrigSerializer(TurtleSerializer):
    """Turtle-based serializer that writes every context as its own TriG graph block."""

    short_name = "trig"
    indentString = 4 * " "

    def __init__(self, store: Union[Graph, ConjunctiveGraph]):
        self.default_context: Optional[Node]
        if not store.context_aware:
            # A plain graph is treated as a single context without a default.
            self.contexts = [store]
            self.default_context = None
        else:
            if TYPE_CHECKING:
                assert isinstance(store, ConjunctiveGraph)
            self.contexts = list(store.contexts())
            self.default_context = store.default_context.identifier
            if store.default_context:
                self.contexts.append(store.default_context)
        super().__init__(store)

    def preprocess(self) -> None:
        """Collect prefixes, reference counts and subject order per context."""
        for graph in self.contexts:
            # do not write unnecessary prefix (ex: for an empty default graph)
            if not len(graph):
                continue
            self.store = graph
            # Don't generate a new prefix for a graph URI if one already exists
            self.getQName(graph.identifier, False)
            self._subjects = {}
            for spo in graph:
                self.preprocessTriple(spo)
            for subj in self._subjects:
                self._references[subj] += 1
            self._contexts[graph] = (self.orderSubjects(), self._subjects)

    def reset(self) -> None:
        """Reset the inherited state and clear the per-context bookkeeping."""
        super().reset()
        self._contexts: Dict[
            _ContextType,
            Tuple[List[_SubjectType], Dict[_SubjectType, bool]],
        ] = {}

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = None,
        spacious: Optional[bool] = None,
        **kwargs: Any,
    ) -> None:
        """
        Write all non-empty contexts to ``stream``, one ``{ ... }`` block each.

        :param stream: binary stream receiving the document.
        :param base: base IRI; falls back to the store's base when omitted.
        :param encoding: not read by this method.
        :param spacious: when not None, overrides the spacious-output flag.
        """
        self.reset()
        self.stream = stream
        # if base is given here, use that, if not and a base is set for the graph use that
        if base is not None:
            self.base = base
        elif self.store.base is not None:
            self.base = self.store.base
        if spacious is not None:
            self._spacious = spacious
        self.preprocess()
        self.startDocument()

        for graph, (ordered, subject_map) in self._contexts.items():
            if not ordered:
                continue
            self._serialized = {}
            self.store = graph
            self._subjects = subject_map

            graph_id = graph.identifier
            if self.default_context and graph_id == self.default_context:
                self.write(self.indent() + "\n{")
            else:
                # Blank-node graph names are always written in N3 form.
                # Show the full graph URI if a prefix for it doesn't already exist
                label: Optional[str] = (
                    None
                    if isinstance(graph_id, BNode)
                    else self.getQName(graph_id, False)
                )
                if label is None:
                    label = graph_id.n3()
                self.write(self.indent() + "\n%s {" % label)

            self.depth += 1
            for subj in ordered:
                if self.isDone(subj):
                    continue
                if self.statement(subj):
                    self.write("\n")
            self.depth -= 1
            self.write("}\n")

        self.endDocument()
        stream.write("\n".encode("latin-1"))
@@ -0,0 +1,91 @@
from __future__ import annotations
from typing import IO, Any, Optional
from rdflib.graph import ConjunctiveGraph, Graph
from rdflib.namespace import Namespace
from rdflib.plugins.serializers.xmlwriter import XMLWriter
from rdflib.serializer import Serializer
from rdflib.term import BNode, Literal, URIRef
__all__ = ["TriXSerializer"]
# TODO: Move this somewhere central
# Namespace of the TriX vocabulary; element and attribute names used by the
# serializer below ("TriX", "graph", "triple", "uri", ...) are taken from it.
TRIXNS = Namespace("http://www.w3.org/2004/03/trix/trix-1/")
# The XML namespace; XMLNS["lang"] names the language attribute written on
# language-tagged plain literals.
XMLNS = Namespace("http://www.w3.org/XML/1998/namespace")
class TriXSerializer(Serializer):
    """Serialize a context-aware store as a TriX XML document."""

    def __init__(self, store: Graph):
        super().__init__(store)
        if store.context_aware:
            return
        raise Exception(
            "TriX serialization only makes sense for context-aware stores"
        )

    def serialize(
        self,
        stream: IO[bytes],
        base: Optional[str] = None,
        encoding: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """
        Write the store to ``stream`` as a ``TriX`` element holding one
        ``graph`` element per context.

        :param stream: binary stream receiving the document.
        :param base: base IRI; falls back to the store's base when omitted.
        :param encoding: passed on to the underlying ``XMLWriter``.
        :raises Exception: if the store is neither a ConjunctiveGraph nor a Graph.
        """
        namespace_manager = self.store.namespace_manager
        self.writer = XMLWriter(
            stream, namespace_manager, encoding, extra_ns={"": TRIXNS}
        )
        self.writer.push(TRIXNS["TriX"])

        # if base is given here, use that, if not and a base is set for the graph use that
        effective_base = base
        if effective_base is None and self.store.base is not None:
            effective_base = self.store.base
        if effective_base is not None:
            self.writer.attribute(
                "http://www.w3.org/XML/1998/namespacebase", effective_base
            )
        self.writer.namespaces()

        store = self.store
        if isinstance(store, ConjunctiveGraph):
            for context in store.contexts():
                self._writeGraph(context)
        elif isinstance(store, Graph):
            self._writeGraph(store)
        else:
            raise Exception(f"Unknown graph type: {type(self.store)}")

        self.writer.pop()
        stream.write("\n".encode("latin-1"))

    def _writeGraph(self, graph):  # noqa: N802
        """Write one ``graph`` element: optional base, optional name, then its triples."""
        out = self.writer
        out.push(TRIXNS["graph"])
        if graph.base:
            out.attribute("http://www.w3.org/XML/1998/namespacebase", graph.base)
        # Only IRI-named graphs get a <uri> child.
        if isinstance(graph.identifier, URIRef):
            out.element(TRIXNS["uri"], content=str(graph.identifier))
        for statement in graph.triples((None, None, None)):
            self._writeTriple(statement)
        out.pop()

    def _writeTriple(self, triple):  # noqa: N802
        """Write one ``triple`` element with a child element per term."""
        out = self.writer
        out.push(TRIXNS["triple"])
        for term in triple:
            if isinstance(term, URIRef):
                out.element(TRIXNS["uri"], content=str(term))
                continue
            if isinstance(term, BNode):
                out.element(TRIXNS["id"], content=str(term))
                continue
            if not isinstance(term, Literal):
                # Any other kind of term is not written.
                continue
            # A datatype takes precedence over a language tag.
            if term.datatype:
                out.element(
                    TRIXNS["typedLiteral"],
                    content=str(term),
                    attributes={TRIXNS["datatype"]: str(term.datatype)},
                )
            elif term.language:
                out.element(
                    TRIXNS["plainLiteral"],
                    content=str(term),
                    attributes={XMLNS["lang"]: str(term.language)},
                )
            else:
                out.element(TRIXNS["plainLiteral"], content=str(term))
        out.pop()
@@ -0,0 +1,453 @@
"""
Turtle RDF graph serializer for RDFLib.
See <http://www.w3.org/TeamSubmission/turtle/> for syntax specification.
"""
from __future__ import annotations
from collections import defaultdict
from typing import (
IO,
TYPE_CHECKING,
Any,
DefaultDict,
Dict,
List,
Mapping,
Optional,
Sequence,
Tuple,
)
from rdflib.exceptions import Error
from rdflib.graph import Graph
from rdflib.namespace import RDF, RDFS
from rdflib.serializer import Serializer
from rdflib.term import BNode, Literal, Node, URIRef
if TYPE_CHECKING:
from rdflib.graph import _PredicateType, _SubjectType, _TripleType
__all__ = ["RecursiveSerializer", "TurtleSerializer"]
class RecursiveSerializer(Serializer):
    """
    Shared bookkeeping for serializers that write subjects recursively:
    namespace registry, reference counts, subject ordering and indentation.
    """

    topClasses = [RDFS.Class]
    predicateOrder = [RDF.type, RDFS.label]
    maxDepth = 10
    indentString = " "
    roundtrip_prefixes: Tuple[Any, ...] = ()

    def __init__(self, store: Graph):
        super().__init__(store)
        self.stream: Optional[IO[bytes]] = None
        self.reset()

    def addNamespace(self, prefix: str, uri: URIRef) -> None:
        """Bind ``prefix`` to ``uri``; refuse to rebind it to a different namespace."""
        if prefix in self.namespaces:
            bound = self.namespaces[prefix]
            if bound != uri:
                raise Exception(
                    "Trying to override namespace prefix %s => %s, but it's already bound to %s"
                    % (prefix, uri, bound)
                )
        self.namespaces[prefix] = uri

    def checkSubject(self, subject: _SubjectType) -> bool:
        """Check to see if the subject should be serialized yet"""
        if self.isDone(subject):
            return False
        if subject not in self._subjects:
            return False
        if subject in self._topLevels and self.depth > 1:
            return False
        if isinstance(subject, URIRef) and self.depth >= self.maxDepth:
            return False
        return True

    def isDone(self, subject: _SubjectType) -> bool:
        """Return true if subject is serialized"""
        already_written = subject in self._serialized
        return already_written

    def orderSubjects(self) -> List[_SubjectType]:
        """
        Return the subjects in output order: sorted members of each top
        class first, then the remaining subjects sorted by (is blank node,
        reference count, subject).
        """
        placed = set()
        ordered: List[_SubjectType] = []
        for class_uri in self.topClasses:
            members = sorted(self.store.subjects(RDF.type, class_uri))
            ordered.extend(members)
            for member in members:
                self._topLevels[member] = True
                placed.add(member)
        remaining = sorted(
            (isinstance(subj, BNode), self._references[subj], subj)
            for subj in self._subjects
            if subj not in placed
        )
        ordered.extend(entry[2] for entry in remaining)
        return ordered

    def preprocess(self) -> None:
        """Run ``preprocessTriple`` over every triple in the store."""
        for spo in self.store.triples((None, None, None)):
            self.preprocessTriple(spo)

    def preprocessTriple(self, spo: _TripleType) -> None:
        """Count a reference to the object and remember the subject."""
        subj, _, obj = spo
        self._references[obj] += 1
        self._subjects[subj] = True

    def reset(self) -> None:
        """Clear all per-serialization state and re-register round-trip prefixes."""
        self.depth = 0
        # Typed none because nothing is using it ...
        self.lists: Dict[None, None] = {}
        self.namespaces: Dict[str, URIRef] = {}
        self._references: DefaultDict[Node, int] = defaultdict(int)
        self._serialized: Dict[_SubjectType, bool] = {}
        self._subjects: Dict[_SubjectType, bool] = {}
        self._topLevels: Dict[_SubjectType, bool] = {}

        if not self.roundtrip_prefixes:
            return
        # An iterable restricts registration to the listed prefixes; any
        # other truthy value registers every namespace of the store.
        selective = hasattr(self.roundtrip_prefixes, "__iter__")
        for prefix, ns in self.store.namespaces():
            if not selective or prefix in self.roundtrip_prefixes:
                self.addNamespace(prefix, ns)

    def buildPredicateHash(
        self, subject: _SubjectType
    ) -> Mapping[_PredicateType, List[Node]]:
        """
        Build a hash key by predicate to a list of objects for the given
        subject
        """
        by_predicate: Dict[_PredicateType, List[Node]] = {}
        for _, pred, obj in self.store.triples((subject, None, None)):
            by_predicate.setdefault(pred, []).append(obj)
        return by_predicate

    def sortProperties(
        self, properties: Mapping[_PredicateType, List[Node]]
    ) -> List[_PredicateType]:
        """Take a hash from predicate uris to lists of values.
        Sort the lists of values. Return a sorted list of properties."""
        # Sort object lists (in place)
        for values in properties.values():
            values.sort()

        # Predicates from ``predicateOrder`` come first, in that order ...
        ordered: List[_PredicateType] = []
        taken = set()
        for preferred in self.predicateOrder:
            if preferred in properties and preferred not in taken:
                ordered.append(preferred)
                taken.add(preferred)
        # ... followed by everything else in sorted order.
        for prop in sorted(properties.keys()):
            if prop not in taken:
                ordered.append(prop)
                taken.add(prop)
        return ordered

    def subjectDone(self, subject: _SubjectType) -> None:
        """Mark a subject as done."""
        self._serialized[subject] = True

    def indent(self, modifier: int = 0) -> str:
        """Returns indent string multiplied by the depth"""
        levels = self.depth + modifier
        return levels * self.indentString

    def write(self, text: str) -> None:
        """Write text in given encoding."""
        encoded = text.encode(self.encoding, "replace")
        # type error: Item "None" of "Optional[IO[bytes]]" has no attribute "write"
        self.stream.write(encoded)  # type: ignore[union-attr]
# Position codes handed to path()/label() to tell which slot of a triple a
# node occupies; they match the index of the term within the triple.
SUBJECT = 0
VERB = 1
OBJECT = 2
# Passed as ``gen_prefix`` when a literal's datatype is looked up as a QName.
_GEN_QNAME_FOR_DT = False
# Initial value of TurtleSerializer._spacious (extra blank lines in output).
_SPACIOUS_OUTPUT = False
class TurtleSerializer(RecursiveSerializer):
short_name = "turtle"
indentString = " "
def __init__(self, store: Graph):
self._ns_rewrite: Dict[str, str] = {}
super(TurtleSerializer, self).__init__(store)
self.keywords: Dict[Node, str] = {RDF.type: "a"}
self.reset()
self.stream = None
self._spacious = _SPACIOUS_OUTPUT
# type error: Return type "str" of "addNamespace" incompatible with return type "None" in supertype "RecursiveSerializer"
def addNamespace(self, prefix: str, namespace: URIRef) -> str: # type: ignore[override]
# Turtle does not support prefix that start with _
# if they occur in the graph, rewrite to p_blah
# this is more complicated since we need to make sure p_blah
# does not already exist. And we register namespaces as we go, i.e.
# we may first see a triple with prefix _9 - rewrite it to p_9
# and then later find a triple with a "real" p_9 prefix
# so we need to keep track of ns rewrites we made so far.
if (prefix > "" and prefix[0] == "_") or self.namespaces.get(
prefix, namespace
) != namespace:
if prefix not in self._ns_rewrite:
p = "p" + prefix
while p in self.namespaces:
p = "p" + p
self._ns_rewrite[prefix] = p
prefix = self._ns_rewrite.get(prefix, prefix)
super(TurtleSerializer, self).addNamespace(prefix, namespace)
return prefix
def reset(self) -> None:
super(TurtleSerializer, self).reset()
# typing as Dict[None, None] because nothing seems to be using it
self._shortNames: Dict[None, None] = {}
self._started = False
self._ns_rewrite = {}
def serialize(
self,
stream: IO[bytes],
base: Optional[str] = None,
encoding: Optional[str] = None,
spacious: Optional[bool] = None,
**kwargs: Any,
) -> None:
self.reset()
self.stream = stream
# if base is given here, use that, if not and a base is set for the graph use that
if base is not None:
self.base = base
elif self.store.base is not None:
self.base = self.store.base
if spacious is not None:
self._spacious = spacious
self.preprocess()
subjects_list = self.orderSubjects()
self.startDocument()
firstTime = True
for subject in subjects_list:
if self.isDone(subject):
continue
if firstTime:
firstTime = False
if self.statement(subject) and not firstTime:
self.write("\n")
self.endDocument()
stream.write("\n".encode("latin-1"))
self.base = None
def preprocessTriple(self, triple: _TripleType) -> None:
super(TurtleSerializer, self).preprocessTriple(triple)
for i, node in enumerate(triple):
if i == VERB and node in self.keywords:
# predicate is a keyword
continue
# Don't use generated prefixes for subjects and objects
self.getQName(node, gen_prefix=(i == VERB))
if isinstance(node, Literal) and node.datatype:
self.getQName(node.datatype, gen_prefix=_GEN_QNAME_FOR_DT)
p = triple[1]
if isinstance(p, BNode): # hmm - when is P ever a bnode?
self._references[p] += 1
# TODO: Rename to get_pname
def getQName(self, uri: Node, gen_prefix: bool = True) -> Optional[str]:
if not isinstance(uri, URIRef):
return None
parts = None
try:
parts = self.store.compute_qname(uri, generate=gen_prefix)
except Exception:
# is the uri a namespace in itself?
pfx = self.store.store.prefix(uri)
if pfx is not None:
parts = (pfx, uri, "")
else:
# nothing worked
return None
prefix, namespace, local = parts
local = local.replace(r"(", r"\(").replace(r")", r"\)")
# QName cannot end with .
if local.endswith("."):
return None
prefix = self.addNamespace(prefix, namespace)
return "%s:%s" % (prefix, local)
def startDocument(self) -> None:
self._started = True
ns_list = sorted(self.namespaces.items())
if self.base:
self.write(self.indent() + "@base <%s> .\n" % self.base)
for prefix, uri in ns_list:
self.write(self.indent() + "@prefix %s: <%s> .\n" % (prefix, uri))
if ns_list and self._spacious:
self.write("\n")
def endDocument(self) -> None:
if self._spacious:
self.write("\n")
def statement(self, subject: _SubjectType) -> bool:
self.subjectDone(subject)
return self.s_squared(subject) or self.s_default(subject)
def s_default(self, subject: _SubjectType) -> bool:
self.write("\n" + self.indent())
self.path(subject, SUBJECT)
self.predicateList(subject)
self.write(" .")
return True
def s_squared(self, subject: _SubjectType) -> bool:
if (self._references[subject] > 0) or not isinstance(subject, BNode):
return False
self.write("\n" + self.indent() + "[]")
self.predicateList(subject)
self.write(" .")
return True
def path(self, node: Node, position: int, newline: bool = False) -> None:
if not (
self.p_squared(node, position, newline)
or self.p_default(node, position, newline)
):
raise Error("Cannot serialize node '%s'" % (node,))
def p_default(self, node: Node, position: int, newline: bool = False) -> bool:
if position != SUBJECT and not newline:
self.write(" ")
self.write(self.label(node, position))
return True
def label(self, node: Node, position: int) -> str:
if node == RDF.nil:
return "()"
if position is VERB and node in self.keywords:
return self.keywords[node]
if isinstance(node, Literal):
return node._literal_n3(
use_plain=True,
qname_callback=lambda dt: self.getQName(dt, _GEN_QNAME_FOR_DT),
)
else:
node = self.relativize(node) # type: ignore[type-var]
return self.getQName(node, position == VERB) or node.n3()
def p_squared(self, node: Node, position: int, newline: bool = False) -> bool:
if (
not isinstance(node, BNode)
or node in self._serialized
or self._references[node] > 1
or position == SUBJECT
):
return False
if not newline:
self.write(" ")
if self.isValidList(node):
# this is a list
self.write("(")
self.depth += 1 # 2
self.doList(node)
self.depth -= 1 # 2
self.write(" )")
else:
self.subjectDone(node)
self.depth += 2
# self.write('[\n' + self.indent())
self.write("[")
self.depth -= 1
# self.predicateList(node, newline=True)
self.predicateList(node, newline=False)
# self.write('\n' + self.indent() + ']')
self.write(" ]")
self.depth -= 1
return True
def isValidList(self, l_: Node) -> bool:
"""
Checks if l is a valid RDF list, i.e. no nodes have other properties.
"""
try:
if self.store.value(l_, RDF.first) is None:
return False
except Exception:
return False
while l_:
if l_ != RDF.nil and len(list(self.store.predicate_objects(l_))) != 2:
return False
# type error: Incompatible types in assignment (expression has type "Optional[Node]", variable has type "Node")
l_ = self.store.value(l_, RDF.rest) # type: ignore[assignment]
return True
def doList(self, l_: Node) -> None:
while l_:
item = self.store.value(l_, RDF.first)
if item is not None:
self.path(item, OBJECT)
self.subjectDone(l_)
# type error: Incompatible types in assignment (expression has type "Optional[Node]", variable has type "Node")
l_ = self.store.value(l_, RDF.rest) # type: ignore[assignment]
def predicateList(self, subject: Node, newline: bool = False) -> None:
properties = self.buildPredicateHash(subject)
propList = self.sortProperties(properties)
if len(propList) == 0:
return
self.verb(propList[0], newline=newline)
self.objectList(properties[propList[0]])
for predicate in propList[1:]:
self.write(" ;\n" + self.indent(1))
self.verb(predicate, newline=True)
self.objectList(properties[predicate])
def verb(self, node: Node, newline: bool = False) -> None:
self.path(node, VERB, newline)
def objectList(self, objects: Sequence[Node]) -> None:
    """Write ``objects`` as a comma-separated list in ``OBJECT`` position.

    The first object is written on the current line; every further object
    is preceded by ``,``, a line break and one level of indentation, and is
    written with ``newline=True``.  ``self.depth`` is raised by one while
    the objects are written and restored afterwards.  An empty sequence
    writes nothing.
    """
    if len(objects) == 0:
        return
    # NOTE(review): this used to be spelled ``(count == 1) and 0 or 1``, which
    # always evaluates to 1 because ``0`` is falsy.  The effective value is
    # kept so the generated whitespace does not change; confirm whether a
    # single object was meant to skip the extra depth.
    depthmod = 1
    self.depth += depthmod
    for index, obj in enumerate(objects):
        if index == 0:
            self.path(obj, OBJECT)
        else:
            self.write(",\n" + self.indent(1))
            self.path(obj, OBJECT, newline=True)
    self.depth -= depthmod
@@ -0,0 +1,128 @@
from __future__ import annotations
import codecs
from typing import IO, TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
from xml.sax.saxutils import escape, quoteattr
from rdflib.term import URIRef
if TYPE_CHECKING:
from rdflib.namespace import Namespace, NamespaceManager
# Names exported by ``from <module> import *``.
__all__ = ["XMLWriter"]
# Extra entity map passed to ``xml.sax.saxutils.escape`` by ``XMLWriter.text``:
# a carriage return is written as the character reference ``&#13;``
# (presumably so that it is not lost to XML line-ending normalisation).
ESCAPE_ENTITIES = {"\r": "&#13;"}
class XMLWriter:
    """Incremental XML writer on top of a byte stream.

    Elements are opened with :meth:`push` and closed with :meth:`pop`;
    attributes and namespace declarations must be written while the start
    tag of the current element is still open, i.e. before any text or child
    element is written.

    State kept on the instance:

    * ``element_stack`` - URIs of the currently open elements.
    * ``closed`` - ``False`` while a start tag has been begun (``<name``)
      but its ``>`` has not been written yet.
    * ``parent`` - ``True`` when the most recent push/pop operation was a
      ``pop``; the next closing tag is then put on a line of its own.
    """

    def __init__(
        self,
        stream: IO[bytes],
        namespace_manager: NamespaceManager,
        encoding: Optional[str] = None,
        decl: int = 1,
        extra_ns: Optional[Dict[str, Namespace]] = None,
    ):
        """Wrap ``stream`` and optionally write the XML declaration.

        :param stream: Byte stream that receives the encoded output.
        :param namespace_manager: Used by :meth:`qname` and, when no
            explicit list is given, by :meth:`namespaces`.
        :param encoding: Output encoding; defaults to ``"utf-8"``.
        :param decl: When truthy, the ``<?xml ...?>`` declaration is written.
        :param extra_ns: Additional prefix -> namespace bindings that take
            precedence over the namespace manager.
        :raises LookupError: If ``encoding`` is not a known codec.
        """
        encoding = encoding or "utf-8"
        # NOTE on type ignores: this is mainly because the variable is being re-used.
        # The stream writer encodes every ``str`` passed to ``write``.
        # type error: Incompatible types in assignment (expression has type "StreamWriter", variable has type "IO[bytes]")
        self.stream = stream = codecs.getwriter(encoding)(stream)  # type: ignore[assignment]
        if decl:
            # type error: No overload variant of "write" of "IO" matches argument type "str"
            stream.write('<?xml version="1.0" encoding="%s"?>' % encoding)  # type: ignore[call-overload]
        self.element_stack: List[str] = []
        self.nm = namespace_manager
        self.extra_ns = extra_ns or {}
        self.closed = True
        # Initialised here so the attribute exists before the first push().
        self.parent = False

    @property
    def indent(self) -> str:
        """Indentation string for the current nesting depth."""
        return " " * len(self.element_stack)

    def __close_start_tag(self) -> None:
        """Write the pending ``>`` of an open start tag, if there is one."""
        if not self.closed:  # TODO:
            self.closed = True
            self.stream.write(">")

    def push(self, uri: str) -> None:
        """Begin a new element named by ``uri`` on a fresh, indented line.

        The start tag is left open so attributes can still be added.
        """
        self.__close_start_tag()
        write = self.stream.write
        write("\n")
        write(self.indent)
        write("<%s" % self.qname(uri))
        self.element_stack.append(uri)
        self.closed = False
        self.parent = False

    def pop(self, uri: Optional[str] = None) -> None:
        """Close the innermost open element.

        An element whose start tag is still open is written as an empty
        element (``/>``); otherwise a closing tag is written, on its own
        line when the element contained child elements.

        :param uri: Optional sanity check; must equal the element being
            closed.
        :raises IndexError: If no element is open.
        """
        top = self.element_stack.pop()
        if uri:
            assert uri == top
        write = self.stream.write
        if not self.closed:
            self.closed = True
            write("/>")
        else:
            if self.parent:
                write("\n")
                write(self.indent)
            write("</%s>" % self.qname(top))
        self.parent = True

    def element(
        self, uri: str, content: str, attributes: Optional[Dict[URIRef, str]] = None
    ) -> None:
        """Utility method for adding a complete simple element"""
        self.push(uri)
        # ``None`` replaces the former shared mutable ``{}`` default.
        for k, v in (attributes or {}).items():
            self.attribute(k, v)
        self.text(content)
        self.pop()

    def namespaces(
        self, namespaces: Optional[Iterable[Tuple[str, str]]] = None
    ) -> None:
        """Write ``xmlns`` declarations into the currently open start tag.

        :param namespaces: ``(prefix, namespace)`` pairs; when omitted or
            empty, the bindings of the namespace manager are used.

        Bindings from ``extra_ns`` are always written and win over an entry
        with the same prefix in ``namespaces``, so no prefix is declared
        twice.  Namespace values are escaped as attribute values.
        """
        if not namespaces:
            namespaces = self.nm.namespaces()
        write = self.stream.write

        def declare(prefix: Optional[str], namespace: str) -> None:
            # quoteattr adds the surrounding quotes and escapes &, < and >.
            value = quoteattr(str(namespace))
            if prefix:
                write(" xmlns:%s=%s\n" % (prefix, value))
            else:
                write(" xmlns=%s\n" % value)

        write("\n")
        for prefix, namespace in namespaces:
            # Allow user-provided namespace bindings to prevail
            if prefix not in self.extra_ns:
                declare(prefix, namespace)
        for prefix, namespace in self.extra_ns.items():
            declare(prefix, namespace)

    def attribute(self, uri: str, value: str) -> None:
        """Write one attribute into the currently open start tag."""
        write = self.stream.write
        write(" %s=%s" % (self.qname(uri), quoteattr(value)))

    def text(self, text: str) -> None:
        """Write character data inside the current element.

        Text that contains both ``<`` and ``>`` but no ``]]>`` is written
        verbatim in a CDATA section; anything else is entity-escaped.
        """
        self.__close_start_tag()
        if "<" in text and ">" in text and "]]>" not in text:
            self.stream.write("<![CDATA[")
            self.stream.write(text)
            self.stream.write("]]>")
        else:
            self.stream.write(escape(text, ESCAPE_ENTITIES))

    def qname(self, uri: str) -> str:
        """Compute qname for a uri using our extra namespaces,
        or the given namespace manager"""
        for pre, ns in self.extra_ns.items():
            if uri.startswith(ns):
                local = uri[len(ns) :]
                if pre != "":
                    return ":".join([pre, local])
                return local
        return self.nm.qname_strict(uri)